From ccbfea78adf75d3d9e87aa739dab83254f5333fa Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Mon, 8 Jul 2024 23:18:57 +0200 Subject: [PATCH 001/991] Input: ads7846 - ratelimit the spi_sync error message In case the touch controller is not connected, this message keeps scrolling on the console indefinitelly. Ratelimit it to avoid filling kernel logs. " ads7846 spi2.1: spi_sync --> -22 " Signed-off-by: Marek Vasut Link: https://lore.kernel.org/r/20240708211913.171243-1-marex@denx.de Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/ads7846.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/touchscreen/ads7846.c b/drivers/input/touchscreen/ads7846.c index 4247283c7271d5..f89c0dd15d8b91 100644 --- a/drivers/input/touchscreen/ads7846.c +++ b/drivers/input/touchscreen/ads7846.c @@ -824,7 +824,7 @@ static void ads7846_read_state(struct ads7846 *ts) m = &ts->msg[msg_idx]; error = spi_sync(ts->spi, m); if (error) { - dev_err(&ts->spi->dev, "spi_sync --> %d\n", error); + dev_err_ratelimited(&ts->spi->dev, "spi_sync --> %d\n", error); packet->ignore = true; return; } From da897484557b34a54fabb81f6c223c19a69e546d Mon Sep 17 00:00:00 2001 From: Jonathan Denose Date: Tue, 23 Jul 2024 21:33:30 -0700 Subject: [PATCH 002/991] Input: synaptics - enable SMBus for HP Elitebook 840 G2 The kernel reports that the touchpad for this device can support a different bus. With SMBus enabled the touchpad movement is smoother and three-finger gestures are recognized. Signed-off-by: Jonathan Denose Link: https://lore.kernel.org/r/20240719180612.1.Ib652dd808c274076f32cd7fc6c1160d2cf71753b@changeid Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/synaptics.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c index 38191c3b31bf51..380aa1614442f4 100644 --- a/drivers/input/mouse/synaptics.c +++ b/drivers/input/mouse/synaptics.c @@ -189,6 +189,7 @@ static const char * const smbus_pnp_ids[] = { "LEN2054", /* E480 */ "LEN2055", /* E580 */ "LEN2068", /* T14 Gen 1 */ + "SYN3015", /* HP EliteBook 840 G2 */ "SYN3052", /* HP EliteBook 840 G4 */ "SYN3221", /* HP 15-ay000 */ "SYN323d", /* HP Spectre X360 13-w013dx */ From 5ce86c6c861352c9346ebb5c96ed70cb67414aa3 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 27 Jul 2024 23:02:59 +0900 Subject: [PATCH 003/991] rust: suppress error messages from CONFIG_{RUSTC,BINDGEN}_VERSION_TEXT While this is a somewhat unusual case, I encountered odd error messages when I ran Kconfig in a foreign architecture chroot. $ make allmodconfig sh: 1: rustc: not found sh: 1: bindgen: not found # # configuration written to .config # The successful execution of 'command -v rustc' does not necessarily mean that 'rustc --version' will succeed. $ sh -c 'command -v rustc' /home/masahiro/.cargo/bin/rustc $ sh -c 'rustc --version' sh: 1: rustc: not found Here, 'rustc' is built for x86, and I ran it in an arm64 system. The current code: command -v $(RUSTC) >/dev/null 2>&1 && $(RUSTC) --version || echo n can be turned into: command -v $(RUSTC) >/dev/null 2>&1 && $(RUSTC) --version 2>/dev/null || echo n However, I did not understand the necessity of 'command -v $(RUSTC)'. I simplified it to: $(RUSTC) --version 2>/dev/null || echo n Fixes: 2f7ab1267dc9 ("Kbuild: add Rust support") Signed-off-by: Masahiro Yamada Link: https://lore.kernel.org/r/20240727140302.1806011-1-masahiroy@kernel.org [ Rebased on top of v6.11-rc1. - Miguel ] Signed-off-by: Miguel Ojeda --- init/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index a465ea9525bd59..cddeec9fcb7213 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1919,7 +1919,7 @@ config RUST config RUSTC_VERSION_TEXT string depends on RUST - default $(shell,command -v $(RUSTC) >/dev/null 2>&1 && $(RUSTC) --version || echo n) + default $(shell,$(RUSTC) --version 2>/dev/null || echo n) config BINDGEN_VERSION_TEXT string @@ -1927,7 +1927,7 @@ config BINDGEN_VERSION_TEXT # The dummy parameter `workaround-for-0.69.0` is required to support 0.69.0 # (https://github.com/rust-lang/rust-bindgen/pull/2678). It can be removed when # the minimum version is upgraded past that (0.69.1 already fixed the issue). - default $(shell,command -v $(BINDGEN) >/dev/null 2>&1 && $(BINDGEN) --version workaround-for-0.69.0 || echo n) + default $(shell,$(BINDGEN) --version workaround-for-0.69.0 2>/dev/null || echo n) # # Place an empty function call at each tracepoint site. Can be From aacf93e87f0d808ef46e621aa56caea336b4433c Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 27 Jul 2024 23:03:00 +0900 Subject: [PATCH 004/991] rust: fix the default format for CONFIG_{RUSTC,BINDGEN}_VERSION_TEXT Another oddity in these config entries is their default value can fall back to 'n', which is a value for bool or tristate symbols. The '|| echo n' is an incorrect workaround to avoid the syntax error. This is not a big deal, as the entry is hidden by 'depends on RUST' in situations where '$(RUSTC) --version' or '$(BINDGEN) --version' fails. Anyway, it looks odd. The default of a string type symbol should be a double-quoted string literal. Turn it into an empty string when the version command fails. Fixes: 2f7ab1267dc9 ("Kbuild: add Rust support") Signed-off-by: Masahiro Yamada Link: https://lore.kernel.org/r/20240727140302.1806011-2-masahiroy@kernel.org [ Rebased on top of v6.11-rc1. - Miguel ] Signed-off-by: Miguel Ojeda --- init/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index cddeec9fcb7213..3ada33b1d681b6 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1919,7 +1919,7 @@ config RUST config RUSTC_VERSION_TEXT string depends on RUST - default $(shell,$(RUSTC) --version 2>/dev/null || echo n) + default "$(shell,$(RUSTC) --version 2>/dev/null)" config BINDGEN_VERSION_TEXT string @@ -1927,7 +1927,7 @@ config BINDGEN_VERSION_TEXT # The dummy parameter `workaround-for-0.69.0` is required to support 0.69.0 # (https://github.com/rust-lang/rust-bindgen/pull/2678). It can be removed when # the minimum version is upgraded past that (0.69.1 already fixed the issue). - default $(shell,$(BINDGEN) --version workaround-for-0.69.0 2>/dev/null || echo n) + default "$(shell,$(BINDGEN) --version workaround-for-0.69.0 2>/dev/null)" # # Place an empty function call at each tracepoint site. Can be From e4ab5d7cb5f19858305395e034f214c92afc3cf5 Mon Sep 17 00:00:00 2001 From: Andrew Halaney Date: Thu, 25 Jul 2024 11:23:33 -0500 Subject: [PATCH 005/991] soc: qcom: pd-mapper: Depend on ARCH_QCOM || COMPILE_TEST The pd-mapper driver doesn't make sense on non Qualcomm systems. Let's follow suit with the rest of the Qualcomm SoC Kconfigs and depend on ARCH_QCOM || COMPILE_TEST to avoid asking users about a config they will not use. Fixes: 1ebcde047c54 ("soc: qcom: add pd-mapper implementation") Signed-off-by: Andrew Halaney Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20240725-pd-mapper-config-v1-1-f26e513608c6@redhat.com Signed-off-by: Bjorn Andersson --- drivers/soc/qcom/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/soc/qcom/Kconfig b/drivers/soc/qcom/Kconfig index 7f02f05259331a..74b9121240f893 100644 --- a/drivers/soc/qcom/Kconfig +++ b/drivers/soc/qcom/Kconfig @@ -77,7 +77,7 @@ config QCOM_PD_MAPPER select QCOM_QMI_HELPERS select QCOM_PDR_MSG select AUXILIARY_BUS - depends on NET && QRTR + depends on NET && QRTR && (ARCH_QCOM || COMPILE_TEST) default QCOM_RPROC_COMMON help The Protection Domain Mapper maps registered services to the domains From 10f98bb9d98137b544b00abb4f9df45e9be7878d Mon Sep 17 00:00:00 2001 From: Stephan Gerhold Date: Mon, 15 Jul 2024 14:15:40 +0200 Subject: [PATCH 006/991] arm64: defconfig: Add CONFIG_DRM_PANEL_SAMSUNG_ATNA33XC20 This is needed for the display panel to work on the Qualcomm sc7180-trogdor-homestar and x1e80100-crd. Signed-off-by: Stephan Gerhold Reviewed-by: Douglas Anderson Reviewed-by: Johan Hovold Link: https://lore.kernel.org/r/20240715-x1e80100-crd-backlight-v2-4-31b7f2f658a3@linaro.org Signed-off-by: Bjorn Andersson --- arch/arm64/configs/defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 7d32fca649965a..362df939026383 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -887,6 +887,7 @@ CONFIG_DRM_PANEL_KHADAS_TS050=m CONFIG_DRM_PANEL_MANTIX_MLAF057WE51=m CONFIG_DRM_PANEL_NOVATEK_NT36672E=m CONFIG_DRM_PANEL_RAYDIUM_RM67191=m +CONFIG_DRM_PANEL_SAMSUNG_ATNA33XC20=m CONFIG_DRM_PANEL_SITRONIX_ST7703=m CONFIG_DRM_PANEL_TRULY_NT35597_WQXGA=m CONFIG_DRM_PANEL_VISIONOX_VTDR6130=m From f9bb896eab221618927ae6a2f1d566567999839d Mon Sep 17 00:00:00 2001 From: Volodymyr Babchuk Date: Thu, 18 Jul 2024 11:33:23 +0530 Subject: [PATCH 007/991] soc: qcom: cmd-db: Map shared memory as WC, not WB Linux does not write into cmd-db region. This region of memory is write protected by XPU. XPU may sometime falsely detect clean cache eviction as "write" into the write protected region leading to secure interrupt which causes an endless loop somewhere in Trust Zone. The only reason it is working right now is because Qualcomm Hypervisor maps the same region as Non-Cacheable memory in Stage 2 translation tables. The issue manifests if we want to use another hypervisor (like Xen or KVM), which does not know anything about those specific mappings. Changing the mapping of cmd-db memory from MEMREMAP_WB to MEMREMAP_WT/WC removes dependency on correct mappings in Stage 2 tables. This patch fixes the issue by updating the mapping to MEMREMAP_WC. I tested this on SA8155P with Xen. Fixes: 312416d9171a ("drivers: qcom: add command DB driver") Cc: stable@vger.kernel.org # 5.4+ Signed-off-by: Volodymyr Babchuk Tested-by: Nikita Travkin # sc7180 WoA in EL2 Signed-off-by: Maulik Shah Tested-by: Pavankumar Kondeti Reviewed-by: Caleb Connolly Link: https://lore.kernel.org/r/20240718-cmd_db_uncached-v2-1-f6cf53164c90@quicinc.com Signed-off-by: Bjorn Andersson --- drivers/soc/qcom/cmd-db.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/soc/qcom/cmd-db.c b/drivers/soc/qcom/cmd-db.c index d8457266201758..ae66c2623d250d 100644 --- a/drivers/soc/qcom/cmd-db.c +++ b/drivers/soc/qcom/cmd-db.c @@ -349,7 +349,7 @@ static int cmd_db_dev_probe(struct platform_device *pdev) return -EINVAL; } - cmd_db_header = memremap(rmem->base, rmem->size, MEMREMAP_WB); + cmd_db_header = memremap(rmem->base, rmem->size, MEMREMAP_WC); if (!cmd_db_header) { ret = -ENOMEM; cmd_db_header = NULL; From dbd6bd124e34f9f859271ed9ae2afc39f36c7e8c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 19 Jul 2024 12:12:31 +0200 Subject: [PATCH 008/991] soc: qcom: pd-mapper: mark qcom_pdm_domains as __maybe_unused The qcom_pdm_domains[] array is used only when passing it into of_match_node() but is not also referenced by MODULE_DEVICE_TABLE() or the platform driver as a table. When CONFIG_OF is disabled, this causes a harmless build warning: drivers/soc/qcom/qcom_pd_mapper.c:520:34: error: 'qcom_pdm_domains' defined but not used [-Werror=unused-const-variable=] Avoid this by marking the variable as __maybe_unused. This also makes it clear that anything referenced by it will be dropped by the compiler when it is unused. Fixes: 1ebcde047c54 ("soc: qcom: add pd-mapper implementation") Signed-off-by: Arnd Bergmann Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20240719101238.199850-1-arnd@kernel.org Signed-off-by: Bjorn Andersson --- drivers/soc/qcom/qcom_pd_mapper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/soc/qcom/qcom_pd_mapper.c b/drivers/soc/qcom/qcom_pd_mapper.c index a4c00708066503..9afa09c3920e10 100644 --- a/drivers/soc/qcom/qcom_pd_mapper.c +++ b/drivers/soc/qcom/qcom_pd_mapper.c @@ -517,7 +517,7 @@ static const struct qcom_pdm_domain_data *sm8550_domains[] = { NULL, }; -static const struct of_device_id qcom_pdm_domains[] = { +static const struct of_device_id qcom_pdm_domains[] __maybe_unused = { { .compatible = "qcom,apq8064", .data = NULL, }, { .compatible = "qcom,apq8074", .data = NULL, }, { .compatible = "qcom,apq8084", .data = NULL, }, From 8bc7cb73df8644423758c79d4504d501c8ef3854 Mon Sep 17 00:00:00 2001 From: Patrick Wildt Date: Mon, 15 Jul 2024 21:40:41 +0200 Subject: [PATCH 009/991] arm64: dts: qcom: x1e80100-yoga: add wifi calibration variant Describe the bus topology for PCIe domain 4 and add the ath12k calibration variant so that the board file (calibration data) can be loaded. Signed-off-by: Patrick Wildt Reviewed-by: Konrad Dybcio Reviewed-by: Manivannan Sadhasivam Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/ZpV7OeGNIGGpqNC0@windev.fritz.box Signed-off-by: Bjorn Andersson --- .../boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts | 9 +++++++++ arch/arm64/boot/dts/qcom/x1e80100.dtsi | 10 ++++++++++ 2 files changed, 19 insertions(+) diff --git a/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts b/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts index fbff558f5b070b..f569f0fbd1fc3b 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts @@ -635,6 +635,15 @@ status = "okay"; }; +&pcie4_port0 { + wifi@0 { + compatible = "pci17cb,1107"; + reg = <0x10000 0x0 0x0 0x0 0x0>; + + qcom,ath12k-calibration-variant = "LES790"; + }; +}; + &pcie6a { perst-gpios = <&tlmm 152 GPIO_ACTIVE_LOW>; wake-gpios = <&tlmm 154 GPIO_ACTIVE_LOW>; diff --git a/arch/arm64/boot/dts/qcom/x1e80100.dtsi b/arch/arm64/boot/dts/qcom/x1e80100.dtsi index 7bca5fcd7d5277..70eeacd4f9adfc 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100.dtsi +++ b/arch/arm64/boot/dts/qcom/x1e80100.dtsi @@ -3085,6 +3085,16 @@ phy-names = "pciephy"; status = "disabled"; + + pcie4_port0: pcie@0 { + device_type = "pci"; + reg = <0x0 0x0 0x0 0x0 0x0>; + bus-range = <0x01 0xff>; + + #address-cells = <3>; + #size-cells = <2>; + ranges; + }; }; pcie4_phy: phy@1c0e000 { From 60a76f7826b88ebf7697a56fdcd9596b23c2b616 Mon Sep 17 00:00:00 2001 From: Varadarajan Narayanan Date: Tue, 23 Jul 2024 15:31:51 +0530 Subject: [PATCH 010/991] arm64: dts: qcom: ipq5332: Fix interrupt trigger type for usb Trigger type is incorrectly specified as IRQ_TYPE_EDGE_BOTH instead of IRQ_TYPE_LEVEL_HIGH. This trigger type is not supported for SPIs and results in probe failure with -EINVAL. Reviewed-by: Konrad Dybcio Fixes: 927173bf8a0e ("arm64: dts: qcom: Add missing interrupts for qcs404/ipq5332") Signed-off-by: Varadarajan Narayanan Link: https://lore.kernel.org/r/20240723100151.402300-3-quic_varada@quicinc.com Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/ipq5332.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/qcom/ipq5332.dtsi b/arch/arm64/boot/dts/qcom/ipq5332.dtsi index 573656587c0d38..0a74ed4f72cc77 100644 --- a/arch/arm64/boot/dts/qcom/ipq5332.dtsi +++ b/arch/arm64/boot/dts/qcom/ipq5332.dtsi @@ -320,8 +320,8 @@ reg = <0x08af8800 0x400>; interrupts = , - , - ; + , + ; interrupt-names = "pwr_event", "dp_hs_phy_irq", "dm_hs_phy_irq"; From 0ba521d6948ecb4acf1276494dfed127fe096ca6 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Thu, 25 Jul 2024 20:46:44 +0200 Subject: [PATCH 011/991] rust: macros: indent list item in `module!`'s docs Like commit e516211f615f ("rust: macros: indent list item in `paste!`'s docs"), but for `module!`. Reviewed-by: Trevor Gross Link: https://lore.kernel.org/r/20240725184644.135185-1-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- rust/macros/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/macros/lib.rs b/rust/macros/lib.rs index 159e7529297075..5be0cb9db3ee49 100644 --- a/rust/macros/lib.rs +++ b/rust/macros/lib.rs @@ -94,7 +94,7 @@ use proc_macro::TokenStream; /// - `license`: ASCII string literal of the license of the kernel module (required). /// - `alias`: array of ASCII string literals of the alias names of the kernel module. /// - `firmware`: array of ASCII string literals of the firmware files of -/// the kernel module. +/// the kernel module. #[proc_macro] pub fn module(ts: TokenStream) -> TokenStream { module::module(ts) From 950aeefb34923fe3c28ade35fe05f24e2c5b1d55 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 17 Jul 2024 22:01:30 -0700 Subject: [PATCH 012/991] iommufd/device: Fix hwpt at err_unresv in iommufd_device_do_replace() The rewind routine should remove the reserved iovas added to the new hwpt. Fixes: 89db31635c87 ("iommufd: Derive iommufd_hwpt_paging from iommufd_hw_pagetable") Cc: stable@vger.kernel.org Link: https://patch.msgid.link/r/20240718050130.1956804-1-nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 9a7ec5997c61c5..3214a4c17c6b3b 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -526,7 +526,7 @@ iommufd_device_do_replace(struct iommufd_device *idev, err_unresv: if (hwpt_is_paging(hwpt)) iommufd_group_remove_reserved_iova(igroup, - to_hwpt_paging(old_hwpt)); + to_hwpt_paging(hwpt)); err_unlock: mutex_unlock(&idev->igroup->lock); return ERR_PTR(rc); From 0710c3d304f67f9b68f5082214e311ec8f82bd82 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Fri, 26 Jul 2024 13:18:25 +0200 Subject: [PATCH 013/991] dt-bindings: Batch-update Konrad Dybcio's email Use my @kernel.org address everywhere. Signed-off-by: Konrad Dybcio Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240726-topic-konrad_email-v1-3-f94665da2919@kernel.org Signed-off-by: Rob Herring (Arm) --- Documentation/devicetree/bindings/clock/qcom,dispcc-sm6350.yaml | 2 +- Documentation/devicetree/bindings/clock/qcom,gcc-msm8994.yaml | 2 +- Documentation/devicetree/bindings/clock/qcom,gcc-sm6125.yaml | 2 +- Documentation/devicetree/bindings/clock/qcom,gcc-sm6350.yaml | 2 +- Documentation/devicetree/bindings/clock/qcom,sm6115-gpucc.yaml | 2 +- Documentation/devicetree/bindings/clock/qcom,sm6125-gpucc.yaml | 2 +- Documentation/devicetree/bindings/clock/qcom,sm6350-camcc.yaml | 2 +- Documentation/devicetree/bindings/clock/qcom,sm6375-dispcc.yaml | 2 +- Documentation/devicetree/bindings/clock/qcom,sm6375-gcc.yaml | 2 +- Documentation/devicetree/bindings/clock/qcom,sm6375-gpucc.yaml | 2 +- .../devicetree/bindings/clock/qcom,sm8350-videocc.yaml | 2 +- Documentation/devicetree/bindings/clock/qcom,sm8450-gpucc.yaml | 2 +- .../devicetree/bindings/display/msm/qcom,sm6375-mdss.yaml | 2 +- .../bindings/display/panel/asus,z00t-tm5p5-nt35596.yaml | 2 +- .../devicetree/bindings/display/panel/sony,td4353-jdi.yaml | 2 +- .../devicetree/bindings/interconnect/qcom,sc7280-rpmh.yaml | 2 +- .../devicetree/bindings/interconnect/qcom,sc8280xp-rpmh.yaml | 2 +- .../devicetree/bindings/interconnect/qcom,sm8450-rpmh.yaml | 2 +- Documentation/devicetree/bindings/iommu/qcom,iommu.yaml | 2 +- .../devicetree/bindings/pinctrl/qcom,mdm9607-tlmm.yaml | 2 +- Documentation/devicetree/bindings/pinctrl/qcom,sm6350-tlmm.yaml | 2 +- Documentation/devicetree/bindings/pinctrl/qcom,sm6375-tlmm.yaml | 2 +- Documentation/devicetree/bindings/remoteproc/qcom,rpm-proc.yaml | 2 +- .../devicetree/bindings/soc/qcom/qcom,rpm-master-stats.yaml | 2 +- 24 files changed, 24 insertions(+), 24 deletions(-) diff --git a/Documentation/devicetree/bindings/clock/qcom,dispcc-sm6350.yaml b/Documentation/devicetree/bindings/clock/qcom,dispcc-sm6350.yaml index a584b4953e686d..46403b98411f81 100644 --- a/Documentation/devicetree/bindings/clock/qcom,dispcc-sm6350.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,dispcc-sm6350.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Display Clock & Reset Controller on SM6350 maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | Qualcomm display clock control module provides the clocks, resets and power diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-msm8994.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-msm8994.yaml index 6b9c1d198b14d0..10afe984e2fbc1 100644 --- a/Documentation/devicetree/bindings/clock/qcom,gcc-msm8994.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,gcc-msm8994.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Global Clock & Reset Controller on MSM8994 maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | Qualcomm global clock control module provides the clocks, resets and power diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-sm6125.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-sm6125.yaml index a5a29dc75ae11b..1fe68e07a2b208 100644 --- a/Documentation/devicetree/bindings/clock/qcom,gcc-sm6125.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,gcc-sm6125.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Global Clock & Reset Controller on SM6125 maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | Qualcomm global clock control module provides the clocks, resets and power diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-sm6350.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-sm6350.yaml index 2280b859b2ad63..78e232fa95dc60 100644 --- a/Documentation/devicetree/bindings/clock/qcom,gcc-sm6350.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,gcc-sm6350.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Global Clock & Reset Controller on SM6350 maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | Qualcomm global clock control module provides the clocks, resets and power diff --git a/Documentation/devicetree/bindings/clock/qcom,sm6115-gpucc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm6115-gpucc.yaml index cf19f44af77440..4ff17a91344bad 100644 --- a/Documentation/devicetree/bindings/clock/qcom,sm6115-gpucc.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,sm6115-gpucc.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Graphics Clock & Reset Controller on SM6115 maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | Qualcomm graphics clock control module provides clocks, resets and power diff --git a/Documentation/devicetree/bindings/clock/qcom,sm6125-gpucc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm6125-gpucc.yaml index 374a1844a159a0..10a9c96a97b6a1 100644 --- a/Documentation/devicetree/bindings/clock/qcom,sm6125-gpucc.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,sm6125-gpucc.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Graphics Clock & Reset Controller on SM6125 maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | Qualcomm graphics clock control module provides clocks and power domains on diff --git a/Documentation/devicetree/bindings/clock/qcom,sm6350-camcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm6350-camcc.yaml index fd6658cb793dbb..c03b30f64f359a 100644 --- a/Documentation/devicetree/bindings/clock/qcom,sm6350-camcc.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,sm6350-camcc.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Camera Clock & Reset Controller on SM6350 maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | Qualcomm camera clock control module provides the clocks, resets and power diff --git a/Documentation/devicetree/bindings/clock/qcom,sm6375-dispcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm6375-dispcc.yaml index 183b1c75dbdf3b..3cd422a645fd87 100644 --- a/Documentation/devicetree/bindings/clock/qcom,sm6375-dispcc.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,sm6375-dispcc.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Display Clock & Reset Controller on SM6375 maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | Qualcomm display clock control module provides the clocks, resets and power diff --git a/Documentation/devicetree/bindings/clock/qcom,sm6375-gcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm6375-gcc.yaml index 147b75a21508f1..de4e9066eeb835 100644 --- a/Documentation/devicetree/bindings/clock/qcom,sm6375-gcc.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,sm6375-gcc.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Global Clock & Reset Controller on SM6375 maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | Qualcomm global clock control module provides the clocks, resets and power diff --git a/Documentation/devicetree/bindings/clock/qcom,sm6375-gpucc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm6375-gpucc.yaml index cf4cad76f6c951..d9dd479c17bd63 100644 --- a/Documentation/devicetree/bindings/clock/qcom,sm6375-gpucc.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,sm6375-gpucc.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Graphics Clock & Reset Controller on SM6375 maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | Qualcomm graphics clock control module provides clocks, resets and power diff --git a/Documentation/devicetree/bindings/clock/qcom,sm8350-videocc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm8350-videocc.yaml index 46d1d91e3a01ef..5c2ecec0624e35 100644 --- a/Documentation/devicetree/bindings/clock/qcom,sm8350-videocc.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,sm8350-videocc.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm SM8350 Video Clock & Reset Controller maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | Qualcomm video clock control module provides the clocks, resets and power diff --git a/Documentation/devicetree/bindings/clock/qcom,sm8450-gpucc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm8450-gpucc.yaml index 3c2cac14e6c3a0..d10bb002906e9f 100644 --- a/Documentation/devicetree/bindings/clock/qcom,sm8450-gpucc.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,sm8450-gpucc.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Graphics Clock & Reset Controller on SM8450 maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | Qualcomm graphics clock control module provides the clocks, resets and power diff --git a/Documentation/devicetree/bindings/display/msm/qcom,sm6375-mdss.yaml b/Documentation/devicetree/bindings/display/msm/qcom,sm6375-mdss.yaml index 8e8a288d318c34..e22b4c433fd07c 100644 --- a/Documentation/devicetree/bindings/display/msm/qcom,sm6375-mdss.yaml +++ b/Documentation/devicetree/bindings/display/msm/qcom,sm6375-mdss.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm SM6375 Display MDSS maintainers: - - Konrad Dybcio + - Konrad Dybcio description: SM6375 MSM Mobile Display Subsystem (MDSS), which encapsulates sub-blocks diff --git a/Documentation/devicetree/bindings/display/panel/asus,z00t-tm5p5-nt35596.yaml b/Documentation/devicetree/bindings/display/panel/asus,z00t-tm5p5-nt35596.yaml index 2399cabf044c2f..dd614e077bbff3 100644 --- a/Documentation/devicetree/bindings/display/panel/asus,z00t-tm5p5-nt35596.yaml +++ b/Documentation/devicetree/bindings/display/panel/asus,z00t-tm5p5-nt35596.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: ASUS Z00T TM5P5 NT35596 5.5" 1080×1920 LCD Panel maintainers: - - Konrad Dybcio + - Konrad Dybcio description: |+ This panel seems to only be found in the Asus Z00T diff --git a/Documentation/devicetree/bindings/display/panel/sony,td4353-jdi.yaml b/Documentation/devicetree/bindings/display/panel/sony,td4353-jdi.yaml index 191b692125e145..032a989184ff05 100644 --- a/Documentation/devicetree/bindings/display/panel/sony,td4353-jdi.yaml +++ b/Documentation/devicetree/bindings/display/panel/sony,td4353-jdi.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Sony TD4353 JDI 5 / 5.7" 2160x1080 MIPI-DSI Panel maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | The Sony TD4353 JDI is a 5 (XZ2c) / 5.7 (XZ2) inch 2160x1080 diff --git a/Documentation/devicetree/bindings/interconnect/qcom,sc7280-rpmh.yaml b/Documentation/devicetree/bindings/interconnect/qcom,sc7280-rpmh.yaml index 9fce7203bd42db..78210791496f84 100644 --- a/Documentation/devicetree/bindings/interconnect/qcom,sc7280-rpmh.yaml +++ b/Documentation/devicetree/bindings/interconnect/qcom,sc7280-rpmh.yaml @@ -8,7 +8,7 @@ title: Qualcomm RPMh Network-On-Chip Interconnect on SC7280 maintainers: - Bjorn Andersson - - Konrad Dybcio + - Konrad Dybcio description: | RPMh interconnect providers support system bandwidth requirements through diff --git a/Documentation/devicetree/bindings/interconnect/qcom,sc8280xp-rpmh.yaml b/Documentation/devicetree/bindings/interconnect/qcom,sc8280xp-rpmh.yaml index 6c2da03f0cd224..100c6863690926 100644 --- a/Documentation/devicetree/bindings/interconnect/qcom,sc8280xp-rpmh.yaml +++ b/Documentation/devicetree/bindings/interconnect/qcom,sc8280xp-rpmh.yaml @@ -8,7 +8,7 @@ title: Qualcomm RPMh Network-On-Chip Interconnect on SC8280XP maintainers: - Bjorn Andersson - - Konrad Dybcio + - Konrad Dybcio description: | RPMh interconnect providers support system bandwidth requirements through diff --git a/Documentation/devicetree/bindings/interconnect/qcom,sm8450-rpmh.yaml b/Documentation/devicetree/bindings/interconnect/qcom,sm8450-rpmh.yaml index 3cff7e66225538..300640a533dd67 100644 --- a/Documentation/devicetree/bindings/interconnect/qcom,sm8450-rpmh.yaml +++ b/Documentation/devicetree/bindings/interconnect/qcom,sm8450-rpmh.yaml @@ -8,7 +8,7 @@ title: Qualcomm RPMh Network-On-Chip Interconnect on SM8450 maintainers: - Bjorn Andersson - - Konrad Dybcio + - Konrad Dybcio description: | RPMh interconnect providers support system bandwidth requirements through diff --git a/Documentation/devicetree/bindings/iommu/qcom,iommu.yaml b/Documentation/devicetree/bindings/iommu/qcom,iommu.yaml index 571e5746d1776f..f8cebc9e8cd9d4 100644 --- a/Documentation/devicetree/bindings/iommu/qcom,iommu.yaml +++ b/Documentation/devicetree/bindings/iommu/qcom,iommu.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Technologies legacy IOMMU implementations maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | Qualcomm "B" family devices which are not compatible with arm-smmu have diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,mdm9607-tlmm.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,mdm9607-tlmm.yaml index bd3cbb44c99a46..e75393b3d196c5 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,mdm9607-tlmm.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,mdm9607-tlmm.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Technologies, Inc. MDM9607 TLMM block maintainers: - - Konrad Dybcio + - Konrad Dybcio description: Top Level Mode Multiplexer pin controller in Qualcomm MDM9607 SoC. diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm6350-tlmm.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sm6350-tlmm.yaml index a4771f87d93645..b262af6be97da3 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sm6350-tlmm.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm6350-tlmm.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Technologies, Inc. SM6350 TLMM block maintainers: - - Konrad Dybcio + - Konrad Dybcio description: Top Level Mode Multiplexer pin controller in Qualcomm SM6350 SoC. diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm6375-tlmm.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sm6375-tlmm.yaml index 047f82863f9bbf..c11af09c3f5b89 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sm6375-tlmm.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm6375-tlmm.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Technologies, Inc. SM6375 TLMM block maintainers: - - Konrad Dybcio + - Konrad Dybcio description: Top Level Mode Multiplexer pin controller in Qualcomm SM6375 SoC. diff --git a/Documentation/devicetree/bindings/remoteproc/qcom,rpm-proc.yaml b/Documentation/devicetree/bindings/remoteproc/qcom,rpm-proc.yaml index 7afafde17a38bf..61cf4fe19ca53e 100644 --- a/Documentation/devicetree/bindings/remoteproc/qcom,rpm-proc.yaml +++ b/Documentation/devicetree/bindings/remoteproc/qcom,rpm-proc.yaml @@ -8,7 +8,7 @@ title: Qualcomm Resource Power Manager (RPM) Processor/Subsystem maintainers: - Bjorn Andersson - - Konrad Dybcio + - Konrad Dybcio - Stephan Gerhold description: | diff --git a/Documentation/devicetree/bindings/soc/qcom/qcom,rpm-master-stats.yaml b/Documentation/devicetree/bindings/soc/qcom/qcom,rpm-master-stats.yaml index 9410404f87f1af..ad2dcc39a5f548 100644 --- a/Documentation/devicetree/bindings/soc/qcom/qcom,rpm-master-stats.yaml +++ b/Documentation/devicetree/bindings/soc/qcom/qcom,rpm-master-stats.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Technologies, Inc. (QTI) RPM Master Stats maintainers: - - Konrad Dybcio + - Konrad Dybcio description: | The Qualcomm RPM (Resource Power Manager) architecture includes a concept From 30f593fa0088b89f479f7358640687b3cbca93d4 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 22 Jul 2024 11:42:42 +0200 Subject: [PATCH 014/991] arm64: dts: qcom: x1e80100-crd: fix PCIe4 PHY supply The PCIe4 PHY is powered by vreg_l3i (not vreg_l3j). Fixes: d7e03cce0400 ("arm64: dts: qcom: x1e80100-crd: Enable more support") Cc: stable@vger.kernel.org # 6.9 Signed-off-by: Johan Hovold Reviewed-by: Abel Vesa Link: https://lore.kernel.org/r/20240722094249.26471-2-johan+linaro@kernel.org Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/x1e80100-crd.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/qcom/x1e80100-crd.dts b/arch/arm64/boot/dts/qcom/x1e80100-crd.dts index 6152bcd0bc1f0a..dabc9362c72cb2 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-crd.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-crd.dts @@ -760,7 +760,7 @@ }; &pcie4_phy { - vdda-phy-supply = <&vreg_l3j_0p8>; + vdda-phy-supply = <&vreg_l3i_0p8>; vdda-pll-supply = <&vreg_l3e_1p2>; status = "okay"; From f8fa1f2f6412bffa71972f9506b72992d0e6e485 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 22 Jul 2024 11:42:43 +0200 Subject: [PATCH 015/991] arm64: dts: qcom: x1e80100: fix PCIe domain numbers The current PCIe domain numbers are off by one and do not match the numbers that the UEFI firmware (and Windows) uses. Fixes: 5eb83fc10289 ("arm64: dts: qcom: x1e80100: Add PCIe nodes") Cc: stable@vger.kernel.org # 6.9 Reviewed-by: Konrad Dybcio Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20240722094249.26471-3-johan+linaro@kernel.org Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/x1e80100.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/qcom/x1e80100.dtsi b/arch/arm64/boot/dts/qcom/x1e80100.dtsi index 70eeacd4f9adfc..626fb2565cf422 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100.dtsi +++ b/arch/arm64/boot/dts/qcom/x1e80100.dtsi @@ -2901,7 +2901,7 @@ dma-coherent; - linux,pci-domain = <7>; + linux,pci-domain = <6>; num-lanes = <2>; interrupts = , @@ -3022,7 +3022,7 @@ dma-coherent; - linux,pci-domain = <5>; + linux,pci-domain = <4>; num-lanes = <2>; interrupts = , From 98abf2fbd179017833c38edc9f3b587c69d07e2a Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 22 Jul 2024 11:42:44 +0200 Subject: [PATCH 016/991] arm64: dts: qcom: x1e80100: add missing PCIe minimum OPP Add the missing PCIe CX performance level votes to avoid relying on other drivers (e.g. USB) to maintain the nominal performance level required for Gen3 speeds. Fixes: 5eb83fc10289 ("arm64: dts: qcom: x1e80100: Add PCIe nodes") Cc: stable@vger.kernel.org # 6.9 Signed-off-by: Johan Hovold Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20240722094249.26471-4-johan+linaro@kernel.org Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/x1e80100.dtsi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/boot/dts/qcom/x1e80100.dtsi b/arch/arm64/boot/dts/qcom/x1e80100.dtsi index 626fb2565cf422..c13811a4ef909d 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100.dtsi +++ b/arch/arm64/boot/dts/qcom/x1e80100.dtsi @@ -2959,6 +2959,7 @@ "link_down"; power-domains = <&gcc GCC_PCIE_6A_GDSC>; + required-opps = <&rpmhpd_opp_nom>; phys = <&pcie6a_phy>; phy-names = "pciephy"; @@ -3080,6 +3081,7 @@ "link_down"; power-domains = <&gcc GCC_PCIE_4_GDSC>; + required-opps = <&rpmhpd_opp_nom>; phys = <&pcie4_phy>; phy-names = "pciephy"; From 6e3902c499544291ac4fd1a1bb69f2e9037a0e86 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 22 Jul 2024 11:42:45 +0200 Subject: [PATCH 017/991] arm64: dts: qcom: x1e80100-crd: fix up PCIe6a pinctrl node The PCIe6a pinctrl node appears to have been copied from the sc8280xp CRD dts, which has the NVMe on pcie2a and uses some funny indentation. Fix up the node name to match the x1e80100 use and label and use only tabs for indentation. Reviewed-by: Konrad Dybcio Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20240722094249.26471-5-johan+linaro@kernel.org Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/x1e80100-crd.dts | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/boot/dts/qcom/x1e80100-crd.dts b/arch/arm64/boot/dts/qcom/x1e80100-crd.dts index dabc9362c72cb2..85e32101a47164 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-crd.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-crd.dts @@ -931,7 +931,7 @@ bias-disable; }; - pcie6a_default: pcie2a-default-state { + pcie6a_default: pcie6a-default-state { clkreq-n-pins { pins = "gpio153"; function = "pcie6a_clk"; @@ -947,11 +947,11 @@ }; wake-n-pins { - pins = "gpio154"; - function = "gpio"; - drive-strength = <2>; - bias-pull-up; - }; + pins = "gpio154"; + function = "gpio"; + drive-strength = <2>; + bias-pull-up; + }; }; tpad_default: tpad-default-state { From 8a6e1dbf1362e78081e71b2690750e9556136f26 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 22 Jul 2024 11:42:46 +0200 Subject: [PATCH 018/991] arm64: dts: qcom: x1e80100-crd: disable PCIe6a perst pull down Disable the PCIe6a perst pull-down resistor to save some power. Reviewed-by: Konrad Dybcio Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20240722094249.26471-6-johan+linaro@kernel.org Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/x1e80100-crd.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/qcom/x1e80100-crd.dts b/arch/arm64/boot/dts/qcom/x1e80100-crd.dts index 85e32101a47164..aeb279b1a0ccd7 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-crd.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-crd.dts @@ -943,7 +943,7 @@ pins = "gpio152"; function = "gpio"; drive-strength = <2>; - bias-pull-down; + bias-disable; }; wake-n-pins { From 42b33ad188466292eaac9825544b8be8deddb3cb Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 22 Jul 2024 11:42:47 +0200 Subject: [PATCH 019/991] arm64: dts: qcom: x1e80100-crd: fix missing PCIe4 gpios Add the missing PCIe4 perst, wake and clkreq GPIOs and pin config. Fixes: d7e03cce0400 ("arm64: dts: qcom: x1e80100-crd: Enable more support") Cc: stable@vger.kernel.org # 6.9 Reviewed-by: Konrad Dybcio Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20240722094249.26471-7-johan+linaro@kernel.org Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/x1e80100-crd.dts | 29 +++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/arch/arm64/boot/dts/qcom/x1e80100-crd.dts b/arch/arm64/boot/dts/qcom/x1e80100-crd.dts index aeb279b1a0ccd7..d65a22172006bb 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-crd.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-crd.dts @@ -756,6 +756,12 @@ }; &pcie4 { + perst-gpios = <&tlmm 146 GPIO_ACTIVE_LOW>; + wake-gpios = <&tlmm 148 GPIO_ACTIVE_LOW>; + + pinctrl-0 = <&pcie4_default>; + pinctrl-names = "default"; + status = "okay"; }; @@ -931,6 +937,29 @@ bias-disable; }; + pcie4_default: pcie4-default-state { + clkreq-n-pins { + pins = "gpio147"; + function = "pcie4_clk"; + drive-strength = <2>; + bias-pull-up; + }; + + perst-n-pins { + pins = "gpio146"; + function = "gpio"; + drive-strength = <2>; + bias-disable; + }; + + wake-n-pins { + pins = "gpio148"; + function = "gpio"; + drive-strength = <2>; + bias-pull-up; + }; + }; + pcie6a_default: pcie6a-default-state { clkreq-n-pins { pins = "gpio153"; From f03dd49f884f428ba71efe23383ff842f4f15e0e Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 22 Jul 2024 11:54:48 +0200 Subject: [PATCH 020/991] arm64: dts: qcom: x1e80100-qcp: fix PCIe4 PHY supply The PCIe4 PHY is powered by vreg_l3i (not vreg_l3j) on the CRD so assume the same applies to the QCP. Fixes: f9a9c11471da ("arm64: dts: qcom: x1e80100-qcp: Enable more support") Cc: stable@vger.kernel.org # 6.9 Signed-off-by: Johan Hovold Reviewed-by: Konrad Dybcio Reviewed-by: Abel Vesa Link: https://lore.kernel.org/r/20240722095459.27437-2-johan+linaro@kernel.org Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/x1e80100-qcp.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts b/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts index 72a4f4138616a1..ebfcccbb55e835 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts @@ -664,7 +664,7 @@ }; &pcie4_phy { - vdda-phy-supply = <&vreg_l3j_0p8>; + vdda-phy-supply = <&vreg_l3i_0p8>; vdda-pll-supply = <&vreg_l3e_1p2>; status = "okay"; From 0aab6eaac72ac140dfc5e0a38bf3178497762e43 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 22 Jul 2024 11:54:49 +0200 Subject: [PATCH 021/991] arm64: dts: qcom: x1e80100-qcp: fix up PCIe6a pinctrl node The PCIe6a pinctrl node appears to have been copied from the sc8280xp CRD dts, which has the NVMe on pcie2a and uses some funny indentation. Fix up the node name to match the x1e80100 use and label and use only tabs for indentation. Signed-off-by: Johan Hovold Reviewed-by: Konrad Dybcio Reviewed-by: Abel Vesa Link: https://lore.kernel.org/r/20240722095459.27437-3-johan+linaro@kernel.org Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/x1e80100-qcp.dts | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts b/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts index ebfcccbb55e835..b067d7841d7226 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts @@ -804,7 +804,7 @@ bias-disable; }; - pcie6a_default: pcie2a-default-state { + pcie6a_default: pcie6a-default-state { clkreq-n-pins { pins = "gpio153"; function = "pcie6a_clk"; @@ -820,11 +820,11 @@ }; wake-n-pins { - pins = "gpio154"; - function = "gpio"; - drive-strength = <2>; - bias-pull-up; - }; + pins = "gpio154"; + function = "gpio"; + drive-strength = <2>; + bias-pull-up; + }; }; wcd_default: wcd-reset-n-active-state { From 12661b333374c892f9053261b4bceb346a709ea4 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 22 Jul 2024 11:54:50 +0200 Subject: [PATCH 022/991] arm64: dts: qcom: x1e80100-qcp: disable PCIe6a perst pull down Disable the PCIe6a perst pull-down resistor to save some power. Signed-off-by: Johan Hovold Reviewed-by: Konrad Dybcio Reviewed-by: Abel Vesa Link: https://lore.kernel.org/r/20240722095459.27437-4-johan+linaro@kernel.org Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/x1e80100-qcp.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts b/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts index b067d7841d7226..653673e423bf77 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts @@ -816,7 +816,7 @@ pins = "gpio152"; function = "gpio"; drive-strength = <2>; - bias-pull-down; + bias-disable; }; wake-n-pins { From 2ac90e4d2b6d6823ca10642ef39595ff1181c3fa Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 22 Jul 2024 11:54:51 +0200 Subject: [PATCH 023/991] arm64: dts: qcom: x1e80100-qcp: fix missing PCIe4 gpios Add the missing PCIe4 perst, wake and clkreq GPIOs and pin config. Fixes: f9a9c11471da ("arm64: dts: qcom: x1e80100-qcp: Enable more support") Cc: stable@vger.kernel.org # 6.9 Signed-off-by: Johan Hovold Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20240722095459.27437-5-johan+linaro@kernel.org Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/x1e80100-qcp.dts | 29 +++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts b/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts index 653673e423bf77..2dcf2a17511dbf 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts @@ -660,6 +660,12 @@ }; &pcie4 { + perst-gpios = <&tlmm 146 GPIO_ACTIVE_LOW>; + wake-gpios = <&tlmm 148 GPIO_ACTIVE_LOW>; + + pinctrl-0 = <&pcie4_default>; + pinctrl-names = "default"; + status = "okay"; }; @@ -804,6 +810,29 @@ bias-disable; }; + pcie4_default: pcie4-default-state { + clkreq-n-pins { + pins = "gpio147"; + function = "pcie4_clk"; + drive-strength = <2>; + bias-pull-up; + }; + + perst-n-pins { + pins = "gpio146"; + function = "gpio"; + drive-strength = <2>; + bias-disable; + }; + + wake-n-pins { + pins = "gpio148"; + function = "gpio"; + drive-strength = <2>; + bias-pull-up; + }; + }; + pcie6a_default: pcie6a-default-state { clkreq-n-pins { pins = "gpio153"; From e89fe0596c62363082cabbaa5ccb38989e714e68 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 22 Jul 2024 11:54:52 +0200 Subject: [PATCH 024/991] arm64: dts: qcom: x1e80100-vivobook-s15: fix PCIe4 PHY supply The PCIe4 PHY is powered by vreg_l3i (not vreg_l3j) on the CRD reference design so assume the same applies to the Asus Vivobook S15. Fixes: d0e2f8f62dff ("arm64: dts: qcom: Add device tree for ASUS Vivobook S 15") Signed-off-by: Johan Hovold Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20240722095459.27437-6-johan+linaro@kernel.org Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts b/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts index 7fb980fcb30752..f7337251349b18 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts @@ -278,6 +278,13 @@ vdd-l3-supply = <&vreg_s1f_0p7>; vdd-s1-supply = <&vph_pwr>; vdd-s2-supply = <&vph_pwr>; + + vreg_l3i_0p8: ldo3 { + regulator-name = "vreg_l3i_0p8"; + regulator-min-microvolt = <880000>; + regulator-max-microvolt = <920000>; + regulator-initial-mode = ; + }; }; regulators-7 { @@ -427,7 +434,7 @@ }; &pcie4_phy { - vdda-phy-supply = <&vreg_l3j_0p8>; + vdda-phy-supply = <&vreg_l3i_0p8>; vdda-pll-supply = <&vreg_l3e_1p2>; status = "okay"; From c67b3dfd8d69164f70ab3aaff889fca1e536c909 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 22 Jul 2024 11:54:53 +0200 Subject: [PATCH 025/991] arm64: dts: qcom: x1e80100-vivobook-s15: fix up PCIe6a pinctrl node The PCIe6a pinctrl node appears to have been copied from the sc8280xp CRD dts (via the x1e80100 CRD dts), which has the NVMe on pcie2a. Fix up the node name to match the x1e80100 use and label. Signed-off-by: Johan Hovold Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20240722095459.27437-7-johan+linaro@kernel.org Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts b/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts index f7337251349b18..ff51dd98351ce0 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts @@ -524,7 +524,7 @@ bias-disable; }; - pcie6a_default: pcie2a-default-state { + pcie6a_default: pcie6a-default-state { clkreq-n-pins { pins = "gpio153"; function = "pcie6a_clk"; From d7ff5d1868d1cfd1c06a601a7cfa2dbb6dba4be9 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 22 Jul 2024 11:54:54 +0200 Subject: [PATCH 026/991] arm64: dts: qcom: x1e80100-vivobook-s15: disable PCIe6a perst pull down Disable the PCIe6a perst pull-down resistor to save some power. Signed-off-by: Johan Hovold Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20240722095459.27437-8-johan+linaro@kernel.org Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts b/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts index ff51dd98351ce0..1eb0abcbf650aa 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts @@ -536,7 +536,7 @@ pins = "gpio152"; function = "gpio"; drive-strength = <2>; - bias-pull-down; + bias-disable; }; wake-n-pins { From e7f3f3cbbfef84729ad6c10eb589957e7b28b95a Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 22 Jul 2024 11:54:55 +0200 Subject: [PATCH 027/991] arm64: dts: qcom: x1e80100-vivobook-s15: fix missing PCIe4 gpios Add the missing PCIe4 perst, wake and clkreq GPIOs and pin config. Fixes: d0e2f8f62dff ("arm64: dts: qcom: Add device tree for ASUS Vivobook S 15") Signed-off-by: Johan Hovold Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20240722095459.27437-9-johan+linaro@kernel.org Signed-off-by: Bjorn Andersson --- .../dts/qcom/x1e80100-asus-vivobook-s15.dts | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts b/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts index 1eb0abcbf650aa..9caa14dda58552 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts @@ -430,6 +430,12 @@ }; &pcie4 { + perst-gpios = <&tlmm 146 GPIO_ACTIVE_LOW>; + wake-gpios = <&tlmm 148 GPIO_ACTIVE_LOW>; + + pinctrl-0 = <&pcie4_default>; + pinctrl-names = "default"; + status = "okay"; }; @@ -524,6 +530,29 @@ bias-disable; }; + pcie4_default: pcie4-default-state { + clkreq-n-pins { + pins = "gpio147"; + function = "pcie4_clk"; + drive-strength = <2>; + bias-pull-up; + }; + + perst-n-pins { + pins = "gpio146"; + function = "gpio"; + drive-strength = <2>; + bias-disable; + }; + + wake-n-pins { + pins = "gpio148"; + function = "gpio"; + drive-strength = <2>; + bias-pull-up; + }; + }; + pcie6a_default: pcie6a-default-state { clkreq-n-pins { pins = "gpio153"; From b90567c262fc3a3e703f3091499dec799a6147ab Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 22 Jul 2024 11:54:56 +0200 Subject: [PATCH 028/991] arm64: dts: qcom: x1e80100-yoga-slim7x: fix PCIe4 PHY supply The PCIe4 PHY is powered by vreg_l3i (not vreg_l3j) on the CRD reference design so assume the same applies to the Lenovo Yoga Slim 7x. Fixes: 45247fe17db2 ("arm64: dts: qcom: x1e80100: add Lenovo Thinkpad Yoga slim 7x devicetree") Signed-off-by: Johan Hovold Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20240722095459.27437-10-johan+linaro@kernel.org Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts b/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts index f569f0fbd1fc3b..6902548974d0e3 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts @@ -629,7 +629,7 @@ }; &pcie4_phy { - vdda-phy-supply = <&vreg_l3j_0p8>; + vdda-phy-supply = <&vreg_l3i_0p8>; vdda-pll-supply = <&vreg_l3e_1p2>; status = "okay"; From a655dacf2a35a35eadd95f0ba8fe9cf70359eeb9 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 22 Jul 2024 11:54:57 +0200 Subject: [PATCH 029/991] arm64: dts: qcom: x1e80100-yoga-slim7x: fix up PCIe6a pinctrl node The PCIe6a pinctrl node appears to have been copied from the sc8280xp CRD dts (via the x1e80100 CRD dts), which has the NVMe on pcie2a and uses some funny indentation. Fix up the node name to match the x1e80100 use and label and use only tabs for indentation. Signed-off-by: Johan Hovold Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20240722095459.27437-11-johan+linaro@kernel.org Signed-off-by: Bjorn Andersson --- .../boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts b/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts index 6902548974d0e3..ad96bb8d5400fc 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts @@ -791,7 +791,7 @@ bias-disable; }; - pcie6a_default: pcie2a-default-state { + pcie6a_default: pcie6a-default-state { clkreq-n-pins { pins = "gpio153"; function = "pcie6a_clk"; @@ -807,11 +807,11 @@ }; wake-n-pins { - pins = "gpio154"; - function = "gpio"; - drive-strength = <2>; - bias-pull-up; - }; + pins = "gpio154"; + function = "gpio"; + drive-strength = <2>; + bias-pull-up; + }; }; tpad_default: tpad-default-state { From 750b8a3b5a4476cf000f3db1fe46293c97fcd979 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 22 Jul 2024 11:54:58 +0200 Subject: [PATCH 030/991] arm64: dts: qcom: x1e80100-yoga-slim7x: disable PCIe6a perst pull down Disable the PCIe6a perst pull-down resistor to save some power. Signed-off-by: Johan Hovold Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20240722095459.27437-12-johan+linaro@kernel.org Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts b/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts index ad96bb8d5400fc..48a7b8eb98296f 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts @@ -803,7 +803,7 @@ pins = "gpio152"; function = "gpio"; drive-strength = <2>; - bias-pull-down; + bias-disable; }; wake-n-pins { From 86c71c0e893d58447e4a9e5c0d1c2c0f89c1b9e1 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 22 Jul 2024 11:54:59 +0200 Subject: [PATCH 031/991] arm64: dts: qcom: x1e80100-yoga-slim7x: fix missing PCIe4 gpios Add the missing PCIe4 perst, wake and clkreq GPIOs and pin config. Fixes: 45247fe17db2 ("arm64: dts: qcom: x1e80100: add Lenovo Thinkpad Yoga slim 7x devicetree") Signed-off-by: Johan Hovold Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20240722095459.27437-13-johan+linaro@kernel.org Signed-off-by: Bjorn Andersson --- .../dts/qcom/x1e80100-lenovo-yoga-slim7x.dts | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts b/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts index 48a7b8eb98296f..1943bdbfb8c00c 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts @@ -625,6 +625,12 @@ }; &pcie4 { + perst-gpios = <&tlmm 146 GPIO_ACTIVE_LOW>; + wake-gpios = <&tlmm 148 GPIO_ACTIVE_LOW>; + + pinctrl-0 = <&pcie4_default>; + pinctrl-names = "default"; + status = "okay"; }; @@ -791,6 +797,29 @@ bias-disable; }; + pcie4_default: pcie4-default-state { + clkreq-n-pins { + pins = "gpio147"; + function = "pcie4_clk"; + drive-strength = <2>; + bias-pull-up; + }; + + perst-n-pins { + pins = "gpio146"; + function = "gpio"; + drive-strength = <2>; + bias-disable; + }; + + wake-n-pins { + pins = "gpio148"; + function = "gpio"; + drive-strength = <2>; + bias-pull-up; + }; + }; + pcie6a_default: pcie6a-default-state { clkreq-n-pins { pins = "gpio153"; From a0e6fbf22439f796b51ea583a68eb763b0a99393 Mon Sep 17 00:00:00 2001 From: Stephan Gerhold Date: Mon, 15 Jul 2024 14:15:39 +0200 Subject: [PATCH 032/991] arm64: dts: qcom: x1e80100-crd: Fix backlight The backlight does not work correctly with the current display panel configuration: It works after boot, but once the display gets disabled it is not possible to get it back on. It turns out that the ATNA45AF01 panel needs exactly the same non-standard power sequence as implemented by the panel-samsung-atna33xc20 driver for sc7180-trogdor-homestar. Switch the panel in the DT to the new compatible and make two more changes to make it work correctly: 1. Add the missing GPIO for the panel EL_ON3 line (EDP_BL_EN on CRD and enable-gpios in the DT). 2. Drop the regulator-always-on for the panel regulator. The panel does not seem to power off properly if the regulator stays on. Fixes: d7e03cce0400 ("arm64: dts: qcom: x1e80100-crd: Enable more support") Reviewed-by: Konrad Dybcio Signed-off-by: Stephan Gerhold Reviewed-by: Johan Hovold Tested-by: Johan Hovold Link: https://lore.kernel.org/r/20240715-x1e80100-crd-backlight-v2-3-31b7f2f658a3@linaro.org Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/x1e80100-crd.dts | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/qcom/x1e80100-crd.dts b/arch/arm64/boot/dts/qcom/x1e80100-crd.dts index d65a22172006bb..82f34dfe409057 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-crd.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-crd.dts @@ -268,7 +268,6 @@ pinctrl-0 = <&edp_reg_en>; pinctrl-names = "default"; - regulator-always-on; regulator-boot-on; }; @@ -724,9 +723,13 @@ aux-bus { panel { - compatible = "edp-panel"; + compatible = "samsung,atna45af01", "samsung,atna33xc20"; + enable-gpios = <&pmc8380_3_gpios 4 GPIO_ACTIVE_HIGH>; power-supply = <&vreg_edp_3p3>; + pinctrl-0 = <&edp_bl_en>; + pinctrl-names = "default"; + port { edp_panel_in: endpoint { remote-endpoint = <&mdss_dp3_out>; @@ -791,6 +794,16 @@ status = "okay"; }; +&pmc8380_3_gpios { + edp_bl_en: edp-bl-en-state { + pins = "gpio4"; + function = "normal"; + power-source = <1>; /* 1.8V */ + input-disable; + output-enable; + }; +}; + &qupv3_0 { status = "okay"; }; From 72c93f3e0dcdc05fceafcb32e79352a45716d181 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Fri, 26 Jul 2024 13:18:23 +0200 Subject: [PATCH 033/991] mailmap: Add an entry for Konrad Dybcio Map my old addresses. Signed-off-by: Konrad Dybcio Signed-off-by: Konrad Dybcio Acked-by: Neil Armstrong Link: https://lore.kernel.org/r/20240726-topic-konrad_email-v1-1-f94665da2919@kernel.org Signed-off-by: Bjorn Andersson --- .mailmap | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index e51d76df75c2de..d189c64246979d 100644 --- a/.mailmap +++ b/.mailmap @@ -353,6 +353,8 @@ Kenneth Westfield Kiran Gunda Kirill Tkhai Kishon Vijay Abraham I +Konrad Dybcio +Konrad Dybcio Konstantin Khlebnikov Konstantin Khlebnikov Koushik From fce6a1eefb2a1db706fa17ca21e3e7107811d2e8 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Fri, 26 Jul 2024 13:18:24 +0200 Subject: [PATCH 034/991] MAINTAINERS: Update Konrad Dybcio's email address Use my @kernel.org address everywhere. Signed-off-by: Konrad Dybcio Signed-off-by: Konrad Dybcio Acked-by: Neil Armstrong Link: https://lore.kernel.org/r/20240726-topic-konrad_email-v1-2-f94665da2919@kernel.org Signed-off-by: Bjorn Andersson --- MAINTAINERS | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 42decde3832066..7b599269a8214c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2745,7 +2745,7 @@ F: include/linux/soc/qcom/ ARM/QUALCOMM SUPPORT M: Bjorn Andersson -M: Konrad Dybcio +M: Konrad Dybcio L: linux-arm-msm@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/qcom/linux.git @@ -7106,7 +7106,7 @@ F: drivers/gpu/drm/tiny/panel-mipi-dbi.c DRM DRIVER for Qualcomm Adreno GPUs M: Rob Clark R: Sean Paul -R: Konrad Dybcio +R: Konrad Dybcio L: linux-arm-msm@vger.kernel.org L: dri-devel@lists.freedesktop.org L: freedreno@lists.freedesktop.org @@ -18771,7 +18771,7 @@ F: include/uapi/drm/qaic_accel.h QUALCOMM CORE POWER REDUCTION (CPR) AVS DRIVER M: Bjorn Andersson -M: Konrad Dybcio +M: Konrad Dybcio L: linux-pm@vger.kernel.org L: linux-arm-msm@vger.kernel.org S: Maintained From e1cf752ede8e82c2d084868c50a1ca6cdb07c9c4 Mon Sep 17 00:00:00 2001 From: Francesco Dolcini Date: Wed, 12 Jun 2024 11:29:34 +0200 Subject: [PATCH 035/991] dt-bindings: eeprom: at25: add fujitsu,mb85rs256 compatible The fujitsu,mb85rs256 is a 256 Kbit SPI memory FRAM in the same family as the two existing fujitsu,mb85rs* compatibles and at25 compatible. Signed-off-by: Francesco Dolcini Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240612092934.12282-1-francesco@dolcini.it Signed-off-by: Rob Herring (Arm) --- Documentation/devicetree/bindings/eeprom/at25.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/eeprom/at25.yaml b/Documentation/devicetree/bindings/eeprom/at25.yaml index 1715b0c9feeafe..c31e5e71952501 100644 --- a/Documentation/devicetree/bindings/eeprom/at25.yaml +++ b/Documentation/devicetree/bindings/eeprom/at25.yaml @@ -28,6 +28,7 @@ properties: - anvo,anv32e61w - atmel,at25256B - fujitsu,mb85rs1mt + - fujitsu,mb85rs256 - fujitsu,mb85rs64 - microchip,at25160bn - microchip,25lc040 From 684890a0185dabf5920c43b639133adc4c2632cf Mon Sep 17 00:00:00 2001 From: John Keeping Date: Wed, 31 Jul 2024 10:33:09 +0100 Subject: [PATCH 036/991] Input: adc-joystick - fix optional value handling The abs-fuzz and abs-flat properties are documented as optional. When these are absent, fwnode_property_read_u32() will leave the input unchanged, meaning that an axis either picks up the value for the previous axis or an uninitialized value. Explicitly set these values to zero when they are unspecified to match the documented behaviour in the device tree bindings. Signed-off-by: John Keeping Link: https://lore.kernel.org/r/20240731093310.3696919-1-jkeeping@inmusicbrands.com Signed-off-by: Dmitry Torokhov --- drivers/input/joystick/adc-joystick.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/input/joystick/adc-joystick.c b/drivers/input/joystick/adc-joystick.c index 5f46a7104b52a4..de1fa4cf291b20 100644 --- a/drivers/input/joystick/adc-joystick.c +++ b/drivers/input/joystick/adc-joystick.c @@ -182,8 +182,11 @@ static int adc_joystick_set_axes(struct device *dev, struct adc_joystick *joy) swap(range[0], range[1]); } - fwnode_property_read_u32(child, "abs-fuzz", &fuzz); - fwnode_property_read_u32(child, "abs-flat", &flat); + if (fwnode_property_read_u32(child, "abs-fuzz", &fuzz)) + fuzz = 0; + + if (fwnode_property_read_u32(child, "abs-flat", &flat)) + flat = 0; input_set_abs_params(joy->input, axes[i].code, range[0], range[1], fuzz, flat); From 39a3396558fb97e6e7d4c1eb04c2166da31904a9 Mon Sep 17 00:00:00 2001 From: Drew Fustini Date: Tue, 30 Jul 2024 23:14:40 -0700 Subject: [PATCH 037/991] clk: thead: fix dependency on clk_ignore_unused Add the CLK_IGNORE_UNUSED flag to the vp-axi clock (CLK_VP_AXI) to avoid depending on clk_ignore_unused in the cmdline. Without this fix, the emmc-sdio clock (CLK_EMMC_SDIO) fails to work after vp-axi is disabled. Signed-off-by: Drew Fustini Link: https://lore.kernel.org/r/20240731061439.3807172-1-drew@pdp7.com Fixes: ae81b69fd2b1 ("clk: thead: Add support for T-Head TH1520 AP_SUBSYS clocks") Signed-off-by: Stephen Boyd --- drivers/clk/thead/clk-th1520-ap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clk/thead/clk-th1520-ap.c b/drivers/clk/thead/clk-th1520-ap.c index cbc176b27c091e..17e32ae08720cb 100644 --- a/drivers/clk/thead/clk-th1520-ap.c +++ b/drivers/clk/thead/clk-th1520-ap.c @@ -738,7 +738,7 @@ static struct ccu_div vp_axi_clk = { .hw.init = CLK_HW_INIT_PARENTS_HW("vp-axi", video_pll_clk_parent, &ccu_div_ops, - 0), + CLK_IGNORE_UNUSED), }, }; From 9374ae912dbb1eed8139ed75fd2c0f1b30ca454d Mon Sep 17 00:00:00 2001 From: Mengqi Zhang Date: Tue, 16 Jul 2024 09:37:04 +0800 Subject: [PATCH 038/991] mmc: mtk-sd: receive cmd8 data when hs400 tuning fail When we use cmd8 as the tuning command in hs400 mode, the command response sent back by some eMMC devices cannot be correctly sampled by MTK eMMC controller at some weak sample timing. In this case, command timeout error may occur. So we must receive the following data to make sure the next cmd8 send correctly. Signed-off-by: Mengqi Zhang Fixes: c4ac38c6539b ("mmc: mtk-sd: Add HS400 online tuning support") Cc: stable@vger.stable.com Link: https://lore.kernel.org/r/20240716013704.10578-1-mengqi.zhang@mediatek.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/mtk-sd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index a94835b8ab9394..e386f78e326790 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -1230,7 +1230,7 @@ static bool msdc_cmd_done(struct msdc_host *host, int events, } if (!sbc_error && !(events & MSDC_INT_CMDRDY)) { - if (events & MSDC_INT_CMDTMO || + if ((events & MSDC_INT_CMDTMO && !host->hs400_tuning) || (!mmc_op_tuning(cmd->opcode) && !host->hs400_tuning)) /* * should not clear fifo/interrupt as the tune data @@ -1323,9 +1323,9 @@ static void msdc_start_command(struct msdc_host *host, static void msdc_cmd_next(struct msdc_host *host, struct mmc_request *mrq, struct mmc_command *cmd) { - if ((cmd->error && - !(cmd->error == -EILSEQ && - (mmc_op_tuning(cmd->opcode) || host->hs400_tuning))) || + if ((cmd->error && !host->hs400_tuning && + !(cmd->error == -EILSEQ && + mmc_op_tuning(cmd->opcode))) || (mrq->sbc && mrq->sbc->error)) msdc_request_done(host, mrq); else if (cmd == mrq->sbc) From a6e9c391d45b5865b61e569146304cff72821a5d Mon Sep 17 00:00:00 2001 From: Camila Alvarez Date: Tue, 30 Jul 2024 19:42:43 -0400 Subject: [PATCH 039/991] HID: cougar: fix slab-out-of-bounds Read in cougar_report_fixup report_fixup for the Cougar 500k Gaming Keyboard was not verifying that the report descriptor size was correct before accessing it Reported-by: syzbot+24c0361074799d02c452@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=24c0361074799d02c452 Signed-off-by: Camila Alvarez Reviewed-by: Silvan Jegen Signed-off-by: Jiri Kosina --- drivers/hid/hid-cougar.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/hid-cougar.c b/drivers/hid/hid-cougar.c index cb8bd8aae15b51..0fa785f52707ac 100644 --- a/drivers/hid/hid-cougar.c +++ b/drivers/hid/hid-cougar.c @@ -106,7 +106,7 @@ static void cougar_fix_g6_mapping(void) static __u8 *cougar_report_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize) { - if (rdesc[2] == 0x09 && rdesc[3] == 0x02 && + if (*rsize >= 117 && rdesc[2] == 0x09 && rdesc[3] == 0x02 && (rdesc[115] | rdesc[116] << 8) >= HID_MAX_USAGES) { hid_info(hdev, "usage count exceeds max: fixing up report descriptor\n"); From ab3de2c7ec91db6a3cf5fc07765852c81ca7d6ef Mon Sep 17 00:00:00 2001 From: Aapo Vienamo Date: Thu, 20 Jun 2024 13:43:03 +0300 Subject: [PATCH 040/991] thunderbolt: Fix memory leaks in {port|retimer}_sb_regs_write() Add missing free_page() call for the memory allocated by validate_and_copy_from_user(). Fixes: 6d241fa00159 ("thunderbolt: Add sideband register access to debugfs") Signed-off-by: Aapo Vienamo Reviewed-by: Przemek Kitszel Signed-off-by: Mika Westerberg --- drivers/thunderbolt/debugfs.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/thunderbolt/debugfs.c b/drivers/thunderbolt/debugfs.c index 11185cc1db9299..9ed4bb2e8d05fd 100644 --- a/drivers/thunderbolt/debugfs.c +++ b/drivers/thunderbolt/debugfs.c @@ -323,16 +323,17 @@ static ssize_t port_sb_regs_write(struct file *file, const char __user *user_buf if (mutex_lock_interruptible(&tb->lock)) { ret = -ERESTARTSYS; - goto out_rpm_put; + goto out; } ret = sb_regs_write(port, port_sb_regs, ARRAY_SIZE(port_sb_regs), USB4_SB_TARGET_ROUTER, 0, buf, count, ppos); mutex_unlock(&tb->lock); -out_rpm_put: +out: pm_runtime_mark_last_busy(&sw->dev); pm_runtime_put_autosuspend(&sw->dev); + free_page((unsigned long)buf); return ret < 0 ? ret : count; } @@ -355,16 +356,17 @@ static ssize_t retimer_sb_regs_write(struct file *file, if (mutex_lock_interruptible(&tb->lock)) { ret = -ERESTARTSYS; - goto out_rpm_put; + goto out; } ret = sb_regs_write(rt->port, retimer_sb_regs, ARRAY_SIZE(retimer_sb_regs), USB4_SB_TARGET_RETIMER, rt->index, buf, count, ppos); mutex_unlock(&tb->lock); -out_rpm_put: +out: pm_runtime_mark_last_busy(&rt->dev); pm_runtime_put_autosuspend(&rt->dev); + free_page((unsigned long)buf); return ret < 0 ? ret : count; } From d1aa95e86f178dc597e80228cd9bd81fc3510f34 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Thu, 25 Jul 2024 10:31:25 +1200 Subject: [PATCH 041/991] hid-asus: add ROG Ally X prod ID to quirk list The new ASUS ROG Ally X functions almost exactly the same as the previous model, so we can use the same quirks. Signed-off-by: Luke D. Jones Signed-off-by: Jiri Kosina --- drivers/hid/hid-asus.c | 3 +++ drivers/hid/hid-ids.h | 1 + 2 files changed, 4 insertions(+) diff --git a/drivers/hid/hid-asus.c b/drivers/hid/hid-asus.c index 37e6d25593c211..a282388b7aa5c1 100644 --- a/drivers/hid/hid-asus.c +++ b/drivers/hid/hid-asus.c @@ -1248,6 +1248,9 @@ static const struct hid_device_id asus_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_ASUSTEK, USB_DEVICE_ID_ASUSTEK_ROG_NKEY_ALLY), QUIRK_USE_KBD_BACKLIGHT | QUIRK_ROG_NKEY_KEYBOARD }, + { HID_USB_DEVICE(USB_VENDOR_ID_ASUSTEK, + USB_DEVICE_ID_ASUSTEK_ROG_NKEY_ALLY_X), + QUIRK_USE_KBD_BACKLIGHT | QUIRK_ROG_NKEY_KEYBOARD }, { HID_USB_DEVICE(USB_VENDOR_ID_ASUSTEK, USB_DEVICE_ID_ASUSTEK_ROG_CLAYMORE_II_KEYBOARD), QUIRK_ROG_CLAYMORE_II_KEYBOARD }, diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 72d56ee7ce1b98..6e322338908022 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -210,6 +210,7 @@ #define USB_DEVICE_ID_ASUSTEK_ROG_NKEY_KEYBOARD3 0x1a30 #define USB_DEVICE_ID_ASUSTEK_ROG_Z13_LIGHTBAR 0x18c6 #define USB_DEVICE_ID_ASUSTEK_ROG_NKEY_ALLY 0x1abe +#define USB_DEVICE_ID_ASUSTEK_ROG_NKEY_ALLY_X 0x1b4c #define USB_DEVICE_ID_ASUSTEK_ROG_CLAYMORE_II_KEYBOARD 0x196b #define USB_DEVICE_ID_ASUSTEK_FX503VD_KEYBOARD 0x1869 From 97155021ae17b86985121b33cf8098bcde00d497 Mon Sep 17 00:00:00 2001 From: Olivier Sobrie Date: Tue, 23 Jul 2024 10:44:35 +0200 Subject: [PATCH 042/991] HID: amd_sfh: free driver_data after destroying hid device HID driver callbacks aren't called anymore once hid_destroy_device() has been called. Hence, hid driver_data should be freed only after the hid_destroy_device() function returned as driver_data is used in several callbacks. I observed a crash with kernel 6.10.0 on my T14s Gen 3, after enabling KASAN to debug memory allocation, I got this output: [ 13.050438] ================================================================== [ 13.054060] BUG: KASAN: slab-use-after-free in amd_sfh_get_report+0x3ec/0x530 [amd_sfh] [ 13.054809] psmouse serio1: trackpoint: Synaptics TrackPoint firmware: 0x02, buttons: 3/3 [ 13.056432] Read of size 8 at addr ffff88813152f408 by task (udev-worker)/479 [ 13.060970] CPU: 5 PID: 479 Comm: (udev-worker) Not tainted 6.10.0-arch1-2 #1 893bb55d7f0073f25c46adbb49eb3785fefd74b0 [ 13.063978] Hardware name: LENOVO 21CQCTO1WW/21CQCTO1WW, BIOS R22ET70W (1.40 ) 03/21/2024 [ 13.067860] Call Trace: [ 13.069383] input: TPPS/2 Synaptics TrackPoint as /devices/platform/i8042/serio1/input/input8 [ 13.071486] [ 13.071492] dump_stack_lvl+0x5d/0x80 [ 13.074870] snd_hda_intel 0000:33:00.6: enabling device (0000 -> 0002) [ 13.078296] ? amd_sfh_get_report+0x3ec/0x530 [amd_sfh 05f43221435b5205f734cd9da29399130f398a38] [ 13.082199] print_report+0x174/0x505 [ 13.085776] ? __pfx__raw_spin_lock_irqsave+0x10/0x10 [ 13.089367] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.093255] ? amd_sfh_get_report+0x3ec/0x530 [amd_sfh 05f43221435b5205f734cd9da29399130f398a38] [ 13.097464] kasan_report+0xc8/0x150 [ 13.101461] ? amd_sfh_get_report+0x3ec/0x530 [amd_sfh 05f43221435b5205f734cd9da29399130f398a38] [ 13.105802] amd_sfh_get_report+0x3ec/0x530 [amd_sfh 05f43221435b5205f734cd9da29399130f398a38] [ 13.110303] amdtp_hid_request+0xb8/0x110 [amd_sfh 05f43221435b5205f734cd9da29399130f398a38] [ 13.114879] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.119450] sensor_hub_get_feature+0x1d3/0x540 [hid_sensor_hub 3f13be3016ff415bea03008d45d99da837ee3082] [ 13.124097] hid_sensor_parse_common_attributes+0x4d0/0xad0 [hid_sensor_iio_common c3a5cbe93969c28b122609768bbe23efe52eb8f5] [ 13.127404] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.131925] ? __pfx_hid_sensor_parse_common_attributes+0x10/0x10 [hid_sensor_iio_common c3a5cbe93969c28b122609768bbe23efe52eb8f5] [ 13.136455] ? _raw_spin_lock_irqsave+0x96/0xf0 [ 13.140197] ? __pfx__raw_spin_lock_irqsave+0x10/0x10 [ 13.143602] ? devm_iio_device_alloc+0x34/0x50 [industrialio 3d261d5e5765625d2b052be40e526d62b1d2123b] [ 13.147234] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.150446] ? __devm_add_action+0x167/0x1d0 [ 13.155061] hid_gyro_3d_probe+0x120/0x7f0 [hid_sensor_gyro_3d 63da36a143b775846ab2dbb86c343b401b5e3172] [ 13.158581] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.161814] platform_probe+0xa2/0x150 [ 13.165029] really_probe+0x1e3/0x8a0 [ 13.168243] __driver_probe_device+0x18c/0x370 [ 13.171500] driver_probe_device+0x4a/0x120 [ 13.175000] __driver_attach+0x190/0x4a0 [ 13.178521] ? __pfx___driver_attach+0x10/0x10 [ 13.181771] bus_for_each_dev+0x106/0x180 [ 13.185033] ? __pfx__raw_spin_lock+0x10/0x10 [ 13.188229] ? __pfx_bus_for_each_dev+0x10/0x10 [ 13.191446] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.194382] bus_add_driver+0x29e/0x4d0 [ 13.197328] driver_register+0x1a5/0x360 [ 13.200283] ? __pfx_hid_gyro_3d_platform_driver_init+0x10/0x10 [hid_sensor_gyro_3d 63da36a143b775846ab2dbb86c343b401b5e3172] [ 13.203362] do_one_initcall+0xa7/0x380 [ 13.206432] ? __pfx_do_one_initcall+0x10/0x10 [ 13.210175] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.213211] ? kasan_unpoison+0x44/0x70 [ 13.216688] do_init_module+0x238/0x750 [ 13.219696] load_module+0x5011/0x6af0 [ 13.223096] ? kasan_save_stack+0x30/0x50 [ 13.226743] ? kasan_save_track+0x14/0x30 [ 13.230080] ? kasan_save_free_info+0x3b/0x60 [ 13.233323] ? poison_slab_object+0x109/0x180 [ 13.236778] ? __pfx_load_module+0x10/0x10 [ 13.239703] ? poison_slab_object+0x109/0x180 [ 13.243070] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.245924] ? init_module_from_file+0x13d/0x150 [ 13.248745] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.251503] ? init_module_from_file+0xdf/0x150 [ 13.254198] init_module_from_file+0xdf/0x150 [ 13.256826] ? __pfx_init_module_from_file+0x10/0x10 [ 13.259428] ? kasan_save_track+0x14/0x30 [ 13.261959] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.264471] ? kasan_save_free_info+0x3b/0x60 [ 13.267026] ? poison_slab_object+0x109/0x180 [ 13.269494] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.271949] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.274324] ? _raw_spin_lock+0x85/0xe0 [ 13.276671] ? __pfx__raw_spin_lock+0x10/0x10 [ 13.278963] ? __rseq_handle_notify_resume+0x1a6/0xad0 [ 13.281193] idempotent_init_module+0x23b/0x650 [ 13.283420] ? __pfx_idempotent_init_module+0x10/0x10 [ 13.285619] ? __pfx___seccomp_filter+0x10/0x10 [ 13.287714] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.289828] ? __fget_light+0x57/0x420 [ 13.291870] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.293880] ? security_capable+0x74/0xb0 [ 13.295820] __x64_sys_finit_module+0xbe/0x130 [ 13.297874] do_syscall_64+0x82/0x190 [ 13.299898] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.301905] ? irqtime_account_irq+0x3d/0x1f0 [ 13.303877] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.305753] ? __irq_exit_rcu+0x4e/0x130 [ 13.307577] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.309489] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 13.311371] RIP: 0033:0x7a21f96ade9d [ 13.313234] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 63 de 0c 00 f7 d8 64 89 01 48 [ 13.317051] RSP: 002b:00007ffeae934e78 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 [ 13.319024] RAX: ffffffffffffffda RBX: 00005987276bfcf0 RCX: 00007a21f96ade9d [ 13.321100] RDX: 0000000000000004 RSI: 00007a21f8eda376 RDI: 000000000000001c [ 13.323314] RBP: 00007a21f8eda376 R08: 0000000000000001 R09: 00007ffeae934ec0 [ 13.325505] R10: 0000000000000050 R11: 0000000000000246 R12: 0000000000020000 [ 13.327637] R13: 00005987276c1250 R14: 0000000000000000 R15: 00005987276c4530 [ 13.329737] [ 13.333945] Allocated by task 139: [ 13.336111] kasan_save_stack+0x30/0x50 [ 13.336121] kasan_save_track+0x14/0x30 [ 13.336125] __kasan_kmalloc+0xaa/0xb0 [ 13.336129] amdtp_hid_probe+0xb1/0x440 [amd_sfh] [ 13.336138] amd_sfh_hid_client_init+0xb8a/0x10f0 [amd_sfh] [ 13.336144] sfh_init_work+0x47/0x120 [amd_sfh] [ 13.336150] process_one_work+0x673/0xeb0 [ 13.336155] worker_thread+0x795/0x1250 [ 13.336160] kthread+0x290/0x350 [ 13.336164] ret_from_fork+0x34/0x70 [ 13.336169] ret_from_fork_asm+0x1a/0x30 [ 13.338175] Freed by task 139: [ 13.340064] kasan_save_stack+0x30/0x50 [ 13.340072] kasan_save_track+0x14/0x30 [ 13.340076] kasan_save_free_info+0x3b/0x60 [ 13.340081] poison_slab_object+0x109/0x180 [ 13.340085] __kasan_slab_free+0x32/0x50 [ 13.340089] kfree+0xe5/0x310 [ 13.340094] amdtp_hid_remove+0xb2/0x160 [amd_sfh] [ 13.340102] amd_sfh_hid_client_deinit+0x324/0x640 [amd_sfh] [ 13.340107] amd_sfh_hid_client_init+0x94a/0x10f0 [amd_sfh] [ 13.340113] sfh_init_work+0x47/0x120 [amd_sfh] [ 13.340118] process_one_work+0x673/0xeb0 [ 13.340123] worker_thread+0x795/0x1250 [ 13.340127] kthread+0x290/0x350 [ 13.340132] ret_from_fork+0x34/0x70 [ 13.340136] ret_from_fork_asm+0x1a/0x30 [ 13.342482] The buggy address belongs to the object at ffff88813152f400 which belongs to the cache kmalloc-64 of size 64 [ 13.347357] The buggy address is located 8 bytes inside of freed 64-byte region [ffff88813152f400, ffff88813152f440) [ 13.347367] The buggy address belongs to the physical page: [ 13.355409] page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x13152f [ 13.355416] anon flags: 0x2ffff8000000000(node=0|zone=2|lastcpupid=0x1ffff) [ 13.355423] page_type: 0xffffefff(slab) [ 13.355429] raw: 02ffff8000000000 ffff8881000428c0 ffffea0004c43a00 0000000000000005 [ 13.355435] raw: 0000000000000000 0000000000200020 00000001ffffefff 0000000000000000 [ 13.355439] page dumped because: kasan: bad access detected [ 13.357295] Memory state around the buggy address: [ 13.357299] ffff88813152f300: fa fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [ 13.357303] ffff88813152f380: fa fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [ 13.357306] >ffff88813152f400: fa fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [ 13.357309] ^ [ 13.357311] ffff88813152f480: 00 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc [ 13.357315] ffff88813152f500: 00 00 00 00 00 00 00 06 fc fc fc fc fc fc fc fc [ 13.357318] ================================================================== [ 13.357405] Disabling lock debugging due to kernel taint [ 13.383534] Oops: general protection fault, probably for non-canonical address 0xe0a1bc4140000013: 0000 [#1] PREEMPT SMP KASAN NOPTI [ 13.383544] KASAN: maybe wild-memory-access in range [0x050e020a00000098-0x050e020a0000009f] [ 13.383551] CPU: 3 PID: 479 Comm: (udev-worker) Tainted: G B 6.10.0-arch1-2 #1 893bb55d7f0073f25c46adbb49eb3785fefd74b0 [ 13.383561] Hardware name: LENOVO 21CQCTO1WW/21CQCTO1WW, BIOS R22ET70W (1.40 ) 03/21/2024 [ 13.383565] RIP: 0010:amd_sfh_get_report+0x81/0x530 [amd_sfh] [ 13.383580] Code: 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 78 03 00 00 48 b8 00 00 00 00 00 fc ff df 4c 8b 63 08 49 8d 7c 24 10 48 89 fa 48 c1 ea 03 <0f> b6 04 02 84 c0 74 08 3c 03 0f 8e 1a 03 00 00 45 8b 74 24 10 45 [ 13.383585] RSP: 0018:ffff8881261f7388 EFLAGS: 00010212 [ 13.383592] RAX: dffffc0000000000 RBX: ffff88813152f400 RCX: 0000000000000002 [ 13.383597] RDX: 00a1c04140000013 RSI: 0000000000000008 RDI: 050e020a0000009b [ 13.383600] RBP: ffff88814d010000 R08: 0000000000000002 R09: fffffbfff3ddb8c0 [ 13.383604] R10: ffffffff9eedc607 R11: ffff88810ce98000 R12: 050e020a0000008b [ 13.383607] R13: ffff88814d010000 R14: dffffc0000000000 R15: 0000000000000004 [ 13.383611] FS: 00007a21f94d0880(0000) GS:ffff8887e7d80000(0000) knlGS:0000000000000000 [ 13.383615] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 13.383618] CR2: 00007e0014c438f0 CR3: 000000012614c000 CR4: 0000000000f50ef0 [ 13.383622] PKRU: 55555554 [ 13.383625] Call Trace: [ 13.383629] [ 13.383632] ? __die_body.cold+0x19/0x27 [ 13.383644] ? die_addr+0x46/0x70 [ 13.383652] ? exc_general_protection+0x150/0x240 [ 13.383664] ? asm_exc_general_protection+0x26/0x30 [ 13.383674] ? amd_sfh_get_report+0x81/0x530 [amd_sfh 05f43221435b5205f734cd9da29399130f398a38] [ 13.383686] ? amd_sfh_get_report+0x3ec/0x530 [amd_sfh 05f43221435b5205f734cd9da29399130f398a38] [ 13.383697] amdtp_hid_request+0xb8/0x110 [amd_sfh 05f43221435b5205f734cd9da29399130f398a38] [ 13.383706] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.383713] sensor_hub_get_feature+0x1d3/0x540 [hid_sensor_hub 3f13be3016ff415bea03008d45d99da837ee3082] [ 13.383727] hid_sensor_parse_common_attributes+0x4d0/0xad0 [hid_sensor_iio_common c3a5cbe93969c28b122609768bbe23efe52eb8f5] [ 13.383739] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.383745] ? __pfx_hid_sensor_parse_common_attributes+0x10/0x10 [hid_sensor_iio_common c3a5cbe93969c28b122609768bbe23efe52eb8f5] [ 13.383753] ? _raw_spin_lock_irqsave+0x96/0xf0 [ 13.383762] ? __pfx__raw_spin_lock_irqsave+0x10/0x10 [ 13.383768] ? devm_iio_device_alloc+0x34/0x50 [industrialio 3d261d5e5765625d2b052be40e526d62b1d2123b] [ 13.383790] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.383795] ? __devm_add_action+0x167/0x1d0 [ 13.383806] hid_gyro_3d_probe+0x120/0x7f0 [hid_sensor_gyro_3d 63da36a143b775846ab2dbb86c343b401b5e3172] [ 13.383818] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.383826] platform_probe+0xa2/0x150 [ 13.383832] really_probe+0x1e3/0x8a0 [ 13.383838] __driver_probe_device+0x18c/0x370 [ 13.383844] driver_probe_device+0x4a/0x120 [ 13.383851] __driver_attach+0x190/0x4a0 [ 13.383857] ? __pfx___driver_attach+0x10/0x10 [ 13.383863] bus_for_each_dev+0x106/0x180 [ 13.383868] ? __pfx__raw_spin_lock+0x10/0x10 [ 13.383874] ? __pfx_bus_for_each_dev+0x10/0x10 [ 13.383880] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.383887] bus_add_driver+0x29e/0x4d0 [ 13.383895] driver_register+0x1a5/0x360 [ 13.383902] ? __pfx_hid_gyro_3d_platform_driver_init+0x10/0x10 [hid_sensor_gyro_3d 63da36a143b775846ab2dbb86c343b401b5e3172] [ 13.383910] do_one_initcall+0xa7/0x380 [ 13.383919] ? __pfx_do_one_initcall+0x10/0x10 [ 13.383927] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.383933] ? kasan_unpoison+0x44/0x70 [ 13.383943] do_init_module+0x238/0x750 [ 13.383955] load_module+0x5011/0x6af0 [ 13.383962] ? kasan_save_stack+0x30/0x50 [ 13.383968] ? kasan_save_track+0x14/0x30 [ 13.383973] ? kasan_save_free_info+0x3b/0x60 [ 13.383980] ? poison_slab_object+0x109/0x180 [ 13.383993] ? __pfx_load_module+0x10/0x10 [ 13.384007] ? poison_slab_object+0x109/0x180 [ 13.384012] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.384018] ? init_module_from_file+0x13d/0x150 [ 13.384025] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.384032] ? init_module_from_file+0xdf/0x150 [ 13.384037] init_module_from_file+0xdf/0x150 [ 13.384044] ? __pfx_init_module_from_file+0x10/0x10 [ 13.384050] ? kasan_save_track+0x14/0x30 [ 13.384055] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.384060] ? kasan_save_free_info+0x3b/0x60 [ 13.384066] ? poison_slab_object+0x109/0x180 [ 13.384071] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.384080] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.384085] ? _raw_spin_lock+0x85/0xe0 [ 13.384091] ? __pfx__raw_spin_lock+0x10/0x10 [ 13.384096] ? __rseq_handle_notify_resume+0x1a6/0xad0 [ 13.384106] idempotent_init_module+0x23b/0x650 [ 13.384114] ? __pfx_idempotent_init_module+0x10/0x10 [ 13.384120] ? __pfx___seccomp_filter+0x10/0x10 [ 13.384129] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.384135] ? __fget_light+0x57/0x420 [ 13.384142] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.384147] ? security_capable+0x74/0xb0 [ 13.384157] __x64_sys_finit_module+0xbe/0x130 [ 13.384164] do_syscall_64+0x82/0x190 [ 13.384174] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.384179] ? irqtime_account_irq+0x3d/0x1f0 [ 13.384188] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.384193] ? __irq_exit_rcu+0x4e/0x130 [ 13.384201] ? srso_alias_return_thunk+0x5/0xfbef5 [ 13.384206] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 13.384212] RIP: 0033:0x7a21f96ade9d [ 13.384263] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 63 de 0c 00 f7 d8 64 89 01 48 [ 13.384267] RSP: 002b:00007ffeae934e78 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 [ 13.384273] RAX: ffffffffffffffda RBX: 00005987276bfcf0 RCX: 00007a21f96ade9d [ 13.384277] RDX: 0000000000000004 RSI: 00007a21f8eda376 RDI: 000000000000001c [ 13.384280] RBP: 00007a21f8eda376 R08: 0000000000000001 R09: 00007ffeae934ec0 [ 13.384284] R10: 0000000000000050 R11: 0000000000000246 R12: 0000000000020000 [ 13.384288] R13: 00005987276c1250 R14: 0000000000000000 R15: 00005987276c4530 [ 13.384297] [ 13.384299] Modules linked in: soundwire_amd(+) hid_sensor_gyro_3d(+) hid_sensor_magn_3d hid_sensor_accel_3d soundwire_generic_allocation amdxcp hid_sensor_trigger drm_exec industrialio_triggered_buffer soundwire_bus gpu_sched kvm_amd kfifo_buf qmi_helpers joydev drm_buddy hid_sensor_iio_common mousedev snd_soc_core industrialio i2c_algo_bit mac80211 snd_compress drm_suballoc_helper kvm snd_hda_intel drm_ttm_helper ac97_bus snd_pcm_dmaengine snd_intel_dspcfg ttm thinkpad_acpi(+) snd_intel_sdw_acpi hid_sensor_hub snd_rpl_pci_acp6x drm_display_helper snd_hda_codec hid_multitouch libarc4 snd_acp_pci platform_profile think_lmi(+) hid_generic firmware_attributes_class wmi_bmof cec snd_acp_legacy_common sparse_keymap rapl snd_hda_core psmouse cfg80211 pcspkr snd_pci_acp6x snd_hwdep video snd_pcm snd_pci_acp5x snd_timer snd_rn_pci_acp3x ucsi_acpi snd_acp_config snd sp5100_tco rfkill snd_soc_acpi typec_ucsi thunderbolt amd_sfh k10temp mhi soundcore i2c_piix4 snd_pci_acp3x typec i2c_hid_acpi roles i2c_hid wmi acpi_tad amd_pmc [ 13.384454] mac_hid i2c_dev crypto_user loop nfnetlink zram ip_tables x_tables dm_crypt cbc encrypted_keys trusted asn1_encoder tee dm_mod crct10dif_pclmul crc32_pclmul polyval_clmulni polyval_generic gf128mul ghash_clmulni_intel serio_raw sha512_ssse3 atkbd sha256_ssse3 libps2 sha1_ssse3 vivaldi_fmap nvme aesni_intel crypto_simd nvme_core cryptd ccp xhci_pci i8042 nvme_auth xhci_pci_renesas serio vfat fat btrfs blake2b_generic libcrc32c crc32c_generic crc32c_intel xor raid6_pq [ 13.384552] ---[ end trace 0000000000000000 ]--- KASAN reports a use-after-free of hid->driver_data in function amd_sfh_get_report(). The backtrace indicates that the function is called by amdtp_hid_request() which is one of the callbacks of hid device. The current make sure that driver_data is freed only once hid_destroy_device() returned. Note that I observed the crash both on v6.9.9 and v6.10.0. The code seems to be as it was from the early days of the driver. Signed-off-by: Olivier Sobrie Acked-by: Basavaraj Natikar Signed-off-by: Jiri Kosina --- drivers/hid/amd-sfh-hid/amd_sfh_hid.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/hid/amd-sfh-hid/amd_sfh_hid.c b/drivers/hid/amd-sfh-hid/amd_sfh_hid.c index 705b5233706845..81f3024b7b1b51 100644 --- a/drivers/hid/amd-sfh-hid/amd_sfh_hid.c +++ b/drivers/hid/amd-sfh-hid/amd_sfh_hid.c @@ -171,11 +171,13 @@ int amdtp_hid_probe(u32 cur_hid_dev, struct amdtp_cl_data *cli_data) void amdtp_hid_remove(struct amdtp_cl_data *cli_data) { int i; + struct amdtp_hid_data *hid_data; for (i = 0; i < cli_data->num_hid_devices; ++i) { if (cli_data->hid_sensor_hubs[i]) { - kfree(cli_data->hid_sensor_hubs[i]->driver_data); + hid_data = cli_data->hid_sensor_hubs[i]->driver_data; hid_destroy_device(cli_data->hid_sensor_hubs[i]); + kfree(hid_data); cli_data->hid_sensor_hubs[i] = NULL; } } From c8000deb68365b461b324d68c7ea89d730f0bb85 Mon Sep 17 00:00:00 2001 From: Dmitry Savin Date: Tue, 16 Jul 2024 23:27:57 +0100 Subject: [PATCH 043/991] HID: multitouch: Add support for GT7868Q GT7868Q has incorrect data in the report and needs a fixup. The change enables haptic touchpad on Lenovo ThinkBook 13x Gen 4 and has been tested on the device. Signed-off-by: Dmitry Savin Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 2 ++ drivers/hid/hid-multitouch.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 6e322338908022..781c5aa298598a 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -521,6 +521,8 @@ #define USB_DEVICE_ID_GENERAL_TOUCH_WIN8_PIT_E100 0xe100 #define I2C_VENDOR_ID_GOODIX 0x27c6 +#define I2C_DEVICE_ID_GOODIX_01E8 0x01e8 +#define I2C_DEVICE_ID_GOODIX_01E9 0x01e9 #define I2C_DEVICE_ID_GOODIX_01F0 0x01f0 #define USB_VENDOR_ID_GOODTOUCH 0x1aad diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index 56fc78841f245a..99812c0f830b5e 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -1441,6 +1441,30 @@ static int mt_event(struct hid_device *hid, struct hid_field *field, return 0; } +static __u8 *mt_report_fixup(struct hid_device *hdev, __u8 *rdesc, + unsigned int *size) +{ + if (hdev->vendor == I2C_VENDOR_ID_GOODIX && + (hdev->product == I2C_DEVICE_ID_GOODIX_01E8 || + hdev->product == I2C_DEVICE_ID_GOODIX_01E9)) { + if (rdesc[607] == 0x15) { + rdesc[607] = 0x25; + dev_info( + &hdev->dev, + "GT7868Q report descriptor fixup is applied.\n"); + } else { + dev_info( + &hdev->dev, + "The byte is not expected for fixing the report descriptor. \ +It's possible that the touchpad firmware is not suitable for applying the fix. \ +got: %x\n", + rdesc[607]); + } + } + + return rdesc; +} + static void mt_report(struct hid_device *hid, struct hid_report *report) { struct mt_device *td = hid_get_drvdata(hid); @@ -2035,6 +2059,14 @@ static const struct hid_device_id mt_devices[] = { MT_BT_DEVICE(USB_VENDOR_ID_FRUCTEL, USB_DEVICE_ID_GAMETEL_MT_MODE) }, + /* Goodix GT7868Q devices */ + { .driver_data = MT_CLS_WIN_8_FORCE_MULTI_INPUT_NSMU, + HID_DEVICE(BUS_I2C, HID_GROUP_ANY, I2C_VENDOR_ID_GOODIX, + I2C_DEVICE_ID_GOODIX_01E8) }, + { .driver_data = MT_CLS_WIN_8_FORCE_MULTI_INPUT_NSMU, + HID_DEVICE(BUS_I2C, HID_GROUP_ANY, I2C_VENDOR_ID_GOODIX, + I2C_DEVICE_ID_GOODIX_01E8) }, + /* GoodTouch panels */ { .driver_data = MT_CLS_NSMU, MT_USB_DEVICE(USB_VENDOR_ID_GOODTOUCH, @@ -2270,6 +2302,7 @@ static struct hid_driver mt_driver = { .feature_mapping = mt_feature_mapping, .usage_table = mt_grabbed_usages, .event = mt_event, + .report_fixup = mt_report_fixup, .report = mt_report, .suspend = pm_ptr(mt_suspend), .reset_resume = pm_ptr(mt_reset_resume), From 1b8f9c1fb464968a5b18d3acc1da8c00bad24fad Mon Sep 17 00:00:00 2001 From: Jason Gerecke Date: Tue, 30 Jul 2024 08:51:55 -0700 Subject: [PATCH 044/991] HID: wacom: Defer calculation of resolution until resolution_code is known The Wacom driver maps the HID_DG_TWIST usage to ABS_Z (rather than ABS_RZ) for historic reasons. When the code to support twist was introduced in commit 50066a042da5 ("HID: wacom: generic: Add support for height, tilt, and twist usages"), we were careful to write it in such a way that it had HID calculate the resolution of the twist axis assuming ABS_RZ instead (so that we would get correct angular behavior). This was broken with the introduction of commit 08a46b4190d3 ("HID: wacom: Set a default resolution for older tablets"), which moved the resolution calculation to occur *before* the adjustment from ABS_Z to ABS_RZ occurred. This commit moves the calculation of resolution after the point that we are finished setting things up for its proper use. Signed-off-by: Jason Gerecke Fixes: 08a46b4190d3 ("HID: wacom: Set a default resolution for older tablets") Cc: stable@vger.kernel.org Signed-off-by: Jiri Kosina --- drivers/hid/wacom_wac.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index 1f4564982b9584..2541fa2e0fa3b1 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -1878,12 +1878,14 @@ static void wacom_map_usage(struct input_dev *input, struct hid_usage *usage, int fmax = field->logical_maximum; unsigned int equivalent_usage = wacom_equivalent_usage(usage->hid); int resolution_code = code; - int resolution = hidinput_calc_abs_res(field, resolution_code); + int resolution; if (equivalent_usage == HID_DG_TWIST) { resolution_code = ABS_RZ; } + resolution = hidinput_calc_abs_res(field, resolution_code); + if (equivalent_usage == HID_GD_X) { fmin += features->offset_left; fmax -= features->offset_right; From fca5b78511e98bdff2cdd55c172b23200a7b3404 Mon Sep 17 00:00:00 2001 From: Barak Biber Date: Thu, 1 Aug 2024 09:26:04 -0300 Subject: [PATCH 045/991] iommu: Restore lost return in iommu_report_device_fault() When iommu_report_device_fault gets called with a partial fault it is supposed to collect the fault into the group and then return. Instead the return was accidently deleted which results in trying to process the fault and an eventual crash. Deleting the return was a typo, put it back. Fixes: 3dfa64aecbaf ("iommu: Make iommu_report_device_fault() return void") Signed-off-by: Barak Biber Signed-off-by: Jason Gunthorpe Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/0-v1-e7153d9c8cee+1c6-iommu_fault_fix_jgg@nvidia.com Signed-off-by: Joerg Roedel --- drivers/iommu/io-pgfault.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index cd679c13752e00..81e9cc6e3164a4 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -170,6 +170,7 @@ void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) report_partial_fault(iopf_param, fault); iopf_put_dev_fault_param(iopf_param); /* A request that is not the last does not need to be ack'd */ + return; } /* From 90ec3a8a7fd0d43026fcca979713e077d4883b56 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Fri, 2 Aug 2024 16:22:13 +0100 Subject: [PATCH 046/991] spi: Add empty versions of ACPI functions Provide empty versions of acpi_spi_count_resources(), acpi_spi_device_alloc() and acpi_spi_find_controller_by_adev() if the real functions are not being built. This commit fixes two problems with the original definitions: 1) There wasn't an empty version of these functions 2) The #if only depended on CONFIG_ACPI. But the functions are implemented in the core spi.c so CONFIG_SPI_MASTER must also be enabled for the real functions to exist. Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20240802152215.20831-2-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index e4f3f3d30a0390..d47d5f14ff992f 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -902,12 +902,29 @@ extern int devm_spi_register_controller(struct device *dev, struct spi_controller *ctlr); extern void spi_unregister_controller(struct spi_controller *ctlr); -#if IS_ENABLED(CONFIG_ACPI) +#if IS_ENABLED(CONFIG_ACPI) && IS_ENABLED(CONFIG_SPI_MASTER) extern struct spi_controller *acpi_spi_find_controller_by_adev(struct acpi_device *adev); extern struct spi_device *acpi_spi_device_alloc(struct spi_controller *ctlr, struct acpi_device *adev, int index); int acpi_spi_count_resources(struct acpi_device *adev); +#else +static inline struct spi_controller *acpi_spi_find_controller_by_adev(struct acpi_device *adev) +{ + return NULL; +} + +static inline struct spi_device *acpi_spi_device_alloc(struct spi_controller *ctlr, + struct acpi_device *adev, + int index) +{ + return ERR_PTR(-ENODEV); +} + +static inline int acpi_spi_count_resources(struct acpi_device *adev) +{ + return 0; +} #endif /* From 32b9a52f88a5713bf8a02dae66f2ad69705de69f Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Tue, 23 Jul 2024 16:20:52 +0200 Subject: [PATCH 047/991] KVM: arm64: free kvm->arch.nested_mmus with kvfree() kvm->arch.nested_mmus is allocated with kvrealloc(), hence free it with kvfree() instead of kfree(). Fixes: 4f128f8e1aaa ("KVM: arm64: nv: Support multiple nested Stage-2 mmu structures") Signed-off-by: Danilo Krummrich Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240723142204.758796-1-dakr@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index de789e0f1ae9cb..bab27f9d8cc659 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -786,7 +786,7 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm) if (!WARN_ON(atomic_read(&mmu->refcnt))) kvm_free_stage2_pgd(mmu); } - kfree(kvm->arch.nested_mmus); + kvfree(kvm->arch.nested_mmus); kvm->arch.nested_mmus = NULL; kvm->arch.nested_mmus_size = 0; kvm_uninit_stage2_mmu(kvm); From 963a08e586bd45fd55f4c1752e98029ce83fc091 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 23 Jul 2024 12:12:02 +0200 Subject: [PATCH 048/991] KVM: arm64: fix override-init warnings in W=1 builds Add -Wno-override-init to the build flags for sys_regs.c, handle_exit.c, and switch.c to fix warnings like the following: arch/arm64/kvm/hyp/vhe/switch.c:271:43: warning: initialized field overwritten [-Woverride-init] 271 | [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, | Signed-off-by: Sebastian Ott Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240723101204.7356-2-sebott@redhat.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/Makefile | 3 +++ arch/arm64/kvm/hyp/nvhe/Makefile | 2 ++ arch/arm64/kvm/hyp/vhe/Makefile | 2 ++ 3 files changed, 7 insertions(+) diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index a6497228c5a8c3..86a629aaf0a13f 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -10,6 +10,9 @@ include $(srctree)/virt/kvm/Makefile.kvm obj-$(CONFIG_KVM) += kvm.o obj-$(CONFIG_KVM) += hyp/ +CFLAGS_sys_regs.o += -Wno-override-init +CFLAGS_handle_exit.o += -Wno-override-init + kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ inject_fault.o va_layout.o handle_exit.o \ guest.o debug.o reset.o sys_regs.o stacktrace.o \ diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index 782b34b004be3c..b43426a493df5a 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -20,6 +20,8 @@ HOST_EXTRACFLAGS += -I$(objtree)/include lib-objs := clear_page.o copy_page.o memcpy.o memset.o lib-objs := $(addprefix ../../../lib/, $(lib-objs)) +CFLAGS_switch.nvhe.o += -Wno-override-init + hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \ cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o diff --git a/arch/arm64/kvm/hyp/vhe/Makefile b/arch/arm64/kvm/hyp/vhe/Makefile index 3b9e5464b5b39c..afc4aed9231ac0 100644 --- a/arch/arm64/kvm/hyp/vhe/Makefile +++ b/arch/arm64/kvm/hyp/vhe/Makefile @@ -6,6 +6,8 @@ asflags-y := -D__KVM_VHE_HYPERVISOR__ ccflags-y := -D__KVM_VHE_HYPERVISOR__ +CFLAGS_switch.o += -Wno-override-init + obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o ../exception.o From 0aa34b37a78d063da58838b84b20a68a94d919fd Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 23 Jul 2024 12:12:03 +0200 Subject: [PATCH 049/991] KVM: arm64: fix kdoc warnings in W=1 builds Fix kdoc warnings by adding missing function parameter descriptions or by conversion to a normal comment. Signed-off-by: Sebastian Ott Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240723101204.7356-3-sebott@redhat.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/arm.c | 1 + arch/arm64/kvm/vgic/vgic-irqfd.c | 7 ++++--- arch/arm64/kvm/vgic/vgic-its.c | 18 +++++++++++------- arch/arm64/kvm/vgic/vgic-v3.c | 2 +- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index a7ca776b51ec8a..23e1fa56c02dd0 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -164,6 +164,7 @@ static int kvm_arm_default_max_vcpus(void) /** * kvm_arch_init_vm - initializes a VM data structure * @kvm: pointer to the KVM struct + * @type: kvm device type */ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { diff --git a/arch/arm64/kvm/vgic/vgic-irqfd.c b/arch/arm64/kvm/vgic/vgic-irqfd.c index 8c711deb25aa00..c314c016659abe 100644 --- a/arch/arm64/kvm/vgic/vgic-irqfd.c +++ b/arch/arm64/kvm/vgic/vgic-irqfd.c @@ -9,7 +9,7 @@ #include #include "vgic.h" -/** +/* * vgic_irqfd_set_irq: inject the IRQ corresponding to the * irqchip routing entry * @@ -75,7 +75,8 @@ static void kvm_populate_msi(struct kvm_kernel_irq_routing_entry *e, msi->flags = e->msi.flags; msi->devid = e->msi.devid; } -/** + +/* * kvm_set_msi: inject the MSI corresponding to the * MSI routing entry * @@ -98,7 +99,7 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, return vgic_its_inject_msi(kvm, &msi); } -/** +/* * kvm_arch_set_irq_inatomic: fast-path for irqfd injection */ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 40bb43f20bf347..ba945ba78cc7d7 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -2040,6 +2040,7 @@ typedef int (*entry_fn_t)(struct vgic_its *its, u32 id, void *entry, * @start_id: the ID of the first entry in the table * (non zero for 2d level tables) * @fn: function to apply on each entry + * @opaque: pointer to opaque data * * Return: < 0 on error, 0 if last element was identified, 1 otherwise * (the last element may not be found on second level tables) @@ -2079,7 +2080,7 @@ static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz, return 1; } -/** +/* * vgic_its_save_ite - Save an interrupt translation entry at @gpa */ static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, @@ -2099,6 +2100,8 @@ static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, /** * vgic_its_restore_ite - restore an interrupt translation entry + * + * @its: its handle * @event_id: id used for indexing * @ptr: pointer to the ITE entry * @opaque: pointer to the its_device @@ -2231,6 +2234,7 @@ static int vgic_its_restore_itt(struct vgic_its *its, struct its_device *dev) * @its: ITS handle * @dev: ITS device * @ptr: GPA + * @dte_esz: device table entry size */ static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev, gpa_t ptr, int dte_esz) @@ -2313,7 +2317,7 @@ static int vgic_its_device_cmp(void *priv, const struct list_head *a, return 1; } -/** +/* * vgic_its_save_device_tables - Save the device table and all ITT * into guest RAM * @@ -2386,7 +2390,7 @@ static int handle_l1_dte(struct vgic_its *its, u32 id, void *addr, return ret; } -/** +/* * vgic_its_restore_device_tables - Restore the device table and all ITT * from guest RAM to internal data structs */ @@ -2478,7 +2482,7 @@ static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz) return 1; } -/** +/* * vgic_its_save_collection_table - Save the collection table into * guest RAM */ @@ -2518,7 +2522,7 @@ static int vgic_its_save_collection_table(struct vgic_its *its) return ret; } -/** +/* * vgic_its_restore_collection_table - reads the collection table * in guest memory and restores the ITS internal state. Requires the * BASER registers to be restored before. @@ -2556,7 +2560,7 @@ static int vgic_its_restore_collection_table(struct vgic_its *its) return ret; } -/** +/* * vgic_its_save_tables_v0 - Save the ITS tables into guest ARM * according to v0 ABI */ @@ -2571,7 +2575,7 @@ static int vgic_its_save_tables_v0(struct vgic_its *its) return vgic_its_save_collection_table(its); } -/** +/* * vgic_its_restore_tables_v0 - Restore the ITS tables from guest RAM * to internal data structs according to V0 ABI * diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index ed6e412cd74bac..3eecdd2f4b8f55 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -370,7 +370,7 @@ static void map_all_vpes(struct kvm *kvm) dist->its_vm.vpes[i]->irq)); } -/** +/* * vgic_v3_save_pending_tables - Save the pending tables into guest RAM * kvm lock and all vcpu lock must be held */ From 19d837bc881b2f9f72f9eb506b46c2e2d983896d Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 23 Jul 2024 12:12:04 +0200 Subject: [PATCH 050/991] KVM: arm64: vgic: fix unexpected unlock sparse warnings Get rid of unexpected unlock sparse warnings in vgic code by adding an annotation to vgic_queue_irq_unlock(). arch/arm64/kvm/vgic/vgic.c:334:17: warning: context imbalance in 'vgic_queue_irq_unlock' - unexpected unlock arch/arm64/kvm/vgic/vgic.c:419:5: warning: context imbalance in 'kvm_vgic_inject_irq' - different lock contexts for basic block Signed-off-by: Sebastian Ott Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240723101204.7356-4-sebott@redhat.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic.c | 2 +- arch/arm64/kvm/vgic/vgic.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index f07b3ddff7d442..974849ea7101c6 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -313,7 +313,7 @@ static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owne * with all locks dropped. */ bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, - unsigned long flags) + unsigned long flags) __releases(&irq->irq_lock) { struct kvm_vcpu *vcpu; diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 03d356a123771f..ba8f790431bd3a 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -186,7 +186,7 @@ bool vgic_get_phys_line_level(struct vgic_irq *irq); void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending); void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active); bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, - unsigned long flags); + unsigned long flags) __releases(&irq->irq_lock); void vgic_kick_vcpus(struct kvm *kvm); void vgic_irq_handle_resampling(struct vgic_irq *irq, bool lr_deactivated, bool lr_pending); From 0e8a0504da59041e775a95db3ebc1a6211423593 Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Thu, 1 Aug 2024 13:40:24 +0300 Subject: [PATCH 051/991] phy: qcom: qmp-pcie: Fix X1E80100 PCIe Gen4 PHY initialisation Update the PCIe Gen4 PHY init sequence with the latest based on internal Qualcomm documentation. Fixes: 606060ce8fd0 ("phy: qcom-qmp-pcie: Add support for X1E80100 g3x2 and g4x2 PCIE") Signed-off-by: Abel Vesa Link: https://lore.kernel.org/r/20240801-x1e80100-phy-qmp-pcie-fix-config-v2-1-cdc0f22b4169@linaro.org Signed-off-by: Vinod Koul --- drivers/phy/qualcomm/phy-qcom-qmp-pcie.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-pcie.c b/drivers/phy/qualcomm/phy-qcom-qmp-pcie.c index 5b36cc7ac78bbc..06cd9787e70029 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp-pcie.c +++ b/drivers/phy/qualcomm/phy-qcom-qmp-pcie.c @@ -1245,8 +1245,8 @@ static const struct qmp_phy_init_tbl x1e80100_qmp_gen4x2_pcie_serdes_tbl[] = { static const struct qmp_phy_init_tbl x1e80100_qmp_gen4x2_pcie_ln_shrd_tbl[] = { QMP_PHY_INIT_CFG(QSERDES_V6_LN_SHRD_RXCLK_DIV2_CTRL, 0x01), QMP_PHY_INIT_CFG(QSERDES_V6_LN_SHRD_DFE_DAC_ENABLE1, 0x88), - QMP_PHY_INIT_CFG(QSERDES_V6_LN_SHRD_TX_ADAPT_POST_THRESH1, 0x00), - QMP_PHY_INIT_CFG(QSERDES_V6_LN_SHRD_TX_ADAPT_POST_THRESH2, 0x1f), + QMP_PHY_INIT_CFG(QSERDES_V6_LN_SHRD_TX_ADAPT_POST_THRESH1, 0x02), + QMP_PHY_INIT_CFG(QSERDES_V6_LN_SHRD_TX_ADAPT_POST_THRESH2, 0x0d), QMP_PHY_INIT_CFG(QSERDES_V6_LN_SHRD_RX_MODE_RATE_0_1_B0, 0xd4), QMP_PHY_INIT_CFG(QSERDES_V6_LN_SHRD_RX_MODE_RATE_0_1_B1, 0x12), QMP_PHY_INIT_CFG(QSERDES_V6_LN_SHRD_RX_MODE_RATE_0_1_B2, 0xdb), @@ -1263,6 +1263,7 @@ static const struct qmp_phy_init_tbl x1e80100_qmp_gen4x2_pcie_ln_shrd_tbl[] = { QMP_PHY_INIT_CFG(QSERDES_V6_LN_SHRD_RX_MARG_COARSE_THRESH4_RATE3, 0x1f), QMP_PHY_INIT_CFG(QSERDES_V6_LN_SHRD_RX_MARG_COARSE_THRESH5_RATE3, 0x1f), QMP_PHY_INIT_CFG(QSERDES_V6_LN_SHRD_RX_MARG_COARSE_THRESH6_RATE3, 0x1f), + QMP_PHY_INIT_CFG(QSERDES_V6_LN_SHRD_RX_SUMMER_CAL_SPD_MODE, 0x5b), }; static const struct qmp_phy_init_tbl x1e80100_qmp_gen4x2_pcie_tx_tbl[] = { @@ -1286,12 +1287,15 @@ static const struct qmp_phy_init_tbl x1e80100_qmp_gen4x2_pcie_rx_tbl[] = { QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_DFE_1, 0x01), QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_DFE_2, 0x01), QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_DFE_3, 0x45), - QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_VGA_CAL_MAN_VAL, 0x0b), + QMP_PHY_INIT_CFG_LANE(QSERDES_V6_20_RX_VGA_CAL_MAN_VAL, 0x0a, 1), + QMP_PHY_INIT_CFG_LANE(QSERDES_V6_20_RX_VGA_CAL_MAN_VAL, 0x0b, 2), + QMP_PHY_INIT_CFG(QSERDES_V6_20_VGA_CAL_CNTRL1, 0x00), QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_GM_CAL, 0x0d), QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_EQU_ADAPTOR_CNTRL4, 0x0b), QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_SIGDET_ENABLES, 0x1c), QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_PHPRE_CTRL, 0x20), - QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_DFE_CTLE_POST_CAL_OFFSET, 0x38), + QMP_PHY_INIT_CFG_LANE(QSERDES_V6_20_RX_DFE_CTLE_POST_CAL_OFFSET, 0x3a, 1), + QMP_PHY_INIT_CFG_LANE(QSERDES_V6_20_RX_DFE_CTLE_POST_CAL_OFFSET, 0x38, 2), QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_Q_PI_INTRINSIC_BIAS_RATE32, 0x39), QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_MODE_RATE2_B0, 0x14), QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_MODE_RATE2_B1, 0xb3), @@ -1307,6 +1311,7 @@ static const struct qmp_phy_init_tbl x1e80100_qmp_gen4x2_pcie_rx_tbl[] = { QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_MODE_RATE3_B4, 0x4b), QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_MODE_RATE3_B5, 0x76), QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_MODE_RATE3_B6, 0xff), + QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_TX_ADPT_CTRL, 0x10), }; static const struct qmp_phy_init_tbl x1e80100_qmp_gen4x2_pcie_pcs_tbl[] = { @@ -1314,6 +1319,8 @@ static const struct qmp_phy_init_tbl x1e80100_qmp_gen4x2_pcie_pcs_tbl[] = { QMP_PHY_INIT_CFG(QPHY_V6_20_PCS_RX_SIGDET_LVL, 0xcc), QMP_PHY_INIT_CFG(QPHY_V6_20_PCS_EQ_CONFIG4, 0x00), QMP_PHY_INIT_CFG(QPHY_V6_20_PCS_EQ_CONFIG5, 0x22), + QMP_PHY_INIT_CFG(QPHY_V6_20_PCS_TX_RX_CONFIG1, 0x04), + QMP_PHY_INIT_CFG(QPHY_V6_20_PCS_TX_RX_CONFIG2, 0x02), }; static const struct qmp_phy_init_tbl x1e80100_qmp_gen4x2_pcie_pcs_misc_tbl[] = { @@ -1324,11 +1331,13 @@ static const struct qmp_phy_init_tbl x1e80100_qmp_gen4x2_pcie_pcs_misc_tbl[] = { QMP_PHY_INIT_CFG(QPHY_PCIE_V6_20_PCS_G4_PRE_GAIN, 0x2e), QMP_PHY_INIT_CFG(QPHY_PCIE_V6_20_PCS_RX_MARGINING_CONFIG1, 0x03), QMP_PHY_INIT_CFG(QPHY_PCIE_V6_20_PCS_RX_MARGINING_CONFIG3, 0x28), + QMP_PHY_INIT_CFG(QPHY_PCIE_V6_20_PCS_G3_RXEQEVAL_TIME, 0x27), + QMP_PHY_INIT_CFG(QPHY_PCIE_V6_20_PCS_G4_RXEQEVAL_TIME, 0x27), QMP_PHY_INIT_CFG(QPHY_PCIE_V6_20_PCS_TX_RX_CONFIG, 0xc0), QMP_PHY_INIT_CFG(QPHY_PCIE_V6_20_PCS_POWER_STATE_CONFIG2, 0x1d), - QMP_PHY_INIT_CFG(QPHY_PCIE_V6_20_PCS_RX_MARGINING_CONFIG5, 0x0f), - QMP_PHY_INIT_CFG(QPHY_PCIE_V6_20_PCS_G3_FOM_EQ_CONFIG5, 0xf2), - QMP_PHY_INIT_CFG(QPHY_PCIE_V6_20_PCS_G4_FOM_EQ_CONFIG5, 0xf2), + QMP_PHY_INIT_CFG(QPHY_PCIE_V6_20_PCS_RX_MARGINING_CONFIG5, 0x18), + QMP_PHY_INIT_CFG(QPHY_PCIE_V6_20_PCS_G3_FOM_EQ_CONFIG5, 0x7a), + QMP_PHY_INIT_CFG(QPHY_PCIE_V6_20_PCS_G4_FOM_EQ_CONFIG5, 0x8a), }; static const struct qmp_phy_init_tbl sm8250_qmp_pcie_serdes_tbl[] = { From ce52c2532299c7ccfd34a52db8d071e890a78c59 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Thu, 1 Aug 2024 20:46:42 +0800 Subject: [PATCH 052/991] phy: fsl-imx8mq-usb: fix tuning parameter name According to fsl,imx8mq-usb-phy.yaml, this tuning parameter should be fsl,phy-pcs-tx-deemph-3p5db-attenuation-db. Fixes: 63c85ad0cd81 ("phy: fsl-imx8mp-usb: add support for phy tuning") Cc: stable@vger.kernel.org Signed-off-by: Xu Yang Reviewed-by: Alexander Stein Link: https://lore.kernel.org/r/20240801124642.1152838-1-xu.yang_2@nxp.com Signed-off-by: Vinod Koul --- drivers/phy/freescale/phy-fsl-imx8mq-usb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/phy/freescale/phy-fsl-imx8mq-usb.c b/drivers/phy/freescale/phy-fsl-imx8mq-usb.c index 0b9a59d5b8f023..adc6394626ce83 100644 --- a/drivers/phy/freescale/phy-fsl-imx8mq-usb.c +++ b/drivers/phy/freescale/phy-fsl-imx8mq-usb.c @@ -176,7 +176,7 @@ static void imx8m_get_phy_tuning_data(struct imx8mq_usb_phy *imx_phy) imx_phy->comp_dis_tune = phy_comp_dis_tune_from_property(imx_phy->comp_dis_tune); - if (device_property_read_u32(dev, "fsl,pcs-tx-deemph-3p5db-attenuation-db", + if (device_property_read_u32(dev, "fsl,phy-pcs-tx-deemph-3p5db-attenuation-db", &imx_phy->pcs_tx_deemph_3p5db)) imx_phy->pcs_tx_deemph_3p5db = PHY_TUNE_DEFAULT; else From 3a07703a523045cbdb0a5fa5e0902a9145ee43e9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 8 Jul 2024 12:04:33 -0500 Subject: [PATCH 053/991] phy: exynos5-usbdrd: fix error code in probe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Return negative -ENOMEM instead of positive ENOMEM. Fixes: 497ddafe915e ("phy: exynos5-usbdrd: convert Vbus supplies to regulator_bulk") Signed-off-by: Dan Carpenter Reviewed-by: Peter Griffin Reviewed-by: André Draszik Reviewed-by: Sam Protsenko Link: https://lore.kernel.org/r/a956a3e2-c6ce-4f07-ad80-ec8a96e00d16@stanley.mountain Signed-off-by: Vinod Koul --- drivers/phy/samsung/phy-exynos5-usbdrd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/phy/samsung/phy-exynos5-usbdrd.c b/drivers/phy/samsung/phy-exynos5-usbdrd.c index df52b78a120b47..9cbf9014295036 100644 --- a/drivers/phy/samsung/phy-exynos5-usbdrd.c +++ b/drivers/phy/samsung/phy-exynos5-usbdrd.c @@ -1745,7 +1745,7 @@ static int exynos5_usbdrd_phy_probe(struct platform_device *pdev) sizeof(*phy_drd->regulators), GFP_KERNEL); if (!phy_drd->regulators) - return ENOMEM; + return -ENOMEM; regulator_bulk_set_supply_names(phy_drd->regulators, drv_data->regulator_names, drv_data->n_regulators); From 70d76b0e85ad126358baec1b44f797e61e3ebecc Mon Sep 17 00:00:00 2001 From: Felix Kaechele Date: Sat, 3 Aug 2024 23:13:09 -0400 Subject: [PATCH 054/991] dt-bindings: input: touchscreen: edt-ft5x06: Document FT8201 support Document FocalTech FT8201 support by adding the compatible. Signed-off-by: Felix Kaechele Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240804031310.331871-2-felix@kaechele.ca Signed-off-by: Dmitry Torokhov --- .../devicetree/bindings/input/touchscreen/edt-ft5x06.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.yaml b/Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.yaml index 379721027bf84b..51d48d4130d380 100644 --- a/Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.yaml +++ b/Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.yaml @@ -42,6 +42,7 @@ properties: - focaltech,ft5426 - focaltech,ft5452 - focaltech,ft6236 + - focaltech,ft8201 - focaltech,ft8719 reg: From fc289d3e8698f9b11edad6d73f371ebf35944c57 Mon Sep 17 00:00:00 2001 From: Felix Kaechele Date: Sat, 3 Aug 2024 23:13:10 -0400 Subject: [PATCH 055/991] Input: edt-ft5x06 - add support for FocalTech FT8201 The driver supports the FT8201 chip as well. It registers up to 10 touch points. Tested on: Lenovo ThinkSmart View (CD-18781Y), LCM: BOE TV080WXM-LL4 Signed-off-by: Felix Kaechele Link: https://lore.kernel.org/r/20240804031310.331871-3-felix@kaechele.ca Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/edt-ft5x06.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/input/touchscreen/edt-ft5x06.c b/drivers/input/touchscreen/edt-ft5x06.c index 42f99e57fbb795..e70415f189a556 100644 --- a/drivers/input/touchscreen/edt-ft5x06.c +++ b/drivers/input/touchscreen/edt-ft5x06.c @@ -1474,6 +1474,10 @@ static const struct edt_i2c_chip_data edt_ft6236_data = { .max_support_points = 2, }; +static const struct edt_i2c_chip_data edt_ft8201_data = { + .max_support_points = 10, +}; + static const struct edt_i2c_chip_data edt_ft8719_data = { .max_support_points = 10, }; @@ -1485,6 +1489,7 @@ static const struct i2c_device_id edt_ft5x06_ts_id[] = { { .name = "ft5452", .driver_data = (long)&edt_ft5452_data }, /* Note no edt- prefix for compatibility with the ft6236.c driver */ { .name = "ft6236", .driver_data = (long)&edt_ft6236_data }, + { .name = "ft8201", .driver_data = (long)&edt_ft8201_data }, { .name = "ft8719", .driver_data = (long)&edt_ft8719_data }, { /* sentinel */ } }; @@ -1500,6 +1505,7 @@ static const struct of_device_id edt_ft5x06_of_match[] = { { .compatible = "focaltech,ft5452", .data = &edt_ft5452_data }, /* Note focaltech vendor prefix for compatibility with ft6236.c */ { .compatible = "focaltech,ft6236", .data = &edt_ft6236_data }, + { .compatible = "focaltech,ft8201", .data = &edt_ft8201_data }, { .compatible = "focaltech,ft8719", .data = &edt_ft8719_data }, { /* sentinel */ } }; From 206f533a0a7c683982af473079c4111f4a0f9f5e Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sun, 4 Aug 2024 17:50:25 -0700 Subject: [PATCH 056/991] Input: uinput - reject requests with unreasonable number of slots From: Dmitry Torokhov When exercising uinput interface syzkaller may try setting up device with a really large number of slots, which causes memory allocation failure in input_mt_init_slots(). While this allocation failure is handled properly and request is rejected, it results in syzkaller reports. Additionally, such request may put undue burden on the system which will try to free a lot of memory for a bogus request. Fix it by limiting allowed number of slots to 100. This can easily be extended if we see devices that can track more than 100 contacts. Reported-by: Tetsuo Handa Reported-by: syzbot Closes: https://syzkaller.appspot.com/bug?extid=0122fa359a69694395d5 Link: https://lore.kernel.org/r/Zqgi7NYEbpRsJfa2@google.com Signed-off-by: Dmitry Torokhov --- drivers/input/misc/uinput.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/input/misc/uinput.c b/drivers/input/misc/uinput.c index d23f3225b00ff7..445856c9127aaa 100644 --- a/drivers/input/misc/uinput.c +++ b/drivers/input/misc/uinput.c @@ -417,6 +417,20 @@ static int uinput_validate_absinfo(struct input_dev *dev, unsigned int code, return -EINVAL; } + /* + * Limit number of contacts to a reasonable value (100). This + * ensures that we need less than 2 pages for struct input_mt + * (we are not using in-kernel slot assignment so not going to + * allocate memory for the "red" table), and we should have no + * trouble getting this much memory. + */ + if (code == ABS_MT_SLOT && max > 99) { + printk(KERN_DEBUG + "%s: unreasonably large number of slots requested: %d\n", + UINPUT_NAME, max); + return -EINVAL; + } + return 0; } From b7fd10333713e9984cc9b9c04f3681f80efdc809 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Thu, 11 Jul 2024 11:37:57 +0200 Subject: [PATCH 057/991] pinctrl: qcom: x1e80100: Update PDC hwirq map The current map seems to be out of sync (and includes a duplicate entry for GPIO193..). Replace it with the map present in shipping devices' ACPI tables. This new one seems more complete, as it e.g. contains GPIO145 (PCIE6a WAKE#) Fixes: 05e4941d97ef ("pinctrl: qcom: Add X1E80100 pinctrl driver") Signed-off-by: Konrad Dybcio Reviewed-by: Abel Vesa Reviewed-by: Rajendra Nayak Link: https://lore.kernel.org/20240711-topic-x1e_pdc_tlmm-v1-1-e278b249d793@linaro.org Signed-off-by: Linus Walleij --- drivers/pinctrl/qcom/pinctrl-x1e80100.c | 27 ++++++++++++++----------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/drivers/pinctrl/qcom/pinctrl-x1e80100.c b/drivers/pinctrl/qcom/pinctrl-x1e80100.c index e30e9384035747..6cd4d10e6fd6fa 100644 --- a/drivers/pinctrl/qcom/pinctrl-x1e80100.c +++ b/drivers/pinctrl/qcom/pinctrl-x1e80100.c @@ -1813,18 +1813,21 @@ static const struct msm_pingroup x1e80100_groups[] = { static const struct msm_gpio_wakeirq_map x1e80100_pdc_map[] = { { 0, 72 }, { 2, 70 }, { 3, 71 }, { 6, 123 }, { 7, 67 }, { 11, 85 }, - { 15, 68 }, { 18, 122 }, { 19, 69 }, { 21, 158 }, { 23, 143 }, { 26, 129 }, - { 27, 144 }, { 28, 77 }, { 29, 78 }, { 30, 92 }, { 32, 145 }, { 33, 115 }, - { 34, 130 }, { 35, 146 }, { 36, 147 }, { 39, 80 }, { 43, 148 }, { 47, 149 }, - { 51, 79 }, { 53, 89 }, { 59, 87 }, { 64, 90 }, { 65, 106 }, { 66, 142 }, - { 67, 88 }, { 71, 91 }, { 75, 152 }, { 79, 153 }, { 80, 125 }, { 81, 128 }, - { 84, 137 }, { 85, 155 }, { 87, 156 }, { 91, 157 }, { 92, 138 }, { 94, 140 }, - { 95, 141 }, { 113, 84 }, { 121, 73 }, { 123, 74 }, { 129, 76 }, { 131, 82 }, - { 134, 83 }, { 141, 93 }, { 144, 94 }, { 147, 96 }, { 148, 97 }, { 150, 102 }, - { 151, 103 }, { 153, 104 }, { 156, 105 }, { 157, 107 }, { 163, 98 }, { 166, 112 }, - { 172, 99 }, { 181, 101 }, { 184, 116 }, { 193, 40 }, { 193, 117 }, { 196, 108 }, - { 203, 133 }, { 212, 120 }, { 213, 150 }, { 214, 121 }, { 215, 118 }, { 217, 109 }, - { 220, 110 }, { 221, 111 }, { 222, 124 }, { 224, 131 }, { 225, 132 }, + { 13, 86 }, { 15, 68 }, { 18, 122 }, { 19, 69 }, { 21, 158 }, { 23, 143 }, + { 24, 126 }, { 26, 129 }, { 27, 144 }, { 28, 77 }, { 29, 78 }, { 30, 92 }, + { 31, 159 }, { 32, 145 }, { 33, 115 }, { 34, 130 }, { 35, 146 }, { 36, 147 }, + { 38, 113 }, { 39, 80 }, { 43, 148 }, { 47, 149 }, { 51, 79 }, { 53, 89 }, + { 55, 81 }, { 59, 87 }, { 64, 90 }, { 65, 106 }, { 66, 142 }, { 67, 88 }, + { 68, 151 }, { 71, 91 }, { 75, 152 }, { 79, 153 }, { 80, 125 }, { 81, 128 }, + { 83, 154 }, { 84, 137 }, { 85, 155 }, { 87, 156 }, { 91, 157 }, { 92, 138 }, + { 93, 139 }, { 94, 140 }, { 95, 141 }, { 113, 84 }, { 121, 73 }, { 123, 74 }, + { 125, 75 }, { 129, 76 }, { 131, 82 }, { 134, 83 }, { 141, 93 }, { 144, 94 }, + { 145, 95 }, { 147, 96 }, { 148, 97 }, { 150, 102 }, { 151, 103 }, { 153, 104 }, + { 154, 100 }, { 156, 105 }, { 157, 107 }, { 163, 98 }, { 166, 112 }, { 172, 99 }, + { 175, 114 }, { 181, 101 }, { 184, 116 }, { 193, 117 }, { 196, 108 }, { 203, 133 }, + { 208, 134 }, { 212, 120 }, { 213, 150 }, { 214, 121 }, { 215, 118 }, { 217, 109 }, + { 219, 119 }, { 220, 110 }, { 221, 111 }, { 222, 124 }, { 224, 131 }, { 225, 132 }, + { 228, 135 }, { 230, 136 }, { 232, 162 }, }; static const struct msm_pinctrl_soc_data x1e80100_pinctrl = { From 203ed203fcc223d80737a7799f8244646363b739 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 2 Jul 2024 16:54:17 +0200 Subject: [PATCH 058/991] arm64: dts: layerscape: fix thermal node names length Linux kernel expects thermal zone node names to be maximum of 19 characters (see THERMAL_NAME_LENGTH, including terminating NUL byte) and bindings/dtbs_check points that: fsl-ls2088a-rdb.dtb: thermal-zones: 'core-cluster1-thermal', 'core-cluster2-thermal', 'core-cluster3-thermal', 'core-cluster4-thermal' do not match any of the regexes: '^[a-zA-Z][a-zA-Z0-9\\-]{1,10}-thermal$', 'pinctrl-[0-9]+' Name longer than 19 characters leads to driver probe errors when registering such thermal zone. Signed-off-by: Krzysztof Kozlowski Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi | 2 +- arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi | 2 +- arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi | 2 +- arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi | 2 +- arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi | 8 ++++---- arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi index 6b6e3ee950e538..acf293310f7a09 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi @@ -175,7 +175,7 @@ }; }; - core-cluster-thermal { + cluster-thermal { polling-delay-passive = <1000>; polling-delay = <5000>; thermal-sensors = <&tmu 1>; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi index 17f4e317112095..ab4c919e3e1659 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi @@ -214,7 +214,7 @@ }; }; - core-cluster-thermal { + cluster-thermal { polling-delay-passive = <1000>; polling-delay = <5000>; thermal-sensors = <&tmu 3>; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi index 200e52622f9981..55019866d6a25b 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi @@ -182,7 +182,7 @@ }; }; - core-cluster-thermal { + cluster-thermal { polling-delay-passive = <1000>; polling-delay = <5000>; thermal-sensors = <&tmu 3>; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi index 8ce4b6aae79d47..e3a7db21fe29a3 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi @@ -131,7 +131,7 @@ }; thermal-zones { - core-cluster-thermal { + cluster-thermal { polling-delay-passive = <1000>; polling-delay = <5000>; thermal-sensors = <&tmu 0>; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi index bde89de2576e16..1b306d6802ce3d 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi @@ -122,7 +122,7 @@ }; }; - core-cluster1-thermal { + cluster1-thermal { polling-delay-passive = <1000>; polling-delay = <5000>; thermal-sensors = <&tmu 4>; @@ -151,7 +151,7 @@ }; }; - core-cluster2-thermal { + cluster2-thermal { polling-delay-passive = <1000>; polling-delay = <5000>; thermal-sensors = <&tmu 5>; @@ -180,7 +180,7 @@ }; }; - core-cluster3-thermal { + cluster3-thermal { polling-delay-passive = <1000>; polling-delay = <5000>; thermal-sensors = <&tmu 6>; @@ -209,7 +209,7 @@ }; }; - core-cluster4-thermal { + cluster4-thermal { polling-delay-passive = <1000>; polling-delay = <5000>; thermal-sensors = <&tmu 7>; diff --git a/arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi b/arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi index 26c7ca31e22e7b..bd75a658767ddf 100644 --- a/arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi @@ -492,7 +492,7 @@ }; }; - ddr-cluster5-thermal { + ddr-ctrl5-thermal { polling-delay-passive = <1000>; polling-delay = <5000>; thermal-sensors = <&tmu 1>; From 38055789d15155109b41602ad719d770af507030 Mon Sep 17 00:00:00 2001 From: Baochen Qiang Date: Thu, 1 Aug 2024 18:04:07 +0300 Subject: [PATCH 059/991] wifi: ath12k: use 128 bytes aligned iova in transmit path for WCN7850 In transmit path, it is likely that the iova is not aligned to PCIe TLP max payload size, which is 128 for WCN7850. Normally in such cases hardware is expected to split the packet into several parts in a manner such that they, other than the first one, have aligned iova. However due to hardware limitations, WCN7850 does not behave like that properly with some specific unaligned iova in transmit path. This easily results in target hang in a KPI transmit test: packet send/receive failure, WMI command send timeout etc. Also fatal error seen in PCIe level: ... Capabilities: ... ... DevSta: ... FatalErr+ ... ... ... Work around this by manually moving/reallocating payload buffer such that we can map it to a 128 bytes aligned iova. The moving requires sufficient head room or tail room in skb: for the former we can do ourselves a favor by asking some extra bytes when registering with mac80211, while for the latter we can do nothing. Moving/reallocating buffer consumes additional CPU cycles, but the good news is that an aligned iova increases PCIe efficiency. In my tests on some X86 platforms the KPI results are almost consistent. Since this is seen only with WCN7850, add a new hardware parameter to differentiate from others. Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Baochen Qiang Cc: Tested-by: Mark Pearson Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20240715023814.20242-1-quic_bqiang@quicinc.com --- drivers/net/wireless/ath/ath12k/dp_tx.c | 72 +++++++++++++++++++++++++ drivers/net/wireless/ath/ath12k/hw.c | 6 +++ drivers/net/wireless/ath/ath12k/hw.h | 4 ++ drivers/net/wireless/ath/ath12k/mac.c | 1 + 4 files changed, 83 insertions(+) diff --git a/drivers/net/wireless/ath/ath12k/dp_tx.c b/drivers/net/wireless/ath/ath12k/dp_tx.c index d08c04343e9007..44406e0b4a342f 100644 --- a/drivers/net/wireless/ath/ath12k/dp_tx.c +++ b/drivers/net/wireless/ath/ath12k/dp_tx.c @@ -162,6 +162,60 @@ static int ath12k_dp_prepare_htt_metadata(struct sk_buff *skb) return 0; } +static void ath12k_dp_tx_move_payload(struct sk_buff *skb, + unsigned long delta, + bool head) +{ + unsigned long len = skb->len; + + if (head) { + skb_push(skb, delta); + memmove(skb->data, skb->data + delta, len); + skb_trim(skb, len); + } else { + skb_put(skb, delta); + memmove(skb->data + delta, skb->data, len); + skb_pull(skb, delta); + } +} + +static int ath12k_dp_tx_align_payload(struct ath12k_base *ab, + struct sk_buff **pskb) +{ + u32 iova_mask = ab->hw_params->iova_mask; + unsigned long offset, delta1, delta2; + struct sk_buff *skb2, *skb = *pskb; + unsigned int headroom = skb_headroom(skb); + int tailroom = skb_tailroom(skb); + int ret = 0; + + offset = (unsigned long)skb->data & iova_mask; + delta1 = offset; + delta2 = iova_mask - offset + 1; + + if (headroom >= delta1) { + ath12k_dp_tx_move_payload(skb, delta1, true); + } else if (tailroom >= delta2) { + ath12k_dp_tx_move_payload(skb, delta2, false); + } else { + skb2 = skb_realloc_headroom(skb, iova_mask); + if (!skb2) { + ret = -ENOMEM; + goto out; + } + + dev_kfree_skb_any(skb); + + offset = (unsigned long)skb2->data & iova_mask; + if (offset) + ath12k_dp_tx_move_payload(skb2, offset, true); + *pskb = skb2; + } + +out: + return ret; +} + int ath12k_dp_tx(struct ath12k *ar, struct ath12k_vif *arvif, struct sk_buff *skb) { @@ -184,6 +238,7 @@ int ath12k_dp_tx(struct ath12k *ar, struct ath12k_vif *arvif, bool tcl_ring_retry; bool msdu_ext_desc = false; bool add_htt_metadata = false; + u32 iova_mask = ab->hw_params->iova_mask; if (test_bit(ATH12K_FLAG_CRASH_FLUSH, &ar->ab->dev_flags)) return -ESHUTDOWN; @@ -279,6 +334,23 @@ int ath12k_dp_tx(struct ath12k *ar, struct ath12k_vif *arvif, goto fail_remove_tx_buf; } + if (iova_mask && + (unsigned long)skb->data & iova_mask) { + ret = ath12k_dp_tx_align_payload(ab, &skb); + if (ret) { + ath12k_warn(ab, "failed to align TX buffer %d\n", ret); + /* don't bail out, give original buffer + * a chance even unaligned. + */ + goto map; + } + + /* hdr is pointing to a wrong place after alignment, + * so refresh it for later use. + */ + hdr = (void *)skb->data; + } +map: ti.paddr = dma_map_single(ab->dev, skb->data, skb->len, DMA_TO_DEVICE); if (dma_mapping_error(ab->dev, ti.paddr)) { atomic_inc(&ab->soc_stats.tx_err.misc_fail); diff --git a/drivers/net/wireless/ath/ath12k/hw.c b/drivers/net/wireless/ath/ath12k/hw.c index 2e11ea7635740c..7b0b6a7f4701ab 100644 --- a/drivers/net/wireless/ath/ath12k/hw.c +++ b/drivers/net/wireless/ath/ath12k/hw.c @@ -924,6 +924,8 @@ static const struct ath12k_hw_params ath12k_hw_params[] = { .acpi_guid = NULL, .supports_dynamic_smps_6ghz = true, + + .iova_mask = 0, }, { .name = "wcn7850 hw2.0", @@ -1000,6 +1002,8 @@ static const struct ath12k_hw_params ath12k_hw_params[] = { .acpi_guid = &wcn7850_uuid, .supports_dynamic_smps_6ghz = false, + + .iova_mask = ATH12K_PCIE_MAX_PAYLOAD_SIZE - 1, }, { .name = "qcn9274 hw2.0", @@ -1072,6 +1076,8 @@ static const struct ath12k_hw_params ath12k_hw_params[] = { .acpi_guid = NULL, .supports_dynamic_smps_6ghz = true, + + .iova_mask = 0, }, }; diff --git a/drivers/net/wireless/ath/ath12k/hw.h b/drivers/net/wireless/ath/ath12k/hw.h index e792eb6b249b41..b1d302c48326b8 100644 --- a/drivers/net/wireless/ath/ath12k/hw.h +++ b/drivers/net/wireless/ath/ath12k/hw.h @@ -96,6 +96,8 @@ #define ATH12K_M3_FILE "m3.bin" #define ATH12K_REGDB_FILE_NAME "regdb.bin" +#define ATH12K_PCIE_MAX_PAYLOAD_SIZE 128 + enum ath12k_hw_rate_cck { ATH12K_HW_RATE_CCK_LP_11M = 0, ATH12K_HW_RATE_CCK_LP_5_5M, @@ -215,6 +217,8 @@ struct ath12k_hw_params { const guid_t *acpi_guid; bool supports_dynamic_smps_6ghz; + + u32 iova_mask; }; struct ath12k_hw_ops { diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index 8106297f0bc1c7..ce41c8153080cd 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -9193,6 +9193,7 @@ static int ath12k_mac_hw_register(struct ath12k_hw *ah) hw->vif_data_size = sizeof(struct ath12k_vif); hw->sta_data_size = sizeof(struct ath12k_sta); + hw->extra_tx_headroom = ab->hw_params->iova_mask; wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_CQM_RSSI_LIST); wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_STA_TX_PWR); From 50359c9c3cb3e55e840e3485f5ee37da5b2b16b6 Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Wed, 17 Jul 2024 10:03:33 +0200 Subject: [PATCH 060/991] pmdomain: imx: scu-pd: Remove duplicated clocks These clocks are already added to the list. Remove the duplicates ones. Fixes: a67d780720ff ("genpd: imx: scu-pd: add more PDs") Signed-off-by: Alexander Stein Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240717080334.2210988-1-alexander.stein@ew.tq-group.com Signed-off-by: Ulf Hansson --- drivers/pmdomain/imx/scu-pd.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/pmdomain/imx/scu-pd.c b/drivers/pmdomain/imx/scu-pd.c index 05841b0bf7f30d..01d465d88f60dc 100644 --- a/drivers/pmdomain/imx/scu-pd.c +++ b/drivers/pmdomain/imx/scu-pd.c @@ -223,11 +223,6 @@ static const struct imx_sc_pd_range imx8qxp_scu_pd_ranges[] = { { "lvds1-pwm", IMX_SC_R_LVDS_1_PWM_0, 1, false, 0 }, { "lvds1-lpi2c", IMX_SC_R_LVDS_1_I2C_0, 2, true, 0 }, - { "mipi1", IMX_SC_R_MIPI_1, 1, 0 }, - { "mipi1-pwm0", IMX_SC_R_MIPI_1_PWM_0, 1, 0 }, - { "mipi1-i2c", IMX_SC_R_MIPI_1_I2C_0, 2, 1 }, - { "lvds1", IMX_SC_R_LVDS_1, 1, 0 }, - /* DC SS */ { "dc0", IMX_SC_R_DC_0, 1, false, 0 }, { "dc0-pll", IMX_SC_R_DC_0_PLL_0, 2, true, 0 }, From 042b8711a0beafb2c3b888bebe3c300ab4c817fa Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Thu, 18 Jul 2024 10:24:10 +0200 Subject: [PATCH 061/991] drm/mediatek: Set sensible cursor width/height values to fix crash Hardware-speaking, there is no feature-reduced cursor specific plane, so this driver reserves the last all Overlay plane as a Cursor plane, but sets the maximum cursor width/height to the maximum value that the full overlay plane can use. While this could be ok, it raises issues with common userspace using libdrm (especially Mutter, but other compositors too) which will crash upon performing allocations and/or using said cursor plane. Reduce the maximum width/height for the cursor to 512x512 pixels, value taken from IGT's maximum cursor size test, which succeeds. Signed-off-by: AngeloGioacchino Del Regno Reviewed-by: Fei Shao Tested-by: Fei Shao Reviewed-by: Daniel Stone Reviewed-by: CK Hu Link: https://patchwork.kernel.org/project/dri-devel/patch/20240718082410.204459-1-angelogioacchino.delregno@collabora.com/ Signed-off-by: Chun-Kuang Hu --- drivers/gpu/drm/mediatek/mtk_drm_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/mediatek/mtk_drm_drv.c b/drivers/gpu/drm/mediatek/mtk_drm_drv.c index ae5c6ec24a1e61..77b50c56c124ce 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_drv.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_drv.c @@ -539,8 +539,8 @@ static int mtk_drm_kms_init(struct drm_device *drm) } /* IGT will check if the cursor size is configured */ - drm->mode_config.cursor_width = drm->mode_config.max_width; - drm->mode_config.cursor_height = drm->mode_config.max_height; + drm->mode_config.cursor_width = 512; + drm->mode_config.cursor_height = 512; /* Use OVL device for all DMA memory allocations */ crtc = drm_crtc_from_index(drm, 0); From 5af9b304bc6010723c02f74de0bfd24ff19b1a10 Mon Sep 17 00:00:00 2001 From: Piyush Mehta Date: Mon, 5 Aug 2024 11:29:07 +0530 Subject: [PATCH 062/991] phy: xilinx: phy-zynqmp: Fix SGMII linkup failure on resume On a few Kria KR260 Robotics Starter Kit the PS-GEM SGMII linkup is not happening after the resume. This is because serdes registers are reset when FPD is off (in suspend state) and needs to be reprogrammed in the resume path with the same default initialization as done in the first stage bootloader psu_init routine. To address the failure introduce a set of serdes registers to be saved in the suspend path and then restore it on resume. Fixes: 4a33bea00314 ("phy: zynqmp: Add PHY driver for the Xilinx ZynqMP Gigabit Transceiver") Signed-off-by: Piyush Mehta Signed-off-by: Radhey Shyam Pandey Link: https://lore.kernel.org/r/1722837547-2578381-1-git-send-email-radhey.shyam.pandey@amd.com Signed-off-by: Vinod Koul --- drivers/phy/xilinx/phy-zynqmp.c | 56 +++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/drivers/phy/xilinx/phy-zynqmp.c b/drivers/phy/xilinx/phy-zynqmp.c index cb15041371c90b..e6579002f11463 100644 --- a/drivers/phy/xilinx/phy-zynqmp.c +++ b/drivers/phy/xilinx/phy-zynqmp.c @@ -160,6 +160,24 @@ static const char *const xpsgtr_icm_str[] = { /* Timeout values */ #define TIMEOUT_US 1000 +/* Lane 0/1/2/3 offset */ +#define DIG_8(n) ((0x4000 * (n)) + 0x1074) +#define ILL13(n) ((0x4000 * (n)) + 0x1994) +#define DIG_10(n) ((0x4000 * (n)) + 0x107c) +#define RST_DLY(n) ((0x4000 * (n)) + 0x19a4) +#define BYP_15(n) ((0x4000 * (n)) + 0x1038) +#define BYP_12(n) ((0x4000 * (n)) + 0x102c) +#define MISC3(n) ((0x4000 * (n)) + 0x19ac) +#define EQ11(n) ((0x4000 * (n)) + 0x1978) + +static u32 save_reg_address[] = { + /* Lane 0/1/2/3 Register */ + DIG_8(0), ILL13(0), DIG_10(0), RST_DLY(0), BYP_15(0), BYP_12(0), MISC3(0), EQ11(0), + DIG_8(1), ILL13(1), DIG_10(1), RST_DLY(1), BYP_15(1), BYP_12(1), MISC3(1), EQ11(1), + DIG_8(2), ILL13(2), DIG_10(2), RST_DLY(2), BYP_15(2), BYP_12(2), MISC3(2), EQ11(2), + DIG_8(3), ILL13(3), DIG_10(3), RST_DLY(3), BYP_15(3), BYP_12(3), MISC3(3), EQ11(3), +}; + struct xpsgtr_dev; /** @@ -209,6 +227,7 @@ struct xpsgtr_phy { * @tx_term_fix: fix for GT issue * @saved_icm_cfg0: stored value of ICM CFG0 register * @saved_icm_cfg1: stored value of ICM CFG1 register + * @saved_regs: registers to be saved/restored during suspend/resume */ struct xpsgtr_dev { struct device *dev; @@ -221,6 +240,7 @@ struct xpsgtr_dev { bool tx_term_fix; unsigned int saved_icm_cfg0; unsigned int saved_icm_cfg1; + u32 *saved_regs; }; /* @@ -294,6 +314,32 @@ static inline void xpsgtr_clr_set_phy(struct xpsgtr_phy *gtr_phy, writel((readl(addr) & ~clr) | set, addr); } +/** + * xpsgtr_save_lane_regs - Saves registers on suspend + * @gtr_dev: pointer to phy controller context structure + */ +static void xpsgtr_save_lane_regs(struct xpsgtr_dev *gtr_dev) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(save_reg_address); i++) + gtr_dev->saved_regs[i] = xpsgtr_read(gtr_dev, + save_reg_address[i]); +} + +/** + * xpsgtr_restore_lane_regs - Restores registers on resume + * @gtr_dev: pointer to phy controller context structure + */ +static void xpsgtr_restore_lane_regs(struct xpsgtr_dev *gtr_dev) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(save_reg_address); i++) + xpsgtr_write(gtr_dev, save_reg_address[i], + gtr_dev->saved_regs[i]); +} + /* * Hardware Configuration */ @@ -837,6 +883,8 @@ static int xpsgtr_runtime_suspend(struct device *dev) gtr_dev->saved_icm_cfg0 = xpsgtr_read(gtr_dev, ICM_CFG0); gtr_dev->saved_icm_cfg1 = xpsgtr_read(gtr_dev, ICM_CFG1); + xpsgtr_save_lane_regs(gtr_dev); + return 0; } @@ -847,6 +895,8 @@ static int xpsgtr_runtime_resume(struct device *dev) unsigned int i; bool skip_phy_init; + xpsgtr_restore_lane_regs(gtr_dev); + icm_cfg0 = xpsgtr_read(gtr_dev, ICM_CFG0); icm_cfg1 = xpsgtr_read(gtr_dev, ICM_CFG1); @@ -994,6 +1044,12 @@ static int xpsgtr_probe(struct platform_device *pdev) return ret; } + gtr_dev->saved_regs = devm_kmalloc(gtr_dev->dev, + sizeof(save_reg_address), + GFP_KERNEL); + if (!gtr_dev->saved_regs) + return -ENOMEM; + return 0; } From b336268dde75cb09bd795cb24893d52152a9191f Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 2 Aug 2024 10:50:46 +0300 Subject: [PATCH 063/991] dmaengine: dw: Add peripheral bus width verification Currently the src_addr_width and dst_addr_width fields of the dma_slave_config structure are mapped to the CTLx.SRC_TR_WIDTH and CTLx.DST_TR_WIDTH fields of the peripheral bus side in order to have the properly aligned data passed to the target device. It's done just by converting the passed peripheral bus width to the encoded value using the __ffs() function. This implementation has several problematic sides: 1. __ffs() is undefined if no bit exist in the passed value. Thus if the specified addr-width is DMA_SLAVE_BUSWIDTH_UNDEFINED, __ffs() may return unexpected value depending on the platform-specific implementation. 2. DW AHB DMA-engine permits having the power-of-2 transfer width limited by the DMAH_Mk_HDATA_WIDTH IP-core synthesize parameter. Specifying bus-width out of that constraints scope will definitely cause unexpected result since the destination reg will be only partly touched than the client driver implied. Let's fix all of that by adding the peripheral bus width verification method and calling it in dwc_config() which is supposed to be executed before preparing any transfer. The new method will make sure that the passed source or destination address width is valid and if undefined then the driver will just fallback to the 1-byte width transfer. Fixes: 029a40e97d0d ("dmaengine: dw: provide DMA capabilities") Signed-off-by: Serge Semin Acked-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240802075100.6475-2-fancer.lancer@gmail.com Signed-off-by: Vinod Koul --- drivers/dma/dw/core.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c index 5f7d690e3dbae8..11e269a31a0929 100644 --- a/drivers/dma/dw/core.c +++ b/drivers/dma/dw/core.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -780,10 +781,43 @@ bool dw_dma_filter(struct dma_chan *chan, void *param) } EXPORT_SYMBOL_GPL(dw_dma_filter); +static int dwc_verify_p_buswidth(struct dma_chan *chan) +{ + struct dw_dma_chan *dwc = to_dw_dma_chan(chan); + struct dw_dma *dw = to_dw_dma(chan->device); + u32 reg_width, max_width; + + if (dwc->dma_sconfig.direction == DMA_MEM_TO_DEV) + reg_width = dwc->dma_sconfig.dst_addr_width; + else if (dwc->dma_sconfig.direction == DMA_DEV_TO_MEM) + reg_width = dwc->dma_sconfig.src_addr_width; + else /* DMA_MEM_TO_MEM */ + return 0; + + max_width = dw->pdata->data_width[dwc->dws.p_master]; + + /* Fall-back to 1-byte transfer width if undefined */ + if (reg_width == DMA_SLAVE_BUSWIDTH_UNDEFINED) + reg_width = DMA_SLAVE_BUSWIDTH_1_BYTE; + else if (!is_power_of_2(reg_width) || reg_width > max_width) + return -EINVAL; + else /* bus width is valid */ + return 0; + + /* Update undefined addr width value */ + if (dwc->dma_sconfig.direction == DMA_MEM_TO_DEV) + dwc->dma_sconfig.dst_addr_width = reg_width; + else /* DMA_DEV_TO_MEM */ + dwc->dma_sconfig.src_addr_width = reg_width; + + return 0; +} + static int dwc_config(struct dma_chan *chan, struct dma_slave_config *sconfig) { struct dw_dma_chan *dwc = to_dw_dma_chan(chan); struct dw_dma *dw = to_dw_dma(chan->device); + int ret; memcpy(&dwc->dma_sconfig, sconfig, sizeof(*sconfig)); @@ -792,6 +826,10 @@ static int dwc_config(struct dma_chan *chan, struct dma_slave_config *sconfig) dwc->dma_sconfig.dst_maxburst = clamp(dwc->dma_sconfig.dst_maxburst, 0U, dwc->max_burst); + ret = dwc_verify_p_buswidth(chan); + if (ret) + return ret; + dw->encode_maxburst(dwc, &dwc->dma_sconfig.src_maxburst); dw->encode_maxburst(dwc, &dwc->dma_sconfig.dst_maxburst); From d04b21bfa1c50a2ade4816cab6fdc91827b346b1 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 2 Aug 2024 10:50:47 +0300 Subject: [PATCH 064/991] dmaengine: dw: Add memory bus width verification Currently in case of the DEV_TO_MEM or MEM_TO_DEV DMA transfers the memory data width (single transfer width) is determined based on the buffer length, buffer base address or DMA master-channel max address width capability. It isn't enough in case of the channel disabling prior the block transfer is finished. Here is what DW AHB DMA IP-core databook says regarding the port suspension (DMA-transfer pause) implementation in the controller: "When CTLx.SRC_TR_WIDTH < CTLx.DST_TR_WIDTH and the CFGx.CH_SUSP bit is high, the CFGx.FIFO_EMPTY is asserted once the contents of the FIFO do not permit a single word of CTLx.DST_TR_WIDTH to be formed. However, there may still be data in the channel FIFO, but not enough to form a single transfer of CTLx.DST_TR_WIDTH. In this scenario, once the channel is disabled, the remaining data in the channel FIFO is not transferred to the destination peripheral." So in case if the port gets to be suspended and then disabled it's possible to have the data silently discarded even though the controller reported that FIFO is empty and the CTLx.BLOCK_TS indicated the dropped data already received from the source device. This looks as if the data somehow got lost on a way from the peripheral device to memory and causes problems for instance in the DW APB UART driver, which pauses and disables the DMA-transfer as soon as the recv data timeout happens. Here is the way it looks: Memory <------- DMA FIFO <------ UART FIFO <---------------- UART DST_TR_WIDTH -+--------| | | | | | | No more data Current lvl -+--------| |---------+- DMA-burst lvl | | |---------+- Leftover data | | |---------+- SRC_TR_WIDTH -+--------+-------+---------+ In the example above: no more data is getting received over the UART port and BLOCK_TS is not even close to be fully received; some data is left in the UART FIFO, but not enough to perform a bursted DMA-xfer to the DMA FIFO; some data is left in the DMA FIFO, but not enough to be passed further to the system memory in a single transfer. In this situation the 8250 UART driver catches the recv timeout interrupt, pauses the DMA-transfer and terminates it completely, after which the IRQ handler manually fetches the leftover data from the UART FIFO into the recv-buffer. But since the DMA-channel has been disabled with the data left in the DMA FIFO, that data will be just discarded and the recv-buffer will have a gap of the "current lvl" size in the recv-buffer at the tail of the lately received data portion. So the data will be lost just due to the misconfigured DMA transfer. Note this is only relevant for the case of the transfer suspension and _disabling_. No problem will happen if the transfer will be re-enabled afterwards or the block transfer is fully completed. In the later case the "FIFO flush mode" will be executed at the transfer final stage in order to push out the data left in the DMA FIFO. In order to fix the denoted problem the DW AHB DMA-engine driver needs to make sure that the _bursted_ source transfer width is greater or equal to the single destination transfer (note the HW databook describes more strict constraint than actually required). Since the peripheral-device side is prescribed by the client driver logic, the memory-side can be only used for that. The solution can be easily implemented for the DEV_TO_MEM transfers just by adjusting the memory-channel address width. Sadly it's not that easy for the MEM_TO_DEV transfers since the mem-to-dma burst size is normally dynamically determined by the controller. So the only thing that can be done is to make sure that memory-side address width is greater than the peripheral device address width. Fixes: a09820043c9e ("dw_dmac: autoconfigure data_width or get it via platform data") Signed-off-by: Serge Semin Acked-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240802075100.6475-3-fancer.lancer@gmail.com Signed-off-by: Vinod Koul --- drivers/dma/dw/core.c | 51 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c index 11e269a31a0929..b341a6f1b04383 100644 --- a/drivers/dma/dw/core.c +++ b/drivers/dma/dw/core.c @@ -622,12 +622,10 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, struct dw_desc *prev; struct dw_desc *first; u32 ctllo, ctlhi; - u8 m_master = dwc->dws.m_master; - u8 lms = DWC_LLP_LMS(m_master); + u8 lms = DWC_LLP_LMS(dwc->dws.m_master); dma_addr_t reg; unsigned int reg_width; unsigned int mem_width; - unsigned int data_width = dw->pdata->data_width[m_master]; unsigned int i; struct scatterlist *sg; size_t total_len = 0; @@ -661,7 +659,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, mem = sg_dma_address(sg); len = sg_dma_len(sg); - mem_width = __ffs(data_width | mem | len); + mem_width = __ffs(sconfig->src_addr_width | mem | len); slave_sg_todev_fill_desc: desc = dwc_desc_get(dwc); @@ -721,7 +719,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, lli_write(desc, sar, reg); lli_write(desc, dar, mem); lli_write(desc, ctlhi, ctlhi); - mem_width = __ffs(data_width | mem); + mem_width = __ffs(sconfig->dst_addr_width | mem); lli_write(desc, ctllo, ctllo | DWC_CTLL_DST_WIDTH(mem_width)); desc->len = dlen; @@ -813,6 +811,41 @@ static int dwc_verify_p_buswidth(struct dma_chan *chan) return 0; } +static int dwc_verify_m_buswidth(struct dma_chan *chan) +{ + struct dw_dma_chan *dwc = to_dw_dma_chan(chan); + struct dw_dma *dw = to_dw_dma(chan->device); + u32 reg_width, reg_burst, mem_width; + + mem_width = dw->pdata->data_width[dwc->dws.m_master]; + + /* + * It's possible to have a data portion locked in the DMA FIFO in case + * of the channel suspension. Subsequent channel disabling will cause + * that data silent loss. In order to prevent that maintain the src and + * dst transfer widths coherency by means of the relation: + * (CTLx.SRC_TR_WIDTH * CTLx.SRC_MSIZE >= CTLx.DST_TR_WIDTH) + * Look for the details in the commit message that brings this change. + * + * Note the DMA configs utilized in the calculations below must have + * been verified to have correct values by this method call. + */ + if (dwc->dma_sconfig.direction == DMA_MEM_TO_DEV) { + reg_width = dwc->dma_sconfig.dst_addr_width; + if (mem_width < reg_width) + return -EINVAL; + + dwc->dma_sconfig.src_addr_width = mem_width; + } else if (dwc->dma_sconfig.direction == DMA_DEV_TO_MEM) { + reg_width = dwc->dma_sconfig.src_addr_width; + reg_burst = rounddown_pow_of_two(dwc->dma_sconfig.src_maxburst); + + dwc->dma_sconfig.dst_addr_width = min(mem_width, reg_width * reg_burst); + } + + return 0; +} + static int dwc_config(struct dma_chan *chan, struct dma_slave_config *sconfig) { struct dw_dma_chan *dwc = to_dw_dma_chan(chan); @@ -822,14 +855,18 @@ static int dwc_config(struct dma_chan *chan, struct dma_slave_config *sconfig) memcpy(&dwc->dma_sconfig, sconfig, sizeof(*sconfig)); dwc->dma_sconfig.src_maxburst = - clamp(dwc->dma_sconfig.src_maxburst, 0U, dwc->max_burst); + clamp(dwc->dma_sconfig.src_maxburst, 1U, dwc->max_burst); dwc->dma_sconfig.dst_maxburst = - clamp(dwc->dma_sconfig.dst_maxburst, 0U, dwc->max_burst); + clamp(dwc->dma_sconfig.dst_maxburst, 1U, dwc->max_burst); ret = dwc_verify_p_buswidth(chan); if (ret) return ret; + ret = dwc_verify_m_buswidth(chan); + if (ret) + return ret; + dw->encode_maxburst(dwc, &dwc->dma_sconfig.src_maxburst); dw->encode_maxburst(dwc, &dwc->dma_sconfig.dst_maxburst); From 1fd6fe89055e6dbb4be8f16b8dcab8602e3603d6 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 2 Aug 2024 10:50:48 +0300 Subject: [PATCH 065/991] dmaengine: dw: Simplify prepare CTL_LO methods Currently the CTL LO fields are calculated on the platform-specific basis. It's implemented by means of the prepare_ctllo() callbacks using the ternary operator within the local variables init block at the beginning of the block scope. The functions code currently is relatively hard to comprehend and isn't that optimal since implies four conditional statements executed and two additional local variables defined. Let's simplify the DW AHB DMA prepare_ctllo() method by unrolling the ternary operators into the normal if-else statement, dropping redundant master-interface ID variables and initializing the local variables based on the singly evaluated DMA-transfer direction check. Thus the method will look much more readable since now the fields content can be easily inferred right from the if-else branch. Provide the same update in the Intel DMA32 core driver for the sake of the driver code unification. Note besides of the effects described above this update is basically a preparation before dropping the max burst encoding callback. The dropping will require to call the burst fields calculation methods right in the prepare_ctllo() callbacks. It would have made the later functions code even more complex should they were left in the original state. Signed-off-by: Serge Semin Acked-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240802075100.6475-4-fancer.lancer@gmail.com Signed-off-by: Vinod Koul --- drivers/dma/dw/dw.c | 21 +++++++++++++++------ drivers/dma/dw/idma32.c | 8 ++++++-- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/drivers/dma/dw/dw.c b/drivers/dma/dw/dw.c index a4862263ff14df..e3d2cc3ea68c0a 100644 --- a/drivers/dma/dw/dw.c +++ b/drivers/dma/dw/dw.c @@ -67,12 +67,21 @@ static size_t dw_dma_block2bytes(struct dw_dma_chan *dwc, u32 block, u32 width) static u32 dw_dma_prepare_ctllo(struct dw_dma_chan *dwc) { struct dma_slave_config *sconfig = &dwc->dma_sconfig; - u8 smsize = (dwc->direction == DMA_DEV_TO_MEM) ? sconfig->src_maxburst : 0; - u8 dmsize = (dwc->direction == DMA_MEM_TO_DEV) ? sconfig->dst_maxburst : 0; - u8 p_master = dwc->dws.p_master; - u8 m_master = dwc->dws.m_master; - u8 dms = (dwc->direction == DMA_MEM_TO_DEV) ? p_master : m_master; - u8 sms = (dwc->direction == DMA_DEV_TO_MEM) ? p_master : m_master; + u8 smsize = 0, dmsize = 0; + u8 sms, dms; + + if (dwc->direction == DMA_MEM_TO_DEV) { + sms = dwc->dws.m_master; + dms = dwc->dws.p_master; + dmsize = sconfig->dst_maxburst; + } else if (dwc->direction == DMA_DEV_TO_MEM) { + sms = dwc->dws.p_master; + dms = dwc->dws.m_master; + smsize = sconfig->src_maxburst; + } else /* DMA_MEM_TO_MEM */ { + sms = dwc->dws.m_master; + dms = dwc->dws.m_master; + } return DWC_CTLL_LLP_D_EN | DWC_CTLL_LLP_S_EN | DWC_CTLL_DST_MSIZE(dmsize) | DWC_CTLL_SRC_MSIZE(smsize) | diff --git a/drivers/dma/dw/idma32.c b/drivers/dma/dw/idma32.c index 58f4078d83feca..e0c31f77cd0f27 100644 --- a/drivers/dma/dw/idma32.c +++ b/drivers/dma/dw/idma32.c @@ -202,8 +202,12 @@ static size_t idma32_block2bytes(struct dw_dma_chan *dwc, u32 block, u32 width) static u32 idma32_prepare_ctllo(struct dw_dma_chan *dwc) { struct dma_slave_config *sconfig = &dwc->dma_sconfig; - u8 smsize = (dwc->direction == DMA_DEV_TO_MEM) ? sconfig->src_maxburst : 0; - u8 dmsize = (dwc->direction == DMA_MEM_TO_DEV) ? sconfig->dst_maxburst : 0; + u8 smsize = 0, dmsize = 0; + + if (dwc->direction == DMA_MEM_TO_DEV) + dmsize = sconfig->dst_maxburst; + else if (dwc->direction == DMA_DEV_TO_MEM) + smsize = sconfig->src_maxburst; return DWC_CTLL_LLP_D_EN | DWC_CTLL_LLP_S_EN | DWC_CTLL_DST_MSIZE(dmsize) | DWC_CTLL_SRC_MSIZE(smsize); From 3acb301d33749a8974e61ecda16a5f5441fc9628 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 2 Aug 2024 10:50:49 +0300 Subject: [PATCH 066/991] dmaengine: dw: Define encode_maxburst() above prepare_ctllo() callbacks As a preparatory change before dropping the encode_maxburst() callbacks let's move dw_dma_encode_maxburst() and idma32_encode_maxburst() to being defined above the dw_dma_prepare_ctllo() and idma32_prepare_ctllo() methods respectively. That's required since the former methods will be called from the later ones directly. Signed-off-by: Serge Semin Acked-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240802075100.6475-5-fancer.lancer@gmail.com Signed-off-by: Vinod Koul --- drivers/dma/dw/dw.c | 18 +++++++++--------- drivers/dma/dw/idma32.c | 10 +++++----- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/dma/dw/dw.c b/drivers/dma/dw/dw.c index e3d2cc3ea68c0a..628ee1e77505d7 100644 --- a/drivers/dma/dw/dw.c +++ b/drivers/dma/dw/dw.c @@ -64,6 +64,15 @@ static size_t dw_dma_block2bytes(struct dw_dma_chan *dwc, u32 block, u32 width) return DWC_CTLH_BLOCK_TS(block) << width; } +static void dw_dma_encode_maxburst(struct dw_dma_chan *dwc, u32 *maxburst) +{ + /* + * Fix burst size according to dw_dmac. We need to convert them as: + * 1 -> 0, 4 -> 1, 8 -> 2, 16 -> 3. + */ + *maxburst = *maxburst > 1 ? fls(*maxburst) - 2 : 0; +} + static u32 dw_dma_prepare_ctllo(struct dw_dma_chan *dwc) { struct dma_slave_config *sconfig = &dwc->dma_sconfig; @@ -88,15 +97,6 @@ static u32 dw_dma_prepare_ctllo(struct dw_dma_chan *dwc) DWC_CTLL_DMS(dms) | DWC_CTLL_SMS(sms); } -static void dw_dma_encode_maxburst(struct dw_dma_chan *dwc, u32 *maxburst) -{ - /* - * Fix burst size according to dw_dmac. We need to convert them as: - * 1 -> 0, 4 -> 1, 8 -> 2, 16 -> 3. - */ - *maxburst = *maxburst > 1 ? fls(*maxburst) - 2 : 0; -} - static void dw_dma_set_device_name(struct dw_dma *dw, int id) { snprintf(dw->name, sizeof(dw->name), "dw:dmac%d", id); diff --git a/drivers/dma/dw/idma32.c b/drivers/dma/dw/idma32.c index e0c31f77cd0f27..493fcbafa2b8ed 100644 --- a/drivers/dma/dw/idma32.c +++ b/drivers/dma/dw/idma32.c @@ -199,6 +199,11 @@ static size_t idma32_block2bytes(struct dw_dma_chan *dwc, u32 block, u32 width) return IDMA32C_CTLH_BLOCK_TS(block); } +static void idma32_encode_maxburst(struct dw_dma_chan *dwc, u32 *maxburst) +{ + *maxburst = *maxburst > 1 ? fls(*maxburst) - 1 : 0; +} + static u32 idma32_prepare_ctllo(struct dw_dma_chan *dwc) { struct dma_slave_config *sconfig = &dwc->dma_sconfig; @@ -213,11 +218,6 @@ static u32 idma32_prepare_ctllo(struct dw_dma_chan *dwc) DWC_CTLL_DST_MSIZE(dmsize) | DWC_CTLL_SRC_MSIZE(smsize); } -static void idma32_encode_maxburst(struct dw_dma_chan *dwc, u32 *maxburst) -{ - *maxburst = *maxburst > 1 ? fls(*maxburst) - 1 : 0; -} - static void idma32_set_device_name(struct dw_dma *dw, int id) { snprintf(dw->name, sizeof(dw->name), "idma32:dmac%d", id); From d8fa0802f63502c0409d02c6b701d51841a6f1bd Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 2 Aug 2024 10:50:50 +0300 Subject: [PATCH 067/991] dmaengine: dw: Simplify max-burst calculation procedure In order to have a more coherent DW AHB DMA slave configuration method - dwc_config() - let's simplify the source and destination channel max-burst calculation procedure: 1. Create the max-burst verification method as it has been just done for the memory and peripheral address widths. Thus the dwc_config() method will turn to a set of the verification methods execution. 2. Since both the generic DW AHB DMA and Intel iDMA 32-bit engines support the power-of-2 bursts only, then the specified by the client driver max-burst values can be converted to being power-of-2 right in the max-burst verification method. 3. Since max-burst encoded value is required on the CTL_LO fields calculation stage, the encode_maxburst() callback can be easily dropped from the dw_dma structure meanwhile the encoding procedure will be executed right in the CTL_LO register value calculation. Thus the update will provide the next positive effects: the internal DMA-slave config structure will contain only the real DMA-transfer config values, which will be encoded to the DMA-controller register fields only when it's required on the buffer mapping; the redundant encode_maxburst() callback will be dropped simplifying the internal HW-abstraction API; dwc_config() will look more readable executing the verification functions one-by-one. Signed-off-by: Serge Semin Acked-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240802075100.6475-6-fancer.lancer@gmail.com Signed-off-by: Vinod Koul --- drivers/dma/dw/core.c | 30 +++++++++++++++++++++--------- drivers/dma/dw/dw.c | 9 ++++----- drivers/dma/dw/idma32.c | 9 ++++----- drivers/dma/dw/regs.h | 1 - 4 files changed, 29 insertions(+), 20 deletions(-) diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c index b341a6f1b04383..32a66f9effd9cc 100644 --- a/drivers/dma/dw/core.c +++ b/drivers/dma/dw/core.c @@ -779,6 +779,23 @@ bool dw_dma_filter(struct dma_chan *chan, void *param) } EXPORT_SYMBOL_GPL(dw_dma_filter); +static int dwc_verify_maxburst(struct dma_chan *chan) +{ + struct dw_dma_chan *dwc = to_dw_dma_chan(chan); + + dwc->dma_sconfig.src_maxburst = + clamp(dwc->dma_sconfig.src_maxburst, 1U, dwc->max_burst); + dwc->dma_sconfig.dst_maxburst = + clamp(dwc->dma_sconfig.dst_maxburst, 1U, dwc->max_burst); + + dwc->dma_sconfig.src_maxburst = + rounddown_pow_of_two(dwc->dma_sconfig.src_maxburst); + dwc->dma_sconfig.dst_maxburst = + rounddown_pow_of_two(dwc->dma_sconfig.dst_maxburst); + + return 0; +} + static int dwc_verify_p_buswidth(struct dma_chan *chan) { struct dw_dma_chan *dwc = to_dw_dma_chan(chan); @@ -838,7 +855,7 @@ static int dwc_verify_m_buswidth(struct dma_chan *chan) dwc->dma_sconfig.src_addr_width = mem_width; } else if (dwc->dma_sconfig.direction == DMA_DEV_TO_MEM) { reg_width = dwc->dma_sconfig.src_addr_width; - reg_burst = rounddown_pow_of_two(dwc->dma_sconfig.src_maxburst); + reg_burst = dwc->dma_sconfig.src_maxburst; dwc->dma_sconfig.dst_addr_width = min(mem_width, reg_width * reg_burst); } @@ -849,15 +866,13 @@ static int dwc_verify_m_buswidth(struct dma_chan *chan) static int dwc_config(struct dma_chan *chan, struct dma_slave_config *sconfig) { struct dw_dma_chan *dwc = to_dw_dma_chan(chan); - struct dw_dma *dw = to_dw_dma(chan->device); int ret; memcpy(&dwc->dma_sconfig, sconfig, sizeof(*sconfig)); - dwc->dma_sconfig.src_maxburst = - clamp(dwc->dma_sconfig.src_maxburst, 1U, dwc->max_burst); - dwc->dma_sconfig.dst_maxburst = - clamp(dwc->dma_sconfig.dst_maxburst, 1U, dwc->max_burst); + ret = dwc_verify_maxburst(chan); + if (ret) + return ret; ret = dwc_verify_p_buswidth(chan); if (ret) @@ -867,9 +882,6 @@ static int dwc_config(struct dma_chan *chan, struct dma_slave_config *sconfig) if (ret) return ret; - dw->encode_maxburst(dwc, &dwc->dma_sconfig.src_maxburst); - dw->encode_maxburst(dwc, &dwc->dma_sconfig.dst_maxburst); - return 0; } diff --git a/drivers/dma/dw/dw.c b/drivers/dma/dw/dw.c index 628ee1e77505d7..6766142884b66e 100644 --- a/drivers/dma/dw/dw.c +++ b/drivers/dma/dw/dw.c @@ -64,13 +64,13 @@ static size_t dw_dma_block2bytes(struct dw_dma_chan *dwc, u32 block, u32 width) return DWC_CTLH_BLOCK_TS(block) << width; } -static void dw_dma_encode_maxburst(struct dw_dma_chan *dwc, u32 *maxburst) +static inline u8 dw_dma_encode_maxburst(u32 maxburst) { /* * Fix burst size according to dw_dmac. We need to convert them as: * 1 -> 0, 4 -> 1, 8 -> 2, 16 -> 3. */ - *maxburst = *maxburst > 1 ? fls(*maxburst) - 2 : 0; + return maxburst > 1 ? fls(maxburst) - 2 : 0; } static u32 dw_dma_prepare_ctllo(struct dw_dma_chan *dwc) @@ -82,11 +82,11 @@ static u32 dw_dma_prepare_ctllo(struct dw_dma_chan *dwc) if (dwc->direction == DMA_MEM_TO_DEV) { sms = dwc->dws.m_master; dms = dwc->dws.p_master; - dmsize = sconfig->dst_maxburst; + dmsize = dw_dma_encode_maxburst(sconfig->dst_maxburst); } else if (dwc->direction == DMA_DEV_TO_MEM) { sms = dwc->dws.p_master; dms = dwc->dws.m_master; - smsize = sconfig->src_maxburst; + smsize = dw_dma_encode_maxburst(sconfig->src_maxburst); } else /* DMA_MEM_TO_MEM */ { sms = dwc->dws.m_master; dms = dwc->dws.m_master; @@ -125,7 +125,6 @@ int dw_dma_probe(struct dw_dma_chip *chip) dw->suspend_chan = dw_dma_suspend_chan; dw->resume_chan = dw_dma_resume_chan; dw->prepare_ctllo = dw_dma_prepare_ctllo; - dw->encode_maxburst = dw_dma_encode_maxburst; dw->bytes2block = dw_dma_bytes2block; dw->block2bytes = dw_dma_block2bytes; diff --git a/drivers/dma/dw/idma32.c b/drivers/dma/dw/idma32.c index 493fcbafa2b8ed..dac617c183e6a5 100644 --- a/drivers/dma/dw/idma32.c +++ b/drivers/dma/dw/idma32.c @@ -199,9 +199,9 @@ static size_t idma32_block2bytes(struct dw_dma_chan *dwc, u32 block, u32 width) return IDMA32C_CTLH_BLOCK_TS(block); } -static void idma32_encode_maxburst(struct dw_dma_chan *dwc, u32 *maxburst) +static inline u8 idma32_encode_maxburst(u32 maxburst) { - *maxburst = *maxburst > 1 ? fls(*maxburst) - 1 : 0; + return maxburst > 1 ? fls(maxburst) - 1 : 0; } static u32 idma32_prepare_ctllo(struct dw_dma_chan *dwc) @@ -210,9 +210,9 @@ static u32 idma32_prepare_ctllo(struct dw_dma_chan *dwc) u8 smsize = 0, dmsize = 0; if (dwc->direction == DMA_MEM_TO_DEV) - dmsize = sconfig->dst_maxburst; + dmsize = idma32_encode_maxburst(sconfig->dst_maxburst); else if (dwc->direction == DMA_DEV_TO_MEM) - smsize = sconfig->src_maxburst; + smsize = idma32_encode_maxburst(sconfig->src_maxburst); return DWC_CTLL_LLP_D_EN | DWC_CTLL_LLP_S_EN | DWC_CTLL_DST_MSIZE(dmsize) | DWC_CTLL_SRC_MSIZE(smsize); @@ -274,7 +274,6 @@ int idma32_dma_probe(struct dw_dma_chip *chip) dw->suspend_chan = idma32_suspend_chan; dw->resume_chan = idma32_resume_chan; dw->prepare_ctllo = idma32_prepare_ctllo; - dw->encode_maxburst = idma32_encode_maxburst; dw->bytes2block = idma32_bytes2block; dw->block2bytes = idma32_block2bytes; diff --git a/drivers/dma/dw/regs.h b/drivers/dma/dw/regs.h index 76654bd13c1abf..5969d9cc8d7ae2 100644 --- a/drivers/dma/dw/regs.h +++ b/drivers/dma/dw/regs.h @@ -327,7 +327,6 @@ struct dw_dma { void (*suspend_chan)(struct dw_dma_chan *dwc, bool drain); void (*resume_chan)(struct dw_dma_chan *dwc, bool drain); u32 (*prepare_ctllo)(struct dw_dma_chan *dwc); - void (*encode_maxburst)(struct dw_dma_chan *dwc, u32 *maxburst); u32 (*bytes2block)(struct dw_dma_chan *dwc, size_t bytes, unsigned int width, size_t *len); size_t (*block2bytes)(struct dw_dma_chan *dwc, u32 block, u32 width); From 2ebc36b9581df31eed271e5de61fc8a8b66dbc56 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 2 Aug 2024 10:50:51 +0300 Subject: [PATCH 068/991] dmaengine: dw: Unify ret-val local variables naming Currently there are two names utilized in the driver to keep the functions call status: ret and err. For the sake of unification convert to using the first version only. Signed-off-by: Serge Semin Acked-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240802075100.6475-7-fancer.lancer@gmail.com Signed-off-by: Vinod Koul --- drivers/dma/dw/core.c | 20 ++++++++++---------- drivers/dma/dw/platform.c | 20 ++++++++++---------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c index 32a66f9effd9cc..dd75f97a33b3d3 100644 --- a/drivers/dma/dw/core.c +++ b/drivers/dma/dw/core.c @@ -1155,7 +1155,7 @@ int do_dma_probe(struct dw_dma_chip *chip) bool autocfg = false; unsigned int dw_params; unsigned int i; - int err; + int ret; dw->pdata = devm_kzalloc(chip->dev, sizeof(*dw->pdata), GFP_KERNEL); if (!dw->pdata) @@ -1171,7 +1171,7 @@ int do_dma_probe(struct dw_dma_chip *chip) autocfg = dw_params >> DW_PARAMS_EN & 1; if (!autocfg) { - err = -EINVAL; + ret = -EINVAL; goto err_pdata; } @@ -1191,7 +1191,7 @@ int do_dma_probe(struct dw_dma_chip *chip) pdata->chan_allocation_order = CHAN_ALLOCATION_ASCENDING; pdata->chan_priority = CHAN_PRIORITY_ASCENDING; } else if (chip->pdata->nr_channels > DW_DMA_MAX_NR_CHANNELS) { - err = -EINVAL; + ret = -EINVAL; goto err_pdata; } else { memcpy(dw->pdata, chip->pdata, sizeof(*dw->pdata)); @@ -1203,7 +1203,7 @@ int do_dma_probe(struct dw_dma_chip *chip) dw->chan = devm_kcalloc(chip->dev, pdata->nr_channels, sizeof(*dw->chan), GFP_KERNEL); if (!dw->chan) { - err = -ENOMEM; + ret = -ENOMEM; goto err_pdata; } @@ -1221,15 +1221,15 @@ int do_dma_probe(struct dw_dma_chip *chip) sizeof(struct dw_desc), 4, 0); if (!dw->desc_pool) { dev_err(chip->dev, "No memory for descriptors dma pool\n"); - err = -ENOMEM; + ret = -ENOMEM; goto err_pdata; } tasklet_setup(&dw->tasklet, dw_dma_tasklet); - err = request_irq(chip->irq, dw_dma_interrupt, IRQF_SHARED, + ret = request_irq(chip->irq, dw_dma_interrupt, IRQF_SHARED, dw->name, dw); - if (err) + if (ret) goto err_pdata; INIT_LIST_HEAD(&dw->dma.channels); @@ -1341,8 +1341,8 @@ int do_dma_probe(struct dw_dma_chip *chip) */ dma_set_max_seg_size(dw->dma.dev, dw->chan[0].block_size); - err = dma_async_device_register(&dw->dma); - if (err) + ret = dma_async_device_register(&dw->dma); + if (ret) goto err_dma_register; dev_info(chip->dev, "DesignWare DMA Controller, %d channels\n", @@ -1356,7 +1356,7 @@ int do_dma_probe(struct dw_dma_chip *chip) free_irq(chip->irq, dw); err_pdata: pm_runtime_put_sync_suspend(chip->dev); - return err; + return ret; } int do_dma_remove(struct dw_dma_chip *chip) diff --git a/drivers/dma/dw/platform.c b/drivers/dma/dw/platform.c index 7d9d4c951724b1..47c58ad468cbca 100644 --- a/drivers/dma/dw/platform.c +++ b/drivers/dma/dw/platform.c @@ -29,7 +29,7 @@ static int dw_probe(struct platform_device *pdev) struct dw_dma_chip_pdata *data; struct dw_dma_chip *chip; struct device *dev = &pdev->dev; - int err; + int ret; match = device_get_match_data(dev); if (!match) @@ -51,9 +51,9 @@ static int dw_probe(struct platform_device *pdev) if (IS_ERR(chip->regs)) return PTR_ERR(chip->regs); - err = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); - if (err) - return err; + ret = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); + if (ret) + return ret; if (!data->pdata) data->pdata = dev_get_platdata(dev); @@ -69,14 +69,14 @@ static int dw_probe(struct platform_device *pdev) chip->clk = devm_clk_get_optional(chip->dev, "hclk"); if (IS_ERR(chip->clk)) return PTR_ERR(chip->clk); - err = clk_prepare_enable(chip->clk); - if (err) - return err; + ret = clk_prepare_enable(chip->clk); + if (ret) + return ret; pm_runtime_enable(&pdev->dev); - err = data->probe(chip); - if (err) + ret = data->probe(chip); + if (ret) goto err_dw_dma_probe; platform_set_drvdata(pdev, data); @@ -90,7 +90,7 @@ static int dw_probe(struct platform_device *pdev) err_dw_dma_probe: pm_runtime_disable(&pdev->dev); clk_disable_unprepare(chip->clk); - return err; + return ret; } static void dw_remove(struct platform_device *pdev) From 5e5c793c7fc47219998465361d94510fdf55d83f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 16 Jul 2024 14:57:06 -0700 Subject: [PATCH 069/991] dmaengine: ti: omap-dma: Initialize sglen after allocation With the new __counted_by annocation, the "sglen" struct member must be set before accessing the "sg" array. This initialization was done in other places where a new struct omap_desc is allocated, but these cases were missed. Set "sglen" after allocation. Fixes: b85178611c11 ("dmaengine: ti: omap-dma: Annotate struct omap_desc with __counted_by") Signed-off-by: Kees Cook Reviewed-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20240716215702.work.802-kees@kernel.org Signed-off-by: Vinod Koul --- drivers/dma/ti/omap-dma.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/dma/ti/omap-dma.c b/drivers/dma/ti/omap-dma.c index 7e6c04afbe892d..6ab9bfbdc4809e 100644 --- a/drivers/dma/ti/omap-dma.c +++ b/drivers/dma/ti/omap-dma.c @@ -1186,10 +1186,10 @@ static struct dma_async_tx_descriptor *omap_dma_prep_dma_cyclic( d->dev_addr = dev_addr; d->fi = burst; d->es = es; + d->sglen = 1; d->sg[0].addr = buf_addr; d->sg[0].en = period_len / es_bytes[es]; d->sg[0].fn = buf_len / period_len; - d->sglen = 1; d->ccr = c->ccr; if (dir == DMA_DEV_TO_MEM) @@ -1258,10 +1258,10 @@ static struct dma_async_tx_descriptor *omap_dma_prep_dma_memcpy( d->dev_addr = src; d->fi = 0; d->es = data_type; + d->sglen = 1; d->sg[0].en = len / BIT(data_type); d->sg[0].fn = 1; d->sg[0].addr = dest; - d->sglen = 1; d->ccr = c->ccr; d->ccr |= CCR_DST_AMODE_POSTINC | CCR_SRC_AMODE_POSTINC; @@ -1309,6 +1309,7 @@ static struct dma_async_tx_descriptor *omap_dma_prep_dma_interleaved( if (data_type > CSDP_DATA_TYPE_32) data_type = CSDP_DATA_TYPE_32; + d->sglen = 1; sg = &d->sg[0]; d->dir = DMA_MEM_TO_MEM; d->dev_addr = xt->src_start; @@ -1316,7 +1317,6 @@ static struct dma_async_tx_descriptor *omap_dma_prep_dma_interleaved( sg->en = xt->sgl[0].size / BIT(data_type); sg->fn = xt->numf; sg->addr = xt->dst_start; - d->sglen = 1; d->ccr = c->ccr; src_icg = dmaengine_get_src_icg(xt, &xt->sgl[0]); From b53b831919a0dc4e6631ebe0497ab2a4d8bef014 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 16 Jul 2024 14:38:33 -0700 Subject: [PATCH 070/991] dmaengine: stm32-dma3: Set lli_size after allocation With the new __counted_by annotation, the "lli_size" variable needs to valid for accesses to the "lli" array. This requirement is not met in stm32_dma3_chan_desc_alloc(), since "lli_size" starts at "0", so "lli" index "0" will not be considered valid during the initialization for loop. Fix this by setting lli_size immediately after allocation (similar to how this is handled in stm32_mdma_alloc_desc() for the node/count relationship). Fixes: f561ec8b2b33 ("dmaengine: Add STM32 DMA3 support") Signed-off-by: Kees Cook Reviewed-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20240716213830.work.951-kees@kernel.org Signed-off-by: Vinod Koul --- drivers/dma/stm32/stm32-dma3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/stm32/stm32-dma3.c b/drivers/dma/stm32/stm32-dma3.c index 4087e0263a485b..0be6e944df6fd5 100644 --- a/drivers/dma/stm32/stm32-dma3.c +++ b/drivers/dma/stm32/stm32-dma3.c @@ -403,6 +403,7 @@ static struct stm32_dma3_swdesc *stm32_dma3_chan_desc_alloc(struct stm32_dma3_ch swdesc = kzalloc(struct_size(swdesc, lli, count), GFP_NOWAIT); if (!swdesc) return NULL; + swdesc->lli_size = count; for (i = 0; i < count; i++) { swdesc->lli[i].hwdesc = dma_pool_zalloc(chan->lli_pool, GFP_NOWAIT, @@ -410,7 +411,6 @@ static struct stm32_dma3_swdesc *stm32_dma3_chan_desc_alloc(struct stm32_dma3_ch if (!swdesc->lli[i].hwdesc) goto err_pool_free; } - swdesc->lli_size = count; swdesc->ccr = 0; /* Set LL base address */ From 5062d9c0cbbc202e495e9b20f147f64ef5cc2897 Mon Sep 17 00:00:00 2001 From: "Sicelo A. Mhlongo" Date: Mon, 22 Jul 2024 13:31:11 +0200 Subject: [PATCH 071/991] ARM: dts: omap3-n900: correct the accelerometer orientation Negate the values reported for the accelerometer z-axis in order to match Documentation/devicetree/bindings/iio/mount-matrix.txt. Fixes: 14a213dcb004 ("ARM: dts: n900: use iio driver for accelerometer") Signed-off-by: Sicelo A. Mhlongo Reviewed-By: Andreas Kemnade Link: https://lore.kernel.org/r/20240722113137.3240847-1-absicsz@gmail.com Signed-off-by: Kevin Hilman --- arch/arm/boot/dts/ti/omap/omap3-n900.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/ti/omap/omap3-n900.dts b/arch/arm/boot/dts/ti/omap/omap3-n900.dts index 07c5b963af78ab..4bde3342bb9597 100644 --- a/arch/arm/boot/dts/ti/omap/omap3-n900.dts +++ b/arch/arm/boot/dts/ti/omap/omap3-n900.dts @@ -781,7 +781,7 @@ mount-matrix = "-1", "0", "0", "0", "1", "0", - "0", "0", "1"; + "0", "0", "-1"; }; cam1: camera@3e { From 7fef1eb0b013eaa42019a95a08f71368e5a22dba Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Tue, 6 Jun 2023 16:46:28 +0100 Subject: [PATCH 072/991] docs: KVM: Fix register ID of SPSR_FIQ Fixes the register ID of SPSR_FIQ. SPSR_FIQ is a 64-bit register and the 64-bit register size mask is 0x0030000000000000ULL. Fixes: fd3bc912d3d1 ("KVM: Documentation: Document arm64 core registers in detail") Signed-off-by: Takahiro Itazuri Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20230606154628.95498-1-itazur@amazon.com Signed-off-by: Oliver Upton --- Documentation/virt/kvm/api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index fe722c5dada9f8..87865c8897ca0e 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -2592,7 +2592,7 @@ Specifically: 0x6030 0000 0010 004a SPSR_ABT 64 spsr[KVM_SPSR_ABT] 0x6030 0000 0010 004c SPSR_UND 64 spsr[KVM_SPSR_UND] 0x6030 0000 0010 004e SPSR_IRQ 64 spsr[KVM_SPSR_IRQ] - 0x6060 0000 0010 0050 SPSR_FIQ 64 spsr[KVM_SPSR_FIQ] + 0x6030 0000 0010 0050 SPSR_FIQ 64 spsr[KVM_SPSR_FIQ] 0x6040 0000 0010 0054 V0 128 fp_regs.vregs[0] [1]_ 0x6040 0000 0010 0058 V1 128 fp_regs.vregs[1] [1]_ ... From df24373435f5899a2a98b7d377479c8d4376613b Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 2 Aug 2024 22:47:34 +0300 Subject: [PATCH 073/991] drm/msm/dpu: don't play tricks with debug macros DPU debugging macros need to be converted to a proper drm_debug_* macros, however this is a going an intrusive patch, not suitable for a fix. Wire DPU_DEBUG and DPU_DEBUG_DRIVER to always use DRM_DEBUG_DRIVER to make sure that DPU debugging messages always end up in the drm debug messages and are controlled via the usual drm.debug mask. I don't think that it is a good idea for a generic DPU_DEBUG macro to be tied to DRM_UT_KMS. It is used to report a debug message from driver, so by default it should go to the DRM_UT_DRIVER channel. While refactoring debug macros later on we might end up with particular messages going to ATOMIC or KMS, but DRIVER should be the default. Fixes: 25fdd5933e4c ("drm/msm: Add SDM845 DPU support") Signed-off-by: Dmitry Baryshkov Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/606932/ Link: https://lore.kernel.org/r/20240802-dpu-fix-wb-v2-2-7eac9eb8e895@linaro.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h b/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h index e2adc937ea63b2..935ff6fd172c41 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h @@ -31,24 +31,14 @@ * @fmt: Pointer to format string */ #define DPU_DEBUG(fmt, ...) \ - do { \ - if (drm_debug_enabled(DRM_UT_KMS)) \ - DRM_DEBUG(fmt, ##__VA_ARGS__); \ - else \ - pr_debug(fmt, ##__VA_ARGS__); \ - } while (0) + DRM_DEBUG_DRIVER(fmt, ##__VA_ARGS__) /** * DPU_DEBUG_DRIVER - macro for hardware driver logging * @fmt: Pointer to format string */ #define DPU_DEBUG_DRIVER(fmt, ...) \ - do { \ - if (drm_debug_enabled(DRM_UT_DRIVER)) \ - DRM_ERROR(fmt, ##__VA_ARGS__); \ - else \ - pr_debug(fmt, ##__VA_ARGS__); \ - } while (0) + DRM_DEBUG_DRIVER(fmt, ##__VA_ARGS__) #define DPU_ERROR(fmt, ...) pr_err("[dpu error]" fmt, ##__VA_ARGS__) #define DPU_ERROR_RATELIMITED(fmt, ...) pr_err_ratelimited("[dpu error]" fmt, ##__VA_ARGS__) From d19d5b8d8f6dab942ce5ddbcf34bf7275e778250 Mon Sep 17 00:00:00 2001 From: Abhinav Kumar Date: Mon, 5 Aug 2024 13:20:08 -0700 Subject: [PATCH 074/991] drm/msm/dp: fix the max supported bpp logic Fix the dp_panel_get_supported_bpp() API to return the minimum supported bpp correctly for relevant cases and use this API to correct the behavior of DP driver which hard-codes the max supported bpp to 30. This is incorrect because the number of lanes and max data rate supported by the lanes need to be taken into account. Replace the hardcoded limit with the appropriate math which accounts for the accurate number of lanes and max data rate. changes in v2: - Fix the dp_panel_get_supported_bpp() and use it - Drop the max_t usage as dp_panel_get_supported_bpp() already returns the min_bpp correctly now changes in v3: - replace min_t with just min as all params are u32 Fixes: c943b4948b58 ("drm/msm/dp: add displayPort driver support") Reported-by: Dmitry Baryshkov Closes: https://gitlab.freedesktop.org/drm/msm/-/issues/43 Tested-by: Dmitry Baryshkov # SM8350-HDK Reviewed-by: Stephen Boyd Patchwork: https://patchwork.freedesktop.org/patch/607073/ Link: https://lore.kernel.org/r/20240805202009.1120981-1-quic_abhinavk@quicinc.com Signed-off-by: Stephen Boyd Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/dp/dp_panel.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/msm/dp/dp_panel.c b/drivers/gpu/drm/msm/dp/dp_panel.c index a916b5f3b3170a..6ff6c9ef351ff2 100644 --- a/drivers/gpu/drm/msm/dp/dp_panel.c +++ b/drivers/gpu/drm/msm/dp/dp_panel.c @@ -90,22 +90,22 @@ static int dp_panel_read_dpcd(struct dp_panel *dp_panel) static u32 dp_panel_get_supported_bpp(struct dp_panel *dp_panel, u32 mode_edid_bpp, u32 mode_pclk_khz) { - struct dp_link_info *link_info; + const struct dp_link_info *link_info; const u32 max_supported_bpp = 30, min_supported_bpp = 18; - u32 bpp = 0, data_rate_khz = 0; + u32 bpp, data_rate_khz; - bpp = min_t(u32, mode_edid_bpp, max_supported_bpp); + bpp = min(mode_edid_bpp, max_supported_bpp); link_info = &dp_panel->link_info; data_rate_khz = link_info->num_lanes * link_info->rate * 8; - while (bpp > min_supported_bpp) { + do { if (mode_pclk_khz * bpp <= data_rate_khz) - break; + return bpp; bpp -= 6; - } + } while (bpp > min_supported_bpp); - return bpp; + return min_supported_bpp; } int dp_panel_read_sink_caps(struct dp_panel *dp_panel, @@ -423,8 +423,9 @@ int dp_panel_init_panel_info(struct dp_panel *dp_panel) drm_mode->clock); drm_dbg_dp(panel->drm_dev, "bpp = %d\n", dp_panel->dp_mode.bpp); - dp_panel->dp_mode.bpp = max_t(u32, 18, - min_t(u32, dp_panel->dp_mode.bpp, 30)); + dp_panel->dp_mode.bpp = dp_panel_get_mode_bpp(dp_panel, dp_panel->dp_mode.bpp, + dp_panel->dp_mode.drm_mode.clock); + drm_dbg_dp(panel->drm_dev, "updated bpp = %d\n", dp_panel->dp_mode.bpp); From 959ab6350add903e352890af53e86663739fcb9a Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Sun, 4 Aug 2024 21:30:15 -0400 Subject: [PATCH 075/991] cgroup/cpuset: fix panic caused by partcmd_update We find a bug as below: BUG: unable to handle page fault for address: 00000003 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 3 PID: 358 Comm: bash Tainted: G W I 6.6.0-10893-g60d6 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/4 RIP: 0010:partition_sched_domains_locked+0x483/0x600 Code: 01 48 85 d2 74 0d 48 83 05 29 3f f8 03 01 f3 48 0f bc c2 89 c0 48 9 RSP: 0018:ffffc90000fdbc58 EFLAGS: 00000202 RAX: 0000000100000003 RBX: ffff888100b3dfa0 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000000000002fe80 RBP: ffff888100b3dfb0 R08: 0000000000000001 R09: 0000000000000000 R10: ffffc90000fdbcb0 R11: 0000000000000004 R12: 0000000000000002 R13: ffff888100a92b48 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f44a5425740(0000) GS:ffff888237d80000(0000) knlGS:0000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000100030973 CR3: 000000010722c000 CR4: 00000000000006e0 Call Trace: ? show_regs+0x8c/0xa0 ? __die_body+0x23/0xa0 ? __die+0x3a/0x50 ? page_fault_oops+0x1d2/0x5c0 ? partition_sched_domains_locked+0x483/0x600 ? search_module_extables+0x2a/0xb0 ? search_exception_tables+0x67/0x90 ? kernelmode_fixup_or_oops+0x144/0x1b0 ? __bad_area_nosemaphore+0x211/0x360 ? up_read+0x3b/0x50 ? bad_area_nosemaphore+0x1a/0x30 ? exc_page_fault+0x890/0xd90 ? __lock_acquire.constprop.0+0x24f/0x8d0 ? __lock_acquire.constprop.0+0x24f/0x8d0 ? asm_exc_page_fault+0x26/0x30 ? partition_sched_domains_locked+0x483/0x600 ? partition_sched_domains_locked+0xf0/0x600 rebuild_sched_domains_locked+0x806/0xdc0 update_partition_sd_lb+0x118/0x130 cpuset_write_resmask+0xffc/0x1420 cgroup_file_write+0xb2/0x290 kernfs_fop_write_iter+0x194/0x290 new_sync_write+0xeb/0x160 vfs_write+0x16f/0x1d0 ksys_write+0x81/0x180 __x64_sys_write+0x21/0x30 x64_sys_call+0x2f25/0x4630 do_syscall_64+0x44/0xb0 entry_SYSCALL_64_after_hwframe+0x78/0xe2 RIP: 0033:0x7f44a553c887 It can be reproduced with cammands: cd /sys/fs/cgroup/ mkdir test cd test/ echo +cpuset > ../cgroup.subtree_control echo root > cpuset.cpus.partition cat /sys/fs/cgroup/cpuset.cpus.effective 0-3 echo 0-3 > cpuset.cpus // taking away all cpus from root This issue is caused by the incorrect rebuilding of scheduling domains. In this scenario, test/cpuset.cpus.partition should be an invalid root and should not trigger the rebuilding of scheduling domains. When calling update_parent_effective_cpumask with partcmd_update, if newmask is not null, it should recheck newmask whether there are cpus is available for parect/cs that has tasks. Fixes: 0c7f293efc87 ("cgroup/cpuset: Add cpuset.cpus.exclusive.effective for v2") Cc: stable@vger.kernel.org # v6.7+ Signed-off-by: Chen Ridong Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 40ec4abaf4408f..a9b6d56eeffabe 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1991,6 +1991,8 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, part_error = PERR_CPUSEMPTY; goto write_error; } + /* Check newmask again, whether cpus are available for parent/cs */ + nocpu |= tasks_nocpu_error(parent, cs, newmask); /* * partcmd_update with newmask: From 311a1bdc44a8e06024df4fd3392be0dfc8298655 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sun, 4 Aug 2024 21:30:16 -0400 Subject: [PATCH 076/991] cgroup/cpuset: Clear effective_xcpus on cpus_allowed clearing only if cpus.exclusive not set Commit e2ffe502ba45 ("cgroup/cpuset: Add cpuset.cpus.exclusive for v2") adds a user writable cpuset.cpus.exclusive file for setting exclusive CPUs to be used for the creation of partitions. Since then effective_xcpus depends on both the cpuset.cpus and cpuset.cpus.exclusive setting. If cpuset.cpus.exclusive is set, effective_xcpus will depend only on cpuset.cpus.exclusive. When it is not set, effective_xcpus will be set according to the cpuset.cpus value when the cpuset becomes a valid partition root. When cpuset.cpus is being cleared by the user, effective_xcpus should only be cleared when cpuset.cpus.exclusive is not set. However, that is not currently the case. # cd /sys/fs/cgroup/ # mkdir test # echo +cpuset > cgroup.subtree_control # cd test # echo 3 > cpuset.cpus.exclusive # cat cpuset.cpus.exclusive.effective 3 # echo > cpuset.cpus # cat cpuset.cpus.exclusive.effective // was cleared Fix it by clearing effective_xcpus only if cpuset.cpus.exclusive is not set. Fixes: e2ffe502ba45 ("cgroup/cpuset: Add cpuset.cpus.exclusive for v2") Cc: stable@vger.kernel.org # v6.7+ Reported-by: Chen Ridong Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index a9b6d56eeffabe..97d02612b3a66b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2525,7 +2525,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, */ if (!*buf) { cpumask_clear(trialcs->cpus_allowed); - cpumask_clear(trialcs->effective_xcpus); + if (cpumask_empty(trialcs->exclusive_cpus)) + cpumask_clear(trialcs->effective_xcpus); } else { retval = cpulist_parse(buf, trialcs->cpus_allowed); if (retval < 0) From ff0ce721ec213499ec5a532041fb3a1db2dc5ecb Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sun, 4 Aug 2024 21:30:17 -0400 Subject: [PATCH 077/991] cgroup/cpuset: Eliminate unncessary sched domains rebuilds in hotplug It was found that some hotplug operations may cause multiple rebuild_sched_domains_locked() calls. Some of those intermediate calls may use cpuset states not in the final correct form leading to incorrect sched domain setting. Fix this problem by using the existing force_rebuild flag to inhibit immediate rebuild_sched_domains_locked() calls if set and only doing one final call at the end. Also renaming the force_rebuild flag to force_sd_rebuild to make its meaning for clear. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 97d02612b3a66b..4bd9e50bcc8eef 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -232,6 +232,13 @@ static cpumask_var_t isolated_cpus; /* List of remote partition root children */ static struct list_head remote_children; +/* + * A flag to force sched domain rebuild at the end of an operation while + * inhibiting it in the intermediate stages when set. Currently it is only + * set in hotplug code. + */ +static bool force_sd_rebuild; + /* * Partition root states: * @@ -1475,7 +1482,7 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs) clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); } - if (rebuild_domains) + if (rebuild_domains && !force_sd_rebuild) rebuild_sched_domains_locked(); } @@ -1833,7 +1840,7 @@ static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask, remote_partition_disable(child, tmp); disable_cnt++; } - if (disable_cnt) + if (disable_cnt && !force_sd_rebuild) rebuild_sched_domains_locked(); } @@ -2442,7 +2449,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, } rcu_read_unlock(); - if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD)) + if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD) && + !force_sd_rebuild) rebuild_sched_domains_locked(); } @@ -3104,7 +3112,8 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, cs->flags = trialcs->flags; spin_unlock_irq(&callback_lock); - if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) + if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed && + !force_sd_rebuild) rebuild_sched_domains_locked(); if (spread_flag_changed) @@ -4501,11 +4510,9 @@ hotplug_update_tasks(struct cpuset *cs, update_tasks_nodemask(cs); } -static bool force_rebuild; - void cpuset_force_rebuild(void) { - force_rebuild = true; + force_sd_rebuild = true; } /** @@ -4653,15 +4660,9 @@ static void cpuset_handle_hotplug(void) !cpumask_empty(subpartitions_cpus); mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); - /* - * In the rare case that hotplug removes all the cpus in - * subpartitions_cpus, we assumed that cpus are updated. - */ - if (!cpus_updated && !cpumask_empty(subpartitions_cpus)) - cpus_updated = true; - /* For v1, synchronize cpus_allowed to cpu_active_mask */ if (cpus_updated) { + cpuset_force_rebuild(); spin_lock_irq(&callback_lock); if (!on_dfl) cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); @@ -4717,8 +4718,8 @@ static void cpuset_handle_hotplug(void) } /* rebuild sched domains if cpus_allowed has changed */ - if (cpus_updated || force_rebuild) { - force_rebuild = false; + if (force_sd_rebuild) { + force_sd_rebuild = false; rebuild_sched_domains_cpuslocked(); } From aedf02e46eb549dac8db4821a6b9f0c6bf6e3990 Mon Sep 17 00:00:00 2001 From: Abhinav Kumar Date: Wed, 31 Jul 2024 12:17:22 -0700 Subject: [PATCH 078/991] drm/msm/dpu: move dpu_encoder's connector assignment to atomic_enable() For cases where the crtc's connectors_changed was set without enable/active getting toggled , there is an atomic_enable() call followed by an atomic_disable() but without an atomic_mode_set(). This results in a NULL ptr access for the dpu_encoder_get_drm_fmt() call in the atomic_enable() as the dpu_encoder's connector was cleared in the atomic_disable() but not re-assigned as there was no atomic_mode_set() call. Fix the NULL ptr access by moving the assignment for atomic_enable() and also use drm_atomic_get_new_connector_for_encoder() to get the connector from the atomic_state. Fixes: 25fdd5933e4c ("drm/msm: Add SDM845 DPU support") Reported-by: Dmitry Baryshkov Closes: https://gitlab.freedesktop.org/drm/msm/-/issues/59 Suggested-by: Dmitry Baryshkov Reviewed-by: Dmitry Baryshkov Tested-by: Dmitry Baryshkov # SM8350-HDK Patchwork: https://patchwork.freedesktop.org/patch/606729/ Link: https://lore.kernel.org/r/20240731191723.3050932-1-quic_abhinavk@quicinc.com Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c index 34c56e855af778..3b171bf227d16f 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c @@ -1171,8 +1171,6 @@ static void dpu_encoder_virt_atomic_mode_set(struct drm_encoder *drm_enc, cstate->num_mixers = num_lm; - dpu_enc->connector = conn_state->connector; - for (i = 0; i < dpu_enc->num_phys_encs; i++) { struct dpu_encoder_phys *phys = dpu_enc->phys_encs[i]; @@ -1270,6 +1268,8 @@ static void dpu_encoder_virt_atomic_enable(struct drm_encoder *drm_enc, dpu_enc->commit_done_timedout = false; + dpu_enc->connector = drm_atomic_get_new_connector_for_encoder(state, drm_enc); + cur_mode = &dpu_enc->base.crtc->state->adjusted_mode; dpu_enc->wide_bus_en = dpu_encoder_is_widebus_enabled(drm_enc); From 319aca883bfa1b85ee08411541b51b9a934ac858 Mon Sep 17 00:00:00 2001 From: Abhinav Kumar Date: Thu, 25 Jul 2024 15:04:50 -0700 Subject: [PATCH 079/991] drm/msm/dp: reset the link phy params before link training Before re-starting link training reset the link phy params namely the pre-emphasis and voltage swing levels otherwise the next link training begins at the previously cached levels which can result in link training failures. Fixes: 8ede2ecc3e5e ("drm/msm/dp: Add DP compliance tests on Snapdragon Chipsets") Reviewed-by: Dmitry Baryshkov Tested-by: Dmitry Baryshkov # SM8350-HDK Reviewed-by: Stephen Boyd Patchwork: https://patchwork.freedesktop.org/patch/605946/ Link: https://lore.kernel.org/r/20240725220450.131245-1-quic_abhinavk@quicinc.com Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/dp/dp_ctrl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/msm/dp/dp_ctrl.c b/drivers/gpu/drm/msm/dp/dp_ctrl.c index 7bc8a9f0657a93..f342fc5ae41ecc 100644 --- a/drivers/gpu/drm/msm/dp/dp_ctrl.c +++ b/drivers/gpu/drm/msm/dp/dp_ctrl.c @@ -1286,6 +1286,8 @@ static int dp_ctrl_link_train(struct dp_ctrl_private *ctrl, link_info.rate = ctrl->link->link_params.rate; link_info.capabilities = DP_LINK_CAP_ENHANCED_FRAMING; + dp_link_reset_phy_params_vx_px(ctrl->link); + dp_aux_link_configure(ctrl->aux, &link_info); if (drm_dp_max_downspread(dpcd)) From bfa1a6283be390947d3649c482e5167186a37016 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Tue, 25 Jun 2024 00:13:41 +0300 Subject: [PATCH 080/991] drm/msm/dpu: cleanup FB if dpu_format_populate_layout fails If the dpu_format_populate_layout() fails, then FB is prepared, but not cleaned up. This ends up leaking the pin_count on the GEM object and causes a splat during DRM file closure: msm_obj->pin_count WARNING: CPU: 2 PID: 569 at drivers/gpu/drm/msm/msm_gem.c:121 update_lru_locked+0xc4/0xcc [...] Call trace: update_lru_locked+0xc4/0xcc put_pages+0xac/0x100 msm_gem_free_object+0x138/0x180 drm_gem_object_free+0x1c/0x30 drm_gem_object_handle_put_unlocked+0x108/0x10c drm_gem_object_release_handle+0x58/0x70 idr_for_each+0x68/0xec drm_gem_release+0x28/0x40 drm_file_free+0x174/0x234 drm_release+0xb0/0x160 __fput+0xc0/0x2c8 __fput_sync+0x50/0x5c __arm64_sys_close+0x38/0x7c invoke_syscall+0x48/0x118 el0_svc_common.constprop.0+0x40/0xe0 do_el0_svc+0x1c/0x28 el0_svc+0x4c/0x120 el0t_64_sync_handler+0x100/0x12c el0t_64_sync+0x190/0x194 irq event stamp: 129818 hardirqs last enabled at (129817): [] console_unlock+0x118/0x124 hardirqs last disabled at (129818): [] el1_dbg+0x24/0x8c softirqs last enabled at (129808): [] handle_softirqs+0x4c8/0x4e8 softirqs last disabled at (129785): [] __do_softirq+0x14/0x20 Fixes: 25fdd5933e4c ("drm/msm: Add SDM845 DPU support") Signed-off-by: Dmitry Baryshkov Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/600714/ Link: https://lore.kernel.org/r/20240625-dpu-mode-config-width-v5-1-501d984d634f@linaro.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c index 40c4dd2c3139fe..a62ac0c0c06a42 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c @@ -681,6 +681,9 @@ static int dpu_plane_prepare_fb(struct drm_plane *plane, new_state->fb, &layout); if (ret) { DPU_ERROR_PLANE(pdpu, "failed to get format layout, %d\n", ret); + if (pstate->aspace) + msm_framebuffer_cleanup(new_state->fb, pstate->aspace, + pstate->needs_dirtyfb); return ret; } From 2db13c4a631505029ada9404e09a2b06a268c1c4 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Thu, 27 Jun 2024 00:45:55 +0300 Subject: [PATCH 081/991] drm/msm/dpu: limit QCM2290 to RGB formats only The QCM2290 doesn't have CSC blocks, so it can not support YUV formats even on ViG blocks. Fix the formats declared by _VIG_SBLK_NOSCALE(). Fixes: 5334087ee743 ("drm/msm: add support for QCM2290 MDSS") Signed-off-by: Dmitry Baryshkov Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/601048/ Link: https://lore.kernel.org/r/20240627-dpu-virtual-wide-v5-1-5efb90cbb8be@linaro.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c index fc178ec73907c6..648c8d0a4c362d 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c @@ -308,8 +308,8 @@ static const u32 wb2_formats_rgb_yuv[] = { { \ .maxdwnscale = SSPP_UNITY_SCALE, \ .maxupscale = SSPP_UNITY_SCALE, \ - .format_list = plane_formats_yuv, \ - .num_formats = ARRAY_SIZE(plane_formats_yuv), \ + .format_list = plane_formats, \ + .num_formats = ARRAY_SIZE(plane_formats), \ .virt_format_list = plane_formats, \ .virt_num_formats = ARRAY_SIZE(plane_formats), \ } From cb18195914e353ece0e789e365a5a16872169805 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Thu, 27 Jun 2024 00:45:56 +0300 Subject: [PATCH 082/991] drm/msm/dpu: relax YUV requirements YUV formats require only CSC to be enabled. Even decimated formats should not require scaler. Relax the requirement and don't check for the scaler block while checking if YUV format can be enabled. Fixes: 25fdd5933e4c ("drm/msm: Add SDM845 DPU support") Signed-off-by: Dmitry Baryshkov Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/601049/ Link: https://lore.kernel.org/r/20240627-dpu-virtual-wide-v5-2-5efb90cbb8be@linaro.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c index a62ac0c0c06a42..dc1fd95e767bb5 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c @@ -747,10 +747,9 @@ static int dpu_plane_atomic_check_pipe(struct dpu_plane *pdpu, min_src_size = MSM_FORMAT_IS_YUV(fmt) ? 2 : 1; if (MSM_FORMAT_IS_YUV(fmt) && - (!pipe->sspp->cap->sblk->scaler_blk.len || - !pipe->sspp->cap->sblk->csc_blk.len)) { + !pipe->sspp->cap->sblk->csc_blk.len) { DPU_DEBUG_PLANE(pdpu, - "plane doesn't have scaler/csc for yuv\n"); + "plane doesn't have csc for yuv\n"); return -EINVAL; } From d3a785e4f983f523380e023d8a05fb6d04402957 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Thu, 27 Jun 2024 00:45:57 +0300 Subject: [PATCH 083/991] drm/msm/dpu: take plane rotation into account for wide planes Take into account the plane rotation and flipping when calculating src positions for the wide plane parts. This is not an issue yet, because rotation is only supported for the UBWC planes and wide UBWC planes are rejected anyway because in parallel multirect case only the half of the usual width is supported for tiled formats. However it's better to fix this now rather than stumbling upon it later. Fixes: 80e8ae3b38ab ("drm/msm/dpu: add support for wide planes") Reviewed-by: Abhinav Kumar Signed-off-by: Dmitry Baryshkov Patchwork: https://patchwork.freedesktop.org/patch/601059/ Link: https://lore.kernel.org/r/20240627-dpu-virtual-wide-v5-3-5efb90cbb8be@linaro.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c index dc1fd95e767bb5..29298e06616359 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c @@ -866,6 +866,10 @@ static int dpu_plane_atomic_check(struct drm_plane *plane, max_linewidth = pdpu->catalog->caps->max_linewidth; + drm_rect_rotate(&pipe_cfg->src_rect, + new_plane_state->fb->width, new_plane_state->fb->height, + new_plane_state->rotation); + if ((drm_rect_width(&pipe_cfg->src_rect) > max_linewidth) || _dpu_plane_calc_clk(&crtc_state->adjusted_mode, pipe_cfg) > max_mdp_clk_rate) { /* @@ -915,6 +919,14 @@ static int dpu_plane_atomic_check(struct drm_plane *plane, r_pipe_cfg->dst_rect.x1 = pipe_cfg->dst_rect.x2; } + drm_rect_rotate_inv(&pipe_cfg->src_rect, + new_plane_state->fb->width, new_plane_state->fb->height, + new_plane_state->rotation); + if (r_pipe->sspp) + drm_rect_rotate_inv(&r_pipe_cfg->src_rect, + new_plane_state->fb->width, new_plane_state->fb->height, + new_plane_state->rotation); + ret = dpu_plane_atomic_check_pipe(pdpu, pipe, pipe_cfg, fmt, &crtc_state->adjusted_mode); if (ret) return ret; From f91f7ac900e7342e0fd66093dfbf7cb8cb585a99 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Wed, 17 Jul 2024 15:00:23 +0200 Subject: [PATCH 084/991] refcount: Report UAF for refcount_sub_and_test(0) when counter==0 When a reference counter is at zero and refcount_sub_and_test() is invoked to subtract zero, the function accepts this request without any warning and returns true. This behavior does not seem ideal because the counter being already at zero indicates a use-after-free. Furthermore, returning true by refcount_sub_and_test() in this case potentially results in a double-free done by its caller. Modify the underlying function __refcount_sub_and_test() to warn about this case as a use-after-free and have it return false to avoid the potential double-free. Signed-off-by: Petr Pavlu Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240717130023.5675-1-petr.pavlu@suse.com Signed-off-by: Kees Cook --- drivers/misc/lkdtm/refcount.c | 16 ++++++++++++++++ include/linux/refcount.h | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/misc/lkdtm/refcount.c b/drivers/misc/lkdtm/refcount.c index 5cd488f54cfa54..8f744bee6fbde8 100644 --- a/drivers/misc/lkdtm/refcount.c +++ b/drivers/misc/lkdtm/refcount.c @@ -182,6 +182,21 @@ static void lkdtm_REFCOUNT_SUB_AND_TEST_NEGATIVE(void) check_negative(&neg, 3); } +/* + * A refcount_sub_and_test() by zero when the counter is at zero should act like + * refcount_sub_and_test() above when going negative. + */ +static void lkdtm_REFCOUNT_SUB_AND_TEST_ZERO(void) +{ + refcount_t neg = REFCOUNT_INIT(0); + + pr_info("attempting bad refcount_sub_and_test() at zero\n"); + if (refcount_sub_and_test(0, &neg)) + pr_warn("Weird: refcount_sub_and_test() reported zero\n"); + + check_negative(&neg, 0); +} + static void check_from_zero(refcount_t *ref) { switch (refcount_read(ref)) { @@ -400,6 +415,7 @@ static struct crashtype crashtypes[] = { CRASHTYPE(REFCOUNT_DEC_NEGATIVE), CRASHTYPE(REFCOUNT_DEC_AND_TEST_NEGATIVE), CRASHTYPE(REFCOUNT_SUB_AND_TEST_NEGATIVE), + CRASHTYPE(REFCOUNT_SUB_AND_TEST_ZERO), CRASHTYPE(REFCOUNT_INC_ZERO), CRASHTYPE(REFCOUNT_ADD_ZERO), CRASHTYPE(REFCOUNT_INC_SATURATED), diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 59b3b752394d3c..35f039ecb27256 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -266,12 +266,12 @@ bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp) if (oldp) *oldp = old; - if (old == i) { + if (old > 0 && old == i) { smp_acquire__after_ctrl_dep(); return true; } - if (unlikely(old < 0 || old - i < 0)) + if (unlikely(old <= 0 || old - i < 0)) refcount_warn_saturate(r, REFCOUNT_SUB_UAF); return false; From f32e90c0688a3d1f8079ac18ed39b752d22e92bd Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 23 Jul 2024 18:53:31 +0200 Subject: [PATCH 085/991] gcc-plugins: randstruct: Remove GCC 4.7 or newer requirement Since the kernel currently requires GCC 5.1 as a minimum, remove the unnecessary GCC version >= 4.7 check. Signed-off-by: Thorsten Blum Link: https://lore.kernel.org/r/20240723165332.1947-1-thorsten.blum@toblux.com Signed-off-by: Kees Cook --- scripts/gcc-plugins/randomize_layout_plugin.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/scripts/gcc-plugins/randomize_layout_plugin.c b/scripts/gcc-plugins/randomize_layout_plugin.c index 746ff2d272f256..5694df3da2e95b 100644 --- a/scripts/gcc-plugins/randomize_layout_plugin.c +++ b/scripts/gcc-plugins/randomize_layout_plugin.c @@ -19,10 +19,6 @@ #include "gcc-common.h" #include "randomize_layout_seed.h" -#if BUILDING_GCC_MAJOR < 4 || (BUILDING_GCC_MAJOR == 4 && BUILDING_GCC_MINOR < 7) -#error "The RANDSTRUCT plugin requires GCC 4.7 or newer." -#endif - #define ORIG_TYPE_NAME(node) \ (TYPE_NAME(TYPE_MAIN_VARIANT(node)) != NULL_TREE ? ((const unsigned char *)IDENTIFIER_POINTER(TYPE_NAME(TYPE_MAIN_VARIANT(node)))) : (const unsigned char *)"anonymous") From 9a2fa1472083580b6c66bdaf291f591e1170123a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 3 Aug 2024 18:02:00 -0400 Subject: [PATCH 086/991] fix bitmap corruption on close_range() with CLOSE_RANGE_UNSHARE copy_fd_bitmaps(new, old, count) is expected to copy the first count/BITS_PER_LONG bits from old->full_fds_bits[] and fill the rest with zeroes. What it does is copying enough words (BITS_TO_LONGS(count/BITS_PER_LONG)), then memsets the rest. That works fine, *if* all bits past the cutoff point are clear. Otherwise we are risking garbage from the last word we'd copied. For most of the callers that is true - expand_fdtable() has count equal to old->max_fds, so there's no open descriptors past count, let alone fully occupied words in ->open_fds[], which is what bits in ->full_fds_bits[] correspond to. The other caller (dup_fd()) passes sane_fdtable_size(old_fdt, max_fds), which is the smallest multiple of BITS_PER_LONG that covers all opened descriptors below max_fds. In the common case (copying on fork()) max_fds is ~0U, so all opened descriptors will be below it and we are fine, by the same reasons why the call in expand_fdtable() is safe. Unfortunately, there is a case where max_fds is less than that and where we might, indeed, end up with junk in ->full_fds_bits[] - close_range(from, to, CLOSE_RANGE_UNSHARE) with * descriptor table being currently shared * 'to' being above the current capacity of descriptor table * 'from' being just under some chunk of opened descriptors. In that case we end up with observably wrong behaviour - e.g. spawn a child with CLONE_FILES, get all descriptors in range 0..127 open, then close_range(64, ~0U, CLOSE_RANGE_UNSHARE) and watch dup(0) ending up with descriptor #128, despite #64 being observably not open. The minimally invasive fix would be to deal with that in dup_fd(). If this proves to add measurable overhead, we can go that way, but let's try to fix copy_fd_bitmaps() first. * new helper: bitmap_copy_and_expand(to, from, bits_to_copy, size). * make copy_fd_bitmaps() take the bitmap size in words, rather than bits; it's 'count' argument is always a multiple of BITS_PER_LONG, so we are not losing any information, and that way we can use the same helper for all three bitmaps - compiler will see that count is a multiple of BITS_PER_LONG for the large ones, so it'll generate plain memcpy()+memset(). Reproducer added to tools/testing/selftests/core/close_range_test.c Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/file.c | 30 +++++++--------- include/linux/bitmap.h | 12 +++++++ .../testing/selftests/core/close_range_test.c | 35 +++++++++++++++++++ 3 files changed, 60 insertions(+), 17 deletions(-) diff --git a/fs/file.c b/fs/file.c index a11e59b5d6026a..655338effe9c72 100644 --- a/fs/file.c +++ b/fs/file.c @@ -46,27 +46,23 @@ static void free_fdtable_rcu(struct rcu_head *rcu) #define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr)) #define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long)) +#define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds /* * Copy 'count' fd bits from the old table to the new table and clear the extra * space if any. This does not copy the file pointers. Called with the files * spinlock held for write. */ -static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, - unsigned int count) +static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, + unsigned int copy_words) { - unsigned int cpy, set; - - cpy = count / BITS_PER_BYTE; - set = (nfdt->max_fds - count) / BITS_PER_BYTE; - memcpy(nfdt->open_fds, ofdt->open_fds, cpy); - memset((char *)nfdt->open_fds + cpy, 0, set); - memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy); - memset((char *)nfdt->close_on_exec + cpy, 0, set); - - cpy = BITBIT_SIZE(count); - set = BITBIT_SIZE(nfdt->max_fds) - cpy; - memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy); - memset((char *)nfdt->full_fds_bits + cpy, 0, set); + unsigned int nwords = fdt_words(nfdt); + + bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds, + copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); + bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec, + copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); + bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits, + copy_words, nwords); } /* @@ -84,7 +80,7 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) memcpy(nfdt->fd, ofdt->fd, cpy); memset((char *)nfdt->fd + cpy, 0, set); - copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds); + copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt)); } /* @@ -379,7 +375,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int open_files = sane_fdtable_size(old_fdt, max_fds); } - copy_fd_bitmaps(new_fdt, old_fdt, open_files); + copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG); old_fds = old_fdt->fd; new_fds = new_fdt->fd; diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 8c4768c44a01b3..d3b66d77df7a3c 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -270,6 +270,18 @@ static inline void bitmap_copy_clear_tail(unsigned long *dst, dst[nbits / BITS_PER_LONG] &= BITMAP_LAST_WORD_MASK(nbits); } +static inline void bitmap_copy_and_extend(unsigned long *to, + const unsigned long *from, + unsigned int count, unsigned int size) +{ + unsigned int copy = BITS_TO_LONGS(count); + + memcpy(to, from, copy * sizeof(long)); + if (count % BITS_PER_LONG) + to[copy - 1] &= BITMAP_LAST_WORD_MASK(count); + memset(to + copy, 0, bitmap_size(size) - copy * sizeof(long)); +} + /* * On 32-bit systems bitmaps are represented as u32 arrays internally. On LE64 * machines the order of hi and lo parts of numbers match the bitmap structure. diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c index 991c473e385938..12b4eb9d04347f 100644 --- a/tools/testing/selftests/core/close_range_test.c +++ b/tools/testing/selftests/core/close_range_test.c @@ -589,4 +589,39 @@ TEST(close_range_cloexec_unshare_syzbot) EXPECT_EQ(close(fd3), 0); } +TEST(close_range_bitmap_corruption) +{ + pid_t pid; + int status; + struct __clone_args args = { + .flags = CLONE_FILES, + .exit_signal = SIGCHLD, + }; + + /* get the first 128 descriptors open */ + for (int i = 2; i < 128; i++) + EXPECT_GE(dup2(0, i), 0); + + /* get descriptor table shared */ + pid = sys_clone3(&args, sizeof(args)); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* unshare and truncate descriptor table down to 64 */ + if (sys_close_range(64, ~0U, CLOSE_RANGE_UNSHARE)) + exit(EXIT_FAILURE); + + ASSERT_EQ(fcntl(64, F_GETFD), -1); + /* ... and verify that the range 64..127 is not + stuck "fully used" according to secondary bitmap */ + EXPECT_EQ(dup(0), 64) + exit(EXIT_FAILURE); + exit(EXIT_SUCCESS); + } + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); +} + TEST_HARNESS_MAIN From 44732f1dad20457d64c525549cd63dcef2563c23 Mon Sep 17 00:00:00 2001 From: Nikita Shubin Date: Fri, 19 Jul 2024 17:30:16 +0300 Subject: [PATCH 087/991] workqueue: doc: Fix function name, remove markers - s/alloc_ordered_queue()/alloc_ordered_workqueue()/ - remove markers to convert it into a link. Signed-off-by: Nikita Shubin Signed-off-by: Tejun Heo --- Documentation/core-api/workqueue.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/core-api/workqueue.rst b/Documentation/core-api/workqueue.rst index bcc370c876be95..16f861c9791e4a 100644 --- a/Documentation/core-api/workqueue.rst +++ b/Documentation/core-api/workqueue.rst @@ -260,7 +260,7 @@ Some users depend on strict execution ordering where only one work item is in flight at any given time and the work items are processed in queueing order. While the combination of ``@max_active`` of 1 and ``WQ_UNBOUND`` used to achieve this behavior, this is no longer the -case. Use ``alloc_ordered_queue()`` instead. +case. Use alloc_ordered_workqueue() instead. Example Execution Scenarios From 38f7e14519d39cf524ddc02d4caee9b337dad703 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 30 Jul 2024 12:44:31 +0100 Subject: [PATCH 088/991] workqueue: Fix UBSAN 'subtraction overflow' error in shift_and_mask() UBSAN reports the following 'subtraction overflow' error when booting in a virtual machine on Android: | Internal error: UBSAN: integer subtraction overflow: 00000000f2005515 [#1] PREEMPT SMP | Modules linked in: | CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.10.0-00006-g3cbe9e5abd46-dirty #4 | Hardware name: linux,dummy-virt (DT) | pstate: 600000c5 (nZCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--) | pc : cancel_delayed_work+0x34/0x44 | lr : cancel_delayed_work+0x2c/0x44 | sp : ffff80008002ba60 | x29: ffff80008002ba60 x28: 0000000000000000 x27: 0000000000000000 | x26: 0000000000000000 x25: 0000000000000000 x24: 0000000000000000 | x23: 0000000000000000 x22: 0000000000000000 x21: ffff1f65014cd3c0 | x20: ffffc0e84c9d0da0 x19: ffffc0e84cab3558 x18: ffff800080009058 | x17: 00000000247ee1f8 x16: 00000000247ee1f8 x15: 00000000bdcb279d | x14: 0000000000000001 x13: 0000000000000075 x12: 00000a0000000000 | x11: ffff1f6501499018 x10: 00984901651fffff x9 : ffff5e7cc35af000 | x8 : 0000000000000001 x7 : 3d4d455453595342 x6 : 000000004e514553 | x5 : ffff1f6501499265 x4 : ffff1f650ff60b10 x3 : 0000000000000620 | x2 : ffff80008002ba78 x1 : 0000000000000000 x0 : 0000000000000000 | Call trace: | cancel_delayed_work+0x34/0x44 | deferred_probe_extend_timeout+0x20/0x70 | driver_register+0xa8/0x110 | __platform_driver_register+0x28/0x3c | syscon_init+0x24/0x38 | do_one_initcall+0xe4/0x338 | do_initcall_level+0xac/0x178 | do_initcalls+0x5c/0xa0 | do_basic_setup+0x20/0x30 | kernel_init_freeable+0x8c/0xf8 | kernel_init+0x28/0x1b4 | ret_from_fork+0x10/0x20 | Code: f9000fbf 97fffa2f 39400268 37100048 (d42aa2a0) | ---[ end trace 0000000000000000 ]--- | Kernel panic - not syncing: UBSAN: integer subtraction overflow: Fatal exception This is due to shift_and_mask() using a signed immediate to construct the mask and being called with a shift of 31 (WORK_OFFQ_POOL_SHIFT) so that it ends up decrementing from INT_MIN. Use an unsigned constant '1U' to generate the mask in shift_and_mask(). Cc: Tejun Heo Cc: Lai Jiangshan Fixes: 1211f3b21c2a ("workqueue: Preserve OFFQ bits in cancel[_sync] paths") Signed-off-by: Will Deacon Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1745ca788ede3e..b35f8ce80bc7c9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -897,7 +897,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work) static unsigned long shift_and_mask(unsigned long v, u32 shift, u32 bits) { - return (v >> shift) & ((1 << bits) - 1); + return (v >> shift) & ((1U << bits) - 1); } static void work_offqd_unpack(struct work_offq_data *offqd, unsigned long data) From 98cc1730c89467fc26e2dc2ceb2a014f332daa97 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 25 Jul 2024 09:04:37 +0800 Subject: [PATCH 089/991] workqueue: Remove incorrect "WARN_ON_ONCE(!list_empty(&worker->entry));" from dying worker The commit 68f83057b913 ("workqueue: Reap workers via kthread_stop() and remove detach_completion") changes the procedure of destroying workers; the dying workers are kept in the cull_list in wake_dying_workers() with the pool lock held and removed from the cull_list by the newly added reap_dying_workers() without the pool lock. This can cause a warning if the dying worker is wokenup earlier than reaped as reported by Marc: 2024/07/23 18:01:21 [M83LP63]: [ 157.267727] ------------[ cut here ]------------ 2024/07/23 18:01:21 [M83LP63]: [ 157.267735] WARNING: CPU: 21 PID: 725 at kernel/workqueue.c:3340 worker_thread+0x54e/0x558 2024/07/23 18:01:21 [M83LP63]: [ 157.267746] Modules linked in: binfmt_misc nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables sunrpc dm_service_time s390_trng vfio_ccw mdev vfio_iommu_type1 vfio sch_fq_codel 2024/07/23 18:01:21 [M83LP63]: loop dm_multipath configfs nfnetlink lcs ctcm fsm zfcp scsi_transport_fc ghash_s390 prng chacha_s390 libchacha aes_s390 des_s390 libdes sha3_512_s390 sha3_256_s390 sha512_s390 sha256_s390 sha1_s390 sha_common scm_block eadm_sch scsi_dh_rdac scsi_dh_emc scsi_dh_alua pkey zcrypt rng_core autofs4 2024/07/23 18:01:21 [M83LP63]: [ 157.267792] CPU: 21 PID: 725 Comm: kworker/dying Not tainted 6.10.0-rc2-00239-g68f83057b913 #95 2024/07/23 18:01:21 [M83LP63]: [ 157.267796] Hardware name: IBM 3906 M04 704 (LPAR) 2024/07/23 18:01:21 [M83LP63]: [ 157.267802] R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:1 PM:0 RI:0 EA:3 2024/07/23 18:01:21 [M83LP63]: [ 157.267797] Krnl PSW : 0704d00180000000 000003d600fcd9fa (worker_thread+0x552/0x558) 2024/07/23 18:01:21 [M83LP63]: [ 157.267806] Krnl GPRS: 6479696e6700776f 000002c901b62780 000003d602493ec8 000002c914954600 2024/07/23 18:01:21 [M83LP63]: [ 157.267809] 0000000000000000 0000000000000008 000002c901a85400 000002c90719e840 2024/07/23 18:01:21 [M83LP63]: [ 157.267811] 000002c90719e880 000002c901a85420 000002c91127adf0 000002c901a85400 2024/07/23 18:01:21 [M83LP63]: [ 157.267813] 000002c914954600 0000000000000000 000003d600fcd772 000003560452bd98 2024/07/23 18:01:21 [M83LP63]: [ 157.267822] Krnl Code: 000003d600fcd9ec: c0e500674262 brasl %r14,000003d601cb5eb0 2024/07/23 18:01:21 [M83LP63]: [ 157.267822] 000003d600fcd9f2: a7f4ffc8 brc 15,000003d600fcd982 2024/07/23 18:01:21 [M83LP63]: [ 157.267822] #000003d600fcd9f6: af000000 mc 0,0 2024/07/23 18:01:21 [M83LP63]: [ 157.267822] >000003d600fcd9fa: a7f4fec2 brc 15,000003d600fcd77e 2024/07/23 18:01:21 [M83LP63]: [ 157.267822] 000003d600fcd9fe: 0707 bcr 0,%r7 2024/07/23 18:01:21 [M83LP63]: [ 157.267822] 000003d600fcda00: c00400682e10 brcl 0,000003d601cd3620 2024/07/23 18:01:21 [M83LP63]: [ 157.267822] 000003d600fcda06: eb7ff0500024 stmg %r7,%r15,80(%r15) 2024/07/23 18:01:21 [M83LP63]: [ 157.267822] 000003d600fcda0c: b90400ef lgr %r14,%r15 2024/07/23 18:01:21 [M83LP63]: [ 157.267853] Call Trace: 2024/07/23 18:01:21 [M83LP63]: [ 157.267855] [<000003d600fcd9fa>] worker_thread+0x552/0x558 2024/07/23 18:01:21 [M83LP63]: [ 157.267859] ([<000003d600fcd772>] worker_thread+0x2ca/0x558) 2024/07/23 18:01:21 [M83LP63]: [ 157.267862] [<000003d600fd6c80>] kthread+0x120/0x128 2024/07/23 18:01:21 [M83LP63]: [ 157.267865] [<000003d600f5305c>] __ret_from_fork+0x3c/0x58 2024/07/23 18:01:21 [M83LP63]: [ 157.267868] [<000003d601cc746a>] ret_from_fork+0xa/0x30 2024/07/23 18:01:21 [M83LP63]: [ 157.267873] Last Breaking-Event-Address: 2024/07/23 18:01:21 [M83LP63]: [ 157.267874] [<000003d600fcd778>] worker_thread+0x2d0/0x558 Since the procedure of destroying workers is changed, the WARN_ON_ONCE() becomes incorrect and should be removed. Cc: Marc Hartmayer Link: https://lore.kernel.org/lkml/87le1sjd2e.fsf@linux.ibm.com/ Reported-by: Marc Hartmayer Fixes: 68f83057b913 ("workqueue: Reap workers via kthread_stop() and remove detach_completion") Cc: stable@vger.kernel.org # v6.11+ Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b35f8ce80bc7c9..d56bd2277e58e8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3351,7 +3351,6 @@ static int worker_thread(void *__worker) set_pf_worker(false); ida_free(&pool->worker_ida, worker->id); - WARN_ON_ONCE(!list_empty(&worker->entry)); return 0; } From 8bc35475ef1a23b0e224f3242eb11c76cab0ea88 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Aug 2024 09:37:25 -1000 Subject: [PATCH 090/991] workqueue: Fix spruious data race in __flush_work() When flushing a work item for cancellation, __flush_work() knows that it exclusively owns the work item through its PENDING bit. 134874e2eee9 ("workqueue: Allow cancel_work_sync() and disable_work() from atomic contexts on BH work items") added a read of @work->data to determine whether to use busy wait for BH work items that are being canceled. While the read is safe when @from_cancel, @work->data was read before testing @from_cancel to simplify code structure: data = *work_data_bits(work); if (from_cancel && !WARN_ON_ONCE(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_BH)) { While the read data was never used if !@from_cancel, this could trigger KCSAN data race detection spuriously: ================================================================== BUG: KCSAN: data-race in __flush_work / __flush_work write to 0xffff8881223aa3e8 of 8 bytes by task 3998 on cpu 0: instrument_write include/linux/instrumented.h:41 [inline] ___set_bit include/asm-generic/bitops/instrumented-non-atomic.h:28 [inline] insert_wq_barrier kernel/workqueue.c:3790 [inline] start_flush_work kernel/workqueue.c:4142 [inline] __flush_work+0x30b/0x570 kernel/workqueue.c:4178 flush_work kernel/workqueue.c:4229 [inline] ... read to 0xffff8881223aa3e8 of 8 bytes by task 50 on cpu 1: __flush_work+0x42a/0x570 kernel/workqueue.c:4188 flush_work kernel/workqueue.c:4229 [inline] flush_delayed_work+0x66/0x70 kernel/workqueue.c:4251 ... value changed: 0x0000000000400000 -> 0xffff88810006c00d Reorganize the code so that @from_cancel is tested before @work->data is accessed. The only problem is triggering KCSAN detection spuriously. This shouldn't need READ_ONCE() or other access qualifiers. No functional changes. Signed-off-by: Tejun Heo Reported-by: syzbot+b3e4f2f51ed645fd5df2@syzkaller.appspotmail.com Fixes: 134874e2eee9 ("workqueue: Allow cancel_work_sync() and disable_work() from atomic contexts on BH work items") Link: http://lkml.kernel.org/r/000000000000ae429e061eea2157@google.com Cc: Jens Axboe --- kernel/workqueue.c | 45 +++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d56bd2277e58e8..ef174d8c1f6391 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4166,7 +4166,6 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, static bool __flush_work(struct work_struct *work, bool from_cancel) { struct wq_barrier barr; - unsigned long data; if (WARN_ON(!wq_online)) return false; @@ -4184,29 +4183,35 @@ static bool __flush_work(struct work_struct *work, bool from_cancel) * was queued on a BH workqueue, we also know that it was running in the * BH context and thus can be busy-waited. */ - data = *work_data_bits(work); - if (from_cancel && - !WARN_ON_ONCE(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_BH)) { - /* - * On RT, prevent a live lock when %current preempted soft - * interrupt processing or prevents ksoftirqd from running by - * keeping flipping BH. If the BH work item runs on a different - * CPU then this has no effect other than doing the BH - * disable/enable dance for nothing. This is copied from - * kernel/softirq.c::tasklet_unlock_spin_wait(). - */ - while (!try_wait_for_completion(&barr.done)) { - if (IS_ENABLED(CONFIG_PREEMPT_RT)) { - local_bh_disable(); - local_bh_enable(); - } else { - cpu_relax(); + if (from_cancel) { + unsigned long data = *work_data_bits(work); + + if (!WARN_ON_ONCE(data & WORK_STRUCT_PWQ) && + (data & WORK_OFFQ_BH)) { + /* + * On RT, prevent a live lock when %current preempted + * soft interrupt processing or prevents ksoftirqd from + * running by keeping flipping BH. If the BH work item + * runs on a different CPU then this has no effect other + * than doing the BH disable/enable dance for nothing. + * This is copied from + * kernel/softirq.c::tasklet_unlock_spin_wait(). + */ + while (!try_wait_for_completion(&barr.done)) { + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + local_bh_disable(); + local_bh_enable(); + } else { + cpu_relax(); + } } + goto out_destroy; } - } else { - wait_for_completion(&barr.done); } + wait_for_completion(&barr.done); + +out_destroy: destroy_work_on_stack(&barr.work); return true; } From c4c8f369b6a6d21ce27286de1501137771e01dc3 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 5 Aug 2024 09:30:29 +0200 Subject: [PATCH 091/991] workqueue: Correct declaration of cpu_pwq in struct workqueue_struct cpu_pwq is used in various percpu functions that expect variable in __percpu address space. Correct the declaration of cpu_pwq to struct pool_workqueue __rcu * __percpu *cpu_pwq to declare the variable as __percpu pointer. The patch also fixes following sparse errors: workqueue.c:380:37: warning: duplicate [noderef] workqueue.c:380:37: error: multiple address spaces given: __rcu & __percpu workqueue.c:2271:15: error: incompatible types in comparison expression (different address spaces): workqueue.c:2271:15: struct pool_workqueue [noderef] __rcu * workqueue.c:2271:15: struct pool_workqueue [noderef] __percpu * and uncovers a couple of exisiting "incorrect type in assignment" warnings (from __rcu address space), which this patch does not address. Found by GCC's named address space checks. There were no changes in the resulting object files. Signed-off-by: Uros Bizjak Cc: Tejun Heo Cc: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ef174d8c1f6391..e7b005ff375033 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -377,7 +377,7 @@ struct workqueue_struct { /* hot fields used during command issue, aligned to cacheline */ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ - struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */ + struct pool_workqueue __rcu * __percpu *cpu_pwq; /* I: per-cpu pwqs */ struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */ }; From 8fcd8d1e63c05c48b3ac16d0c3e2cd6a7a5c8ec4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 26 Jul 2024 04:23:14 +0900 Subject: [PATCH 092/991] kbuild: clean up code duplication in cmd_fdtoverlay When resolving a merge conflict, Linus noticed the fdtoverlay command duplication introduced by commit 49636c5680b9 ("kbuild: verify dtoverlay files against schema"). He suggested a clean-up. I eliminated the duplication and refactored the code a little further. No functional changes are intended, except for the short logs. The log will look as follows: $ make ARCH=arm64 defconfig dtbs_check [ snip ] DTC [C] arch/arm64/boot/dts/freescale/imx93-tqma9352-mba93xxca.dtb DTC [C] arch/arm64/boot/dts/freescale/imx93-tqma9352-mba93xxla.dtb DTC [C] arch/arm64/boot/dts/freescale/imx93-var-som-symphony.dtb DTC [C] arch/arm64/boot/dts/freescale/imx95-19x19-evk.dtb DTC arch/arm64/boot/dts/freescale/imx8mm-venice-gw72xx-0x-imx219.dtbo OVL [C] arch/arm64/boot/dts/freescale/imx8mm-venice-gw72xx-0x-imx219.dtb The tag [C] indicates that the schema check is executed. Link: https://lore.kernel.org/lkml/CAHk-=wiF3yeWehcvqY-4X7WNb8n4yw_5t0H1CpEpKi7JMjaMfw@mail.gmail.com/#t Requested-by: Linus Torvalds Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier --- scripts/Makefile.lib | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index fe3668dc4954b2..207325eaf1d1cb 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -400,26 +400,23 @@ $(obj)/%.dtb.S: $(obj)/%.dtb FORCE $(obj)/%.dtbo.S: $(obj)/%.dtbo FORCE $(call if_changed,wrap_S_dtb) -quiet_cmd_dtc = DTC $@ +quiet_dtb_check_tag = $(if $(dtb-check-enabled),[C], ) +cmd_dtb_check = $(if $(dtb-check-enabled),; $(DT_CHECKER) $(DT_CHECKER_FLAGS) -u $(srctree)/$(DT_BINDING_DIR) -p $(DT_TMP_SCHEMA) $@ || true) + +quiet_cmd_dtc = DTC $(quiet_dtb_check_tag) $@ cmd_dtc = $(HOSTCC) -E $(dtc_cpp_flags) -x assembler-with-cpp -o $(dtc-tmp) $< ; \ $(DTC) -o $@ -b 0 \ $(addprefix -i,$(dir $<) $(DTC_INCLUDE)) $(DTC_FLAGS) \ -d $(depfile).dtc.tmp $(dtc-tmp) ; \ - cat $(depfile).pre.tmp $(depfile).dtc.tmp > $(depfile) - -DT_CHECK_CMD = $(DT_CHECKER) $(DT_CHECKER_FLAGS) -u $(srctree)/$(DT_BINDING_DIR) -p $(DT_TMP_SCHEMA) + cat $(depfile).pre.tmp $(depfile).dtc.tmp > $(depfile) \ + $(cmd_dtb_check) # NOTE: # Do not replace $(filter %.dtb %.dtbo, $^) with $(real-prereqs). When a single # DTB is turned into a multi-blob DTB, $^ will contain header file dependencies # recorded in the .*.cmd file. -ifneq ($(CHECK_DTBS),) -quiet_cmd_fdtoverlay = DTOVLCH $@ - cmd_fdtoverlay = $(objtree)/scripts/dtc/fdtoverlay -o $@ -i $(filter %.dtb %.dtbo, $^) ; $(DT_CHECK_CMD) $@ || true -else -quiet_cmd_fdtoverlay = DTOVL $@ - cmd_fdtoverlay = $(objtree)/scripts/dtc/fdtoverlay -o $@ -i $(filter %.dtb %.dtbo, $^) -endif +quiet_cmd_fdtoverlay = OVL $(quiet_dtb_check_tag) $@ + cmd_fdtoverlay = $(objtree)/scripts/dtc/fdtoverlay -o $@ -i $(filter %.dtb %.dtbo, $^) $(cmd_dtb_check) $(multi-dtb-y): FORCE $(call if_changed,fdtoverlay) @@ -430,16 +427,11 @@ DT_CHECKER ?= dt-validate DT_CHECKER_FLAGS ?= $(if $(DT_SCHEMA_FILES),-l $(DT_SCHEMA_FILES),-m) DT_BINDING_DIR := Documentation/devicetree/bindings DT_TMP_SCHEMA := $(objtree)/$(DT_BINDING_DIR)/processed-schema.json - -quiet_cmd_dtb = DTC_CHK $@ - cmd_dtb = $(cmd_dtc) ; $(DT_CHECK_CMD) $@ || true -else -quiet_cmd_dtb = $(quiet_cmd_dtc) - cmd_dtb = $(cmd_dtc) +dtb-check-enabled = $(if $(filter %.dtb, $@),y) endif $(obj)/%.dtb: $(obj)/%.dts $(DTC) $(DT_TMP_SCHEMA) FORCE - $(call if_changed_dep,dtb) + $(call if_changed_dep,dtc) $(obj)/%.dtbo: $(src)/%.dtso $(DTC) FORCE $(call if_changed_dep,dtc) From 6fc9aacad49e3fbecd270c266850d50c453d52ef Mon Sep 17 00:00:00 2001 From: Alexandre Courbot Date: Sun, 4 Aug 2024 14:50:57 +0900 Subject: [PATCH 093/991] Makefile: add $(srctree) to dependency of compile_commands.json target When trying to build compile_commands.json for an external module against the kernel built in a separate output directory, the following error is displayed: make[1]: *** No rule to make target 'scripts/clang-tools/gen_compile_commands.py', needed by 'compile_commands.json'. Stop. This is because gen_compile_commands.py was previously looked up using a relative path to $(srctree), but commit b1992c3772e6 ("kbuild: use $(src) instead of $(srctree)/$(src) for source directory") stopped defining VPATH for external module builds. Prefixing gen_compile_commands.py with $(srctree) fixes the problem. Fixes: b1992c3772e6 ("kbuild: use $(src) instead of $(srctree)/$(src) for source directory") Signed-off-by: Alexandre Courbot Reviewed-by: Nicolas Schier Signed-off-by: Masahiro Yamada --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 44c02a6f60a145..44c33de28e933f 100644 --- a/Makefile +++ b/Makefile @@ -1980,7 +1980,7 @@ nsdeps: modules quiet_cmd_gen_compile_commands = GEN $@ cmd_gen_compile_commands = $(PYTHON3) $< -a $(AR) -o $@ $(filter-out $<, $(real-prereqs)) -$(extmod_prefix)compile_commands.json: scripts/clang-tools/gen_compile_commands.py \ +$(extmod_prefix)compile_commands.json: $(srctree)/scripts/clang-tools/gen_compile_commands.py \ $(if $(KBUILD_EXTMOD),, vmlinux.a $(KBUILD_VMLINUX_LIBS)) \ $(if $(CONFIG_MODULES), $(MODORDER)) FORCE $(call if_changed,gen_compile_commands) From e2006140ad2e01a02ed0aff49cc2ae3ceeb11f8d Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 13 Jun 2024 15:05:03 +0300 Subject: [PATCH 094/991] thunderbolt: Mark XDomain as unplugged when router is removed I noticed that when we do discrete host router NVM upgrade and it gets hot-removed from the PCIe side as a result of NVM firmware authentication, if there is another host connected with enabled paths we hang in tearing them down. This is due to fact that the Thunderbolt networking driver also tries to cleanup the paths and ends up blocking in tb_disconnect_xdomain_paths() waiting for the domain lock. However, at this point we already cleaned the paths in tb_stop() so there is really no need for tb_disconnect_xdomain_paths() to do that anymore. Furthermore it already checks if the XDomain is unplugged and bails out early so take advantage of that and mark the XDomain as unplugged when we remove the parent router. Cc: stable@vger.kernel.org Signed-off-by: Mika Westerberg --- drivers/thunderbolt/switch.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/thunderbolt/switch.c b/drivers/thunderbolt/switch.c index 326433df5880e2..6a2116cbb06f92 100644 --- a/drivers/thunderbolt/switch.c +++ b/drivers/thunderbolt/switch.c @@ -3392,6 +3392,7 @@ void tb_switch_remove(struct tb_switch *sw) tb_switch_remove(port->remote->sw); port->remote = NULL; } else if (port->xdomain) { + port->xdomain->is_unplugged = true; tb_xdomain_remove(port->xdomain); port->xdomain = NULL; } From 33330bcf031818e60a816db0cfd3add9eecc3b28 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Mon, 5 Aug 2024 11:22:34 +0200 Subject: [PATCH 095/991] scripts: kconfig: merge_config: config files: add a trailing newline When merging files without trailing newlines at the end of the file, two config fragments end up at the same row if file1.config doens't have a trailing newline at the end of the file. file1.config "CONFIG_1=y" file2.config "CONFIG_2=y" ./scripts/kconfig/merge_config.sh -m .config file1.config file2.config This will generate a .config looking like this. cat .config ... CONFIG_1=yCONFIG_2=y" Making sure so we add a newline at the end of every config file that is passed into the script. Signed-off-by: Anders Roxell Signed-off-by: Masahiro Yamada --- scripts/kconfig/merge_config.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/kconfig/merge_config.sh b/scripts/kconfig/merge_config.sh index 902eb429b9dbd9..0b7952471c18f6 100755 --- a/scripts/kconfig/merge_config.sh +++ b/scripts/kconfig/merge_config.sh @@ -167,6 +167,8 @@ for ORIG_MERGE_FILE in $MERGE_LIST ; do sed -i "/$CFG[ =]/d" $MERGE_FILE fi done + # In case the previous file lacks a new line at the end + echo >> $TMP_FILE cat $MERGE_FILE >> $TMP_FILE done From 5a44bb061d04b0306f2aa8add761d86d152b9377 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 1 Aug 2024 14:31:09 +0200 Subject: [PATCH 096/991] KVM: s390: fix validity interception issue when gisa is switched off We might run into a SIE validity if gisa has been disabled either via using kernel parameter "kvm.use_gisa=0" or by setting the related sysfs attribute to N (echo N >/sys/module/kvm/parameters/use_gisa). The validity is caused by an invalid value in the SIE control block's gisa designation. That happens because we pass the uninitialized gisa origin to virt_to_phys() before writing it to the gisa designation. To fix this we return 0 in kvm_s390_get_gisa_desc() if the origin is 0. kvm_s390_get_gisa_desc() is used to determine which gisa designation to set in the SIE control block. A value of 0 in the gisa designation disables gisa usage. The issue surfaces in the host kernel with the following kernel message as soon a new kvm guest start is attemted. kvm: unhandled validity intercept 0x1011 WARNING: CPU: 0 PID: 781237 at arch/s390/kvm/intercept.c:101 kvm_handle_sie_intercept+0x42e/0x4d0 [kvm] Modules linked in: vhost_net tap tun xt_CHECKSUM xt_MASQUERADE xt_conntrack ipt_REJECT xt_tcpudp nft_compat x_tables nf_nat_tftp nf_conntrack_tftp vfio_pci_core irqbypass vhost_vsock vmw_vsock_virtio_transport_common vsock vhost vhost_iotlb kvm nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables sunrpc mlx5_ib ib_uverbs ib_core mlx5_core uvdevice s390_trng eadm_sch vfio_ccw zcrypt_cex4 mdev vfio_iommu_type1 vfio sch_fq_codel drm i2c_core loop drm_panel_orientation_quirks configfs nfnetlink lcs ctcm fsm dm_service_time ghash_s390 prng chacha_s390 libchacha aes_s390 des_s390 libdes sha3_512_s390 sha3_256_s390 sha512_s390 sha256_s390 sha1_s390 sha_common dm_mirror dm_region_hash dm_log zfcp scsi_transport_fc scsi_dh_rdac scsi_dh_emc scsi_dh_alua pkey zcrypt dm_multipath rng_core autofs4 [last unloaded: vfio_pci] CPU: 0 PID: 781237 Comm: CPU 0/KVM Not tainted 6.10.0-08682-gcad9f11498ea #6 Hardware name: IBM 3931 A01 701 (LPAR) Krnl PSW : 0704c00180000000 000003d93deb0122 (kvm_handle_sie_intercept+0x432/0x4d0 [kvm]) R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3 Krnl GPRS: 000003d900000027 000003d900000023 0000000000000028 000002cd00000000 000002d063a00900 00000359c6daf708 00000000000bebb5 0000000000001eff 000002cfd82e9000 000002cfd80bc000 0000000000001011 000003d93deda412 000003ff8962df98 000003d93de77ce0 000003d93deb011e 00000359c6daf960 Krnl Code: 000003d93deb0112: c020fffe7259 larl %r2,000003d93de7e5c4 000003d93deb0118: c0e53fa8beac brasl %r14,000003d9bd3c7e70 #000003d93deb011e: af000000 mc 0,0 >000003d93deb0122: a728ffea lhi %r2,-22 000003d93deb0126: a7f4fe24 brc 15,000003d93deafd6e 000003d93deb012a: 9101f0b0 tm 176(%r15),1 000003d93deb012e: a774fe48 brc 7,000003d93deafdbe 000003d93deb0132: 40a0f0ae sth %r10,174(%r15) Call Trace: [<000003d93deb0122>] kvm_handle_sie_intercept+0x432/0x4d0 [kvm] ([<000003d93deb011e>] kvm_handle_sie_intercept+0x42e/0x4d0 [kvm]) [<000003d93deacc10>] vcpu_post_run+0x1d0/0x3b0 [kvm] [<000003d93deaceda>] __vcpu_run+0xea/0x2d0 [kvm] [<000003d93dead9da>] kvm_arch_vcpu_ioctl_run+0x16a/0x430 [kvm] [<000003d93de93ee0>] kvm_vcpu_ioctl+0x190/0x7c0 [kvm] [<000003d9bd728b4e>] vfs_ioctl+0x2e/0x70 [<000003d9bd72a092>] __s390x_sys_ioctl+0xc2/0xd0 [<000003d9be0e9222>] __do_syscall+0x1f2/0x2e0 [<000003d9be0f9a90>] system_call+0x70/0x98 Last Breaking-Event-Address: [<000003d9bd3c7f58>] __warn_printk+0xe8/0xf0 Cc: stable@vger.kernel.org Reported-by: Christian Borntraeger Fixes: fe0ef0030463 ("KVM: s390: sort out physical vs virtual pointers usage") Signed-off-by: Michael Mueller Tested-by: Christian Borntraeger Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20240801123109.2782155-1-mimu@linux.ibm.com Message-ID: <20240801123109.2782155-1-mimu@linux.ibm.com> Signed-off-by: Janosch Frank --- arch/s390/kvm/kvm-s390.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index bf8534218af3dc..e680c6bf0c9d94 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -267,7 +267,12 @@ static inline unsigned long kvm_s390_get_gfn_end(struct kvm_memslots *slots) static inline u32 kvm_s390_get_gisa_desc(struct kvm *kvm) { - u32 gd = virt_to_phys(kvm->arch.gisa_int.origin); + u32 gd; + + if (!kvm->arch.gisa_int.origin) + return 0; + + gd = virt_to_phys(kvm->arch.gisa_int.origin); if (gd && sclp.has_gisaf) gd |= GISA_FORMAT1; From ee9a68394b4bea8b9044ec4bfdbaacf45297ecef Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Thu, 1 Aug 2024 21:14:04 +0200 Subject: [PATCH 097/991] riscv: Re-introduce global icache flush in patch_text_XXX() commit edf2d546bfd6 ("riscv: patch: Flush the icache right after patching to avoid illegal insns") mistakenly removed the global icache flush in patch_text_nosync() and patch_text_set_nosync() functions, so reintroduce them. Fixes: edf2d546bfd6 ("riscv: patch: Flush the icache right after patching to avoid illegal insns") Reported-by: Samuel Holland Closes: https://lore.kernel.org/linux-riscv/a28ddc26-d77a-470a-a33f-88144f717e86@sifive.com/ Signed-off-by: Alexandre Ghiti Reviewed-by: Samuel Holland Reviewed-by: Charlie Jenkins Link: https://lore.kernel.org/r/20240801191404.55181-1-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/patch.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/riscv/kernel/patch.c b/arch/riscv/kernel/patch.c index 69e5796fc51fde..34ef522f07a8c2 100644 --- a/arch/riscv/kernel/patch.c +++ b/arch/riscv/kernel/patch.c @@ -205,6 +205,8 @@ int patch_text_set_nosync(void *addr, u8 c, size_t len) int ret; ret = patch_insn_set(addr, c, len); + if (!ret) + flush_icache_range((uintptr_t)addr, (uintptr_t)addr + len); return ret; } @@ -239,6 +241,8 @@ int patch_text_nosync(void *addr, const void *insns, size_t len) int ret; ret = patch_insn_write(addr, insns, len); + if (!ret) + flush_icache_range((uintptr_t)addr, (uintptr_t)addr + len); return ret; } From fbc05142ccdd0061f6d0e489608935943d2984a1 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 6 Aug 2024 12:01:40 -0700 Subject: [PATCH 098/991] perf tools: Add tools/include/uapi/README Write down the reason why we keep a copy of headers to the README file instead of adding it to every commit messages. Suggested-by: Jani Nikula Original-by: Arnaldo Carvalho de Melo Original-by: Ingo Molnar Signed-off-by: Namhyung Kim --- tools/include/uapi/README | 73 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 tools/include/uapi/README diff --git a/tools/include/uapi/README b/tools/include/uapi/README new file mode 100644 index 00000000000000..7147b1b2cb2850 --- /dev/null +++ b/tools/include/uapi/README @@ -0,0 +1,73 @@ +Why we want a copy of kernel headers in tools? +============================================== + +There used to be no copies, with tools/ code using kernel headers +directly. From time to time tools/perf/ broke due to legitimate kernel +hacking. At some point Linus complained about such direct usage. Then we +adopted the current model. + +The way these headers are used in perf are not restricted to just +including them to compile something. + +There are sometimes used in scripts that convert defines into string +tables, etc, so some change may break one of these scripts, or new MSRs +may use some different #define pattern, etc. + +E.g.: + + $ ls -1 tools/perf/trace/beauty/*.sh | head -5 + tools/perf/trace/beauty/arch_errno_names.sh + tools/perf/trace/beauty/drm_ioctl.sh + tools/perf/trace/beauty/fadvise.sh + tools/perf/trace/beauty/fsconfig.sh + tools/perf/trace/beauty/fsmount.sh + $ + $ tools/perf/trace/beauty/fadvise.sh + static const char *fadvise_advices[] = { + [0] = "NORMAL", + [1] = "RANDOM", + [2] = "SEQUENTIAL", + [3] = "WILLNEED", + [4] = "DONTNEED", + [5] = "NOREUSE", + }; + $ + +The tools/perf/check-headers.sh script, part of the tools/ build +process, points out changes in the original files. + +So its important not to touch the copies in tools/ when doing changes in +the original kernel headers, that will be done later, when +check-headers.sh inform about the change to the perf tools hackers. + +Another explanation from Ingo Molnar: +It's better than all the alternatives we tried so far: + + - Symbolic links and direct #includes: this was the original approach but + was pushed back on from the kernel side, when tooling modified the + headers and broke them accidentally for kernel builds. + + - Duplicate self-defined ABI headers like glibc: double the maintenance + burden, double the chance for mistakes, plus there's no tech-driven + notification mechanism to look at new kernel side changes. + +What we are doing now is a third option: + + - A software-enforced copy-on-write mechanism of kernel headers to + tooling, driven by non-fatal warnings on the tooling side build when + kernel headers get modified: + + Warning: Kernel ABI header differences: + diff -u tools/include/uapi/drm/i915_drm.h include/uapi/drm/i915_drm.h + diff -u tools/include/uapi/linux/fs.h include/uapi/linux/fs.h + diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h + ... + + The tooling policy is to always pick up the kernel side headers as-is, + and integate them into the tooling build. The warnings above serve as a + notification to tooling maintainers that there's changes on the kernel + side. + +We've been using this for many years now, and it might seem hacky, but +works surprisingly well. + From aef21f6b6a4aae648c890e74c2322d10ab267249 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 6 Aug 2024 09:59:26 -0700 Subject: [PATCH 099/991] tools/include: Sync uapi/drm/i915_drm.h with the kernel sources To pick up changes from: 0f1bb41bf396 drm/i915: Support replaying GPU hangs with captured context image This should be used to beautify DRM syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/drm/i915_drm.h include/uapi/drm/i915_drm.h Please see tools/include/uapi/README for details (it's in the first patch of this series). Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Rodrigo Vivi Cc: intel-gfx@lists.freedesktop.org Signed-off-by: Namhyung Kim --- tools/include/uapi/drm/i915_drm.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h index d4d86e566e0777..535cb68fdb5c4b 100644 --- a/tools/include/uapi/drm/i915_drm.h +++ b/tools/include/uapi/drm/i915_drm.h @@ -2163,6 +2163,15 @@ struct drm_i915_gem_context_param { * supports this per context flag. */ #define I915_CONTEXT_PARAM_LOW_LATENCY 0xe + +/* + * I915_CONTEXT_PARAM_CONTEXT_IMAGE: + * + * Allows userspace to provide own context images. + * + * Note that this is a debug API not available on production kernel builds. + */ +#define I915_CONTEXT_PARAM_CONTEXT_IMAGE 0xf /* Must be kept compact -- no holes and well documented */ /** @value: Context parameter value to be set or queried */ @@ -2564,6 +2573,24 @@ struct i915_context_param_engines { struct i915_engine_class_instance engines[N__]; \ } __attribute__((packed)) name__ +struct i915_gem_context_param_context_image { + /** @engine: Engine class & instance to be configured. */ + struct i915_engine_class_instance engine; + + /** @flags: One of the supported flags or zero. */ + __u32 flags; +#define I915_CONTEXT_IMAGE_FLAG_ENGINE_INDEX (1u << 0) + + /** @size: Size of the image blob pointed to by @image. */ + __u32 size; + + /** @mbz: Must be zero. */ + __u32 mbz; + + /** @image: Userspace memory containing the context image. */ + __u64 image; +} __attribute__((packed)); + /** * struct drm_i915_gem_context_create_ext_setparam - Context parameter * to set or query during context creation. From a625df3995c31a5d8cf46f2337b207e93bef9bdd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 6 Aug 2024 12:07:50 -0700 Subject: [PATCH 100/991] tools/include: Sync uapi/linux/kvm.h with the kernel sources And other arch-specific UAPI headers to pick up changes from: 4b23e0c199b2 KVM: Ensure new code that references immediate_exit gets extra scrutiny 85542adb65ec KVM: x86: Add KVM_RUN_X86_GUEST_MODE kvm_run flag 6fef518594bc KVM: x86: Add a capability to configure bus frequency for APIC timer 34ff65901735 x86/sev: Use kernel provided SVSM Calling Areas 5dcc1e76144f Merge tag 'kvm-x86-misc-6.11' of https://github.com/kvm-x86/linux into HEAD 9a0d2f4995dd KVM: PPC: Book3S HV: Add one-reg interface for HASHPKEYR register e9eb790b2557 KVM: PPC: Book3S HV: Add one-reg interface for HASHKEYR register 1a1e6865f516 KVM: PPC: Book3S HV: Add one-reg interface for DEXCR register This should be used to beautify KVM syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h diff -u tools/arch/x86/include/uapi/asm/kvm.h arch/x86/include/uapi/asm/kvm.h diff -u tools/arch/x86/include/uapi/asm/svm.h arch/x86/include/uapi/asm/svm.h diff -u tools/arch/powerpc/include/uapi/asm/kvm.h arch/powerpc/include/uapi/asm/kvm.h Please see tools/include/uapi/README for details (it's in the first patch of this series). Cc: Paolo Bonzini Cc: kvm@vger.kernel.org Signed-off-by: Namhyung Kim --- tools/arch/powerpc/include/uapi/asm/kvm.h | 3 ++ tools/arch/x86/include/uapi/asm/kvm.h | 49 +++++++++++++++++++++++ tools/arch/x86/include/uapi/asm/svm.h | 1 + tools/include/uapi/linux/kvm.h | 17 +++++++- 4 files changed, 69 insertions(+), 1 deletion(-) diff --git a/tools/arch/powerpc/include/uapi/asm/kvm.h b/tools/arch/powerpc/include/uapi/asm/kvm.h index 1691297a766a9c..eaeda001784ebb 100644 --- a/tools/arch/powerpc/include/uapi/asm/kvm.h +++ b/tools/arch/powerpc/include/uapi/asm/kvm.h @@ -645,6 +645,9 @@ struct kvm_ppc_cpu_char { #define KVM_REG_PPC_SIER3 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc3) #define KVM_REG_PPC_DAWR1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc4) #define KVM_REG_PPC_DAWRX1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc5) +#define KVM_REG_PPC_DEXCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc6) +#define KVM_REG_PPC_HASHKEYR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc7) +#define KVM_REG_PPC_HASHPKEYR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc8) /* Transactional Memory checkpointed state: * This is all GPRs, all VSX regs and a subset of SPRs diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index 9fae1b73b529ca..bf57a824f72281 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -106,6 +106,7 @@ struct kvm_ioapic_state { #define KVM_RUN_X86_SMM (1 << 0) #define KVM_RUN_X86_BUS_LOCK (1 << 1) +#define KVM_RUN_X86_GUEST_MODE (1 << 2) /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { @@ -697,6 +698,11 @@ enum sev_cmd_id { /* Second time is the charm; improved versions of the above ioctls. */ KVM_SEV_INIT2, + /* SNP-specific commands */ + KVM_SEV_SNP_LAUNCH_START = 100, + KVM_SEV_SNP_LAUNCH_UPDATE, + KVM_SEV_SNP_LAUNCH_FINISH, + KVM_SEV_NR_MAX, }; @@ -824,6 +830,48 @@ struct kvm_sev_receive_update_data { __u32 pad2; }; +struct kvm_sev_snp_launch_start { + __u64 policy; + __u8 gosvw[16]; + __u16 flags; + __u8 pad0[6]; + __u64 pad1[4]; +}; + +/* Kept in sync with firmware values for simplicity. */ +#define KVM_SEV_SNP_PAGE_TYPE_NORMAL 0x1 +#define KVM_SEV_SNP_PAGE_TYPE_ZERO 0x3 +#define KVM_SEV_SNP_PAGE_TYPE_UNMEASURED 0x4 +#define KVM_SEV_SNP_PAGE_TYPE_SECRETS 0x5 +#define KVM_SEV_SNP_PAGE_TYPE_CPUID 0x6 + +struct kvm_sev_snp_launch_update { + __u64 gfn_start; + __u64 uaddr; + __u64 len; + __u8 type; + __u8 pad0; + __u16 flags; + __u32 pad1; + __u64 pad2[4]; +}; + +#define KVM_SEV_SNP_ID_BLOCK_SIZE 96 +#define KVM_SEV_SNP_ID_AUTH_SIZE 4096 +#define KVM_SEV_SNP_FINISH_DATA_SIZE 32 + +struct kvm_sev_snp_launch_finish { + __u64 id_block_uaddr; + __u64 id_auth_uaddr; + __u8 id_block_en; + __u8 auth_key_en; + __u8 vcek_disabled; + __u8 host_data[KVM_SEV_SNP_FINISH_DATA_SIZE]; + __u8 pad0[3]; + __u16 flags; + __u64 pad1[4]; +}; + #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) @@ -874,5 +922,6 @@ struct kvm_hyperv_eventfd { #define KVM_X86_SW_PROTECTED_VM 1 #define KVM_X86_SEV_VM 2 #define KVM_X86_SEV_ES_VM 3 +#define KVM_X86_SNP_VM 4 #endif /* _ASM_X86_KVM_H */ diff --git a/tools/arch/x86/include/uapi/asm/svm.h b/tools/arch/x86/include/uapi/asm/svm.h index 80e1df482337df..1814b413fd5783 100644 --- a/tools/arch/x86/include/uapi/asm/svm.h +++ b/tools/arch/x86/include/uapi/asm/svm.h @@ -115,6 +115,7 @@ #define SVM_VMGEXIT_AP_CREATE_ON_INIT 0 #define SVM_VMGEXIT_AP_CREATE 1 #define SVM_VMGEXIT_AP_DESTROY 2 +#define SVM_VMGEXIT_SNP_RUN_VMPL 0x80000018 #define SVM_VMGEXIT_HV_FEATURES 0x8000fffd #define SVM_VMGEXIT_TERM_REQUEST 0x8000fffe #define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code) \ diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index e5af8c692dc063..637efc05514534 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -192,11 +192,24 @@ struct kvm_xen_exit { /* Flags that describe what fields in emulation_failure hold valid data. */ #define KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES (1ULL << 0) +/* + * struct kvm_run can be modified by userspace at any time, so KVM must be + * careful to avoid TOCTOU bugs. In order to protect KVM, HINT_UNSAFE_IN_KVM() + * renames fields in struct kvm_run from to __unsafe when + * compiled into the kernel, ensuring that any use within KVM is obvious and + * gets extra scrutiny. + */ +#ifdef __KERNEL__ +#define HINT_UNSAFE_IN_KVM(_symbol) _symbol##__unsafe +#else +#define HINT_UNSAFE_IN_KVM(_symbol) _symbol +#endif + /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ struct kvm_run { /* in */ __u8 request_interrupt_window; - __u8 immediate_exit; + __u8 HINT_UNSAFE_IN_KVM(immediate_exit); __u8 padding1[6]; /* out */ @@ -918,6 +931,8 @@ struct kvm_enable_cap { #define KVM_CAP_GUEST_MEMFD 234 #define KVM_CAP_VM_TYPES 235 #define KVM_CAP_PRE_FAULT_MEMORY 236 +#define KVM_CAP_X86_APIC_BUS_CYCLES_NS 237 +#define KVM_CAP_X86_GUEST_MODE 238 struct kvm_irq_routing_irqchip { __u32 irqchip; From 8ec9497d3ef34fab216e277eca5035811f06b421 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 6 Aug 2024 12:07:50 -0700 Subject: [PATCH 101/991] tools/include: Sync uapi/linux/perf.h with the kernel sources To pick up changes from: 608f6976c309 perf/x86/intel: Support new data source for Lunar Lake This should be used to beautify perf syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/perf_event.h include/uapi/linux/perf_event.h Please see tools/include/uapi/README for details (it's in the first patch of this series). Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Ian Rogers Cc: Adrian Hunter Cc: "Liang, Kan" Cc: linux-perf-users@vger.kernel.org Signed-off-by: Namhyung Kim --- tools/include/uapi/linux/perf_event.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 3a64499b0f5d63..4842c36fdf8019 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -1349,12 +1349,14 @@ union perf_mem_data_src { #define PERF_MEM_LVLNUM_L2 0x02 /* L2 */ #define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ #define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ -/* 5-0x7 available */ +#define PERF_MEM_LVLNUM_L2_MHB 0x05 /* L2 Miss Handling Buffer */ +#define PERF_MEM_LVLNUM_MSC 0x06 /* Memory-side Cache */ +/* 0x7 available */ #define PERF_MEM_LVLNUM_UNC 0x08 /* Uncached */ #define PERF_MEM_LVLNUM_CXL 0x09 /* CXL */ #define PERF_MEM_LVLNUM_IO 0x0a /* I/O */ #define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ -#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB */ +#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB / L1 Miss Handling Buffer */ #define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */ #define PERF_MEM_LVLNUM_PMEM 0x0e /* PMEM */ #define PERF_MEM_LVLNUM_NA 0x0f /* N/A */ From b9735006762677c2cd794bfcb1463a9a6ed558dd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 6 Aug 2024 12:07:50 -0700 Subject: [PATCH 102/991] tools/include: Sync uapi/sound/asound.h with the kernel sources To pick up changes from: f05c1ffc2745 ALSA: pcm: reinvent the stream synchronization ID API This should be used to beautify sound syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/perf/trace/beauty/include/uapi/sound/asound.h include/uapi/sound/asound.h Please see tools/include/uapi/README for details (it's in the first patch of this series). Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: linux-sound@vger.kernel.org Signed-off-by: Namhyung Kim --- tools/perf/trace/beauty/include/uapi/sound/asound.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/perf/trace/beauty/include/uapi/sound/asound.h b/tools/perf/trace/beauty/include/uapi/sound/asound.h index 628d46a0da92eb..8bf7e8a0eb6f03 100644 --- a/tools/perf/trace/beauty/include/uapi/sound/asound.h +++ b/tools/perf/trace/beauty/include/uapi/sound/asound.h @@ -142,7 +142,7 @@ struct snd_hwdep_dsp_image { * * *****************************************************************************/ -#define SNDRV_PCM_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 17) +#define SNDRV_PCM_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 18) typedef unsigned long snd_pcm_uframes_t; typedef signed long snd_pcm_sframes_t; @@ -334,7 +334,7 @@ union snd_pcm_sync_id { unsigned char id[16]; unsigned short id16[8]; unsigned int id32[4]; -}; +} __attribute__((deprecated)); struct snd_pcm_info { unsigned int device; /* RO/WR (control): device number */ @@ -348,7 +348,7 @@ struct snd_pcm_info { int dev_subclass; /* SNDRV_PCM_SUBCLASS_* */ unsigned int subdevices_count; unsigned int subdevices_avail; - union snd_pcm_sync_id sync; /* hardware synchronization ID */ + unsigned char pad1[16]; /* was: hardware synchronization ID */ unsigned char reserved[64]; /* reserved for future... */ }; @@ -420,7 +420,8 @@ struct snd_pcm_hw_params { unsigned int rate_num; /* R: rate numerator */ unsigned int rate_den; /* R: rate denominator */ snd_pcm_uframes_t fifo_size; /* R: chip FIFO size in frames */ - unsigned char reserved[64]; /* reserved for future */ + unsigned char sync[16]; /* R: synchronization ID (perfect sync - one clock source) */ + unsigned char reserved[48]; /* reserved for future */ }; enum { From 379d9af3f3da2da1bbfa67baf1820c72a080d1f1 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Tue, 6 Aug 2024 14:51:13 +0800 Subject: [PATCH 103/991] selinux: fix potential counting error in avc_add_xperms_decision() The count increases only when a node is successfully added to the linked list. Cc: stable@vger.kernel.org Fixes: fa1aa143ac4a ("selinux: extended permissions for ioctls") Signed-off-by: Zhen Lei Acked-by: Stephen Smalley Signed-off-by: Paul Moore --- security/selinux/avc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/selinux/avc.c b/security/selinux/avc.c index 32eb67fb3e42c0..7087cd2b802d8d 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -330,12 +330,12 @@ static int avc_add_xperms_decision(struct avc_node *node, { struct avc_xperms_decision_node *dest_xpd; - node->ae.xp_node->xp.len++; dest_xpd = avc_xperms_decision_alloc(src->used); if (!dest_xpd) return -ENOMEM; avc_copy_xperms_decision(&dest_xpd->xpd, src); list_add(&dest_xpd->xpd_list, &node->ae.xp_node->xpd_head); + node->ae.xp_node->xp.len++; return 0; } From fe992163575b187405899c5abaad8ef6fb828ff6 Mon Sep 17 00:00:00 2001 From: Sarthak Singh Date: Wed, 24 Jul 2024 22:57:06 +0530 Subject: [PATCH 104/991] rust: Support latest version of `rust-analyzer` Sets the `sysroot` field in rust-project.json which is now needed in newer versions of rust-analyzer instead of the `sysroot_src` field. Till [1] `rust-analyzer` used to guess the `sysroot` based on the `sysroot_src` at [2]. Now `sysroot` is a required parameter for a `rust-project.json` file. It is required because `rust-analyzer` need it to find the proc-macro server [3]. In the current version of `rust-analyzer` the `sysroot_src` is only used to include the inbuilt library crates (std, core, alloc, etc) [4]. Since we already specify the core library to be included in the `rust-project.json` we don't need to define the `sysroot_src`. Code editors like VS Code try to use the latest version of rust-analyzer (which is updated every week) instead of the version of rust-analyzer that comes with the rustup toolchain (which is updated every six weeks along with the rust version). Without this change `rust-analyzer` is breaking for anyone using VS Code. As they are getting the latest version of `rust-analyzer` with the changes made in [1]. `rust-analyzer` will also start breaking for other developers as they update their rust version (assuming that also updates the rust-analyzer version on their system). This patch should work with every setup as there is no more guess work being done by `rust-analyzer`. [ Lukas, who leads the rust-analyzer team, says: `sysroot_src` is required now if you want to have the sysroot source libraries be loaded. I think we used to infer it as `{sysroot}/lib/rustlib/src/rust/library` before when only the `sysroot` field was given but that was since changed to make it possible in having a sysroot without the standard library sources (that is only have the binaries available). So if you want the library sources to be loaded by rust-analyzer you will have to set that field as well now. - Miguel ] Link: https://github.com/rust-lang/rust-analyzer/pull/17287 [1] Link: https://github.com/rust-lang/rust-analyzer/blob/f372a8a1176ff8dd5f45ab2ddd45f3530db0374f/crates/project-model/src/workspace.rs#L367-L374 [2] Link: https://github.com/rust-lang/rust-analyzer/blob/eeb192b79aeac47b40add66347022af17a74fbaf/crates/project-model/src/sysroot.rs#L180-L192 [3] Link: https://github.com/search?q=repo%3AVeykril%2Frust-analyzer%20src_root()&type=code [4] Tested-by: Dirk Behme Signed-off-by: Sarthak Singh Link: https://rust-for-linux.zulipchat.com/#narrow/stream/291565-Help/topic/How.20to.20rust-analyzer.20correctly.20working Link: https://lore.kernel.org/r/20240724172713.899399-1-sarthak.singh99@gmail.com [ Formatted comment, fixed typo and removed spurious empty line. - Miguel ] Signed-off-by: Miguel Ojeda --- rust/Makefile | 2 +- scripts/generate_rust_analyzer.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/rust/Makefile b/rust/Makefile index 1f10f92737f2c0..6c0644b6090c7d 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -350,7 +350,7 @@ rust-analyzer: $(Q)$(srctree)/scripts/generate_rust_analyzer.py \ --cfgs='core=$(core-cfgs)' --cfgs='alloc=$(alloc-cfgs)' \ $(realpath $(srctree)) $(realpath $(objtree)) \ - $(RUST_LIB_SRC) $(KBUILD_EXTMOD) > \ + $(rustc_sysroot) $(RUST_LIB_SRC) $(KBUILD_EXTMOD) > \ $(if $(KBUILD_EXTMOD),$(extmod_prefix),$(objtree))/rust-project.json redirect-intrinsics = \ diff --git a/scripts/generate_rust_analyzer.py b/scripts/generate_rust_analyzer.py index f270c7b0cf345d..d2bc63cde8c6a3 100755 --- a/scripts/generate_rust_analyzer.py +++ b/scripts/generate_rust_analyzer.py @@ -145,6 +145,7 @@ def main(): parser.add_argument('--cfgs', action='append', default=[]) parser.add_argument("srctree", type=pathlib.Path) parser.add_argument("objtree", type=pathlib.Path) + parser.add_argument("sysroot", type=pathlib.Path) parser.add_argument("sysroot_src", type=pathlib.Path) parser.add_argument("exttree", type=pathlib.Path, nargs="?") args = parser.parse_args() @@ -154,9 +155,12 @@ def main(): level=logging.INFO if args.verbose else logging.WARNING ) + # Making sure that the `sysroot` and `sysroot_src` belong to the same toolchain. + assert args.sysroot in args.sysroot_src.parents + rust_project = { "crates": generate_crates(args.srctree, args.objtree, args.sysroot_src, args.exttree, args.cfgs), - "sysroot_src": str(args.sysroot_src), + "sysroot": str(args.sysroot), } json.dump(rust_project, sys.stdout, sort_keys=True, indent=4) From cff59d8631e1409ffdd22d9d717e15810181b32c Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Thu, 1 Aug 2024 13:25:48 +0200 Subject: [PATCH 105/991] s390/uv: Panic for set and remove shared access UVC errors The return value uv_set_shared() and uv_remove_shared() (which are wrappers around the share() function) is not always checked. The system integrity of a protected guest depends on the Share and Unshare UVCs being successful. This means that any caller that fails to check the return value will compromise the security of the protected guest. No code path that would lead to such violation of the security guarantees is currently exercised, since all the areas that are shared never get unshared during the lifetime of the system. This might change and become an issue in the future. The Share and Unshare UVCs can only fail in case of hypervisor misbehaviour (either a bug or malicious behaviour). In such cases there is no reasonable way forward, and the system needs to panic. This patch replaces the return at the end of the share() function with a panic, to guarantee system integrity. Fixes: 5abb9351dfd9 ("s390/uv: introduce guest side ultravisor code") Signed-off-by: Claudio Imbrenda Reviewed-by: Christian Borntraeger Reviewed-by: Steffen Eiden Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20240801112548.85303-1-imbrenda@linux.ibm.com Message-ID: <20240801112548.85303-1-imbrenda@linux.ibm.com> [frankja@linux.ibm.com: Fixed up patch subject] Signed-off-by: Janosch Frank --- arch/s390/include/asm/uv.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index 0b5f8f3e84f126..153d93468b77c0 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -441,7 +441,10 @@ static inline int share(unsigned long addr, u16 cmd) if (!uv_call(0, (u64)&uvcb)) return 0; - return -EINVAL; + pr_err("%s UVC failed (rc: 0x%x, rrc: 0x%x), possible hypervisor bug.\n", + uvcb.header.cmd == UVC_CMD_SET_SHARED_ACCESS ? "Share" : "Unshare", + uvcb.header.rc, uvcb.header.rrc); + panic("System security cannot be guaranteed unless the system panics now.\n"); } /* From e9408fa234fb2c0f087d718c7172212bb0dd7e6f Mon Sep 17 00:00:00 2001 From: Andrei Simion Date: Wed, 31 Jul 2024 17:41:00 +0300 Subject: [PATCH 106/991] MAINTAINERS: Update DTS path for ARM/Microchip (AT91) SoC Update the path to the supported DTS files for ARM/Microchip (AT91) SoC to ensure that the output of the get_maintainer.pl script includes the email addresses of the maintainers for all files located in arch/arm/boot/dts/microchip. Suggested-by: Conor Dooley Signed-off-by: Andrei Simion Reviewed-by: Cristian Birsan Link: https://lore.kernel.org/r/20240731144100.182221-1-andrei.simion@microchip.com Signed-off-by: Claudiu Beznea --- MAINTAINERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 42decde3832066..515323593f991e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2535,8 +2535,7 @@ L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Supported W: http://www.linux4sam.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/at91/linux.git -F: arch/arm/boot/dts/microchip/at91* -F: arch/arm/boot/dts/microchip/sama* +F: arch/arm/boot/dts/microchip/ F: arch/arm/include/debug/at91.S F: arch/arm/mach-at91/ F: drivers/memory/atmel* From f1cb9d5aefba07fc52b06b7bd5fdcd9ef91157b4 Mon Sep 17 00:00:00 2001 From: Bitterblue Smith Date: Tue, 6 Aug 2024 19:20:11 +0300 Subject: [PATCH 107/991] wifi: rtlwifi: rtl8192du: Initialise value32 in _rtl92du_init_queue_reserved_page GCC complains: In file included from include/linux/ieee80211.h:21, from include/net/mac80211.h:20, from drivers/net/wireless/realtek/rtlwifi/rtl8192du/../wifi.h:14, from drivers/net/wireless/realtek/rtlwifi/rtl8192du/hw.c:4: In function 'u32p_replace_bits', inlined from '_rtl92du_init_queue_reserved_page.isra' at drivers/net/wireless/realtek/rtlwifi/rtl8192du/hw.c:225:2: >> include/linux/bitfield.h:189:18: warning: 'value32' is used uninitialized [-Wuninitialized] Part of the variable is indeed left uninitialised. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202408062100.DWhN0CYH-lkp@intel.com/ Fixes: e769c67105d3 ("wifi: rtlwifi: Add rtl8192du/hw.{c,h}") Signed-off-by: Bitterblue Smith Acked-by: Ping-Ke Shih Signed-off-by: Kalle Valo Link: https://patch.msgid.link/2a808244-93d0-492c-b304-ae1974df5df9@gmail.com --- drivers/net/wireless/realtek/rtlwifi/rtl8192du/hw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192du/hw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192du/hw.c index 700c6e2bcad194..ff458fb8514dae 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8192du/hw.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192du/hw.c @@ -181,11 +181,11 @@ static void _rtl92du_init_queue_reserved_page(struct ieee80211_hw *hw, struct rtl_hal *rtlhal = rtl_hal(rtlpriv); u32 txqpagenum, txqpageunit; u32 txqremainingpage; + u32 value32 = 0; u32 numhq = 0; u32 numlq = 0; u32 numnq = 0; u32 numpubq; - u32 value32; if (rtlhal->macphymode != SINGLEMAC_SINGLEPHY) { numpubq = NORMAL_PAGE_NUM_PUBQ_92D_DUAL_MAC; From 568901e709d7fa564dfdc75816ea59fec65d20a0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 6 Aug 2024 12:07:50 -0700 Subject: [PATCH 108/991] tools/include: Sync uapi/asm-generic/unistd.h with the kernel sources And arch syscall tables to pick up changes from: b1e31c134a8a powerpc: restore some missing spu syscalls d3882564a77c syscalls: fix compat_sys_io_pgetevents_time64 usage 54233a425403 uretprobe: change syscall number, again 63ded110979b uprobe: Change uretprobe syscall scope and number 9142be9e6443 x86/syscall: Mark exit[_group] syscall handlers __noreturn 9aae1baa1c5d x86, arm: Add missing license tag to syscall tables files 5c28424e9a34 syscalls: Fix to add sys_uretprobe to syscall.tbl 190fec72df4a uprobe: Wire up uretprobe system call This should be used to beautify syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/asm-generic/unistd.h include/uapi/asm-generic/unistd.h diff -u tools/perf/arch/x86/entry/syscalls/syscall_64.tbl arch/x86/entry/syscalls/syscall_64.tbl diff -u tools/perf/arch/powerpc/entry/syscalls/syscall.tbl arch/powerpc/kernel/syscalls/syscall.tbl diff -u tools/perf/arch/s390/entry/syscalls/syscall.tbl arch/s390/kernel/syscalls/syscall.tbl Please see tools/include/uapi/README for details (it's in the first patch of this series). Cc: Arnd Bergmann Cc: linux-arch@vger.kernel.org Signed-off-by: Namhyung Kim --- tools/include/uapi/asm-generic/unistd.h | 2 +- tools/perf/arch/powerpc/entry/syscalls/syscall.tbl | 6 +++++- tools/perf/arch/s390/entry/syscalls/syscall.tbl | 2 +- tools/perf/arch/x86/entry/syscalls/syscall_64.tbl | 8 +++++--- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index a00d53d027235a..5bf6148cac2b93 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -737,7 +737,7 @@ __SC_COMP(__NR_pselect6_time64, sys_pselect6, compat_sys_pselect6_time64) #define __NR_ppoll_time64 414 __SC_COMP(__NR_ppoll_time64, sys_ppoll, compat_sys_ppoll_time64) #define __NR_io_pgetevents_time64 416 -__SYSCALL(__NR_io_pgetevents_time64, sys_io_pgetevents) +__SC_COMP(__NR_io_pgetevents_time64, sys_io_pgetevents, compat_sys_io_pgetevents_time64) #define __NR_recvmmsg_time64 417 __SC_COMP(__NR_recvmmsg_time64, sys_recvmmsg, compat_sys_recvmmsg_time64) #define __NR_mq_timedsend_time64 418 diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index 3656f1ca7a21c6..ebae8415dfbbab 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -230,8 +230,10 @@ 178 nospu rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend 179 32 pread64 sys_ppc_pread64 compat_sys_ppc_pread64 179 64 pread64 sys_pread64 +179 spu pread64 sys_pread64 180 32 pwrite64 sys_ppc_pwrite64 compat_sys_ppc_pwrite64 180 64 pwrite64 sys_pwrite64 +180 spu pwrite64 sys_pwrite64 181 common chown sys_chown 182 common getcwd sys_getcwd 183 common capget sys_capget @@ -246,6 +248,7 @@ 190 common ugetrlimit sys_getrlimit compat_sys_getrlimit 191 32 readahead sys_ppc_readahead compat_sys_ppc_readahead 191 64 readahead sys_readahead +191 spu readahead sys_readahead 192 32 mmap2 sys_mmap2 compat_sys_mmap2 193 32 truncate64 sys_ppc_truncate64 compat_sys_ppc_truncate64 194 32 ftruncate64 sys_ppc_ftruncate64 compat_sys_ppc_ftruncate64 @@ -293,6 +296,7 @@ 232 nospu set_tid_address sys_set_tid_address 233 32 fadvise64 sys_ppc32_fadvise64 compat_sys_ppc32_fadvise64 233 64 fadvise64 sys_fadvise64 +233 spu fadvise64 sys_fadvise64 234 nospu exit_group sys_exit_group 235 nospu lookup_dcookie sys_ni_syscall 236 common epoll_create sys_epoll_create @@ -502,7 +506,7 @@ 412 32 utimensat_time64 sys_utimensat sys_utimensat 413 32 pselect6_time64 sys_pselect6 compat_sys_pselect6_time64 414 32 ppoll_time64 sys_ppoll compat_sys_ppoll_time64 -416 32 io_pgetevents_time64 sys_io_pgetevents sys_io_pgetevents +416 32 io_pgetevents_time64 sys_io_pgetevents compat_sys_io_pgetevents_time64 417 32 recvmmsg_time64 sys_recvmmsg compat_sys_recvmmsg_time64 418 32 mq_timedsend_time64 sys_mq_timedsend sys_mq_timedsend 419 32 mq_timedreceive_time64 sys_mq_timedreceive sys_mq_timedreceive diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl index bd0fee24ad10a3..01071182763e96 100644 --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -418,7 +418,7 @@ 412 32 utimensat_time64 - sys_utimensat 413 32 pselect6_time64 - compat_sys_pselect6_time64 414 32 ppoll_time64 - compat_sys_ppoll_time64 -416 32 io_pgetevents_time64 - sys_io_pgetevents +416 32 io_pgetevents_time64 - compat_sys_io_pgetevents_time64 417 32 recvmmsg_time64 - compat_sys_recvmmsg_time64 418 32 mq_timedsend_time64 - sys_mq_timedsend 419 32 mq_timedreceive_time64 - sys_mq_timedreceive diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index a396f6e6ab5bf9..7093ee21c0d1c0 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -1,8 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note # # 64-bit system call numbers and entry vectors # # The format is: -# +# [ [noreturn]] # # The __x64_sys_*() stubs are created on-the-fly for sys_*() system calls # @@ -68,7 +69,7 @@ 57 common fork sys_fork 58 common vfork sys_vfork 59 64 execve sys_execve -60 common exit sys_exit +60 common exit sys_exit - noreturn 61 common wait4 sys_wait4 62 common kill sys_kill 63 common uname sys_newuname @@ -239,7 +240,7 @@ 228 common clock_gettime sys_clock_gettime 229 common clock_getres sys_clock_getres 230 common clock_nanosleep sys_clock_nanosleep -231 common exit_group sys_exit_group +231 common exit_group sys_exit_group - noreturn 232 common epoll_wait sys_epoll_wait 233 common epoll_ctl sys_epoll_ctl 234 common tgkill sys_tgkill @@ -343,6 +344,7 @@ 332 common statx sys_statx 333 common io_pgetevents sys_io_pgetevents 334 common rseq sys_rseq +335 common uretprobe sys_uretprobe # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal sys_pidfd_send_signal From ed86525f1f4b738bae75c73e89f25430bd0af1b0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 6 Aug 2024 12:07:50 -0700 Subject: [PATCH 109/991] tools/include: Sync network socket headers with the kernel sources To pick up changes from: d25a92ccae6b net/smc: Introduce IPPROTO_SMC 060f4ba6e403 io_uring/net: move charging socket out of zc io_uring bb6aaf736680 net: Split a __sys_listen helper for io_uring dc2e77979412 net: Split a __sys_bind helper for io_uring This should be used to beautify socket syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/in.h include/uapi/linux/in.h diff -u tools/perf/trace/beauty/include/linux/socket.h include/linux/socket.h Please see tools/include/uapi/README for details (it's in the first patch of this series). Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: netdev@vger.kernel.org Signed-off-by: Namhyung Kim --- tools/include/uapi/linux/in.h | 2 ++ tools/perf/trace/beauty/include/linux/socket.h | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/in.h b/tools/include/uapi/linux/in.h index e682ab628dfa66..d358add1611cd1 100644 --- a/tools/include/uapi/linux/in.h +++ b/tools/include/uapi/linux/in.h @@ -81,6 +81,8 @@ enum { #define IPPROTO_ETHERNET IPPROTO_ETHERNET IPPROTO_RAW = 255, /* Raw IP packets */ #define IPPROTO_RAW IPPROTO_RAW + IPPROTO_SMC = 256, /* Shared Memory Communications */ +#define IPPROTO_SMC IPPROTO_SMC IPPROTO_MPTCP = 262, /* Multipath TCP connection */ #define IPPROTO_MPTCP IPPROTO_MPTCP IPPROTO_MAX diff --git a/tools/perf/trace/beauty/include/linux/socket.h b/tools/perf/trace/beauty/include/linux/socket.h index 89d16b90370bd4..df9cdb8bbfb884 100644 --- a/tools/perf/trace/beauty/include/linux/socket.h +++ b/tools/perf/trace/beauty/include/linux/socket.h @@ -76,7 +76,7 @@ struct msghdr { __kernel_size_t msg_controllen; /* ancillary data buffer length */ struct kiocb *msg_iocb; /* ptr to iocb for async requests */ struct ubuf_info *msg_ubuf; - int (*sg_from_iter)(struct sock *sk, struct sk_buff *skb, + int (*sg_from_iter)(struct sk_buff *skb, struct iov_iter *from, size_t length); }; @@ -442,11 +442,14 @@ extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, extern int __sys_socket(int family, int type, int protocol); extern struct file *__sys_socket_file(int family, int type, int protocol); extern int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen); +extern int __sys_bind_socket(struct socket *sock, struct sockaddr_storage *address, + int addrlen); extern int __sys_connect_file(struct file *file, struct sockaddr_storage *addr, int addrlen, int file_flags); extern int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen); extern int __sys_listen(int fd, int backlog); +extern int __sys_listen_socket(struct socket *sock, int backlog); extern int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, int __user *usockaddr_len); extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, From 845295f4004c7e1591bab4bad01b51f37d32272f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 6 Aug 2024 12:07:50 -0700 Subject: [PATCH 110/991] tools/include: Sync filesystem headers with the kernel sources To pick up changes from: 0f9ca80fa4f9 fs: Add initial atomic write support info to statx f9af549d1fd3 fs: export mount options via statmount() 0a3deb11858a fs: Allow listmount() in foreign mount namespace 09b31295f833 fs: export the mount ns id via statmount d04bccd8c19d listmount: allow listing in reverse order bfc69fd05ef9 fs/procfs: add build ID fetching to PROCMAP_QUERY API ed5d583a88a9 fs/procfs: implement efficient VMA querying API for /proc//maps This should be used to beautify FS syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/stat.h include/uapi/linux/stat.h diff -u tools/perf/trace/beauty/include/uapi/linux/fs.h include/uapi/linux/fs.h diff -u tools/perf/trace/beauty/include/uapi/linux/mount.h include/uapi/linux/mount.h diff -u tools/perf/trace/beauty/include/uapi/linux/stat.h include/uapi/linux/stat.h Please see tools/include/uapi/README for details (it's in the first patch of this series). Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Namhyung Kim --- tools/include/uapi/linux/stat.h | 12 +- .../perf/trace/beauty/include/uapi/linux/fs.h | 163 +++++++++++++++++- .../trace/beauty/include/uapi/linux/mount.h | 10 +- .../trace/beauty/include/uapi/linux/stat.h | 12 +- 4 files changed, 189 insertions(+), 8 deletions(-) diff --git a/tools/include/uapi/linux/stat.h b/tools/include/uapi/linux/stat.h index 67626d53531664..887a2528644168 100644 --- a/tools/include/uapi/linux/stat.h +++ b/tools/include/uapi/linux/stat.h @@ -126,9 +126,15 @@ struct statx { __u64 stx_mnt_id; __u32 stx_dio_mem_align; /* Memory buffer alignment for direct I/O */ __u32 stx_dio_offset_align; /* File offset alignment for direct I/O */ - __u64 stx_subvol; /* Subvolume identifier */ /* 0xa0 */ - __u64 __spare3[11]; /* Spare space for future expansion */ + __u64 stx_subvol; /* Subvolume identifier */ + __u32 stx_atomic_write_unit_min; /* Min atomic write unit in bytes */ + __u32 stx_atomic_write_unit_max; /* Max atomic write unit in bytes */ + /* 0xb0 */ + __u32 stx_atomic_write_segments_max; /* Max atomic write segment count */ + __u32 __spare1[1]; + /* 0xb8 */ + __u64 __spare3[9]; /* Spare space for future expansion */ /* 0x100 */ }; @@ -157,6 +163,7 @@ struct statx { #define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */ #define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */ #define STATX_SUBVOL 0x00008000U /* Want/got stx_subvol */ +#define STATX_WRITE_ATOMIC 0x00010000U /* Want/got atomic_write_* fields */ #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ @@ -192,6 +199,7 @@ struct statx { #define STATX_ATTR_MOUNT_ROOT 0x00002000 /* Root of a mount */ #define STATX_ATTR_VERITY 0x00100000 /* [I] Verity protected file */ #define STATX_ATTR_DAX 0x00200000 /* File is currently in DAX state */ +#define STATX_ATTR_WRITE_ATOMIC 0x00400000 /* File supports atomic write operations */ #endif /* _UAPI_LINUX_STAT_H */ diff --git a/tools/perf/trace/beauty/include/uapi/linux/fs.h b/tools/perf/trace/beauty/include/uapi/linux/fs.h index 45e4e64fd6643c..7539717707337a 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/fs.h +++ b/tools/perf/trace/beauty/include/uapi/linux/fs.h @@ -329,12 +329,17 @@ typedef int __bitwise __kernel_rwf_t; /* per-IO negation of O_APPEND */ #define RWF_NOAPPEND ((__force __kernel_rwf_t)0x00000020) +/* Atomic Write */ +#define RWF_ATOMIC ((__force __kernel_rwf_t)0x00000040) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND | RWF_NOAPPEND) + RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC) + +#define PROCFS_IOCTL_MAGIC 'f' /* Pagemap ioctl */ -#define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) +#define PAGEMAP_SCAN _IOWR(PROCFS_IOCTL_MAGIC, 16, struct pm_scan_arg) /* Bitmasks provided in pm_scan_args masks and reported in page_region.categories. */ #define PAGE_IS_WPALLOWED (1 << 0) @@ -393,4 +398,158 @@ struct pm_scan_arg { __u64 return_mask; }; +/* /proc//maps ioctl */ +#define PROCMAP_QUERY _IOWR(PROCFS_IOCTL_MAGIC, 17, struct procmap_query) + +enum procmap_query_flags { + /* + * VMA permission flags. + * + * Can be used as part of procmap_query.query_flags field to look up + * only VMAs satisfying specified subset of permissions. E.g., specifying + * PROCMAP_QUERY_VMA_READABLE only will return both readable and read/write VMAs, + * while having PROCMAP_QUERY_VMA_READABLE | PROCMAP_QUERY_VMA_WRITABLE will only + * return read/write VMAs, though both executable/non-executable and + * private/shared will be ignored. + * + * PROCMAP_QUERY_VMA_* flags are also returned in procmap_query.vma_flags + * field to specify actual VMA permissions. + */ + PROCMAP_QUERY_VMA_READABLE = 0x01, + PROCMAP_QUERY_VMA_WRITABLE = 0x02, + PROCMAP_QUERY_VMA_EXECUTABLE = 0x04, + PROCMAP_QUERY_VMA_SHARED = 0x08, + /* + * Query modifier flags. + * + * By default VMA that covers provided address is returned, or -ENOENT + * is returned. With PROCMAP_QUERY_COVERING_OR_NEXT_VMA flag set, closest + * VMA with vma_start > addr will be returned if no covering VMA is + * found. + * + * PROCMAP_QUERY_FILE_BACKED_VMA instructs query to consider only VMAs that + * have file backing. Can be combined with PROCMAP_QUERY_COVERING_OR_NEXT_VMA + * to iterate all VMAs with file backing. + */ + PROCMAP_QUERY_COVERING_OR_NEXT_VMA = 0x10, + PROCMAP_QUERY_FILE_BACKED_VMA = 0x20, +}; + +/* + * Input/output argument structured passed into ioctl() call. It can be used + * to query a set of VMAs (Virtual Memory Areas) of a process. + * + * Each field can be one of three kinds, marked in a short comment to the + * right of the field: + * - "in", input argument, user has to provide this value, kernel doesn't modify it; + * - "out", output argument, kernel sets this field with VMA data; + * - "in/out", input and output argument; user provides initial value (used + * to specify maximum allowable buffer size), and kernel sets it to actual + * amount of data written (or zero, if there is no data). + * + * If matching VMA is found (according to criterias specified by + * query_addr/query_flags, all the out fields are filled out, and ioctl() + * returns 0. If there is no matching VMA, -ENOENT will be returned. + * In case of any other error, negative error code other than -ENOENT is + * returned. + * + * Most of the data is similar to the one returned as text in /proc//maps + * file, but procmap_query provides more querying flexibility. There are no + * consistency guarantees between subsequent ioctl() calls, but data returned + * for matched VMA is self-consistent. + */ +struct procmap_query { + /* Query struct size, for backwards/forward compatibility */ + __u64 size; + /* + * Query flags, a combination of enum procmap_query_flags values. + * Defines query filtering and behavior, see enum procmap_query_flags. + * + * Input argument, provided by user. Kernel doesn't modify it. + */ + __u64 query_flags; /* in */ + /* + * Query address. By default, VMA that covers this address will + * be looked up. PROCMAP_QUERY_* flags above modify this default + * behavior further. + * + * Input argument, provided by user. Kernel doesn't modify it. + */ + __u64 query_addr; /* in */ + /* VMA starting (inclusive) and ending (exclusive) address, if VMA is found. */ + __u64 vma_start; /* out */ + __u64 vma_end; /* out */ + /* VMA permissions flags. A combination of PROCMAP_QUERY_VMA_* flags. */ + __u64 vma_flags; /* out */ + /* VMA backing page size granularity. */ + __u64 vma_page_size; /* out */ + /* + * VMA file offset. If VMA has file backing, this specifies offset + * within the file that VMA's start address corresponds to. + * Is set to zero if VMA has no backing file. + */ + __u64 vma_offset; /* out */ + /* Backing file's inode number, or zero, if VMA has no backing file. */ + __u64 inode; /* out */ + /* Backing file's device major/minor number, or zero, if VMA has no backing file. */ + __u32 dev_major; /* out */ + __u32 dev_minor; /* out */ + /* + * If set to non-zero value, signals the request to return VMA name + * (i.e., VMA's backing file's absolute path, with " (deleted)" suffix + * appended, if file was unlinked from FS) for matched VMA. VMA name + * can also be some special name (e.g., "[heap]", "[stack]") or could + * be even user-supplied with prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME). + * + * Kernel will set this field to zero, if VMA has no associated name. + * Otherwise kernel will return actual amount of bytes filled in + * user-supplied buffer (see vma_name_addr field below), including the + * terminating zero. + * + * If VMA name is longer that user-supplied maximum buffer size, + * -E2BIG error is returned. + * + * If this field is set to non-zero value, vma_name_addr should point + * to valid user space memory buffer of at least vma_name_size bytes. + * If set to zero, vma_name_addr should be set to zero as well + */ + __u32 vma_name_size; /* in/out */ + /* + * If set to non-zero value, signals the request to extract and return + * VMA's backing file's build ID, if the backing file is an ELF file + * and it contains embedded build ID. + * + * Kernel will set this field to zero, if VMA has no backing file, + * backing file is not an ELF file, or ELF file has no build ID + * embedded. + * + * Build ID is a binary value (not a string). Kernel will set + * build_id_size field to exact number of bytes used for build ID. + * If build ID is requested and present, but needs more bytes than + * user-supplied maximum buffer size (see build_id_addr field below), + * -E2BIG error will be returned. + * + * If this field is set to non-zero value, build_id_addr should point + * to valid user space memory buffer of at least build_id_size bytes. + * If set to zero, build_id_addr should be set to zero as well + */ + __u32 build_id_size; /* in/out */ + /* + * User-supplied address of a buffer of at least vma_name_size bytes + * for kernel to fill with matched VMA's name (see vma_name_size field + * description above for details). + * + * Should be set to zero if VMA name should not be returned. + */ + __u64 vma_name_addr; /* in */ + /* + * User-supplied address of a buffer of at least build_id_size bytes + * for kernel to fill with matched VMA's ELF build ID, if available + * (see build_id_size field description above for details). + * + * Should be set to zero if build ID should not be returned. + */ + __u64 build_id_addr; /* in */ +}; + #endif /* _UAPI_LINUX_FS_H */ diff --git a/tools/perf/trace/beauty/include/uapi/linux/mount.h b/tools/perf/trace/beauty/include/uapi/linux/mount.h index ad5478dbad0073..225bc366ffcbf0 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/mount.h +++ b/tools/perf/trace/beauty/include/uapi/linux/mount.h @@ -154,7 +154,7 @@ struct mount_attr { */ struct statmount { __u32 size; /* Total size, including strings */ - __u32 __spare1; + __u32 mnt_opts; /* [str] Mount options of the mount */ __u64 mask; /* What results were written */ __u32 sb_dev_major; /* Device ID */ __u32 sb_dev_minor; @@ -172,7 +172,8 @@ struct statmount { __u64 propagate_from; /* Propagation from in current namespace */ __u32 mnt_root; /* [str] Root of mount relative to root of fs */ __u32 mnt_point; /* [str] Mountpoint relative to current root */ - __u64 __spare2[50]; + __u64 mnt_ns_id; /* ID of the mount namespace */ + __u64 __spare2[49]; char str[]; /* Variable size part containing strings */ }; @@ -188,10 +189,12 @@ struct mnt_id_req { __u32 spare; __u64 mnt_id; __u64 param; + __u64 mnt_ns_id; }; /* List of all mnt_id_req versions. */ #define MNT_ID_REQ_SIZE_VER0 24 /* sizeof first published struct */ +#define MNT_ID_REQ_SIZE_VER1 32 /* sizeof second published struct */ /* * @mask bits for statmount(2) @@ -202,10 +205,13 @@ struct mnt_id_req { #define STATMOUNT_MNT_ROOT 0x00000008U /* Want/got mnt_root */ #define STATMOUNT_MNT_POINT 0x00000010U /* Want/got mnt_point */ #define STATMOUNT_FS_TYPE 0x00000020U /* Want/got fs_type */ +#define STATMOUNT_MNT_NS_ID 0x00000040U /* Want/got mnt_ns_id */ +#define STATMOUNT_MNT_OPTS 0x00000080U /* Want/got mnt_opts */ /* * Special @mnt_id values that can be passed to listmount */ #define LSMT_ROOT 0xffffffffffffffff /* root mount */ +#define LISTMOUNT_REVERSE (1 << 0) /* List later mounts first */ #endif /* _UAPI_LINUX_MOUNT_H */ diff --git a/tools/perf/trace/beauty/include/uapi/linux/stat.h b/tools/perf/trace/beauty/include/uapi/linux/stat.h index 67626d53531664..887a2528644168 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/stat.h +++ b/tools/perf/trace/beauty/include/uapi/linux/stat.h @@ -126,9 +126,15 @@ struct statx { __u64 stx_mnt_id; __u32 stx_dio_mem_align; /* Memory buffer alignment for direct I/O */ __u32 stx_dio_offset_align; /* File offset alignment for direct I/O */ - __u64 stx_subvol; /* Subvolume identifier */ /* 0xa0 */ - __u64 __spare3[11]; /* Spare space for future expansion */ + __u64 stx_subvol; /* Subvolume identifier */ + __u32 stx_atomic_write_unit_min; /* Min atomic write unit in bytes */ + __u32 stx_atomic_write_unit_max; /* Max atomic write unit in bytes */ + /* 0xb0 */ + __u32 stx_atomic_write_segments_max; /* Max atomic write segment count */ + __u32 __spare1[1]; + /* 0xb8 */ + __u64 __spare3[9]; /* Spare space for future expansion */ /* 0x100 */ }; @@ -157,6 +163,7 @@ struct statx { #define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */ #define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */ #define STATX_SUBVOL 0x00008000U /* Want/got stx_subvol */ +#define STATX_WRITE_ATOMIC 0x00010000U /* Want/got atomic_write_* fields */ #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ @@ -192,6 +199,7 @@ struct statx { #define STATX_ATTR_MOUNT_ROOT 0x00002000 /* Root of a mount */ #define STATX_ATTR_VERITY 0x00100000 /* [I] Verity protected file */ #define STATX_ATTR_DAX 0x00200000 /* File is currently in DAX state */ +#define STATX_ATTR_WRITE_ATOMIC 0x00400000 /* File supports atomic write operations */ #endif /* _UAPI_LINUX_STAT_H */ From f6d9883f8e680460be4714d4d35c7acac1dffeaf Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 6 Aug 2024 12:07:50 -0700 Subject: [PATCH 111/991] tools/include: Sync x86 headers with the kernel sources To pick up changes from: 149fd4712bcd perf/x86/intel: Support Perfmon MSRs aliasing 21b362cc762a x86/resctrl: Enable shared RMID mode on Sub-NUMA Cluster (SNC) systems 4f460bff7b6a cpufreq: acpi: move MSR_K7_HWCR_CPB_DIS_BIT into msr-index.h 7ea81936b853 x86/cpufeatures: Add HWP highest perf change feature flag 78ce84b9e0a5 x86/cpufeatures: Flip the /proc/cpuinfo appearance logic 1beb348d5c7f x86/sev: Provide SVSM discovery support This should be used to beautify x86 syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/arch/x86/include/asm/cpufeatures.h arch/x86/include/asm/cpufeatures.h diff -u tools/arch/x86/include/asm/msr-index.h arch/x86/include/asm/msr-index.h Please see tools/include/uapi/README for details (it's in the first patch of this series). Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: x86@kernel.org Signed-off-by: Namhyung Kim --- tools/arch/x86/include/asm/cpufeatures.h | 803 ++++++++++++----------- tools/arch/x86/include/asm/msr-index.h | 11 + 2 files changed, 414 insertions(+), 400 deletions(-) diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 3c7434329661c6..dd4682857c1208 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -18,170 +18,170 @@ /* * Note: If the comment begins with a quoted string, that string is used - * in /proc/cpuinfo instead of the macro name. If the string is "", - * this feature bit is not displayed in /proc/cpuinfo at all. + * in /proc/cpuinfo instead of the macro name. Otherwise, this feature + * bit is not displayed in /proc/cpuinfo at all. * * When adding new features here that depend on other features, * please update the table in kernel/cpu/cpuid-deps.c as well. */ /* Intel-defined CPU features, CPUID level 0x00000001 (EDX), word 0 */ -#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ -#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ -#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ -#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ -#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ -#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ -#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ -#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ -#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ -#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ -#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ -#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ -#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ -#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ -#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */ -#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ -#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ -#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ -#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ +#define X86_FEATURE_FPU ( 0*32+ 0) /* "fpu" Onboard FPU */ +#define X86_FEATURE_VME ( 0*32+ 1) /* "vme" Virtual Mode Extensions */ +#define X86_FEATURE_DE ( 0*32+ 2) /* "de" Debugging Extensions */ +#define X86_FEATURE_PSE ( 0*32+ 3) /* "pse" Page Size Extensions */ +#define X86_FEATURE_TSC ( 0*32+ 4) /* "tsc" Time Stamp Counter */ +#define X86_FEATURE_MSR ( 0*32+ 5) /* "msr" Model-Specific Registers */ +#define X86_FEATURE_PAE ( 0*32+ 6) /* "pae" Physical Address Extensions */ +#define X86_FEATURE_MCE ( 0*32+ 7) /* "mce" Machine Check Exception */ +#define X86_FEATURE_CX8 ( 0*32+ 8) /* "cx8" CMPXCHG8 instruction */ +#define X86_FEATURE_APIC ( 0*32+ 9) /* "apic" Onboard APIC */ +#define X86_FEATURE_SEP ( 0*32+11) /* "sep" SYSENTER/SYSEXIT */ +#define X86_FEATURE_MTRR ( 0*32+12) /* "mtrr" Memory Type Range Registers */ +#define X86_FEATURE_PGE ( 0*32+13) /* "pge" Page Global Enable */ +#define X86_FEATURE_MCA ( 0*32+14) /* "mca" Machine Check Architecture */ +#define X86_FEATURE_CMOV ( 0*32+15) /* "cmov" CMOV instructions (plus FCMOVcc, FCOMI with FPU) */ +#define X86_FEATURE_PAT ( 0*32+16) /* "pat" Page Attribute Table */ +#define X86_FEATURE_PSE36 ( 0*32+17) /* "pse36" 36-bit PSEs */ +#define X86_FEATURE_PN ( 0*32+18) /* "pn" Processor serial number */ +#define X86_FEATURE_CLFLUSH ( 0*32+19) /* "clflush" CLFLUSH instruction */ #define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ -#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ -#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ -#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ +#define X86_FEATURE_ACPI ( 0*32+22) /* "acpi" ACPI via MSR */ +#define X86_FEATURE_MMX ( 0*32+23) /* "mmx" Multimedia Extensions */ +#define X86_FEATURE_FXSR ( 0*32+24) /* "fxsr" FXSAVE/FXRSTOR, CR4.OSFXSR */ #define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ #define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ #define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ -#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ +#define X86_FEATURE_HT ( 0*32+28) /* "ht" Hyper-Threading */ #define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ -#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ -#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */ +#define X86_FEATURE_IA64 ( 0*32+30) /* "ia64" IA-64 processor */ +#define X86_FEATURE_PBE ( 0*32+31) /* "pbe" Pending Break Enable */ /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ /* Don't duplicate feature flags which are redundant with Intel! */ -#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ -#define X86_FEATURE_MP ( 1*32+19) /* MP Capable */ -#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ -#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ -#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ +#define X86_FEATURE_SYSCALL ( 1*32+11) /* "syscall" SYSCALL/SYSRET */ +#define X86_FEATURE_MP ( 1*32+19) /* "mp" MP Capable */ +#define X86_FEATURE_NX ( 1*32+20) /* "nx" Execute Disable */ +#define X86_FEATURE_MMXEXT ( 1*32+22) /* "mmxext" AMD MMX extensions */ +#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* "fxsr_opt" FXSAVE/FXRSTOR optimizations */ #define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ -#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ -#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64, 64-bit support) */ -#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow extensions */ -#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow */ +#define X86_FEATURE_RDTSCP ( 1*32+27) /* "rdtscp" RDTSCP */ +#define X86_FEATURE_LM ( 1*32+29) /* "lm" Long Mode (x86-64, 64-bit support) */ +#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* "3dnowext" AMD 3DNow extensions */ +#define X86_FEATURE_3DNOW ( 1*32+31) /* "3dnow" 3DNow */ /* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ -#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ -#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ -#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ +#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* "recovery" CPU in recovery mode */ +#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* "longrun" Longrun power control */ +#define X86_FEATURE_LRTI ( 2*32+ 3) /* "lrti" LongRun table interface */ /* Other features, Linux-defined mapping, word 3 */ /* This range is used for feature bits which conflict or are synthesized */ -#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ -#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ -#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ -#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ -#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ -#define X86_FEATURE_ZEN5 ( 3*32+ 5) /* "" CPU based on Zen5 microarchitecture */ -#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ -#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ -#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ -#define X86_FEATURE_UP ( 3*32+ 9) /* SMP kernel running on UP */ -#define X86_FEATURE_ART ( 3*32+10) /* Always running timer (ART) */ -#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ -#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ -#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ -#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */ -#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ -#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ -#define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* AMD Last Branch Record Extension Version 2 */ -#define X86_FEATURE_CLEAR_CPU_BUF ( 3*32+18) /* "" Clear CPU buffers using VERW */ -#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ -#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ -#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ -#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* CPU topology enum extensions */ -#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ -#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ -#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */ -#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* Extended APICID (8 bits) */ -#define X86_FEATURE_AMD_DCM ( 3*32+27) /* AMD multi-node processor */ -#define X86_FEATURE_APERFMPERF ( 3*32+28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */ -#define X86_FEATURE_RAPL ( 3*32+29) /* AMD/Hygon RAPL interface */ -#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ -#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ +#define X86_FEATURE_CXMMX ( 3*32+ 0) /* "cxmmx" Cyrix MMX extensions */ +#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* "k6_mtrr" AMD K6 nonstandard MTRRs */ +#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* "cyrix_arr" Cyrix ARRs (= MTRRs) */ +#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* "centaur_mcr" Centaur MCRs (= MTRRs) */ +#define X86_FEATURE_K8 ( 3*32+ 4) /* Opteron, Athlon64 */ +#define X86_FEATURE_ZEN5 ( 3*32+ 5) /* CPU based on Zen5 microarchitecture */ +#define X86_FEATURE_P3 ( 3*32+ 6) /* P3 */ +#define X86_FEATURE_P4 ( 3*32+ 7) /* P4 */ +#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* "constant_tsc" TSC ticks at a constant rate */ +#define X86_FEATURE_UP ( 3*32+ 9) /* "up" SMP kernel running on UP */ +#define X86_FEATURE_ART ( 3*32+10) /* "art" Always running timer (ART) */ +#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* "arch_perfmon" Intel Architectural PerfMon */ +#define X86_FEATURE_PEBS ( 3*32+12) /* "pebs" Precise-Event Based Sampling */ +#define X86_FEATURE_BTS ( 3*32+13) /* "bts" Branch Trace Store */ +#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* syscall in IA32 userspace */ +#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* sysenter in IA32 userspace */ +#define X86_FEATURE_REP_GOOD ( 3*32+16) /* "rep_good" REP microcode works well */ +#define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* "amd_lbr_v2" AMD Last Branch Record Extension Version 2 */ +#define X86_FEATURE_CLEAR_CPU_BUF ( 3*32+18) /* Clear CPU buffers using VERW */ +#define X86_FEATURE_ACC_POWER ( 3*32+19) /* "acc_power" AMD Accumulated Power Mechanism */ +#define X86_FEATURE_NOPL ( 3*32+20) /* "nopl" The NOPL (0F 1F) instructions */ +#define X86_FEATURE_ALWAYS ( 3*32+21) /* Always-present feature */ +#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* "xtopology" CPU topology enum extensions */ +#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* "tsc_reliable" TSC is known to be reliable */ +#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* "nonstop_tsc" TSC does not stop in C states */ +#define X86_FEATURE_CPUID ( 3*32+25) /* "cpuid" CPU has CPUID instruction itself */ +#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* "extd_apicid" Extended APICID (8 bits) */ +#define X86_FEATURE_AMD_DCM ( 3*32+27) /* "amd_dcm" AMD multi-node processor */ +#define X86_FEATURE_APERFMPERF ( 3*32+28) /* "aperfmperf" P-State hardware coordination feedback capability (APERF/MPERF MSRs) */ +#define X86_FEATURE_RAPL ( 3*32+29) /* "rapl" AMD/Hygon RAPL interface */ +#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* "nonstop_tsc_s3" TSC doesn't stop in S3 state */ +#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* "tsc_known_freq" TSC has known frequency */ /* Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 */ #define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ -#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ -#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ +#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* "pclmulqdq" PCLMULQDQ instruction */ +#define X86_FEATURE_DTES64 ( 4*32+ 2) /* "dtes64" 64-bit Debug Store */ #define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" MONITOR/MWAIT support */ #define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL-qualified (filtered) Debug Store */ -#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ -#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer Mode eXtensions */ -#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ -#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ -#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ -#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ -#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ -#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ -#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B instruction */ -#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ -#define X86_FEATURE_PDCM ( 4*32+15) /* Perf/Debug Capabilities MSR */ -#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ -#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ +#define X86_FEATURE_VMX ( 4*32+ 5) /* "vmx" Hardware virtualization */ +#define X86_FEATURE_SMX ( 4*32+ 6) /* "smx" Safer Mode eXtensions */ +#define X86_FEATURE_EST ( 4*32+ 7) /* "est" Enhanced SpeedStep */ +#define X86_FEATURE_TM2 ( 4*32+ 8) /* "tm2" Thermal Monitor 2 */ +#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* "ssse3" Supplemental SSE-3 */ +#define X86_FEATURE_CID ( 4*32+10) /* "cid" Context ID */ +#define X86_FEATURE_SDBG ( 4*32+11) /* "sdbg" Silicon Debug */ +#define X86_FEATURE_FMA ( 4*32+12) /* "fma" Fused multiply-add */ +#define X86_FEATURE_CX16 ( 4*32+13) /* "cx16" CMPXCHG16B instruction */ +#define X86_FEATURE_XTPR ( 4*32+14) /* "xtpr" Send Task Priority Messages */ +#define X86_FEATURE_PDCM ( 4*32+15) /* "pdcm" Perf/Debug Capabilities MSR */ +#define X86_FEATURE_PCID ( 4*32+17) /* "pcid" Process Context Identifiers */ +#define X86_FEATURE_DCA ( 4*32+18) /* "dca" Direct Cache Access */ #define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ #define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ -#define X86_FEATURE_X2APIC ( 4*32+21) /* X2APIC */ -#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ -#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ -#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* TSC deadline timer */ -#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ -#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV instructions */ -#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE instruction enabled in the OS */ -#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ -#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit FP conversions */ -#define X86_FEATURE_RDRAND ( 4*32+30) /* RDRAND instruction */ -#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ +#define X86_FEATURE_X2APIC ( 4*32+21) /* "x2apic" X2APIC */ +#define X86_FEATURE_MOVBE ( 4*32+22) /* "movbe" MOVBE instruction */ +#define X86_FEATURE_POPCNT ( 4*32+23) /* "popcnt" POPCNT instruction */ +#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* "tsc_deadline_timer" TSC deadline timer */ +#define X86_FEATURE_AES ( 4*32+25) /* "aes" AES instructions */ +#define X86_FEATURE_XSAVE ( 4*32+26) /* "xsave" XSAVE/XRSTOR/XSETBV/XGETBV instructions */ +#define X86_FEATURE_OSXSAVE ( 4*32+27) /* XSAVE instruction enabled in the OS */ +#define X86_FEATURE_AVX ( 4*32+28) /* "avx" Advanced Vector Extensions */ +#define X86_FEATURE_F16C ( 4*32+29) /* "f16c" 16-bit FP conversions */ +#define X86_FEATURE_RDRAND ( 4*32+30) /* "rdrand" RDRAND instruction */ +#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* "hypervisor" Running on a hypervisor */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ #define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ #define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ #define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ #define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ -#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ -#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ -#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ -#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ -#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ -#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ +#define X86_FEATURE_ACE2 ( 5*32+ 8) /* "ace2" Advanced Cryptography Engine v2 */ +#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* "ace2_en" ACE v2 enabled */ +#define X86_FEATURE_PHE ( 5*32+10) /* "phe" PadLock Hash Engine */ +#define X86_FEATURE_PHE_EN ( 5*32+11) /* "phe_en" PHE enabled */ +#define X86_FEATURE_PMM ( 5*32+12) /* "pmm" PadLock Montgomery Multiplier */ +#define X86_FEATURE_PMM_EN ( 5*32+13) /* "pmm_en" PMM enabled */ /* More extended AMD flags: CPUID level 0x80000001, ECX, word 6 */ -#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ -#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ -#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure Virtual Machine */ -#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ -#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ -#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ -#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ -#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ -#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ -#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ -#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ -#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ -#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ -#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ -#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ -#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ -#define X86_FEATURE_TCE ( 6*32+17) /* Translation Cache Extension */ -#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ -#define X86_FEATURE_TBM ( 6*32+21) /* Trailing Bit Manipulations */ -#define X86_FEATURE_TOPOEXT ( 6*32+22) /* Topology extensions CPUID leafs */ -#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* Core performance counter extensions */ -#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ -#define X86_FEATURE_BPEXT ( 6*32+26) /* Data breakpoint extension */ -#define X86_FEATURE_PTSC ( 6*32+27) /* Performance time-stamp counter */ -#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */ -#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX instructions) */ +#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* "lahf_lm" LAHF/SAHF in long mode */ +#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* "cmp_legacy" If yes HyperThreading not valid */ +#define X86_FEATURE_SVM ( 6*32+ 2) /* "svm" Secure Virtual Machine */ +#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* "extapic" Extended APIC space */ +#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* "cr8_legacy" CR8 in 32-bit mode */ +#define X86_FEATURE_ABM ( 6*32+ 5) /* "abm" Advanced bit manipulation */ +#define X86_FEATURE_SSE4A ( 6*32+ 6) /* "sse4a" SSE-4A */ +#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* "misalignsse" Misaligned SSE mode */ +#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* "3dnowprefetch" 3DNow prefetch instructions */ +#define X86_FEATURE_OSVW ( 6*32+ 9) /* "osvw" OS Visible Workaround */ +#define X86_FEATURE_IBS ( 6*32+10) /* "ibs" Instruction Based Sampling */ +#define X86_FEATURE_XOP ( 6*32+11) /* "xop" Extended AVX instructions */ +#define X86_FEATURE_SKINIT ( 6*32+12) /* "skinit" SKINIT/STGI instructions */ +#define X86_FEATURE_WDT ( 6*32+13) /* "wdt" Watchdog timer */ +#define X86_FEATURE_LWP ( 6*32+15) /* "lwp" Light Weight Profiling */ +#define X86_FEATURE_FMA4 ( 6*32+16) /* "fma4" 4 operands MAC instructions */ +#define X86_FEATURE_TCE ( 6*32+17) /* "tce" Translation Cache Extension */ +#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* "nodeid_msr" NodeId MSR */ +#define X86_FEATURE_TBM ( 6*32+21) /* "tbm" Trailing Bit Manipulations */ +#define X86_FEATURE_TOPOEXT ( 6*32+22) /* "topoext" Topology extensions CPUID leafs */ +#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* "perfctr_core" Core performance counter extensions */ +#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* "perfctr_nb" NB performance counter extensions */ +#define X86_FEATURE_BPEXT ( 6*32+26) /* "bpext" Data breakpoint extension */ +#define X86_FEATURE_PTSC ( 6*32+27) /* "ptsc" Performance time-stamp counter */ +#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* "perfctr_llc" Last Level Cache performance counter extensions */ +#define X86_FEATURE_MWAITX ( 6*32+29) /* "mwaitx" MWAIT extension (MONITORX/MWAITX instructions) */ /* * Auxiliary flags: Linux defined - For features scattered in various @@ -189,93 +189,93 @@ * * Reuse free bits when adding new feature flags! */ -#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT instructions */ -#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */ -#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ -#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ -#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ -#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ -#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ -#define X86_FEATURE_TDX_HOST_PLATFORM ( 7*32+ 7) /* Platform supports being a TDX host */ -#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ -#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ -#define X86_FEATURE_XCOMPACTED ( 7*32+10) /* "" Use compacted XSTATE (XSAVES or XSAVEC) */ -#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ -#define X86_FEATURE_KERNEL_IBRS ( 7*32+12) /* "" Set/clear IBRS on kernel entry/exit */ -#define X86_FEATURE_RSB_VMEXIT ( 7*32+13) /* "" Fill RSB on VM-Exit */ -#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ -#define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */ -#define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */ -#define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */ -#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ -#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ -#define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* AMD Performance Monitoring Version 2 */ -#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */ -#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */ -#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */ -#define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* "" AMD SSBD implementation via LS_CFG MSR */ -#define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */ -#define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ -#define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_ZEN ( 7*32+28) /* "" Generic flag for all Zen and newer */ -#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ -#define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */ -#define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* "" MSR IA32_FEAT_CTL configured */ +#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* "ring3mwait" Ring 3 MONITOR/MWAIT instructions */ +#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* "cpuid_fault" Intel CPUID faulting */ +#define X86_FEATURE_CPB ( 7*32+ 2) /* "cpb" AMD Core Performance Boost */ +#define X86_FEATURE_EPB ( 7*32+ 3) /* "epb" IA32_ENERGY_PERF_BIAS support */ +#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* "cat_l3" Cache Allocation Technology L3 */ +#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* "cat_l2" Cache Allocation Technology L2 */ +#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* "cdp_l3" Code and Data Prioritization L3 */ +#define X86_FEATURE_TDX_HOST_PLATFORM ( 7*32+ 7) /* "tdx_host_platform" Platform supports being a TDX host */ +#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* "hw_pstate" AMD HW-PState */ +#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* "proc_feedback" AMD ProcFeedbackInterface */ +#define X86_FEATURE_XCOMPACTED ( 7*32+10) /* Use compacted XSTATE (XSAVES or XSAVEC) */ +#define X86_FEATURE_PTI ( 7*32+11) /* "pti" Kernel Page Table Isolation enabled */ +#define X86_FEATURE_KERNEL_IBRS ( 7*32+12) /* Set/clear IBRS on kernel entry/exit */ +#define X86_FEATURE_RSB_VMEXIT ( 7*32+13) /* Fill RSB on VM-Exit */ +#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* "intel_ppin" Intel Processor Inventory Number */ +#define X86_FEATURE_CDP_L2 ( 7*32+15) /* "cdp_l2" Code and Data Prioritization L2 */ +#define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* MSR SPEC_CTRL is implemented */ +#define X86_FEATURE_SSBD ( 7*32+17) /* "ssbd" Speculative Store Bypass Disable */ +#define X86_FEATURE_MBA ( 7*32+18) /* "mba" Memory Bandwidth Allocation */ +#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ +#define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* "perfmon_v2" AMD Performance Monitoring Version 2 */ +#define X86_FEATURE_USE_IBPB ( 7*32+21) /* Indirect Branch Prediction Barrier enabled */ +#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* Use IBRS during runtime firmware calls */ +#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* Disable Speculative Store Bypass. */ +#define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* AMD SSBD implementation via LS_CFG MSR */ +#define X86_FEATURE_IBRS ( 7*32+25) /* "ibrs" Indirect Branch Restricted Speculation */ +#define X86_FEATURE_IBPB ( 7*32+26) /* "ibpb" Indirect Branch Prediction Barrier */ +#define X86_FEATURE_STIBP ( 7*32+27) /* "stibp" Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_ZEN ( 7*32+28) /* Generic flag for all Zen and newer */ +#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* L1TF workaround PTE inversion */ +#define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* "ibrs_enhanced" Enhanced IBRS */ +#define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* MSR IA32_FEAT_CTL configured */ /* Virtualization flags: Linux defined, word 8 */ -#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ -#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 1) /* Intel FlexPriority */ -#define X86_FEATURE_EPT ( 8*32+ 2) /* Intel Extended Page Table */ -#define X86_FEATURE_VPID ( 8*32+ 3) /* Intel Virtual Processor ID */ +#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* "tpr_shadow" Intel TPR Shadow */ +#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 1) /* "flexpriority" Intel FlexPriority */ +#define X86_FEATURE_EPT ( 8*32+ 2) /* "ept" Intel Extended Page Table */ +#define X86_FEATURE_VPID ( 8*32+ 3) /* "vpid" Intel Virtual Processor ID */ -#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */ -#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ -#define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */ -#define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */ -#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ -#define X86_FEATURE_PVUNLOCK ( 8*32+20) /* "" PV unlock function */ -#define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* "" PV vcpu_is_preempted function */ -#define X86_FEATURE_TDX_GUEST ( 8*32+22) /* Intel Trust Domain Extensions Guest */ +#define X86_FEATURE_VMMCALL ( 8*32+15) /* "vmmcall" Prefer VMMCALL to VMCALL */ +#define X86_FEATURE_XENPV ( 8*32+16) /* Xen paravirtual guest */ +#define X86_FEATURE_EPT_AD ( 8*32+17) /* "ept_ad" Intel Extended Page Table access-dirty bit */ +#define X86_FEATURE_VMCALL ( 8*32+18) /* Hypervisor supports the VMCALL instruction */ +#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* VMware prefers VMMCALL hypercall instruction */ +#define X86_FEATURE_PVUNLOCK ( 8*32+20) /* PV unlock function */ +#define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* PV vcpu_is_preempted function */ +#define X86_FEATURE_TDX_GUEST ( 8*32+22) /* "tdx_guest" Intel Trust Domain Extensions Guest */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ -#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ -#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3B */ -#define X86_FEATURE_SGX ( 9*32+ 2) /* Software Guard Extensions */ -#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ -#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ -#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ -#define X86_FEATURE_FDP_EXCPTN_ONLY ( 9*32+ 6) /* "" FPU data pointer updated only on x87 exceptions */ -#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ -#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ -#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB instructions */ -#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ -#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ -#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ -#define X86_FEATURE_ZERO_FCS_FDS ( 9*32+13) /* "" Zero out FPU CS and FPU DS */ -#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ -#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */ -#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ -#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ -#define X86_FEATURE_RDSEED ( 9*32+18) /* RDSEED instruction */ -#define X86_FEATURE_ADX ( 9*32+19) /* ADCX and ADOX instructions */ -#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ -#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ -#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ -#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ -#define X86_FEATURE_INTEL_PT ( 9*32+25) /* Intel Processor Trace */ -#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ -#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ -#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ -#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ -#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ -#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ +#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* "fsgsbase" RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ +#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* "tsc_adjust" TSC adjustment MSR 0x3B */ +#define X86_FEATURE_SGX ( 9*32+ 2) /* "sgx" Software Guard Extensions */ +#define X86_FEATURE_BMI1 ( 9*32+ 3) /* "bmi1" 1st group bit manipulation extensions */ +#define X86_FEATURE_HLE ( 9*32+ 4) /* "hle" Hardware Lock Elision */ +#define X86_FEATURE_AVX2 ( 9*32+ 5) /* "avx2" AVX2 instructions */ +#define X86_FEATURE_FDP_EXCPTN_ONLY ( 9*32+ 6) /* FPU data pointer updated only on x87 exceptions */ +#define X86_FEATURE_SMEP ( 9*32+ 7) /* "smep" Supervisor Mode Execution Protection */ +#define X86_FEATURE_BMI2 ( 9*32+ 8) /* "bmi2" 2nd group bit manipulation extensions */ +#define X86_FEATURE_ERMS ( 9*32+ 9) /* "erms" Enhanced REP MOVSB/STOSB instructions */ +#define X86_FEATURE_INVPCID ( 9*32+10) /* "invpcid" Invalidate Processor Context ID */ +#define X86_FEATURE_RTM ( 9*32+11) /* "rtm" Restricted Transactional Memory */ +#define X86_FEATURE_CQM ( 9*32+12) /* "cqm" Cache QoS Monitoring */ +#define X86_FEATURE_ZERO_FCS_FDS ( 9*32+13) /* Zero out FPU CS and FPU DS */ +#define X86_FEATURE_MPX ( 9*32+14) /* "mpx" Memory Protection Extension */ +#define X86_FEATURE_RDT_A ( 9*32+15) /* "rdt_a" Resource Director Technology Allocation */ +#define X86_FEATURE_AVX512F ( 9*32+16) /* "avx512f" AVX-512 Foundation */ +#define X86_FEATURE_AVX512DQ ( 9*32+17) /* "avx512dq" AVX-512 DQ (Double/Quad granular) Instructions */ +#define X86_FEATURE_RDSEED ( 9*32+18) /* "rdseed" RDSEED instruction */ +#define X86_FEATURE_ADX ( 9*32+19) /* "adx" ADCX and ADOX instructions */ +#define X86_FEATURE_SMAP ( 9*32+20) /* "smap" Supervisor Mode Access Prevention */ +#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* "avx512ifma" AVX-512 Integer Fused Multiply-Add instructions */ +#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* "clflushopt" CLFLUSHOPT instruction */ +#define X86_FEATURE_CLWB ( 9*32+24) /* "clwb" CLWB instruction */ +#define X86_FEATURE_INTEL_PT ( 9*32+25) /* "intel_pt" Intel Processor Trace */ +#define X86_FEATURE_AVX512PF ( 9*32+26) /* "avx512pf" AVX-512 Prefetch */ +#define X86_FEATURE_AVX512ER ( 9*32+27) /* "avx512er" AVX-512 Exponential and Reciprocal */ +#define X86_FEATURE_AVX512CD ( 9*32+28) /* "avx512cd" AVX-512 Conflict Detection */ +#define X86_FEATURE_SHA_NI ( 9*32+29) /* "sha_ni" SHA1/SHA256 Instruction Extensions */ +#define X86_FEATURE_AVX512BW ( 9*32+30) /* "avx512bw" AVX-512 BW (Byte/Word granular) Instructions */ +#define X86_FEATURE_AVX512VL ( 9*32+31) /* "avx512vl" AVX-512 VL (128/256 Vector Length) Extensions */ /* Extended state features, CPUID level 0x0000000d:1 (EAX), word 10 */ -#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT instruction */ -#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */ -#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */ -#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */ -#define X86_FEATURE_XFD (10*32+ 4) /* "" eXtended Feature Disabling */ +#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* "xsaveopt" XSAVEOPT instruction */ +#define X86_FEATURE_XSAVEC (10*32+ 1) /* "xsavec" XSAVEC instruction */ +#define X86_FEATURE_XGETBV1 (10*32+ 2) /* "xgetbv1" XGETBV with ECX = 1 instruction */ +#define X86_FEATURE_XSAVES (10*32+ 3) /* "xsaves" XSAVES/XRSTORS instructions */ +#define X86_FEATURE_XFD (10*32+ 4) /* eXtended Feature Disabling */ /* * Extended auxiliary flags: Linux defined - for features scattered in various @@ -283,181 +283,183 @@ * * Reuse free bits when adding new feature flags! */ -#define X86_FEATURE_CQM_LLC (11*32+ 0) /* LLC QoS if 1 */ -#define X86_FEATURE_CQM_OCCUP_LLC (11*32+ 1) /* LLC occupancy monitoring */ -#define X86_FEATURE_CQM_MBM_TOTAL (11*32+ 2) /* LLC Total MBM monitoring */ -#define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */ -#define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */ -#define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ -#define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */ -#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ -#define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */ -#define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */ -#define X86_FEATURE_ENTRY_IBPB (11*32+10) /* "" Issue an IBPB on kernel entry */ -#define X86_FEATURE_RRSBA_CTRL (11*32+11) /* "" RET prediction control */ -#define X86_FEATURE_RETPOLINE (11*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ -#define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* "" Use LFENCE for Spectre variant 2 */ -#define X86_FEATURE_RETHUNK (11*32+14) /* "" Use REturn THUNK */ -#define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */ -#define X86_FEATURE_USE_IBPB_FW (11*32+16) /* "" Use IBPB during runtime firmware calls */ -#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */ -#define X86_FEATURE_SGX_EDECCSSA (11*32+18) /* "" SGX EDECCSSA user leaf function */ -#define X86_FEATURE_CALL_DEPTH (11*32+19) /* "" Call depth tracking for RSB stuffing */ -#define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* "" MSR IA32_TSX_CTRL (Intel) implemented */ -#define X86_FEATURE_SMBA (11*32+21) /* "" Slow Memory Bandwidth Allocation */ -#define X86_FEATURE_BMEC (11*32+22) /* "" Bandwidth Monitoring Event Configuration */ -#define X86_FEATURE_USER_SHSTK (11*32+23) /* Shadow stack support for user mode applications */ -#define X86_FEATURE_SRSO (11*32+24) /* "" AMD BTB untrain RETs */ -#define X86_FEATURE_SRSO_ALIAS (11*32+25) /* "" AMD BTB untrain RETs through aliasing */ -#define X86_FEATURE_IBPB_ON_VMEXIT (11*32+26) /* "" Issue an IBPB only on VMEXIT */ -#define X86_FEATURE_APIC_MSRS_FENCE (11*32+27) /* "" IA32_TSC_DEADLINE and X2APIC MSRs need fencing */ -#define X86_FEATURE_ZEN2 (11*32+28) /* "" CPU based on Zen2 microarchitecture */ -#define X86_FEATURE_ZEN3 (11*32+29) /* "" CPU based on Zen3 microarchitecture */ -#define X86_FEATURE_ZEN4 (11*32+30) /* "" CPU based on Zen4 microarchitecture */ -#define X86_FEATURE_ZEN1 (11*32+31) /* "" CPU based on Zen1 microarchitecture */ +#define X86_FEATURE_CQM_LLC (11*32+ 0) /* "cqm_llc" LLC QoS if 1 */ +#define X86_FEATURE_CQM_OCCUP_LLC (11*32+ 1) /* "cqm_occup_llc" LLC occupancy monitoring */ +#define X86_FEATURE_CQM_MBM_TOTAL (11*32+ 2) /* "cqm_mbm_total" LLC Total MBM monitoring */ +#define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* "cqm_mbm_local" LLC Local MBM monitoring */ +#define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* LFENCE in user entry SWAPGS path */ +#define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* LFENCE in kernel entry SWAPGS path */ +#define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* "split_lock_detect" #AC for split lock */ +#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* Per-thread Memory Bandwidth Allocation */ +#define X86_FEATURE_SGX1 (11*32+ 8) /* Basic SGX */ +#define X86_FEATURE_SGX2 (11*32+ 9) /* SGX Enclave Dynamic Memory Management (EDMM) */ +#define X86_FEATURE_ENTRY_IBPB (11*32+10) /* Issue an IBPB on kernel entry */ +#define X86_FEATURE_RRSBA_CTRL (11*32+11) /* RET prediction control */ +#define X86_FEATURE_RETPOLINE (11*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* Use LFENCE for Spectre variant 2 */ +#define X86_FEATURE_RETHUNK (11*32+14) /* Use REturn THUNK */ +#define X86_FEATURE_UNRET (11*32+15) /* AMD BTB untrain return */ +#define X86_FEATURE_USE_IBPB_FW (11*32+16) /* Use IBPB during runtime firmware calls */ +#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* Fill RSB on VM exit when EIBRS is enabled */ +#define X86_FEATURE_SGX_EDECCSSA (11*32+18) /* SGX EDECCSSA user leaf function */ +#define X86_FEATURE_CALL_DEPTH (11*32+19) /* Call depth tracking for RSB stuffing */ +#define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* MSR IA32_TSX_CTRL (Intel) implemented */ +#define X86_FEATURE_SMBA (11*32+21) /* Slow Memory Bandwidth Allocation */ +#define X86_FEATURE_BMEC (11*32+22) /* Bandwidth Monitoring Event Configuration */ +#define X86_FEATURE_USER_SHSTK (11*32+23) /* "user_shstk" Shadow stack support for user mode applications */ +#define X86_FEATURE_SRSO (11*32+24) /* AMD BTB untrain RETs */ +#define X86_FEATURE_SRSO_ALIAS (11*32+25) /* AMD BTB untrain RETs through aliasing */ +#define X86_FEATURE_IBPB_ON_VMEXIT (11*32+26) /* Issue an IBPB only on VMEXIT */ +#define X86_FEATURE_APIC_MSRS_FENCE (11*32+27) /* IA32_TSC_DEADLINE and X2APIC MSRs need fencing */ +#define X86_FEATURE_ZEN2 (11*32+28) /* CPU based on Zen2 microarchitecture */ +#define X86_FEATURE_ZEN3 (11*32+29) /* CPU based on Zen3 microarchitecture */ +#define X86_FEATURE_ZEN4 (11*32+30) /* CPU based on Zen4 microarchitecture */ +#define X86_FEATURE_ZEN1 (11*32+31) /* CPU based on Zen1 microarchitecture */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ -#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ -#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ -#define X86_FEATURE_CMPCCXADD (12*32+ 7) /* "" CMPccXADD instructions */ -#define X86_FEATURE_ARCH_PERFMON_EXT (12*32+ 8) /* "" Intel Architectural PerfMon Extension */ -#define X86_FEATURE_FZRM (12*32+10) /* "" Fast zero-length REP MOVSB */ -#define X86_FEATURE_FSRS (12*32+11) /* "" Fast short REP STOSB */ -#define X86_FEATURE_FSRC (12*32+12) /* "" Fast short REP {CMPSB,SCASB} */ -#define X86_FEATURE_FRED (12*32+17) /* Flexible Return and Event Delivery */ -#define X86_FEATURE_LKGS (12*32+18) /* "" Load "kernel" (userspace) GS */ -#define X86_FEATURE_WRMSRNS (12*32+19) /* "" Non-serializing WRMSR */ -#define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */ -#define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */ -#define X86_FEATURE_LAM (12*32+26) /* Linear Address Masking */ +#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* "avx_vnni" AVX VNNI instructions */ +#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* "avx512_bf16" AVX512 BFLOAT16 instructions */ +#define X86_FEATURE_CMPCCXADD (12*32+ 7) /* CMPccXADD instructions */ +#define X86_FEATURE_ARCH_PERFMON_EXT (12*32+ 8) /* Intel Architectural PerfMon Extension */ +#define X86_FEATURE_FZRM (12*32+10) /* Fast zero-length REP MOVSB */ +#define X86_FEATURE_FSRS (12*32+11) /* Fast short REP STOSB */ +#define X86_FEATURE_FSRC (12*32+12) /* Fast short REP {CMPSB,SCASB} */ +#define X86_FEATURE_FRED (12*32+17) /* "fred" Flexible Return and Event Delivery */ +#define X86_FEATURE_LKGS (12*32+18) /* Load "kernel" (userspace) GS */ +#define X86_FEATURE_WRMSRNS (12*32+19) /* Non-serializing WRMSR */ +#define X86_FEATURE_AMX_FP16 (12*32+21) /* AMX fp16 Support */ +#define X86_FEATURE_AVX_IFMA (12*32+23) /* Support for VPMADD52[H,L]UQ */ +#define X86_FEATURE_LAM (12*32+26) /* "lam" Linear Address Masking */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ -#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ -#define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ -#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ -#define X86_FEATURE_RDPRU (13*32+ 4) /* Read processor register at user level */ -#define X86_FEATURE_WBNOINVD (13*32+ 9) /* WBNOINVD instruction */ -#define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */ -#define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */ -#define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* "" Single Thread Indirect Branch Predictors always-on preferred */ -#define X86_FEATURE_AMD_PPIN (13*32+23) /* Protected Processor Inventory Number */ -#define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */ -#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ -#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */ -#define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */ -#define X86_FEATURE_AMD_PSFD (13*32+28) /* "" Predictive Store Forwarding Disable */ -#define X86_FEATURE_BTC_NO (13*32+29) /* "" Not vulnerable to Branch Type Confusion */ -#define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */ +#define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */ +#define X86_FEATURE_IRPERF (13*32+ 1) /* "irperf" Instructions Retired Count */ +#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */ +#define X86_FEATURE_RDPRU (13*32+ 4) /* "rdpru" Read processor register at user level */ +#define X86_FEATURE_WBNOINVD (13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */ +#define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ +#define X86_FEATURE_AMD_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */ +#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* Single Thread Indirect Branch Predictors always-on preferred */ +#define X86_FEATURE_AMD_PPIN (13*32+23) /* "amd_ppin" Protected Processor Inventory Number */ +#define X86_FEATURE_AMD_SSBD (13*32+24) /* Speculative Store Bypass Disable */ +#define X86_FEATURE_VIRT_SSBD (13*32+25) /* "virt_ssbd" Virtualized Speculative Store Bypass Disable */ +#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* Speculative Store Bypass is fixed in hardware. */ +#define X86_FEATURE_CPPC (13*32+27) /* "cppc" Collaborative Processor Performance Control */ +#define X86_FEATURE_AMD_PSFD (13*32+28) /* Predictive Store Forwarding Disable */ +#define X86_FEATURE_BTC_NO (13*32+29) /* Not vulnerable to Branch Type Confusion */ +#define X86_FEATURE_BRS (13*32+31) /* "brs" Branch Sampling available */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ -#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ -#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ -#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ -#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ -#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ -#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ -#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ -#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ -#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */ -#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ -#define X86_FEATURE_HFI (14*32+19) /* Hardware Feedback Interface */ +#define X86_FEATURE_DTHERM (14*32+ 0) /* "dtherm" Digital Thermal Sensor */ +#define X86_FEATURE_IDA (14*32+ 1) /* "ida" Intel Dynamic Acceleration */ +#define X86_FEATURE_ARAT (14*32+ 2) /* "arat" Always Running APIC Timer */ +#define X86_FEATURE_PLN (14*32+ 4) /* "pln" Intel Power Limit Notification */ +#define X86_FEATURE_PTS (14*32+ 6) /* "pts" Intel Package Thermal Status */ +#define X86_FEATURE_HWP (14*32+ 7) /* "hwp" Intel Hardware P-states */ +#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* "hwp_notify" HWP Notification */ +#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* "hwp_act_window" HWP Activity Window */ +#define X86_FEATURE_HWP_EPP (14*32+10) /* "hwp_epp" HWP Energy Perf. Preference */ +#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* "hwp_pkg_req" HWP Package Level Request */ +#define X86_FEATURE_HWP_HIGHEST_PERF_CHANGE (14*32+15) /* HWP Highest perf change */ +#define X86_FEATURE_HFI (14*32+19) /* "hfi" Hardware Feedback Interface */ /* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */ -#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ -#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ +#define X86_FEATURE_NPT (15*32+ 0) /* "npt" Nested Page Table support */ +#define X86_FEATURE_LBRV (15*32+ 1) /* "lbrv" LBR Virtualization support */ #define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ #define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ #define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ #define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ -#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ -#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ -#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ -#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ -#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ -#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ -#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ -#define X86_FEATURE_X2AVIC (15*32+18) /* Virtual x2apic */ -#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */ -#define X86_FEATURE_VNMI (15*32+25) /* Virtual NMI */ -#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */ +#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* "flushbyasid" Flush-by-ASID support */ +#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* "decodeassists" Decode Assists support */ +#define X86_FEATURE_PAUSEFILTER (15*32+10) /* "pausefilter" Filtered pause intercept */ +#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* "pfthreshold" Pause filter threshold */ +#define X86_FEATURE_AVIC (15*32+13) /* "avic" Virtual Interrupt Controller */ +#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* "v_vmsave_vmload" Virtual VMSAVE VMLOAD */ +#define X86_FEATURE_VGIF (15*32+16) /* "vgif" Virtual GIF */ +#define X86_FEATURE_X2AVIC (15*32+18) /* "x2avic" Virtual x2apic */ +#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* "v_spec_ctrl" Virtual SPEC_CTRL */ +#define X86_FEATURE_VNMI (15*32+25) /* "vnmi" Virtual NMI */ +#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* SVME addr check */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ -#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ -#define X86_FEATURE_UMIP (16*32+ 2) /* User Mode Instruction Protection */ -#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ -#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ -#define X86_FEATURE_WAITPKG (16*32+ 5) /* UMONITOR/UMWAIT/TPAUSE Instructions */ -#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ -#define X86_FEATURE_SHSTK (16*32+ 7) /* "" Shadow stack */ -#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ -#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ -#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication Double Quadword */ -#define X86_FEATURE_AVX512_VNNI (16*32+11) /* Vector Neural Network Instructions */ -#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */ -#define X86_FEATURE_TME (16*32+13) /* Intel Total Memory Encryption */ -#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ -#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ -#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ -#define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* Bus Lock detect */ -#define X86_FEATURE_CLDEMOTE (16*32+25) /* CLDEMOTE instruction */ -#define X86_FEATURE_MOVDIRI (16*32+27) /* MOVDIRI instruction */ -#define X86_FEATURE_MOVDIR64B (16*32+28) /* MOVDIR64B instruction */ -#define X86_FEATURE_ENQCMD (16*32+29) /* ENQCMD and ENQCMDS instructions */ -#define X86_FEATURE_SGX_LC (16*32+30) /* Software Guard Extensions Launch Control */ +#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* "avx512vbmi" AVX512 Vector Bit Manipulation instructions*/ +#define X86_FEATURE_UMIP (16*32+ 2) /* "umip" User Mode Instruction Protection */ +#define X86_FEATURE_PKU (16*32+ 3) /* "pku" Protection Keys for Userspace */ +#define X86_FEATURE_OSPKE (16*32+ 4) /* "ospke" OS Protection Keys Enable */ +#define X86_FEATURE_WAITPKG (16*32+ 5) /* "waitpkg" UMONITOR/UMWAIT/TPAUSE Instructions */ +#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* "avx512_vbmi2" Additional AVX512 Vector Bit Manipulation Instructions */ +#define X86_FEATURE_SHSTK (16*32+ 7) /* Shadow stack */ +#define X86_FEATURE_GFNI (16*32+ 8) /* "gfni" Galois Field New Instructions */ +#define X86_FEATURE_VAES (16*32+ 9) /* "vaes" Vector AES */ +#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* "vpclmulqdq" Carry-Less Multiplication Double Quadword */ +#define X86_FEATURE_AVX512_VNNI (16*32+11) /* "avx512_vnni" Vector Neural Network Instructions */ +#define X86_FEATURE_AVX512_BITALG (16*32+12) /* "avx512_bitalg" Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */ +#define X86_FEATURE_TME (16*32+13) /* "tme" Intel Total Memory Encryption */ +#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* "avx512_vpopcntdq" POPCNT for vectors of DW/QW */ +#define X86_FEATURE_LA57 (16*32+16) /* "la57" 5-level page tables */ +#define X86_FEATURE_RDPID (16*32+22) /* "rdpid" RDPID instruction */ +#define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* "bus_lock_detect" Bus Lock detect */ +#define X86_FEATURE_CLDEMOTE (16*32+25) /* "cldemote" CLDEMOTE instruction */ +#define X86_FEATURE_MOVDIRI (16*32+27) /* "movdiri" MOVDIRI instruction */ +#define X86_FEATURE_MOVDIR64B (16*32+28) /* "movdir64b" MOVDIR64B instruction */ +#define X86_FEATURE_ENQCMD (16*32+29) /* "enqcmd" ENQCMD and ENQCMDS instructions */ +#define X86_FEATURE_SGX_LC (16*32+30) /* "sgx_lc" Software Guard Extensions Launch Control */ /* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */ -#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */ -#define X86_FEATURE_SUCCOR (17*32+ 1) /* Uncorrectable error containment and recovery */ -#define X86_FEATURE_SMCA (17*32+ 3) /* Scalable MCA */ +#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* "overflow_recov" MCA overflow recovery support */ +#define X86_FEATURE_SUCCOR (17*32+ 1) /* "succor" Uncorrectable error containment and recovery */ +#define X86_FEATURE_SMCA (17*32+ 3) /* "smca" Scalable MCA */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ -#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ -#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ -#define X86_FEATURE_FSRM (18*32+ 4) /* Fast Short Rep Mov */ -#define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* AVX-512 Intersect for D/Q */ -#define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* "" SRBDS mitigation MSR available */ -#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ -#define X86_FEATURE_RTM_ALWAYS_ABORT (18*32+11) /* "" RTM transaction always aborts */ -#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ -#define X86_FEATURE_SERIALIZE (18*32+14) /* SERIALIZE instruction */ -#define X86_FEATURE_HYBRID_CPU (18*32+15) /* "" This part has CPUs of more than one type */ -#define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */ -#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ -#define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */ -#define X86_FEATURE_IBT (18*32+20) /* Indirect Branch Tracking */ -#define X86_FEATURE_AMX_BF16 (18*32+22) /* AMX bf16 Support */ -#define X86_FEATURE_AVX512_FP16 (18*32+23) /* AVX512 FP16 */ -#define X86_FEATURE_AMX_TILE (18*32+24) /* AMX tile Support */ -#define X86_FEATURE_AMX_INT8 (18*32+25) /* AMX int8 Support */ -#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ -#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */ -#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ -#define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* "" IA32_CORE_CAPABILITIES MSR */ -#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ +#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* "avx512_4vnniw" AVX-512 Neural Network Instructions */ +#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* "avx512_4fmaps" AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_FSRM (18*32+ 4) /* "fsrm" Fast Short Rep Mov */ +#define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* "avx512_vp2intersect" AVX-512 Intersect for D/Q */ +#define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* SRBDS mitigation MSR available */ +#define X86_FEATURE_MD_CLEAR (18*32+10) /* "md_clear" VERW clears CPU buffers */ +#define X86_FEATURE_RTM_ALWAYS_ABORT (18*32+11) /* RTM transaction always aborts */ +#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* TSX_FORCE_ABORT */ +#define X86_FEATURE_SERIALIZE (18*32+14) /* "serialize" SERIALIZE instruction */ +#define X86_FEATURE_HYBRID_CPU (18*32+15) /* This part has CPUs of more than one type */ +#define X86_FEATURE_TSXLDTRK (18*32+16) /* "tsxldtrk" TSX Suspend Load Address Tracking */ +#define X86_FEATURE_PCONFIG (18*32+18) /* "pconfig" Intel PCONFIG */ +#define X86_FEATURE_ARCH_LBR (18*32+19) /* "arch_lbr" Intel ARCH LBR */ +#define X86_FEATURE_IBT (18*32+20) /* "ibt" Indirect Branch Tracking */ +#define X86_FEATURE_AMX_BF16 (18*32+22) /* "amx_bf16" AMX bf16 Support */ +#define X86_FEATURE_AVX512_FP16 (18*32+23) /* "avx512_fp16" AVX512 FP16 */ +#define X86_FEATURE_AMX_TILE (18*32+24) /* "amx_tile" AMX tile Support */ +#define X86_FEATURE_AMX_INT8 (18*32+25) /* "amx_int8" AMX int8 Support */ +#define X86_FEATURE_SPEC_CTRL (18*32+26) /* Speculation Control (IBRS + IBPB) */ +#define X86_FEATURE_INTEL_STIBP (18*32+27) /* Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_FLUSH_L1D (18*32+28) /* "flush_l1d" Flush L1D cache */ +#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* "arch_capabilities" IA32_ARCH_CAPABILITIES MSR (Intel) */ +#define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* IA32_CORE_CAPABILITIES MSR */ +#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* Speculative Store Bypass Disable */ /* AMD-defined memory encryption features, CPUID level 0x8000001f (EAX), word 19 */ -#define X86_FEATURE_SME (19*32+ 0) /* AMD Secure Memory Encryption */ -#define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */ -#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */ -#define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */ -#define X86_FEATURE_SEV_SNP (19*32+ 4) /* AMD Secure Encrypted Virtualization - Secure Nested Paging */ -#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* "" Virtual TSC_AUX */ -#define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */ -#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* AMD SEV-ES full debug state swap support */ +#define X86_FEATURE_SME (19*32+ 0) /* "sme" AMD Secure Memory Encryption */ +#define X86_FEATURE_SEV (19*32+ 1) /* "sev" AMD Secure Encrypted Virtualization */ +#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* VM Page Flush MSR is supported */ +#define X86_FEATURE_SEV_ES (19*32+ 3) /* "sev_es" AMD Secure Encrypted Virtualization - Encrypted State */ +#define X86_FEATURE_SEV_SNP (19*32+ 4) /* "sev_snp" AMD Secure Encrypted Virtualization - Secure Nested Paging */ +#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* Virtual TSC_AUX */ +#define X86_FEATURE_SME_COHERENT (19*32+10) /* AMD hardware-enforced cache coherency */ +#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" AMD SEV-ES full debug state swap support */ +#define X86_FEATURE_SVSM (19*32+28) /* "svsm" SVSM present */ /* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */ -#define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* "" No Nested Data Breakpoints */ -#define X86_FEATURE_WRMSR_XX_BASE_NS (20*32+ 1) /* "" WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */ -#define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* "" LFENCE always serializing / synchronizes RDTSC */ -#define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* "" Null Selector Clears Base */ -#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* "" Automatic IBRS */ -#define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* "" SMM_CTL MSR is not present */ +#define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* No Nested Data Breakpoints */ +#define X86_FEATURE_WRMSR_XX_BASE_NS (20*32+ 1) /* WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */ +#define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* LFENCE always serializing / synchronizes RDTSC */ +#define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* Null Selector Clears Base */ +#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* Automatic IBRS */ +#define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* SMM_CTL MSR is not present */ -#define X86_FEATURE_SBPB (20*32+27) /* "" Selective Branch Prediction Barrier */ -#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* "" MSR_PRED_CMD[IBPB] flushes all branch type predictions */ -#define X86_FEATURE_SRSO_NO (20*32+29) /* "" CPU is not affected by SRSO */ +#define X86_FEATURE_SBPB (20*32+27) /* Selective Branch Prediction Barrier */ +#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */ +#define X86_FEATURE_SRSO_NO (20*32+29) /* CPU is not affected by SRSO */ /* * Extended auxiliary flags: Linux defined - for features scattered in various @@ -465,59 +467,60 @@ * * Reuse free bits when adding new feature flags! */ -#define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* AMD LBR and PMC Freeze */ -#define X86_FEATURE_CLEAR_BHB_LOOP (21*32+ 1) /* "" Clear branch history at syscall entry using SW loop */ -#define X86_FEATURE_BHI_CTRL (21*32+ 2) /* "" BHI_DIS_S HW control available */ -#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* "" BHI_DIS_S HW control enabled */ -#define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* "" Clear branch history at vmexit using SW loop */ +#define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* "amd_lbr_pmc_freeze" AMD LBR and PMC Freeze */ +#define X86_FEATURE_CLEAR_BHB_LOOP (21*32+ 1) /* Clear branch history at syscall entry using SW loop */ +#define X86_FEATURE_BHI_CTRL (21*32+ 2) /* BHI_DIS_S HW control available */ +#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* BHI_DIS_S HW control enabled */ +#define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* Clear branch history at vmexit using SW loop */ +#define X86_FEATURE_FAST_CPPC (21*32 + 5) /* AMD Fast CPPC */ /* * BUG word(s) */ #define X86_BUG(x) (NCAPINTS*32 + (x)) -#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ -#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ -#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ +#define X86_BUG_F00F X86_BUG(0) /* "f00f" Intel F00F */ +#define X86_BUG_FDIV X86_BUG(1) /* "fdiv" FPU FDIV */ +#define X86_BUG_COMA X86_BUG(2) /* "coma" Cyrix 6x86 coma */ #define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ #define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ -#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ -#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ -#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ -#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ +#define X86_BUG_11AP X86_BUG(5) /* "11ap" Bad local APIC aka 11AP */ +#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* "fxsave_leak" FXSAVE leaks FOP/FIP/FOP */ +#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* "clflush_monitor" AAI65, CLFLUSH required before MONITOR */ +#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* "sysret_ss_attrs" SYSRET doesn't fix up SS attrs */ #ifdef CONFIG_X86_32 /* * 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional * to avoid confusion. */ -#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */ +#define X86_BUG_ESPFIX X86_BUG(9) /* IRET to 16-bit SS corrupts ESP/RSP high bits */ #endif -#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */ -#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ -#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ -#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ -#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */ -#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */ -#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */ -#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */ -#define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */ -#define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */ -#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */ -#define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */ -#define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */ -#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ -#define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */ -#define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */ -#define X86_BUG_MMIO_UNKNOWN X86_BUG(26) /* CPU is too old and its MMIO Stale Data status is unknown */ -#define X86_BUG_RETBLEED X86_BUG(27) /* CPU is affected by RETBleed */ -#define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */ -#define X86_BUG_SMT_RSB X86_BUG(29) /* CPU is vulnerable to Cross-Thread Return Address Predictions */ -#define X86_BUG_GDS X86_BUG(30) /* CPU is affected by Gather Data Sampling */ -#define X86_BUG_TDX_PW_MCE X86_BUG(31) /* CPU may incur #MC if non-TD software does partial write to TDX private memory */ +#define X86_BUG_NULL_SEG X86_BUG(10) /* "null_seg" Nulling a selector preserves the base */ +#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* "swapgs_fence" SWAPGS without input dep on GS */ +#define X86_BUG_MONITOR X86_BUG(12) /* "monitor" IPI required to wake up remote CPU */ +#define X86_BUG_AMD_E400 X86_BUG(13) /* "amd_e400" CPU is among the affected by Erratum 400 */ +#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* "cpu_meltdown" CPU is affected by meltdown attack and needs kernel page table isolation */ +#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* "spectre_v1" CPU is affected by Spectre variant 1 attack with conditional branches */ +#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* "spectre_v2" CPU is affected by Spectre variant 2 attack with indirect branches */ +#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* "spec_store_bypass" CPU is affected by speculative store bypass attack */ +#define X86_BUG_L1TF X86_BUG(18) /* "l1tf" CPU is affected by L1 Terminal Fault */ +#define X86_BUG_MDS X86_BUG(19) /* "mds" CPU is affected by Microarchitectural data sampling */ +#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* "msbds_only" CPU is only affected by the MSDBS variant of BUG_MDS */ +#define X86_BUG_SWAPGS X86_BUG(21) /* "swapgs" CPU is affected by speculation through SWAPGS */ +#define X86_BUG_TAA X86_BUG(22) /* "taa" CPU is affected by TSX Async Abort(TAA) */ +#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* "itlb_multihit" CPU may incur MCE during certain page attribute changes */ +#define X86_BUG_SRBDS X86_BUG(24) /* "srbds" CPU may leak RNG bits if not mitigated */ +#define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* "mmio_stale_data" CPU is affected by Processor MMIO Stale Data vulnerabilities */ +#define X86_BUG_MMIO_UNKNOWN X86_BUG(26) /* "mmio_unknown" CPU is too old and its MMIO Stale Data status is unknown */ +#define X86_BUG_RETBLEED X86_BUG(27) /* "retbleed" CPU is affected by RETBleed */ +#define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* "eibrs_pbrsb" EIBRS is vulnerable to Post Barrier RSB Predictions */ +#define X86_BUG_SMT_RSB X86_BUG(29) /* "smt_rsb" CPU is vulnerable to Cross-Thread Return Address Predictions */ +#define X86_BUG_GDS X86_BUG(30) /* "gds" CPU is affected by Gather Data Sampling */ +#define X86_BUG_TDX_PW_MCE X86_BUG(31) /* "tdx_pw_mce" CPU may incur #MC if non-TD software does partial write to TDX private memory */ /* BUG word 2 */ -#define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */ -#define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */ -#define X86_BUG_RFDS X86_BUG(1*32 + 2) /* CPU is vulnerable to Register File Data Sampling */ -#define X86_BUG_BHI X86_BUG(1*32 + 3) /* CPU is affected by Branch History Injection */ +#define X86_BUG_SRSO X86_BUG(1*32 + 0) /* "srso" AMD SRSO bug */ +#define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* "div0" AMD DIV0 speculation bug */ +#define X86_BUG_RFDS X86_BUG(1*32 + 2) /* "rfds" CPU is vulnerable to Register File Data Sampling */ +#define X86_BUG_BHI X86_BUG(1*32 + 3) /* "bhi" CPU is affected by Branch History Injection */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index e022e6eb766c64..82c6a4d350e09e 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -566,6 +566,12 @@ #define MSR_RELOAD_PMC0 0x000014c1 #define MSR_RELOAD_FIXED_CTR0 0x00001309 +/* V6 PMON MSR range */ +#define MSR_IA32_PMC_V6_GP0_CTR 0x1900 +#define MSR_IA32_PMC_V6_GP0_CFG_A 0x1901 +#define MSR_IA32_PMC_V6_FX0_CTR 0x1980 +#define MSR_IA32_PMC_V6_STEP 4 + /* KeyID partitioning between MKTME and TDX */ #define MSR_IA32_MKTME_KEYID_PARTITIONING 0x00000087 @@ -660,6 +666,8 @@ #define MSR_AMD64_RMP_BASE 0xc0010132 #define MSR_AMD64_RMP_END 0xc0010133 +#define MSR_SVSM_CAA 0xc001f000 + /* AMD Collaborative Processor Performance Control MSRs */ #define MSR_AMD_CPPC_CAP1 0xc00102b0 #define MSR_AMD_CPPC_ENABLE 0xc00102b1 @@ -781,6 +789,8 @@ #define MSR_K7_HWCR_IRPERF_EN BIT_ULL(MSR_K7_HWCR_IRPERF_EN_BIT) #define MSR_K7_FID_VID_CTL 0xc0010041 #define MSR_K7_FID_VID_STATUS 0xc0010042 +#define MSR_K7_HWCR_CPB_DIS_BIT 25 +#define MSR_K7_HWCR_CPB_DIS BIT_ULL(MSR_K7_HWCR_CPB_DIS_BIT) /* K6 MSRs */ #define MSR_K6_WHCR 0xc0000082 @@ -1164,6 +1174,7 @@ #define MSR_IA32_QM_CTR 0xc8e #define MSR_IA32_PQR_ASSOC 0xc8f #define MSR_IA32_L3_CBM_BASE 0xc90 +#define MSR_RMID_SNC_CONFIG 0xca0 #define MSR_IA32_L2_CBM_BASE 0xd10 #define MSR_IA32_MBA_THRTL_BASE 0xd50 From d5b854893d27b4030943a10cf28a07189aab0c36 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 6 Aug 2024 12:07:50 -0700 Subject: [PATCH 112/991] tools/include: Sync arm64 headers with the kernel sources To pick up changes from: 9ef54a384526 arm64: cputype: Add Cortex-A725 definitions 58d245e03c32 arm64: cputype: Add Cortex-X1C definitions fd2ff5f0b320 arm64: cputype: Add Cortex-X925 definitions add332c40328 arm64: cputype: Add Cortex-A720 definitions be5a6f238700 arm64: cputype: Add Cortex-X3 definitions This should be used to beautify x86 syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/arch/arm64/include/asm/cputype.h arch/arm64/include/asm/cputype.h Please see tools/include/uapi/README for details (it's in the first patch of this series). Cc: Catalin Marinas Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Namhyung Kim --- tools/arch/arm64/include/asm/cputype.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/arch/arm64/include/asm/cputype.h b/tools/arch/arm64/include/asm/cputype.h index 7b32b99023a21d..5fd7caea441936 100644 --- a/tools/arch/arm64/include/asm/cputype.h +++ b/tools/arch/arm64/include/asm/cputype.h @@ -86,9 +86,14 @@ #define ARM_CPU_PART_CORTEX_X2 0xD48 #define ARM_CPU_PART_NEOVERSE_N2 0xD49 #define ARM_CPU_PART_CORTEX_A78C 0xD4B +#define ARM_CPU_PART_CORTEX_X1C 0xD4C +#define ARM_CPU_PART_CORTEX_X3 0xD4E #define ARM_CPU_PART_NEOVERSE_V2 0xD4F +#define ARM_CPU_PART_CORTEX_A720 0xD81 #define ARM_CPU_PART_CORTEX_X4 0xD82 #define ARM_CPU_PART_NEOVERSE_V3 0xD84 +#define ARM_CPU_PART_CORTEX_X925 0xD85 +#define ARM_CPU_PART_CORTEX_A725 0xD87 #define APM_CPU_PART_XGENE 0x000 #define APM_CPU_VAR_POTENZA 0x00 @@ -162,9 +167,14 @@ #define MIDR_CORTEX_X2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X2) #define MIDR_NEOVERSE_N2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N2) #define MIDR_CORTEX_A78C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78C) +#define MIDR_CORTEX_X1C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1C) +#define MIDR_CORTEX_X3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X3) #define MIDR_NEOVERSE_V2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V2) +#define MIDR_CORTEX_A720 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A720) #define MIDR_CORTEX_X4 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X4) #define MIDR_NEOVERSE_V3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V3) +#define MIDR_CORTEX_X925 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X925) +#define MIDR_CORTEX_A725 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A725) #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) #define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX) From 10f2ad032defe906240d0c3b62dcbceace96b230 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 7 Aug 2024 12:51:44 +0100 Subject: [PATCH 113/991] KVM: arm64: Enforce dependency on an ARMv8.4-aware toolchain With the NV support of TLBI-range operations, KVM makes use of instructions that are only supported by binutils versions >= 2.30. This breaks the build for very old toolchains. Make KVM support conditional on having ARMv8.4 support in the assembler, side-stepping the issue. Fixes: 5d476ca57d7d ("KVM: arm64: nv: Add handling of range-based TLBI operations") Reported-by: Viresh Kumar Suggested-by: Arnd Bergmann Signed-off-by: Marc Zyngier Acked-by: Arnd Bergmann Link: https://lore.kernel.org/r/20240807115144.3237260-1-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 58f09370d17e01..8304eb342be9d9 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -19,6 +19,7 @@ if VIRTUALIZATION menuconfig KVM bool "Kernel-based Virtual Machine (KVM) support" + depends on AS_HAS_ARMV8_4 select KVM_COMMON select KVM_GENERIC_HARDWARE_ENABLING select KVM_GENERIC_MMU_NOTIFIER From 01ab08cafeced7ae1d6c01a08218742c8182f8da Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Wed, 7 Aug 2024 13:20:24 +0800 Subject: [PATCH 114/991] KVM: arm64: vgic-debug: Exit the iterator properly w/o LPI In case the guest doesn't have any LPI, we previously relied on the iterator setting 'intid = nr_spis + VGIC_NR_PRIVATE_IRQS' && 'lpi_idx = 1' to exit the iterator. But it was broken with commit 85d3ccc8b75b ("KVM: arm64: vgic-debug: Use an xarray mark for debug iterator") -- the intid remains at 'nr_spis + VGIC_NR_PRIVATE_IRQS - 1', and we end up endlessly printing the last SPI's state. Consider that it's meaningless to search the LPI xarray and populate lpi_idx when there is no LPI, let's just skip the process for that case. The result is that * If there's no LPI, we focus on the intid and exit the iterator when it runs out of the valid SPI range. * Otherwise we keep the current logic and let the xarray drive the iterator. Fixes: 85d3ccc8b75b ("KVM: arm64: vgic-debug: Use an xarray mark for debug iterator") Signed-off-by: Zenghui Yu Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20240807052024.2084-1-yuzenghui@huawei.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-debug.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c index bcbc8c986b1d63..bc74d06398ef16 100644 --- a/arch/arm64/kvm/vgic/vgic-debug.c +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -45,7 +45,8 @@ static void iter_next(struct kvm *kvm, struct vgic_state_iter *iter) * Let the xarray drive the iterator after the last SPI, as the iterator * has exhausted the sequentially-allocated INTID space. */ - if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS - 1)) { + if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS - 1) && + iter->nr_lpis) { if (iter->lpi_idx < iter->nr_lpis) xa_find_after(&dist->lpi_xa, &iter->intid, VGIC_LPI_MAX_INTID, @@ -112,7 +113,7 @@ static bool end_of_vgic(struct vgic_state_iter *iter) return iter->dist_id > 0 && iter->vcpu_id == iter->nr_cpus && iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS) && - iter->lpi_idx > iter->nr_lpis; + (!iter->nr_lpis || iter->lpi_idx > iter->nr_lpis); } static void *vgic_debug_start(struct seq_file *s, loff_t *pos) From 7e814a20f6da2bd2044b1a4682dd92a6f0df5a92 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 22 Jul 2024 17:33:11 +0100 Subject: [PATCH 115/991] KVM: arm64: Tidying up PAuth code in KVM Tidy up some of the PAuth trapping code to clear up some comments and avoid clang/checkpatch warnings. Also, don't bother setting PAuth HCR_EL2 bits in pKVM, since it's handled by the hypervisor. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20240722163311.1493879-1-tabba@google.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_ptrauth.h | 2 +- arch/arm64/kvm/arm.c | 14 ++++---------- arch/arm64/kvm/hyp/include/hyp/switch.h | 1 - arch/arm64/kvm/hyp/nvhe/switch.c | 5 ++--- 4 files changed, 7 insertions(+), 15 deletions(-) diff --git a/arch/arm64/include/asm/kvm_ptrauth.h b/arch/arm64/include/asm/kvm_ptrauth.h index d81bac256abc35..6199c9f7ec6eda 100644 --- a/arch/arm64/include/asm/kvm_ptrauth.h +++ b/arch/arm64/include/asm/kvm_ptrauth.h @@ -104,7 +104,7 @@ alternative_else_nop_endif #define __ptrauth_save_key(ctxt, key) \ do { \ - u64 __val; \ + u64 __val; \ __val = read_sysreg_s(SYS_ ## key ## KEYLO_EL1); \ ctxt_sys_reg(ctxt, key ## KEYLO_EL1) = __val; \ __val = read_sysreg_s(SYS_ ## key ## KEYHI_EL1); \ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 23e1fa56c02dd0..9bef7638342ef7 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -522,10 +522,10 @@ void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) static void vcpu_set_pauth_traps(struct kvm_vcpu *vcpu) { - if (vcpu_has_ptrauth(vcpu)) { + if (vcpu_has_ptrauth(vcpu) && !is_protected_kvm_enabled()) { /* - * Either we're running running an L2 guest, and the API/APK - * bits come from L1's HCR_EL2, or API/APK are both set. + * Either we're running an L2 guest, and the API/APK bits come + * from L1's HCR_EL2, or API/APK are both set. */ if (unlikely(vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu))) { u64 val; @@ -542,16 +542,10 @@ static void vcpu_set_pauth_traps(struct kvm_vcpu *vcpu) * Save the host keys if there is any chance for the guest * to use pauth, as the entry code will reload the guest * keys in that case. - * Protected mode is the exception to that rule, as the - * entry into the EL2 code eagerly switch back and forth - * between host and hyp keys (and kvm_hyp_ctxt is out of - * reach anyway). */ - if (is_protected_kvm_enabled()) - return; - if (vcpu->arch.hcr_el2 & (HCR_API | HCR_APK)) { struct kvm_cpu_context *ctxt; + ctxt = this_cpu_ptr_hyp_sym(kvm_hyp_ctxt); ptrauth_save_keys(ctxt); } diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index f59ccfe11ab9ad..37ff87d782b62b 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 6af179c6356d66..8f5c56d5b1cdf5 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -173,9 +173,8 @@ static void __pmu_switch_to_host(struct kvm_vcpu *vcpu) static bool kvm_handle_pvm_sys64(struct kvm_vcpu *vcpu, u64 *exit_code) { /* - * Make sure we handle the exit for workarounds and ptrauth - * before the pKVM handling, as the latter could decide to - * UNDEF. + * Make sure we handle the exit for workarounds before the pKVM + * handling, as the latter could decide to UNDEF. */ return (kvm_hyp_handle_sysreg(vcpu, exit_code) || kvm_handle_pvm_sysreg(vcpu, exit_code)); From ad518452fd263766946346324810f14bd8bb8b34 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 31 Jul 2024 17:21:13 +0100 Subject: [PATCH 116/991] KVM: selftests: arm64: Correct feature test for S1PIE in get-reg-list The ID register for S1PIE is ID_AA64MMFR3_EL1.S1PIE which is bits 11:8 but get-reg-list uses a shift of 4, checking SCTLRX instead. Use a shift of 8 instead. Fixes: 5f0419a0083b ("KVM: selftests: get-reg-list: add Permission Indirection registers") Signed-off-by: Mark Brown Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20240731-kvm-arm64-fix-s1pie-test-v1-1-a9253f3b7db4@kernel.org Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/aarch64/get-reg-list.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c index 709d7d72176035..4abebde781873b 100644 --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c @@ -32,13 +32,13 @@ static struct feature_id_reg feat_id_regs[] = { { ARM64_SYS_REG(3, 0, 10, 2, 2), /* PIRE0_EL1 */ ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ - 4, + 8, 1 }, { ARM64_SYS_REG(3, 0, 10, 2, 3), /* PIR_EL1 */ ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ - 4, + 8, 1 } }; From 6dd1e4c045afa6a4ba5d46f044c83bd357c593c2 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 7 Aug 2024 17:00:56 +0800 Subject: [PATCH 117/991] selinux: add the processing of the failure of avc_add_xperms_decision() When avc_add_xperms_decision() fails, the information recorded by the new avc node is incomplete. In this case, the new avc node should be released instead of replacing the old avc node. Cc: stable@vger.kernel.org Fixes: fa1aa143ac4a ("selinux: extended permissions for ioctls") Suggested-by: Stephen Smalley Signed-off-by: Zhen Lei Acked-by: Stephen Smalley Signed-off-by: Paul Moore --- security/selinux/avc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/security/selinux/avc.c b/security/selinux/avc.c index 7087cd2b802d8d..b49c44869dc462 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -907,7 +907,11 @@ static int avc_update_node(u32 event, u32 perms, u8 driver, u8 xperm, u32 ssid, node->ae.avd.auditdeny &= ~perms; break; case AVC_CALLBACK_ADD_XPERMS: - avc_add_xperms_decision(node, xpd); + rc = avc_add_xperms_decision(node, xpd); + if (rc) { + avc_node_kill(node); + goto out_unlock; + } break; } avc_node_replace(node, orig); From e037a26ead187901f83cad9c503ccece5ff6817a Mon Sep 17 00:00:00 2001 From: Faizal Rahim Date: Sat, 6 Jul 2024 11:38:07 -0400 Subject: [PATCH 118/991] igc: Fix packet still tx after gate close by reducing i226 MAC retry buffer Testing uncovered that even when the taprio gate is closed, some packets still transmit. According to i225/6 hardware errata [1], traffic might overflow the planned QBV window. This happens because MAC maintains an internal buffer, primarily for supporting half duplex retries. Therefore, even when the gate closes, residual MAC data in the buffer may still transmit. To mitigate this for i226, reduce the MAC's internal buffer from 192 bytes to the recommended 88 bytes by modifying the RETX_CTL register value. This follows guidelines from: [1] Ethernet Controller I225/I22 Spec Update Rev 2.1 Errata Item 9: TSN: Packet Transmission Might Cross Qbv Window [2] I225/6 SW User Manual Rev 1.2.4: Section 8.11.5 Retry Buffer Control Note that the RETX_CTL register can't be used in TSN mode because half duplex feature cannot coexist with TSN. Test Steps: 1. Send taprio cmd to board A: tc qdisc replace dev enp1s0 parent root handle 100 taprio \ num_tc 4 \ map 3 2 1 0 3 3 3 3 3 3 3 3 3 3 3 3 \ queues 1@0 1@1 1@2 1@3 \ base-time 0 \ sched-entry S 0x07 500000 \ sched-entry S 0x0f 500000 \ flags 0x2 \ txtime-delay 0 Note that for TC3, gate should open for 500us and close for another 500us. 3. Take tcpdump log on Board B. 4. Send udp packets via UDP tai app from Board A to Board B. 5. Analyze tcpdump log via wireshark log on Board B. Ensure that the total time from the first to the last packet received during one cycle for TC3 does not exceed 500us. Fixes: 43546211738e ("igc: Add new device ID's") Signed-off-by: Faizal Rahim Acked-by: Vinicius Costa Gomes Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_defines.h | 6 ++++ drivers/net/ethernet/intel/igc/igc_tsn.c | 34 ++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h index 5f92b3c7c3d4a1..511384f3ec5cb5 100644 --- a/drivers/net/ethernet/intel/igc/igc_defines.h +++ b/drivers/net/ethernet/intel/igc/igc_defines.h @@ -404,6 +404,12 @@ #define IGC_DTXMXPKTSZ_TSN 0x19 /* 1600 bytes of max TX DMA packet size */ #define IGC_DTXMXPKTSZ_DEFAULT 0x98 /* 9728-byte Jumbo frames */ +/* Retry Buffer Control */ +#define IGC_RETX_CTL 0x041C +#define IGC_RETX_CTL_WATERMARK_MASK 0xF +#define IGC_RETX_CTL_QBVFULLTH_SHIFT 8 /* QBV Retry Buffer Full Threshold */ +#define IGC_RETX_CTL_QBVFULLEN 0x1000 /* Enable QBV Retry Buffer Full Threshold */ + /* Transmit Scheduling Latency */ /* Latency between transmission scheduling (LaunchTime) and the time * the packet is transmitted to the network in nanosecond. diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c index 22cefb1eeedfa7..46d4c3275bbb56 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.c +++ b/drivers/net/ethernet/intel/igc/igc_tsn.c @@ -78,6 +78,15 @@ void igc_tsn_adjust_txtime_offset(struct igc_adapter *adapter) wr32(IGC_GTXOFFSET, txoffset); } +static void igc_tsn_restore_retx_default(struct igc_adapter *adapter) +{ + struct igc_hw *hw = &adapter->hw; + u32 retxctl; + + retxctl = rd32(IGC_RETX_CTL) & IGC_RETX_CTL_WATERMARK_MASK; + wr32(IGC_RETX_CTL, retxctl); +} + /* Returns the TSN specific registers to their default values after * the adapter is reset. */ @@ -91,6 +100,9 @@ static int igc_tsn_disable_offload(struct igc_adapter *adapter) wr32(IGC_TXPBS, I225_TXPBSIZE_DEFAULT); wr32(IGC_DTXMXPKTSZ, IGC_DTXMXPKTSZ_DEFAULT); + if (igc_is_device_id_i226(hw)) + igc_tsn_restore_retx_default(adapter); + tqavctrl = rd32(IGC_TQAVCTRL); tqavctrl &= ~(IGC_TQAVCTRL_TRANSMIT_MODE_TSN | IGC_TQAVCTRL_ENHANCED_QAV | IGC_TQAVCTRL_FUTSCDDIS); @@ -111,6 +123,25 @@ static int igc_tsn_disable_offload(struct igc_adapter *adapter) return 0; } +/* To partially fix i226 HW errata, reduce MAC internal buffering from 192 Bytes + * to 88 Bytes by setting RETX_CTL register using the recommendation from: + * a) Ethernet Controller I225/I226 Specification Update Rev 2.1 + * Item 9: TSN: Packet Transmission Might Cross the Qbv Window + * b) I225/6 SW User Manual Rev 1.2.4: Section 8.11.5 Retry Buffer Control + */ +static void igc_tsn_set_retx_qbvfullthreshold(struct igc_adapter *adapter) +{ + struct igc_hw *hw = &adapter->hw; + u32 retxctl, watermark; + + retxctl = rd32(IGC_RETX_CTL); + watermark = retxctl & IGC_RETX_CTL_WATERMARK_MASK; + /* Set QBVFULLTH value using watermark and set QBVFULLEN */ + retxctl |= (watermark << IGC_RETX_CTL_QBVFULLTH_SHIFT) | + IGC_RETX_CTL_QBVFULLEN; + wr32(IGC_RETX_CTL, retxctl); +} + static int igc_tsn_enable_offload(struct igc_adapter *adapter) { struct igc_hw *hw = &adapter->hw; @@ -123,6 +154,9 @@ static int igc_tsn_enable_offload(struct igc_adapter *adapter) wr32(IGC_DTXMXPKTSZ, IGC_DTXMXPKTSZ_TSN); wr32(IGC_TXPBS, IGC_TXPBSIZE_TSN); + if (igc_is_device_id_i226(hw)) + igc_tsn_set_retx_qbvfullthreshold(adapter); + for (i = 0; i < adapter->num_tx_queues; i++) { struct igc_ring *ring = adapter->tx_ring[i]; u32 txqctl = 0; From f8d6acaee9d35cbff3c3cfad94641666c596f8da Mon Sep 17 00:00:00 2001 From: Faizal Rahim Date: Sun, 7 Jul 2024 08:53:16 -0400 Subject: [PATCH 119/991] igc: Fix qbv_config_change_errors logics When user issues these cmds: 1. Either a) or b) a) mqprio with hardware offload disabled b) taprio with txtime-assist feature enabled 2. etf 3. tc qdisc delete 4. taprio with base time in the past At step 4, qbv_config_change_errors wrongly increased by 1. Excerpt from IEEE 802.1Q-2018 8.6.9.3.1: "If AdminBaseTime specifies a time in the past, and the current schedule is running, then: Increment ConfigChangeError counter" qbv_config_change_errors should only increase if base time is in the past and no taprio is active. In user perspective, taprio was not active when first triggered at step 4. However, i225/6 reuses qbv for etf, so qbv is enabled with a dummy schedule at step 2 where it enters igc_tsn_enable_offload() and qbv_count got incremented to 1. At step 4, it enters igc_tsn_enable_offload() again, qbv_count is incremented to 2. Because taprio is running, tc_setup_type is TC_SETUP_QDISC_ETF and qbv_count > 1, qbv_config_change_errors value got incremented. This issue happens due to reliance on qbv_count field where a non-zero value indicates that taprio is running. But qbv_count increases regardless if taprio is triggered by user or by other tsn feature. It does not align with qbv_config_change_errors expectation where it is only concerned with taprio triggered by user. Fixing this by relocating the qbv_config_change_errors logic to igc_save_qbv_schedule(), eliminating reliance on qbv_count and its inaccuracies from i225/6's multiple uses of qbv feature for other TSN features. The new function created: igc_tsn_is_taprio_activated_by_user() uses taprio_offload_enable field to indicate that the current running taprio was triggered by user, instead of triggered by non-qbv feature like etf. Fixes: ae4fe4698300 ("igc: Add qbv_config_change_errors counter") Signed-off-by: Faizal Rahim Reviewed-by: Simon Horman Acked-by: Vinicius Costa Gomes Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_main.c | 8 ++++++-- drivers/net/ethernet/intel/igc/igc_tsn.c | 16 ++++++++-------- drivers/net/ethernet/intel/igc/igc_tsn.h | 1 + 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 8daf938afc36c2..dfd6c00b4205d4 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -6315,12 +6315,16 @@ static int igc_save_qbv_schedule(struct igc_adapter *adapter, if (!validate_schedule(adapter, qopt)) return -EINVAL; + igc_ptp_read(adapter, &now); + + if (igc_tsn_is_taprio_activated_by_user(adapter) && + is_base_time_past(qopt->base_time, &now)) + adapter->qbv_config_change_errors++; + adapter->cycle_time = qopt->cycle_time; adapter->base_time = qopt->base_time; adapter->taprio_offload_enable = true; - igc_ptp_read(adapter, &now); - for (n = 0; n < qopt->num_entries; n++) { struct tc_taprio_sched_entry *e = &qopt->entries[n]; diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c index 46d4c3275bbb56..8ed7b965484da7 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.c +++ b/drivers/net/ethernet/intel/igc/igc_tsn.c @@ -87,6 +87,14 @@ static void igc_tsn_restore_retx_default(struct igc_adapter *adapter) wr32(IGC_RETX_CTL, retxctl); } +bool igc_tsn_is_taprio_activated_by_user(struct igc_adapter *adapter) +{ + struct igc_hw *hw = &adapter->hw; + + return (rd32(IGC_BASET_H) || rd32(IGC_BASET_L)) && + adapter->taprio_offload_enable; +} + /* Returns the TSN specific registers to their default values after * the adapter is reset. */ @@ -296,14 +304,6 @@ static int igc_tsn_enable_offload(struct igc_adapter *adapter) s64 n = div64_s64(ktime_sub_ns(systim, base_time), cycle); base_time = ktime_add_ns(base_time, (n + 1) * cycle); - - /* Increase the counter if scheduling into the past while - * Gate Control List (GCL) is running. - */ - if ((rd32(IGC_BASET_H) || rd32(IGC_BASET_L)) && - (adapter->tc_setup_type == TC_SETUP_QDISC_TAPRIO) && - (adapter->qbv_count > 1)) - adapter->qbv_config_change_errors++; } else { if (igc_is_device_id_i226(hw)) { ktime_t adjust_time, expires_time; diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.h b/drivers/net/ethernet/intel/igc/igc_tsn.h index b53e6af560b738..98ec845a86bf00 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.h +++ b/drivers/net/ethernet/intel/igc/igc_tsn.h @@ -7,5 +7,6 @@ int igc_tsn_offload_apply(struct igc_adapter *adapter); int igc_tsn_reset(struct igc_adapter *adapter); void igc_tsn_adjust_txtime_offset(struct igc_adapter *adapter); +bool igc_tsn_is_taprio_activated_by_user(struct igc_adapter *adapter); #endif /* _IGC_BASE_H */ From 0afeaeb5dae86aceded0d5f0c3a54d27858c0c6f Mon Sep 17 00:00:00 2001 From: Faizal Rahim Date: Sun, 7 Jul 2024 08:53:17 -0400 Subject: [PATCH 120/991] igc: Fix reset adapter logics when tx mode change Following the "igc: Fix TX Hang issue when QBV Gate is close" changes, remaining issues with the reset adapter logic in igc_tsn_offload_apply() have been observed: 1. The reset adapter logics for i225 and i226 differ, although they should be the same according to the guidelines in I225/6 HW Design Section 7.5.2.1 on software initialization during tx mode changes. 2. The i225 resets adapter every time, even though tx mode doesn't change. This occurs solely based on the condition igc_is_device_id_i225() when calling schedule_work(). 3. i226 doesn't reset adapter for tsn->legacy tx mode changes. It only resets adapter for legacy->tsn tx mode transitions. 4. qbv_count introduced in the patch is actually not needed; in this context, a non-zero value of qbv_count is used to indicate if tx mode was unconditionally set to tsn in igc_tsn_enable_offload(). This could be replaced by checking the existing register IGC_TQAVCTRL_TRANSMIT_MODE_TSN bit. This patch resolves all issues and enters schedule_work() to reset the adapter only when changing tx mode. It also removes reliance on qbv_count. qbv_count field will be removed in a future patch. Test ran: 1. Verify reset adapter behaviour in i225/6: a) Enrol a new GCL Reset adapter observed (tx mode change legacy->tsn) b) Enrol a new GCL without deleting qdisc No reset adapter observed (tx mode remain tsn->tsn) c) Delete qdisc Reset adapter observed (tx mode change tsn->legacy) 2. Tested scenario from "igc: Fix TX Hang issue when QBV Gate is closed" to confirm it remains resolved. Fixes: 175c241288c0 ("igc: Fix TX Hang issue when QBV Gate is closed") Signed-off-by: Faizal Rahim Reviewed-by: Simon Horman Acked-by: Vinicius Costa Gomes Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_tsn.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c index 8ed7b965484da7..ada75143051714 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.c +++ b/drivers/net/ethernet/intel/igc/igc_tsn.c @@ -49,6 +49,13 @@ static unsigned int igc_tsn_new_flags(struct igc_adapter *adapter) return new_flags; } +static bool igc_tsn_is_tx_mode_in_tsn(struct igc_adapter *adapter) +{ + struct igc_hw *hw = &adapter->hw; + + return !!(rd32(IGC_TQAVCTRL) & IGC_TQAVCTRL_TRANSMIT_MODE_TSN); +} + void igc_tsn_adjust_txtime_offset(struct igc_adapter *adapter) { struct igc_hw *hw = &adapter->hw; @@ -365,15 +372,22 @@ int igc_tsn_reset(struct igc_adapter *adapter) return err; } -int igc_tsn_offload_apply(struct igc_adapter *adapter) +static bool igc_tsn_will_tx_mode_change(struct igc_adapter *adapter) { - struct igc_hw *hw = &adapter->hw; + bool any_tsn_enabled = !!(igc_tsn_new_flags(adapter) & + IGC_FLAG_TSN_ANY_ENABLED); + + return (any_tsn_enabled && !igc_tsn_is_tx_mode_in_tsn(adapter)) || + (!any_tsn_enabled && igc_tsn_is_tx_mode_in_tsn(adapter)); +} - /* Per I225/6 HW Design Section 7.5.2.1, transmit mode - * cannot be changed dynamically. Require reset the adapter. +int igc_tsn_offload_apply(struct igc_adapter *adapter) +{ + /* Per I225/6 HW Design Section 7.5.2.1 guideline, if tx mode change + * from legacy->tsn or tsn->legacy, then reset adapter is needed. */ if (netif_running(adapter->netdev) && - (igc_is_device_id_i225(hw) || !adapter->qbv_count)) { + igc_tsn_will_tx_mode_change(adapter)) { schedule_work(&adapter->reset_task); return 0; } From 6c3fc0b1c3d073bd6fc3bf43dbd0e64240537464 Mon Sep 17 00:00:00 2001 From: Faizal Rahim Date: Sun, 7 Jul 2024 08:53:18 -0400 Subject: [PATCH 121/991] igc: Fix qbv tx latency by setting gtxoffset A large tx latency issue was discovered during testing when only QBV was enabled. The issue occurs because gtxoffset was not set when QBV is active, it was only set when launch time is active. The patch "igc: Correct the launchtime offset" only sets gtxoffset when the launchtime_enable field is set by the user. Enabling launchtime_enable ultimately sets the register IGC_TXQCTL_QUEUE_MODE_LAUNCHT (referred to as LaunchT in the SW user manual). Section 7.5.2.6 of the IGC i225/6 SW User Manual Rev 1.2.4 states: "The latency between transmission scheduling (launch time) and the time the packet is transmitted to the network is listed in Table 7-61." However, the patch misinterprets the phrase "launch time" in that section by assuming it specifically refers to the LaunchT register, whereas it actually denotes the generic term for when a packet is released from the internal buffer to the MAC transmit logic. This launch time, as per that section, also implicitly refers to the QBV gate open time, where a packet waits in the buffer for the QBV gate to open. Therefore, latency applies whenever QBV is in use. TSN features such as QBU and QAV reuse QBV, making the latency universal to TSN features. Discussed with i226 HW owner (Shalev, Avi) and we were in agreement that the term "launch time" used in Section 7.5.2.6 is not clear and can be easily misinterpreted. Avi will update this section to: "When TQAVCTRL.TRANSMIT_MODE = TSN, the latency between transmission scheduling and the time the packet is transmitted to the network is listed in Table 7-61." Fix this issue by using igc_tsn_is_tx_mode_in_tsn() as a condition to write to gtxoffset, aligning with the newly updated SW User Manual. Tested: 1. Enrol taprio on talker board base-time 0 cycle-time 1000000 flags 0x2 index 0 cmd S gatemask 0x1 interval1 index 0 cmd S gatemask 0x1 interval2 Note: interval1 = interval for a 64 bytes packet to go through interval2 = cycle-time - interval1 2. Take tcpdump on listener board 3. Use udp tai app on talker to send packets to listener 4. Check the timestamp on listener via wireshark Test Result: 100 Mbps: 113 ~193 ns 1000 Mbps: 52 ~ 84 ns 2500 Mbps: 95 ~ 223 ns Note that the test result is similar to the patch "igc: Correct the launchtime offset". Fixes: 790835fcc0cb ("igc: Correct the launchtime offset") Signed-off-by: Faizal Rahim Reviewed-by: Simon Horman Acked-by: Vinicius Costa Gomes Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_tsn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c index ada75143051714..d68fa7f3d5f078 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.c +++ b/drivers/net/ethernet/intel/igc/igc_tsn.c @@ -61,7 +61,7 @@ void igc_tsn_adjust_txtime_offset(struct igc_adapter *adapter) struct igc_hw *hw = &adapter->hw; u16 txoffset; - if (!is_any_launchtime(adapter)) + if (!igc_tsn_is_tx_mode_in_tsn(adapter)) return; switch (adapter->link_speed) { From 49f6202ce991742f451fc724f03d0c17460d06cd Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sat, 3 Aug 2024 13:41:40 +0300 Subject: [PATCH 122/991] ASoC: codecs: lpass-macro: fix version strings returned for 1.x codecs Add missing cases to lpass_macro_get_codec_version_string() to let it print the correct codec version for 1.x codec platforms. Fixes: 378918d59181 ("ASoC: codecs: lpass-macro: add helpers to get codec version") Signed-off-by: Dmitry Baryshkov Link: https://patch.msgid.link/20240803-codec-version-v1-1-bc29baa5e417@linaro.org Signed-off-by: Mark Brown --- sound/soc/codecs/lpass-macro-common.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sound/soc/codecs/lpass-macro-common.h b/sound/soc/codecs/lpass-macro-common.h index 21cb30ab706d8a..fb4b96cb2b232d 100644 --- a/sound/soc/codecs/lpass-macro-common.h +++ b/sound/soc/codecs/lpass-macro-common.h @@ -49,6 +49,12 @@ static inline void lpass_macro_pds_exit_action(void *pds) static inline const char *lpass_macro_get_codec_version_string(int version) { switch (version) { + case LPASS_CODEC_VERSION_1_0: + return "v1.0"; + case LPASS_CODEC_VERSION_1_1: + return "v1.1"; + case LPASS_CODEC_VERSION_1_2: + return "v1.2"; case LPASS_CODEC_VERSION_2_0: return "v2.0"; case LPASS_CODEC_VERSION_2_1: From a9a7a2d80790d06cd32c535e2e7b10f72ce592e7 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sat, 3 Aug 2024 13:41:41 +0300 Subject: [PATCH 123/991] ASoC: codecs: lpass-va-macro: warn on unknown version Warn the users if the driver doesn't know the codec version. This helps in debugging the issues with other codec not detecting the correct version. va_macro 3370000.codec: Unknown VA Codec version, ID: 00 / 0f / 00 Signed-off-by: Dmitry Baryshkov Link: https://patch.msgid.link/20240803-codec-version-v1-2-bc29baa5e417@linaro.org Signed-off-by: Mark Brown --- sound/soc/codecs/lpass-va-macro.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/soc/codecs/lpass-va-macro.c b/sound/soc/codecs/lpass-va-macro.c index a62ccd09bacd72..8454193ed22a62 100644 --- a/sound/soc/codecs/lpass-va-macro.c +++ b/sound/soc/codecs/lpass-va-macro.c @@ -1485,6 +1485,10 @@ static void va_macro_set_lpass_codec_version(struct va_macro *va) if ((core_id_0 == 0x02) && (core_id_1 == 0x0F) && (core_id_2 == 0x80 || core_id_2 == 0x81)) version = LPASS_CODEC_VERSION_2_8; + if (version == LPASS_CODEC_VERSION_UNKNOWN) + dev_warn(va->dev, "Unknown Codec version, ID: %02x / %02x / %02x\n", + core_id_0, core_id_1, core_id_2); + lpass_macro_set_codec_version(version); dev_dbg(va->dev, "LPASS Codec Version %s\n", lpass_macro_get_codec_version_string(version)); From 06ce0af34177a110d6a5cf71f924965b9b230691 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 29 May 2024 00:11:23 +0100 Subject: [PATCH 124/991] soc: fsl: qbman: remove unused struct 'cgr_comp' 'cgr_comp' has been unused since commit 96f413f47677 ("soc/fsl/qbman: fix issue in qman_delete_cgr_safe()"). Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Sean Anderson Signed-off-by: Michael Ellerman Link: https://msgid.link/20240528231123.136664-1-linux@treblig.org --- drivers/soc/fsl/qbman/qman.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/soc/fsl/qbman/qman.c b/drivers/soc/fsl/qbman/qman.c index 7e9074519ad22d..4dc8aba33d9b70 100644 --- a/drivers/soc/fsl/qbman/qman.c +++ b/drivers/soc/fsl/qbman/qman.c @@ -2546,11 +2546,6 @@ int qman_delete_cgr(struct qman_cgr *cgr) } EXPORT_SYMBOL(qman_delete_cgr); -struct cgr_comp { - struct qman_cgr *cgr; - struct completion completion; -}; - static void qman_delete_cgr_smp_call(void *p) { qman_delete_cgr((struct qman_cgr *)p); From 78296429e20052b029211b0aca64aadc5052d581 Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Tue, 30 Jul 2024 19:53:16 +0530 Subject: [PATCH 125/991] platform/x86/amd/pmf: Fix to Update HPD Data When ALS is Disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the Ambient Light Sensor (ALS) is disabled, the current code in the PMF driver does not query for Human Presence Detection (HPD) data in amd_pmf_get_sensor_info(). As a result, stale HPD data is used by PMF-TA to evaluate policy conditions, leading to unexpected behavior in the policy output actions. To resolve this issue, modify the PMF driver to query HPD data independently of ALS. Since user_present is a boolean, modify the current code to return true if the user is present and false if the user is away or if the sensor is not detected, and report this status to the PMF TA firmware accordingly. With this change, amd_pmf_get_sensor_info() now returns void instead of int. Fixes: cedecdba60f4 ("platform/x86/amd/pmf: Get ambient light information from AMD SFH driver") Co-developed-by: Patil Rajesh Reddy Signed-off-by: Patil Rajesh Reddy Signed-off-by: Shyam Sundar S K Link: https://lore.kernel.org/r/20240730142316.3846259-1-Shyam-sundar.S-k@amd.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmf/spc.c | 32 ++++++++++-------------------- 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/drivers/platform/x86/amd/pmf/spc.c b/drivers/platform/x86/amd/pmf/spc.c index a3dec14c30043e..3c153fb1425e9f 100644 --- a/drivers/platform/x86/amd/pmf/spc.c +++ b/drivers/platform/x86/amd/pmf/spc.c @@ -150,36 +150,26 @@ static int amd_pmf_get_slider_info(struct amd_pmf_dev *dev, struct ta_pmf_enact_ return 0; } -static int amd_pmf_get_sensor_info(struct amd_pmf_dev *dev, struct ta_pmf_enact_table *in) +static void amd_pmf_get_sensor_info(struct amd_pmf_dev *dev, struct ta_pmf_enact_table *in) { struct amd_sfh_info sfh_info; - int ret; + + /* Get the latest information from SFH */ + in->ev_info.user_present = false; /* Get ALS data */ - ret = amd_get_sfh_info(&sfh_info, MT_ALS); - if (!ret) + if (!amd_get_sfh_info(&sfh_info, MT_ALS)) in->ev_info.ambient_light = sfh_info.ambient_light; else - return ret; + dev_dbg(dev->dev, "ALS is not enabled/detected\n"); /* get HPD data */ - ret = amd_get_sfh_info(&sfh_info, MT_HPD); - if (ret) - return ret; - - switch (sfh_info.user_present) { - case SFH_NOT_DETECTED: - in->ev_info.user_present = 0xff; /* assume no sensors connected */ - break; - case SFH_USER_PRESENT: - in->ev_info.user_present = 1; - break; - case SFH_USER_AWAY: - in->ev_info.user_present = 0; - break; + if (!amd_get_sfh_info(&sfh_info, MT_HPD)) { + if (sfh_info.user_present == SFH_USER_PRESENT) + in->ev_info.user_present = true; + } else { + dev_dbg(dev->dev, "HPD is not enabled/detected\n"); } - - return 0; } void amd_pmf_populate_ta_inputs(struct amd_pmf_dev *dev, struct ta_pmf_enact_table *in) From 613e3900c24bb1379d994f44d75d31c3223cc263 Mon Sep 17 00:00:00 2001 From: Gergo Koteles Date: Thu, 25 Jul 2024 11:21:07 +0200 Subject: [PATCH 126/991] platform/x86: ideapad-laptop: introduce a generic notification chain MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are several cases where a notification chain can simplify Lenovo WMI drivers. Add a generic notification chain into ideapad-laptop. Signed-off-by: Gergo Koteles Link: https://lore.kernel.org/r/c5a43efae8a32bd034c3d19c0a686941347575a7.1721898747.git.soyer@irl.hu Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/ideapad-laptop.c | 37 +++++++++++++++++++++++++++ drivers/platform/x86/ideapad-laptop.h | 5 ++++ 2 files changed, 42 insertions(+) diff --git a/drivers/platform/x86/ideapad-laptop.c b/drivers/platform/x86/ideapad-laptop.c index 1ace711f7442e2..866b32bfe2c950 100644 --- a/drivers/platform/x86/ideapad-laptop.c +++ b/drivers/platform/x86/ideapad-laptop.c @@ -1592,6 +1592,39 @@ static void ideapad_sync_touchpad_state(struct ideapad_private *priv, bool send_ priv->r_touchpad_val = value; } +static int ideapad_laptop_nb_notify(struct notifier_block *nb, + unsigned long action, void *data) +{ + switch (action) { + } + + return 0; +} + +static struct notifier_block ideapad_laptop_notifier = { + .notifier_call = ideapad_laptop_nb_notify, +}; + +static BLOCKING_NOTIFIER_HEAD(ideapad_laptop_chain_head); + +int ideapad_laptop_register_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&ideapad_laptop_chain_head, nb); +} +EXPORT_SYMBOL_NS_GPL(ideapad_laptop_register_notifier, IDEAPAD_LAPTOP); + +int ideapad_laptop_unregister_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&ideapad_laptop_chain_head, nb); +} +EXPORT_SYMBOL_NS_GPL(ideapad_laptop_unregister_notifier, IDEAPAD_LAPTOP); + +void ideapad_laptop_call_notifier(unsigned long action, void *data) +{ + blocking_notifier_call_chain(&ideapad_laptop_chain_head, action, data); +} +EXPORT_SYMBOL_NS_GPL(ideapad_laptop_call_notifier, IDEAPAD_LAPTOP); + static void ideapad_acpi_notify(acpi_handle handle, u32 event, void *data) { struct ideapad_private *priv = data; @@ -1974,6 +2007,8 @@ static int ideapad_acpi_add(struct platform_device *pdev) if (err) goto shared_init_failed; + ideapad_laptop_register_notifier(&ideapad_laptop_notifier); + return 0; shared_init_failed: @@ -2006,6 +2041,8 @@ static void ideapad_acpi_remove(struct platform_device *pdev) struct ideapad_private *priv = dev_get_drvdata(&pdev->dev); int i; + ideapad_laptop_unregister_notifier(&ideapad_laptop_notifier); + ideapad_shared_exit(priv); acpi_remove_notify_handler(priv->adev->handle, diff --git a/drivers/platform/x86/ideapad-laptop.h b/drivers/platform/x86/ideapad-laptop.h index 4498a96de59769..3eb0dcd6bf7ba8 100644 --- a/drivers/platform/x86/ideapad-laptop.h +++ b/drivers/platform/x86/ideapad-laptop.h @@ -12,6 +12,11 @@ #include #include #include +#include + +int ideapad_laptop_register_notifier(struct notifier_block *nb); +int ideapad_laptop_unregister_notifier(struct notifier_block *nb); +void ideapad_laptop_call_notifier(unsigned long action, void *data); enum { VPCCMD_R_VPC1 = 0x10, From cde7886b35176d56e72bfc68dc104fa08e7b072c Mon Sep 17 00:00:00 2001 From: Gergo Koteles Date: Thu, 25 Jul 2024 11:21:08 +0200 Subject: [PATCH 127/991] platform/x86: ideapad-laptop: move ymc_trigger_ec from lenovo-ymc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some models need to trigger the EC after each YMC event for the yoga mode control to work properly. EC triggering consist of a VPC call from the lenovo-ymc module. Except for this, all VPC calls are in the ideapad-laptop module. Since ideapad-laptop has a notification chain, a new YMC_EVENT action can be added and triggered from the lenovo-ymc module. Then the ideapad-laptop can trigger the EC. If the triggering is in the ideapad-laptop module, then the ec_trigger module parameter should be there as well. Move the ymc_trigger_ec functionality and the ec_trigger module parameter to the ideapad-laptop module. Signed-off-by: Gergo Koteles Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/d980ab3ac32b5e554f456b0ff17279bfdbe2a203.1721898747.git.soyer@irl.hu Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/Kconfig | 1 + drivers/platform/x86/ideapad-laptop.c | 49 ++++++++++++++++++++++ drivers/platform/x86/ideapad-laptop.h | 4 ++ drivers/platform/x86/lenovo-ymc.c | 60 +-------------------------- 4 files changed, 56 insertions(+), 58 deletions(-) diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 665fa952498659..ddfccc226751f4 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -477,6 +477,7 @@ config LENOVO_YMC tristate "Lenovo Yoga Tablet Mode Control" depends on ACPI_WMI depends on INPUT + depends on IDEAPAD_LAPTOP select INPUT_SPARSEKMAP help This driver maps the Tablet Mode Control switch to SW_TABLET_MODE input diff --git a/drivers/platform/x86/ideapad-laptop.c b/drivers/platform/x86/ideapad-laptop.c index 866b32bfe2c950..9fc1bb990e4749 100644 --- a/drivers/platform/x86/ideapad-laptop.c +++ b/drivers/platform/x86/ideapad-laptop.c @@ -146,6 +146,7 @@ struct ideapad_private { bool touchpad_ctrl_via_ec : 1; bool ctrl_ps2_aux_port : 1; bool usb_charging : 1; + bool ymc_ec_trigger : 1; } features; struct { bool initialized; @@ -194,6 +195,12 @@ MODULE_PARM_DESC(touchpad_ctrl_via_ec, "Enable registering a 'touchpad' sysfs-attribute which can be used to manually " "tell the EC to enable/disable the touchpad. This may not work on all models."); +static bool ymc_ec_trigger __read_mostly; +module_param(ymc_ec_trigger, bool, 0444); +MODULE_PARM_DESC(ymc_ec_trigger, + "Enable EC triggering work-around to force emitting tablet mode events. " + "If you need this please report this to: platform-driver-x86@vger.kernel.org"); + /* * shared data */ @@ -1592,10 +1599,50 @@ static void ideapad_sync_touchpad_state(struct ideapad_private *priv, bool send_ priv->r_touchpad_val = value; } +static const struct dmi_system_id ymc_ec_trigger_quirk_dmi_table[] = { + { + /* Lenovo Yoga 7 14ARB7 */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "82QF"), + }, + }, + { + /* Lenovo Yoga 7 14ACN6 */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "82N7"), + }, + }, + { } +}; + +static void ideapad_laptop_trigger_ec(void) +{ + struct ideapad_private *priv; + int ret; + + guard(mutex)(&ideapad_shared_mutex); + + priv = ideapad_shared; + if (!priv) + return; + + if (!priv->features.ymc_ec_trigger) + return; + + ret = write_ec_cmd(priv->adev->handle, VPCCMD_W_YMC, 1); + if (ret) + dev_warn(&priv->platform_device->dev, "Could not write YMC: %d\n", ret); +} + static int ideapad_laptop_nb_notify(struct notifier_block *nb, unsigned long action, void *data) { switch (action) { + case IDEAPAD_LAPTOP_YMC_EVENT: + ideapad_laptop_trigger_ec(); + break; } return 0; @@ -1761,6 +1808,8 @@ static void ideapad_check_features(struct ideapad_private *priv) priv->features.ctrl_ps2_aux_port = ctrl_ps2_aux_port || dmi_check_system(ctrl_ps2_aux_port_list); priv->features.touchpad_ctrl_via_ec = touchpad_ctrl_via_ec; + priv->features.ymc_ec_trigger = + ymc_ec_trigger || dmi_check_system(ymc_ec_trigger_quirk_dmi_table); if (!read_ec_data(handle, VPCCMD_R_FAN, &val)) priv->features.fan_mode = true; diff --git a/drivers/platform/x86/ideapad-laptop.h b/drivers/platform/x86/ideapad-laptop.h index 3eb0dcd6bf7ba8..948cc61800a950 100644 --- a/drivers/platform/x86/ideapad-laptop.h +++ b/drivers/platform/x86/ideapad-laptop.h @@ -14,6 +14,10 @@ #include #include +enum ideapad_laptop_notifier_actions { + IDEAPAD_LAPTOP_YMC_EVENT, +}; + int ideapad_laptop_register_notifier(struct notifier_block *nb); int ideapad_laptop_unregister_notifier(struct notifier_block *nb); void ideapad_laptop_call_notifier(unsigned long action, void *data); diff --git a/drivers/platform/x86/lenovo-ymc.c b/drivers/platform/x86/lenovo-ymc.c index e1fbc35504d498..e0bbd6a14a89cb 100644 --- a/drivers/platform/x86/lenovo-ymc.c +++ b/drivers/platform/x86/lenovo-ymc.c @@ -20,32 +20,10 @@ #define LENOVO_YMC_QUERY_INSTANCE 0 #define LENOVO_YMC_QUERY_METHOD 0x01 -static bool ec_trigger __read_mostly; -module_param(ec_trigger, bool, 0444); -MODULE_PARM_DESC(ec_trigger, "Enable EC triggering work-around to force emitting tablet mode events"); - static bool force; module_param(force, bool, 0444); MODULE_PARM_DESC(force, "Force loading on boards without a convertible DMI chassis-type"); -static const struct dmi_system_id ec_trigger_quirk_dmi_table[] = { - { - /* Lenovo Yoga 7 14ARB7 */ - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), - DMI_MATCH(DMI_PRODUCT_NAME, "82QF"), - }, - }, - { - /* Lenovo Yoga 7 14ACN6 */ - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), - DMI_MATCH(DMI_PRODUCT_NAME, "82N7"), - }, - }, - { } -}; - static const struct dmi_system_id allowed_chasis_types_dmi_table[] = { { .matches = { @@ -62,21 +40,8 @@ static const struct dmi_system_id allowed_chasis_types_dmi_table[] = { struct lenovo_ymc_private { struct input_dev *input_dev; - struct acpi_device *ec_acpi_dev; }; -static void lenovo_ymc_trigger_ec(struct wmi_device *wdev, struct lenovo_ymc_private *priv) -{ - int err; - - if (!priv->ec_acpi_dev) - return; - - err = write_ec_cmd(priv->ec_acpi_dev->handle, VPCCMD_W_YMC, 1); - if (err) - dev_warn(&wdev->dev, "Could not write YMC: %d\n", err); -} - static const struct key_entry lenovo_ymc_keymap[] = { /* Laptop */ { KE_SW, 0x01, { .sw = { SW_TABLET_MODE, 0 } } }, @@ -125,11 +90,9 @@ static void lenovo_ymc_notify(struct wmi_device *wdev, union acpi_object *data) free_obj: kfree(obj); - lenovo_ymc_trigger_ec(wdev, priv); + ideapad_laptop_call_notifier(IDEAPAD_LAPTOP_YMC_EVENT, &code); } -static void acpi_dev_put_helper(void *p) { acpi_dev_put(p); } - static int lenovo_ymc_probe(struct wmi_device *wdev, const void *ctx) { struct lenovo_ymc_private *priv; @@ -143,29 +106,10 @@ static int lenovo_ymc_probe(struct wmi_device *wdev, const void *ctx) return -ENODEV; } - ec_trigger |= dmi_check_system(ec_trigger_quirk_dmi_table); - priv = devm_kzalloc(&wdev->dev, sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; - if (ec_trigger) { - pr_debug("Lenovo YMC enable EC triggering.\n"); - priv->ec_acpi_dev = acpi_dev_get_first_match_dev("VPC2004", NULL, -1); - - if (!priv->ec_acpi_dev) { - dev_err(&wdev->dev, "Could not find EC ACPI device.\n"); - return -ENODEV; - } - err = devm_add_action_or_reset(&wdev->dev, - acpi_dev_put_helper, priv->ec_acpi_dev); - if (err) { - dev_err(&wdev->dev, - "Could not clean up EC ACPI device: %d\n", err); - return err; - } - } - input_dev = devm_input_allocate_device(&wdev->dev); if (!input_dev) return -ENOMEM; @@ -192,7 +136,6 @@ static int lenovo_ymc_probe(struct wmi_device *wdev, const void *ctx) dev_set_drvdata(&wdev->dev, priv); /* Report the state for the first time on probe */ - lenovo_ymc_trigger_ec(wdev, priv); lenovo_ymc_notify(wdev, NULL); return 0; } @@ -217,3 +160,4 @@ module_wmi_driver(lenovo_ymc_driver); MODULE_AUTHOR("Gergo Koteles "); MODULE_DESCRIPTION("Lenovo Yoga Mode Control driver"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(IDEAPAD_LAPTOP); From 7cc06e729460a209b84d3db4db56c9f85f048cc2 Mon Sep 17 00:00:00 2001 From: Gergo Koteles Date: Thu, 25 Jul 2024 11:21:10 +0200 Subject: [PATCH 128/991] platform/x86: ideapad-laptop: add a mutex to synchronize VPC commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Calling VPC commands consists of several VPCW and VPCR ACPI calls. These calls and their results can get mixed up if they are called simultaneously from different threads, like acpi notify handler, sysfs, debugfs, notification chain. The commit e2ffcda16290 ("ACPI: OSL: Allow Notify () handlers to run on all CPUs") made the race issues much worse than before it but some races were possible even before that commit. Add a mutex to synchronize VPC commands. Fixes: e2ffcda16290 ("ACPI: OSL: Allow Notify () handlers to run on all CPUs") Fixes: e82882cdd241 ("platform/x86: Add driver for Yoga Tablet Mode switch") Signed-off-by: Gergo Koteles Link: https://lore.kernel.org/r/f26782fa1194ad11ed5d9ba121a804e59b58b026.1721898747.git.soyer@irl.hu Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/ideapad-laptop.c | 64 ++++++++++++++++++++------- 1 file changed, 47 insertions(+), 17 deletions(-) diff --git a/drivers/platform/x86/ideapad-laptop.c b/drivers/platform/x86/ideapad-laptop.c index 9fc1bb990e4749..98ec30fce9fdd3 100644 --- a/drivers/platform/x86/ideapad-laptop.c +++ b/drivers/platform/x86/ideapad-laptop.c @@ -126,6 +126,7 @@ struct ideapad_rfk_priv { struct ideapad_private { struct acpi_device *adev; + struct mutex vpc_mutex; /* protects the VPC calls */ struct rfkill *rfk[IDEAPAD_RFKILL_DEV_NUM]; struct ideapad_rfk_priv rfk_priv[IDEAPAD_RFKILL_DEV_NUM]; struct platform_device *platform_device; @@ -301,6 +302,8 @@ static int debugfs_status_show(struct seq_file *s, void *data) struct ideapad_private *priv = s->private; unsigned long value; + guard(mutex)(&priv->vpc_mutex); + if (!read_ec_data(priv->adev->handle, VPCCMD_R_BL_MAX, &value)) seq_printf(s, "Backlight max: %lu\n", value); if (!read_ec_data(priv->adev->handle, VPCCMD_R_BL, &value)) @@ -419,7 +422,8 @@ static ssize_t camera_power_show(struct device *dev, unsigned long result; int err; - err = read_ec_data(priv->adev->handle, VPCCMD_R_CAMERA, &result); + scoped_guard(mutex, &priv->vpc_mutex) + err = read_ec_data(priv->adev->handle, VPCCMD_R_CAMERA, &result); if (err) return err; @@ -438,7 +442,8 @@ static ssize_t camera_power_store(struct device *dev, if (err) return err; - err = write_ec_cmd(priv->adev->handle, VPCCMD_W_CAMERA, state); + scoped_guard(mutex, &priv->vpc_mutex) + err = write_ec_cmd(priv->adev->handle, VPCCMD_W_CAMERA, state); if (err) return err; @@ -491,7 +496,8 @@ static ssize_t fan_mode_show(struct device *dev, unsigned long result; int err; - err = read_ec_data(priv->adev->handle, VPCCMD_R_FAN, &result); + scoped_guard(mutex, &priv->vpc_mutex) + err = read_ec_data(priv->adev->handle, VPCCMD_R_FAN, &result); if (err) return err; @@ -513,7 +519,8 @@ static ssize_t fan_mode_store(struct device *dev, if (state > 4 || state == 3) return -EINVAL; - err = write_ec_cmd(priv->adev->handle, VPCCMD_W_FAN, state); + scoped_guard(mutex, &priv->vpc_mutex) + err = write_ec_cmd(priv->adev->handle, VPCCMD_W_FAN, state); if (err) return err; @@ -598,7 +605,8 @@ static ssize_t touchpad_show(struct device *dev, unsigned long result; int err; - err = read_ec_data(priv->adev->handle, VPCCMD_R_TOUCHPAD, &result); + scoped_guard(mutex, &priv->vpc_mutex) + err = read_ec_data(priv->adev->handle, VPCCMD_R_TOUCHPAD, &result); if (err) return err; @@ -619,7 +627,8 @@ static ssize_t touchpad_store(struct device *dev, if (err) return err; - err = write_ec_cmd(priv->adev->handle, VPCCMD_W_TOUCHPAD, state); + scoped_guard(mutex, &priv->vpc_mutex) + err = write_ec_cmd(priv->adev->handle, VPCCMD_W_TOUCHPAD, state); if (err) return err; @@ -1012,6 +1021,8 @@ static int ideapad_rfk_set(void *data, bool blocked) struct ideapad_rfk_priv *priv = data; int opcode = ideapad_rfk_data[priv->dev].opcode; + guard(mutex)(&priv->priv->vpc_mutex); + return write_ec_cmd(priv->priv->adev->handle, opcode, !blocked); } @@ -1025,6 +1036,8 @@ static void ideapad_sync_rfk_state(struct ideapad_private *priv) int i; if (priv->features.hw_rfkill_switch) { + guard(mutex)(&priv->vpc_mutex); + if (read_ec_data(priv->adev->handle, VPCCMD_R_RF, &hw_blocked)) return; hw_blocked = !hw_blocked; @@ -1198,8 +1211,9 @@ static void ideapad_input_novokey(struct ideapad_private *priv) { unsigned long long_pressed; - if (read_ec_data(priv->adev->handle, VPCCMD_R_NOVO, &long_pressed)) - return; + scoped_guard(mutex, &priv->vpc_mutex) + if (read_ec_data(priv->adev->handle, VPCCMD_R_NOVO, &long_pressed)) + return; if (long_pressed) ideapad_input_report(priv, 17); @@ -1211,8 +1225,9 @@ static void ideapad_check_special_buttons(struct ideapad_private *priv) { unsigned long bit, value; - if (read_ec_data(priv->adev->handle, VPCCMD_R_SPECIAL_BUTTONS, &value)) - return; + scoped_guard(mutex, &priv->vpc_mutex) + if (read_ec_data(priv->adev->handle, VPCCMD_R_SPECIAL_BUTTONS, &value)) + return; for_each_set_bit (bit, &value, 16) { switch (bit) { @@ -1245,6 +1260,8 @@ static int ideapad_backlight_get_brightness(struct backlight_device *blightdev) unsigned long now; int err; + guard(mutex)(&priv->vpc_mutex); + err = read_ec_data(priv->adev->handle, VPCCMD_R_BL, &now); if (err) return err; @@ -1257,6 +1274,8 @@ static int ideapad_backlight_update_status(struct backlight_device *blightdev) struct ideapad_private *priv = bl_get_data(blightdev); int err; + guard(mutex)(&priv->vpc_mutex); + err = write_ec_cmd(priv->adev->handle, VPCCMD_W_BL, blightdev->props.brightness); if (err) @@ -1334,6 +1353,8 @@ static void ideapad_backlight_notify_power(struct ideapad_private *priv) if (!blightdev) return; + guard(mutex)(&priv->vpc_mutex); + if (read_ec_data(priv->adev->handle, VPCCMD_R_BL_POWER, &power)) return; @@ -1346,7 +1367,8 @@ static void ideapad_backlight_notify_brightness(struct ideapad_private *priv) /* if we control brightness via acpi video driver */ if (!priv->blightdev) - read_ec_data(priv->adev->handle, VPCCMD_R_BL, &now); + scoped_guard(mutex, &priv->vpc_mutex) + read_ec_data(priv->adev->handle, VPCCMD_R_BL, &now); else backlight_force_update(priv->blightdev, BACKLIGHT_UPDATE_HOTKEY); } @@ -1571,7 +1593,8 @@ static void ideapad_sync_touchpad_state(struct ideapad_private *priv, bool send_ int ret; /* Without reading from EC touchpad LED doesn't switch state */ - ret = read_ec_data(priv->adev->handle, VPCCMD_R_TOUCHPAD, &value); + scoped_guard(mutex, &priv->vpc_mutex) + ret = read_ec_data(priv->adev->handle, VPCCMD_R_TOUCHPAD, &value); if (ret) return; @@ -1631,7 +1654,8 @@ static void ideapad_laptop_trigger_ec(void) if (!priv->features.ymc_ec_trigger) return; - ret = write_ec_cmd(priv->adev->handle, VPCCMD_W_YMC, 1); + scoped_guard(mutex, &priv->vpc_mutex) + ret = write_ec_cmd(priv->adev->handle, VPCCMD_W_YMC, 1); if (ret) dev_warn(&priv->platform_device->dev, "Could not write YMC: %d\n", ret); } @@ -1677,11 +1701,13 @@ static void ideapad_acpi_notify(acpi_handle handle, u32 event, void *data) struct ideapad_private *priv = data; unsigned long vpc1, vpc2, bit; - if (read_ec_data(handle, VPCCMD_R_VPC1, &vpc1)) - return; + scoped_guard(mutex, &priv->vpc_mutex) { + if (read_ec_data(handle, VPCCMD_R_VPC1, &vpc1)) + return; - if (read_ec_data(handle, VPCCMD_R_VPC2, &vpc2)) - return; + if (read_ec_data(handle, VPCCMD_R_VPC2, &vpc2)) + return; + } vpc1 = (vpc2 << 8) | vpc1; @@ -1988,6 +2014,10 @@ static int ideapad_acpi_add(struct platform_device *pdev) priv->adev = adev; priv->platform_device = pdev; + err = devm_mutex_init(&pdev->dev, &priv->vpc_mutex); + if (err) + return err; + ideapad_check_features(priv); err = ideapad_sysfs_init(priv); From 1cb6ab446424649f03c82334634360c2e3043684 Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Tue, 23 Jul 2024 17:15:44 +0800 Subject: [PATCH 129/991] MIPS: Loongson64: Set timer mode in cpu-probe Loongson64 C and G processors have EXTIMER feature which is conflicting with CP0 counter. Although the processor resets in EXTIMER disabled & INTIMER enabled mode, which is compatible with MIPS CP0 compare, firmware may attempt to enable EXTIMER and interfere CP0 compare. Set timer mode back to MIPS compatible mode to fix booting on systems with such firmware before we have an actual driver for EXTIMER. Cc: stable@vger.kernel.org Signed-off-by: Jiaxun Yang Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/cpu-probe.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c index bda7f193baab9f..af7412549e6ea4 100644 --- a/arch/mips/kernel/cpu-probe.c +++ b/arch/mips/kernel/cpu-probe.c @@ -1724,12 +1724,16 @@ static inline void cpu_probe_loongson(struct cpuinfo_mips *c, unsigned int cpu) c->ases |= (MIPS_ASE_LOONGSON_MMI | MIPS_ASE_LOONGSON_CAM | MIPS_ASE_LOONGSON_EXT | MIPS_ASE_LOONGSON_EXT2); c->ases &= ~MIPS_ASE_VZ; /* VZ of Loongson-3A2000/3000 is incomplete */ + change_c0_config6(LOONGSON_CONF6_EXTIMER | LOONGSON_CONF6_INTIMER, + LOONGSON_CONF6_INTIMER); break; case PRID_IMP_LOONGSON_64G: __cpu_name[cpu] = "ICT Loongson-3"; set_elf_platform(cpu, "loongson3a"); set_isa(c, MIPS_CPU_ISA_M64R2); decode_cpucfg(c); + change_c0_config6(LOONGSON_CONF6_EXTIMER | LOONGSON_CONF6_INTIMER, + LOONGSON_CONF6_INTIMER); break; default: panic("Unknown Loongson Processor ID!"); From 9eb18136af9fe4dd688724070f2bfba271bd1542 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 8 Aug 2024 10:15:46 +0100 Subject: [PATCH 130/991] KVM: arm64: vgic: Hold config_lock while tearing down a CPU interface Tearing down a vcpu CPU interface involves freeing the private interrupt array. If we don't hold the lock, we may race against another thread trying to configure it. Yeah, fuzzers do wonderful things... Taking the lock early solves this particular problem. Fixes: 03b3d00a70b5 ("KVM: arm64: vgic: Allocate private interrupts on demand") Reported-by: Alexander Potapenko Tested-by: Alexander Potapenko Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20240808091546.3262111-1-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-init.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 7f68cf58b978fb..41feb858ff9a5e 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -438,14 +438,13 @@ void kvm_vgic_destroy(struct kvm *kvm) unsigned long i; mutex_lock(&kvm->slots_lock); + mutex_lock(&kvm->arch.config_lock); vgic_debug_destroy(kvm); kvm_for_each_vcpu(i, vcpu, kvm) __kvm_vgic_vcpu_destroy(vcpu); - mutex_lock(&kvm->arch.config_lock); - kvm_vgic_dist_destroy(kvm); mutex_unlock(&kvm->arch.config_lock); From cd9aae921ab6b614e56ce690dedfe82e79db9354 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 8 Aug 2024 11:44:07 -0700 Subject: [PATCH 131/991] dt-bindings: display: panel: samsung,atna45dc02: Fix indentation The yaml had indentation errors: ./Documentation/devicetree/bindings/display/panel/samsung,atna33xc20.yaml:21:9: [warning] wrong indentation: expected 10 but found 8 (indentation) ./Documentation/devicetree/bindings/display/panel/samsung,atna33xc20.yaml:23:11: [warning] wrong indentation: expected 12 but found 10 (indentation) Fix them. Reported-by: Rob Herring Closes: https://lore.kernel.org/r/CAL_JsqLRTgQRPcfXy4G9hLoHMd-Uax4_C90BV_OZn4mK+-82kw@mail.gmail.com Fixes: 1c4a057d01f4 ("dt-bindings: display: panel: samsung,atna45dc02: Document ATNA45DC02") Reviewed-by: Rob Clark Signed-off-by: Douglas Anderson Link: https://patchwork.freedesktop.org/patch/msgid/20240808114407.1.I099e8e9e36407a0785d846b953031d40ea71e559@changeid --- .../bindings/display/panel/samsung,atna33xc20.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/devicetree/bindings/display/panel/samsung,atna33xc20.yaml b/Documentation/devicetree/bindings/display/panel/samsung,atna33xc20.yaml index 87c601bcf20af9..032f783eefc450 100644 --- a/Documentation/devicetree/bindings/display/panel/samsung,atna33xc20.yaml +++ b/Documentation/devicetree/bindings/display/panel/samsung,atna33xc20.yaml @@ -18,12 +18,12 @@ properties: # Samsung 13.3" FHD (1920x1080 pixels) eDP AMOLED panel - const: samsung,atna33xc20 - items: - - enum: - # Samsung 14.5" WQXGA+ (2880x1800 pixels) eDP AMOLED panel - - samsung,atna45af01 - # Samsung 14.5" 3K (2944x1840 pixels) eDP AMOLED panel - - samsung,atna45dc02 - - const: samsung,atna33xc20 + - enum: + # Samsung 14.5" WQXGA+ (2880x1800 pixels) eDP AMOLED panel + - samsung,atna45af01 + # Samsung 14.5" 3K (2944x1840 pixels) eDP AMOLED panel + - samsung,atna45dc02 + - const: samsung,atna33xc20 enable-gpios: true port: true From 0c84bde4f37ba27d50e4c70ecacd33fe4a57030d Mon Sep 17 00:00:00 2001 From: Sean Young Date: Thu, 8 Aug 2024 10:35:19 +0200 Subject: [PATCH 132/991] media: Revert "media: dvb-usb: Fix unexpected infinite loop in dvb_usb_read_remote_control()" This reverts commit 2052138b7da52ad5ccaf74f736d00f39a1c9198c. This breaks the TeVii s480 dual DVB-S2 S660. The device has a bulk in endpoint but no corresponding out endpoint, so the device does not pass the "has both receive and send bulk endpoint" test. Seemingly this device does not use dvb_usb_generic_rw() so I have tried removing the generic_bulk_ctrl_endpoint entry, but this resulted in different problems. As we have no explanation yet, revert. $ dmesg | grep -i -e dvb -e dw21 -e usb\ 4 [ 0.999122] usb 1-1: new high-speed USB device number 2 using ehci-pci [ 1.023123] usb 4-1: new high-speed USB device number 2 using ehci-pci [ 1.130247] usb 1-1: New USB device found, idVendor=9022, idProduct=d482, +bcdDevice= 0.01 [ 1.130257] usb 1-1: New USB device strings: Mfr=0, Product=0, SerialNumber=0 [ 1.152323] usb 4-1: New USB device found, idVendor=9022, idProduct=d481, +bcdDevice= 0.01 [ 1.152329] usb 4-1: New USB device strings: Mfr=0, Product=0, SerialNumber=0 [ 6.701033] dvb-usb: found a 'TeVii S480.2 USB' in cold state, will try to +load a firmware [ 6.701178] dvb-usb: downloading firmware from file 'dvb-usb-s660.fw' [ 6.701179] dw2102: start downloading DW210X firmware [ 6.703715] dvb-usb: found a 'Microsoft Xbox One Digital TV Tuner' in cold +state, will try to load a firmware [ 6.703974] dvb-usb: downloading firmware from file 'dvb-usb-dib0700-1.20.fw' [ 6.756432] usb 1-1: USB disconnect, device number 2 [ 6.862119] dvb-usb: found a 'TeVii S480.2 USB' in warm state. [ 6.862194] dvb-usb: TeVii S480.2 USB error while loading driver (-22) [ 6.862209] dvb-usb: found a 'TeVii S480.1 USB' in cold state, will try to +load a firmware [ 6.862244] dvb-usb: downloading firmware from file 'dvb-usb-s660.fw' [ 6.862245] dw2102: start downloading DW210X firmware [ 6.914811] usb 4-1: USB disconnect, device number 2 [ 7.014131] dvb-usb: found a 'TeVii S480.1 USB' in warm state. [ 7.014487] dvb-usb: TeVii S480.1 USB error while loading driver (-22) [ 7.014538] usbcore: registered new interface driver dw2102 Closes: https://lore.kernel.org/stable/20240801165146.38991f60@mir/ Fixes: 2052138b7da5 ("media: dvb-usb: Fix unexpected infinite loop in dvb_usb_read_remote_control()") Reported-by: Stefan Lippers-Hollmann Cc: stable@vger.kernel.org Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/usb/dvb-usb/dvb-usb-init.c | 35 +++--------------------- 1 file changed, 4 insertions(+), 31 deletions(-) diff --git a/drivers/media/usb/dvb-usb/dvb-usb-init.c b/drivers/media/usb/dvb-usb/dvb-usb-init.c index 22d83ac18eb735..fbf58012becdf2 100644 --- a/drivers/media/usb/dvb-usb/dvb-usb-init.c +++ b/drivers/media/usb/dvb-usb/dvb-usb-init.c @@ -23,40 +23,11 @@ static int dvb_usb_force_pid_filter_usage; module_param_named(force_pid_filter_usage, dvb_usb_force_pid_filter_usage, int, 0444); MODULE_PARM_DESC(force_pid_filter_usage, "force all dvb-usb-devices to use a PID filter, if any (default: 0)."); -static int dvb_usb_check_bulk_endpoint(struct dvb_usb_device *d, u8 endpoint) -{ - if (endpoint) { - int ret; - - ret = usb_pipe_type_check(d->udev, usb_sndbulkpipe(d->udev, endpoint)); - if (ret) - return ret; - ret = usb_pipe_type_check(d->udev, usb_rcvbulkpipe(d->udev, endpoint)); - if (ret) - return ret; - } - return 0; -} - -static void dvb_usb_clear_halt(struct dvb_usb_device *d, u8 endpoint) -{ - if (endpoint) { - usb_clear_halt(d->udev, usb_sndbulkpipe(d->udev, endpoint)); - usb_clear_halt(d->udev, usb_rcvbulkpipe(d->udev, endpoint)); - } -} - static int dvb_usb_adapter_init(struct dvb_usb_device *d, short *adapter_nrs) { struct dvb_usb_adapter *adap; int ret, n, o; - ret = dvb_usb_check_bulk_endpoint(d, d->props.generic_bulk_ctrl_endpoint); - if (ret) - return ret; - ret = dvb_usb_check_bulk_endpoint(d, d->props.generic_bulk_ctrl_endpoint_response); - if (ret) - return ret; for (n = 0; n < d->props.num_adapters; n++) { adap = &d->adapter[n]; adap->dev = d; @@ -132,8 +103,10 @@ static int dvb_usb_adapter_init(struct dvb_usb_device *d, short *adapter_nrs) * when reloading the driver w/o replugging the device * sometimes a timeout occurs, this helps */ - dvb_usb_clear_halt(d, d->props.generic_bulk_ctrl_endpoint); - dvb_usb_clear_halt(d, d->props.generic_bulk_ctrl_endpoint_response); + if (d->props.generic_bulk_ctrl_endpoint != 0) { + usb_clear_halt(d->udev, usb_sndbulkpipe(d->udev, d->props.generic_bulk_ctrl_endpoint)); + usb_clear_halt(d->udev, usb_rcvbulkpipe(d->udev, d->props.generic_bulk_ctrl_endpoint)); + } return 0; From 05a3d6e9307250a5911d75308e4363466794ab21 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Thu, 8 Aug 2024 11:57:38 -0400 Subject: [PATCH 133/991] selinux: revert our use of vma_is_initial_heap() Unfortunately it appears that vma_is_initial_heap() is currently broken for applications that do not currently have any heap allocated, e.g. brk == start_brk. The breakage is such that it will cause SELinux to check for the process/execheap permission on memory regions that cross brk/start_brk even when there is no heap. The proper fix would be to correct vma_is_initial_heap(), but as there are multiple callers I am hesitant to unilaterally modify the helper out of concern that I would end up breaking some other subsystem. The mm developers have been made aware of the situation and hopefully they will have a fix at some point in the future, but we need a fix soon so we are simply going to revert our use of vma_is_initial_heap() in favor of our old logic/code which works as expected, even in the face of a zero size heap. We can return to using vma_is_initial_heap() at some point in the future when it is fixed. Cc: stable@vger.kernel.org Reported-by: Marc Reisner Closes: https://lore.kernel.org/all/ZrPmoLKJEf1wiFmM@marcreisner.com Fixes: 68df1baf158f ("selinux: use vma_is_initial_stack() and vma_is_initial_heap()") Signed-off-by: Paul Moore --- security/selinux/hooks.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 55c78c318ccd78..bfa61e005aace2 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3852,7 +3852,17 @@ static int selinux_file_mprotect(struct vm_area_struct *vma, if (default_noexec && (prot & PROT_EXEC) && !(vma->vm_flags & VM_EXEC)) { int rc = 0; - if (vma_is_initial_heap(vma)) { + /* + * We don't use the vma_is_initial_heap() helper as it has + * a history of problems and is currently broken on systems + * where there is no heap, e.g. brk == start_brk. Before + * replacing the conditional below with vma_is_initial_heap(), + * or something similar, please ensure that the logic is the + * same as what we have below or you have tested every possible + * corner case you can think to test. + */ + if (vma->vm_start >= vma->vm_mm->start_brk && + vma->vm_end <= vma->vm_mm->brk) { rc = avc_has_perm(sid, sid, SECCLASS_PROCESS, PROCESS__EXECHEAP, NULL); } else if (!vma->vm_file && (vma_is_initial_stack(vma) || From a018c1b636e79b60149b41151ded7c2606d8606e Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Mon, 5 Aug 2024 08:56:18 +0900 Subject: [PATCH 134/991] ksmbd: override fsids for share path check Sangsoo reported that a DAC denial error occurred when accessing files through the ksmbd thread. This patch override fsids for share path check. Reported-by: Sangsoo Lee Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/mgmt/share_config.c | 15 ++++++++++++--- fs/smb/server/mgmt/share_config.h | 4 +++- fs/smb/server/mgmt/tree_connect.c | 9 +++++---- fs/smb/server/mgmt/tree_connect.h | 4 ++-- fs/smb/server/smb2pdu.c | 2 +- fs/smb/server/smb_common.c | 9 +++++++-- fs/smb/server/smb_common.h | 2 ++ 7 files changed, 32 insertions(+), 13 deletions(-) diff --git a/fs/smb/server/mgmt/share_config.c b/fs/smb/server/mgmt/share_config.c index e0a6b758094fc5..d8d03070ae44b4 100644 --- a/fs/smb/server/mgmt/share_config.c +++ b/fs/smb/server/mgmt/share_config.c @@ -15,6 +15,7 @@ #include "share_config.h" #include "user_config.h" #include "user_session.h" +#include "../connection.h" #include "../transport_ipc.h" #include "../misc.h" @@ -120,12 +121,13 @@ static int parse_veto_list(struct ksmbd_share_config *share, return 0; } -static struct ksmbd_share_config *share_config_request(struct unicode_map *um, +static struct ksmbd_share_config *share_config_request(struct ksmbd_work *work, const char *name) { struct ksmbd_share_config_response *resp; struct ksmbd_share_config *share = NULL; struct ksmbd_share_config *lookup; + struct unicode_map *um = work->conn->um; int ret; resp = ksmbd_ipc_share_config_request(name); @@ -181,7 +183,14 @@ static struct ksmbd_share_config *share_config_request(struct unicode_map *um, KSMBD_SHARE_CONFIG_VETO_LIST(resp), resp->veto_list_sz); if (!ret && share->path) { + if (__ksmbd_override_fsids(work, share)) { + kill_share(share); + share = NULL; + goto out; + } + ret = kern_path(share->path, 0, &share->vfs_path); + ksmbd_revert_fsids(work); if (ret) { ksmbd_debug(SMB, "failed to access '%s'\n", share->path); @@ -214,7 +223,7 @@ static struct ksmbd_share_config *share_config_request(struct unicode_map *um, return share; } -struct ksmbd_share_config *ksmbd_share_config_get(struct unicode_map *um, +struct ksmbd_share_config *ksmbd_share_config_get(struct ksmbd_work *work, const char *name) { struct ksmbd_share_config *share; @@ -227,7 +236,7 @@ struct ksmbd_share_config *ksmbd_share_config_get(struct unicode_map *um, if (share) return share; - return share_config_request(um, name); + return share_config_request(work, name); } bool ksmbd_share_veto_filename(struct ksmbd_share_config *share, diff --git a/fs/smb/server/mgmt/share_config.h b/fs/smb/server/mgmt/share_config.h index 5f591751b92365..d4ac2dd4de2040 100644 --- a/fs/smb/server/mgmt/share_config.h +++ b/fs/smb/server/mgmt/share_config.h @@ -11,6 +11,8 @@ #include #include +struct ksmbd_work; + struct ksmbd_share_config { char *name; char *path; @@ -68,7 +70,7 @@ static inline void ksmbd_share_config_put(struct ksmbd_share_config *share) __ksmbd_share_config_put(share); } -struct ksmbd_share_config *ksmbd_share_config_get(struct unicode_map *um, +struct ksmbd_share_config *ksmbd_share_config_get(struct ksmbd_work *work, const char *name); bool ksmbd_share_veto_filename(struct ksmbd_share_config *share, const char *filename); diff --git a/fs/smb/server/mgmt/tree_connect.c b/fs/smb/server/mgmt/tree_connect.c index d2c81a8a11dda1..94a52a75014a43 100644 --- a/fs/smb/server/mgmt/tree_connect.c +++ b/fs/smb/server/mgmt/tree_connect.c @@ -16,17 +16,18 @@ #include "user_session.h" struct ksmbd_tree_conn_status -ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, - const char *share_name) +ksmbd_tree_conn_connect(struct ksmbd_work *work, const char *share_name) { struct ksmbd_tree_conn_status status = {-ENOENT, NULL}; struct ksmbd_tree_connect_response *resp = NULL; struct ksmbd_share_config *sc; struct ksmbd_tree_connect *tree_conn = NULL; struct sockaddr *peer_addr; + struct ksmbd_conn *conn = work->conn; + struct ksmbd_session *sess = work->sess; int ret; - sc = ksmbd_share_config_get(conn->um, share_name); + sc = ksmbd_share_config_get(work, share_name); if (!sc) return status; @@ -61,7 +62,7 @@ ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, struct ksmbd_share_config *new_sc; ksmbd_share_config_del(sc); - new_sc = ksmbd_share_config_get(conn->um, share_name); + new_sc = ksmbd_share_config_get(work, share_name); if (!new_sc) { pr_err("Failed to update stale share config\n"); status.ret = -ESTALE; diff --git a/fs/smb/server/mgmt/tree_connect.h b/fs/smb/server/mgmt/tree_connect.h index 6377a70b811c89..a42cdd05104114 100644 --- a/fs/smb/server/mgmt/tree_connect.h +++ b/fs/smb/server/mgmt/tree_connect.h @@ -13,6 +13,7 @@ struct ksmbd_share_config; struct ksmbd_user; struct ksmbd_conn; +struct ksmbd_work; enum { TREE_NEW = 0, @@ -50,8 +51,7 @@ static inline int test_tree_conn_flag(struct ksmbd_tree_connect *tree_conn, struct ksmbd_session; struct ksmbd_tree_conn_status -ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, - const char *share_name); +ksmbd_tree_conn_connect(struct ksmbd_work *work, const char *share_name); void ksmbd_tree_connect_put(struct ksmbd_tree_connect *tcon); int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess, diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 37a39ab4ee6543..54154d36ea2f81 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1955,7 +1955,7 @@ int smb2_tree_connect(struct ksmbd_work *work) ksmbd_debug(SMB, "tree connect request for tree %s treename %s\n", name, treename); - status = ksmbd_tree_conn_connect(conn, sess, name); + status = ksmbd_tree_conn_connect(work, name); if (status.ret == KSMBD_TREE_CONN_STATUS_OK) rsp->hdr.Id.SyncId.TreeId = cpu_to_le32(status.tree_conn->id); else diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c index 474dadf6b7b8bc..13818ecb6e1b2f 100644 --- a/fs/smb/server/smb_common.c +++ b/fs/smb/server/smb_common.c @@ -732,10 +732,10 @@ bool is_asterisk(char *p) return p && p[0] == '*'; } -int ksmbd_override_fsids(struct ksmbd_work *work) +int __ksmbd_override_fsids(struct ksmbd_work *work, + struct ksmbd_share_config *share) { struct ksmbd_session *sess = work->sess; - struct ksmbd_share_config *share = work->tcon->share_conf; struct cred *cred; struct group_info *gi; unsigned int uid; @@ -775,6 +775,11 @@ int ksmbd_override_fsids(struct ksmbd_work *work) return 0; } +int ksmbd_override_fsids(struct ksmbd_work *work) +{ + return __ksmbd_override_fsids(work, work->tcon->share_conf); +} + void ksmbd_revert_fsids(struct ksmbd_work *work) { const struct cred *cred; diff --git a/fs/smb/server/smb_common.h b/fs/smb/server/smb_common.h index f1092519c0c288..4a3148b0167f54 100644 --- a/fs/smb/server/smb_common.h +++ b/fs/smb/server/smb_common.h @@ -447,6 +447,8 @@ int ksmbd_extract_shortname(struct ksmbd_conn *conn, int ksmbd_smb_negotiate_common(struct ksmbd_work *work, unsigned int command); int ksmbd_smb_check_shared_mode(struct file *filp, struct ksmbd_file *curr_fp); +int __ksmbd_override_fsids(struct ksmbd_work *work, + struct ksmbd_share_config *share); int ksmbd_override_fsids(struct ksmbd_work *work); void ksmbd_revert_fsids(struct ksmbd_work *work); From f6bd41280a44dcc2e0a25ed72617d25f586974a7 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Mon, 5 Aug 2024 08:57:03 +0900 Subject: [PATCH 135/991] ksmbd: override fsids for smb2_query_info() Sangsoo reported that a DAC denial error occurred when accessing files through the ksmbd thread. This patch override fsids for smb2_query_info(). Reported-by: Sangsoo Lee Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 54154d36ea2f81..2df1354288e68a 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -5596,6 +5596,11 @@ int smb2_query_info(struct ksmbd_work *work) ksmbd_debug(SMB, "GOT query info request\n"); + if (ksmbd_override_fsids(work)) { + rc = -ENOMEM; + goto err_out; + } + switch (req->InfoType) { case SMB2_O_INFO_FILE: ksmbd_debug(SMB, "GOT SMB2_O_INFO_FILE\n"); @@ -5614,6 +5619,7 @@ int smb2_query_info(struct ksmbd_work *work) req->InfoType); rc = -EOPNOTSUPP; } + ksmbd_revert_fsids(work); if (!rc) { rsp->StructureSize = cpu_to_le16(9); @@ -5623,6 +5629,7 @@ int smb2_query_info(struct ksmbd_work *work) le32_to_cpu(rsp->OutputBufferLength)); } +err_out: if (rc < 0) { if (rc == -EACCES) rsp->hdr.Status = STATUS_ACCESS_DENIED; From e5876b088ba03a62124266fa20d00e65533c7269 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Tue, 6 Aug 2024 19:28:05 +0200 Subject: [PATCH 136/991] usbnet: ipheth: race between ipheth_close and error handling ipheth_sndbulk_callback() can submit carrier_work as a part of its error handling. That means that the driver must make sure that the work is cancelled after it has made sure that no more URB can terminate with an error condition. Hence the order of actions in ipheth_close() needs to be inverted. Signed-off-by: Oliver Neukum Signed-off-by: Foster Snowhill Tested-by: Georgi Valkov Signed-off-by: David S. Miller --- drivers/net/usb/ipheth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c index 687d70cfc55635..6eeef10edadad1 100644 --- a/drivers/net/usb/ipheth.c +++ b/drivers/net/usb/ipheth.c @@ -475,8 +475,8 @@ static int ipheth_close(struct net_device *net) { struct ipheth_device *dev = netdev_priv(net); - cancel_delayed_work_sync(&dev->carrier_work); netif_stop_queue(net); + cancel_delayed_work_sync(&dev->carrier_work); return 0; } From 655b46d7a39ac6f049698b27c1568c0f7ff85d1e Mon Sep 17 00:00:00 2001 From: Foster Snowhill Date: Tue, 6 Aug 2024 19:28:06 +0200 Subject: [PATCH 137/991] usbnet: ipheth: remove extraneous rx URB length check Rx URB length was already checked in ipheth_rcvbulk_callback_legacy() and ipheth_rcvbulk_callback_ncm(), depending on the current mode. The check in ipheth_rcvbulk_callback() was thus mostly a duplicate. The only place in ipheth_rcvbulk_callback() where we care about the URB length is for the initial control frame. These frames are always 4 bytes long. This has been checked as far back as iOS 4.2.1 on iPhone 3G. Remove the extraneous URB length check. For control frames, check for the specific 4-byte length instead. Signed-off-by: Foster Snowhill Tested-by: Georgi Valkov Signed-off-by: David S. Miller --- drivers/net/usb/ipheth.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c index 6eeef10edadad1..017255615508f2 100644 --- a/drivers/net/usb/ipheth.c +++ b/drivers/net/usb/ipheth.c @@ -286,11 +286,6 @@ static void ipheth_rcvbulk_callback(struct urb *urb) return; } - if (urb->actual_length <= IPHETH_IP_ALIGN) { - dev->net->stats.rx_length_errors++; - return; - } - /* RX URBs starting with 0x00 0x01 do not encapsulate Ethernet frames, * but rather are control frames. Their purpose is not documented, and * they don't affect driver functionality, okay to drop them. @@ -298,7 +293,8 @@ static void ipheth_rcvbulk_callback(struct urb *urb) * URB received from the bulk IN endpoint. */ if (unlikely - (((char *)urb->transfer_buffer)[0] == 0 && + (urb->actual_length == 4 && + ((char *)urb->transfer_buffer)[0] == 0 && ((char *)urb->transfer_buffer)[1] == 1)) goto rx_submit; From 94d7eeb6c0ef0310992944f0d0296929816a2cb0 Mon Sep 17 00:00:00 2001 From: Foster Snowhill Date: Tue, 6 Aug 2024 19:28:07 +0200 Subject: [PATCH 138/991] usbnet: ipheth: drop RX URBs with no payload On iPhone 15 Pro Max one can observe periodic URBs with no payload on the "bulk in" (RX) endpoint. These don't seem to do anything meaningful. Reproduced on iOS 17.5.1 and 17.6. This behaviour isn't observed on iPhone 11 on the same iOS version. The nature of these zero-length URBs is so far unknown. Drop RX URBs with no payload. Signed-off-by: Foster Snowhill Signed-off-by: David S. Miller --- drivers/net/usb/ipheth.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c index 017255615508f2..f04c7bf796654f 100644 --- a/drivers/net/usb/ipheth.c +++ b/drivers/net/usb/ipheth.c @@ -286,6 +286,12 @@ static void ipheth_rcvbulk_callback(struct urb *urb) return; } + /* iPhone may periodically send URBs with no payload + * on the "bulk in" endpoint. It is safe to ignore them. + */ + if (urb->actual_length == 0) + goto rx_submit; + /* RX URBs starting with 0x00 0x01 do not encapsulate Ethernet frames, * but rather are control frames. Their purpose is not documented, and * they don't affect driver functionality, okay to drop them. From 74efed51e0a4d62f998f806c307778b47fc73395 Mon Sep 17 00:00:00 2001 From: Foster Snowhill Date: Tue, 6 Aug 2024 19:28:08 +0200 Subject: [PATCH 139/991] usbnet: ipheth: do not stop RX on failing RX callback RX callbacks can fail for multiple reasons: * Payload too short * Payload formatted incorrecly (e.g. bad NCM framing) * Lack of memory None of these should cause the driver to seize up. Make such failures non-critical and continue processing further incoming URBs. Signed-off-by: Foster Snowhill Signed-off-by: David S. Miller --- drivers/net/usb/ipheth.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c index f04c7bf796654f..cdc72559790a68 100644 --- a/drivers/net/usb/ipheth.c +++ b/drivers/net/usb/ipheth.c @@ -308,7 +308,6 @@ static void ipheth_rcvbulk_callback(struct urb *urb) if (retval != 0) { dev_err(&dev->intf->dev, "%s: callback retval: %d\n", __func__, retval); - return; } rx_submit: From 67927a1b255d883881be9467508e0af9a5e0be9d Mon Sep 17 00:00:00 2001 From: Foster Snowhill Date: Tue, 6 Aug 2024 19:28:09 +0200 Subject: [PATCH 140/991] usbnet: ipheth: fix carrier detection in modes 1 and 4 Apart from the standard "configurations", "interfaces" and "alternate interface settings" in USB, iOS devices also have a notion of "modes". In different modes, the device exposes a different set of available configurations. Depending on the iOS version, and depending on the current mode, the length and contents of the carrier state control message differs: * 1 byte (seen on iOS 4.2.1, 8.4): * 03: carrier off (mode 0) * 04: carrier on (mode 0) * 3 bytes (seen on iOS 10.3.4, 15.7.6): * 03 03 03: carrier off (mode 0) * 04 04 03: carrier on (mode 0) * 4 bytes (seen on iOS 16.5, 17.6): * 03 03 03 00: carrier off (mode 0) * 04 03 03 00: carrier off (mode 1) * 06 03 03 00: carrier off (mode 4) * 04 04 03 04: carrier on (mode 0 and 1) * 06 04 03 04: carrier on (mode 4) Before this change, the driver always used the first byte of the response to determine carrier state. From this larger sample, the first byte seems to indicate the number of available USB configurations in the current mode (with the exception of the default mode 0), and in some cases (namely mode 1 and 4) does not correlate with the carrier state. Previous logic erroneously counted `04 03 03 00` as "carrier on" and `06 04 03 04` as "carrier off" on iOS versions that support mode 1 and mode 4 respectively. Only modes 0, 1 and 4 expose the USB Ethernet interfaces necessary for the ipheth driver. Check the second byte of the control message where possible, and fall back to checking the first byte on older iOS versions. Signed-off-by: Foster Snowhill Tested-by: Georgi Valkov Signed-off-by: David S. Miller --- drivers/net/usb/ipheth.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c index cdc72559790a68..46afb95ffabe3b 100644 --- a/drivers/net/usb/ipheth.c +++ b/drivers/net/usb/ipheth.c @@ -355,13 +355,14 @@ static int ipheth_carrier_set(struct ipheth_device *dev) 0x02, /* index */ dev->ctrl_buf, IPHETH_CTRL_BUF_SIZE, IPHETH_CTRL_TIMEOUT); - if (retval < 0) { + if (retval <= 0) { dev_err(&dev->intf->dev, "%s: usb_control_msg: %d\n", __func__, retval); return retval; } - if (dev->ctrl_buf[0] == IPHETH_CARRIER_ON) { + if ((retval == 1 && dev->ctrl_buf[0] == IPHETH_CARRIER_ON) || + (retval >= 2 && dev->ctrl_buf[1] == IPHETH_CARRIER_ON)) { netif_carrier_on(dev->net); if (dev->tx_urb->status != -EINPROGRESS) netif_wake_queue(dev->net); From d0949cd44a62c4c41b30ea7ae94d8c887f586882 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 8 Aug 2024 23:57:30 -0400 Subject: [PATCH 141/991] tracing: Return from tracing_buffers_read() if the file has been closed When running the following: # cd /sys/kernel/tracing/ # echo 1 > events/sched/sched_waking/enable # echo 1 > events/sched/sched_switch/enable # echo 0 > tracing_on # dd if=per_cpu/cpu0/trace_pipe_raw of=/tmp/raw0.dat The dd task would get stuck in an infinite loop in the kernel. What would happen is the following: When ring_buffer_read_page() returns -1 (no data) then a check is made to see if the buffer is empty (as happens when the page is not full), it will call wait_on_pipe() to wait until the ring buffer has data. When it is it will try again to read data (unless O_NONBLOCK is set). The issue happens when there's a reader and the file descriptor is closed. The wait_on_pipe() will return when that is the case. But this loop will continue to try again and wait_on_pipe() will again return immediately and the loop will continue and never stop. Simply check if the file was closed before looping and exit out if it is. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20240808235730.78bf63e5@rorschach.local.home Fixes: 2aa043a55b9a7 ("tracing/ring-buffer: Fix wait_on_pipe() race") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 10cd38bce2f1c4..ebe7ce2f5f4a50 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7956,7 +7956,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, trace_access_unlock(iter->cpu_file); if (ret < 0) { - if (trace_empty(iter)) { + if (trace_empty(iter) && !iter->closed) { if ((filp->f_flags & O_NONBLOCK)) return -EAGAIN; From 90574d2a675947858b47008df8d07f75ea50d0d0 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 9 Aug 2024 15:34:30 +0300 Subject: [PATCH 142/991] rtla/osnoise: Prevent NULL dereference in error handling If the "tool->data" allocation fails then there is no need to call osnoise_free_top() and, in fact, doing so will lead to a NULL dereference. Cc: stable@vger.kernel.org Cc: John Kacur Cc: "Luis Claudio R. Goncalves" Cc: Clark Williams Fixes: 1eceb2fc2ca5 ("rtla/osnoise: Add osnoise top mode") Link: https://lore.kernel.org/f964ed1f-64d2-4fde-ad3e-708331f8f358@stanley.mountain Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt (Google) --- tools/tracing/rtla/src/osnoise_top.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c index f594a44df840e4..2f756628613dd8 100644 --- a/tools/tracing/rtla/src/osnoise_top.c +++ b/tools/tracing/rtla/src/osnoise_top.c @@ -651,8 +651,10 @@ struct osnoise_tool *osnoise_init_top(struct osnoise_top_params *params) return NULL; tool->data = osnoise_alloc_top(nr_cpus); - if (!tool->data) - goto out_err; + if (!tool->data) { + osnoise_destroy_tool(tool); + return NULL; + } tool->params = params; @@ -660,11 +662,6 @@ struct osnoise_tool *osnoise_init_top(struct osnoise_top_params *params) osnoise_top_handler, NULL); return tool; - -out_err: - osnoise_free_top(tool->data); - osnoise_destroy_tool(tool); - return NULL; } static int stop_tracing; From d5240fa65db071909e9d1d5adcc5fd1abc8e96fe Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Fri, 9 Aug 2024 11:11:55 +0800 Subject: [PATCH 143/991] nvdimm/pmem: Set dax flag for all 'PFN_MAP' cases The dax is only supported on pfn type pmem devices since commit f467fee48da4 ("block: move the dax flag to queue_limits"). Trying to mount DAX filesystem fails with this error: mount: : wrong fs type, bad option, bad superblock on /dev/pmem7, missing codepage or helper program, or other error. dmesg(1) may have more information after failed mount system call. dmesg: EXT4-fs (pmem7): DAX unsupported by block device. Fix the problem by adding dax flag setting for the missed case. Fixes: f467fee48da4 ("block: move the dax flag to queue_limits") Signed-off-by: Zhihao Cheng Reviewed-by: Christoph Hellwig Reviewed-by: Dave Jiang Reviewed-by: Ira Weiny Tested-by: Ira Weiny Tested-by: Alison Schofield Link: https://patch.msgid.link/20240809031155.2837271-1-chengzhihao1@huawei.com Signed-off-by: Ira Weiny --- drivers/nvdimm/pmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 1ae8b2351654e7..210fb77f51ba02 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -498,7 +498,7 @@ static int pmem_attach_disk(struct device *dev, } if (fua) lim.features |= BLK_FEAT_FUA; - if (is_nd_pfn(dev)) + if (is_nd_pfn(dev) || pmem_should_map_pages(dev)) lim.features |= BLK_FEAT_DAX; if (!devm_request_mem_region(dev, res->start, resource_size(res), From 869b5016e94eced02f2cf99bf53c69b49adcee32 Mon Sep 17 00:00:00 2001 From: Zehui Xu Date: Wed, 31 Jul 2024 16:43:46 +0300 Subject: [PATCH 144/991] kbuild: rust: skip -fmin-function-alignment in bindgen flags GCC 14 recently added -fmin-function-alignment option and the root Makefile uses it to replace -falign-functions when available. However, this flag can cause issues when passed to the Rust Makefile and affect the bindgen process. Bindgen relies on libclang to parse C code, and currently does not support the -fmin-function-alignment flag, leading to compilation failures when GCC 14 is used. This patch addresses the issue by adding -fmin-function-alignment to the bindgen_skip_c_flags in rust/Makefile. This prevents the flag from causing compilation issues. [ Matthew and Gary confirm function alignment should not change the ABI in a way that bindgen would care about, thus we did not need the extra logic for bindgen from v2. - Miguel ] Link: https://lore.kernel.org/linux-kbuild/20240222133500.16991-1-petr.pavlu@suse.com/ Signed-off-by: Zehui Xu Reviewed-by: Alice Ryhl Reviewed-by: Neal Gompa Reviewed-by: Gary Guo Link: https://lore.kernel.org/r/20240731134346.10630-1-zehuixu@whu.edu.cn [ Reworded title. - Miguel ] Signed-off-by: Miguel Ojeda --- rust/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/Makefile b/rust/Makefile index 6c0644b6090c7d..c41bae7ca8a322 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -227,7 +227,7 @@ bindgen_skip_c_flags := -mno-fp-ret-in-387 -mpreferred-stack-boundary=% \ -fno-reorder-blocks -fno-allow-store-data-races -fasan-shadow-offset=% \ -fzero-call-used-regs=% -fno-stack-clash-protection \ -fno-inline-functions-called-once -fsanitize=bounds-strict \ - -fstrict-flex-arrays=% \ + -fstrict-flex-arrays=% -fmin-function-alignment=% \ --param=% --param asan-% # Derived from `scripts/Makefile.clang`. From 02dfd63afe65f7bacad543ba2b10f77083ae7929 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 6 Aug 2024 17:06:19 +0200 Subject: [PATCH 145/991] rust: add intrinsics to fix `-Os` builds Alice reported [1] that an arm64 build failed with: ld.lld: error: undefined symbol: __extendsfdf2 >>> referenced by core.a6f5fc5794e7b7b3-cgu.0 >>> rust/core.o:(::midpoint) in archive vmlinux.a >>> referenced by core.a6f5fc5794e7b7b3-cgu.0 >>> rust/core.o:(::midpoint) in archive vmlinux.a ld.lld: error: undefined symbol: __truncdfsf2 >>> referenced by core.a6f5fc5794e7b7b3-cgu.0 >>> rust/core.o:(::midpoint) in archive vmlinux.a Rust 1.80.0 or later together with `CONFIG_CC_OPTIMIZE_FOR_SIZE=y` is what triggers it. In addition, x86_64 builds also fail the same way. Similarly, compiling with Rust 1.82.0 (currently in nightly) makes another one appear, possibly due to the LLVM 19 upgrade there: ld.lld: error: undefined symbol: __eqdf2 >>> referenced by core.20495ea57a9f069d-cgu.0 >>> rust/core.o:(::next_up) in archive vmlinux.a >>> referenced by core.20495ea57a9f069d-cgu.0 >>> rust/core.o:(::next_down) in archive vmlinux.a Gary adds [1]: > Usually the fix on rustc side is to mark those functions as `#[inline]` > > All of {midpoint,next_up,next_down} are indeed unstable functions not > marked as inline... Fix all those by adding those intrinsics to our usual workaround. [ Trevor quickly submitted a fix to upstream Rust [2] that has already been merged, to be released in Rust 1.82.0 (2024-10-17). - Miguel ] Cc: Gary Guo Reported-by: Alice Ryhl Closes: https://rust-for-linux.zulipchat.com/#narrow/stream/x/topic/x/near/455637364 [1] Reviewed-by: Trevor Gross Tested-by: Alice Ryhl Tested-by: Boqun Feng Reviewed-by: Gary Guo Link: https://github.com/rust-lang/rust/pull/128749 [2] Link: https://lore.kernel.org/r/20240806150619.192882-1-ojeda@kernel.org [ Shortened Zulip link. - Miguel ] Signed-off-by: Miguel Ojeda --- rust/Makefile | 4 ++-- rust/compiler_builtins.rs | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/rust/Makefile b/rust/Makefile index c41bae7ca8a322..8de3ebba95512c 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -354,8 +354,8 @@ rust-analyzer: $(if $(KBUILD_EXTMOD),$(extmod_prefix),$(objtree))/rust-project.json redirect-intrinsics = \ - __addsf3 __eqsf2 __gesf2 __lesf2 __ltsf2 __mulsf3 __nesf2 __unordsf2 \ - __adddf3 __ledf2 __ltdf2 __muldf3 __unorddf2 \ + __addsf3 __eqsf2 __extendsfdf2 __gesf2 __lesf2 __ltsf2 __mulsf3 __nesf2 __truncdfsf2 __unordsf2 \ + __adddf3 __eqdf2 __ledf2 __ltdf2 __muldf3 __unorddf2 \ __muloti4 __multi3 \ __udivmodti4 __udivti3 __umodti3 diff --git a/rust/compiler_builtins.rs b/rust/compiler_builtins.rs index bba2922c6ef77f..f14b8d7caf8996 100644 --- a/rust/compiler_builtins.rs +++ b/rust/compiler_builtins.rs @@ -40,16 +40,19 @@ macro_rules! define_panicking_intrinsics( define_panicking_intrinsics!("`f32` should not be used", { __addsf3, __eqsf2, + __extendsfdf2, __gesf2, __lesf2, __ltsf2, __mulsf3, __nesf2, + __truncdfsf2, __unordsf2, }); define_panicking_intrinsics!("`f64` should not be used", { __adddf3, + __eqdf2, __ledf2, __ltdf2, __muldf3, From d734422b7dd7d033fc02e421924e70dabf665b4c Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Wed, 7 Aug 2024 01:35:59 +0200 Subject: [PATCH 146/991] kbuild: rust-analyzer: mark `rust_is_available.sh` invocation as recursive When calling the `rust_is_available.sh` script, we need to make the jobserver available to it, as commit ecab4115c44c ("kbuild: mark `rustc` (and others) invocations as recursive") explains and did for the others. Otherwise, we get a warning from `rustc` when calling `make rust-analyzer` with parallel jobs, e.g. `-j8`. Using several jobs for that target does not really matter, but developers may call `make` with jobs enabled in all cases. Thus fix it. Fixes: 6dc9d9ca9a72 ("kbuild: rust-analyzer: better error handling") Reviewed-by: Alice Ryhl Link: https://lore.kernel.org/r/20240806233559.246705-1-ojeda@kernel.org [ Reworded to add a couple more details mentioned in the list. - Miguel ] Signed-off-by: Miguel Ojeda --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 8ad55d6e7b60f2..8ed76eb9d438a0 100644 --- a/Makefile +++ b/Makefile @@ -1963,7 +1963,7 @@ tags TAGS cscope gtags: FORCE # Protocol). PHONY += rust-analyzer rust-analyzer: - $(Q)$(CONFIG_SHELL) $(srctree)/scripts/rust_is_available.sh + +$(Q)$(CONFIG_SHELL) $(srctree)/scripts/rust_is_available.sh $(Q)$(MAKE) $(build)=rust $@ # Script to generate missing namespace dependencies From 0eba65f0310d3c7d5516c7fd4c172d0bfa8b285b Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 6 Aug 2024 16:45:58 +0200 Subject: [PATCH 147/991] rust: x86: remove `-3dnow{,a}` from target features LLVM 19 is dropping support for 3DNow! in commit f0eb5587ceeb ("Remove support for 3DNow!, both intrinsics and builtins. (#96246)"): Remove support for 3DNow!, both intrinsics and builtins. (#96246) This set of instructions was only supported by AMD chips starting in the K6-2 (introduced 1998), and before the "Bulldozer" family (2011). They were never much used, as they were effectively superseded by the more-widely-implemented SSE (first implemented on the AMD side in Athlon XP in 2001). This is being done as a predecessor towards general removal of MMX register usage. Since there is almost no usage of the 3DNow! intrinsics, and no modern hardware even implements them, simple removal seems like the best option. Thus we should avoid passing these to the backend, since otherwise we get a diagnostic about it: '-3dnow' is not a recognized feature for this target (ignoring feature) '-3dnowa' is not a recognized feature for this target (ignoring feature) We could try to disable them only up to LLVM 19 (not the C side one, but the one used by `rustc`, which may be built with a range of LLVMs). However, to avoid more complexity, we can likely just remove them altogether. According to Nikita [2]: > I don't think it's needed because LLVM should not generate 3dnow > instructions unless specifically asked to, using intrinsics that > Rust does not provide in the first place. Thus do so, like Rust did for one of their builtin targets [3]. For those curious: Clang will warn only about trying to enable them (`-m3dnow{,a}`), but not about disabling them (`-mno-3dnow{,a}`), so there is no change needed there. Cc: Nikita Popov Cc: Nathan Chancellor Cc: x86@kernel.org Link: https://github.com/llvm/llvm-project/commit/f0eb5587ceeb641445b64cb264c822b4751de04a [1] Link: https://github.com/rust-lang/rust/pull/127864#issuecomment-2235898760 [2] Link: https://github.com/rust-lang/rust/pull/127864 [3] Closes: https://github.com/Rust-for-Linux/linux/issues/1094 Tested-by: Benno Lossin Tested-by: Alice Ryhl Link: https://lore.kernel.org/r/20240806144558.114461-1-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- scripts/generate_rust_target.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/generate_rust_target.rs b/scripts/generate_rust_target.rs index 87f34925eb7b71..404edf7587e088 100644 --- a/scripts/generate_rust_target.rs +++ b/scripts/generate_rust_target.rs @@ -162,7 +162,7 @@ fn main() { "data-layout", "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", ); - let mut features = "-3dnow,-3dnowa,-mmx,+soft-float".to_string(); + let mut features = "-mmx,+soft-float".to_string(); if cfg.has("MITIGATION_RETPOLINE") { features += ",+retpoline-external-thunk"; } @@ -179,7 +179,7 @@ fn main() { "data-layout", "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:128-f64:32:64-f80:32-n8:16:32-S128", ); - let mut features = "-3dnow,-3dnowa,-mmx,+soft-float".to_string(); + let mut features = "-mmx,+soft-float".to_string(); if cfg.has("MITIGATION_RETPOLINE") { features += ",+retpoline-external-thunk"; } From 8c251c5ab1b7cd204231e4ee936bfe078a33f234 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Fri, 9 Aug 2024 08:27:49 +0000 Subject: [PATCH 148/991] cxl/pci: Get AER capability address from RCRB only for RCH dport cxl_setup_parent_dport() needs to get RCH dport AER capability address from RCRB to disable AER interrupt. The function does not check if dport is RCH dport, it will get a wrong pci_host_bridge structure by dport_dev in VH case because dport_dev points to a pci device(RP or switch DSP) rather than a pci host bridge device. Fixes: f05fd10d138d ("cxl/pci: Add RCH downstream port AER register discovery") Signed-off-by: Li Ming Reviewed-by: Dan Williams Reviewed-by: Ira Weiny Tested-by: Ira Weiny Tested-by: Alison Schofield Link: https://patch.msgid.link/20240809082750.3015641-2-ming4.li@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/pci.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index a663e7566c480d..51132a575b2766 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -834,11 +834,13 @@ static void cxl_disable_rch_root_ints(struct cxl_dport *dport) void cxl_setup_parent_dport(struct device *host, struct cxl_dport *dport) { struct device *dport_dev = dport->dport_dev; - struct pci_host_bridge *host_bridge; - host_bridge = to_pci_host_bridge(dport_dev); - if (host_bridge->native_aer) - dport->rcrb.aer_cap = cxl_rcrb_to_aer(dport_dev, dport->rcrb.base); + if (dport->rch) { + struct pci_host_bridge *host_bridge = to_pci_host_bridge(dport_dev); + + if (host_bridge->native_aer) + dport->rcrb.aer_cap = cxl_rcrb_to_aer(dport_dev, dport->rcrb.base); + } dport->reg_map.host = host; cxl_dport_map_regs(dport); From 2c402bd2e85b44dc00ef85b5c0e217de684b5372 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Fri, 9 Aug 2024 08:27:50 +0000 Subject: [PATCH 149/991] cxl/test: Skip cxl_setup_parent_dport() for emulated dports The cxl_test unit test environment on qemu always hits below call trace with KASAN enabled: BUG: KASAN: slab-out-of-bounds in cxl_setup_parent_dport+0x480/0x530 [cxl_core] Read of size 1 at addr ff110000676014f8 by task (udev-worker)/676[ 24.424403] CPU: 2 PID: 676 Comm: (udev-worker) Tainted: G O N 6.10.0-qemucxl #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS edk2-20240214-2.el9 02/14/2024 Call Trace: dump_stack_lvl+0xea/0x150 print_report+0xce/0x610 ? kasan_complete_mode_report_info+0x40/0x200 kasan_report+0xcc/0x110 __asan_report_load1_noabort+0x18/0x20 cxl_setup_parent_dport+0x480/0x530 [cxl_core] cxl_mem_probe+0x49b/0xaa0 [cxl_mem] cxl_test module models a CXL topology for testing, it creates some emulated dports with platform devices in the CXL topology, so the dport_dev of an emulated dport points to a platform device rather than a pci device or a pci host bridge in the case. Currently, cxl_setup_parent_dport() is used to set up RAS and AER capability on the dport connected to the CXL memory device, but cxl_test does not support RAS or AER functionality yet, so the fix is implementing a __wrap_cxl_setup_parent_dport() to filter out all emulated dports, guarantees only real dports can be handled by cxl_setup_parent_dport(). Fixes: f05fd10d138d ("cxl/pci: Add RCH downstream port AER register discovery") Reported-by: Pengfei Xu Closes: https://lore.kernel.org/linux-cxl/ZrHTBp2O+HtUe6kt@xpf.sh.intel.com/T/#t Signed-off-by: Li Ming Reviewed-by: Dan Williams Reviewed-by: Ira Weiny Reviewed-by: Alison Schofield Tested-by: Ira Weiny Tested-by: Alison Schofield Link: https://patch.msgid.link/20240809082750.3015641-3-ming4.li@intel.com Signed-off-by: Dave Jiang --- tools/testing/cxl/Kbuild | 1 + tools/testing/cxl/test/mock.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 030b388800f051..3d1ca9e38b1fa9 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -14,6 +14,7 @@ ldflags-y += --wrap=cxl_dvsec_rr_decode ldflags-y += --wrap=devm_cxl_add_rch_dport ldflags-y += --wrap=cxl_rcd_component_reg_phys ldflags-y += --wrap=cxl_endpoint_parse_cdat +ldflags-y += --wrap=cxl_setup_parent_dport DRIVERS := ../../../drivers CXL_SRC := $(DRIVERS)/cxl diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c index 6f737941dc0e16..d619672faa497a 100644 --- a/tools/testing/cxl/test/mock.c +++ b/tools/testing/cxl/test/mock.c @@ -299,6 +299,18 @@ void __wrap_cxl_endpoint_parse_cdat(struct cxl_port *port) } EXPORT_SYMBOL_NS_GPL(__wrap_cxl_endpoint_parse_cdat, CXL); +void __wrap_cxl_setup_parent_dport(struct device *host, struct cxl_dport *dport) +{ + int index; + struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); + + if (!ops || !ops->is_mock_port(dport->dport_dev)) + cxl_setup_parent_dport(host, dport); + + put_cxl_mock_ops(index); +} +EXPORT_SYMBOL_NS_GPL(__wrap_cxl_setup_parent_dport, CXL); + MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS(ACPI); MODULE_IMPORT_NS(CXL); From 4bbe6002931954bbe82b25f25990b987b0392e18 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 18 Jul 2024 16:38:07 -0300 Subject: [PATCH 150/991] perf daemon: Fix the build on 32-bit architectures Noticed with: 1 6.22 debian:experimental-x-mipsel : FAIL gcc version 13.2.0 (Debian 13.2.0-25) builtin-daemon.c: In function 'cmd_session_list': builtin-daemon.c:691:35: error: format '%lu' expects argument of type 'long unsigned int', but argument 4 has type 'time_t' {aka 'long long int'} [-Werror=format=] Use inttypes.h's PRIu64 to deal with that. Signed-off-by: Arnaldo Carvalho de Melo Link: https://lore.kernel.org/r/ZplvH21aQ8pzmza_@x1 Signed-off-by: Namhyung Kim --- tools/perf/builtin-daemon.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-daemon.c b/tools/perf/builtin-daemon.c index de76bbc50bfbcb..5c9335fff2d396 100644 --- a/tools/perf/builtin-daemon.c +++ b/tools/perf/builtin-daemon.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include #include @@ -688,7 +689,7 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out) /* lock */ csv_sep, daemon->base, "lock"); - fprintf(out, "%c%lu", + fprintf(out, "%c%" PRIu64, /* session up time */ csv_sep, (curr - daemon->start) / 60); @@ -700,7 +701,7 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out) daemon->base, SESSION_OUTPUT); fprintf(out, " lock: %s/lock\n", daemon->base); - fprintf(out, " up: %lu minutes\n", + fprintf(out, " up: %" PRIu64 " minutes\n", (curr - daemon->start) / 60); } } @@ -727,7 +728,7 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out) /* session ack */ csv_sep, session->base, SESSION_ACK); - fprintf(out, "%c%lu", + fprintf(out, "%c%" PRIu64, /* session up time */ csv_sep, (curr - session->start) / 60); @@ -745,7 +746,7 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out) session->base, SESSION_CONTROL); fprintf(out, " ack: %s/%s\n", session->base, SESSION_ACK); - fprintf(out, " up: %lu minutes\n", + fprintf(out, " up: %" PRIu64 " minutes\n", (curr - session->start) / 60); } } From 3eb3cd5992f7a0c37edc8d05b4c38c98758d8671 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 7 Aug 2024 12:51:23 -0700 Subject: [PATCH 151/991] binfmt_flat: Fix corruption when not offsetting data start Commit 04d82a6d0881 ("binfmt_flat: allow not offsetting data start") introduced a RISC-V specific variant of the FLAT format which does not allocate any space for the (obsolete) array of shared library pointers. However, it did not disable the code which initializes the array, resulting in the corruption of sizeof(long) bytes before the DATA segment, generally the end of the TEXT segment. Introduce MAX_SHARED_LIBS_UPDATE which depends on the state of CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET to guard the initialization of the shared library pointer region so that it will only be initialized if space is reserved for it. Fixes: 04d82a6d0881 ("binfmt_flat: allow not offsetting data start") Co-developed-by: Stefan O'Rear Signed-off-by: Stefan O'Rear Reviewed-by: Damien Le Moal Acked-by: Greg Ungerer Link: https://lore.kernel.org/r/20240807195119.it.782-kees@kernel.org Signed-off-by: Kees Cook --- fs/binfmt_flat.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index c26545d71d39a3..cd6d5bbb4b9df5 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -72,8 +72,10 @@ #ifdef CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET #define DATA_START_OFFSET_WORDS (0) +#define MAX_SHARED_LIBS_UPDATE (0) #else #define DATA_START_OFFSET_WORDS (MAX_SHARED_LIBS) +#define MAX_SHARED_LIBS_UPDATE (MAX_SHARED_LIBS) #endif struct lib_info { @@ -880,7 +882,7 @@ static int load_flat_binary(struct linux_binprm *bprm) return res; /* Update data segment pointers for all libraries */ - for (i = 0; i < MAX_SHARED_LIBS; i++) { + for (i = 0; i < MAX_SHARED_LIBS_UPDATE; i++) { if (!libinfo.lib_list[i].loaded) continue; for (j = 0; j < MAX_SHARED_LIBS; j++) { From 3a3be7ff9224f424e485287b54be00d2c6bd9c40 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Aug 2024 13:24:55 +0000 Subject: [PATCH 152/991] gtp: pull network headers in gtp_dev_xmit() syzbot/KMSAN reported use of uninit-value in get_dev_xmit() [1] We must make sure the IPv4 or Ipv6 header is pulled in skb->head before accessing fields in them. Use pskb_inet_may_pull() to fix this issue. [1] BUG: KMSAN: uninit-value in ipv6_pdp_find drivers/net/gtp.c:220 [inline] BUG: KMSAN: uninit-value in gtp_build_skb_ip6 drivers/net/gtp.c:1229 [inline] BUG: KMSAN: uninit-value in gtp_dev_xmit+0x1424/0x2540 drivers/net/gtp.c:1281 ipv6_pdp_find drivers/net/gtp.c:220 [inline] gtp_build_skb_ip6 drivers/net/gtp.c:1229 [inline] gtp_dev_xmit+0x1424/0x2540 drivers/net/gtp.c:1281 __netdev_start_xmit include/linux/netdevice.h:4913 [inline] netdev_start_xmit include/linux/netdevice.h:4922 [inline] xmit_one net/core/dev.c:3580 [inline] dev_hard_start_xmit+0x247/0xa20 net/core/dev.c:3596 __dev_queue_xmit+0x358c/0x5610 net/core/dev.c:4423 dev_queue_xmit include/linux/netdevice.h:3105 [inline] packet_xmit+0x9c/0x6c0 net/packet/af_packet.c:276 packet_snd net/packet/af_packet.c:3145 [inline] packet_sendmsg+0x90e3/0xa3a0 net/packet/af_packet.c:3177 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x30f/0x380 net/socket.c:745 __sys_sendto+0x685/0x830 net/socket.c:2204 __do_sys_sendto net/socket.c:2216 [inline] __se_sys_sendto net/socket.c:2212 [inline] __x64_sys_sendto+0x125/0x1d0 net/socket.c:2212 x64_sys_call+0x3799/0x3c10 arch/x86/include/generated/asm/syscalls_64.h:45 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Uninit was created at: slab_post_alloc_hook mm/slub.c:3994 [inline] slab_alloc_node mm/slub.c:4037 [inline] kmem_cache_alloc_node_noprof+0x6bf/0xb80 mm/slub.c:4080 kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:583 __alloc_skb+0x363/0x7b0 net/core/skbuff.c:674 alloc_skb include/linux/skbuff.h:1320 [inline] alloc_skb_with_frags+0xc8/0xbf0 net/core/skbuff.c:6526 sock_alloc_send_pskb+0xa81/0xbf0 net/core/sock.c:2815 packet_alloc_skb net/packet/af_packet.c:2994 [inline] packet_snd net/packet/af_packet.c:3088 [inline] packet_sendmsg+0x749c/0xa3a0 net/packet/af_packet.c:3177 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x30f/0x380 net/socket.c:745 __sys_sendto+0x685/0x830 net/socket.c:2204 __do_sys_sendto net/socket.c:2216 [inline] __se_sys_sendto net/socket.c:2212 [inline] __x64_sys_sendto+0x125/0x1d0 net/socket.c:2212 x64_sys_call+0x3799/0x3c10 arch/x86/include/generated/asm/syscalls_64.h:45 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f CPU: 0 UID: 0 PID: 7115 Comm: syz.1.515 Not tainted 6.11.0-rc1-syzkaller-00043-g94ede2a3e913 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 06/27/2024 Fixes: 999cb275c807 ("gtp: add IPv6 support") Fixes: 459aa660eb1d ("gtp: add initial driver for datapath of GPRS Tunneling Protocol (GTP-U)") Signed-off-by: Eric Dumazet Cc: Harald Welte Reviewed-by: Pablo Neira Ayuso Link: https://patch.msgid.link/20240808132455.3413916-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/gtp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 427b91aca50d3a..0696faf60013e0 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -1269,6 +1269,9 @@ static netdev_tx_t gtp_dev_xmit(struct sk_buff *skb, struct net_device *dev) if (skb_cow_head(skb, dev->needed_headroom)) goto tx_err; + if (!pskb_inet_may_pull(skb)) + goto tx_err; + skb_reset_inner_headers(skb); /* PDP context lookups in gtp_build_skb_*() need rcu read-side lock. */ From 2b2bc3bab158b7e036508742b16cd8a3c2f59a12 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 8 Aug 2024 11:56:21 +0200 Subject: [PATCH 153/991] net: Make USO depend on CSUM offload UDP segmentation offload inherently depends on checksum offload. It should not be possible to disable checksum offload while leaving USO enabled. Enforce this dependency in code. There is a single tx-udp-segmentation feature flag to indicate support for both IPv4/6, hence the devices wishing to support USO must offer checksum offload for both IP versions. Fixes: 10154dbded6d ("udp: Allow GSO transmit from devices with no checksum offload") Suggested-by: Willem de Bruijn Signed-off-by: Jakub Sitnicki Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20240808-udp-gso-egress-from-tunnel-v4-1-f5c5b4149ab9@cloudflare.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 751d9b70e6ad76..f66e6140788328 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9912,6 +9912,15 @@ static void netdev_sync_lower_features(struct net_device *upper, } } +static bool netdev_has_ip_or_hw_csum(netdev_features_t features) +{ + netdev_features_t ip_csum_mask = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; + bool ip_csum = (features & ip_csum_mask) == ip_csum_mask; + bool hw_csum = features & NETIF_F_HW_CSUM; + + return ip_csum || hw_csum; +} + static netdev_features_t netdev_fix_features(struct net_device *dev, netdev_features_t features) { @@ -9993,15 +10002,9 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~NETIF_F_LRO; } - if (features & NETIF_F_HW_TLS_TX) { - bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) == - (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); - bool hw_csum = features & NETIF_F_HW_CSUM; - - if (!ip_csum && !hw_csum) { - netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); - features &= ~NETIF_F_HW_TLS_TX; - } + if ((features & NETIF_F_HW_TLS_TX) && !netdev_has_ip_or_hw_csum(features)) { + netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); + features &= ~NETIF_F_HW_TLS_TX; } if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) { @@ -10009,6 +10012,11 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~NETIF_F_HW_TLS_RX; } + if ((features & NETIF_F_GSO_UDP_L4) && !netdev_has_ip_or_hw_csum(features)) { + netdev_dbg(dev, "Dropping USO feature since no CSUM feature.\n"); + features &= ~NETIF_F_GSO_UDP_L4; + } + return features; } From 30b03f2a0592eee1267298298eac9dd655f55ab2 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 8 Aug 2024 11:56:22 +0200 Subject: [PATCH 154/991] udp: Fall back to software USO if IPv6 extension headers are present In commit 10154dbded6d ("udp: Allow GSO transmit from devices with no checksum offload") we have intentionally allowed UDP GSO packets marked CHECKSUM_NONE to pass to the GSO stack, so that they can be segmented and checksummed by a software fallback when the egress device lacks these features. What was not taken into consideration is that a CHECKSUM_NONE skb can be handed over to the GSO stack also when the egress device advertises the tx-udp-segmentation / NETIF_F_GSO_UDP_L4 feature. This will happen when there are IPv6 extension headers present, which we check for in __ip6_append_data(). Syzbot has discovered this scenario, producing a warning as below: ip6tnl0: caps=(0x00000006401d7869, 0x00000006401d7869) WARNING: CPU: 0 PID: 5112 at net/core/dev.c:3293 skb_warn_bad_offload+0x166/0x1a0 net/core/dev.c:3291 Modules linked in: CPU: 0 PID: 5112 Comm: syz-executor391 Not tainted 6.10.0-rc7-syzkaller-01603-g80ab5445da62 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 06/07/2024 RIP: 0010:skb_warn_bad_offload+0x166/0x1a0 net/core/dev.c:3291 [...] Call Trace: __skb_gso_segment+0x3be/0x4c0 net/core/gso.c:127 skb_gso_segment include/net/gso.h:83 [inline] validate_xmit_skb+0x585/0x1120 net/core/dev.c:3661 __dev_queue_xmit+0x17a4/0x3e90 net/core/dev.c:4415 neigh_output include/net/neighbour.h:542 [inline] ip6_finish_output2+0xffa/0x1680 net/ipv6/ip6_output.c:137 ip6_finish_output+0x41e/0x810 net/ipv6/ip6_output.c:222 ip6_send_skb+0x112/0x230 net/ipv6/ip6_output.c:1958 udp_v6_send_skb+0xbf5/0x1870 net/ipv6/udp.c:1292 udpv6_sendmsg+0x23b3/0x3270 net/ipv6/udp.c:1588 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0xef/0x270 net/socket.c:745 ____sys_sendmsg+0x525/0x7d0 net/socket.c:2585 ___sys_sendmsg net/socket.c:2639 [inline] __sys_sendmmsg+0x3b2/0x740 net/socket.c:2725 __do_sys_sendmmsg net/socket.c:2754 [inline] __se_sys_sendmmsg net/socket.c:2751 [inline] __x64_sys_sendmmsg+0xa0/0xb0 net/socket.c:2751 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f [...] We are hitting the bad offload warning because when an egress device is capable of handling segmentation offload requested by skb_shinfo(skb)->gso_type, the chain of gso_segment callbacks won't produce any segment skbs and return NULL. See the skb_gso_ok() branch in {__udp,tcp,sctp}_gso_segment helpers. To fix it, force a fallback to software USO when processing a packet with IPv6 extension headers, since we don't know if these can checksummed by all devices which offer USO. Fixes: 10154dbded6d ("udp: Allow GSO transmit from devices with no checksum offload") Reported-by: syzbot+e15b7e15b8a751a91d9a@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/000000000000e1609a061d5330ce@google.com/ Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Sitnicki Link: https://patch.msgid.link/20240808-udp-gso-egress-from-tunnel-v4-2-f5c5b4149ab9@cloudflare.com Signed-off-by: Jakub Kicinski --- net/ipv4/udp_offload.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index bc8a9da750fed6..b254a5dadfcf3f 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -282,6 +282,12 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, skb_transport_header(gso_skb))) return ERR_PTR(-EINVAL); + /* We don't know if egress device can segment and checksum the packet + * when IPv6 extension headers are present. Fall back to software GSO. + */ + if (gso_skb->ip_summed != CHECKSUM_PARTIAL) + features &= ~(NETIF_F_GSO_UDP_L4 | NETIF_F_CSUM_MASK); + if (skb_gso_ok(gso_skb, features | NETIF_F_GSO_ROBUST)) { /* Packet is from an untrusted source, reset gso_segs. */ skb_shinfo(gso_skb)->gso_segs = DIV_ROUND_UP(gso_skb->len - sizeof(*uh), From 1d2c46c1bc5680335f20f64089c161fdfcd3e8ab Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 8 Aug 2024 11:56:23 +0200 Subject: [PATCH 155/991] selftests/net: Add coverage for UDP GSO with IPv6 extension headers After enabling UDP GSO for devices not offering checksum offload, we have hit a regression where a bad offload warning can be triggered when sending a datagram with IPv6 extension headers. Extend the UDP GSO IPv6 tests to cover this scenario. Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Sitnicki Link: https://patch.msgid.link/20240808-udp-gso-egress-from-tunnel-v4-3-f5c5b4149ab9@cloudflare.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/udpgso.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c index 3e74cfa1a2bfec..3f2fca02fec53f 100644 --- a/tools/testing/selftests/net/udpgso.c +++ b/tools/testing/selftests/net/udpgso.c @@ -67,6 +67,7 @@ struct testcase { int gso_len; /* mss after applying gso */ int r_num_mss; /* recv(): number of calls of full mss */ int r_len_last; /* recv(): size of last non-mss dgram, if any */ + bool v6_ext_hdr; /* send() dgrams with IPv6 extension headers */ }; const struct in6_addr addr6 = { @@ -77,6 +78,8 @@ const struct in_addr addr4 = { __constant_htonl(0x0a000001), /* 10.0.0.1 */ }; +static const char ipv6_hopopts_pad1[8] = { 0 }; + struct testcase testcases_v4[] = { { /* no GSO: send a single byte */ @@ -255,6 +258,13 @@ struct testcase testcases_v6[] = { .gso_len = 1, .r_num_mss = 2, }, + { + /* send 2 1B segments with extension headers */ + .tlen = 2, + .gso_len = 1, + .r_num_mss = 2, + .v6_ext_hdr = true, + }, { /* send 2B + 2B + 1B segments */ .tlen = 5, @@ -396,11 +406,18 @@ static void run_one(struct testcase *test, int fdt, int fdr, int i, ret, val, mss; bool sent; - fprintf(stderr, "ipv%d tx:%d gso:%d %s\n", + fprintf(stderr, "ipv%d tx:%d gso:%d %s%s\n", addr->sa_family == AF_INET ? 4 : 6, test->tlen, test->gso_len, + test->v6_ext_hdr ? "ext-hdr " : "", test->tfail ? "(fail)" : ""); + if (test->v6_ext_hdr) { + if (setsockopt(fdt, IPPROTO_IPV6, IPV6_HOPOPTS, + ipv6_hopopts_pad1, sizeof(ipv6_hopopts_pad1))) + error(1, errno, "setsockopt ipv6 hopopts"); + } + val = test->gso_len; if (cfg_do_setsockopt) { if (setsockopt(fdt, SOL_UDP, UDP_SEGMENT, &val, sizeof(val))) @@ -412,6 +429,12 @@ static void run_one(struct testcase *test, int fdt, int fdr, error(1, 0, "send succeeded while expecting failure"); if (!sent && !test->tfail) error(1, 0, "send failed while expecting success"); + + if (test->v6_ext_hdr) { + if (setsockopt(fdt, IPPROTO_IPV6, IPV6_HOPOPTS, NULL, 0)) + error(1, errno, "setsockopt ipv6 hopopts clear"); + } + if (!sent) return; From c31fe2b5095d8c84562ce90db07600f7e9f318df Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Thu, 8 Aug 2024 17:41:02 +0300 Subject: [PATCH 156/991] net/mlx5: SD, Do not query MPIR register if no sd_group Unconditionally calling the MPIR query on BF separate mode yields the FW syndrome below [1]. Do not call it unless admin clearly specified the SD group, i.e. expressing the intention of using the multi-PF netdev feature. This fix covers cases not covered in commit fca3b4791850 ("net/mlx5: Do not query MPIR on embedded CPU function"). [1] mlx5_cmd_out_err:808:(pid 8267): ACCESS_REG(0x805) op_mod(0x1) failed, status bad system state(0x4), syndrome (0x685f19), err(-5) Fixes: 678eb448055a ("net/mlx5: SD, Implement basic query and instantiation") Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Link: https://patch.msgid.link/20240808144107.2095424-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/lib/sd.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c index f6deb5a3f82024..eeb0b7ea05f126 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c @@ -126,7 +126,7 @@ static bool mlx5_sd_is_supported(struct mlx5_core_dev *dev, u8 host_buses) } static int mlx5_query_sd(struct mlx5_core_dev *dev, bool *sdm, - u8 *host_buses, u8 *sd_group) + u8 *host_buses) { u32 out[MLX5_ST_SZ_DW(mpir_reg)]; int err; @@ -135,10 +135,6 @@ static int mlx5_query_sd(struct mlx5_core_dev *dev, bool *sdm, if (err) return err; - err = mlx5_query_nic_vport_sd_group(dev, sd_group); - if (err) - return err; - *sdm = MLX5_GET(mpir_reg, out, sdm); *host_buses = MLX5_GET(mpir_reg, out, host_buses); @@ -166,19 +162,23 @@ static int sd_init(struct mlx5_core_dev *dev) if (mlx5_core_is_ecpf(dev)) return 0; + err = mlx5_query_nic_vport_sd_group(dev, &sd_group); + if (err) + return err; + + if (!sd_group) + return 0; + if (!MLX5_CAP_MCAM_REG(dev, mpir)) return 0; - err = mlx5_query_sd(dev, &sdm, &host_buses, &sd_group); + err = mlx5_query_sd(dev, &sdm, &host_buses); if (err) return err; if (!sdm) return 0; - if (!sd_group) - return 0; - group_id = mlx5_sd_group_id(dev, sd_group); if (!mlx5_sd_is_supported(dev, host_buses)) { From ab6013a59b4d0947fda409c29426dc904959e632 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Thu, 8 Aug 2024 17:41:03 +0300 Subject: [PATCH 157/991] net/mlx5e: SHAMPO, Increase timeout to improve latency During latency tests (netperf TCP_RR) a 30% degradation of HW GRO vs SW GRO was observed. This is due to SHAMPO triggering timeout filler CQEs instead of delivering the CQE for the packet. Having a short timeout for SHAMPO doesn't bring any benefits as it is the driver that does the merging, not the hardware. On the contrary, it can have a negative impact: additional filler CQEs are generated due to the timeout. As there is no way to disable this timeout, this change sets it to the maximum value. Instead of using the packet_merge.timeout parameter which is also used for LRO, set the value directly when filling in the rest of the SHAMPO parameters in mlx5e_build_rq_param(). Fixes: 99be56171fa9 ("net/mlx5e: SHAMPO, Re-enable HW-GRO") Signed-off-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20240808144107.2095424-3-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 2 +- .../net/ethernet/mellanox/mlx5/core/en/params.c | 16 +++++++++++++++- .../net/ethernet/mellanox/mlx5/core/en/params.h | 1 + .../net/ethernet/mellanox/mlx5/core/en_main.c | 12 ------------ 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 5fd82c67b6ab7f..bb5da42edc23a0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -130,7 +130,7 @@ struct page_pool; #define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW 0x2 #define MLX5E_DEFAULT_LRO_TIMEOUT 32 -#define MLX5E_LRO_TIMEOUT_ARR_SIZE 4 +#define MLX5E_DEFAULT_SHAMPO_TIMEOUT 1024 #define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC 0x10 #define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE 0x3 diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c index 6c9ccccca81e27..64b62ed17b07a7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c @@ -928,7 +928,7 @@ int mlx5e_build_rq_param(struct mlx5_core_dev *mdev, MLX5_SET(wq, wq, log_headers_entry_size, mlx5e_shampo_get_log_hd_entry_size(mdev, params)); MLX5_SET(rqc, rqc, reservation_timeout, - params->packet_merge.timeout); + mlx5e_choose_lro_timeout(mdev, MLX5E_DEFAULT_SHAMPO_TIMEOUT)); MLX5_SET(rqc, rqc, shampo_match_criteria_type, params->packet_merge.shampo.match_criteria_type); MLX5_SET(rqc, rqc, shampo_no_match_alignment_granularity, @@ -1087,6 +1087,20 @@ static u32 mlx5e_shampo_icosq_sz(struct mlx5_core_dev *mdev, return wqebbs; } +#define MLX5E_LRO_TIMEOUT_ARR_SIZE 4 + +u32 mlx5e_choose_lro_timeout(struct mlx5_core_dev *mdev, u32 wanted_timeout) +{ + int i; + + /* The supported periods are organized in ascending order */ + for (i = 0; i < MLX5E_LRO_TIMEOUT_ARR_SIZE - 1; i++) + if (MLX5_CAP_ETH(mdev, lro_timer_supported_periods[i]) >= wanted_timeout) + break; + + return MLX5_CAP_ETH(mdev, lro_timer_supported_periods[i]); +} + static u32 mlx5e_mpwrq_total_umr_wqebbs(struct mlx5_core_dev *mdev, struct mlx5e_params *params, struct mlx5e_xsk_param *xsk) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h index 749b2ec0436eea..3f8986f9d86291 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h @@ -108,6 +108,7 @@ u32 mlx5e_shampo_hd_per_wqe(struct mlx5_core_dev *mdev, u32 mlx5e_shampo_hd_per_wq(struct mlx5_core_dev *mdev, struct mlx5e_params *params, struct mlx5e_rq_param *rq_param); +u32 mlx5e_choose_lro_timeout(struct mlx5_core_dev *mdev, u32 wanted_timeout); u8 mlx5e_mpwqe_get_log_stride_size(struct mlx5_core_dev *mdev, struct mlx5e_params *params, struct mlx5e_xsk_param *xsk); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 6f686fabed4462..f04decca39f28e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -5167,18 +5167,6 @@ const struct net_device_ops mlx5e_netdev_ops = { #endif }; -static u32 mlx5e_choose_lro_timeout(struct mlx5_core_dev *mdev, u32 wanted_timeout) -{ - int i; - - /* The supported periods are organized in ascending order */ - for (i = 0; i < MLX5E_LRO_TIMEOUT_ARR_SIZE - 1; i++) - if (MLX5_CAP_ETH(mdev, lro_timer_supported_periods[i]) >= wanted_timeout) - break; - - return MLX5_CAP_ETH(mdev, lro_timer_supported_periods[i]); -} - void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, u16 mtu) { struct mlx5e_params *params = &priv->channels.params; From e6b5afd30b99b43682a7764e1a74a42fe4d5f4b3 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Thu, 8 Aug 2024 17:41:04 +0300 Subject: [PATCH 158/991] net/mlx5e: Take state lock during tx timeout reporter mlx5e_safe_reopen_channels() requires the state lock taken. The referenced changed in the Fixes tag removed the lock to fix another issue. This patch adds it back but at a later point (when calling mlx5e_safe_reopen_channels()) to avoid the deadlock referenced in the Fixes tag. Fixes: eab0da38912e ("net/mlx5e: Fix possible deadlock on mlx5e_tx_timeout_work") Signed-off-by: Dragos Tatulea Link: https://lore.kernel.org/all/ZplpKq8FKi3vwfxv@gmail.com/T/ Reviewed-by: Breno Leitao Reviewed-by: Moshe Shemesh Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20240808144107.2095424-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c index 22918b2ef7f128..09433b91be176f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c @@ -146,7 +146,9 @@ static int mlx5e_tx_reporter_timeout_recover(void *ctx) return err; } + mutex_lock(&priv->state_lock); err = mlx5e_safe_reopen_channels(priv); + mutex_unlock(&priv->state_lock); if (!err) { to_ctx->status = 1; /* all channels recovered */ return err; From cbc796be1779c4dbc9a482c7233995e2a8b6bfb3 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Thu, 8 Aug 2024 17:41:05 +0300 Subject: [PATCH 159/991] net/mlx5e: Correctly report errors for ethtool rx flows Previously, an ethtool rx flow with no attrs would not be added to the NIC as it has no rules to configure the hw with, but it would be reported as successful to the caller (return code 0). This is confusing for the user as ethtool then reports "Added rule $num", but no rule was actually added. This change corrects that by instead reporting these wrong rules as -EINVAL. Fixes: b29c61dac3a2 ("net/mlx5e: Ethtool steering flow validation refactoring") Signed-off-by: Cosmin Ratiu Reviewed-by: Saeed Mahameed Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20240808144107.2095424-5-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c index 3eccdadc035781..773624bb2c5d54 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c @@ -734,7 +734,7 @@ mlx5e_ethtool_flow_replace(struct mlx5e_priv *priv, if (num_tuples <= 0) { netdev_warn(priv->netdev, "%s: flow is not valid %d\n", __func__, num_tuples); - return num_tuples; + return num_tuples < 0 ? num_tuples : -EINVAL; } eth_ft = get_flow_table(priv, fs, num_tuples); From 0b4a4534d083e055831b3bc29c5eafc918ed4d86 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Thu, 8 Aug 2024 17:41:06 +0300 Subject: [PATCH 160/991] net/mlx5e: Fix queue stats access to non-existing channels splat The queue stats API queries the queues according to the real_num_[tr]x_queues, in case the device is down and channels were not yet created, don't try to query their statistics. To trigger the panic, run this command before the interface is brought up: ./cli.py --spec ../../../Documentation/netlink/specs/netdev.yaml --dump qstats-get --json '{"ifindex": 4}' BUG: kernel NULL pointer dereference, address: 0000000000000c00 PGD 0 P4D 0 Oops: Oops: 0000 [#1] SMP PTI CPU: 3 UID: 0 PID: 977 Comm: python3 Not tainted 6.10.0+ #40 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:mlx5e_get_queue_stats_rx+0x3c/0xb0 [mlx5_core] Code: fc 55 48 63 ee 53 48 89 d3 e8 40 3d 70 e1 85 c0 74 58 4c 89 ef e8 d4 07 04 00 84 c0 75 41 49 8b 84 24 f8 39 00 00 48 8b 04 e8 <48> 8b 90 00 0c 00 00 48 03 90 40 0a 00 00 48 89 53 08 48 8b 90 08 RSP: 0018:ffff888116be37d0 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff888116be3868 RCX: 0000000000000004 RDX: ffff88810ada4000 RSI: 0000000000000000 RDI: ffff888109df09c0 RBP: 0000000000000000 R08: 0000000000000004 R09: 0000000000000004 R10: ffff88813461901c R11: ffffffffffffffff R12: ffff888109df0000 R13: ffff888109df09c0 R14: ffff888116be38d0 R15: 0000000000000000 FS: 00007f4375d5c740(0000) GS:ffff88852c980000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000c00 CR3: 0000000106ada006 CR4: 0000000000370eb0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? __die+0x1f/0x60 ? page_fault_oops+0x14e/0x3d0 ? exc_page_fault+0x73/0x130 ? asm_exc_page_fault+0x22/0x30 ? mlx5e_get_queue_stats_rx+0x3c/0xb0 [mlx5_core] netdev_nl_stats_by_netdev+0x2a6/0x4c0 ? __rmqueue_pcplist+0x351/0x6f0 netdev_nl_qstats_get_dumpit+0xc4/0x1b0 genl_dumpit+0x2d/0x80 netlink_dump+0x199/0x410 __netlink_dump_start+0x1aa/0x2c0 genl_family_rcv_msg_dumpit+0x94/0xf0 ? __pfx_genl_start+0x10/0x10 ? __pfx_genl_dumpit+0x10/0x10 ? __pfx_genl_done+0x10/0x10 genl_rcv_msg+0x116/0x2b0 ? __pfx_netdev_nl_qstats_get_dumpit+0x10/0x10 ? __pfx_genl_rcv_msg+0x10/0x10 netlink_rcv_skb+0x54/0x100 genl_rcv+0x24/0x40 netlink_unicast+0x21a/0x340 netlink_sendmsg+0x1f4/0x440 __sys_sendto+0x1b6/0x1c0 ? do_sock_setsockopt+0xc3/0x180 ? __sys_setsockopt+0x60/0xb0 __x64_sys_sendto+0x20/0x30 do_syscall_64+0x50/0x110 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f43757132b0 Code: c0 ff ff ff ff eb b8 0f 1f 00 f3 0f 1e fa 41 89 ca 64 8b 04 25 18 00 00 00 85 c0 75 1d 45 31 c9 45 31 c0 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 68 c3 0f 1f 80 00 00 00 00 41 54 48 83 ec 20 RSP: 002b:00007ffd258da048 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 00007ffd258da0f8 RCX: 00007f43757132b0 RDX: 000000000000001c RSI: 00007f437464b850 RDI: 0000000000000003 RBP: 00007f4375085de0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: ffffffffc4653600 R14: 0000000000000001 R15: 00007f43751a6147 Modules linked in: netconsole xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm mlx5_ib ib_uverbs ib_core zram zsmalloc mlx5_core fuse [last unloaded: netconsole] CR2: 0000000000000c00 ---[ end trace 0000000000000000 ]--- RIP: 0010:mlx5e_get_queue_stats_rx+0x3c/0xb0 [mlx5_core] Code: fc 55 48 63 ee 53 48 89 d3 e8 40 3d 70 e1 85 c0 74 58 4c 89 ef e8 d4 07 04 00 84 c0 75 41 49 8b 84 24 f8 39 00 00 48 8b 04 e8 <48> 8b 90 00 0c 00 00 48 03 90 40 0a 00 00 48 89 53 08 48 8b 90 08 RSP: 0018:ffff888116be37d0 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff888116be3868 RCX: 0000000000000004 RDX: ffff88810ada4000 RSI: 0000000000000000 RDI: ffff888109df09c0 RBP: 0000000000000000 R08: 0000000000000004 R09: 0000000000000004 R10: ffff88813461901c R11: ffffffffffffffff R12: ffff888109df0000 R13: ffff888109df09c0 R14: ffff888116be38d0 R15: 0000000000000000 FS: 00007f4375d5c740(0000) GS:ffff88852c980000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000c00 CR3: 0000000106ada006 CR4: 0000000000370eb0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Fixes: 7b66ae536a78 ("net/mlx5e: Add per queue netdev-genl stats") Signed-off-by: Gal Pressman Signed-off-by: Tariq Toukan Reviewed-by: Joe Damato Link: https://patch.msgid.link/20240808144107.2095424-6-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index f04decca39f28e..5df904639b0ce6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -5296,7 +5296,7 @@ static void mlx5e_get_queue_stats_rx(struct net_device *dev, int i, struct mlx5e_rq_stats *rq_stats; ASSERT_RTNL(); - if (mlx5e_is_uplink_rep(priv)) + if (mlx5e_is_uplink_rep(priv) || !priv->stats_nch) return; channel_stats = priv->channel_stats[i]; @@ -5316,6 +5316,9 @@ static void mlx5e_get_queue_stats_tx(struct net_device *dev, int i, struct mlx5e_sq_stats *sq_stats; ASSERT_RTNL(); + if (!priv->stats_nch) + return; + /* no special case needed for ptp htb etc since txq2sq_stats is kept up * to date for active sq_stats, otherwise get_base_stats takes care of * inactive sqs. From ccbfcac05866ebe6eb3bc6d07b51d4ed4fcde436 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sat, 10 Aug 2024 10:48:32 +0200 Subject: [PATCH 161/991] ALSA: timer: Relax start tick time check for slave timer elements The recent addition of a sanity check for a too low start tick time seems breaking some applications that uses aloop with a certain slave timer setup. They may have the initial resolution 0, hence it's treated as if it were a too low value. Relax and skip the check for the slave timer instance for addressing the regression. Fixes: 4a63bd179fa8 ("ALSA: timer: Set lower bound of start tick time") Cc: Link: https://github.com/raspberrypi/linux/issues/6294 Link: https://patch.msgid.link/20240810084833.10939-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/core/timer.c b/sound/core/timer.c index d104adc75a8b0e..71a07c1662f5c4 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -547,7 +547,7 @@ static int snd_timer_start1(struct snd_timer_instance *timeri, /* check the actual time for the start tick; * bail out as error if it's way too low (< 100us) */ - if (start) { + if (start && !(timer->hw.flags & SNDRV_TIMER_HW_SLAVE)) { if ((u64)snd_timer_hw_resolution(timer) * ticks < 100000) return -EINVAL; } From 2ad4e1ada8eebafa2d75a4b75eeeca882de6ada1 Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Sat, 3 Aug 2024 21:52:55 +0200 Subject: [PATCH 162/991] wifi: brcmfmac: cfg80211: Handle SSID based pmksa deletion wpa_supplicant 2.11 sends since 1efdba5fdc2c ("Handle PMKSA flush in the driver for SAE/OWE offload cases") SSID based PMKSA del commands. brcmfmac is not prepared and tries to dereference the NULL bssid and pmkid pointers in cfg80211_pmksa. PMKID_V3 operations support SSID based updates so copy the SSID. Fixes: a96202acaea4 ("wifi: brcmfmac: cfg80211: Add support for PMKID_V3 operations") Cc: stable@vger.kernel.org # 6.4.x Signed-off-by: Janne Grunau Reviewed-by: Neal Gompa Acked-by: Arend van Spriel Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20240803-brcmfmac_pmksa_del_ssid-v1-1-4e85f19135e1@jannau.net --- .../wireless/broadcom/brcm80211/brcmfmac/cfg80211.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index 1585a5653ee4b9..d4cc5fa92341d5 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -4320,9 +4320,16 @@ brcmf_pmksa_v3_op(struct brcmf_if *ifp, struct cfg80211_pmksa *pmksa, /* Single PMK operation */ pmk_op->count = cpu_to_le16(1); length += sizeof(struct brcmf_pmksa_v3); - memcpy(pmk_op->pmk[0].bssid, pmksa->bssid, ETH_ALEN); - memcpy(pmk_op->pmk[0].pmkid, pmksa->pmkid, WLAN_PMKID_LEN); - pmk_op->pmk[0].pmkid_len = WLAN_PMKID_LEN; + if (pmksa->bssid) + memcpy(pmk_op->pmk[0].bssid, pmksa->bssid, ETH_ALEN); + if (pmksa->pmkid) { + memcpy(pmk_op->pmk[0].pmkid, pmksa->pmkid, WLAN_PMKID_LEN); + pmk_op->pmk[0].pmkid_len = WLAN_PMKID_LEN; + } + if (pmksa->ssid && pmksa->ssid_len) { + memcpy(pmk_op->pmk[0].ssid.SSID, pmksa->ssid, pmksa->ssid_len); + pmk_op->pmk[0].ssid.SSID_len = pmksa->ssid_len; + } pmk_op->pmk[0].time_left = cpu_to_le32(alive ? BRCMF_PMKSA_NO_EXPIRY : 0); } From aad41832326723627ad8ac9ee8a543b6dca4454d Mon Sep 17 00:00:00 2001 From: Asmaa Mnebhi Date: Tue, 11 Jun 2024 13:15:09 -0400 Subject: [PATCH 163/991] gpio: mlxbf3: Support shutdown() function During Linux graceful reboot, the GPIO interrupts are not disabled. Since the drivers are not removed during graceful reboot, the logic to call mlxbf3_gpio_irq_disable() is not triggered. Interrupts that remain enabled can cause issues on subsequent boots. For example, the mlxbf-gige driver contains PHY logic to bring up the link. If the gpio-mlxbf3 driver loads first, the mlxbf-gige driver will use a GPIO interrupt to bring up the link. Otherwise, it will use polling. The next time Linux boots and loads the drivers in this order, we encounter the issue: - mlxbf-gige loads first and uses polling while the GPIO10 interrupt is still enabled from the previous boot. So if the interrupt triggers, there is nothing to clear it. - gpio-mlxbf3 loads. - i2c-mlxbf loads. The interrupt doesn't trigger for I2C because it is shared with the GPIO interrupt line which was not cleared. The solution is to add a shutdown function to the GPIO driver to clear and disable all interrupts. Also clear the interrupt after disabling it in mlxbf3_gpio_irq_disable(). Fixes: 38a700efc510 ("gpio: mlxbf3: Add gpio driver support") Signed-off-by: Asmaa Mnebhi Reviewed-by: David Thompson Reviewed-by: Andy Shevchenko Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20240611171509.22151-1-asmaa@nvidia.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mlxbf3.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/gpio/gpio-mlxbf3.c b/drivers/gpio/gpio-mlxbf3.c index d5906d419b0ab9..10ea71273c8915 100644 --- a/drivers/gpio/gpio-mlxbf3.c +++ b/drivers/gpio/gpio-mlxbf3.c @@ -39,6 +39,8 @@ #define MLXBF_GPIO_CAUSE_OR_EVTEN0 0x14 #define MLXBF_GPIO_CAUSE_OR_CLRCAUSE 0x18 +#define MLXBF_GPIO_CLR_ALL_INTS GENMASK(31, 0) + struct mlxbf3_gpio_context { struct gpio_chip gc; @@ -82,6 +84,8 @@ static void mlxbf3_gpio_irq_disable(struct irq_data *irqd) val = readl(gs->gpio_cause_io + MLXBF_GPIO_CAUSE_OR_EVTEN0); val &= ~BIT(offset); writel(val, gs->gpio_cause_io + MLXBF_GPIO_CAUSE_OR_EVTEN0); + + writel(BIT(offset), gs->gpio_cause_io + MLXBF_GPIO_CAUSE_OR_CLRCAUSE); raw_spin_unlock_irqrestore(&gs->gc.bgpio_lock, flags); gpiochip_disable_irq(gc, offset); @@ -253,6 +257,15 @@ static int mlxbf3_gpio_probe(struct platform_device *pdev) return 0; } +static void mlxbf3_gpio_shutdown(struct platform_device *pdev) +{ + struct mlxbf3_gpio_context *gs = platform_get_drvdata(pdev); + + /* Disable and clear all interrupts */ + writel(0, gs->gpio_cause_io + MLXBF_GPIO_CAUSE_OR_EVTEN0); + writel(MLXBF_GPIO_CLR_ALL_INTS, gs->gpio_cause_io + MLXBF_GPIO_CAUSE_OR_CLRCAUSE); +} + static const struct acpi_device_id mlxbf3_gpio_acpi_match[] = { { "MLNXBF33", 0 }, {} @@ -265,6 +278,7 @@ static struct platform_driver mlxbf3_gpio_driver = { .acpi_match_table = mlxbf3_gpio_acpi_match, }, .probe = mlxbf3_gpio_probe, + .shutdown = mlxbf3_gpio_shutdown, }; module_platform_driver(mlxbf3_gpio_driver); From 9a039eeb71a42c8b13408a1976e300f3898e1be0 Mon Sep 17 00:00:00 2001 From: Moon Yeounsu Date: Wed, 7 Aug 2024 19:07:21 +0900 Subject: [PATCH 164/991] net: ethernet: use ip_hdrlen() instead of bit shift `ip_hdr(skb)->ihl << 2` is the same as `ip_hdrlen(skb)` Therefore, we should use a well-defined function not a bit shift to find the header length. It also compresses two lines to a single line. Signed-off-by: Moon Yeounsu Reviewed-by: Christophe JAILLET Signed-off-by: David S. Miller --- drivers/net/ethernet/jme.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c index b06e245629739f..d8be0e4dcb072b 100644 --- a/drivers/net/ethernet/jme.c +++ b/drivers/net/ethernet/jme.c @@ -946,15 +946,13 @@ jme_udpsum(struct sk_buff *skb) if (skb->protocol != htons(ETH_P_IP)) return csum; skb_set_network_header(skb, ETH_HLEN); - if ((ip_hdr(skb)->protocol != IPPROTO_UDP) || - (skb->len < (ETH_HLEN + - (ip_hdr(skb)->ihl << 2) + - sizeof(struct udphdr)))) { + + if (ip_hdr(skb)->protocol != IPPROTO_UDP || + skb->len < (ETH_HLEN + ip_hdrlen(skb) + sizeof(struct udphdr))) { skb_reset_network_header(skb); return csum; } - skb_set_transport_header(skb, - ETH_HLEN + (ip_hdr(skb)->ihl << 2)); + skb_set_transport_header(skb, ETH_HLEN + ip_hdrlen(skb)); csum = udp_hdr(skb)->check; skb_reset_transport_header(skb); skb_reset_network_header(skb); From ef9718b3d54e822de294351251f3a574f8a082ce Mon Sep 17 00:00:00 2001 From: Parsa Poorshikhian Date: Sat, 10 Aug 2024 18:39:06 +0330 Subject: [PATCH 165/991] ALSA: hda/realtek: Fix noise from speakers on Lenovo IdeaPad 3 15IAU7 Fix noise from speakers connected to AUX port when no sound is playing. The problem occurs because the `alc_shutup_pins` function includes a 0x10ec0257 vendor ID, which causes noise on Lenovo IdeaPad 3 15IAU7 with Realtek ALC257 codec when no sound is playing. Removing this vendor ID from the function fixes the bug. Fixes: 70794b9563fe ("ALSA: hda/realtek: Add more codec ID to no shutup pins list") Signed-off-by: Parsa Poorshikhian Link: https://patch.msgid.link/20240810150939.330693-1-parsa.poorsh@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 480e82df7a4ce7..6e19598e23b793 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -583,7 +583,6 @@ static void alc_shutup_pins(struct hda_codec *codec) switch (codec->core.vendor_id) { case 0x10ec0236: case 0x10ec0256: - case 0x10ec0257: case 0x19e58326: case 0x10ec0283: case 0x10ec0285: From 4e69cd835a2d5c3915838491f59a68ee697a87d0 Mon Sep 17 00:00:00 2001 From: Adam Ford Date: Sun, 14 Jul 2024 12:20:17 -0500 Subject: [PATCH 166/991] arm64: dts: imx8mp-beacon-kit: Fix Stereo Audio on WM8962 The L/R clock needs to be controlled by the SAI3 instead of the CODEC to properly achieve stereo sound. Doing this allows removes the need for unnecessary clock manipulation to try to get the CODEC's clock in sync with the SAI3 clock, since the CODEC can cope with a wide variety of clock inputs. Fixes: 161af16c18f3 ("arm64: dts: imx8mp-beacon-kit: Fix audio_pll2 clock") Fixes: 69e2f37a6ddc ("arm64: dts: imx8mp-beacon-kit: Enable WM8962 Audio CODEC") Signed-off-by: Adam Ford Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mp-beacon-kit.dts | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mp-beacon-kit.dts b/arch/arm64/boot/dts/freescale/imx8mp-beacon-kit.dts index 17e2c19d845513..cc9b81d4618868 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-beacon-kit.dts +++ b/arch/arm64/boot/dts/freescale/imx8mp-beacon-kit.dts @@ -211,13 +211,12 @@ simple-audio-card,cpu { sound-dai = <&sai3>; + frame-master; + bitclock-master; }; simple-audio-card,codec { sound-dai = <&wm8962>; - clocks = <&clk IMX8MP_CLK_IPP_DO_CLKO1>; - frame-master; - bitclock-master; }; }; }; @@ -507,10 +506,9 @@ &sai3 { pinctrl-names = "default"; pinctrl-0 = <&pinctrl_sai3>; - assigned-clocks = <&clk IMX8MP_CLK_SAI3>, - <&clk IMX8MP_AUDIO_PLL2> ; - assigned-clock-parents = <&clk IMX8MP_AUDIO_PLL2_OUT>; - assigned-clock-rates = <12288000>, <361267200>; + assigned-clocks = <&clk IMX8MP_CLK_SAI3>; + assigned-clock-parents = <&clk IMX8MP_AUDIO_PLL1_OUT>; + assigned-clock-rates = <12288000>; fsl,sai-mclk-direction-output; status = "okay"; }; From 4736ad9422cb86f15464d2bd579c1f5d7786bb61 Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Mon, 15 Jul 2024 11:32:31 +0200 Subject: [PATCH 167/991] arm64: dts: freescale: tqma9352: Fix watchdog reset On the tqma9352 the board is reset through an external PMIC, so set the fsl,ext-reset-output property to enable triggering the output pin on a watchdog trigger. Signed-off-by: Sascha Hauer Reviewed-by: Fabio Estevam Reviewed-by: Alexander Stein Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx93-tqma9352.dtsi | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/boot/dts/freescale/imx93-tqma9352.dtsi b/arch/arm64/boot/dts/freescale/imx93-tqma9352.dtsi index edbd8cad35bca1..d3a0e1244aae67 100644 --- a/arch/arm64/boot/dts/freescale/imx93-tqma9352.dtsi +++ b/arch/arm64/boot/dts/freescale/imx93-tqma9352.dtsi @@ -156,6 +156,7 @@ &wdog3 { pinctrl-names = "default"; pinctrl-0 = <&pinctrl_wdog>; + fsl,ext-reset-output; status = "okay"; }; From 109f256285dd6a5f8c3bd0d80d39b2ccd4fe314e Mon Sep 17 00:00:00 2001 From: Shenwei Wang Date: Mon, 15 Jul 2024 08:17:22 -0500 Subject: [PATCH 168/991] arm64: dts: imx93: update default value for snps,clk-csr For the i.MX93 SoC, the default clock rate for the IP of STMMAC EQOS is 312.5 MHz. According to the following mapping table from the i.MX93 reference manual, this clock rate corresponds to a CSR value of 6. 0000: CSR clock = 60-100 MHz; MDC clock = CSR clock/42 0001: CSR clock = 100-150 MHz; MDC clock = CSR clock/62 0010: CSR clock = 20-35 MHz; MDC clock = CSR clock/16 0011: CSR clock = 35-60 MHz; MDC clock = CSR clock/26 0100: CSR clock = 150-250 MHz; MDC clock = CSR clock/102 0101: CSR clock = 250-300 MHz; MDC clock = CSR clock/124 0110: CSR clock = 300-500 MHz; MDC clock = CSR clock/204 0111: CSR clock = 500-800 MHz; MDC clock = CSR clock/324 Fixes: f2d03ba997cb ("arm64: dts: imx93: reorder device nodes") Signed-off-by: Shenwei Wang Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx93.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx93.dtsi b/arch/arm64/boot/dts/freescale/imx93.dtsi index 4a3f42355cb8fc..a0993022c102da 100644 --- a/arch/arm64/boot/dts/freescale/imx93.dtsi +++ b/arch/arm64/boot/dts/freescale/imx93.dtsi @@ -1105,7 +1105,7 @@ <&clk IMX93_CLK_SYS_PLL_PFD0_DIV2>; assigned-clock-rates = <100000000>, <250000000>; intf_mode = <&wakeupmix_gpr 0x28>; - snps,clk-csr = <0>; + snps,clk-csr = <6>; nvmem-cells = <ð_mac2>; nvmem-cell-names = "mac-address"; status = "disabled"; From 03c5c350e38d9346b69357d0e52c3c40495c14a0 Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Fri, 2 Aug 2024 16:22:15 +0100 Subject: [PATCH 169/991] ALSA: hda/realtek: Add support for new HP G12 laptops Some of these laptop models have quirk IDs that are identical but have different amplifier parts fitted, this difference is described in the ACPI information. The solution introduced for this product family can derive the required component binding information from ACPI instead of hardcoding it, supports the new variants of the CS35L56 being used and has generalized naming that makes it applicable to other ALC+amp combinations. Signed-off-by: Simon Trimmer Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20240802152215.20831-4-rf@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 99 +++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 480e82df7a4ce7..24eb71efac6caa 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -11,15 +11,18 @@ */ #include +#include #include #include #include #include #include #include +#include #include #include #include +#include #include #include #include @@ -6856,6 +6859,86 @@ static void comp_generic_fixup(struct hda_codec *cdc, int action, const char *bu } } +static void cs35lxx_autodet_fixup(struct hda_codec *cdc, + const struct hda_fixup *fix, + int action) +{ + struct device *dev = hda_codec_dev(cdc); + struct acpi_device *adev; + struct fwnode_handle *fwnode __free(fwnode_handle) = NULL; + const char *bus = NULL; + static const struct { + const char *hid; + const char *name; + } acpi_ids[] = {{ "CSC3554", "cs35l54-hda" }, + { "CSC3556", "cs35l56-hda" }, + { "CSC3557", "cs35l57-hda" }}; + char *match; + int i, count = 0, count_devindex = 0; + + switch (action) { + case HDA_FIXUP_ACT_PRE_PROBE: + for (i = 0; i < ARRAY_SIZE(acpi_ids); ++i) { + adev = acpi_dev_get_first_match_dev(acpi_ids[i].hid, NULL, -1); + if (adev) + break; + } + if (!adev) { + dev_err(dev, "Failed to find ACPI entry for a Cirrus Amp\n"); + return; + } + + count = i2c_acpi_client_count(adev); + if (count > 0) { + bus = "i2c"; + } else { + count = acpi_spi_count_resources(adev); + if (count > 0) + bus = "spi"; + } + + fwnode = fwnode_handle_get(acpi_fwnode_handle(adev)); + acpi_dev_put(adev); + + if (!bus) { + dev_err(dev, "Did not find any buses for %s\n", acpi_ids[i].hid); + return; + } + + if (!fwnode) { + dev_err(dev, "Could not get fwnode for %s\n", acpi_ids[i].hid); + return; + } + + /* + * When available the cirrus,dev-index property is an accurate + * count of the amps in a system and is used in preference to + * the count of bus devices that can contain additional address + * alias entries. + */ + count_devindex = fwnode_property_count_u32(fwnode, "cirrus,dev-index"); + if (count_devindex > 0) + count = count_devindex; + + match = devm_kasprintf(dev, GFP_KERNEL, "-%%s:00-%s.%%d", acpi_ids[i].name); + if (!match) + return; + dev_info(dev, "Found %d %s on %s (%s)\n", count, acpi_ids[i].hid, bus, match); + comp_generic_fixup(cdc, action, bus, acpi_ids[i].hid, match, count); + + break; + case HDA_FIXUP_ACT_FREE: + /* + * Pass the action on to comp_generic_fixup() so that + * hda_component_manager functions can be called in just once + * place. In this context the bus, hid, match_str or count + * values do not need to be calculated. + */ + comp_generic_fixup(cdc, action, NULL, NULL, NULL, 0); + break; + } +} + static void cs35l41_fixup_i2c_two(struct hda_codec *cdc, const struct hda_fixup *fix, int action) { comp_generic_fixup(cdc, action, "i2c", "CSC3551", "-%s:00-cs35l41-hda.%d", 2); @@ -7528,6 +7611,7 @@ enum { ALC256_FIXUP_CHROME_BOOK, ALC287_FIXUP_LENOVO_14ARP8_LEGION_IAH7, ALC287_FIXUP_LENOVO_SSID_17AA3820, + ALCXXX_FIXUP_CS35LXX, }; /* A special fixup for Lenovo C940 and Yoga Duet 7; @@ -9857,6 +9941,10 @@ static const struct hda_fixup alc269_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = alc287_fixup_lenovo_ssid_17aa3820, }, + [ALCXXX_FIXUP_CS35LXX] = { + .type = HDA_FIXUP_FUNC, + .v.func = cs35lxx_autodet_fixup, + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -10271,6 +10359,17 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8cdf, "HP SnowWhite", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8ce0, "HP SnowWhite", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8cf5, "HP ZBook Studio 16", ALC245_FIXUP_CS35L41_SPI_4_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8d01, "HP ZBook Power 14 G12", ALCXXX_FIXUP_CS35LXX), + SND_PCI_QUIRK(0x103c, 0x8d08, "HP EliteBook 1045 14 G12", ALCXXX_FIXUP_CS35LXX), + SND_PCI_QUIRK(0x103c, 0x8d85, "HP EliteBook 1040 14 G12", ALCXXX_FIXUP_CS35LXX), + SND_PCI_QUIRK(0x103c, 0x8d86, "HP Elite x360 1040 14 G12", ALCXXX_FIXUP_CS35LXX), + SND_PCI_QUIRK(0x103c, 0x8d8c, "HP EliteBook 830 13 G12", ALCXXX_FIXUP_CS35LXX), + SND_PCI_QUIRK(0x103c, 0x8d8d, "HP Elite x360 830 13 G12", ALCXXX_FIXUP_CS35LXX), + SND_PCI_QUIRK(0x103c, 0x8d8e, "HP EliteBook 840 14 G12", ALCXXX_FIXUP_CS35LXX), + SND_PCI_QUIRK(0x103c, 0x8d8f, "HP EliteBook 840 14 G12", ALCXXX_FIXUP_CS35LXX), + SND_PCI_QUIRK(0x103c, 0x8d90, "HP EliteBook 860 16 G12", ALCXXX_FIXUP_CS35LXX), + SND_PCI_QUIRK(0x103c, 0x8d91, "HP ZBook Firefly 14 G12", ALCXXX_FIXUP_CS35LXX), + SND_PCI_QUIRK(0x103c, 0x8d92, "HP ZBook Firefly 16 G12", ALCXXX_FIXUP_CS35LXX), SND_PCI_QUIRK(0x1043, 0x103e, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC), SND_PCI_QUIRK(0x1043, 0x103f, "ASUS TX300", ALC282_FIXUP_ASUS_TX300), SND_PCI_QUIRK(0x1043, 0x106d, "Asus K53BE", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), From 004eb8ba776ccd3e296ea6f78f7ae7985b12824e Mon Sep 17 00:00:00 2001 From: Lianqin Hu Date: Sun, 11 Aug 2024 08:30:11 +0000 Subject: [PATCH 170/991] ALSA: usb-audio: Add delay quirk for VIVO USB-C-XE710 HEADSET Audio control requests that sets sampling frequency sometimes fail on this card. Adding delay between control messages eliminates that problem. Signed-off-by: Lianqin Hu Cc: Link: https://patch.msgid.link/TYUPR06MB6217FF67076AF3E49E12C877D2842@TYUPR06MB6217.apcprd06.prod.outlook.com Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index ea063a14cdd8fe..e7b68c67852e92 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2221,6 +2221,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_GENERIC_IMPLICIT_FB), DEVICE_FLG(0x2b53, 0x0031, /* Fiero SC-01 (firmware v1.1.0) */ QUIRK_FLAG_GENERIC_IMPLICIT_FB), + DEVICE_FLG(0x2d95, 0x8021, /* VIVO USB-C-XE710 HEADSET */ + QUIRK_FLAG_CTL_MSG_DELAY_1M), DEVICE_FLG(0x30be, 0x0101, /* Schiit Hel */ QUIRK_FLAG_IGNORE_CTL_ERROR), DEVICE_FLG(0x413c, 0xa506, /* Dell AE515 sound bar */ From b86aa4140f6a8f01f35bfb05af60e01a55b48803 Mon Sep 17 00:00:00 2001 From: Bouke Sybren Haarsma Date: Sun, 28 Jul 2024 14:47:30 +0200 Subject: [PATCH 171/991] drm: panel-orientation-quirks: Add quirk for Ayn Loki Zero Add quirk orientation for the Ayn Loki Zero. This also has been tested/used by the JELOS team. Signed-off-by: Bouke Sybren Haarsma Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Link: https://patchwork.freedesktop.org/patch/msgid/20240728124731.168452-2-boukehaarsma23@gmail.com --- drivers/gpu/drm/drm_panel_orientation_quirks.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/drm_panel_orientation_quirks.c b/drivers/gpu/drm/drm_panel_orientation_quirks.c index c16c7678237eab..a1dfeaae644df2 100644 --- a/drivers/gpu/drm/drm_panel_orientation_quirks.c +++ b/drivers/gpu/drm/drm_panel_orientation_quirks.c @@ -208,6 +208,12 @@ static const struct dmi_system_id orientation_data[] = { DMI_MATCH(DMI_BOARD_NAME, "KUN"), }, .driver_data = (void *)&lcd1600x2560_rightside_up, + }, { /* AYN Loki Zero */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "ayn"), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "Loki Zero"), + }, + .driver_data = (void *)&lcd1080x1920_leftside_up, }, { /* Chuwi HiBook (CWI514) */ .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "Hampoo"), From 2c71c8459c8ca66bd8f597effaac892ee8448a9f Mon Sep 17 00:00:00 2001 From: Bouke Sybren Haarsma Date: Sun, 28 Jul 2024 14:47:31 +0200 Subject: [PATCH 172/991] drm: panel-orientation-quirks: Add quirk for Ayn Loki Max Add quirk orientation for Ayn Loki Max model. This has been tested by JELOS team that uses their own patched kernel for a while now and confirmed by users in the ChimeraOS discord servers. Signed-off-by: Bouke Sybren Haarsma Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Link: https://patchwork.freedesktop.org/patch/msgid/20240728124731.168452-3-boukehaarsma23@gmail.com --- drivers/gpu/drm/drm_panel_orientation_quirks.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/drm_panel_orientation_quirks.c b/drivers/gpu/drm/drm_panel_orientation_quirks.c index a1dfeaae644df2..0830cae9a4d0f5 100644 --- a/drivers/gpu/drm/drm_panel_orientation_quirks.c +++ b/drivers/gpu/drm/drm_panel_orientation_quirks.c @@ -208,6 +208,12 @@ static const struct dmi_system_id orientation_data[] = { DMI_MATCH(DMI_BOARD_NAME, "KUN"), }, .driver_data = (void *)&lcd1600x2560_rightside_up, + }, { /* AYN Loki Max */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "ayn"), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "Loki Max"), + }, + .driver_data = (void *)&lcd1080x1920_leftside_up, }, { /* AYN Loki Zero */ .matches = { DMI_EXACT_MATCH(DMI_SYS_VENDOR, "ayn"), From e332a5aba83500e8d422c90d2a84d8a5f888673e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 8 Aug 2024 02:47:28 +0900 Subject: [PATCH 173/991] treewide: remove unnecessary inclusion These files do not use any macros defined in . Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier --- drivers/accessibility/speakup/genmap.c | 1 - drivers/accessibility/speakup/makemapdata.c | 1 - drivers/staging/media/atomisp/include/linux/atomisp.h | 1 - samples/trace_events/trace_custom_sched.c | 1 - sound/soc/codecs/cs42l42.c | 1 - 5 files changed, 5 deletions(-) diff --git a/drivers/accessibility/speakup/genmap.c b/drivers/accessibility/speakup/genmap.c index 0125000e00d9ab..0882bab10fb87a 100644 --- a/drivers/accessibility/speakup/genmap.c +++ b/drivers/accessibility/speakup/genmap.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include "utils.h" diff --git a/drivers/accessibility/speakup/makemapdata.c b/drivers/accessibility/speakup/makemapdata.c index d7d41bb9b05fbb..55e4ef8a93dc99 100644 --- a/drivers/accessibility/speakup/makemapdata.c +++ b/drivers/accessibility/speakup/makemapdata.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include "utils.h" diff --git a/drivers/staging/media/atomisp/include/linux/atomisp.h b/drivers/staging/media/atomisp/include/linux/atomisp.h index 16c9da172c0317..fefbe3cd08f33d 100644 --- a/drivers/staging/media/atomisp/include/linux/atomisp.h +++ b/drivers/staging/media/atomisp/include/linux/atomisp.h @@ -20,7 +20,6 @@ #define _ATOM_ISP_H #include -#include /* struct media_device_info.hw_revision */ #define ATOMISP_HW_REVISION_MASK 0x0000ff00 diff --git a/samples/trace_events/trace_custom_sched.c b/samples/trace_events/trace_custom_sched.c index b99d9ab7db858f..dd409b704b35b6 100644 --- a/samples/trace_events/trace_custom_sched.c +++ b/samples/trace_events/trace_custom_sched.c @@ -8,7 +8,6 @@ #define pr_fmt(fmt) fmt #include -#include #include #include diff --git a/sound/soc/codecs/cs42l42.c b/sound/soc/codecs/cs42l42.c index 60d366e53526ff..6400ac875e6f6c 100644 --- a/sound/soc/codecs/cs42l42.c +++ b/sound/soc/codecs/cs42l42.c @@ -11,7 +11,6 @@ #include #include -#include #include #include #include From 1472464c6248575bf2d01c7f076b94704bb32c95 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 8 Aug 2024 03:03:00 +0900 Subject: [PATCH 174/991] kbuild: avoid scripts/kallsyms parsing /dev/null On macOS, as reported by Daniel Gomez, getline() sets ENOTTY to errno if it is requested to read from /dev/null. If this is worth fixing, I would rather pass an empty file to scripts/kallsyms instead of adding the ugly #ifdef __APPLE__. Fixes: c442db3f49f2 ("kbuild: remove PROVIDE() for kallsyms symbols") Reported-by: Daniel Gomez Closes: https://lore.kernel.org/all/20240807-macos-build-support-v1-12-4cd1ded85694@samsung.com/ Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier Reviewed-by: Daniel Gomez --- scripts/link-vmlinux.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index f7b2503cdba956..41c68ae3415d5a 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -219,7 +219,8 @@ kallsymso= strip_debug= if is_enabled CONFIG_KALLSYMS; then - kallsyms /dev/null .tmp_vmlinux0.kallsyms + truncate -s0 .tmp_vmlinux.kallsyms0.syms + kallsyms .tmp_vmlinux.kallsyms0.syms .tmp_vmlinux0.kallsyms fi if is_enabled CONFIG_KALLSYMS || is_enabled CONFIG_DEBUG_INFO_BTF; then From a9a18e8f770c9b0703dab93580d0b02e199a4c79 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 9 Aug 2024 15:28:19 +0300 Subject: [PATCH 175/991] atm: idt77252: prevent use after free in dequeue_rx() We can't dereference "skb" after calling vcc->push() because the skb is released. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/atm/idt77252.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/atm/idt77252.c b/drivers/atm/idt77252.c index e7f713cd70d3fd..a876024d8a05f9 100644 --- a/drivers/atm/idt77252.c +++ b/drivers/atm/idt77252.c @@ -1118,8 +1118,8 @@ dequeue_rx(struct idt77252_dev *card, struct rsq_entry *rsqe) rpp->len += skb->len; if (stat & SAR_RSQE_EPDU) { + unsigned int len, truesize; unsigned char *l1l2; - unsigned int len; l1l2 = (unsigned char *) ((unsigned long) skb->data + skb->len - 6); @@ -1189,14 +1189,15 @@ dequeue_rx(struct idt77252_dev *card, struct rsq_entry *rsqe) ATM_SKB(skb)->vcc = vcc; __net_timestamp(skb); + truesize = skb->truesize; vcc->push(vcc, skb); atomic_inc(&vcc->stats->rx); - if (skb->truesize > SAR_FB_SIZE_3) + if (truesize > SAR_FB_SIZE_3) add_rx_skb(card, 3, SAR_FB_SIZE_3, 1); - else if (skb->truesize > SAR_FB_SIZE_2) + else if (truesize > SAR_FB_SIZE_2) add_rx_skb(card, 2, SAR_FB_SIZE_2, 1); - else if (skb->truesize > SAR_FB_SIZE_1) + else if (truesize > SAR_FB_SIZE_1) add_rx_skb(card, 1, SAR_FB_SIZE_1, 1); else add_rx_skb(card, 0, SAR_FB_SIZE_0, 1); From 9ff2f816e2aa65ca9a1cdf0954842f8173c0f48d Mon Sep 17 00:00:00 2001 From: Radhey Shyam Pandey Date: Fri, 9 Aug 2024 11:56:09 +0530 Subject: [PATCH 176/991] net: axienet: Fix register defines comment description In axiethernet header fix register defines comment description to be inline with IP documentation. It updates MAC configuration register, MDIO configuration register and frame filter control description. Fixes: 8a3b7a252dca ("drivers/net/ethernet/xilinx: added Xilinx AXI Ethernet driver") Signed-off-by: Radhey Shyam Pandey Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/xilinx/xilinx_axienet.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet.h b/drivers/net/ethernet/xilinx/xilinx_axienet.h index fa5500decc9602..c7d9221fafdcb1 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet.h +++ b/drivers/net/ethernet/xilinx/xilinx_axienet.h @@ -160,16 +160,16 @@ #define XAE_RCW1_OFFSET 0x00000404 /* Rx Configuration Word 1 */ #define XAE_TC_OFFSET 0x00000408 /* Tx Configuration */ #define XAE_FCC_OFFSET 0x0000040C /* Flow Control Configuration */ -#define XAE_EMMC_OFFSET 0x00000410 /* EMAC mode configuration */ -#define XAE_PHYC_OFFSET 0x00000414 /* RGMII/SGMII configuration */ +#define XAE_EMMC_OFFSET 0x00000410 /* MAC speed configuration */ +#define XAE_PHYC_OFFSET 0x00000414 /* RX Max Frame Configuration */ #define XAE_ID_OFFSET 0x000004F8 /* Identification register */ -#define XAE_MDIO_MC_OFFSET 0x00000500 /* MII Management Config */ -#define XAE_MDIO_MCR_OFFSET 0x00000504 /* MII Management Control */ -#define XAE_MDIO_MWD_OFFSET 0x00000508 /* MII Management Write Data */ -#define XAE_MDIO_MRD_OFFSET 0x0000050C /* MII Management Read Data */ +#define XAE_MDIO_MC_OFFSET 0x00000500 /* MDIO Setup */ +#define XAE_MDIO_MCR_OFFSET 0x00000504 /* MDIO Control */ +#define XAE_MDIO_MWD_OFFSET 0x00000508 /* MDIO Write Data */ +#define XAE_MDIO_MRD_OFFSET 0x0000050C /* MDIO Read Data */ #define XAE_UAW0_OFFSET 0x00000700 /* Unicast address word 0 */ #define XAE_UAW1_OFFSET 0x00000704 /* Unicast address word 1 */ -#define XAE_FMI_OFFSET 0x00000708 /* Filter Mask Index */ +#define XAE_FMI_OFFSET 0x00000708 /* Frame Filter Control */ #define XAE_AF0_OFFSET 0x00000710 /* Address Filter 0 */ #define XAE_AF1_OFFSET 0x00000714 /* Address Filter 1 */ @@ -308,7 +308,7 @@ */ #define XAE_UAW1_UNICASTADDR_MASK 0x0000FFFF -/* Bit masks for Axi Ethernet FMI register */ +/* Bit masks for Axi Ethernet FMC register */ #define XAE_FMI_PM_MASK 0x80000000 /* Promis. mode enable */ #define XAE_FMI_IND_MASK 0x00000003 /* Index Mask */ From 63796bc2e97cd5ebcef60bad4953259d4ad11cb4 Mon Sep 17 00:00:00 2001 From: Pawel Dembicki Date: Fri, 9 Aug 2024 21:38:02 +0200 Subject: [PATCH 177/991] net: dsa: vsc73xx: fix port MAC configuration in full duplex mode According to the datasheet description ("Port Mode Procedure" in 5.6.2), the VSC73XX_MAC_CFG_WEXC_DIS bit is configured only for half duplex mode. The WEXC_DIS bit is responsible for MAC behavior after an excessive collision. Let's set it as described in the datasheet. Fixes: 05bd97fc559d ("net: dsa: Add Vitesse VSC73xx DSA router driver") Reviewed-by: Linus Walleij Reviewed-by: Florian Fainelli Signed-off-by: Pawel Dembicki Signed-off-by: David S. Miller --- drivers/net/dsa/vitesse-vsc73xx-core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/vitesse-vsc73xx-core.c b/drivers/net/dsa/vitesse-vsc73xx-core.c index d9d3e30fd47ad4..f548ed4cb23f0c 100644 --- a/drivers/net/dsa/vitesse-vsc73xx-core.c +++ b/drivers/net/dsa/vitesse-vsc73xx-core.c @@ -957,6 +957,11 @@ static void vsc73xx_mac_link_up(struct phylink_config *config, if (duplex == DUPLEX_FULL) val |= VSC73XX_MAC_CFG_FDX; + else + /* In datasheet description ("Port Mode Procedure" in 5.6.2) + * this bit is configured only for half duplex. + */ + val |= VSC73XX_MAC_CFG_WEXC_DIS; /* This routine is described in the datasheet (below ARBDISC register * description) @@ -967,7 +972,6 @@ static void vsc73xx_mac_link_up(struct phylink_config *config, get_random_bytes(&seed, 1); val |= seed << VSC73XX_MAC_CFG_SEED_OFFSET; val |= VSC73XX_MAC_CFG_SEED_LOAD; - val |= VSC73XX_MAC_CFG_WEXC_DIS; /* Those bits are responsible for MTU only. Kernel takes care about MTU, * let's enable +8 bytes frame length unconditionally. From 5b9eebc2c7a5f0cc7950d918c1e8a4ad4bed5010 Mon Sep 17 00:00:00 2001 From: Pawel Dembicki Date: Fri, 9 Aug 2024 21:38:03 +0200 Subject: [PATCH 178/991] net: dsa: vsc73xx: pass value in phy_write operation In the 'vsc73xx_phy_write' function, the register value is missing, and the phy write operation always sends zeros. This commit passes the value variable into the proper register. Fixes: 05bd97fc559d ("net: dsa: Add Vitesse VSC73xx DSA router driver") Reviewed-by: Linus Walleij Reviewed-by: Florian Fainelli Signed-off-by: Pawel Dembicki Signed-off-by: David S. Miller --- drivers/net/dsa/vitesse-vsc73xx-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/vitesse-vsc73xx-core.c b/drivers/net/dsa/vitesse-vsc73xx-core.c index f548ed4cb23f0c..4b300c293dec47 100644 --- a/drivers/net/dsa/vitesse-vsc73xx-core.c +++ b/drivers/net/dsa/vitesse-vsc73xx-core.c @@ -574,7 +574,7 @@ static int vsc73xx_phy_write(struct dsa_switch *ds, int phy, int regnum, return 0; } - cmd = (phy << 21) | (regnum << 16); + cmd = (phy << 21) | (regnum << 16) | val; ret = vsc73xx_write(vsc, VSC73XX_BLOCK_MII, 0, 1, cmd); if (ret) return ret; From fa63c6434b6f6aaf9d8d599dc899bc0a074cc0ad Mon Sep 17 00:00:00 2001 From: Pawel Dembicki Date: Fri, 9 Aug 2024 21:38:04 +0200 Subject: [PATCH 179/991] net: dsa: vsc73xx: check busy flag in MDIO operations The VSC73xx has a busy flag used during MDIO operations. It is raised when MDIO read/write operations are in progress. Without it, PHYs are misconfigured and bus operations do not work as expected. Fixes: 05bd97fc559d ("net: dsa: Add Vitesse VSC73xx DSA router driver") Reviewed-by: Linus Walleij Reviewed-by: Florian Fainelli Signed-off-by: Pawel Dembicki Signed-off-by: David S. Miller --- drivers/net/dsa/vitesse-vsc73xx-core.c | 37 +++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/vitesse-vsc73xx-core.c b/drivers/net/dsa/vitesse-vsc73xx-core.c index 4b300c293dec47..a789b2da9b7d4a 100644 --- a/drivers/net/dsa/vitesse-vsc73xx-core.c +++ b/drivers/net/dsa/vitesse-vsc73xx-core.c @@ -40,6 +40,10 @@ #define VSC73XX_BLOCK_ARBITER 0x5 /* Only subblock 0 */ #define VSC73XX_BLOCK_SYSTEM 0x7 /* Only subblock 0 */ +/* MII Block subblock */ +#define VSC73XX_BLOCK_MII_INTERNAL 0x0 /* Internal MDIO subblock */ +#define VSC73XX_BLOCK_MII_EXTERNAL 0x1 /* External MDIO subblock */ + #define CPU_PORT 6 /* CPU port */ /* MAC Block registers */ @@ -225,6 +229,8 @@ #define VSC73XX_MII_CMD 0x1 #define VSC73XX_MII_DATA 0x2 +#define VSC73XX_MII_STAT_BUSY BIT(3) + /* Arbiter block 5 registers */ #define VSC73XX_ARBEMPTY 0x0c #define VSC73XX_ARBDISC 0x0e @@ -299,6 +305,7 @@ #define IS_739X(a) (IS_7395(a) || IS_7398(a)) #define VSC73XX_POLL_SLEEP_US 1000 +#define VSC73XX_MDIO_POLL_SLEEP_US 5 #define VSC73XX_POLL_TIMEOUT_US 10000 struct vsc73xx_counter { @@ -527,6 +534,22 @@ static int vsc73xx_detect(struct vsc73xx *vsc) return 0; } +static int vsc73xx_mdio_busy_check(struct vsc73xx *vsc) +{ + int ret, err; + u32 val; + + ret = read_poll_timeout(vsc73xx_read, err, + err < 0 || !(val & VSC73XX_MII_STAT_BUSY), + VSC73XX_MDIO_POLL_SLEEP_US, + VSC73XX_POLL_TIMEOUT_US, false, vsc, + VSC73XX_BLOCK_MII, VSC73XX_BLOCK_MII_INTERNAL, + VSC73XX_MII_STAT, &val); + if (ret) + return ret; + return err; +} + static int vsc73xx_phy_read(struct dsa_switch *ds, int phy, int regnum) { struct vsc73xx *vsc = ds->priv; @@ -534,12 +557,20 @@ static int vsc73xx_phy_read(struct dsa_switch *ds, int phy, int regnum) u32 val; int ret; + ret = vsc73xx_mdio_busy_check(vsc); + if (ret) + return ret; + /* Setting bit 26 means "read" */ cmd = BIT(26) | (phy << 21) | (regnum << 16); ret = vsc73xx_write(vsc, VSC73XX_BLOCK_MII, 0, 1, cmd); if (ret) return ret; - msleep(2); + + ret = vsc73xx_mdio_busy_check(vsc); + if (ret) + return ret; + ret = vsc73xx_read(vsc, VSC73XX_BLOCK_MII, 0, 2, &val); if (ret) return ret; @@ -563,6 +594,10 @@ static int vsc73xx_phy_write(struct dsa_switch *ds, int phy, int regnum, u32 cmd; int ret; + ret = vsc73xx_mdio_busy_check(vsc); + if (ret) + return ret; + /* It was found through tedious experiments that this router * chip really hates to have it's PHYs reset. They * never recover if that happens: autonegotiation stops From 9f9a72654622bae75adb1e1923d709e96ede3042 Mon Sep 17 00:00:00 2001 From: Pawel Dembicki Date: Fri, 9 Aug 2024 21:38:05 +0200 Subject: [PATCH 180/991] net: dsa: vsc73xx: allow phy resetting Resetting the VSC73xx PHY was problematic because the MDIO bus, without a busy check, read and wrote incorrect register values. My investigation indicates that resetting the PHY only triggers changes in configuration. However, improper register values written earlier were only exposed after a soft reset. The reset itself wasn't the issue; rather, the problem stemmed from incorrect read and write operations. A 'soft_reset' can now proceed normally. There are no reasons to keep the VSC73xx from being reset. This commit removes the reset blockade in the 'vsc73xx_phy_write' function. Reviewed-by: Linus Walleij Reviewed-by: Florian Fainelli Signed-off-by: Pawel Dembicki Signed-off-by: David S. Miller --- drivers/net/dsa/vitesse-vsc73xx-core.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/drivers/net/dsa/vitesse-vsc73xx-core.c b/drivers/net/dsa/vitesse-vsc73xx-core.c index a789b2da9b7d4a..e3f95d2cc2c168 100644 --- a/drivers/net/dsa/vitesse-vsc73xx-core.c +++ b/drivers/net/dsa/vitesse-vsc73xx-core.c @@ -598,17 +598,6 @@ static int vsc73xx_phy_write(struct dsa_switch *ds, int phy, int regnum, if (ret) return ret; - /* It was found through tedious experiments that this router - * chip really hates to have it's PHYs reset. They - * never recover if that happens: autonegotiation stops - * working after a reset. Just filter out this command. - * (Resetting the whole chip is OK.) - */ - if (regnum == 0 && (val & BIT(15))) { - dev_info(vsc->dev, "reset PHY - disallowed\n"); - return 0; - } - cmd = (phy << 21) | (regnum << 16) | val; ret = vsc73xx_write(vsc, VSC73XX_BLOCK_MII, 0, 1, cmd); if (ret) From de7a670f8defe4ed2115552ad23dea0f432f7be4 Mon Sep 17 00:00:00 2001 From: Pawel Dembicki Date: Fri, 9 Aug 2024 21:38:06 +0200 Subject: [PATCH 181/991] net: phy: vitesse: repair vsc73xx autonegotiation When the vsc73xx mdio bus work properly, the generic autonegotiation configuration works well. Reviewed-by: Linus Walleij Signed-off-by: Pawel Dembicki Signed-off-by: David S. Miller --- drivers/net/phy/vitesse.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/drivers/net/phy/vitesse.c b/drivers/net/phy/vitesse.c index 897b979ec03c81..3b5fcaf0dd36db 100644 --- a/drivers/net/phy/vitesse.c +++ b/drivers/net/phy/vitesse.c @@ -237,16 +237,6 @@ static int vsc739x_config_init(struct phy_device *phydev) return 0; } -static int vsc73xx_config_aneg(struct phy_device *phydev) -{ - /* The VSC73xx switches does not like to be instructed to - * do autonegotiation in any way, it prefers that you just go - * with the power-on/reset defaults. Writing some registers will - * just make autonegotiation permanently fail. - */ - return 0; -} - /* This adds a skew for both TX and RX clocks, so the skew should only be * applied to "rgmii-id" interfaces. It may not work as expected * on "rgmii-txid", "rgmii-rxid" or "rgmii" interfaces. @@ -444,7 +434,6 @@ static struct phy_driver vsc82xx_driver[] = { .phy_id_mask = 0x000ffff0, /* PHY_GBIT_FEATURES */ .config_init = vsc738x_config_init, - .config_aneg = vsc73xx_config_aneg, .read_page = vsc73xx_read_page, .write_page = vsc73xx_write_page, }, { @@ -453,7 +442,6 @@ static struct phy_driver vsc82xx_driver[] = { .phy_id_mask = 0x000ffff0, /* PHY_GBIT_FEATURES */ .config_init = vsc738x_config_init, - .config_aneg = vsc73xx_config_aneg, .read_page = vsc73xx_read_page, .write_page = vsc73xx_write_page, }, { @@ -462,7 +450,6 @@ static struct phy_driver vsc82xx_driver[] = { .phy_id_mask = 0x000ffff0, /* PHY_GBIT_FEATURES */ .config_init = vsc739x_config_init, - .config_aneg = vsc73xx_config_aneg, .read_page = vsc73xx_read_page, .write_page = vsc73xx_write_page, }, { @@ -471,7 +458,6 @@ static struct phy_driver vsc82xx_driver[] = { .phy_id_mask = 0x000ffff0, /* PHY_GBIT_FEATURES */ .config_init = vsc739x_config_init, - .config_aneg = vsc73xx_config_aneg, .read_page = vsc73xx_read_page, .write_page = vsc73xx_write_page, }, { From 8512fbb64b0e599412da661412d10d4ba1cb003c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Vok=C3=A1=C4=8D?= Date: Tue, 23 Jul 2024 16:25:19 +0200 Subject: [PATCH 182/991] ARM: dts: imx6dl-yapp43: Increase LED current to match the yapp4 HW design MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On the imx6dl-yapp4 revision based boards, the RGB LED is not driven directly by the LP5562 driver but through FET transistors. Hence the LED current is not determined by the driver but by the LED series resistors. On the imx6dl-yapp43 revision based boards, we removed the FET transistors to drive the LED directly from the LP5562 but forgot to tune the output current to match the previous HW design. Set the LED current on imx6dl-yapp43 based boards to the same values measured on the imx6dl-yapp4 boards and limit the maximum current to 20mA. Fixes: 7da4734751e0 ("ARM: dts: imx6dl-yapp43: Add support for new HW revision of the IOTA board") Cc: Signed-off-by: Michal Vokáč Signed-off-by: Shawn Guo --- arch/arm/boot/dts/nxp/imx/imx6dl-yapp43-common.dtsi | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm/boot/dts/nxp/imx/imx6dl-yapp43-common.dtsi b/arch/arm/boot/dts/nxp/imx/imx6dl-yapp43-common.dtsi index 52a0f6ee426f97..bcf4d9c870ec97 100644 --- a/arch/arm/boot/dts/nxp/imx/imx6dl-yapp43-common.dtsi +++ b/arch/arm/boot/dts/nxp/imx/imx6dl-yapp43-common.dtsi @@ -274,24 +274,24 @@ led@0 { chan-name = "R"; - led-cur = /bits/ 8 <0x20>; - max-cur = /bits/ 8 <0x60>; + led-cur = /bits/ 8 <0x6e>; + max-cur = /bits/ 8 <0xc8>; reg = <0>; color = ; }; led@1 { chan-name = "G"; - led-cur = /bits/ 8 <0x20>; - max-cur = /bits/ 8 <0x60>; + led-cur = /bits/ 8 <0xbe>; + max-cur = /bits/ 8 <0xc8>; reg = <1>; color = ; }; led@2 { chan-name = "B"; - led-cur = /bits/ 8 <0x20>; - max-cur = /bits/ 8 <0x60>; + led-cur = /bits/ 8 <0xbe>; + max-cur = /bits/ 8 <0xc8>; reg = <2>; color = ; }; From e7a9af8c93aa9f408f9972809b642faeec5287e1 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 8 Aug 2024 11:32:47 +0200 Subject: [PATCH 183/991] powerpc/mm: Fix size of allocated PGDIR Commit 6b0e82791bd0 ("powerpc/e500: switch to 64 bits PGD on 85xx (32 bits)") increased the size of PGD entries but failed to increase the PGD directory. Use the size of pgd_t instead of the size of pointers to calculate the allocated size. Reported-by: Guenter Roeck Fixes: 6b0e82791bd0 ("powerpc/e500: switch to 64 bits PGD on 85xx (32 bits)") Signed-off-by: Christophe Leroy Tested-by: Guenter Roeck Signed-off-by: Michael Ellerman Link: https://msgid.link/1cdaacb391cbd3e0240f0e0faf691202874e9422.1723109462.git.christophe.leroy@csgroup.eu --- arch/powerpc/mm/init-common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c index 9b4a675eb8f877..2978fcbe307eab 100644 --- a/arch/powerpc/mm/init-common.c +++ b/arch/powerpc/mm/init-common.c @@ -73,7 +73,7 @@ void setup_kup(void) #define CTOR(shift) static void ctor_##shift(void *addr) \ { \ - memset(addr, 0, sizeof(void *) << (shift)); \ + memset(addr, 0, sizeof(pgd_t) << (shift)); \ } CTOR(0); CTOR(1); CTOR(2); CTOR(3); CTOR(4); CTOR(5); CTOR(6); CTOR(7); @@ -117,7 +117,7 @@ EXPORT_SYMBOL_GPL(pgtable_cache); /* used by kvm_hv module */ void pgtable_cache_add(unsigned int shift) { char *name; - unsigned long table_size = sizeof(void *) << shift; + unsigned long table_size = sizeof(pgd_t) << shift; unsigned long align = table_size; /* When batching pgtable pointers for RCU freeing, we store From e7e846dc6c73fbc94ae8b4ec20d05627646416f2 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 8 Aug 2024 09:05:08 +0200 Subject: [PATCH 184/991] powerpc/mm: Fix boot warning with hugepages and CONFIG_DEBUG_VIRTUAL Booting with CONFIG_DEBUG_VIRTUAL leads to following warning when passing hugepage reservation on command line: Kernel command line: hugepagesz=1g hugepages=1 hugepagesz=64m hugepages=1 hugepagesz=256m hugepages=1 noreboot HugeTLB: allocating 1 of page size 1.00 GiB failed. Only allocated 0 hugepages. ------------[ cut here ]------------ WARNING: CPU: 0 PID: 0 at arch/powerpc/include/asm/io.h:948 __alloc_bootmem_huge_page+0xd4/0x284 Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 6.10.0-rc6-00396-g6b0e82791bd0-dirty #936 Hardware name: MPC8544DS e500v2 0x80210030 MPC8544 DS NIP: c1020240 LR: c10201d0 CTR: 00000000 REGS: c13fdd30 TRAP: 0700 Not tainted (6.10.0-rc6-00396-g6b0e82791bd0-dirty) MSR: 00021000 CR: 44084288 XER: 20000000 GPR00: c10201d0 c13fde20 c130b560 e8000000 e8001000 00000000 00000000 c1420000 GPR08: 00000000 00028001 00000000 00000004 44084282 01066ac0 c0eb7c9c efffe149 GPR16: c0fc4228 0000005f ffffffff c0eb7d0c c0eb7cc0 c0eb7ce0 ffffffff 00000000 GPR24: c1441cec efffe153 e8001000 c14240c0 00000000 c1441d64 00000000 e8000000 NIP [c1020240] __alloc_bootmem_huge_page+0xd4/0x284 LR [c10201d0] __alloc_bootmem_huge_page+0x64/0x284 Call Trace: [c13fde20] [c10201d0] __alloc_bootmem_huge_page+0x64/0x284 (unreliable) [c13fde50] [c10207b8] hugetlb_hstate_alloc_pages+0x8c/0x3e8 [c13fdeb0] [c1021384] hugepages_setup+0x240/0x2cc [c13fdef0] [c1000574] unknown_bootoption+0xfc/0x280 [c13fdf30] [c0078904] parse_args+0x200/0x4c4 [c13fdfa0] [c1000d9c] start_kernel+0x238/0x7d0 [c13fdff0] [c0000434] set_ivor+0x12c/0x168 Code: 554aa33e 7c042840 3ce0c142 80a7427c 5109a016 50caa016 7c9a2378 7fdcf378 4180000c 7c052040 41810160 7c095040 <0fe00000> 38c00000 40800108 3c60c0eb ---[ end trace 0000000000000000 ]--- This is due to virt_addr_valid() using high_memory before it is set. high_memory is set in mem_init() using max_low_pfn, but max_low_pfn is available long before, it is set in mem_topology_setup(). So just like commit daa9ada2093e ("powerpc/mm: Fix boot crash with FLATMEM") moved the setting of max_mapnr immediately after the call to mem_topology_setup(), the same can be done for high_memory. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/62b69c4baad067093f39e7e60df0fe27a86b8d2a.1723100702.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/setup-common.c | 1 + arch/powerpc/mm/mem.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 4bd2f87616baa9..943430077375a4 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -959,6 +959,7 @@ void __init setup_arch(char **cmdline_p) mem_topology_setup(); /* Set max_mapnr before paging_init() */ set_max_mapnr(max_pfn); + high_memory = (void *)__va(max_low_pfn * PAGE_SIZE); /* * Release secondary cpus out of their spinloops at 0x60 now that diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index d325217ab20121..da21cb018984eb 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -290,8 +290,6 @@ void __init mem_init(void) swiotlb_init(ppc_swiotlb_enable, ppc_swiotlb_flags); #endif - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); - kasan_late_init(); memblock_free_all(); From c25504a0ba36968f919aa30caff172ef23346299 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Fri, 9 Aug 2024 16:06:53 -0400 Subject: [PATCH 185/991] dt-bindings: net: fsl,qoriq-mc-dpmac: add missed property phys Add missed property phys, which indicate how connect to serdes phy. Fix below warning: arch/arm64/boot/dts/freescale/fsl-lx2160a-honeycomb.dtb: fsl-mc@80c000000: dpmacs:ethernet@7: Unevaluated properties are not allowed ('phys' was unexpected) Signed-off-by: Frank Li Reviewed-by: Krzysztof Kozlowski Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/fsl,qoriq-mc-dpmac.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/net/fsl,qoriq-mc-dpmac.yaml b/Documentation/devicetree/bindings/net/fsl,qoriq-mc-dpmac.yaml index a1b71b35319e70..42f9843d1868ac 100644 --- a/Documentation/devicetree/bindings/net/fsl,qoriq-mc-dpmac.yaml +++ b/Documentation/devicetree/bindings/net/fsl,qoriq-mc-dpmac.yaml @@ -38,6 +38,10 @@ properties: managed: true + phys: + description: A reference to the SerDes lane(s) + maxItems: 1 + required: - reg From 32316f676b4ee87c0404d333d248ccf777f739bc Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Fri, 9 Aug 2024 14:01:24 -0700 Subject: [PATCH 186/991] net: mana: Fix RX buf alloc_size alignment and atomic op panic The MANA driver's RX buffer alloc_size is passed into napi_build_skb() to create SKB. skb_shinfo(skb) is located at the end of skb, and its alignment is affected by the alloc_size passed into napi_build_skb(). The size needs to be aligned properly for better performance and atomic operations. Otherwise, on ARM64 CPU, for certain MTU settings like 4000, atomic operations may panic on the skb_shinfo(skb)->dataref due to alignment fault. To fix this bug, add proper alignment to the alloc_size calculation. Sample panic info: [ 253.298819] Unable to handle kernel paging request at virtual address ffff000129ba5cce [ 253.300900] Mem abort info: [ 253.301760] ESR = 0x0000000096000021 [ 253.302825] EC = 0x25: DABT (current EL), IL = 32 bits [ 253.304268] SET = 0, FnV = 0 [ 253.305172] EA = 0, S1PTW = 0 [ 253.306103] FSC = 0x21: alignment fault Call trace: __skb_clone+0xfc/0x198 skb_clone+0x78/0xe0 raw6_local_deliver+0xfc/0x228 ip6_protocol_deliver_rcu+0x80/0x500 ip6_input_finish+0x48/0x80 ip6_input+0x48/0xc0 ip6_sublist_rcv_finish+0x50/0x78 ip6_sublist_rcv+0x1cc/0x2b8 ipv6_list_rcv+0x100/0x150 __netif_receive_skb_list_core+0x180/0x220 netif_receive_skb_list_internal+0x198/0x2a8 __napi_poll+0x138/0x250 net_rx_action+0x148/0x330 handle_softirqs+0x12c/0x3a0 Cc: stable@vger.kernel.org Fixes: 80f6215b450e ("net: mana: Add support for jumbo frame") Signed-off-by: Haiyang Zhang Reviewed-by: Long Li Signed-off-by: David S. Miller --- drivers/net/ethernet/microsoft/mana/mana_en.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index d2f07e179e86ba..ae717d06e66f04 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -599,7 +599,11 @@ static void mana_get_rxbuf_cfg(int mtu, u32 *datasize, u32 *alloc_size, else *headroom = XDP_PACKET_HEADROOM; - *alloc_size = mtu + MANA_RXBUF_PAD + *headroom; + *alloc_size = SKB_DATA_ALIGN(mtu + MANA_RXBUF_PAD + *headroom); + + /* Using page pool in this case, so alloc_size is PAGE_SIZE */ + if (*alloc_size < PAGE_SIZE) + *alloc_size = PAGE_SIZE; *datasize = mtu + ETH_HLEN; } From db1b4bedb9b97c6d34b03d03815147c04fffe8b4 Mon Sep 17 00:00:00 2001 From: Zheng Zhang Date: Sat, 10 Aug 2024 13:26:51 +0800 Subject: [PATCH 187/991] net: ethernet: mtk_wed: fix use-after-free panic in mtk_wed_setup_tc_block_cb() When there are multiple ap interfaces on one band and with WED on, turning the interface down will cause a kernel panic on MT798X. Previously, cb_priv was freed in mtk_wed_setup_tc_block() without marking NULL,and mtk_wed_setup_tc_block_cb() didn't check the value, too. Assign NULL after free cb_priv in mtk_wed_setup_tc_block() and check NULL in mtk_wed_setup_tc_block_cb(). ---------- Unable to handle kernel paging request at virtual address 0072460bca32b4f5 Call trace: mtk_wed_setup_tc_block_cb+0x4/0x38 0xffffffc0794084bc tcf_block_playback_offloads+0x70/0x1e8 tcf_block_unbind+0x6c/0xc8 ... --------- Fixes: 799684448e3e ("net: ethernet: mtk_wed: introduce wed wo support") Signed-off-by: Zheng Zhang Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_wed.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c index 61334a71058c75..e212a4ba92751f 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed.c +++ b/drivers/net/ethernet/mediatek/mtk_wed.c @@ -2666,14 +2666,15 @@ mtk_wed_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_pri { struct mtk_wed_flow_block_priv *priv = cb_priv; struct flow_cls_offload *cls = type_data; - struct mtk_wed_hw *hw = priv->hw; + struct mtk_wed_hw *hw = NULL; - if (!tc_can_offload(priv->dev)) + if (!priv || !tc_can_offload(priv->dev)) return -EOPNOTSUPP; if (type != TC_SETUP_CLSFLOWER) return -EOPNOTSUPP; + hw = priv->hw; return mtk_flow_offload_cmd(hw->eth, cls, hw->index); } @@ -2729,6 +2730,7 @@ mtk_wed_setup_tc_block(struct mtk_wed_hw *hw, struct net_device *dev, flow_block_cb_remove(block_cb, f); list_del(&block_cb->driver_list); kfree(block_cb->cb_priv); + block_cb->cb_priv = NULL; } return 0; default: From 497d370a644d95a9f04271aa92cb96d32e84c770 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ma=C3=ADra=20Canal?= Date: Fri, 9 Aug 2024 12:18:45 -0300 Subject: [PATCH 188/991] drm/v3d: Fix out-of-bounds read in `v3d_csd_job_run()` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When enabling UBSAN on Raspberry Pi 5, we get the following warning: [ 387.894977] UBSAN: array-index-out-of-bounds in drivers/gpu/drm/v3d/v3d_sched.c:320:3 [ 387.903868] index 7 is out of range for type '__u32 [7]' [ 387.909692] CPU: 0 PID: 1207 Comm: kworker/u16:2 Tainted: G WC 6.10.3-v8-16k-numa #151 [ 387.919166] Hardware name: Raspberry Pi 5 Model B Rev 1.0 (DT) [ 387.925961] Workqueue: v3d_csd drm_sched_run_job_work [gpu_sched] [ 387.932525] Call trace: [ 387.935296] dump_backtrace+0x170/0x1b8 [ 387.939403] show_stack+0x20/0x38 [ 387.942907] dump_stack_lvl+0x90/0xd0 [ 387.946785] dump_stack+0x18/0x28 [ 387.950301] __ubsan_handle_out_of_bounds+0x98/0xd0 [ 387.955383] v3d_csd_job_run+0x3a8/0x438 [v3d] [ 387.960707] drm_sched_run_job_work+0x520/0x6d0 [gpu_sched] [ 387.966862] process_one_work+0x62c/0xb48 [ 387.971296] worker_thread+0x468/0x5b0 [ 387.975317] kthread+0x1c4/0x1e0 [ 387.978818] ret_from_fork+0x10/0x20 [ 387.983014] ---[ end trace ]--- This happens because the UAPI provides only seven configuration registers and we are reading the eighth position of this u32 array. Therefore, fix the out-of-bounds read in `v3d_csd_job_run()` by accessing only seven positions on the '__u32 [7]' array. The eighth register exists indeed on V3D 7.1, but it isn't currently used. That being so, let's guarantee that it remains unused and add a note that it could be set in a future patch. Fixes: 0ad5bc1ce463 ("drm/v3d: fix up register addresses for V3D 7.x") Reported-by: Tvrtko Ursulin Signed-off-by: Maíra Canal Reviewed-by: Iago Toral Quiroga Link: https://patchwork.freedesktop.org/patch/msgid/20240809152001.668314-1-mcanal@igalia.com --- drivers/gpu/drm/v3d/v3d_sched.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c index 9bd7453b25ad23..b8682818bafa6c 100644 --- a/drivers/gpu/drm/v3d/v3d_sched.c +++ b/drivers/gpu/drm/v3d/v3d_sched.c @@ -315,7 +315,7 @@ v3d_csd_job_run(struct drm_sched_job *sched_job) struct v3d_dev *v3d = job->base.v3d; struct drm_device *dev = &v3d->drm; struct dma_fence *fence; - int i, csd_cfg0_reg, csd_cfg_reg_count; + int i, csd_cfg0_reg; v3d->csd_job = job; @@ -335,9 +335,17 @@ v3d_csd_job_run(struct drm_sched_job *sched_job) v3d_switch_perfmon(v3d, &job->base); csd_cfg0_reg = V3D_CSD_QUEUED_CFG0(v3d->ver); - csd_cfg_reg_count = v3d->ver < 71 ? 6 : 7; - for (i = 1; i <= csd_cfg_reg_count; i++) + for (i = 1; i <= 6; i++) V3D_CORE_WRITE(0, csd_cfg0_reg + 4 * i, job->args.cfg[i]); + + /* Although V3D 7.1 has an eighth configuration register, we are not + * using it. Therefore, make sure it remains unused. + * + * XXX: Set the CFG7 register + */ + if (v3d->ver >= 71) + V3D_CORE_WRITE(0, V3D_V7_CSD_QUEUED_CFG7, 0); + /* CFG0 write kicks off the job. */ V3D_CORE_WRITE(0, csd_cfg0_reg, job->args.cfg[0]); From 2a07bb64d80152701d507b1498237ed1b8d83866 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Mon, 12 Aug 2024 14:57:32 +0200 Subject: [PATCH 189/991] s390/dasd: Remove DMA alignment This reverts commit bc792884b76f ("s390/dasd: Establish DMA alignment"). Quoting the original commit: linux-next commit bf8d08532bc1 ("iomap: add support for dma aligned direct-io") changes the alignment requirement to come from the block device rather than the block size, and the default alignment requirement is 512-byte boundaries. Since DASD I/O has page alignments for IDAW/TIDAW requests, let's override this value to restore the expected behavior. I mentioned TIDAW, but that was wrong. TIDAWs have no distinct alignment requirement (per p. 15-70 of POPS SA22-7832-13): Unless otherwise specified, TIDAWs may designate a block of main storage on any boundary and length up to 4K bytes, provided the specified block does not cross a 4 K-byte boundary. IDAWs do, but the original commit neglected that while ECKD DASD are typically formatted in 4096-byte blocks, they don't HAVE to be. Formatting an ECKD volume with smaller blocks is permitted (dasdfmt -b xxx), and the problematic commit enforces alignment properties to such a device that will result in errors, such as: [test@host ~]# lsdasd -l a367 | grep blksz blksz: 512 [test@host ~]# mkfs.xfs -f /dev/disk/by-path/ccw-0.0.a367-part1 meta-data=/dev/dasdc1 isize=512 agcount=4, agsize=230075 blks = sectsz=512 attr=2, projid32bit=1 = crc=1 finobt=1, sparse=1, rmapbt=1 = reflink=1 bigtime=1 inobtcount=1 nrext64=1 data = bsize=4096 blocks=920299, imaxpct=25 = sunit=0 swidth=0 blks naming =version 2 bsize=4096 ascii-ci=0, ftype=1 log =internal log bsize=4096 blocks=16384, version=2 = sectsz=512 sunit=0 blks, lazy-count=1 realtime =none extsz=4096 blocks=0, rtextents=0 error reading existing superblock: Invalid argument mkfs.xfs: pwrite failed: Invalid argument libxfs_bwrite: write failed on (unknown) bno 0x70565c/0x100, err=22 mkfs.xfs: Releasing dirty buffer to free list! found dirty buffer (bulk) on free list! mkfs.xfs: pwrite failed: Invalid argument ...snipped... The original commit omitted the FBA discipline for just this reason, but the formatted block size of the other disciplines was overlooked. The solution to all of this is to revert to the original behavior, such that the block size can be respected. There were two commits [1] that moved this code in the interim, so a straight git-revert is not possible, but the change is straightforward. But what of the original problem? That was manifested with a direct-io QEMU guest, where QEMU itself was changed a month or two later with commit 25474d90aa ("block: use the request length for iov alignment") such that the blamed kernel commit is unnecessary. [1] commit 0127a47f58c6 ("dasd: move queue setup to common code") commit fde07a4d74e3 ("dasd: use the atomic queue limits API") Fixes: bc792884b76f ("s390/dasd: Establish DMA alignment") Reviewed-by: Stefan Haberland Signed-off-by: Eric Farman Signed-off-by: Stefan Haberland Link: https://lore.kernel.org/r/20240812125733.126431-2-sth@linux.ibm.com Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_genhd.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index 1aa426b1deddc7..6da47a65af610f 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -41,7 +41,6 @@ int dasd_gendisk_alloc(struct dasd_block *block) */ .max_segment_size = PAGE_SIZE, .seg_boundary_mask = PAGE_SIZE - 1, - .dma_alignment = PAGE_SIZE - 1, .max_segments = USHRT_MAX, }; struct gendisk *gdp; From 7db4042336580dfd75cb5faa82c12cd51098c90b Mon Sep 17 00:00:00 2001 From: Stefan Haberland Date: Mon, 12 Aug 2024 14:57:33 +0200 Subject: [PATCH 190/991] s390/dasd: fix error recovery leading to data corruption on ESE devices Extent Space Efficient (ESE) or thin provisioned volumes need to be formatted on demand during usual IO processing. The dasd_ese_needs_format function checks for error codes that signal the non existence of a proper track format. The check for incorrect length is to imprecise since other error cases leading to transport of insufficient data also have this flag set. This might lead to data corruption in certain error cases for example during a storage server warmstart. Fix by removing the check for incorrect length and replacing by explicitly checking for invalid track format in transport mode. Also remove the check for file protected since this is not a valid ESE handling case. Cc: stable@vger.kernel.org # 5.3+ Fixes: 5e2b17e712cf ("s390/dasd: Add dynamic formatting support for ESE volumes") Reviewed-by: Jan Hoeppner Signed-off-by: Stefan Haberland Link: https://lore.kernel.org/r/20240812125733.126431-3-sth@linux.ibm.com Signed-off-by: Jens Axboe --- drivers/s390/block/dasd.c | 36 ++++++++++++------- drivers/s390/block/dasd_3990_erp.c | 10 ++---- drivers/s390/block/dasd_eckd.c | 55 +++++++++++++----------------- drivers/s390/block/dasd_int.h | 2 +- 4 files changed, 50 insertions(+), 53 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 0a97cfedd7060a..42a4a996defbe1 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -1601,9 +1601,15 @@ static int dasd_ese_needs_format(struct dasd_block *block, struct irb *irb) if (!sense) return 0; - return !!(sense[1] & SNS1_NO_REC_FOUND) || - !!(sense[1] & SNS1_FILE_PROTECTED) || - scsw_cstat(&irb->scsw) == SCHN_STAT_INCORR_LEN; + if (sense[1] & SNS1_NO_REC_FOUND) + return 1; + + if ((sense[1] & SNS1_INV_TRACK_FORMAT) && + scsw_is_tm(&irb->scsw) && + !(sense[2] & SNS2_ENV_DATA_PRESENT)) + return 1; + + return 0; } static int dasd_ese_oos_cond(u8 *sense) @@ -1624,7 +1630,7 @@ void dasd_int_handler(struct ccw_device *cdev, unsigned long intparm, struct dasd_device *device; unsigned long now; int nrf_suppressed = 0; - int fp_suppressed = 0; + int it_suppressed = 0; struct request *req; u8 *sense = NULL; int expires; @@ -1679,8 +1685,9 @@ void dasd_int_handler(struct ccw_device *cdev, unsigned long intparm, */ sense = dasd_get_sense(irb); if (sense) { - fp_suppressed = (sense[1] & SNS1_FILE_PROTECTED) && - test_bit(DASD_CQR_SUPPRESS_FP, &cqr->flags); + it_suppressed = (sense[1] & SNS1_INV_TRACK_FORMAT) && + !(sense[2] & SNS2_ENV_DATA_PRESENT) && + test_bit(DASD_CQR_SUPPRESS_IT, &cqr->flags); nrf_suppressed = (sense[1] & SNS1_NO_REC_FOUND) && test_bit(DASD_CQR_SUPPRESS_NRF, &cqr->flags); @@ -1695,7 +1702,7 @@ void dasd_int_handler(struct ccw_device *cdev, unsigned long intparm, return; } } - if (!(fp_suppressed || nrf_suppressed)) + if (!(it_suppressed || nrf_suppressed)) device->discipline->dump_sense_dbf(device, irb, "int"); if (device->features & DASD_FEATURE_ERPLOG) @@ -2459,14 +2466,17 @@ static int _dasd_sleep_on_queue(struct list_head *ccw_queue, int interruptible) rc = 0; list_for_each_entry_safe(cqr, n, ccw_queue, blocklist) { /* - * In some cases the 'File Protected' or 'Incorrect Length' - * error might be expected and error recovery would be - * unnecessary in these cases. Check if the according suppress - * bit is set. + * In some cases certain errors might be expected and + * error recovery would be unnecessary in these cases. + * Check if the according suppress bit is set. */ sense = dasd_get_sense(&cqr->irb); - if (sense && sense[1] & SNS1_FILE_PROTECTED && - test_bit(DASD_CQR_SUPPRESS_FP, &cqr->flags)) + if (sense && (sense[1] & SNS1_INV_TRACK_FORMAT) && + !(sense[2] & SNS2_ENV_DATA_PRESENT) && + test_bit(DASD_CQR_SUPPRESS_IT, &cqr->flags)) + continue; + if (sense && (sense[1] & SNS1_NO_REC_FOUND) && + test_bit(DASD_CQR_SUPPRESS_NRF, &cqr->flags)) continue; if (scsw_cstat(&cqr->irb.scsw) == 0x40 && test_bit(DASD_CQR_SUPPRESS_IL, &cqr->flags)) diff --git a/drivers/s390/block/dasd_3990_erp.c b/drivers/s390/block/dasd_3990_erp.c index bbbacfc386f28d..d0aa267462c50a 100644 --- a/drivers/s390/block/dasd_3990_erp.c +++ b/drivers/s390/block/dasd_3990_erp.c @@ -1386,14 +1386,8 @@ dasd_3990_erp_file_prot(struct dasd_ccw_req * erp) struct dasd_device *device = erp->startdev; - /* - * In some cases the 'File Protected' error might be expected and - * log messages shouldn't be written then. - * Check if the according suppress bit is set. - */ - if (!test_bit(DASD_CQR_SUPPRESS_FP, &erp->flags)) - dev_err(&device->cdev->dev, - "Accessing the DASD failed because of a hardware error\n"); + dev_err(&device->cdev->dev, + "Accessing the DASD failed because of a hardware error\n"); return dasd_3990_erp_cleanup(erp, DASD_CQR_FAILED); diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 9388b5c383cab8..90b106408992d0 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -2275,6 +2275,7 @@ dasd_eckd_analysis_ccw(struct dasd_device *device) cqr->status = DASD_CQR_FILLED; /* Set flags to suppress output for expected errors */ set_bit(DASD_CQR_SUPPRESS_NRF, &cqr->flags); + set_bit(DASD_CQR_SUPPRESS_IT, &cqr->flags); return cqr; } @@ -2556,7 +2557,6 @@ dasd_eckd_build_check_tcw(struct dasd_device *base, struct format_data_t *fdata, cqr->buildclk = get_tod_clock(); cqr->status = DASD_CQR_FILLED; /* Set flags to suppress output for expected errors */ - set_bit(DASD_CQR_SUPPRESS_FP, &cqr->flags); set_bit(DASD_CQR_SUPPRESS_IL, &cqr->flags); return cqr; @@ -4130,8 +4130,6 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_cmd_single( /* Set flags to suppress output for expected errors */ if (dasd_eckd_is_ese(basedev)) { - set_bit(DASD_CQR_SUPPRESS_FP, &cqr->flags); - set_bit(DASD_CQR_SUPPRESS_IL, &cqr->flags); set_bit(DASD_CQR_SUPPRESS_NRF, &cqr->flags); } @@ -4633,9 +4631,8 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_tpm_track( /* Set flags to suppress output for expected errors */ if (dasd_eckd_is_ese(basedev)) { - set_bit(DASD_CQR_SUPPRESS_FP, &cqr->flags); - set_bit(DASD_CQR_SUPPRESS_IL, &cqr->flags); set_bit(DASD_CQR_SUPPRESS_NRF, &cqr->flags); + set_bit(DASD_CQR_SUPPRESS_IT, &cqr->flags); } return cqr; @@ -5780,36 +5777,32 @@ static void dasd_eckd_dump_sense(struct dasd_device *device, { u8 *sense = dasd_get_sense(irb); - if (scsw_is_tm(&irb->scsw)) { - /* - * In some cases the 'File Protected' or 'Incorrect Length' - * error might be expected and log messages shouldn't be written - * then. Check if the according suppress bit is set. - */ - if (sense && (sense[1] & SNS1_FILE_PROTECTED) && - test_bit(DASD_CQR_SUPPRESS_FP, &req->flags)) - return; - if (scsw_cstat(&irb->scsw) == 0x40 && - test_bit(DASD_CQR_SUPPRESS_IL, &req->flags)) - return; + /* + * In some cases certain errors might be expected and + * log messages shouldn't be written then. + * Check if the according suppress bit is set. + */ + if (sense && (sense[1] & SNS1_INV_TRACK_FORMAT) && + !(sense[2] & SNS2_ENV_DATA_PRESENT) && + test_bit(DASD_CQR_SUPPRESS_IT, &req->flags)) + return; - dasd_eckd_dump_sense_tcw(device, req, irb); - } else { - /* - * In some cases the 'Command Reject' or 'No Record Found' - * error might be expected and log messages shouldn't be - * written then. Check if the according suppress bit is set. - */ - if (sense && sense[0] & SNS0_CMD_REJECT && - test_bit(DASD_CQR_SUPPRESS_CR, &req->flags)) - return; + if (sense && sense[0] & SNS0_CMD_REJECT && + test_bit(DASD_CQR_SUPPRESS_CR, &req->flags)) + return; - if (sense && sense[1] & SNS1_NO_REC_FOUND && - test_bit(DASD_CQR_SUPPRESS_NRF, &req->flags)) - return; + if (sense && sense[1] & SNS1_NO_REC_FOUND && + test_bit(DASD_CQR_SUPPRESS_NRF, &req->flags)) + return; + if (scsw_cstat(&irb->scsw) == 0x40 && + test_bit(DASD_CQR_SUPPRESS_IL, &req->flags)) + return; + + if (scsw_is_tm(&irb->scsw)) + dasd_eckd_dump_sense_tcw(device, req, irb); + else dasd_eckd_dump_sense_ccw(device, req, irb); - } } static int dasd_eckd_reload_device(struct dasd_device *device) diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index e5f40536b42540..81cfb5c89681bc 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -196,7 +196,7 @@ struct dasd_ccw_req { * The following flags are used to suppress output of certain errors. */ #define DASD_CQR_SUPPRESS_NRF 4 /* Suppress 'No Record Found' error */ -#define DASD_CQR_SUPPRESS_FP 5 /* Suppress 'File Protected' error*/ +#define DASD_CQR_SUPPRESS_IT 5 /* Suppress 'Invalid Track' error*/ #define DASD_CQR_SUPPRESS_IL 6 /* Suppress 'Incorrect Length' error */ #define DASD_CQR_SUPPRESS_CR 7 /* Suppress 'Command Reject' error */ From 84f2eecf95018386c145ada19bb45b03bdb80d9e Mon Sep 17 00:00:00 2001 From: Olivier Langlois Date: Sun, 11 Aug 2024 14:07:11 -0400 Subject: [PATCH 191/991] io_uring/napi: check napi_enabled in io_napi_add() before proceeding doing so avoids the overhead of adding napi ids to all the rings that do not enable napi. if no id is added to napi_list because napi is disabled, __io_napi_busy_loop() will not be called. Signed-off-by: Olivier Langlois Fixes: b4ccc4dd1330 ("io_uring/napi: enable even with a timeout of 0") Link: https://lore.kernel.org/r/bd989ccef5fda14f5fd9888faf4fefcf66bd0369.1723400131.git.olivier@trillion01.com Signed-off-by: Jens Axboe --- io_uring/napi.c | 2 +- io_uring/napi.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/napi.c b/io_uring/napi.c index a3dc3762008fa5..73c4159e8405b3 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -302,7 +302,7 @@ void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll); - if (!(ctx->flags & IORING_SETUP_SQPOLL) && ctx->napi_enabled) + if (!(ctx->flags & IORING_SETUP_SQPOLL)) io_napi_blocking_busy_loop(ctx, iowq); } diff --git a/io_uring/napi.h b/io_uring/napi.h index 88f1c21d5548ff..27b88c3eb42894 100644 --- a/io_uring/napi.h +++ b/io_uring/napi.h @@ -55,7 +55,7 @@ static inline void io_napi_add(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; struct socket *sock; - if (!READ_ONCE(ctx->napi_busy_poll_dt)) + if (!READ_ONCE(ctx->napi_enabled)) return; sock = sock_from_file(req->file); From 48cc7ecd3a68e0fbfa281ef1ed6f6b6cb7638390 Mon Sep 17 00:00:00 2001 From: Olivier Langlois Date: Sun, 11 Aug 2024 20:34:46 -0400 Subject: [PATCH 192/991] io_uring/napi: remove duplicate io_napi_entry timeout assignation io_napi_entry() has 2 calling sites. One of them is unlikely to find an entry and if it does, the timeout should arguable not be updated. The other io_napi_entry() calling site is overwriting the update made by io_napi_entry() so the io_napi_entry() timeout value update has no or little value and therefore is removed. Signed-off-by: Olivier Langlois Link: https://lore.kernel.org/r/145b54ff179f87609e20dffaf5563c07cdbcad1a.1723423275.git.olivier@trillion01.com Signed-off-by: Jens Axboe --- io_uring/napi.c | 1 - 1 file changed, 1 deletion(-) diff --git a/io_uring/napi.c b/io_uring/napi.c index 73c4159e8405b3..1de1d4d629254c 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -26,7 +26,6 @@ static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list, hlist_for_each_entry_rcu(e, hash_list, node) { if (e->napi_id != napi_id) continue; - e->timeout = jiffies + NAPI_TIMEOUT; return e; } From 054308ad90ae43ba2d4b9c83c6582e8fe94f6fed Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Mon, 12 Aug 2024 11:27:07 +0530 Subject: [PATCH 193/991] MAINTAINERS: Add Manivannan Sadhasivam as Reviewer for PCI native host bridge and endpoint drivers I've been reviewing the native host bridge drivers for some time and would like to be listed as a Reviewer formally. Link: https://lore.kernel.org/r/20240812055707.6778-1-manivannan.sadhasivam@linaro.org Signed-off-by: Manivannan Sadhasivam Signed-off-by: Bjorn Helgaas --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 42decde3832066..3fb27f41515d51 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17616,6 +17616,7 @@ F: drivers/pci/controller/pci-xgene-msi.c PCI NATIVE HOST BRIDGE AND ENDPOINT DRIVERS M: Lorenzo Pieralisi M: Krzysztof Wilczyński +R: Manivannan Sadhasivam R: Rob Herring L: linux-pci@vger.kernel.org S: Supported From 98055bc3595500bcf2126b93b1595354bdb86a66 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 27 May 2024 21:17:32 +0100 Subject: [PATCH 194/991] netfs: Fault in smaller chunks for non-large folio mappings As in commit 4e527d5841e2 ("iomap: fault in smaller chunks for non-large folio mappings"), we can see a performance loss for filesystems which have not yet been converted to large folios. Fixes: c38f4e96e605 ("netfs: Provide func to copy data to pagecache for buffered write") Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20240527201735.1898381-1-willy@infradead.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/netfs/buffered_write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 4726c315453c3a..ca53c5d1622ede 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -184,7 +184,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? BDP_ASYNC : 0; ssize_t written = 0, ret, ret2; loff_t i_size, pos = iocb->ki_pos, from, to; - size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER; + size_t max_chunk = mapping_max_folio_size(mapping); bool maybe_trouble = false; if (unlikely(test_bit(NETFS_ICTX_WRITETHROUGH, &ctx->flags) || From 3f65f3c099bcb27949e712f39ba836f21785924a Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 29 Jul 2024 15:48:12 -0700 Subject: [PATCH 195/991] filelock: fix name of file_lease slab cache When struct file_lease was split out from struct file_lock, the name of the file_lock slab cache was copied to the new slab cache for file_lease. This name conflict causes confusion in /proc/slabinfo and /sys/kernel/slab. In particular, it caused failures in drgn's test case for slab cache merging. Link: https://github.com/osandov/drgn/blob/9ad29fd86499eb32847473e928b6540872d3d59a/tests/linux_kernel/helpers/test_slab.py#L81 Fixes: c69ff4071935 ("filelock: split leases out of struct file_lock") Signed-off-by: Omar Sandoval Link: https://lore.kernel.org/r/2d1d053da1cafb3e7940c4f25952da4f0af34e38.1722293276.git.osandov@fb.com Reviewed-by: Chuck Lever Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/locks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/locks.c b/fs/locks.c index 9afb16e0683ffd..e45cad40f8b6bd 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2984,7 +2984,7 @@ static int __init filelock_init(void) filelock_cache = kmem_cache_create("file_lock_cache", sizeof(struct file_lock), 0, SLAB_PANIC, NULL); - filelease_cache = kmem_cache_create("file_lock_cache", + filelease_cache = kmem_cache_create("file_lease_cache", sizeof(struct file_lease), 0, SLAB_PANIC, NULL); for_each_possible_cpu(i) { From f71aa06398aabc2e3eaac25acdf3d62e0094ba70 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 29 Jul 2024 17:19:30 +0100 Subject: [PATCH 196/991] fs/netfs/fscache_cookie: add missing "n_accesses" check This fixes a NULL pointer dereference bug due to a data race which looks like this: BUG: kernel NULL pointer dereference, address: 0000000000000008 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI CPU: 33 PID: 16573 Comm: kworker/u97:799 Not tainted 6.8.7-cm4all1-hp+ #43 Hardware name: HP ProLiant DL380 Gen9/ProLiant DL380 Gen9, BIOS P89 10/17/2018 Workqueue: events_unbound netfs_rreq_write_to_cache_work RIP: 0010:cachefiles_prepare_write+0x30/0xa0 Code: 57 41 56 45 89 ce 41 55 49 89 cd 41 54 49 89 d4 55 53 48 89 fb 48 83 ec 08 48 8b 47 08 48 83 7f 10 00 48 89 34 24 48 8b 68 20 <48> 8b 45 08 4c 8b 38 74 45 49 8b 7f 50 e8 4e a9 b0 ff 48 8b 73 10 RSP: 0018:ffffb4e78113bde0 EFLAGS: 00010286 RAX: ffff976126be6d10 RBX: ffff97615cdb8438 RCX: 0000000000020000 RDX: ffff97605e6c4c68 RSI: ffff97605e6c4c60 RDI: ffff97615cdb8438 RBP: 0000000000000000 R08: 0000000000278333 R09: 0000000000000001 R10: ffff97605e6c4600 R11: 0000000000000001 R12: ffff97605e6c4c68 R13: 0000000000020000 R14: 0000000000000001 R15: ffff976064fe2c00 FS: 0000000000000000(0000) GS:ffff9776dfd40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000008 CR3: 000000005942c002 CR4: 00000000001706f0 Call Trace: ? __die+0x1f/0x70 ? page_fault_oops+0x15d/0x440 ? search_module_extables+0xe/0x40 ? fixup_exception+0x22/0x2f0 ? exc_page_fault+0x5f/0x100 ? asm_exc_page_fault+0x22/0x30 ? cachefiles_prepare_write+0x30/0xa0 netfs_rreq_write_to_cache_work+0x135/0x2e0 process_one_work+0x137/0x2c0 worker_thread+0x2e9/0x400 ? __pfx_worker_thread+0x10/0x10 kthread+0xcc/0x100 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x30/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1b/0x30 Modules linked in: CR2: 0000000000000008 ---[ end trace 0000000000000000 ]--- This happened because fscache_cookie_state_machine() was slow and was still running while another process invoked fscache_unuse_cookie(); this led to a fscache_cookie_lru_do_one() call, setting the FSCACHE_COOKIE_DO_LRU_DISCARD flag, which was picked up by fscache_cookie_state_machine(), withdrawing the cookie via cachefiles_withdraw_cookie(), clearing cookie->cache_priv. At the same time, yet another process invoked cachefiles_prepare_write(), which found a NULL pointer in this code line: struct cachefiles_object *object = cachefiles_cres_object(cres); The next line crashes, obviously: struct cachefiles_cache *cache = object->volume->cache; During cachefiles_prepare_write(), the "n_accesses" counter is non-zero (via fscache_begin_operation()). The cookie must not be withdrawn until it drops to zero. The counter is checked by fscache_cookie_state_machine() before switching to FSCACHE_COOKIE_STATE_RELINQUISHING and FSCACHE_COOKIE_STATE_WITHDRAWING (in "case FSCACHE_COOKIE_STATE_FAILED"), but not for FSCACHE_COOKIE_STATE_LRU_DISCARDING ("case FSCACHE_COOKIE_STATE_ACTIVE"). This patch adds the missing check. With a non-zero access counter, the function returns and the next fscache_end_cookie_access() call will queue another fscache_cookie_state_machine() call to handle the still-pending FSCACHE_COOKIE_DO_LRU_DISCARD. Fixes: 12bb21a29c19 ("fscache: Implement cookie user counting and resource pinning") Signed-off-by: Max Kellermann Signed-off-by: David Howells Link: https://lore.kernel.org/r/20240729162002.3436763-2-dhowells@redhat.com cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: stable@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/fscache_cookie.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/netfs/fscache_cookie.c b/fs/netfs/fscache_cookie.c index bce2492186d0b4..d4d4b3a8b10603 100644 --- a/fs/netfs/fscache_cookie.c +++ b/fs/netfs/fscache_cookie.c @@ -741,6 +741,10 @@ static void fscache_cookie_state_machine(struct fscache_cookie *cookie) spin_lock(&cookie->lock); } if (test_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags)) { + if (atomic_read(&cookie->n_accesses) != 0) + /* still being accessed: postpone it */ + break; + __fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_LRU_DISCARDING); wake = true; From 42b0f8da3acc87953161baeb24f756936eb4d4b2 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 31 Jul 2024 07:47:27 +0200 Subject: [PATCH 197/991] nsfs: fix ioctl declaration The kernel is writing an object of type __u64, so the ioctl has to be defined to _IOR(NSIO, 0x5, __u64) instead of _IO(NSIO, 0x5). Reported-by: Dmitry V. Levin Link: https://lore.kernel.org/r/20240730164554.GA18486@altlinux.org Signed-off-by: Christian Brauner --- include/uapi/linux/nsfs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index b133211331f6a6..5fad3d0fcd7078 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -3,6 +3,7 @@ #define __LINUX_NSFS_H #include +#include #define NSIO 0xb7 @@ -16,7 +17,7 @@ /* Get owner UID (in the caller's user namespace) for a user namespace */ #define NS_GET_OWNER_UID _IO(NSIO, 0x4) /* Get the id for a mount namespace */ -#define NS_GET_MNTNS_ID _IO(NSIO, 0x5) +#define NS_GET_MNTNS_ID _IOR(NSIO, 0x5, __u64) /* Translate pid from target pid namespace into the caller's pid namespace. */ #define NS_GET_PID_FROM_PIDNS _IOR(NSIO, 0x6, int) /* Return thread-group leader id of pid in the callers pid namespace. */ From 64a7ce76fb901bf9f9c36cf5d681328fc0fd4b5a Mon Sep 17 00:00:00 2001 From: yangerkun Date: Wed, 31 Jul 2024 12:38:35 +0800 Subject: [PATCH 198/991] libfs: fix infinite directory reads for offset dir After we switch tmpfs dir operations from simple_dir_operations to simple_offset_dir_operations, every rename happened will fill new dentry to dest dir's maple tree(&SHMEM_I(inode)->dir_offsets->mt) with a free key starting with octx->newx_offset, and then set newx_offset equals to free key + 1. This will lead to infinite readdir combine with rename happened at the same time, which fail generic/736 in xfstests(detail show as below). 1. create 5000 files(1 2 3...) under one dir 2. call readdir(man 3 readdir) once, and get one entry 3. rename(entry, "TEMPFILE"), then rename("TEMPFILE", entry) 4. loop 2~3, until readdir return nothing or we loop too many times(tmpfs break test with the second condition) We choose the same logic what commit 9b378f6ad48cf ("btrfs: fix infinite directory reads") to fix it, record the last_index when we open dir, and do not emit the entry which index >= last_index. The file->private_data now used in offset dir can use directly to do this, and we also update the last_index when we llseek the dir file. Fixes: a2e459555c5f ("shmem: stable directory offsets") Signed-off-by: yangerkun Link: https://lore.kernel.org/r/20240731043835.1828697-1-yangerkun@huawei.com Reviewed-by: Chuck Lever [brauner: only update last_index after seek when offset is zero like Jan suggested] Signed-off-by: Christian Brauner --- fs/libfs.c | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index 8aa34870449fe1..02602d00939e96 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -450,6 +450,14 @@ void simple_offset_destroy(struct offset_ctx *octx) mtree_destroy(&octx->mt); } +static int offset_dir_open(struct inode *inode, struct file *file) +{ + struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode); + + file->private_data = (void *)ctx->next_offset; + return 0; +} + /** * offset_dir_llseek - Advance the read position of a directory descriptor * @file: an open directory whose position is to be updated @@ -463,6 +471,9 @@ void simple_offset_destroy(struct offset_ctx *octx) */ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) { + struct inode *inode = file->f_inode; + struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode); + switch (whence) { case SEEK_CUR: offset += file->f_pos; @@ -476,7 +487,8 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) } /* In this case, ->private_data is protected by f_pos_lock */ - file->private_data = NULL; + if (!offset) + file->private_data = (void *)ctx->next_offset; return vfs_setpos(file, offset, LONG_MAX); } @@ -507,7 +519,7 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) inode->i_ino, fs_umode_to_dtype(inode->i_mode)); } -static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) +static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, long last_index) { struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode); struct dentry *dentry; @@ -515,17 +527,21 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) while (true) { dentry = offset_find_next(octx, ctx->pos); if (!dentry) - return ERR_PTR(-ENOENT); + return; + + if (dentry2offset(dentry) >= last_index) { + dput(dentry); + return; + } if (!offset_dir_emit(ctx, dentry)) { dput(dentry); - break; + return; } ctx->pos = dentry2offset(dentry) + 1; dput(dentry); } - return NULL; } /** @@ -552,22 +568,19 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) static int offset_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dir = file->f_path.dentry; + long last_index = (long)file->private_data; lockdep_assert_held(&d_inode(dir)->i_rwsem); if (!dir_emit_dots(file, ctx)) return 0; - /* In this case, ->private_data is protected by f_pos_lock */ - if (ctx->pos == DIR_OFFSET_MIN) - file->private_data = NULL; - else if (file->private_data == ERR_PTR(-ENOENT)) - return 0; - file->private_data = offset_iterate_dir(d_inode(dir), ctx); + offset_iterate_dir(d_inode(dir), ctx, last_index); return 0; } const struct file_operations simple_offset_dir_operations = { + .open = offset_dir_open, .llseek = offset_dir_llseek, .iterate_shared = offset_readdir, .read = generic_read_dir, From 889ced4c9388785952d78d20d338bda2df209bb5 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 31 Jul 2024 09:39:02 +0200 Subject: [PATCH 199/991] netfs: clean up after renaming FSCACHE_DEBUG config Commit 6b8e61472529 ("netfs: Rename CONFIG_FSCACHE_DEBUG to CONFIG_NETFS_DEBUG") renames the config, but introduces two issues: First, NETFS_DEBUG mistakenly depends on the non-existing config NETFS, whereas the actual intended config is called NETFS_SUPPORT. Second, the config renaming misses to adjust the documentation of the functionality of this config. Clean up those two points. Signed-off-by: Lukas Bulwahn Link: https://lore.kernel.org/r/20240731073902.69262-1-lukas.bulwahn@redhat.com Signed-off-by: Christian Brauner --- Documentation/filesystems/caching/fscache.rst | 8 ++++---- fs/netfs/Kconfig | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Documentation/filesystems/caching/fscache.rst b/Documentation/filesystems/caching/fscache.rst index a74d7b052dc130..de1f32526cc1c4 100644 --- a/Documentation/filesystems/caching/fscache.rst +++ b/Documentation/filesystems/caching/fscache.rst @@ -318,10 +318,10 @@ where the columns are: Debugging ========= -If CONFIG_FSCACHE_DEBUG is enabled, the FS-Cache facility can have runtime -debugging enabled by adjusting the value in:: +If CONFIG_NETFS_DEBUG is enabled, the FS-Cache facility and NETFS support can +have runtime debugging enabled by adjusting the value in:: - /sys/module/fscache/parameters/debug + /sys/module/netfs/parameters/debug This is a bitmask of debugging streams to enable: @@ -343,6 +343,6 @@ This is a bitmask of debugging streams to enable: The appropriate set of values should be OR'd together and the result written to the control file. For example:: - echo $((1|8|512)) >/sys/module/fscache/parameters/debug + echo $((1|8|512)) >/sys/module/netfs/parameters/debug will turn on all function entry debugging. diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig index 1b78e8b65ebc14..7701c037c3283f 100644 --- a/fs/netfs/Kconfig +++ b/fs/netfs/Kconfig @@ -24,7 +24,7 @@ config NETFS_STATS config NETFS_DEBUG bool "Enable dynamic debugging netfslib and FS-Cache" - depends on NETFS + depends on NETFS_SUPPORT help This permits debugging to be dynamically enabled in the local caching management module. If this is set, the debugging output may be From 3b5bbe798b2451820e74243b738268f51901e7d0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 31 Jul 2024 12:01:12 +0200 Subject: [PATCH 200/991] pidfd: prevent creation of pidfds for kthreads It's currently possible to create pidfds for kthreads but it is unclear what that is supposed to mean. Until we have use-cases for it and we figured out what behavior we want block the creation of pidfds for kthreads. Link: https://lore.kernel.org/r/20240731-gleis-mehreinnahmen-6bbadd128383@brauner Fixes: 32fcb426ec00 ("pid: add pidfd_open()") Cc: stable@vger.kernel.org Signed-off-by: Christian Brauner --- kernel/fork.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index cc760491f20127..18bdc87209d057 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2053,11 +2053,24 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re */ int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) { - bool thread = flags & PIDFD_THREAD; - - if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID)) + if (!pid) return -EINVAL; + scoped_guard(rcu) { + struct task_struct *tsk; + + if (flags & PIDFD_THREAD) + tsk = pid_task(pid, PIDTYPE_PID); + else + tsk = pid_task(pid, PIDTYPE_TGID); + if (!tsk) + return -EINVAL; + + /* Don't create pidfds for kernel threads for now. */ + if (tsk->flags & PF_KTHREAD) + return -EINVAL; + } + return __pidfd_prepare(pid, flags, ret); } @@ -2403,6 +2416,12 @@ __latent_entropy struct task_struct *copy_process( if (clone_flags & CLONE_PIDFD) { int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0; + /* Don't create pidfds for kernel threads for now. */ + if (args->kthread) { + retval = -EINVAL; + goto bad_fork_free_pid; + } + /* Note that no task has been attached to @pid yet. */ retval = __pidfd_prepare(pid, flags, &pidfile); if (retval < 0) From 86509e38a80da34d7800985fa2be183475242c8c Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Fri, 9 Aug 2024 15:50:35 +0200 Subject: [PATCH 201/991] file: fix typo in take_fd() comment The explanatory comment above take_fd() contains a typo, fix that to not confuse readers. Signed-off-by: Mathias Krause Link: https://lore.kernel.org/r/20240809135035.748109-1-minipli@grsecurity.net Signed-off-by: Christian Brauner --- include/linux/file.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/file.h b/include/linux/file.h index 237931f20739f3..59b146a14dcad0 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -110,7 +110,7 @@ DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T), * * f = dentry_open(&path, O_RDONLY, current_cred()); * if (IS_ERR(f)) - * return PTR_ERR(fd); + * return PTR_ERR(f); * * fd_install(fd, f); * return take_fd(fd); From 8e5ced7804cb9184c4a23f8054551240562a8eda Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 30 Jul 2024 17:01:40 +0100 Subject: [PATCH 202/991] netfs, ceph: Revert "netfs: Remove deprecated use of PG_private_2 as a second writeback flag" This reverts commit ae678317b95e760607c7b20b97c9cd4ca9ed6e1a. Revert the patch that removes the deprecated use of PG_private_2 in netfslib for the moment as Ceph is actually still using this to track data copied to the cache. Fixes: ae678317b95e ("netfs: Remove deprecated use of PG_private_2 as a second writeback flag") Reported-by: Max Kellermann Signed-off-by: David Howells cc: Ilya Dryomov cc: Xiubo Li cc: Jeff Layton cc: Matthew Wilcox cc: ceph-devel@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org https: //lore.kernel.org/r/3575457.1722355300@warthog.procyon.org.uk Signed-off-by: Christian Brauner --- fs/ceph/addr.c | 19 ++++- fs/netfs/buffered_read.c | 8 +- fs/netfs/io.c | 144 +++++++++++++++++++++++++++++++++++ include/trace/events/netfs.h | 1 + 4 files changed, 170 insertions(+), 2 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 8c16bc5250ef56..73b5a07bf94dee 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -498,6 +498,11 @@ const struct netfs_request_ops ceph_netfs_ops = { }; #ifdef CONFIG_CEPH_FSCACHE +static void ceph_set_page_fscache(struct page *page) +{ + folio_start_private_2(page_folio(page)); /* [DEPRECATED] */ +} + static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async) { struct inode *inode = priv; @@ -515,6 +520,10 @@ static void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, b ceph_fscache_write_terminated, inode, true, caching); } #else +static inline void ceph_set_page_fscache(struct page *page) +{ +} + static inline void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching) { } @@ -706,6 +715,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) len = wlen; set_page_writeback(page); + if (caching) + ceph_set_page_fscache(page); ceph_fscache_write_to_cache(inode, page_off, len, caching); if (IS_ENCRYPTED(inode)) { @@ -789,6 +800,8 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) return AOP_WRITEPAGE_ACTIVATE; } + folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */ + err = writepage_nounlock(page, wbc); if (err == -ERESTARTSYS) { /* direct memory reclaimer was killed by SIGKILL. return 0 @@ -1062,7 +1075,8 @@ static int ceph_writepages_start(struct address_space *mapping, unlock_page(page); break; } - if (PageWriteback(page)) { + if (PageWriteback(page) || + PagePrivate2(page) /* [DEPRECATED] */) { if (wbc->sync_mode == WB_SYNC_NONE) { doutc(cl, "%p under writeback\n", page); unlock_page(page); @@ -1070,6 +1084,7 @@ static int ceph_writepages_start(struct address_space *mapping, } doutc(cl, "waiting on writeback %p\n", page); wait_on_page_writeback(page); + folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */ } if (!clear_page_dirty_for_io(page)) { @@ -1254,6 +1269,8 @@ static int ceph_writepages_start(struct address_space *mapping, } set_page_writeback(page); + if (caching) + ceph_set_page_fscache(page); len += thp_size(page); } ceph_fscache_write_to_cache(inode, offset, len, caching); diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index a688d4c75d9926..424048f9ed1fdc 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -466,7 +466,7 @@ int netfs_write_begin(struct netfs_inode *ctx, if (!netfs_is_cache_enabled(ctx) && netfs_skip_folio_read(folio, pos, len, false)) { netfs_stat(&netfs_n_rh_write_zskip); - goto have_folio; + goto have_folio_no_wait; } rreq = netfs_alloc_request(mapping, file, @@ -507,6 +507,12 @@ int netfs_write_begin(struct netfs_inode *ctx, netfs_put_request(rreq, false, netfs_rreq_trace_put_return); have_folio: + if (test_bit(NETFS_ICTX_USE_PGPRIV2, &ctx->flags)) { + ret = folio_wait_private_2_killable(folio); + if (ret < 0) + goto error; + } +have_folio_no_wait: *_folio = folio; _leave(" = 0"); return 0; diff --git a/fs/netfs/io.c b/fs/netfs/io.c index c93851b9836889..c179a1c73fa703 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -98,6 +98,146 @@ static void netfs_rreq_completed(struct netfs_io_request *rreq, bool was_async) netfs_put_request(rreq, was_async, netfs_rreq_trace_put_complete); } +/* + * [DEPRECATED] Deal with the completion of writing the data to the cache. We + * have to clear the PG_fscache bits on the folios involved and release the + * caller's ref. + * + * May be called in softirq mode and we inherit a ref from the caller. + */ +static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq, + bool was_async) +{ + struct netfs_io_subrequest *subreq; + struct folio *folio; + pgoff_t unlocked = 0; + bool have_unlocked = false; + + rcu_read_lock(); + + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE); + + xas_for_each(&xas, folio, (subreq->start + subreq->len - 1) / PAGE_SIZE) { + if (xas_retry(&xas, folio)) + continue; + + /* We might have multiple writes from the same huge + * folio, but we mustn't unlock a folio more than once. + */ + if (have_unlocked && folio->index <= unlocked) + continue; + unlocked = folio_next_index(folio) - 1; + trace_netfs_folio(folio, netfs_folio_trace_end_copy); + folio_end_private_2(folio); + have_unlocked = true; + } + } + + rcu_read_unlock(); + netfs_rreq_completed(rreq, was_async); +} + +static void netfs_rreq_copy_terminated(void *priv, ssize_t transferred_or_error, + bool was_async) /* [DEPRECATED] */ +{ + struct netfs_io_subrequest *subreq = priv; + struct netfs_io_request *rreq = subreq->rreq; + + if (IS_ERR_VALUE(transferred_or_error)) { + netfs_stat(&netfs_n_rh_write_failed); + trace_netfs_failure(rreq, subreq, transferred_or_error, + netfs_fail_copy_to_cache); + } else { + netfs_stat(&netfs_n_rh_write_done); + } + + trace_netfs_sreq(subreq, netfs_sreq_trace_write_term); + + /* If we decrement nr_copy_ops to 0, the ref belongs to us. */ + if (atomic_dec_and_test(&rreq->nr_copy_ops)) + netfs_rreq_unmark_after_write(rreq, was_async); + + netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); +} + +/* + * [DEPRECATED] Perform any outstanding writes to the cache. We inherit a ref + * from the caller. + */ +static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq) +{ + struct netfs_cache_resources *cres = &rreq->cache_resources; + struct netfs_io_subrequest *subreq, *next, *p; + struct iov_iter iter; + int ret; + + trace_netfs_rreq(rreq, netfs_rreq_trace_copy); + + /* We don't want terminating writes trying to wake us up whilst we're + * still going through the list. + */ + atomic_inc(&rreq->nr_copy_ops); + + list_for_each_entry_safe(subreq, p, &rreq->subrequests, rreq_link) { + if (!test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) { + list_del_init(&subreq->rreq_link); + netfs_put_subrequest(subreq, false, + netfs_sreq_trace_put_no_copy); + } + } + + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + /* Amalgamate adjacent writes */ + while (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { + next = list_next_entry(subreq, rreq_link); + if (next->start != subreq->start + subreq->len) + break; + subreq->len += next->len; + list_del_init(&next->rreq_link); + netfs_put_subrequest(next, false, + netfs_sreq_trace_put_merged); + } + + ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len, + subreq->len, rreq->i_size, true); + if (ret < 0) { + trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write); + trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip); + continue; + } + + iov_iter_xarray(&iter, ITER_SOURCE, &rreq->mapping->i_pages, + subreq->start, subreq->len); + + atomic_inc(&rreq->nr_copy_ops); + netfs_stat(&netfs_n_rh_write); + netfs_get_subrequest(subreq, netfs_sreq_trace_get_copy_to_cache); + trace_netfs_sreq(subreq, netfs_sreq_trace_write); + cres->ops->write(cres, subreq->start, &iter, + netfs_rreq_copy_terminated, subreq); + } + + /* If we decrement nr_copy_ops to 0, the usage ref belongs to us. */ + if (atomic_dec_and_test(&rreq->nr_copy_ops)) + netfs_rreq_unmark_after_write(rreq, false); +} + +static void netfs_rreq_write_to_cache_work(struct work_struct *work) /* [DEPRECATED] */ +{ + struct netfs_io_request *rreq = + container_of(work, struct netfs_io_request, work); + + netfs_rreq_do_write_to_cache(rreq); +} + +static void netfs_rreq_write_to_cache(struct netfs_io_request *rreq) /* [DEPRECATED] */ +{ + rreq->work.func = netfs_rreq_write_to_cache_work; + if (!queue_work(system_unbound_wq, &rreq->work)) + BUG(); +} + /* * Handle a short read. */ @@ -275,6 +415,10 @@ static void netfs_rreq_assess(struct netfs_io_request *rreq, bool was_async) clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS); + if (test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags) && + test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) + return netfs_rreq_write_to_cache(rreq); + netfs_rreq_completed(rreq, was_async); } diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index da23484268dfcf..24ec3434d32ee9 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -145,6 +145,7 @@ EM(netfs_folio_trace_clear_g, "clear-g") \ EM(netfs_folio_trace_clear_s, "clear-s") \ EM(netfs_folio_trace_copy_to_cache, "mark-copy") \ + EM(netfs_folio_trace_end_copy, "end-copy") \ EM(netfs_folio_trace_filled_gaps, "filled-gaps") \ EM(netfs_folio_trace_kill, "kill") \ EM(netfs_folio_trace_kill_cc, "kill-cc") \ From 7b589a9b45ae32aa9d7bece597490e141198d7a6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 7 Aug 2024 19:38:46 +0100 Subject: [PATCH 203/991] netfs: Fix handling of USE_PGPRIV2 and WRITE_TO_CACHE flags The NETFS_RREQ_USE_PGPRIV2 and NETFS_RREQ_WRITE_TO_CACHE flags aren't used correctly. The problem is that we try to set them up in the request initialisation, but we the cache may be in the process of setting up still, and so the state may not be correct. Further, we secondarily sample the cache state and make contradictory decisions later. The issue arises because we set up the cache resources, which allows the cache's ->prepare_read() to switch on NETFS_SREQ_COPY_TO_CACHE - which triggers cache writing even if we didn't set the flags when allocating. Fix this in the following way: (1) Drop NETFS_ICTX_USE_PGPRIV2 and instead set NETFS_RREQ_USE_PGPRIV2 in ->init_request() rather than trying to juggle that in netfs_alloc_request(). (2) Repurpose NETFS_RREQ_USE_PGPRIV2 to merely indicate that if caching is to be done, then PG_private_2 is to be used rather than only setting it if we decide to cache and then having netfs_rreq_unlock_folios() set the non-PG_private_2 writeback-to-cache if it wasn't set. (3) Split netfs_rreq_unlock_folios() into two functions, one of which contains the deprecated code for using PG_private_2 to avoid accidentally doing the writeback path - and always use it if USE_PGPRIV2 is set. (4) As NETFS_ICTX_USE_PGPRIV2 is removed, make netfs_write_begin() always wait for PG_private_2. This function is deprecated and only used by ceph anyway, and so label it so. (5) Drop the NETFS_RREQ_WRITE_TO_CACHE flag and use fscache_operation_valid() on the cache_resources instead. This has the advantage of picking up the result of netfs_begin_cache_read() and fscache_begin_write_operation() - which are called after the object is initialised and will wait for the cache to come to a usable state. Just reverting ae678317b95e[1] isn't a sufficient fix, so this need to be applied on top of that. Without this as well, things like: rcu: INFO: rcu_sched detected expedited stalls on CPUs/tasks: { and: WARNING: CPU: 13 PID: 3621 at fs/ceph/caps.c:3386 may happen, along with some UAFs due to PG_private_2 not getting used to wait on writeback completion. Fixes: 2ff1e97587f4 ("netfs: Replace PG_fscache by setting folio->private and marking dirty") Reported-by: Max Kellermann Signed-off-by: David Howells cc: Ilya Dryomov cc: Xiubo Li cc: Hristo Venev cc: Jeff Layton cc: Matthew Wilcox cc: ceph-devel@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org Link: https://lore.kernel.org/r/3575457.1722355300@warthog.procyon.org.uk/ [1] Link: https://lore.kernel.org/r/1173209.1723152682@warthog.procyon.org.uk Signed-off-by: Christian Brauner --- fs/ceph/addr.c | 3 + fs/ceph/inode.c | 2 - fs/netfs/buffered_read.c | 125 ++++++++++++++++++++++++++++++----- fs/netfs/objects.c | 10 --- fs/netfs/write_issue.c | 4 +- fs/nfs/fscache.c | 2 + fs/nfs/fscache.h | 2 - include/linux/netfs.h | 3 - include/trace/events/netfs.h | 1 + 9 files changed, 116 insertions(+), 36 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 73b5a07bf94dee..cc0a2240de98ee 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -424,6 +424,9 @@ static int ceph_init_request(struct netfs_io_request *rreq, struct file *file) struct ceph_netfs_request_data *priv; int ret = 0; + /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ + __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags); + if (rreq->origin != NETFS_READAHEAD) return 0; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 8f8de8f33abbfb..71cd70514efa55 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -577,8 +577,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb) /* Set parameters for the netfs library */ netfs_inode_init(&ci->netfs, &ceph_netfs_ops, false); - /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ - __set_bit(NETFS_ICTX_USE_PGPRIV2, &ci->netfs.flags); spin_lock_init(&ci->i_ceph_lock); diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 424048f9ed1fdc..27c750d3947626 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -9,6 +9,97 @@ #include #include "internal.h" +/* + * [DEPRECATED] Unlock the folios in a read operation for when the filesystem + * is using PG_private_2 and direct writing to the cache from here rather than + * marking the page for writeback. + * + * Note that we don't touch folio->private in this code. + */ +static void netfs_rreq_unlock_folios_pgpriv2(struct netfs_io_request *rreq, + size_t *account) +{ + struct netfs_io_subrequest *subreq; + struct folio *folio; + pgoff_t start_page = rreq->start / PAGE_SIZE; + pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; + bool subreq_failed = false; + + XA_STATE(xas, &rreq->mapping->i_pages, start_page); + + /* Walk through the pagecache and the I/O request lists simultaneously. + * We may have a mixture of cached and uncached sections and we only + * really want to write out the uncached sections. This is slightly + * complicated by the possibility that we might have huge pages with a + * mixture inside. + */ + subreq = list_first_entry(&rreq->subrequests, + struct netfs_io_subrequest, rreq_link); + subreq_failed = (subreq->error < 0); + + trace_netfs_rreq(rreq, netfs_rreq_trace_unlock_pgpriv2); + + rcu_read_lock(); + xas_for_each(&xas, folio, last_page) { + loff_t pg_end; + bool pg_failed = false; + bool folio_started = false; + + if (xas_retry(&xas, folio)) + continue; + + pg_end = folio_pos(folio) + folio_size(folio) - 1; + + for (;;) { + loff_t sreq_end; + + if (!subreq) { + pg_failed = true; + break; + } + + if (!folio_started && + test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags) && + fscache_operation_valid(&rreq->cache_resources)) { + trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); + folio_start_private_2(folio); + folio_started = true; + } + + pg_failed |= subreq_failed; + sreq_end = subreq->start + subreq->len - 1; + if (pg_end < sreq_end) + break; + + *account += subreq->transferred; + if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { + subreq = list_next_entry(subreq, rreq_link); + subreq_failed = (subreq->error < 0); + } else { + subreq = NULL; + subreq_failed = false; + } + + if (pg_end == sreq_end) + break; + } + + if (!pg_failed) { + flush_dcache_folio(folio); + folio_mark_uptodate(folio); + } + + if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { + if (folio->index == rreq->no_unlock_folio && + test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) + _debug("no unlock"); + else + folio_unlock(folio); + } + } + rcu_read_unlock(); +} + /* * Unlock the folios in a read operation. We need to set PG_writeback on any * folios we're going to write back before we unlock them. @@ -35,6 +126,12 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) } } + /* Handle deprecated PG_private_2 case. */ + if (test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) { + netfs_rreq_unlock_folios_pgpriv2(rreq, &account); + goto out; + } + /* Walk through the pagecache and the I/O request lists simultaneously. * We may have a mixture of cached and uncached sections and we only * really want to write out the uncached sections. This is slightly @@ -52,7 +149,6 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) loff_t pg_end; bool pg_failed = false; bool wback_to_cache = false; - bool folio_started = false; if (xas_retry(&xas, folio)) continue; @@ -66,17 +162,8 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) pg_failed = true; break; } - if (test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) { - if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, - &subreq->flags)) { - trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); - folio_start_private_2(folio); - folio_started = true; - } - } else { - wback_to_cache |= - test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); - } + + wback_to_cache |= test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); pg_failed |= subreq_failed; sreq_end = subreq->start + subreq->len - 1; if (pg_end < sreq_end) @@ -124,6 +211,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) } rcu_read_unlock(); +out: task_io_account_read(account); if (rreq->netfs_ops->done) rreq->netfs_ops->done(rreq); @@ -395,7 +483,7 @@ static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len, } /** - * netfs_write_begin - Helper to prepare for writing + * netfs_write_begin - Helper to prepare for writing [DEPRECATED] * @ctx: The netfs context * @file: The file to read from * @mapping: The mapping to read from @@ -426,6 +514,9 @@ static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len, * inode before calling this. * * This is usable whether or not caching is enabled. + * + * Note that this should be considered deprecated and netfs_perform_write() + * used instead. */ int netfs_write_begin(struct netfs_inode *ctx, struct file *file, struct address_space *mapping, @@ -507,11 +598,9 @@ int netfs_write_begin(struct netfs_inode *ctx, netfs_put_request(rreq, false, netfs_rreq_trace_put_return); have_folio: - if (test_bit(NETFS_ICTX_USE_PGPRIV2, &ctx->flags)) { - ret = folio_wait_private_2_killable(folio); - if (ret < 0) - goto error; - } + ret = folio_wait_private_2_killable(folio); + if (ret < 0) + goto error; have_folio_no_wait: *_folio = folio; _leave(" = 0"); diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index f4a64272747925..0294df70c3ff45 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -24,10 +24,6 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, struct netfs_io_request *rreq; mempool_t *mempool = ctx->ops->request_pool ?: &netfs_request_pool; struct kmem_cache *cache = mempool->pool_data; - bool is_unbuffered = (origin == NETFS_UNBUFFERED_WRITE || - origin == NETFS_DIO_READ || - origin == NETFS_DIO_WRITE); - bool cached = !is_unbuffered && netfs_is_cache_enabled(ctx); int ret; for (;;) { @@ -56,12 +52,6 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, refcount_set(&rreq->ref, 1); __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); - if (cached) { - __set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags); - if (test_bit(NETFS_ICTX_USE_PGPRIV2, &ctx->flags)) - /* Filesystem uses deprecated PG_private_2 marking. */ - __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags); - } if (file && file->f_flags & O_NONBLOCK) __set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags); if (rreq->netfs_ops->init_request) { diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 9258d30cffe3cf..3f7e37e50c7d02 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -94,6 +94,8 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, { struct netfs_io_request *wreq; struct netfs_inode *ictx; + bool is_buffered = (origin == NETFS_WRITEBACK || + origin == NETFS_WRITETHROUGH); wreq = netfs_alloc_request(mapping, file, start, 0, origin); if (IS_ERR(wreq)) @@ -102,7 +104,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, _enter("R=%x", wreq->debug_id); ictx = netfs_inode(wreq->inode); - if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags)) + if (is_buffered && netfs_is_cache_enabled(ictx)) fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx)); wreq->contiguity = wreq->start; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 7202ce84d0eb03..bf29a65c5027f4 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -265,6 +265,8 @@ static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *fi { rreq->netfs_priv = get_nfs_open_context(nfs_file_open_context(file)); rreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id); + /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ + __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags); return 0; } diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index fbed0027996f88..e8adae1bc260a3 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -81,8 +81,6 @@ static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs) static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) { netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops, false); - /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ - __set_bit(NETFS_ICTX_USE_PGPRIV2, &nfsi->netfs.flags); } extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr); extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 5d0288938cc2dc..983816608f15d7 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -73,8 +73,6 @@ struct netfs_inode { #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ #define NETFS_ICTX_WRITETHROUGH 2 /* Write-through caching */ -#define NETFS_ICTX_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark - * write to cache on read */ }; /* @@ -269,7 +267,6 @@ struct netfs_io_request { #define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */ #define NETFS_RREQ_FAILED 4 /* The request failed */ #define NETFS_RREQ_IN_PROGRESS 5 /* Unlocked when the request completes */ -#define NETFS_RREQ_WRITE_TO_CACHE 7 /* Need to write to the cache */ #define NETFS_RREQ_UPLOAD_TO_SERVER 8 /* Need to write to the server */ #define NETFS_RREQ_NONBLOCK 9 /* Don't block if possible (O_NONBLOCK) */ #define NETFS_RREQ_BLOCKED 10 /* We blocked */ diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 24ec3434d32ee9..606b4a0f92dae2 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -51,6 +51,7 @@ EM(netfs_rreq_trace_resubmit, "RESUBMT") \ EM(netfs_rreq_trace_set_pause, "PAUSE ") \ EM(netfs_rreq_trace_unlock, "UNLOCK ") \ + EM(netfs_rreq_trace_unlock_pgpriv2, "UNLCK-2") \ EM(netfs_rreq_trace_unmark, "UNMARK ") \ EM(netfs_rreq_trace_wait_ip, "WAIT-IP") \ EM(netfs_rreq_trace_wait_pause, "WT-PAUS") \ From 6b9935da2a6b2a72774c15c844ae201a3fc362ac Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Sat, 10 Aug 2024 13:27:00 +0900 Subject: [PATCH 204/991] scsi: mpi3mr: Add missing spin_lock_init() for mrioc->trigger_lock Commit fc4444941140 ("scsi: mpi3mr: HDB allocation and posting for hardware and firmware buffers") added the spinlock trigger_lock to the struct mpi3mr_ioc. However, spin_lock_init() call was not added for it, then the lock does not work as expected. Also, the kernel reports the message below when lockdep is enabled. INFO: trying to register non-static key. The code is fine but needs lockdep annotation, or maybe you didn't initialize this object before use? To fix the issue and to avoid the INFO message, add the missing spin_lock_init() call. Fixes: fc4444941140 ("scsi: mpi3mr: HDB allocation and posting for hardware and firmware buffers") Signed-off-by: Shin'ichiro Kawasaki Link: https://lore.kernel.org/r/20240810042701.661841-2-shinichiro.kawasaki@wdc.com Acked-by: Sathya Prakash Veerichetty Signed-off-by: Martin K. Petersen --- drivers/scsi/mpi3mr/mpi3mr_os.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/mpi3mr/mpi3mr_os.c b/drivers/scsi/mpi3mr/mpi3mr_os.c index ca8f132e03aee2..616894571c6abe 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_os.c +++ b/drivers/scsi/mpi3mr/mpi3mr_os.c @@ -5234,6 +5234,7 @@ mpi3mr_probe(struct pci_dev *pdev, const struct pci_device_id *id) spin_lock_init(&mrioc->watchdog_lock); spin_lock_init(&mrioc->chain_buf_lock); spin_lock_init(&mrioc->sas_node_lock); + spin_lock_init(&mrioc->trigger_lock); INIT_LIST_HEAD(&mrioc->fwevt_list); INIT_LIST_HEAD(&mrioc->tgtdev_list); From fdad456cbcca739bae1849549c7a999857c56f88 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Sun, 28 Jul 2024 19:46:11 +0800 Subject: [PATCH 205/991] bpf: Fix updating attached freplace prog in prog_array map MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit f7866c358733 ("bpf: Fix null pointer dereference in resolve_prog_type() for BPF_PROG_TYPE_EXT") fixed a NULL pointer dereference panic, but didn't fix the issue that fails to update attached freplace prog to prog_array map. Since commit 1c123c567fb1 ("bpf: Resolve fext program type when checking map compatibility"), freplace prog and its target prog are able to tail call each other. And the commit 3aac1ead5eb6 ("bpf: Move prog->aux->linked_prog and trampoline into bpf_link on attach") sets prog->aux->dst_prog as NULL after attaching freplace prog to its target prog. After loading freplace the prog_array's owner type is BPF_PROG_TYPE_SCHED_CLS. Then, after attaching freplace its prog->aux->dst_prog is NULL. Then, while updating freplace in prog_array the bpf_prog_map_compatible() incorrectly returns false because resolve_prog_type() returns BPF_PROG_TYPE_EXT instead of BPF_PROG_TYPE_SCHED_CLS. After this patch the resolve_prog_type() returns BPF_PROG_TYPE_SCHED_CLS and update to prog_array can succeed. Fixes: f7866c358733 ("bpf: Fix null pointer dereference in resolve_prog_type() for BPF_PROG_TYPE_EXT") Cc: Toke Høiland-Jørgensen Cc: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20240728114612.48486-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 6503c85b10a30a..7b776dae36e584 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -856,8 +856,8 @@ static inline u32 type_flag(u32 type) /* only use after check_attach_btf_id() */ static inline enum bpf_prog_type resolve_prog_type(const struct bpf_prog *prog) { - return (prog->type == BPF_PROG_TYPE_EXT && prog->aux->dst_prog) ? - prog->aux->dst_prog->type : prog->type; + return (prog->type == BPF_PROG_TYPE_EXT && prog->aux->saved_dst_prog_type) ? + prog->aux->saved_dst_prog_type : prog->type; } static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) From 6c17ea1f3eaa330d445ac14a9428402ce4e3055e Mon Sep 17 00:00:00 2001 From: "Nysal Jan K.A" Date: Wed, 31 Jul 2024 08:31:12 +0530 Subject: [PATCH 206/991] cpu/SMT: Enable SMT only if a core is online If a core is offline then enabling SMT should not online CPUs of this core. By enabling SMT, what is intended is either changing the SMT value from "off" to "on" or setting the SMT level (threads per core) from a lower to higher value. On PowerPC the ppc64_cpu utility can be used, among other things, to perform the following functions: ppc64_cpu --cores-on # Get the number of online cores ppc64_cpu --cores-on=X # Put exactly X cores online ppc64_cpu --offline-cores=X[,Y,...] # Put specified cores offline ppc64_cpu --smt={on|off|value} # Enable, disable or change SMT level If the user has decided to offline certain cores, enabling SMT should not online CPUs in those cores. This patch fixes the issue and changes the behaviour as described, by introducing an arch specific function topology_is_core_online(). It is currently implemented only for PowerPC. Fixes: 73c58e7e1412 ("powerpc: Add HOTPLUG_SMT support") Reported-by: Tyrel Datwyler Closes: https://groups.google.com/g/powerpc-utils-devel/c/wrwVzAAnRlI/m/5KJSoqP4BAAJ Signed-off-by: Nysal Jan K.A Reviewed-by: Shrikanth Hegde Reviewed-by: Thomas Gleixner Signed-off-by: Michael Ellerman Link: https://msgid.link/20240731030126.956210-2-nysal@linux.ibm.com --- Documentation/ABI/testing/sysfs-devices-system-cpu | 3 ++- kernel/cpu.c | 12 +++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 325873385b71fb..de725ca3be825b 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -562,7 +562,8 @@ Description: Control Symmetric Multi Threading (SMT) ================ ========================================= If control status is "forceoff" or "notsupported" writes - are rejected. + are rejected. Note that enabling SMT on PowerPC skips + offline cores. What: /sys/devices/system/cpu/cpuX/power/energy_perf_bias Date: March 2019 diff --git a/kernel/cpu.c b/kernel/cpu.c index 1209ddaec026da..b1fd2a3db91a2e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2689,6 +2689,16 @@ int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) return ret; } +/** + * Check if the core a CPU belongs to is online + */ +#if !defined(topology_is_core_online) +static inline bool topology_is_core_online(unsigned int cpu) +{ + return true; +} +#endif + int cpuhp_smt_enable(void) { int cpu, ret = 0; @@ -2699,7 +2709,7 @@ int cpuhp_smt_enable(void) /* Skip online CPUs and CPUs on offline nodes */ if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) continue; - if (!cpu_smt_thread_allowed(cpu)) + if (!cpu_smt_thread_allowed(cpu) || !topology_is_core_online(cpu)) continue; ret = _cpu_up(cpu, 0, CPUHP_ONLINE); if (ret) From 227bbaabe64b6f9cd98aa051454c1d4a194a8c6a Mon Sep 17 00:00:00 2001 From: "Nysal Jan K.A" Date: Wed, 31 Jul 2024 08:31:13 +0530 Subject: [PATCH 207/991] powerpc/topology: Check if a core is online topology_is_core_online() checks if the core a CPU belongs to is online. The core is online if at least one of the sibling CPUs is online. The first CPU of an online core is also online in the common case, so this should be fairly quick. Fixes: 73c58e7e1412 ("powerpc: Add HOTPLUG_SMT support") Signed-off-by: Nysal Jan K.A Reviewed-by: Shrikanth Hegde Signed-off-by: Michael Ellerman Link: https://msgid.link/20240731030126.956210-3-nysal@linux.ibm.com --- arch/powerpc/include/asm/topology.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index f4e6f2dd04b731..16bacfe8c7a2ca 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -145,6 +145,7 @@ static inline int cpu_to_coregroup_id(int cpu) #ifdef CONFIG_HOTPLUG_SMT #include +#include #include static inline bool topology_is_primary_thread(unsigned int cpu) @@ -156,6 +157,18 @@ static inline bool topology_smt_thread_allowed(unsigned int cpu) { return cpu_thread_in_core(cpu) < cpu_smt_num_threads; } + +#define topology_is_core_online topology_is_core_online +static inline bool topology_is_core_online(unsigned int cpu) +{ + int i, first_cpu = cpu_first_thread_sibling(cpu); + + for (i = first_cpu; i < first_cpu + threads_per_core; ++i) { + if (cpu_online(i)) + return true; + } + return false; +} #endif #endif /* __KERNEL__ */ From 8c6b808c8c2a9de21503944bd6308979410fd812 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Sat, 10 Aug 2024 13:27:01 +0900 Subject: [PATCH 208/991] scsi: mpi3mr: Avoid MAX_PAGE_ORDER WARNING for buffer allocations Commit fc4444941140 ("scsi: mpi3mr: HDB allocation and posting for hardware and firmware buffers") added mpi3mr_alloc_diag_bufs() which calls dma_alloc_coherent() to allocate the trace buffer and the firmware buffer. mpi3mr_alloc_diag_bufs() decides the buffer sizes from the driver configuration. In my environment, the sizes are 8MB. With the sizes, dma_alloc_coherent() fails and report this WARNING: WARNING: CPU: 4 PID: 438 at mm/page_alloc.c:4676 __alloc_pages_noprof+0x52f/0x640 The WARNING indicates that the order of the allocation size is larger than MAX_PAGE_ORDER. After this failure, mpi3mr_alloc_diag_bufs() reduces the buffer sizes and retries dma_alloc_coherent(). In the end, the buffer allocations succeed with 4MB size in my environment, which corresponds to MAX_PAGE_ORDER=10. Though the allocations succeed, the WARNING message is misleading and should be avoided. To avoid the WARNING, check the orders of the buffer allocation sizes before calling dma_alloc_coherent(). If the orders are larger than MAX_PAGE_ORDER, fall back to the retry path. Fixes: fc4444941140 ("scsi: mpi3mr: HDB allocation and posting for hardware and firmware buffers") Signed-off-by: Shin'ichiro Kawasaki Link: https://lore.kernel.org/r/20240810042701.661841-3-shinichiro.kawasaki@wdc.com Acked-by: Sathya Prakash Veerichetty Signed-off-by: Martin K. Petersen --- drivers/scsi/mpi3mr/mpi3mr_app.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/mpi3mr/mpi3mr_app.c b/drivers/scsi/mpi3mr/mpi3mr_app.c index 8b0eded6ef36e5..01f035f9330e4b 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_app.c +++ b/drivers/scsi/mpi3mr/mpi3mr_app.c @@ -100,7 +100,8 @@ void mpi3mr_alloc_diag_bufs(struct mpi3mr_ioc *mrioc) dprint_init(mrioc, "trying to allocate trace diag buffer of size = %dKB\n", trace_size / 1024); - if (mpi3mr_alloc_trace_buffer(mrioc, trace_size)) { + if (get_order(trace_size) > MAX_PAGE_ORDER || + mpi3mr_alloc_trace_buffer(mrioc, trace_size)) { retry = true; trace_size -= trace_dec_size; dprint_init(mrioc, "trace diag buffer allocation failed\n" @@ -118,8 +119,12 @@ void mpi3mr_alloc_diag_bufs(struct mpi3mr_ioc *mrioc) diag_buffer->type = MPI3_DIAG_BUFFER_TYPE_FW; diag_buffer->status = MPI3MR_HDB_BUFSTATUS_NOT_ALLOCATED; if ((mrioc->facts.diag_fw_sz < fw_size) && (fw_size >= fw_min_size)) { - diag_buffer->addr = dma_alloc_coherent(&mrioc->pdev->dev, - fw_size, &diag_buffer->dma_addr, GFP_KERNEL); + if (get_order(fw_size) <= MAX_PAGE_ORDER) { + diag_buffer->addr + = dma_alloc_coherent(&mrioc->pdev->dev, fw_size, + &diag_buffer->dma_addr, + GFP_KERNEL); + } if (!retry) dprint_init(mrioc, "%s:trying to allocate firmware diag buffer of size = %dKB\n", From bed2eb964c70b780fb55925892a74f26cb590b25 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 12 Aug 2024 14:48:47 -0700 Subject: [PATCH 209/991] bpf: Fix a kernel verifier crash in stacksafe() Daniel Hodges reported a kernel verifier crash when playing with sched-ext. Further investigation shows that the crash is due to invalid memory access in stacksafe(). More specifically, it is the following code: if (exact != NOT_EXACT && old->stack[spi].slot_type[i % BPF_REG_SIZE] != cur->stack[spi].slot_type[i % BPF_REG_SIZE]) return false; The 'i' iterates old->allocated_stack. If cur->allocated_stack < old->allocated_stack the out-of-bound access will happen. To fix the issue add 'i >= cur->allocated_stack' check such that if the condition is true, stacksafe() should fail. Otherwise, cur->stack[spi].slot_type[i % BPF_REG_SIZE] memory access is legal. Fixes: 2793a8b015f7 ("bpf: exact states comparison for iterator convergence checks") Cc: Eduard Zingerman Reported-by: Daniel Hodges Acked-by: Eduard Zingerman Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240812214847.213612-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4cb5441ad75fc5..d8520095ca030e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -16884,8 +16884,9 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, spi = i / BPF_REG_SIZE; if (exact != NOT_EXACT && - old->stack[spi].slot_type[i % BPF_REG_SIZE] != - cur->stack[spi].slot_type[i % BPF_REG_SIZE]) + (i >= cur->allocated_stack || + old->stack[spi].slot_type[i % BPF_REG_SIZE] != + cur->stack[spi].slot_type[i % BPF_REG_SIZE])) return false; if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ) From 662c3e2db00f92e50c26e9dc4fe47c52223d9982 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 12 Aug 2024 14:48:52 -0700 Subject: [PATCH 210/991] selftests/bpf: Add a test to verify previous stacksafe() fix A selftest is added such that without the previous patch, a crash can happen. With the previous patch, the test can run successfully. The new test is written in a way which mimics original crash case: main_prog static_prog_1 static_prog_2 where static_prog_1 has different paths to static_prog_2 and some path has stack allocated and some other path does not. A stacksafe() checking in static_prog_2() triggered the crash. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240812214852.214037-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/iters.c | 54 +++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c index 16bdc3e255913c..ef70b88bccb252 100644 --- a/tools/testing/selftests/bpf/progs/iters.c +++ b/tools/testing/selftests/bpf/progs/iters.c @@ -1432,4 +1432,58 @@ int iter_arr_with_actual_elem_count(const void *ctx) return sum; } +__u32 upper, select_n, result; +__u64 global; + +static __noinline bool nest_2(char *str) +{ + /* some insns (including branch insns) to ensure stacksafe() is triggered + * in nest_2(). This way, stacksafe() can compare frame associated with nest_1(). + */ + if (str[0] == 't') + return true; + if (str[1] == 'e') + return true; + if (str[2] == 's') + return true; + if (str[3] == 't') + return true; + return false; +} + +static __noinline bool nest_1(int n) +{ + /* case 0: allocate stack, case 1: no allocate stack */ + switch (n) { + case 0: { + char comm[16]; + + if (bpf_get_current_comm(comm, 16)) + return false; + return nest_2(comm); + } + case 1: + return nest_2((char *)&global); + default: + return false; + } +} + +SEC("raw_tp") +__success +int iter_subprog_check_stacksafe(const void *ctx) +{ + long i; + + bpf_for(i, 0, upper) { + if (!nest_1(select_n)) { + result = 1; + return 0; + } + } + + result = 2; + return 0; +} + char _license[] SEC("license") = "GPL"; From 6e1918ff680527ce4be77426aa537012b5aa997c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 7 Aug 2024 21:00:21 -0700 Subject: [PATCH 211/991] net: macb: Use rcu_dereference() for idev->ifa_list in macb_suspend(). In macb_suspend(), idev->ifa_list is fetched with rcu_access_pointer() and later the pointer is dereferenced as ifa->ifa_local. So, idev->ifa_list must be fetched with rcu_dereference(). Fixes: 0cb8de39a776 ("net: macb: Add ARP support to WOL") Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20240808040021.6971-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cadence/macb_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 11665be3a22c7a..dcd3f54ed0cf00 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -5250,8 +5250,8 @@ static int __maybe_unused macb_suspend(struct device *dev) if (bp->wol & MACB_WOL_ENABLED) { /* Check for IP address in WOL ARP mode */ idev = __in_dev_get_rcu(bp->dev); - if (idev && idev->ifa_list) - ifa = rcu_access_pointer(idev->ifa_list); + if (idev) + ifa = rcu_dereference(idev->ifa_list); if ((bp->wolopts & WAKE_ARP) && !ifa) { netdev_err(netdev, "IP address not assigned as required by WoL walk ARP\n"); return -EOPNOTSUPP; From cd0c6872aab4d2c556a5e953e6926a1b4485e543 Mon Sep 17 00:00:00 2001 From: Markus Niebel Date: Wed, 24 Jul 2024 14:58:48 +0200 Subject: [PATCH 212/991] arm64: dts: freescale: imx93-tqma9352: fix CMA alloc-ranges DRAM starts at 0x80000000. Fixes: c982ecfa7992 ("arm64: dts: freescale: add initial device tree for MBa93xxLA SBC board") Signed-off-by: Markus Niebel Signed-off-by: Alexander Stein Reviewed-by: Peng Fan Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx93-tqma9352.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx93-tqma9352.dtsi b/arch/arm64/boot/dts/freescale/imx93-tqma9352.dtsi index d3a0e1244aae67..72a9a5d4e27a3c 100644 --- a/arch/arm64/boot/dts/freescale/imx93-tqma9352.dtsi +++ b/arch/arm64/boot/dts/freescale/imx93-tqma9352.dtsi @@ -19,7 +19,7 @@ linux,cma { compatible = "shared-dma-pool"; reusable; - alloc-ranges = <0 0x60000000 0 0x40000000>; + alloc-ranges = <0 0x80000000 0 0x40000000>; size = <0 0x10000000>; linux,cma-default; }; From 5f0a894bfa3c26ce61deda4c52b12e8ec84d876a Mon Sep 17 00:00:00 2001 From: Markus Niebel Date: Wed, 24 Jul 2024 14:58:52 +0200 Subject: [PATCH 213/991] arm64: dts: freescale: imx93-tqma9352-mba93xxla: fix typo Fix typo in assignment of SD-Card cd-gpios. Fixes: c982ecfa7992 ("arm64: dts: freescale: add initial device tree for MBa93xxLA SBC board") Signed-off-by: Markus Niebel Signed-off-by: Alexander Stein Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx93-tqma9352-mba93xxla.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx93-tqma9352-mba93xxla.dts b/arch/arm64/boot/dts/freescale/imx93-tqma9352-mba93xxla.dts index da8f19a646a98f..e2ee9f5a042cb1 100644 --- a/arch/arm64/boot/dts/freescale/imx93-tqma9352-mba93xxla.dts +++ b/arch/arm64/boot/dts/freescale/imx93-tqma9352-mba93xxla.dts @@ -499,7 +499,7 @@ pinctrl-0 = <&pinctrl_usdhc2_hs>, <&pinctrl_usdhc2_gpio>; pinctrl-1 = <&pinctrl_usdhc2_uhs>, <&pinctrl_usdhc2_gpio>; pinctrl-2 = <&pinctrl_usdhc2_uhs>, <&pinctrl_usdhc2_gpio>; - cd-gpios = <&gpio3 00 GPIO_ACTIVE_LOW>; + cd-gpios = <&gpio3 0 GPIO_ACTIVE_LOW>; vmmc-supply = <®_usdhc2_vmmc>; bus-width = <4>; no-sdio; From 046667c4d3196938e992fba0dfcde570aa85cd0e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 21 Jul 2024 14:45:08 -0400 Subject: [PATCH 214/991] memcg_write_event_control(): fix a user-triggerable oops we are *not* guaranteed that anything past the terminating NUL is mapped (let alone initialized with anything sane). Fixes: 0dea116876ee ("cgroup: implement eventfd-based generic API for notifications") Cc: stable@vger.kernel.org Cc: Andrew Morton Acked-by: Michal Hocko Signed-off-by: Al Viro --- mm/memcontrol-v1.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 2aeea4d8bf8e53..417c96f2da28e8 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -1842,9 +1842,12 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, buf = endp + 1; cfd = simple_strtoul(buf, &endp, 10); - if ((*endp != ' ') && (*endp != '\0')) + if (*endp == '\0') + buf = endp; + else if (*endp == ' ') + buf = endp + 1; + else return -EINVAL; - buf = endp + 1; event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) From 915d9d914a25575055804cb8bfd13490111282ec Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Mon, 29 Jul 2024 14:41:11 +0800 Subject: [PATCH 215/991] arm64: dts: imx95: correct a55 power-domains The A55 power domains is for SCMI performance usage, so for device power on/off. Correct the power-domains entry to use scmi_perf not scmi_devpd. Fixes: 5e3cbb8e4256 ("arm64: dts: freescale: add i.MX95 basic dtsi") Signed-off-by: Peng Fan Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx95.dtsi | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx95.dtsi b/arch/arm64/boot/dts/freescale/imx95.dtsi index 1bbf9a0468f695..3499d4eb249613 100644 --- a/arch/arm64/boot/dts/freescale/imx95.dtsi +++ b/arch/arm64/boot/dts/freescale/imx95.dtsi @@ -27,7 +27,7 @@ reg = <0x0>; enable-method = "psci"; #cooling-cells = <2>; - power-domains = <&scmi_devpd IMX95_PERF_A55>; + power-domains = <&scmi_perf IMX95_PERF_A55>; power-domain-names = "perf"; i-cache-size = <32768>; i-cache-line-size = <64>; @@ -44,7 +44,7 @@ reg = <0x100>; enable-method = "psci"; #cooling-cells = <2>; - power-domains = <&scmi_devpd IMX95_PERF_A55>; + power-domains = <&scmi_perf IMX95_PERF_A55>; power-domain-names = "perf"; i-cache-size = <32768>; i-cache-line-size = <64>; @@ -61,7 +61,7 @@ reg = <0x200>; enable-method = "psci"; #cooling-cells = <2>; - power-domains = <&scmi_devpd IMX95_PERF_A55>; + power-domains = <&scmi_perf IMX95_PERF_A55>; power-domain-names = "perf"; i-cache-size = <32768>; i-cache-line-size = <64>; @@ -78,7 +78,7 @@ reg = <0x300>; enable-method = "psci"; #cooling-cells = <2>; - power-domains = <&scmi_devpd IMX95_PERF_A55>; + power-domains = <&scmi_perf IMX95_PERF_A55>; power-domain-names = "perf"; i-cache-size = <32768>; i-cache-line-size = <64>; @@ -93,7 +93,7 @@ device_type = "cpu"; compatible = "arm,cortex-a55"; reg = <0x400>; - power-domains = <&scmi_devpd IMX95_PERF_A55>; + power-domains = <&scmi_perf IMX95_PERF_A55>; power-domain-names = "perf"; enable-method = "psci"; #cooling-cells = <2>; @@ -110,7 +110,7 @@ device_type = "cpu"; compatible = "arm,cortex-a55"; reg = <0x500>; - power-domains = <&scmi_devpd IMX95_PERF_A55>; + power-domains = <&scmi_perf IMX95_PERF_A55>; power-domain-names = "perf"; enable-method = "psci"; #cooling-cells = <2>; From d3c2b2a8923abc087c2e585f5828fb7fae8fedfe Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Mon, 5 Aug 2024 11:05:35 +0800 Subject: [PATCH 216/991] arm64: dts: imx95: correct L3Cache cache-sets The L3Cache size is 512KB. Size = Cache Line Size(64) * num sets(512) * Assoc(0x10). Correct the number of Cache sets. Fixes: 5e3cbb8e4256 ("arm64: dts: freescale: add i.MX95 basic dtsi") Signed-off-by: Peng Fan Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx95.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx95.dtsi b/arch/arm64/boot/dts/freescale/imx95.dtsi index 3499d4eb249613..425272aa5a8160 100644 --- a/arch/arm64/boot/dts/freescale/imx95.dtsi +++ b/arch/arm64/boot/dts/freescale/imx95.dtsi @@ -187,7 +187,7 @@ compatible = "cache"; cache-size = <524288>; cache-line-size = <64>; - cache-sets = <1024>; + cache-sets = <512>; cache-level = <3>; cache-unified; }; From 3beddef84d90590270465a907de1cfe2539ac70d Mon Sep 17 00:00:00 2001 From: Baojun Xu Date: Tue, 13 Aug 2024 12:37:48 +0800 Subject: [PATCH 217/991] ALSA: hda/tas2781: fix wrong calibrated data order Wrong calibration data order cause sound too low in some device. Fix wrong calibrated data order, add calibration data converssion by get_unaligned_be32() after reading from UEFI. Fixes: 5be27f1e3ec9 ("ALSA: hda/tas2781: Add tas2781 HDA driver") Cc: Signed-off-by: Baojun Xu Link: https://patch.msgid.link/20240813043749.108-1-shenghao-ding@ti.com Signed-off-by: Takashi Iwai --- sound/pci/hda/tas2781_hda_i2c.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/sound/pci/hda/tas2781_hda_i2c.c b/sound/pci/hda/tas2781_hda_i2c.c index 49bd7097d8928a..7dbfc92d9d55c4 100644 --- a/sound/pci/hda/tas2781_hda_i2c.c +++ b/sound/pci/hda/tas2781_hda_i2c.c @@ -2,10 +2,12 @@ // // TAS2781 HDA I2C driver // -// Copyright 2023 Texas Instruments, Inc. +// Copyright 2023 - 2024 Texas Instruments, Inc. // // Author: Shenghao Ding +// Current maintainer: Baojun Xu +#include #include #include #include @@ -519,20 +521,22 @@ static void tas2781_apply_calib(struct tasdevice_priv *tas_priv) static const unsigned char rgno_array[CALIB_MAX] = { 0x74, 0x0c, 0x14, 0x70, 0x7c, }; - unsigned char *data; + int offset = 0; int i, j, rc; + __be32 data; for (i = 0; i < tas_priv->ndev; i++) { - data = tas_priv->cali_data.data + - i * TASDEVICE_SPEAKER_CALIBRATION_SIZE; for (j = 0; j < CALIB_MAX; j++) { + data = get_unaligned_be32( + &tas_priv->cali_data.data[offset]); rc = tasdevice_dev_bulk_write(tas_priv, i, TASDEVICE_REG(0, page_array[j], rgno_array[j]), - &(data[4 * j]), 4); + (unsigned char *)&data, 4); if (rc < 0) dev_err(tas_priv->dev, "chn %d calib %d bulk_wr err = %d\n", i, j, rc); + offset += 4; } } } From d2dfed310aae0739dc87b68c660357e6a4f29819 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Tue, 6 Aug 2024 11:46:03 +1200 Subject: [PATCH 218/991] platform/x86: asus-wmi: Add quirk for ROG Ally X MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new ROG Ally X functions the same as the previus model so we can use the same method to ensure the MCU USB devices wake and reconnect correctly. Given that two devices marks the start of a trend, this patch also adds a quirk table to make future additions easier if the MCU is the same. Signed-off-by: Luke D. Jones Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20240805234603.38736-1-luke@ljones.dev Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/asus-wmi.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c index cc735931f97b93..37636e5a38e3b5 100644 --- a/drivers/platform/x86/asus-wmi.c +++ b/drivers/platform/x86/asus-wmi.c @@ -146,6 +146,20 @@ static const char * const ashs_ids[] = { "ATK4001", "ATK4002", NULL }; static int throttle_thermal_policy_write(struct asus_wmi *); +static const struct dmi_system_id asus_ally_mcu_quirk[] = { + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "RC71L"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "RC72L"), + }, + }, + { }, +}; + static bool ashs_present(void) { int i = 0; @@ -4685,7 +4699,7 @@ static int asus_wmi_add(struct platform_device *pdev) asus->dgpu_disable_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_DGPU); asus->kbd_rgb_state_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_TUF_RGB_STATE); asus->ally_mcu_usb_switch = acpi_has_method(NULL, ASUS_USB0_PWR_EC0_CSEE) - && dmi_match(DMI_BOARD_NAME, "RC71L"); + && dmi_check_system(asus_ally_mcu_quirk); if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_MINI_LED_MODE)) asus->mini_led_dev_id = ASUS_WMI_DEVID_MINI_LED_MODE; From 9c8e022567bbec53bee8ae75c44b3d6cd2080d42 Mon Sep 17 00:00:00 2001 From: Maximilian Luz Date: Sun, 11 Aug 2024 15:19:44 +0200 Subject: [PATCH 219/991] platform/surface: aggregator_registry: Add Support for Surface Pro 10 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add SAM client device nodes for the Surface Pro 10. It seems to use the same SAM client devices as the Surface Pro 9, so re-use its node group. Signed-off-by: Maximilian Luz Link: https://lore.kernel.org/r/20240811131948.261806-2-luzmaximilian@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/surface/surface_aggregator_registry.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/platform/surface/surface_aggregator_registry.c b/drivers/platform/surface/surface_aggregator_registry.c index 1c4d74db08c954..fa5b896e5f4e01 100644 --- a/drivers/platform/surface/surface_aggregator_registry.c +++ b/drivers/platform/surface/surface_aggregator_registry.c @@ -324,7 +324,7 @@ static const struct software_node *ssam_node_group_sp8[] = { NULL, }; -/* Devices for Surface Pro 9 */ +/* Devices for Surface Pro 9 and 10 */ static const struct software_node *ssam_node_group_sp9[] = { &ssam_node_root, &ssam_node_hub_kip, @@ -365,6 +365,9 @@ static const struct acpi_device_id ssam_platform_hub_match[] = { /* Surface Pro 9 */ { "MSHW0343", (unsigned long)ssam_node_group_sp9 }, + /* Surface Pro 10 */ + { "MSHW0510", (unsigned long)ssam_node_group_sp9 }, + /* Surface Book 2 */ { "MSHW0107", (unsigned long)ssam_node_group_gen5 }, From ed235163c3f02329d5e37ed4485bbc39ed2568d4 Mon Sep 17 00:00:00 2001 From: Maximilian Luz Date: Sun, 11 Aug 2024 15:19:45 +0200 Subject: [PATCH 220/991] platform/surface: aggregator_registry: Add support for Surface Laptop Go 3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add SAM client device nodes for the Surface Laptop Go 3. It seems to use the same SAM client devices as the Surface Laptop Go 1 and 2, so re-use their node group. Signed-off-by: Maximilian Luz Link: https://lore.kernel.org/r/20240811131948.261806-3-luzmaximilian@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/surface/surface_aggregator_registry.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/platform/surface/surface_aggregator_registry.c b/drivers/platform/surface/surface_aggregator_registry.c index fa5b896e5f4e01..4d36810c23082f 100644 --- a/drivers/platform/surface/surface_aggregator_registry.c +++ b/drivers/platform/surface/surface_aggregator_registry.c @@ -398,6 +398,9 @@ static const struct acpi_device_id ssam_platform_hub_match[] = { /* Surface Laptop Go 2 */ { "MSHW0290", (unsigned long)ssam_node_group_slg1 }, + /* Surface Laptop Go 3 */ + { "MSHW0440", (unsigned long)ssam_node_group_slg1 }, + /* Surface Laptop Studio */ { "MSHW0123", (unsigned long)ssam_node_group_sls }, From 28d04b4a2cc20981c95787f9c449e6fc51d904f9 Mon Sep 17 00:00:00 2001 From: Maximilian Luz Date: Sun, 11 Aug 2024 15:19:46 +0200 Subject: [PATCH 221/991] platform/surface: aggregator_registry: Add support for Surface Laptop Studio 2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add SAM client device nodes for the Surface Laptop Studio 2 (SLS2). The SLS2 is quite similar to the SLS1, but it does not provide the touchpad as a SAM-HID device. Therefore, add a new node group for the SLS2 and update the comments accordingly. In addition, it uses the new fan control interface. Signed-off-by: Maximilian Luz Link: https://lore.kernel.org/r/20240811131948.261806-4-luzmaximilian@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- .../surface/surface_aggregator_registry.c | 27 ++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/drivers/platform/surface/surface_aggregator_registry.c b/drivers/platform/surface/surface_aggregator_registry.c index 4d36810c23082f..892ba9549f6a9d 100644 --- a/drivers/platform/surface/surface_aggregator_registry.c +++ b/drivers/platform/surface/surface_aggregator_registry.c @@ -273,8 +273,8 @@ static const struct software_node *ssam_node_group_sl5[] = { NULL, }; -/* Devices for Surface Laptop Studio. */ -static const struct software_node *ssam_node_group_sls[] = { +/* Devices for Surface Laptop Studio 1. */ +static const struct software_node *ssam_node_group_sls1[] = { &ssam_node_root, &ssam_node_bat_ac, &ssam_node_bat_main, @@ -289,6 +289,22 @@ static const struct software_node *ssam_node_group_sls[] = { NULL, }; +/* Devices for Surface Laptop Studio 2. */ +static const struct software_node *ssam_node_group_sls2[] = { + &ssam_node_root, + &ssam_node_bat_ac, + &ssam_node_bat_main, + &ssam_node_tmp_perf_profile_with_fan, + &ssam_node_tmp_sensors, + &ssam_node_fan_speed, + &ssam_node_pos_tablet_switch, + &ssam_node_hid_sam_keyboard, + &ssam_node_hid_sam_penstash, + &ssam_node_hid_sam_sensors, + &ssam_node_hid_sam_ucm_ucsi, + NULL, +}; + /* Devices for Surface Laptop Go. */ static const struct software_node *ssam_node_group_slg1[] = { &ssam_node_root, @@ -401,8 +417,11 @@ static const struct acpi_device_id ssam_platform_hub_match[] = { /* Surface Laptop Go 3 */ { "MSHW0440", (unsigned long)ssam_node_group_slg1 }, - /* Surface Laptop Studio */ - { "MSHW0123", (unsigned long)ssam_node_group_sls }, + /* Surface Laptop Studio 1 */ + { "MSHW0123", (unsigned long)ssam_node_group_sls1 }, + + /* Surface Laptop Studio 2 */ + { "MSHW0360", (unsigned long)ssam_node_group_sls2 }, { }, }; From 002adda09bc1c983c75c82a7e12285c7423aec31 Mon Sep 17 00:00:00 2001 From: Maximilian Luz Date: Sun, 11 Aug 2024 15:19:47 +0200 Subject: [PATCH 222/991] platform/surface: aggregator_registry: Add fan and thermal sensor support for Surface Laptop 5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The EC on the Surface Laptop 5 exposes the fan interface. With the recently introduced driver for it, we can now also enable it here. In addition, also enable the thermal sensor interface. Signed-off-by: Maximilian Luz Link: https://lore.kernel.org/r/20240811131948.261806-5-luzmaximilian@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/surface/surface_aggregator_registry.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/platform/surface/surface_aggregator_registry.c b/drivers/platform/surface/surface_aggregator_registry.c index 892ba9549f6a9d..4d3f5b3111bae3 100644 --- a/drivers/platform/surface/surface_aggregator_registry.c +++ b/drivers/platform/surface/surface_aggregator_registry.c @@ -265,7 +265,9 @@ static const struct software_node *ssam_node_group_sl5[] = { &ssam_node_root, &ssam_node_bat_ac, &ssam_node_bat_main, - &ssam_node_tmp_perf_profile, + &ssam_node_tmp_perf_profile_with_fan, + &ssam_node_tmp_sensors, + &ssam_node_fan_speed, &ssam_node_hid_main_keyboard, &ssam_node_hid_main_touchpad, &ssam_node_hid_main_iid5, From 99ae7b9ba047ad029a0a23b2bd51608ce79c8e97 Mon Sep 17 00:00:00 2001 From: Maximilian Luz Date: Sun, 11 Aug 2024 15:19:48 +0200 Subject: [PATCH 223/991] platform/surface: aggregator_registry: Add support for Surface Laptop 6 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add SAM client device nodes for the Surface Laptop Studio 6 (SL6). The SL6 is similar to the SL5, with the typical battery/AC, platform profile, and HID nodes. It also has support for the newly supported fan interface. Signed-off-by: Maximilian Luz Link: https://lore.kernel.org/r/20240811131948.261806-6-luzmaximilian@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- .../surface/surface_aggregator_registry.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/platform/surface/surface_aggregator_registry.c b/drivers/platform/surface/surface_aggregator_registry.c index 4d3f5b3111bae3..a23dff35f8ca23 100644 --- a/drivers/platform/surface/surface_aggregator_registry.c +++ b/drivers/platform/surface/surface_aggregator_registry.c @@ -275,6 +275,22 @@ static const struct software_node *ssam_node_group_sl5[] = { NULL, }; +/* Devices for Surface Laptop 6. */ +static const struct software_node *ssam_node_group_sl6[] = { + &ssam_node_root, + &ssam_node_bat_ac, + &ssam_node_bat_main, + &ssam_node_tmp_perf_profile_with_fan, + &ssam_node_tmp_sensors, + &ssam_node_fan_speed, + &ssam_node_hid_main_keyboard, + &ssam_node_hid_main_touchpad, + &ssam_node_hid_main_iid5, + &ssam_node_hid_sam_sensors, + &ssam_node_hid_sam_ucm_ucsi, + NULL, +}; + /* Devices for Surface Laptop Studio 1. */ static const struct software_node *ssam_node_group_sls1[] = { &ssam_node_root, @@ -410,6 +426,9 @@ static const struct acpi_device_id ssam_platform_hub_match[] = { /* Surface Laptop 5 */ { "MSHW0350", (unsigned long)ssam_node_group_sl5 }, + /* Surface Laptop 6 */ + { "MSHW0530", (unsigned long)ssam_node_group_sl6 }, + /* Surface Laptop Go 1 */ { "MSHW0118", (unsigned long)ssam_node_group_slg1 }, From ccbde4b128ef9c73d14d0d7817d68ef795f6d131 Mon Sep 17 00:00:00 2001 From: Eli Billauer Date: Thu, 1 Aug 2024 15:11:26 +0300 Subject: [PATCH 224/991] char: xillybus: Don't destroy workqueue from work item running on it Triggered by a kref decrement, destroy_workqueue() may be called from within a work item for destroying its own workqueue. This illegal situation is averted by adding a module-global workqueue for exclusive use of the offending work item. Other work items continue to be queued on per-device workqueues to ensure performance. Reported-by: syzbot+91dbdfecdd3287734d8e@syzkaller.appspotmail.com Cc: stable Closes: https://lore.kernel.org/lkml/0000000000000ab25a061e1dfe9f@google.com/ Signed-off-by: Eli Billauer Link: https://lore.kernel.org/r/20240801121126.60183-1-eli.billauer@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/char/xillybus/xillyusb.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/char/xillybus/xillyusb.c b/drivers/char/xillybus/xillyusb.c index 5a5afa14ca8cb8..33ca0f4af3901e 100644 --- a/drivers/char/xillybus/xillyusb.c +++ b/drivers/char/xillybus/xillyusb.c @@ -50,6 +50,7 @@ MODULE_LICENSE("GPL v2"); static const char xillyname[] = "xillyusb"; static unsigned int fifo_buf_order; +static struct workqueue_struct *wakeup_wq; #define USB_VENDOR_ID_XILINX 0x03fd #define USB_VENDOR_ID_ALTERA 0x09fb @@ -569,10 +570,6 @@ static void cleanup_dev(struct kref *kref) * errors if executed. The mechanism relies on that xdev->error is assigned * a non-zero value by report_io_error() prior to queueing wakeup_all(), * which prevents bulk_in_work() from calling process_bulk_in(). - * - * The fact that wakeup_all() and bulk_in_work() are queued on the same - * workqueue makes their concurrent execution very unlikely, however the - * kernel's API doesn't seem to ensure this strictly. */ static void wakeup_all(struct work_struct *work) @@ -627,7 +624,7 @@ static void report_io_error(struct xillyusb_dev *xdev, if (do_once) { kref_get(&xdev->kref); /* xdev is used by work item */ - queue_work(xdev->workq, &xdev->wakeup_workitem); + queue_work(wakeup_wq, &xdev->wakeup_workitem); } } @@ -2258,6 +2255,10 @@ static int __init xillyusb_init(void) { int rc = 0; + wakeup_wq = alloc_workqueue(xillyname, 0, 0); + if (!wakeup_wq) + return -ENOMEM; + if (LOG2_INITIAL_FIFO_BUF_SIZE > PAGE_SHIFT) fifo_buf_order = LOG2_INITIAL_FIFO_BUF_SIZE - PAGE_SHIFT; else @@ -2265,11 +2266,16 @@ static int __init xillyusb_init(void) rc = usb_register(&xillyusb_driver); + if (rc) + destroy_workqueue(wakeup_wq); + return rc; } static void __exit xillyusb_exit(void) { + destroy_workqueue(wakeup_wq); + usb_deregister(&xillyusb_driver); } From bc923d594db21bee0ead128eb4bb78f7e77467a4 Mon Sep 17 00:00:00 2001 From: Maximilian Luz Date: Sun, 11 Aug 2024 14:46:44 +0200 Subject: [PATCH 225/991] platform/surface: aggregator: Fix warning when controller is destroyed in probe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a small window in ssam_serial_hub_probe() where the controller is initialized but has not been started yet. Specifically, between ssam_controller_init() and ssam_controller_start(). Any failure in this window, for example caused by a failure of serdev_device_open(), currently results in an incorrect warning being emitted. In particular, any failure in this window results in the controller being destroyed via ssam_controller_destroy(). This function checks the state of the controller and, in an attempt to validate that the controller has been cleanly shut down before we try and deallocate any resources, emits a warning if that state is not SSAM_CONTROLLER_STOPPED. However, since we have only just initialized the controller and have not yet started it, its state is SSAM_CONTROLLER_INITIALIZED. Note that this is the only point at which the controller has this state, as it will change after we start the controller with ssam_controller_start() and never revert back. Further, at this point no communication has taken place and the sender and receiver threads have not been started yet (and we may not even have an open serdev device either). Therefore, it is perfectly safe to call ssam_controller_destroy() with a state of SSAM_CONTROLLER_INITIALIZED. This, however, means that the warning currently being emitted is incorrect. Fix it by extending the check. Fixes: c167b9c7e3d6 ("platform/surface: Add Surface Aggregator subsystem") Signed-off-by: Maximilian Luz Link: https://lore.kernel.org/r/20240811124645.246016-1-luzmaximilian@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/surface/aggregator/controller.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/platform/surface/aggregator/controller.c b/drivers/platform/surface/aggregator/controller.c index 7fc602e01487d4..7e89f547999b2a 100644 --- a/drivers/platform/surface/aggregator/controller.c +++ b/drivers/platform/surface/aggregator/controller.c @@ -1354,7 +1354,8 @@ void ssam_controller_destroy(struct ssam_controller *ctrl) if (ctrl->state == SSAM_CONTROLLER_UNINITIALIZED) return; - WARN_ON(ctrl->state != SSAM_CONTROLLER_STOPPED); + WARN_ON(ctrl->state != SSAM_CONTROLLER_STOPPED && + ctrl->state != SSAM_CONTROLLER_INITIALIZED); /* * Note: New events could still have been received after the previous From dcdb52d948f3a17ccd3fce757d9bd981d7c32039 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 9 Aug 2024 15:44:07 +0300 Subject: [PATCH 226/991] usb: xhci: Check for xhci->interrupters being allocated in xhci_mem_clearup() If xhci_mem_init() fails, it calls into xhci_mem_cleanup() to mop up the damage. If it fails early enough, before xhci->interrupters is allocated but after xhci->max_interrupters has been set, which happens in most (all?) cases, things get uglier, as xhci_mem_cleanup() unconditionally derefences xhci->interrupters. With prejudice. Gate the interrupt freeing loop with a check on xhci->interrupters being non-NULL. Found while debugging a DMA allocation issue that led the XHCI driver on this exact path. Fixes: c99b38c41234 ("xhci: add support to allocate several interrupters") Cc: Mathias Nyman Cc: Wesley Cheng Cc: Greg Kroah-Hartman Signed-off-by: Marc Zyngier Cc: stable@vger.kernel.org # 6.8+ Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20240809124408.505786-2-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-mem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c index d7654f475dafbf..937ce5fd58095b 100644 --- a/drivers/usb/host/xhci-mem.c +++ b/drivers/usb/host/xhci-mem.c @@ -1872,7 +1872,7 @@ void xhci_mem_cleanup(struct xhci_hcd *xhci) cancel_delayed_work_sync(&xhci->cmd_timer); - for (i = 0; i < xhci->max_interrupters; i++) { + for (i = 0; xhci->interrupters && i < xhci->max_interrupters; i++) { if (xhci->interrupters[i]) { xhci_remove_interrupter(xhci, xhci->interrupters[i]); xhci_free_interrupter(xhci, xhci->interrupters[i]); From 741b41b48faf41c0bcf3c26fbb3448b0fda4fc5d Mon Sep 17 00:00:00 2001 From: Niklas Neronin Date: Fri, 9 Aug 2024 15:44:08 +0300 Subject: [PATCH 227/991] usb: xhci: fix duplicate stall handling in handle_tx_event() Stall handling is managed in the 'process_*' functions, which are called right before the 'goto' stall handling code snippet. Thus, there should be a return after the 'process_*' functions. Otherwise, the stall code may run twice. Fixes: 1b349f214ac7 ("usb: xhci: add 'goto' for halted endpoint check in handle_tx_event()") Reported-by: Michal Pecio Signed-off-by: Niklas Neronin Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20240809124408.505786-3-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-ring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index b7517c3c8059f4..4ea2c3e072a9e3 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -2910,6 +2910,7 @@ static int handle_tx_event(struct xhci_hcd *xhci, process_isoc_td(xhci, ep, ep_ring, td, ep_trb, event); else process_bulk_intr_td(xhci, ep, ep_ring, td, ep_trb, event); + return 0; check_endpoint_halted: if (xhci_halted_host_endpoint(ep_ctx, trb_comp_code)) From d209d1634e6562eafc369b28f8a1f67a2e9e5222 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Fri, 9 Aug 2024 18:03:43 +0300 Subject: [PATCH 228/991] usb: typec: ucsi: Fix the return value of ucsi_run_command() The command execution routines need to return the amount of data that was transferred when succesful. This fixes an issue where the alternate modes and the power delivery capabilities are not getting registered. Fixes: 5e9c1662a89b ("usb: typec: ucsi: rework command execution functions") Signed-off-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240809150343.286942-1-heikki.krogerus@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/ucsi/ucsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/typec/ucsi/ucsi.c b/drivers/usb/typec/ucsi/ucsi.c index 432a2d6266d714..4039851551c1b7 100644 --- a/drivers/usb/typec/ucsi/ucsi.c +++ b/drivers/usb/typec/ucsi/ucsi.c @@ -137,7 +137,7 @@ static int ucsi_run_command(struct ucsi *ucsi, u64 command, u32 *cci, if (ret) return ret; - return err; + return err ?: UCSI_CCI_LENGTH(*cci); } static int ucsi_read_error(struct ucsi *ucsi, u8 connector_num) From 21ea1ce37fc267dc45fe27517bbde926211683df Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Fri, 9 Aug 2024 19:29:01 +0800 Subject: [PATCH 229/991] Revert "usb: typec: tcpm: clear pd_event queue in PORT_RESET" This reverts commit bf20c69cf3cf9c6445c4925dd9a8a6ca1b78bfdf. During tcpm_init() stage, if the VBUS is still present after tcpm_reset_port(), then we assume that VBUS will off and goto safe0v after a specific discharge time. Following a TCPM_VBUS_EVENT event if VBUS reach to off state. TCPM_VBUS_EVENT event may be set during PORT_RESET handling stage. If pd_events reset to 0 after TCPM_VBUS_EVENT set, we will lost this VBUS event. Then the port state machine may stuck at one state. Before: [ 2.570172] pending state change PORT_RESET -> PORT_RESET_WAIT_OFF @ 100 ms [rev1 NONE_AMS] [ 2.570179] state change PORT_RESET -> PORT_RESET_WAIT_OFF [delayed 100 ms] [ 2.570182] pending state change PORT_RESET_WAIT_OFF -> SNK_UNATTACHED @ 920 ms [rev1 NONE_AMS] [ 3.490213] state change PORT_RESET_WAIT_OFF -> SNK_UNATTACHED [delayed 920 ms] [ 3.490220] Start toggling [ 3.546050] CC1: 0 -> 0, CC2: 0 -> 2 [state TOGGLING, polarity 0, connected] [ 3.546057] state change TOGGLING -> SRC_ATTACH_WAIT [rev1 NONE_AMS] After revert this patch, we can see VBUS off event and the port will goto expected state. [ 2.441992] pending state change PORT_RESET -> PORT_RESET_WAIT_OFF @ 100 ms [rev1 NONE_AMS] [ 2.441999] state change PORT_RESET -> PORT_RESET_WAIT_OFF [delayed 100 ms] [ 2.442002] pending state change PORT_RESET_WAIT_OFF -> SNK_UNATTACHED @ 920 ms [rev1 NONE_AMS] [ 2.442122] VBUS off [ 2.442125] state change PORT_RESET_WAIT_OFF -> SNK_UNATTACHED [rev1 NONE_AMS] [ 2.442127] VBUS VSAFE0V [ 2.442351] CC1: 0 -> 0, CC2: 0 -> 0 [state SNK_UNATTACHED, polarity 0, disconnected] [ 2.442357] Start toggling [ 2.491850] CC1: 0 -> 0, CC2: 0 -> 2 [state TOGGLING, polarity 0, connected] [ 2.491858] state change TOGGLING -> SRC_ATTACH_WAIT [rev1 NONE_AMS] [ 2.491863] pending state change SRC_ATTACH_WAIT -> SNK_TRY @ 200 ms [rev1 NONE_AMS] [ 2.691905] state change SRC_ATTACH_WAIT -> SNK_TRY [delayed 200 ms] Fixes: bf20c69cf3cf ("usb: typec: tcpm: clear pd_event queue in PORT_RESET") Cc: stable@vger.kernel.org Signed-off-by: Xu Yang Acked-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240809112901.535072-1-xu.yang_2@nxp.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index cce39818e99aee..4b02d647425910 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -5655,7 +5655,6 @@ static void run_state_machine(struct tcpm_port *port) break; case PORT_RESET: tcpm_reset_port(port); - port->pd_events = 0; if (port->self_powered) tcpm_set_cc(port, TYPEC_CC_OPEN); else From 3ed486e383ccee9b0c8d727608f12a937c6603ca Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 12 Aug 2024 11:50:38 +0200 Subject: [PATCH 230/991] usb: misc: ljca: Add Lunar Lake ljca GPIO HID to ljca_gpio_hids[] Add LJCA GPIO support for the Lunar Lake platform. New HID taken from out of tree ivsc-driver git repo. Link: https://github.com/intel/ivsc-driver/commit/47e7c4a446c8ea8c741ff5a32fa7b19f9e6fd47e Cc: stable Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240812095038.555837-1-hdegoede@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/usb-ljca.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/misc/usb-ljca.c b/drivers/usb/misc/usb-ljca.c index 2d30fc1be30669..1a8d5e80b9aec2 100644 --- a/drivers/usb/misc/usb-ljca.c +++ b/drivers/usb/misc/usb-ljca.c @@ -169,6 +169,7 @@ static const struct acpi_device_id ljca_gpio_hids[] = { { "INTC1096" }, { "INTC100B" }, { "INTC10D1" }, + { "INTC10B5" }, {}, }; From f149be46e4c13e277e013c0fff13cb2aa7a4399c Mon Sep 17 00:00:00 2001 From: Frank Li Date: Wed, 7 Aug 2024 10:52:09 -0400 Subject: [PATCH 231/991] arm64: dts: imx8mm-phygate: fix typo pinctrcl-0 Fix typo pinctrcl-0 with pinctrl-0. Fix below warning: arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs485.dtb: gpio@30220000: 'pinctrl-0' is a dependency of 'pinctrl-names' from schema $id: http://devicetree.org/schemas/pinctrl/pinctrl-consumer.yaml# arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs485.dtb: uart4_rs485_en: $nodename:0: 'uart4_rs485_en' does not match '^(hog-[0-9]+|.+-hog(-[0-9]+)?)$ Fixes: 8d97083c0b5d ("arm64: dts: phygate-tauri-l: add overlays for RS232 and RS485") Reviewed-by: Teresa Remmet Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- .../boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs232.dtso | 2 +- .../boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs485.dtso | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs232.dtso b/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs232.dtso index bf3e04651ba00d..353ace3601dc8d 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs232.dtso +++ b/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs232.dtso @@ -21,7 +21,7 @@ &gpio3 { pinctrl-names = "default"; - pinctrcl-0 = <&pinctrl_gpio3_hog>; + pinctrl-0 = <&pinctrl_gpio3_hog>; uart4_rs485_en { gpio-hog; diff --git a/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs485.dtso b/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs485.dtso index f4448cde0407ce..8a75d6783ad2b4 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs485.dtso +++ b/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs485.dtso @@ -22,7 +22,7 @@ &gpio3 { pinctrl-names = "default"; - pinctrcl-0 = <&pinctrl_gpio3_hog>; + pinctrl-0 = <&pinctrl_gpio3_hog>; uart4_rs485_en { gpio-hog; From 92567a5f92bc947fb7aa4351979db1b7b71a554c Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Thu, 8 Aug 2024 22:06:19 +0800 Subject: [PATCH 232/991] iommu: Remove unused declaration iommu_sva_unbind_gpasid() Commit 0c9f17877891 ("iommu: Remove guest pasid related interfaces and definitions") removed the implementation but leave declaration. Signed-off-by: Yue Haibing Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240808140619.2498535-1-yuehaibing@huawei.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 4d47f2c3331185..04cbdae0052eb2 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -795,8 +795,6 @@ extern int iommu_attach_device(struct iommu_domain *domain, struct device *dev); extern void iommu_detach_device(struct iommu_domain *domain, struct device *dev); -extern int iommu_sva_unbind_gpasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid); extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev); extern struct iommu_domain *iommu_get_dma_domain(struct device *dev); extern int iommu_map(struct iommu_domain *domain, unsigned long iova, From dc98d76a15bc29a9a4e76f2f65f39f3e590fb15c Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Thu, 8 Aug 2024 22:03:25 +0800 Subject: [PATCH 233/991] tty: serial: fsl_lpuart: mark last busy before uart_add_one_port With "earlycon initcall_debug=1 loglevel=8" in bootargs, kernel sometimes boot hang. It is because normal console still is not ready, but runtime suspend is called, so early console putchar will hang in waiting TRDE set in UARTSTAT. The lpuart driver has auto suspend delay set to 3000ms, but during uart_add_one_port, a child device serial ctrl will added and probed with its pm runtime enabled(see serial_ctrl.c). The runtime suspend call path is: device_add |-> bus_probe_device |->device_initial_probe |->__device_attach |-> pm_runtime_get_sync(dev->parent); |-> pm_request_idle(dev); |-> pm_runtime_put(dev->parent); So in the end, before normal console ready, the lpuart get runtime suspended. And earlycon putchar will hang. To address the issue, mark last busy just after pm_runtime_enable, three seconds is long enough to switch from bootconsole to normal console. Fixes: 43543e6f539b ("tty: serial: fsl_lpuart: Add runtime pm support") Cc: stable Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/20240808140325.580105-1-peng.fan@oss.nxp.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/fsl_lpuart.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c index 615291ea9b5e93..77efa7ee6eda29 100644 --- a/drivers/tty/serial/fsl_lpuart.c +++ b/drivers/tty/serial/fsl_lpuart.c @@ -2923,6 +2923,7 @@ static int lpuart_probe(struct platform_device *pdev) pm_runtime_set_autosuspend_delay(&pdev->dev, UART_AUTOSUSPEND_TIMEOUT); pm_runtime_set_active(&pdev->dev); pm_runtime_enable(&pdev->dev); + pm_runtime_mark_last_busy(&pdev->dev); ret = lpuart_global_reset(sport); if (ret) From 7258fdd7d7459616b3fe1a603e33900584b10c13 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 10 Aug 2024 01:07:20 +0900 Subject: [PATCH 234/991] tty: vt: conmakehash: remove non-portable code printing comment header Commit 6e20753da6bc ("tty: vt: conmakehash: cope with abs_srctree no longer in env") included , which invoked another (wrong) patch that tried to address a build error on macOS. According to the specification [1], the correct header to use PATH_MAX is . The minimal fix would be to replace with . However, the following commits seem questionable to me: - 3bd85c6c97b2 ("tty: vt: conmakehash: Don't mention the full path of the input in output") - 6e20753da6bc ("tty: vt: conmakehash: cope with abs_srctree no longer in env") These commits made too many efforts to cope with a comment header in drivers/tty/vt/consolemap_deftbl.c: /* * Do not edit this file; it was automatically generated by * * conmakehash drivers/tty/vt/cp437.uni > [this file] * */ With this commit, the header part of the generate C file will be simplified as follows: /* * Automatically generated file; Do not edit. */ BTW, another series of excessive efforts for a comment header can be seen in the following: - 5ef6dc08cfde ("lib/build_OID_registry: don't mention the full path of the script in output") - 2fe29fe94563 ("lib/build_OID_registry: avoid non-destructive substitution for Perl < 5.13.2 compat") [1]: https://pubs.opengroup.org/onlinepubs/009695399/basedefs/limits.h.html Fixes: 6e20753da6bc ("tty: vt: conmakehash: cope with abs_srctree no longer in env") Cc: stable Reported-by: Daniel Gomez Closes: https://lore.kernel.org/all/20240807-macos-build-support-v1-11-4cd1ded85694@samsung.com/ Signed-off-by: Masahiro Yamada Link: https://lore.kernel.org/r/20240809160853.1269466-1-masahiroy@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/conmakehash.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/tty/vt/conmakehash.c b/drivers/tty/vt/conmakehash.c index 82d9db68b2ce81..a931fcde7ad98d 100644 --- a/drivers/tty/vt/conmakehash.c +++ b/drivers/tty/vt/conmakehash.c @@ -11,8 +11,6 @@ * Copyright (C) 1995-1997 H. Peter Anvin */ -#include -#include #include #include #include @@ -79,7 +77,6 @@ int main(int argc, char *argv[]) { FILE *ctbl; const char *tblname; - char base_tblname[PATH_MAX]; char buffer[65536]; int fontlen; int i, nuni, nent; @@ -245,20 +242,15 @@ int main(int argc, char *argv[]) for ( i = 0 ; i < fontlen ; i++ ) nuni += unicount[i]; - strncpy(base_tblname, tblname, PATH_MAX); - base_tblname[PATH_MAX - 1] = 0; printf("\ /*\n\ - * Do not edit this file; it was automatically generated by\n\ - *\n\ - * conmakehash %s > [this file]\n\ - *\n\ + * Automatically generated file; Do not edit.\n\ */\n\ \n\ #include \n\ \n\ u8 dfont_unicount[%d] = \n\ -{\n\t", basename(base_tblname), fontlen); +{\n\t", fontlen); for ( i = 0 ; i < fontlen ; i++ ) { From c9f6613b16123989f2c3bd04b1d9b2365d6914e7 Mon Sep 17 00:00:00 2001 From: Mathieu Othacehe Date: Thu, 8 Aug 2024 08:06:37 +0200 Subject: [PATCH 235/991] tty: atmel_serial: use the correct RTS flag. In RS485 mode, the RTS pin is driven high by hardware when the transmitter is operating. This behaviour cannot be changed. This means that the driver should claim that it supports SER_RS485_RTS_ON_SEND and not SER_RS485_RTS_AFTER_SEND. Otherwise, when configuring the port with the SER_RS485_RTS_ON_SEND, one get the following warning: kern.warning kernel: atmel_usart_serial atmel_usart_serial.2.auto: ttyS1 (1): invalid RTS setting, using RTS_AFTER_SEND instead which is contradictory with what's really happening. Signed-off-by: Mathieu Othacehe Cc: stable Tested-by: Alexander Dahl Fixes: af47c491e3c7 ("serial: atmel: Fill in rs485_supported") Link: https://lore.kernel.org/r/20240808060637.19886-1-othacehe@gnu.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/atmel_serial.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/atmel_serial.c b/drivers/tty/serial/atmel_serial.c index 0a90964d6d107f..09b246c9e389ec 100644 --- a/drivers/tty/serial/atmel_serial.c +++ b/drivers/tty/serial/atmel_serial.c @@ -2514,7 +2514,7 @@ static const struct uart_ops atmel_pops = { }; static const struct serial_rs485 atmel_rs485_supported = { - .flags = SER_RS485_ENABLED | SER_RS485_RTS_AFTER_SEND | SER_RS485_RX_DURING_TX, + .flags = SER_RS485_ENABLED | SER_RS485_RTS_ON_SEND | SER_RS485_RX_DURING_TX, .delay_rts_before_send = 1, .delay_rts_after_send = 1, }; From abfceba0a7a246ac082bf569807738ff7416f59f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 5 Aug 2024 16:20:20 -0700 Subject: [PATCH 236/991] ARM: riscpc: ecard: Fix the build Fix a recently introduced build failure. Cc: Russell King Fixes: d69d80484598 ("driver core: have match() callback in struct bus_type take a const *") Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240805232026.65087-2-bvanassche@acm.org Signed-off-by: Greg Kroah-Hartman --- arch/arm/mach-rpc/ecard.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mach-rpc/ecard.c b/arch/arm/mach-rpc/ecard.c index c30df1097c524b..9f7454b8efa79e 100644 --- a/arch/arm/mach-rpc/ecard.c +++ b/arch/arm/mach-rpc/ecard.c @@ -1109,7 +1109,7 @@ void ecard_remove_driver(struct ecard_driver *drv) driver_unregister(&drv->drv); } -static int ecard_match(struct device *_dev, struct device_driver *_drv) +static int ecard_match(struct device *_dev, const struct device_driver *_drv) { struct expansion_card *ec = ECARD_DEV(_dev); struct ecard_driver *drv = ECARD_DRV(_drv); From cdd1fa91a6b8c7cd93b3abf9f3ef05b8ce741b61 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 5 Aug 2024 16:20:21 -0700 Subject: [PATCH 237/991] mips: sgi-ip22: Fix the build Fix a recently introduced build failure. Fixes: d69d80484598 ("driver core: have match() callback in struct bus_type take a const *") Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240805232026.65087-3-bvanassche@acm.org Signed-off-by: Greg Kroah-Hartman --- arch/mips/sgi-ip22/ip22-gio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/sgi-ip22/ip22-gio.c b/arch/mips/sgi-ip22/ip22-gio.c index 2738325e98dd09..d20eec742bfaab 100644 --- a/arch/mips/sgi-ip22/ip22-gio.c +++ b/arch/mips/sgi-ip22/ip22-gio.c @@ -111,7 +111,7 @@ void gio_device_unregister(struct gio_device *giodev) } EXPORT_SYMBOL_GPL(gio_device_unregister); -static int gio_bus_match(struct device *dev, struct device_driver *drv) +static int gio_bus_match(struct device *dev, const struct device_driver *drv) { struct gio_device *gio_dev = to_gio_device(dev); struct gio_driver *gio_drv = to_gio_driver(drv); From 479ffee68d59c599f8aed8fa2dcc8e13e7bd13c3 Mon Sep 17 00:00:00 2001 From: Bert Karwatzki Date: Mon, 12 Aug 2024 12:45:41 +0200 Subject: [PATCH 238/991] wifi: mt76: mt7921: fix NULL pointer access in mt7921_ipv6_addr_change When disabling wifi mt7921_ipv6_addr_change() is called as a notifier. At this point mvif->phy is already NULL so we cannot use it here. Signed-off-by: Bert Karwatzki Signed-off-by: Felix Fietkau Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20240812104542.80760-1-spasswolf@web.de --- drivers/net/wireless/mediatek/mt76/mt7921/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c b/drivers/net/wireless/mediatek/mt76/mt7921/main.c index 1bab93d049df31..23b228804289be 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c @@ -1183,7 +1183,7 @@ static void mt7921_ipv6_addr_change(struct ieee80211_hw *hw, struct inet6_dev *idev) { struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv; - struct mt792x_dev *dev = mvif->phy->dev; + struct mt792x_dev *dev = mt792x_hw_dev(hw); struct inet6_ifaddr *ifa; struct in6_addr ns_addrs[IEEE80211_BSS_ARP_ADDR_LIST_LEN]; struct sk_buff *skb; From 38c8d02501c09454e4fbf0f67de03de35e94d384 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Mon, 12 Aug 2024 13:06:40 +0200 Subject: [PATCH 239/991] wifi: iwlwifi: correctly lookup DMA address in SG table The code to lookup the scatter gather table entry assumed that it was possible to use sg_virt() in order to lookup the DMA address in a mapped scatter gather table. However, this assumption is incorrect as the DMA mapping code may merge multiple entries into one. In that case, the DMA address space may have e.g. two consecutive pages which is correctly represented by the scatter gather list entry, however the virtual addresses for these two pages may differ and the relationship cannot be resolved anymore. Avoid this problem entirely by working with the offset into the mapped area instead of using virtual addresses. With that we only use the DMA length and DMA address from the scatter gather list entries. The underlying DMA/IOMMU code is therefore free to merge two entries into one even if the virtual addresses space for the area is not continuous. Fixes: 90db50755228 ("wifi: iwlwifi: use already mapped data when TXing an AMSDU") Reported-by: Chris Bainbridge Closes: https://lore.kernel.org/r/ZrNRoEbdkxkKFMBi@debian.local Signed-off-by: Benjamin Berg Tested-by: Chris Bainbridge Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20240812110640.460514-1-benjamin@sipsolutions.net --- .../wireless/intel/iwlwifi/pcie/internal.h | 3 +- .../net/wireless/intel/iwlwifi/pcie/tx-gen2.c | 5 ++- drivers/net/wireless/intel/iwlwifi/pcie/tx.c | 32 +++++++++++++------ 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h index b59de4f80b4b89..27a7e0b5b3d51e 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h +++ b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h @@ -639,7 +639,8 @@ void iwl_trans_pcie_tx_reset(struct iwl_trans *trans); int iwl_pcie_txq_alloc(struct iwl_trans *trans, struct iwl_txq *txq, int slots_num, bool cmd_queue); -dma_addr_t iwl_pcie_get_sgt_tb_phys(struct sg_table *sgt, void *addr); +dma_addr_t iwl_pcie_get_sgt_tb_phys(struct sg_table *sgt, unsigned int offset, + unsigned int len); struct sg_table *iwl_pcie_prep_tso(struct iwl_trans *trans, struct sk_buff *skb, struct iwl_cmd_meta *cmd_meta, u8 **hdr, unsigned int hdr_room); diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c index 2e780fb2da426c..b1846abb99b78f 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c @@ -168,6 +168,7 @@ static int iwl_txq_gen2_build_amsdu(struct iwl_trans *trans, struct ieee80211_hdr *hdr = (void *)skb->data; unsigned int snap_ip_tcp_hdrlen, ip_hdrlen, total_len, hdr_room; unsigned int mss = skb_shinfo(skb)->gso_size; + unsigned int data_offset = 0; dma_addr_t start_hdr_phys; u16 length, amsdu_pad; u8 *start_hdr; @@ -260,7 +261,8 @@ static int iwl_txq_gen2_build_amsdu(struct iwl_trans *trans, int ret; tb_len = min_t(unsigned int, tso.size, data_left); - tb_phys = iwl_pcie_get_sgt_tb_phys(sgt, tso.data); + tb_phys = iwl_pcie_get_sgt_tb_phys(sgt, data_offset, + tb_len); /* Not a real mapping error, use direct comparison */ if (unlikely(tb_phys == DMA_MAPPING_ERROR)) goto out_err; @@ -272,6 +274,7 @@ static int iwl_txq_gen2_build_amsdu(struct iwl_trans *trans, goto out_err; data_left -= tb_len; + data_offset += tb_len; tso_build_data(skb, &tso, tb_len); } } diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c index 22d482ae53d97a..9fe050f0ddc160 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c @@ -1814,23 +1814,31 @@ static void *iwl_pcie_get_page_hdr(struct iwl_trans *trans, /** * iwl_pcie_get_sgt_tb_phys - Find TB address in mapped SG list * @sgt: scatter gather table - * @addr: Virtual address + * @offset: Offset into the mapped memory (i.e. SKB payload data) + * @len: Length of the area * - * Find the entry that includes the address for the given address and return - * correct physical address for the TB entry. + * Find the DMA address that corresponds to the SKB payload data at the + * position given by @offset. * * Returns: Address for TB entry */ -dma_addr_t iwl_pcie_get_sgt_tb_phys(struct sg_table *sgt, void *addr) +dma_addr_t iwl_pcie_get_sgt_tb_phys(struct sg_table *sgt, unsigned int offset, + unsigned int len) { struct scatterlist *sg; + unsigned int sg_offset = 0; int i; + /* + * Search the mapped DMA areas in the SG for the area that contains the + * data at offset with the given length. + */ for_each_sgtable_dma_sg(sgt, sg, i) { - if (addr >= sg_virt(sg) && - (u8 *)addr < (u8 *)sg_virt(sg) + sg_dma_len(sg)) - return sg_dma_address(sg) + - ((unsigned long)addr - (unsigned long)sg_virt(sg)); + if (offset >= sg_offset && + offset + len <= sg_offset + sg_dma_len(sg)) + return sg_dma_address(sg) + offset - sg_offset; + + sg_offset += sg_dma_len(sg); } WARN_ON_ONCE(1); @@ -1875,7 +1883,9 @@ struct sg_table *iwl_pcie_prep_tso(struct iwl_trans *trans, struct sk_buff *skb, sg_init_table(sgt->sgl, skb_shinfo(skb)->nr_frags + 1); - sgt->orig_nents = skb_to_sgvec(skb, sgt->sgl, 0, skb->len); + /* Only map the data, not the header (it is copied to the TSO page) */ + sgt->orig_nents = skb_to_sgvec(skb, sgt->sgl, skb_headlen(skb), + skb->data_len); if (WARN_ON_ONCE(sgt->orig_nents <= 0)) return NULL; @@ -1900,6 +1910,7 @@ static int iwl_fill_data_tbs_amsdu(struct iwl_trans *trans, struct sk_buff *skb, struct ieee80211_hdr *hdr = (void *)skb->data; unsigned int snap_ip_tcp_hdrlen, ip_hdrlen, total_len, hdr_room; unsigned int mss = skb_shinfo(skb)->gso_size; + unsigned int data_offset = 0; u16 length, iv_len, amsdu_pad; dma_addr_t start_hdr_phys; u8 *start_hdr, *pos_hdr; @@ -2000,7 +2011,7 @@ static int iwl_fill_data_tbs_amsdu(struct iwl_trans *trans, struct sk_buff *skb, data_left); dma_addr_t tb_phys; - tb_phys = iwl_pcie_get_sgt_tb_phys(sgt, tso.data); + tb_phys = iwl_pcie_get_sgt_tb_phys(sgt, data_offset, size); /* Not a real mapping error, use direct comparison */ if (unlikely(tb_phys == DMA_MAPPING_ERROR)) return -EINVAL; @@ -2011,6 +2022,7 @@ static int iwl_fill_data_tbs_amsdu(struct iwl_trans *trans, struct sk_buff *skb, tb_phys, size); data_left -= size; + data_offset += size; tso_build_data(skb, &tso, size); } } From 92b6c2f0076c50aaa919d16b595f34f3e9967bea Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 12 Jun 2024 14:50:38 +0300 Subject: [PATCH 240/991] KVM: SVM: Fix uninitialized variable bug If snp_lookup_rmpentry() fails then "assigned" is printed in the error message but it was never initialized. Initialize it to false. Fixes: dee5a47cc7a4 ("KVM: SEV: Add KVM_SEV_SNP_LAUNCH_UPDATE command") Signed-off-by: Dan Carpenter Message-ID: <20240612115040.2423290-3-dan.carpenter@linaro.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 532df12b43c5b7..393f450adbc36a 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2276,7 +2276,7 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pf for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) { struct sev_data_snp_launch_update fw_args = {0}; - bool assigned; + bool assigned = false; int level; ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level); From cd2d00606553e631e9b5d11cca7da38fc95433e6 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 12 Jun 2024 14:50:39 +0300 Subject: [PATCH 241/991] KVM: SVM: Fix an error code in sev_gmem_post_populate() The copy_from_user() function returns the number of bytes which it was not able to copy. Return -EFAULT instead. Fixes: dee5a47cc7a4 ("KVM: SEV: Add KVM_SEV_SNP_LAUNCH_UPDATE command") Signed-off-by: Dan Carpenter Message-ID: <20240612115040.2423290-4-dan.carpenter@linaro.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 393f450adbc36a..714c517dd4b72b 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2290,9 +2290,10 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pf if (src) { void *vaddr = kmap_local_pfn(pfn + i); - ret = copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE); - if (ret) + if (copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE)) { + ret = -EFAULT; goto err; + } kunmap_local(vaddr); } From 58a63729c957621f1990c3494c702711188ca347 Mon Sep 17 00:00:00 2001 From: Long Li Date: Fri, 9 Aug 2024 08:58:58 -0700 Subject: [PATCH 242/991] net: mana: Fix doorbell out of order violation and avoid unnecessary doorbell rings After napi_complete_done() is called when NAPI is polling in the current process context, another NAPI may be scheduled and start running in softirq on another CPU and may ring the doorbell before the current CPU does. When combined with unnecessary rings when there is no need to arm the CQ, it triggers error paths in the hardware. This patch fixes this by calling napi_complete_done() after doorbell rings. It limits the number of unnecessary rings when there is no need to arm. MANA hardware specifies that there must be one doorbell ring every 8 CQ wraparounds. This driver guarantees one doorbell ring as soon as the number of consumed CQEs exceeds 4 CQ wraparounds. In practical workloads, the 4 CQ wraparounds proves to be big enough that it rarely exceeds this limit before all the napi weight is consumed. To implement this, add a per-CQ counter cq->work_done_since_doorbell, and make sure the CQ is armed as soon as passing 4 wraparounds of the CQ. Cc: stable@vger.kernel.org Fixes: e1b5683ff62e ("net: mana: Move NAPI from EQ to CQ") Reviewed-by: Haiyang Zhang Signed-off-by: Long Li Link: https://patch.msgid.link/1723219138-29887-1-git-send-email-longli@linuxonhyperv.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microsoft/mana/mana_en.c | 24 ++++++++++++------- include/net/mana/mana.h | 1 + 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index ae717d06e66f04..39f56973746d7f 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -1792,7 +1792,6 @@ static void mana_poll_rx_cq(struct mana_cq *cq) static int mana_cq_handler(void *context, struct gdma_queue *gdma_queue) { struct mana_cq *cq = context; - u8 arm_bit; int w; WARN_ON_ONCE(cq->gdma_cq != gdma_queue); @@ -1803,16 +1802,23 @@ static int mana_cq_handler(void *context, struct gdma_queue *gdma_queue) mana_poll_tx_cq(cq); w = cq->work_done; - - if (w < cq->budget && - napi_complete_done(&cq->napi, w)) { - arm_bit = SET_ARM_BIT; - } else { - arm_bit = 0; + cq->work_done_since_doorbell += w; + + if (w < cq->budget) { + mana_gd_ring_cq(gdma_queue, SET_ARM_BIT); + cq->work_done_since_doorbell = 0; + napi_complete_done(&cq->napi, w); + } else if (cq->work_done_since_doorbell > + cq->gdma_cq->queue_size / COMP_ENTRY_SIZE * 4) { + /* MANA hardware requires at least one doorbell ring every 8 + * wraparounds of CQ even if there is no need to arm the CQ. + * This driver rings the doorbell as soon as we have exceeded + * 4 wraparounds. + */ + mana_gd_ring_cq(gdma_queue, 0); + cq->work_done_since_doorbell = 0; } - mana_gd_ring_cq(gdma_queue, arm_bit); - return w; } diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 6439fd8b437b66..7caa334f48880e 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -275,6 +275,7 @@ struct mana_cq { /* NAPI data */ struct napi_struct napi; int work_done; + int work_done_since_doorbell; int budget; }; From 12d82c7b0a612372f594e4ff00983a1da3a1d929 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 13 Aug 2024 12:07:50 +0100 Subject: [PATCH 243/991] ALSA: hda: cs35l56: Remove redundant call to hda_cs_dsp_control_remove() The driver doesn't create any ALSA controls for firmware controls, so it shouldn't be calling hda_cs_dsp_control_remove(). commit 34e1b1bb7324 ("ALSA: hda: cs35l56: Stop creating ALSA controls for firmware coefficients") removed the call to hda_cs_dsp_add_controls() but didn't remove the call for destroying those controls. Signed-off-by: Richard Fitzgerald Fixes: 34e1b1bb7324 ("ALSA: hda: cs35l56: Stop creating ALSA controls for firmware coefficients") Link: https://patch.msgid.link/20240813110750.2814-1-rf@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/cs35l56_hda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/cs35l56_hda.c b/sound/pci/hda/cs35l56_hda.c index 31cc92bac89a68..a9dfd62637cf4c 100644 --- a/sound/pci/hda/cs35l56_hda.c +++ b/sound/pci/hda/cs35l56_hda.c @@ -413,7 +413,7 @@ static void cs35l56_hda_remove_controls(struct cs35l56_hda *cs35l56) } static const struct cs_dsp_client_ops cs35l56_hda_client_ops = { - .control_remove = hda_cs_dsp_control_remove, + /* cs_dsp requires the client to provide this even if it is empty */ }; static int cs35l56_hda_request_firmware_file(struct cs35l56_hda *cs35l56, From c56ba3e44784527fd6efe5eb7a4fa6c9f6969a58 Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Tue, 13 Aug 2024 16:29:43 +0530 Subject: [PATCH 244/991] ASoC: SOF: amd: move iram-dram fence register programming sequence The existing code modifies IRAM and DRAM size after sha dma start for vangogh platform. The problem with this sequence is that it might cause sha dma failure when firmware code binary size is greater than the default IRAM size. To fix this issue, Move the iram-dram fence register sequence prior to sha dma start. Fixes: 094d11768f74 ("ASoC: SOF: amd: Skip IRAM/DRAM size modification for Steam Deck OLED") Signed-off-by: Vijendar Mukunda Link: https://patch.msgid.link/20240813105944.3126903-1-Vijendar.Mukunda@amd.com Signed-off-by: Mark Brown --- sound/soc/sof/amd/acp.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/sound/soc/sof/amd/acp.c b/sound/soc/sof/amd/acp.c index 74fd5f2b148b85..9123427fab4e30 100644 --- a/sound/soc/sof/amd/acp.c +++ b/sound/soc/sof/amd/acp.c @@ -263,6 +263,17 @@ int configure_and_run_sha_dma(struct acp_dev_data *adata, void *image_addr, snd_sof_dsp_write(sdev, ACP_DSP_BAR, ACP_SHA_DMA_STRT_ADDR, start_addr); snd_sof_dsp_write(sdev, ACP_DSP_BAR, ACP_SHA_DMA_DESTINATION_ADDR, dest_addr); snd_sof_dsp_write(sdev, ACP_DSP_BAR, ACP_SHA_MSG_LENGTH, image_length); + + /* psp_send_cmd only required for vangogh platform (rev - 5) */ + if (desc->rev == 5 && !(adata->quirks && adata->quirks->skip_iram_dram_size_mod)) { + /* Modify IRAM and DRAM size */ + ret = psp_send_cmd(adata, MBOX_ACP_IRAM_DRAM_FENCE_COMMAND | IRAM_DRAM_FENCE_2); + if (ret) + return ret; + ret = psp_send_cmd(adata, MBOX_ACP_IRAM_DRAM_FENCE_COMMAND | MBOX_ISREADY_FLAG); + if (ret) + return ret; + } snd_sof_dsp_write(sdev, ACP_DSP_BAR, ACP_SHA_DMA_CMD, ACP_SHA_RUN); ret = snd_sof_dsp_read_poll_timeout(sdev, ACP_DSP_BAR, ACP_SHA_TRANSFER_BYTE_CNT, @@ -280,17 +291,6 @@ int configure_and_run_sha_dma(struct acp_dev_data *adata, void *image_addr, return ret; } - /* psp_send_cmd only required for vangogh platform (rev - 5) */ - if (desc->rev == 5 && !(adata->quirks && adata->quirks->skip_iram_dram_size_mod)) { - /* Modify IRAM and DRAM size */ - ret = psp_send_cmd(adata, MBOX_ACP_IRAM_DRAM_FENCE_COMMAND | IRAM_DRAM_FENCE_2); - if (ret) - return ret; - ret = psp_send_cmd(adata, MBOX_ACP_IRAM_DRAM_FENCE_COMMAND | MBOX_ISREADY_FLAG); - if (ret) - return ret; - } - ret = snd_sof_dsp_read_poll_timeout(sdev, ACP_DSP_BAR, ACP_SHA_DSP_FW_QUALIFIER, fw_qualifier, fw_qualifier & DSP_FW_RUN_ENABLE, ACP_REG_POLL_INTERVAL, ACP_DMA_COMPLETE_TIMEOUT_US); From 897e91e995b338002b00454fd0018af26a098148 Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Tue, 13 Aug 2024 16:29:44 +0530 Subject: [PATCH 245/991] ASoC: SOF: amd: Fix for incorrect acp error register offsets Addition of 'dsp_intr_base' to ACP error register offsets points to wrong register offsets in irq handler. Correct the acp error register offsets. ACP error status register offset and acp error reason register offset got changed from ACP6.0 onwards. Add 'acp_error_stat' and 'acp_sw0_i2s_err_reason' as descriptor fields in sof_amd_acp_desc structure and update the values based on the ACP variant. >From Rembrandt platform onwards, errors related to SW1 Soundwire manager instance/I2S controller connected on P1 power tile is reported with ACP_SW1_I2S_ERROR_REASON register. Add conditional check for the same. Fixes: 96eb81851012 ("ASoC: SOF: amd: add interrupt handling for SoundWire manager devices") Signed-off-by: Vijendar Mukunda Link: https://patch.msgid.link/20240813105944.3126903-2-Vijendar.Mukunda@amd.com Signed-off-by: Mark Brown --- sound/soc/sof/amd/acp-dsp-offset.h | 6 ++++-- sound/soc/sof/amd/acp.c | 11 +++++++---- sound/soc/sof/amd/acp.h | 2 ++ sound/soc/sof/amd/pci-acp63.c | 2 ++ sound/soc/sof/amd/pci-rmb.c | 2 ++ sound/soc/sof/amd/pci-rn.c | 2 ++ 6 files changed, 19 insertions(+), 6 deletions(-) diff --git a/sound/soc/sof/amd/acp-dsp-offset.h b/sound/soc/sof/amd/acp-dsp-offset.h index 59afbe2e0f4205..072b703f9b3f32 100644 --- a/sound/soc/sof/amd/acp-dsp-offset.h +++ b/sound/soc/sof/amd/acp-dsp-offset.h @@ -76,13 +76,15 @@ #define DSP_SW_INTR_CNTL_OFFSET 0x0 #define DSP_SW_INTR_STAT_OFFSET 0x4 #define DSP_SW_INTR_TRIG_OFFSET 0x8 -#define ACP_ERROR_STATUS 0x18C4 +#define ACP3X_ERROR_STATUS 0x18C4 +#define ACP6X_ERROR_STATUS 0x1A4C #define ACP3X_AXI2DAGB_SEM_0 0x1880 #define ACP5X_AXI2DAGB_SEM_0 0x1884 #define ACP6X_AXI2DAGB_SEM_0 0x1874 /* ACP common registers to report errors related to I2S & SoundWire interfaces */ -#define ACP_SW0_I2S_ERROR_REASON 0x18B4 +#define ACP3X_SW_I2S_ERROR_REASON 0x18C8 +#define ACP6X_SW0_I2S_ERROR_REASON 0x18B4 #define ACP_SW1_I2S_ERROR_REASON 0x1A50 /* Registers from ACP_SHA block */ diff --git a/sound/soc/sof/amd/acp.c b/sound/soc/sof/amd/acp.c index 9123427fab4e30..d95f865669a699 100644 --- a/sound/soc/sof/amd/acp.c +++ b/sound/soc/sof/amd/acp.c @@ -92,6 +92,7 @@ static int config_dma_channel(struct acp_dev_data *adata, unsigned int ch, unsigned int idx, unsigned int dscr_count) { struct snd_sof_dev *sdev = adata->dev; + const struct sof_amd_acp_desc *desc = get_chip_info(sdev->pdata); unsigned int val, status; int ret; @@ -102,7 +103,7 @@ static int config_dma_channel(struct acp_dev_data *adata, unsigned int ch, val & (1 << ch), ACP_REG_POLL_INTERVAL, ACP_REG_POLL_TIMEOUT_US); if (ret < 0) { - status = snd_sof_dsp_read(sdev, ACP_DSP_BAR, ACP_ERROR_STATUS); + status = snd_sof_dsp_read(sdev, ACP_DSP_BAR, desc->acp_error_stat); val = snd_sof_dsp_read(sdev, ACP_DSP_BAR, ACP_DMA_ERR_STS_0 + ch * sizeof(u32)); dev_err(sdev->dev, "ACP_DMA_ERR_STS :0x%x ACP_ERROR_STATUS :0x%x\n", val, status); @@ -402,9 +403,11 @@ static irqreturn_t acp_irq_handler(int irq, void *dev_id) if (val & ACP_ERROR_IRQ_MASK) { snd_sof_dsp_write(sdev, ACP_DSP_BAR, desc->ext_intr_stat, ACP_ERROR_IRQ_MASK); - snd_sof_dsp_write(sdev, ACP_DSP_BAR, base + ACP_SW0_I2S_ERROR_REASON, 0); - snd_sof_dsp_write(sdev, ACP_DSP_BAR, base + ACP_SW1_I2S_ERROR_REASON, 0); - snd_sof_dsp_write(sdev, ACP_DSP_BAR, base + ACP_ERROR_STATUS, 0); + snd_sof_dsp_write(sdev, ACP_DSP_BAR, desc->acp_sw0_i2s_err_reason, 0); + /* ACP_SW1_I2S_ERROR_REASON is newly added register from rmb platform onwards */ + if (desc->rev >= 6) + snd_sof_dsp_write(sdev, ACP_DSP_BAR, ACP_SW1_I2S_ERROR_REASON, 0); + snd_sof_dsp_write(sdev, ACP_DSP_BAR, desc->acp_error_stat, 0); irq_flag = 1; } diff --git a/sound/soc/sof/amd/acp.h b/sound/soc/sof/amd/acp.h index 87e79d500865a8..1af86b5b28db80 100644 --- a/sound/soc/sof/amd/acp.h +++ b/sound/soc/sof/amd/acp.h @@ -203,6 +203,8 @@ struct sof_amd_acp_desc { u32 probe_reg_offset; u32 reg_start_addr; u32 reg_end_addr; + u32 acp_error_stat; + u32 acp_sw0_i2s_err_reason; u32 sdw_max_link_count; u64 sdw_acpi_dev_addr; }; diff --git a/sound/soc/sof/amd/pci-acp63.c b/sound/soc/sof/amd/pci-acp63.c index fc898444736571..986f5928caedd8 100644 --- a/sound/soc/sof/amd/pci-acp63.c +++ b/sound/soc/sof/amd/pci-acp63.c @@ -35,6 +35,8 @@ static const struct sof_amd_acp_desc acp63_chip_info = { .ext_intr_cntl = ACP6X_EXTERNAL_INTR_CNTL, .ext_intr_stat = ACP6X_EXT_INTR_STAT, .ext_intr_stat1 = ACP6X_EXT_INTR_STAT1, + .acp_error_stat = ACP6X_ERROR_STATUS, + .acp_sw0_i2s_err_reason = ACP6X_SW0_I2S_ERROR_REASON, .dsp_intr_base = ACP6X_DSP_SW_INTR_BASE, .sram_pte_offset = ACP6X_SRAM_PTE_OFFSET, .hw_semaphore_offset = ACP6X_AXI2DAGB_SEM_0, diff --git a/sound/soc/sof/amd/pci-rmb.c b/sound/soc/sof/amd/pci-rmb.c index 4bc30951f8b0d7..a366f904e6f31f 100644 --- a/sound/soc/sof/amd/pci-rmb.c +++ b/sound/soc/sof/amd/pci-rmb.c @@ -33,6 +33,8 @@ static const struct sof_amd_acp_desc rembrandt_chip_info = { .pgfsm_base = ACP6X_PGFSM_BASE, .ext_intr_stat = ACP6X_EXT_INTR_STAT, .dsp_intr_base = ACP6X_DSP_SW_INTR_BASE, + .acp_error_stat = ACP6X_ERROR_STATUS, + .acp_sw0_i2s_err_reason = ACP6X_SW0_I2S_ERROR_REASON, .sram_pte_offset = ACP6X_SRAM_PTE_OFFSET, .hw_semaphore_offset = ACP6X_AXI2DAGB_SEM_0, .fusion_dsp_offset = ACP6X_DSP_FUSION_RUNSTALL, diff --git a/sound/soc/sof/amd/pci-rn.c b/sound/soc/sof/amd/pci-rn.c index e08875bdfa8b16..2b7c53470ce823 100644 --- a/sound/soc/sof/amd/pci-rn.c +++ b/sound/soc/sof/amd/pci-rn.c @@ -33,6 +33,8 @@ static const struct sof_amd_acp_desc renoir_chip_info = { .pgfsm_base = ACP3X_PGFSM_BASE, .ext_intr_stat = ACP3X_EXT_INTR_STAT, .dsp_intr_base = ACP3X_DSP_SW_INTR_BASE, + .acp_error_stat = ACP3X_ERROR_STATUS, + .acp_sw0_i2s_err_reason = ACP3X_SW_I2S_ERROR_REASON, .sram_pte_offset = ACP3X_SRAM_PTE_OFFSET, .hw_semaphore_offset = ACP3X_AXI2DAGB_SEM_0, .acp_clkmux_sel = ACP3X_CLKMUX_SEL, From b919a27fab37e108164d657ac6e77bf870bf95e6 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 30 Jul 2024 12:35:11 +0200 Subject: [PATCH 246/991] ASoC: MAINTAINERS: Drop Banajit Goswami from Qualcomm sound drivers There was no active maintenance from Banajit Goswami - last email is from 2019 - so make obvious that Qualcomm sound drivers are maintained by only one person. Signed-off-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20240730103511.21728-1-krzysztof.kozlowski@linaro.org Signed-off-by: Mark Brown --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index d304054d661ecd..61a21efc357b10 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18522,7 +18522,6 @@ F: drivers/crypto/intel/qat/ QCOM AUDIO (ASoC) DRIVERS M: Srinivas Kandagatla -M: Banajit Goswami L: alsa-devel@alsa-project.org (moderated for non-subscribers) L: linux-arm-msm@vger.kernel.org S: Supported From 8475a1d9bb7acf1cb15842dd24baab0e8ea4e4ff Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 13 Aug 2024 12:32:09 +0100 Subject: [PATCH 247/991] ALSA: hda: cs35l41: Remove redundant call to hda_cs_dsp_control_remove() The driver doesn't create any ALSA controls for firmware controls, so it shouldn't be calling hda_cs_dsp_control_remove(). commit 312c04cee408 ("ALSA: hda: cs35l41: Stop creating ALSA Controls for firmware coefficients") removed the call to hda_cs_dsp_add_controls() but didn't remove the call for destroying those controls. Signed-off-by: Richard Fitzgerald Fixes: 312c04cee408 ("ALSA: hda: cs35l41: Stop creating ALSA Controls for firmware coefficients") Link: https://patch.msgid.link/20240813113209.648-1-rf@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/cs35l41_hda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c index 3a92e98da72d24..d68bf7591d90c1 100644 --- a/sound/pci/hda/cs35l41_hda.c +++ b/sound/pci/hda/cs35l41_hda.c @@ -134,7 +134,7 @@ static const struct reg_sequence cs35l41_hda_mute[] = { }; static const struct cs_dsp_client_ops client_ops = { - .control_remove = hda_cs_dsp_control_remove, + /* cs_dsp requires the client to provide this even if it is empty */ }; static int cs35l41_request_tuning_param_file(struct cs35l41_hda *cs35l41, char *tuning_filename, From 779bac9994452f6a894524f70c00cfb0cd4b6364 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 12 Aug 2024 15:08:04 +0200 Subject: [PATCH 248/991] Revert "ACPI: EC: Evaluate orphan _REG under EC device" This reverts commit 0e6b6dedf168 ("Revert "ACPI: EC: Evaluate orphan _REG under EC device") because the problem addressed by it will be addressed differently in what follows. Signed-off-by: Rafael J. Wysocki Reviewed-by: Hans de Goede Cc: All applicable Link: https://patch.msgid.link/3236716.5fSG56mABF@rjwysocki.net --- drivers/acpi/acpica/acevents.h | 4 --- drivers/acpi/acpica/evregion.c | 6 +++- drivers/acpi/acpica/evxfregn.c | 54 ---------------------------------- drivers/acpi/ec.c | 3 -- include/acpi/acpixf.h | 4 --- 5 files changed, 5 insertions(+), 66 deletions(-) diff --git a/drivers/acpi/acpica/acevents.h b/drivers/acpi/acpica/acevents.h index 2133085deda77f..ddd072cbc738d4 100644 --- a/drivers/acpi/acpica/acevents.h +++ b/drivers/acpi/acpica/acevents.h @@ -191,10 +191,6 @@ void acpi_ev_execute_reg_methods(struct acpi_namespace_node *node, acpi_adr_space_type space_id, u32 function); -void -acpi_ev_execute_orphan_reg_method(struct acpi_namespace_node *node, - acpi_adr_space_type space_id); - acpi_status acpi_ev_execute_reg_method(union acpi_operand_object *region_obj, u32 function); diff --git a/drivers/acpi/acpica/evregion.c b/drivers/acpi/acpica/evregion.c index dc6004daf624b6..18fdf2bc2d499a 100644 --- a/drivers/acpi/acpica/evregion.c +++ b/drivers/acpi/acpica/evregion.c @@ -20,6 +20,10 @@ extern u8 acpi_gbl_default_address_spaces[]; /* Local prototypes */ +static void +acpi_ev_execute_orphan_reg_method(struct acpi_namespace_node *device_node, + acpi_adr_space_type space_id); + static acpi_status acpi_ev_reg_run(acpi_handle obj_handle, u32 level, void *context, void **return_value); @@ -814,7 +818,7 @@ acpi_ev_reg_run(acpi_handle obj_handle, * ******************************************************************************/ -void +static void acpi_ev_execute_orphan_reg_method(struct acpi_namespace_node *device_node, acpi_adr_space_type space_id) { diff --git a/drivers/acpi/acpica/evxfregn.c b/drivers/acpi/acpica/evxfregn.c index 624361a5f34d86..3197e6303c5b08 100644 --- a/drivers/acpi/acpica/evxfregn.c +++ b/drivers/acpi/acpica/evxfregn.c @@ -306,57 +306,3 @@ acpi_execute_reg_methods(acpi_handle device, acpi_adr_space_type space_id) } ACPI_EXPORT_SYMBOL(acpi_execute_reg_methods) - -/******************************************************************************* - * - * FUNCTION: acpi_execute_orphan_reg_method - * - * PARAMETERS: device - Handle for the device - * space_id - The address space ID - * - * RETURN: Status - * - * DESCRIPTION: Execute an "orphan" _REG method that appears under an ACPI - * device. This is a _REG method that has no corresponding region - * within the device's scope. - * - ******************************************************************************/ -acpi_status -acpi_execute_orphan_reg_method(acpi_handle device, acpi_adr_space_type space_id) -{ - struct acpi_namespace_node *node; - acpi_status status; - - ACPI_FUNCTION_TRACE(acpi_execute_orphan_reg_method); - - /* Parameter validation */ - - if (!device) { - return_ACPI_STATUS(AE_BAD_PARAMETER); - } - - status = acpi_ut_acquire_mutex(ACPI_MTX_NAMESPACE); - if (ACPI_FAILURE(status)) { - return_ACPI_STATUS(status); - } - - /* Convert and validate the device handle */ - - node = acpi_ns_validate_handle(device); - if (node) { - - /* - * If an "orphan" _REG method is present in the device's scope - * for the given address space ID, run it. - */ - - acpi_ev_execute_orphan_reg_method(node, space_id); - } else { - status = AE_BAD_PARAMETER; - } - - (void)acpi_ut_release_mutex(ACPI_MTX_NAMESPACE); - return_ACPI_STATUS(status); -} - -ACPI_EXPORT_SYMBOL(acpi_execute_orphan_reg_method) diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index 299ec653388ceb..68dd17f96f636f 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -1507,9 +1507,6 @@ static int ec_install_handlers(struct acpi_ec *ec, struct acpi_device *device, if (call_reg && !test_bit(EC_FLAGS_EC_REG_CALLED, &ec->flags)) { acpi_execute_reg_methods(scope_handle, ACPI_ADR_SPACE_EC); - if (scope_handle != ec->handle) - acpi_execute_orphan_reg_method(ec->handle, ACPI_ADR_SPACE_EC); - set_bit(EC_FLAGS_EC_REG_CALLED, &ec->flags); } diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 80dc36f9d5274c..94d0fc3bd412d3 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -662,10 +662,6 @@ ACPI_EXTERNAL_RETURN_STATUS(acpi_status acpi_execute_reg_methods(acpi_handle device, acpi_adr_space_type space_id)) -ACPI_EXTERNAL_RETURN_STATUS(acpi_status - acpi_execute_orphan_reg_method(acpi_handle device, - acpi_adr_space_type - space_id)) ACPI_EXTERNAL_RETURN_STATUS(acpi_status acpi_remove_address_space_handler(acpi_handle device, From 5d61841c74db8b5bbbf9403f1bd4879f614617d2 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Fri, 9 Aug 2024 16:15:39 -0400 Subject: [PATCH 249/991] spi: zynqmp-gqspi: Scale timeout by data size Large blocks of data time out when reading because we don't wait long enough for the transfer to complete. Scale our timeouts based on the amount of data we are tranferring, with a healthy dose of pessimism. Signed-off-by: Sean Anderson Link: https://patch.msgid.link/20240809201540.3363243-1-sean.anderson@linux.dev Signed-off-by: Mark Brown --- drivers/spi/spi-zynqmp-gqspi.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/drivers/spi/spi-zynqmp-gqspi.c b/drivers/spi/spi-zynqmp-gqspi.c index 99524a3c9f382e..558c466135a51b 100644 --- a/drivers/spi/spi-zynqmp-gqspi.c +++ b/drivers/spi/spi-zynqmp-gqspi.c @@ -1033,6 +1033,18 @@ static int __maybe_unused zynqmp_runtime_resume(struct device *dev) return 0; } +static unsigned long zynqmp_qspi_timeout(struct zynqmp_qspi *xqspi, u8 bits, + unsigned long bytes) +{ + unsigned long timeout; + + /* Assume we are at most 2x slower than the nominal bus speed */ + timeout = mult_frac(bytes, 2 * 8 * MSEC_PER_SEC, + bits * xqspi->speed_hz); + /* And add 100 ms for scheduling delays */ + return msecs_to_jiffies(timeout + 100); +} + /** * zynqmp_qspi_exec_op() - Initiates the QSPI transfer * @mem: The SPI memory @@ -1049,6 +1061,7 @@ static int zynqmp_qspi_exec_op(struct spi_mem *mem, { struct zynqmp_qspi *xqspi = spi_controller_get_devdata (mem->spi->controller); + unsigned long timeout; int err = 0, i; u32 genfifoentry = 0; u16 opcode = op->cmd.opcode; @@ -1077,8 +1090,10 @@ static int zynqmp_qspi_exec_op(struct spi_mem *mem, zynqmp_gqspi_write(xqspi, GQSPI_IER_OFST, GQSPI_IER_GENFIFOEMPTY_MASK | GQSPI_IER_TXNOT_FULL_MASK); - if (!wait_for_completion_timeout - (&xqspi->data_completion, msecs_to_jiffies(1000))) { + timeout = zynqmp_qspi_timeout(xqspi, op->cmd.buswidth, + op->cmd.nbytes); + if (!wait_for_completion_timeout(&xqspi->data_completion, + timeout)) { err = -ETIMEDOUT; goto return_err; } @@ -1104,8 +1119,10 @@ static int zynqmp_qspi_exec_op(struct spi_mem *mem, GQSPI_IER_TXEMPTY_MASK | GQSPI_IER_GENFIFOEMPTY_MASK | GQSPI_IER_TXNOT_FULL_MASK); - if (!wait_for_completion_timeout - (&xqspi->data_completion, msecs_to_jiffies(1000))) { + timeout = zynqmp_qspi_timeout(xqspi, op->addr.buswidth, + op->addr.nbytes); + if (!wait_for_completion_timeout(&xqspi->data_completion, + timeout)) { err = -ETIMEDOUT; goto return_err; } @@ -1173,8 +1190,9 @@ static int zynqmp_qspi_exec_op(struct spi_mem *mem, GQSPI_IER_RXEMPTY_MASK); } } - if (!wait_for_completion_timeout - (&xqspi->data_completion, msecs_to_jiffies(1000))) + timeout = zynqmp_qspi_timeout(xqspi, op->data.buswidth, + op->data.nbytes); + if (!wait_for_completion_timeout(&xqspi->data_completion, timeout)) err = -ETIMEDOUT; } From cdf65d73e001fde600b18d7e45afadf559425ce5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 12 Aug 2024 15:11:42 +0200 Subject: [PATCH 250/991] ACPICA: Add a depth argument to acpi_execute_reg_methods() A subsequent change will need to pass a depth argument to acpi_execute_reg_methods(), so prepare that function for it. No intentional functional changes. Signed-off-by: Rafael J. Wysocki Reviewed-by: Hans de Goede Cc: All applicable Link: https://patch.msgid.link/8451567.NyiUUSuA9g@rjwysocki.net --- drivers/acpi/acpica/acevents.h | 2 +- drivers/acpi/acpica/evregion.c | 6 ++++-- drivers/acpi/acpica/evxfregn.c | 10 +++++++--- drivers/acpi/ec.c | 2 +- include/acpi/acpixf.h | 1 + 5 files changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/acpi/acpica/acevents.h b/drivers/acpi/acpica/acevents.h index ddd072cbc738d4..1c5218b79fc2ac 100644 --- a/drivers/acpi/acpica/acevents.h +++ b/drivers/acpi/acpica/acevents.h @@ -188,7 +188,7 @@ acpi_ev_detach_region(union acpi_operand_object *region_obj, u8 acpi_ns_is_locked); void -acpi_ev_execute_reg_methods(struct acpi_namespace_node *node, +acpi_ev_execute_reg_methods(struct acpi_namespace_node *node, u32 max_depth, acpi_adr_space_type space_id, u32 function); acpi_status diff --git a/drivers/acpi/acpica/evregion.c b/drivers/acpi/acpica/evregion.c index 18fdf2bc2d499a..cf53b9535f18e0 100644 --- a/drivers/acpi/acpica/evregion.c +++ b/drivers/acpi/acpica/evregion.c @@ -65,6 +65,7 @@ acpi_status acpi_ev_initialize_op_regions(void) acpi_gbl_default_address_spaces [i])) { acpi_ev_execute_reg_methods(acpi_gbl_root_node, + ACPI_UINT32_MAX, acpi_gbl_default_address_spaces [i], ACPI_REG_CONNECT); } @@ -672,6 +673,7 @@ acpi_ev_execute_reg_method(union acpi_operand_object *region_obj, u32 function) * FUNCTION: acpi_ev_execute_reg_methods * * PARAMETERS: node - Namespace node for the device + * max_depth - Depth to which search for _REG * space_id - The address space ID * function - Passed to _REG: On (1) or Off (0) * @@ -683,7 +685,7 @@ acpi_ev_execute_reg_method(union acpi_operand_object *region_obj, u32 function) ******************************************************************************/ void -acpi_ev_execute_reg_methods(struct acpi_namespace_node *node, +acpi_ev_execute_reg_methods(struct acpi_namespace_node *node, u32 max_depth, acpi_adr_space_type space_id, u32 function) { struct acpi_reg_walk_info info; @@ -717,7 +719,7 @@ acpi_ev_execute_reg_methods(struct acpi_namespace_node *node, * regions and _REG methods. (i.e. handlers must be installed for all * regions of this Space ID before we can run any _REG methods) */ - (void)acpi_ns_walk_namespace(ACPI_TYPE_ANY, node, ACPI_UINT32_MAX, + (void)acpi_ns_walk_namespace(ACPI_TYPE_ANY, node, max_depth, ACPI_NS_WALK_UNLOCK, acpi_ev_reg_run, NULL, &info, NULL); diff --git a/drivers/acpi/acpica/evxfregn.c b/drivers/acpi/acpica/evxfregn.c index 3197e6303c5b08..95f78383bbdba1 100644 --- a/drivers/acpi/acpica/evxfregn.c +++ b/drivers/acpi/acpica/evxfregn.c @@ -85,7 +85,8 @@ acpi_install_address_space_handler_internal(acpi_handle device, /* Run all _REG methods for this address space */ if (run_reg) { - acpi_ev_execute_reg_methods(node, space_id, ACPI_REG_CONNECT); + acpi_ev_execute_reg_methods(node, ACPI_UINT32_MAX, space_id, + ACPI_REG_CONNECT); } unlock_and_exit: @@ -263,6 +264,7 @@ ACPI_EXPORT_SYMBOL(acpi_remove_address_space_handler) * FUNCTION: acpi_execute_reg_methods * * PARAMETERS: device - Handle for the device + * max_depth - Depth to which search for _REG * space_id - The address space ID * * RETURN: Status @@ -271,7 +273,8 @@ ACPI_EXPORT_SYMBOL(acpi_remove_address_space_handler) * ******************************************************************************/ acpi_status -acpi_execute_reg_methods(acpi_handle device, acpi_adr_space_type space_id) +acpi_execute_reg_methods(acpi_handle device, u32 max_depth, + acpi_adr_space_type space_id) { struct acpi_namespace_node *node; acpi_status status; @@ -296,7 +299,8 @@ acpi_execute_reg_methods(acpi_handle device, acpi_adr_space_type space_id) /* Run all _REG methods for this address space */ - acpi_ev_execute_reg_methods(node, space_id, ACPI_REG_CONNECT); + acpi_ev_execute_reg_methods(node, max_depth, space_id, + ACPI_REG_CONNECT); } else { status = AE_BAD_PARAMETER; } diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index 68dd17f96f636f..d9c12db80f1123 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -1506,7 +1506,7 @@ static int ec_install_handlers(struct acpi_ec *ec, struct acpi_device *device, } if (call_reg && !test_bit(EC_FLAGS_EC_REG_CALLED, &ec->flags)) { - acpi_execute_reg_methods(scope_handle, ACPI_ADR_SPACE_EC); + acpi_execute_reg_methods(scope_handle, ACPI_UINT32_MAX, ACPI_ADR_SPACE_EC); set_bit(EC_FLAGS_EC_REG_CALLED, &ec->flags); } diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 94d0fc3bd412d3..9f1c1d225e32c6 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -660,6 +660,7 @@ ACPI_EXTERNAL_RETURN_STATUS(acpi_status void *context)) ACPI_EXTERNAL_RETURN_STATUS(acpi_status acpi_execute_reg_methods(acpi_handle device, + u32 nax_depth, acpi_adr_space_type space_id)) ACPI_EXTERNAL_RETURN_STATUS(acpi_status From 71bf41b8e913ec9fc91f0d39ab8fb320229ec604 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 12 Aug 2024 15:16:21 +0200 Subject: [PATCH 251/991] ACPI: EC: Evaluate _REG outside the EC scope more carefully Commit 60fa6ae6e6d0 ("ACPI: EC: Install address space handler at the namespace root") caused _REG methods for EC operation regions outside the EC device scope to be evaluated which on some systems leads to the evaluation of _REG methods in the scopes of device objects representing devices that are not present and not functional according to the _STA return values. Some of those device objects represent EC "alternatives" and if _REG is evaluated for their operation regions, the platform firmware may be confused and the platform may start to behave incorrectly. To avoid this problem, only evaluate _REG for EC operation regions located in the scopes of device objects representing known-to-be-present devices. For this purpose, partially revert commit 60fa6ae6e6d0 and trigger the evaluation of _REG for EC operation regions from acpi_bus_attach() for the known-valid devices. Fixes: 60fa6ae6e6d0 ("ACPI: EC: Install address space handler at the namespace root") Link: https://lore.kernel.org/linux-acpi/1f76b7e2-1928-4598-8037-28a1785c2d13@redhat.com Link: https://bugzilla.redhat.com/show_bug.cgi?id=2298938 Link: https://bugzilla.redhat.com/show_bug.cgi?id=2302253 Reported-by: Hans de Goede Signed-off-by: Rafael J. Wysocki Reviewed-by: Hans de Goede Cc: All applicable Link: https://patch.msgid.link/23612351.6Emhk5qWAg@rjwysocki.net --- drivers/acpi/ec.c | 11 +++++++++-- drivers/acpi/internal.h | 1 + drivers/acpi/scan.c | 2 ++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index d9c12db80f1123..38d2f6e6b12b4f 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -1487,12 +1487,13 @@ static bool install_gpio_irq_event_handler(struct acpi_ec *ec) static int ec_install_handlers(struct acpi_ec *ec, struct acpi_device *device, bool call_reg) { - acpi_handle scope_handle = ec == first_ec ? ACPI_ROOT_OBJECT : ec->handle; acpi_status status; acpi_ec_start(ec, false); if (!test_bit(EC_FLAGS_EC_HANDLER_INSTALLED, &ec->flags)) { + acpi_handle scope_handle = ec == first_ec ? ACPI_ROOT_OBJECT : ec->handle; + acpi_ec_enter_noirq(ec); status = acpi_install_address_space_handler_no_reg(scope_handle, ACPI_ADR_SPACE_EC, @@ -1506,7 +1507,7 @@ static int ec_install_handlers(struct acpi_ec *ec, struct acpi_device *device, } if (call_reg && !test_bit(EC_FLAGS_EC_REG_CALLED, &ec->flags)) { - acpi_execute_reg_methods(scope_handle, ACPI_UINT32_MAX, ACPI_ADR_SPACE_EC); + acpi_execute_reg_methods(ec->handle, ACPI_UINT32_MAX, ACPI_ADR_SPACE_EC); set_bit(EC_FLAGS_EC_REG_CALLED, &ec->flags); } @@ -1721,6 +1722,12 @@ static void acpi_ec_remove(struct acpi_device *device) } } +void acpi_ec_register_opregions(struct acpi_device *adev) +{ + if (first_ec && first_ec->handle != adev->handle) + acpi_execute_reg_methods(adev->handle, 1, ACPI_ADR_SPACE_EC); +} + static acpi_status ec_parse_io_ports(struct acpi_resource *resource, void *context) { diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 601b670356e50e..aadd4c218b320e 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -223,6 +223,7 @@ int acpi_ec_add_query_handler(struct acpi_ec *ec, u8 query_bit, acpi_handle handle, acpi_ec_query_func func, void *data); void acpi_ec_remove_query_handler(struct acpi_ec *ec, u8 query_bit); +void acpi_ec_register_opregions(struct acpi_device *adev); #ifdef CONFIG_PM_SLEEP void acpi_ec_flush_work(void); diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 59771412686ba3..22ae7829a9155d 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -2273,6 +2273,8 @@ static int acpi_bus_attach(struct acpi_device *device, void *first_pass) if (device->handler) goto ok; + acpi_ec_register_opregions(device); + if (!device->flags.initialized) { device->flags.power_manageable = device->power.states[ACPI_STATE_D0].flags.valid; From 1e1fd567d32fcf7544c6e09e0e5bc6c650da6e23 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 13 Aug 2024 12:38:51 +0200 Subject: [PATCH 252/991] dm suspend: return -ERESTARTSYS instead of -EINTR This commit changes device mapper, so that it returns -ERESTARTSYS instead of -EINTR when it is interrupted by a signal (so that the ioctl can be restarted). The manpage signal(7) says that the ioctl function should be restarted if the signal was handled with SA_RESTART. Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org --- drivers/md/dm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 97fab2087df86d..87bb9030343582 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2737,7 +2737,7 @@ static int dm_wait_for_bios_completion(struct mapped_device *md, unsigned int ta break; if (signal_pending_state(task_state, current)) { - r = -EINTR; + r = -ERESTARTSYS; break; } @@ -2762,7 +2762,7 @@ static int dm_wait_for_completion(struct mapped_device *md, unsigned int task_st break; if (signal_pending_state(task_state, current)) { - r = -EINTR; + r = -ERESTARTSYS; break; } From 7a636b4f03af9d541205f69e373672e7b2b60a8a Mon Sep 17 00:00:00 2001 From: Khazhismel Kumykov Date: Tue, 13 Aug 2024 12:39:52 +0200 Subject: [PATCH 253/991] dm resume: don't return EINVAL when signalled If the dm_resume method is called on a device that is not suspended, the method will suspend the device briefly, before resuming it (so that the table will be swapped). However, there was a bug that the return value of dm_suspended_md was not checked. dm_suspended_md may return an error when it is interrupted by a signal. In this case, do_resume would call dm_swap_table, which would return -EINVAL. This commit fixes the logic, so that error returned by dm_suspend is checked and the resume operation is undone. Signed-off-by: Mikulas Patocka Signed-off-by: Khazhismel Kumykov Cc: stable@vger.kernel.org --- drivers/md/dm-ioctl.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index c2c07bfa647199..f299ff393a6a2c 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1181,8 +1181,26 @@ static int do_resume(struct dm_ioctl *param) suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; if (param->flags & DM_NOFLUSH_FLAG) suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; - if (!dm_suspended_md(md)) - dm_suspend(md, suspend_flags); + if (!dm_suspended_md(md)) { + r = dm_suspend(md, suspend_flags); + if (r) { + down_write(&_hash_lock); + hc = dm_get_mdptr(md); + if (hc && !hc->new_map) { + hc->new_map = new_map; + new_map = NULL; + } else { + r = -ENXIO; + } + up_write(&_hash_lock); + if (new_map) { + dm_sync_table(md); + dm_table_destroy(new_map); + } + dm_put(md); + return r; + } + } old_size = dm_get_size(md); old_map = dm_swap_table(md, new_map); From 2a0629834cd82f05d424bbc193374f9a43d1f87d Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Fri, 9 Aug 2024 11:16:28 +0800 Subject: [PATCH 254/991] vfs: Don't evict inode under the inode lru traversing context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The inode reclaiming process(See function prune_icache_sb) collects all reclaimable inodes and mark them with I_FREEING flag at first, at that time, other processes will be stuck if they try getting these inodes (See function find_inode_fast), then the reclaiming process destroy the inodes by function dispose_list(). Some filesystems(eg. ext4 with ea_inode feature, ubifs with xattr) may do inode lookup in the inode evicting callback function, if the inode lookup is operated under the inode lru traversing context, deadlock problems may happen. Case 1: In function ext4_evict_inode(), the ea inode lookup could happen if ea_inode feature is enabled, the lookup process will be stuck under the evicting context like this: 1. File A has inode i_reg and an ea inode i_ea 2. getfattr(A, xattr_buf) // i_ea is added into lru // lru->i_ea 3. Then, following three processes running like this: PA PB echo 2 > /proc/sys/vm/drop_caches shrink_slab prune_dcache_sb // i_reg is added into lru, lru->i_ea->i_reg prune_icache_sb list_lru_walk_one inode_lru_isolate i_ea->i_state |= I_FREEING // set inode state inode_lru_isolate __iget(i_reg) spin_unlock(&i_reg->i_lock) spin_unlock(lru_lock) rm file A i_reg->nlink = 0 iput(i_reg) // i_reg->nlink is 0, do evict ext4_evict_inode ext4_xattr_delete_inode ext4_xattr_inode_dec_ref_all ext4_xattr_inode_iget ext4_iget(i_ea->i_ino) iget_locked find_inode_fast __wait_on_freeing_inode(i_ea) ----→ AA deadlock dispose_list // cannot be executed by prune_icache_sb wake_up_bit(&i_ea->i_state) Case 2: In deleted inode writing function ubifs_jnl_write_inode(), file deleting process holds BASEHD's wbuf->io_mutex while getting the xattr inode, which could race with inode reclaiming process(The reclaiming process could try locking BASEHD's wbuf->io_mutex in inode evicting function), then an ABBA deadlock problem would happen as following: 1. File A has inode ia and a xattr(with inode ixa), regular file B has inode ib and a xattr. 2. getfattr(A, xattr_buf) // ixa is added into lru // lru->ixa 3. Then, following three processes running like this: PA PB PC echo 2 > /proc/sys/vm/drop_caches shrink_slab prune_dcache_sb // ib and ia are added into lru, lru->ixa->ib->ia prune_icache_sb list_lru_walk_one inode_lru_isolate ixa->i_state |= I_FREEING // set inode state inode_lru_isolate __iget(ib) spin_unlock(&ib->i_lock) spin_unlock(lru_lock) rm file B ib->nlink = 0 rm file A iput(ia) ubifs_evict_inode(ia) ubifs_jnl_delete_inode(ia) ubifs_jnl_write_inode(ia) make_reservation(BASEHD) // Lock wbuf->io_mutex ubifs_iget(ixa->i_ino) iget_locked find_inode_fast __wait_on_freeing_inode(ixa) | iput(ib) // ib->nlink is 0, do evict | ubifs_evict_inode | ubifs_jnl_delete_inode(ib) ↓ ubifs_jnl_write_inode ABBA deadlock ←-----make_reservation(BASEHD) dispose_list // cannot be executed by prune_icache_sb wake_up_bit(&ixa->i_state) Fix the possible deadlock by using new inode state flag I_LRU_ISOLATING to pin the inode in memory while inode_lru_isolate() reclaims its pages instead of using ordinary inode reference. This way inode deletion cannot be triggered from inode_lru_isolate() thus avoiding the deadlock. evict() is made to wait for I_LRU_ISOLATING to be cleared before proceeding with inode cleanup. Link: https://lore.kernel.org/all/37c29c42-7685-d1f0-067d-63582ffac405@huaweicloud.com/ Link: https://bugzilla.kernel.org/show_bug.cgi?id=219022 Fixes: e50e5129f384 ("ext4: xattr-in-inode support") Fixes: 7959cf3a7506 ("ubifs: journal: Handle xattrs like files") Cc: stable@vger.kernel.org Signed-off-by: Zhihao Cheng Link: https://lore.kernel.org/r/20240809031628.1069873-1-chengzhihao@huaweicloud.com Reviewed-by: Jan Kara Suggested-by: Jan Kara Suggested-by: Mateusz Guzik Signed-off-by: Christian Brauner --- fs/inode.c | 39 +++++++++++++++++++++++++++++++++++++-- include/linux/fs.h | 5 +++++ 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 86670941884b48..10c4619faeef8c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -488,6 +488,39 @@ static void inode_lru_list_del(struct inode *inode) this_cpu_dec(nr_unused); } +static void inode_pin_lru_isolating(struct inode *inode) +{ + lockdep_assert_held(&inode->i_lock); + WARN_ON(inode->i_state & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE)); + inode->i_state |= I_LRU_ISOLATING; +} + +static void inode_unpin_lru_isolating(struct inode *inode) +{ + spin_lock(&inode->i_lock); + WARN_ON(!(inode->i_state & I_LRU_ISOLATING)); + inode->i_state &= ~I_LRU_ISOLATING; + smp_mb(); + wake_up_bit(&inode->i_state, __I_LRU_ISOLATING); + spin_unlock(&inode->i_lock); +} + +static void inode_wait_for_lru_isolating(struct inode *inode) +{ + spin_lock(&inode->i_lock); + if (inode->i_state & I_LRU_ISOLATING) { + DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LRU_ISOLATING); + wait_queue_head_t *wqh; + + wqh = bit_waitqueue(&inode->i_state, __I_LRU_ISOLATING); + spin_unlock(&inode->i_lock); + __wait_on_bit(wqh, &wq, bit_wait, TASK_UNINTERRUPTIBLE); + spin_lock(&inode->i_lock); + WARN_ON(inode->i_state & I_LRU_ISOLATING); + } + spin_unlock(&inode->i_lock); +} + /** * inode_sb_list_add - add inode to the superblock list of inodes * @inode: inode to add @@ -657,6 +690,8 @@ static void evict(struct inode *inode) inode_sb_list_del(inode); + inode_wait_for_lru_isolating(inode); + /* * Wait for flusher thread to be done with the inode so that filesystem * does not start destroying it while writeback is still running. Since @@ -855,7 +890,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item, * be under pressure before the cache inside the highmem zone. */ if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) { - __iget(inode); + inode_pin_lru_isolating(inode); spin_unlock(&inode->i_lock); spin_unlock(lru_lock); if (remove_inode_buffers(inode)) { @@ -867,7 +902,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item, __count_vm_events(PGINODESTEAL, reap); mm_account_reclaimed_pages(reap); } - iput(inode); + inode_unpin_lru_isolating(inode); spin_lock(lru_lock); return LRU_RETRY; } diff --git a/include/linux/fs.h b/include/linux/fs.h index fd34b5755c0b52..fb0426f349fc5c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2392,6 +2392,9 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, * * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback. * + * I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding + * i_count. + * * Q: What is the difference between I_WILL_FREE and I_FREEING? */ #define I_DIRTY_SYNC (1 << 0) @@ -2415,6 +2418,8 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, #define I_DONTCACHE (1 << 16) #define I_SYNC_QUEUED (1 << 17) #define I_PINNING_NETFS_WB (1 << 18) +#define __I_LRU_ISOLATING 19 +#define I_LRU_ISOLATING (1 << __I_LRU_ISOLATING) #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) From e3786b29c54cdae3490b07180a54e2461f42144c Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Thu, 8 Aug 2024 14:29:38 +0100 Subject: [PATCH 255/991] 9p: Fix DIO read through netfs If a program is watching a file on a 9p mount, it won't see any change in size if the file being exported by the server is changed directly in the source filesystem, presumably because 9p doesn't have change notifications, and because netfs skips the reads if the file is empty. Fix this by attempting to read the full size specified when a DIO read is requested (such as when 9p is operating in unbuffered mode) and dealing with a short read if the EOF was less than the expected read. To make this work, filesystems using netfslib must not set NETFS_SREQ_CLEAR_TAIL if performing a DIO read where that read hit the EOF. I don't want to mandatorily clear this flag in netfslib for DIO because, say, ceph might make a read from an object that is not completely filled, but does not reside at the end of file - and so we need to clear the excess. This can be tested by watching an empty file over 9p within a VM (such as in the ktest framework): while true; do read content; if [ -n "$content" ]; then echo $content; break; fi; done < /host/tmp/foo then writing something into the empty file. The watcher should immediately display the file content and break out of the loop. Without this fix, it remains in the loop indefinitely. Fixes: 80105ed2fd27 ("9p: Use netfslib read/write_iter") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218916 Signed-off-by: David Howells Link: https://lore.kernel.org/r/1229195.1723211769@warthog.procyon.org.uk cc: Eric Van Hensbergen cc: Latchesar Ionkov cc: Christian Schoenebeck cc: Marc Dionne cc: Ilya Dryomov cc: Steve French cc: Paulo Alcantara cc: Trond Myklebust cc: v9fs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: ceph-devel@vger.kernel.org cc: linux-cifs@vger.kernel.org cc: linux-nfs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Dominique Martinet Signed-off-by: Christian Brauner --- fs/9p/vfs_addr.c | 3 ++- fs/afs/file.c | 3 ++- fs/ceph/addr.c | 6 ++++-- fs/netfs/io.c | 17 +++++++++++------ fs/nfs/fscache.c | 3 ++- fs/smb/client/file.c | 3 ++- 6 files changed, 23 insertions(+), 12 deletions(-) diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index a97ceb105cd8dd..24fdc74caeba47 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -75,7 +75,8 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) /* if we just extended the file size, any portion not in * cache won't be on server and is zeroes */ - __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + if (subreq->rreq->origin != NETFS_DIO_READ) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); netfs_subreq_terminated(subreq, err ?: total, false); } diff --git a/fs/afs/file.c b/fs/afs/file.c index c3f0c45ae9a9b6..ec1be0091fdb56 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -242,7 +242,8 @@ static void afs_fetch_data_notify(struct afs_operation *op) req->error = error; if (subreq) { - __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + if (subreq->rreq->origin != NETFS_DIO_READ) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); netfs_subreq_terminated(subreq, error ?: req->actual_len, false); req->subreq = NULL; } else if (req->done) { diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index cc0a2240de98ee..c4744a02db753c 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -246,7 +246,8 @@ static void finish_netfs_read(struct ceph_osd_request *req) if (err >= 0) { if (sparse && err > 0) err = ceph_sparse_ext_map_end(op); - if (err < subreq->len) + if (err < subreq->len && + subreq->rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); if (IS_ENCRYPTED(inode) && err > 0) { err = ceph_fscrypt_decrypt_extents(inode, @@ -282,7 +283,8 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) size_t len; int mode; - __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + if (rreq->origin != NETFS_DIO_READ) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); if (subreq->start >= inode->i_size) diff --git a/fs/netfs/io.c b/fs/netfs/io.c index c179a1c73fa703..5367caf3fa2863 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -530,7 +530,8 @@ void netfs_subreq_terminated(struct netfs_io_subrequest *subreq, if (transferred_or_error == 0) { if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) { - subreq->error = -ENODATA; + if (rreq->origin != NETFS_DIO_READ) + subreq->error = -ENODATA; goto failed; } } else { @@ -601,9 +602,14 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq, } if (subreq->len > ictx->zero_point - subreq->start) subreq->len = ictx->zero_point - subreq->start; + + /* We limit buffered reads to the EOF, but let the + * server deal with larger-than-EOF DIO/unbuffered + * reads. + */ + if (subreq->len > rreq->i_size - subreq->start) + subreq->len = rreq->i_size - subreq->start; } - if (subreq->len > rreq->i_size - subreq->start) - subreq->len = rreq->i_size - subreq->start; if (rreq->rsize && subreq->len > rreq->rsize) subreq->len = rreq->rsize; @@ -739,11 +745,10 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) do { _debug("submit %llx + %llx >= %llx", rreq->start, rreq->submitted, rreq->i_size); - if (rreq->origin == NETFS_DIO_READ && - rreq->start + rreq->submitted >= rreq->i_size) - break; if (!netfs_rreq_submit_slice(rreq, &io_iter)) break; + if (test_bit(NETFS_SREQ_NO_PROGRESS, &rreq->flags)) + break; if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) && test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags)) break; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index bf29a65c5027f4..7a558dea75c409 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -363,7 +363,8 @@ void nfs_netfs_read_completion(struct nfs_pgio_header *hdr) return; sreq = netfs->sreq; - if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) + if (test_bit(NFS_IOHDR_EOF, &hdr->flags) && + sreq->rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &sreq->flags); if (hdr->error) diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index b2405dd4d4d4da..3f3842e7b44a76 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -217,7 +217,8 @@ static void cifs_req_issue_read(struct netfs_io_subrequest *subreq) goto out; } - __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + if (subreq->rreq->origin != NETFS_DIO_READ) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); rc = rdata->server->ops->async_readv(rdata); out: From 810ee43d9cd245d138a2733d87a24858a23f577d Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Mon, 12 Aug 2024 00:28:21 +0100 Subject: [PATCH 256/991] Squashfs: sanity check symbolic link size Syzkiller reports a "KMSAN: uninit-value in pick_link" bug. This is caused by an uninitialised page, which is ultimately caused by a corrupted symbolic link size read from disk. The reason why the corrupted symlink size causes an uninitialised page is due to the following sequence of events: 1. squashfs_read_inode() is called to read the symbolic link from disk. This assigns the corrupted value 3875536935 to inode->i_size. 2. Later squashfs_symlink_read_folio() is called, which assigns this corrupted value to the length variable, which being a signed int, overflows producing a negative number. 3. The following loop that fills in the page contents checks that the copied bytes is less than length, which being negative means the loop is skipped, producing an uninitialised page. This patch adds a sanity check which checks that the symbolic link size is not larger than expected. -- Signed-off-by: Phillip Lougher Link: https://lore.kernel.org/r/20240811232821.13903-1-phillip@squashfs.org.uk Reported-by: Lizhi Xu Reported-by: syzbot+24ac24ff58dc5b0d26b9@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/000000000000a90e8c061e86a76b@google.com/ V2: fix spelling mistake. Signed-off-by: Christian Brauner --- fs/squashfs/inode.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index 16bd693d0b3aa2..d5918eba27e371 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -279,8 +279,13 @@ int squashfs_read_inode(struct inode *inode, long long ino) if (err < 0) goto failed_read; - set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); + if (inode->i_size > PAGE_SIZE) { + ERROR("Corrupted symlink\n"); + return -EINVAL; + } + + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_op = &squashfs_symlink_inode_ops; inode_nohighmem(inode); inode->i_data.a_ops = &squashfs_symlink_aops; From e4956dc7a84da074fd8dc10f7abd147f15b3ae58 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 13 Aug 2024 06:10:59 -0600 Subject: [PATCH 257/991] io_uring/sqpoll: annotate debug task == current with data_race() There's a debug check in io_sq_thread_park() checking if it's the SQPOLL thread itself calling park. KCSAN warns about this, as we should not be reading sqd->thread outside of sqd->lock. Just silence this with data_race(). The pointer isn't used for anything but this debug check. Reported-by: syzbot+2b946a3fd80caf971b21@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- io_uring/sqpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index b3722e5275e77e..3b50dc9586d14f 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -44,7 +44,7 @@ void io_sq_thread_unpark(struct io_sq_data *sqd) void io_sq_thread_park(struct io_sq_data *sqd) __acquires(&sqd->lock) { - WARN_ON_ONCE(sqd->thread == current); + WARN_ON_ONCE(data_race(sqd->thread) == current); atomic_inc(&sqd->park_pending); set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); From bcc954c6caba01fca143162d5fbb90e46aa1ad80 Mon Sep 17 00:00:00 2001 From: Ryo Takakura Date: Mon, 12 Aug 2024 16:27:03 +0900 Subject: [PATCH 258/991] printk/panic: Allow cpu backtraces to be written into ringbuffer during panic commit 779dbc2e78d7 ("printk: Avoid non-panic CPUs writing to ringbuffer") disabled non-panic CPUs to further write messages to ringbuffer after panicked. Since the commit, non-panicked CPU's are not allowed to write to ring buffer after panicked and CPU backtrace which is triggered after panicked to sample non-panicked CPUs' backtrace no longer serves its function as it has nothing to print. Fix the issue by allowing non-panicked CPUs to write into ringbuffer while CPU backtrace is in flight. Fixes: 779dbc2e78d7 ("printk: Avoid non-panic CPUs writing to ringbuffer") Signed-off-by: Ryo Takakura Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20240812072703.339690-1-takakura@valinux.co.jp Signed-off-by: Petr Mladek --- include/linux/panic.h | 1 + kernel/panic.c | 8 +++++++- kernel/printk/printk.c | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/linux/panic.h b/include/linux/panic.h index 3130e0b5116b03..54d90b6c5f47bd 100644 --- a/include/linux/panic.h +++ b/include/linux/panic.h @@ -16,6 +16,7 @@ extern void oops_enter(void); extern void oops_exit(void); extern bool oops_may_print(void); +extern bool panic_triggering_all_cpu_backtrace; extern int panic_timeout; extern unsigned long panic_print; extern int panic_on_oops; diff --git a/kernel/panic.c b/kernel/panic.c index f861bedc1925e7..2a0449144f82ea 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -64,6 +64,8 @@ unsigned long panic_on_taint; bool panic_on_taint_nousertaint = false; static unsigned int warn_limit __read_mostly; +bool panic_triggering_all_cpu_backtrace; + int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -253,8 +255,12 @@ void check_panic_on_warn(const char *origin) */ static void panic_other_cpus_shutdown(bool crash_kexec) { - if (panic_print & PANIC_PRINT_ALL_CPU_BT) + if (panic_print & PANIC_PRINT_ALL_CPU_BT) { + /* Temporary allow non-panic CPUs to write their backtraces. */ + panic_triggering_all_cpu_backtrace = true; trigger_all_cpu_backtrace(); + panic_triggering_all_cpu_backtrace = false; + } /* * Note that smp_send_stop() is the usual SMP shutdown function, diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 054c0e7784fdfd..c22b07049c382c 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2316,7 +2316,7 @@ asmlinkage int vprintk_emit(int facility, int level, * non-panic CPUs are generating any messages, they will be * silently dropped. */ - if (other_cpu_in_panic()) + if (other_cpu_in_panic() && !panic_triggering_all_cpu_backtrace) return 0; if (level == LOGLEVEL_SCHED) { From 0ecc5be200c84e67114f3640064ba2bae3ba2f5a Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Tue, 13 Aug 2024 09:48:27 +0800 Subject: [PATCH 259/991] x86/apic: Make x2apic_disable() work correctly x2apic_disable() clears x2apic_state and x2apic_mode unconditionally, even when the state is X2APIC_ON_LOCKED, which prevents the kernel to disable it thereby creating inconsistent state. Due to the early state check for X2APIC_ON, the code path which warns about a locked X2APIC cannot be reached. Test for state < X2APIC_ON instead and move the clearing of the state and mode variables to the place which actually disables X2APIC. [ tglx: Massaged change log. Added Fixes tag. Moved clearing so it's at the right place for back ports ] Fixes: a57e456a7b28 ("x86/apic: Fix fallout from x2apic cleanup") Signed-off-by: Yuntao Wang Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20240813014827.895381-1-yuntao.wang@linux.dev --- arch/x86/kernel/apic/apic.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 66fd4b2a37a3a5..373638691cd480 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1775,12 +1775,9 @@ static __init void apic_set_fixmap(bool read_apic); static __init void x2apic_disable(void) { - u32 x2apic_id, state = x2apic_state; + u32 x2apic_id; - x2apic_mode = 0; - x2apic_state = X2APIC_DISABLED; - - if (state != X2APIC_ON) + if (x2apic_state < X2APIC_ON) return; x2apic_id = read_apic_id(); @@ -1793,6 +1790,10 @@ static __init void x2apic_disable(void) } __x2apic_disable(); + + x2apic_mode = 0; + x2apic_state = X2APIC_DISABLED; + /* * Don't reread the APIC ID as it was already done from * check_x2apic() and the APIC driver still is a x2APIC variant, From b098495e69491c2225681f43228312d32477217b Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Sat, 3 Aug 2024 19:32:33 +0800 Subject: [PATCH 260/991] KVM: x86: hyper-v: Remove unused inline function kvm_hv_free_pa_page() There is no caller in tree since introduction in commit b4f69df0f65e ("KVM: x86: Make Hyper-V emulation optional") Signed-off-by: Yue Haibing Message-ID: <20240803113233.128185-1-yuehaibing@huawei.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 923e64903da9af..913bfc96959cbf 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -286,7 +286,6 @@ static inline int kvm_hv_hypercall(struct kvm_vcpu *vcpu) return HV_STATUS_ACCESS_DENIED; } static inline void kvm_hv_vcpu_purge_flush_tlb(struct kvm_vcpu *vcpu) {} -static inline void kvm_hv_free_pa_page(struct kvm *kvm) {} static inline bool kvm_hv_synic_has_vector(struct kvm_vcpu *vcpu, int vector) { return false; From df934abb185c71c9f2fa07a5013672d0cbd36560 Mon Sep 17 00:00:00 2001 From: David Thompson Date: Fri, 9 Aug 2024 12:36:12 -0400 Subject: [PATCH 261/991] mlxbf_gige: disable RX filters until RX path initialized A recent change to the driver exposed a bug where the MAC RX filters (unicast MAC, broadcast MAC, and multicast MAC) are configured and enabled before the RX path is fully initialized. The result of this bug is that after the PHY is started packets that match these MAC RX filters start to flow into the RX FIFO. And then, after rx_init() is completed, these packets will go into the driver RX ring as well. If enough packets are received to fill the RX ring (default size is 128 packets) before the call to request_irq() completes, the driver RX function becomes stuck. This bug is intermittent but is most likely to be seen where the oob_net0 interface is connected to a busy network with lots of broadcast and multicast traffic. All the MAC RX filters must be disabled until the RX path is ready, i.e. all initialization is done and all the IRQs are installed. Fixes: f7442a634ac0 ("mlxbf_gige: call request_irq() after NAPI initialized") Reviewed-by: Asmaa Mnebhi Signed-off-by: David Thompson Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240809163612.12852-1-davthompson@nvidia.com Signed-off-by: Paolo Abeni --- .../ethernet/mellanox/mlxbf_gige/mlxbf_gige.h | 8 +++ .../mellanox/mlxbf_gige/mlxbf_gige_main.c | 10 ++++ .../mellanox/mlxbf_gige/mlxbf_gige_regs.h | 2 + .../mellanox/mlxbf_gige/mlxbf_gige_rx.c | 50 ++++++++++++++++--- 4 files changed, 64 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige.h b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige.h index bc94e75a7aebd1..e7777700ee18a7 100644 --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige.h +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige.h @@ -40,6 +40,7 @@ */ #define MLXBF_GIGE_BCAST_MAC_FILTER_IDX 0 #define MLXBF_GIGE_LOCAL_MAC_FILTER_IDX 1 +#define MLXBF_GIGE_MAX_FILTER_IDX 3 /* Define for broadcast MAC literal */ #define BCAST_MAC_ADDR 0xFFFFFFFFFFFF @@ -175,6 +176,13 @@ enum mlxbf_gige_res { int mlxbf_gige_mdio_probe(struct platform_device *pdev, struct mlxbf_gige *priv); void mlxbf_gige_mdio_remove(struct mlxbf_gige *priv); + +void mlxbf_gige_enable_multicast_rx(struct mlxbf_gige *priv); +void mlxbf_gige_disable_multicast_rx(struct mlxbf_gige *priv); +void mlxbf_gige_enable_mac_rx_filter(struct mlxbf_gige *priv, + unsigned int index); +void mlxbf_gige_disable_mac_rx_filter(struct mlxbf_gige *priv, + unsigned int index); void mlxbf_gige_set_mac_rx_filter(struct mlxbf_gige *priv, unsigned int index, u64 dmac); void mlxbf_gige_get_mac_rx_filter(struct mlxbf_gige *priv, diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c index b157f0f1c5a886..385a56ac73481a 100644 --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c @@ -168,6 +168,10 @@ static int mlxbf_gige_open(struct net_device *netdev) if (err) goto napi_deinit; + mlxbf_gige_enable_mac_rx_filter(priv, MLXBF_GIGE_BCAST_MAC_FILTER_IDX); + mlxbf_gige_enable_mac_rx_filter(priv, MLXBF_GIGE_LOCAL_MAC_FILTER_IDX); + mlxbf_gige_enable_multicast_rx(priv); + /* Set bits in INT_EN that we care about */ int_en = MLXBF_GIGE_INT_EN_HW_ACCESS_ERROR | MLXBF_GIGE_INT_EN_TX_CHECKSUM_INPUTS | @@ -379,6 +383,7 @@ static int mlxbf_gige_probe(struct platform_device *pdev) void __iomem *plu_base; void __iomem *base; int addr, phy_irq; + unsigned int i; int err; base = devm_platform_ioremap_resource(pdev, MLXBF_GIGE_RES_MAC); @@ -423,6 +428,11 @@ static int mlxbf_gige_probe(struct platform_device *pdev) priv->rx_q_entries = MLXBF_GIGE_DEFAULT_RXQ_SZ; priv->tx_q_entries = MLXBF_GIGE_DEFAULT_TXQ_SZ; + for (i = 0; i <= MLXBF_GIGE_MAX_FILTER_IDX; i++) + mlxbf_gige_disable_mac_rx_filter(priv, i); + mlxbf_gige_disable_multicast_rx(priv); + mlxbf_gige_disable_promisc(priv); + /* Write initial MAC address to hardware */ mlxbf_gige_initial_mac(priv); diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_regs.h b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_regs.h index 98a8681c21b9ca..4d14cb13fd64e1 100644 --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_regs.h +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_regs.h @@ -62,6 +62,8 @@ #define MLXBF_GIGE_TX_STATUS_DATA_FIFO_FULL BIT(1) #define MLXBF_GIGE_RX_MAC_FILTER_DMAC_RANGE_START 0x0520 #define MLXBF_GIGE_RX_MAC_FILTER_DMAC_RANGE_END 0x0528 +#define MLXBF_GIGE_RX_MAC_FILTER_GENERAL 0x0530 +#define MLXBF_GIGE_RX_MAC_FILTER_EN_MULTICAST BIT(1) #define MLXBF_GIGE_RX_MAC_FILTER_COUNT_DISC 0x0540 #define MLXBF_GIGE_RX_MAC_FILTER_COUNT_DISC_EN BIT(0) #define MLXBF_GIGE_RX_MAC_FILTER_COUNT_PASS 0x0548 diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c index 69998435849348..eb62620b63c7fc 100644 --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c @@ -11,15 +11,31 @@ #include "mlxbf_gige.h" #include "mlxbf_gige_regs.h" -void mlxbf_gige_set_mac_rx_filter(struct mlxbf_gige *priv, - unsigned int index, u64 dmac) +void mlxbf_gige_enable_multicast_rx(struct mlxbf_gige *priv) { void __iomem *base = priv->base; - u64 control; + u64 data; - /* Write destination MAC to specified MAC RX filter */ - writeq(dmac, base + MLXBF_GIGE_RX_MAC_FILTER + - (index * MLXBF_GIGE_RX_MAC_FILTER_STRIDE)); + data = readq(base + MLXBF_GIGE_RX_MAC_FILTER_GENERAL); + data |= MLXBF_GIGE_RX_MAC_FILTER_EN_MULTICAST; + writeq(data, base + MLXBF_GIGE_RX_MAC_FILTER_GENERAL); +} + +void mlxbf_gige_disable_multicast_rx(struct mlxbf_gige *priv) +{ + void __iomem *base = priv->base; + u64 data; + + data = readq(base + MLXBF_GIGE_RX_MAC_FILTER_GENERAL); + data &= ~MLXBF_GIGE_RX_MAC_FILTER_EN_MULTICAST; + writeq(data, base + MLXBF_GIGE_RX_MAC_FILTER_GENERAL); +} + +void mlxbf_gige_enable_mac_rx_filter(struct mlxbf_gige *priv, + unsigned int index) +{ + void __iomem *base = priv->base; + u64 control; /* Enable MAC receive filter mask for specified index */ control = readq(base + MLXBF_GIGE_CONTROL); @@ -27,6 +43,28 @@ void mlxbf_gige_set_mac_rx_filter(struct mlxbf_gige *priv, writeq(control, base + MLXBF_GIGE_CONTROL); } +void mlxbf_gige_disable_mac_rx_filter(struct mlxbf_gige *priv, + unsigned int index) +{ + void __iomem *base = priv->base; + u64 control; + + /* Disable MAC receive filter mask for specified index */ + control = readq(base + MLXBF_GIGE_CONTROL); + control &= ~(MLXBF_GIGE_CONTROL_EN_SPECIFIC_MAC << index); + writeq(control, base + MLXBF_GIGE_CONTROL); +} + +void mlxbf_gige_set_mac_rx_filter(struct mlxbf_gige *priv, + unsigned int index, u64 dmac) +{ + void __iomem *base = priv->base; + + /* Write destination MAC to specified MAC RX filter */ + writeq(dmac, base + MLXBF_GIGE_RX_MAC_FILTER + + (index * MLXBF_GIGE_RX_MAC_FILTER_STRIDE)); +} + void mlxbf_gige_get_mac_rx_filter(struct mlxbf_gige *priv, unsigned int index, u64 *dmac) { From 15e1c3d65975524c5c792fcd59f7d89f00402261 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Fri, 2 Aug 2024 13:16:30 -0700 Subject: [PATCH 262/991] KVM: x86: Use this_cpu_ptr() instead of per_cpu_ptr(smp_processor_id()) Use this_cpu_ptr() instead of open coding the equivalent in various user return MSR helpers. Signed-off-by: Isaku Yamahata Reviewed-by: Chao Gao Reviewed-by: Yuan Yao [sean: massage changelog] Signed-off-by: Sean Christopherson Reviewed-by: Pankaj Gupta Message-ID: <20240802201630.339306-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ef3d3511e4af56..70219e4069874a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -427,8 +427,7 @@ static void kvm_user_return_msr_cpu_online(void) int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) { - unsigned int cpu = smp_processor_id(); - struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); + struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); int err; value = (value & mask) | (msrs->values[slot].host & ~mask); @@ -450,8 +449,7 @@ EXPORT_SYMBOL_GPL(kvm_set_user_return_msr); static void drop_user_return_notifiers(void) { - unsigned int cpu = smp_processor_id(); - struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); + struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); if (msrs->registered) kvm_on_user_return(&msrs->urn); From 86cfa9a85fb04fa61e7c6b5a8ecf812437cdad78 Mon Sep 17 00:00:00 2001 From: Daniel Yang Date: Wed, 7 Aug 2024 02:01:21 -0700 Subject: [PATCH 263/991] Documentation: dm-crypt.rst warning + error fix While building kernel documention using make htmldocs command, I was getting unexpected indentation error. Single description was given for two module parameters with wrong indentation. So, I corrected the indentation of both parameters and the description. Signed-off-by: Shibu kumar Signed-off-by: Daniel Yang Signed-off-by: Mikulas Patocka Fixes: 0d815e3400e6 ("dm-crypt: limit the size of encryption requests") --- .../admin-guide/device-mapper/dm-crypt.rst | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/device-mapper/dm-crypt.rst b/Documentation/admin-guide/device-mapper/dm-crypt.rst index e625830d335ea7..552c9155165d7c 100644 --- a/Documentation/admin-guide/device-mapper/dm-crypt.rst +++ b/Documentation/admin-guide/device-mapper/dm-crypt.rst @@ -162,13 +162,14 @@ iv_large_sectors Module parameters:: -max_read_size -max_write_size - Maximum size of read or write requests. When a request larger than this size - is received, dm-crypt will split the request. The splitting improves - concurrency (the split requests could be encrypted in parallel by multiple - cores), but it also causes overhead. The user should tune these parameters to - fit the actual workload. + + max_read_size + max_write_size + Maximum size of read or write requests. When a request larger than this size + is received, dm-crypt will split the request. The splitting improves + concurrency (the split requests could be encrypted in parallel by multiple + cores), but it also causes overhead. The user should tune these parameters to + fit the actual workload. Example scripts From c0196faaa927321a63e680427e075734ee656e42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albert=20Jakie=C5=82a?= Date: Fri, 9 Aug 2024 13:56:27 +0000 Subject: [PATCH 264/991] ASoC: SOF: mediatek: Add missing board compatible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Google Dojo compatible. Signed-off-by: Albert Jakieła Reviewed-by: Chen-Yu Tsai Link: https://patch.msgid.link/20240809135627.544429-1-jakiela@google.com Signed-off-by: Mark Brown --- sound/soc/sof/mediatek/mt8195/mt8195.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/soc/sof/mediatek/mt8195/mt8195.c b/sound/soc/sof/mediatek/mt8195/mt8195.c index 1c6e035fd313f7..82d221f53a461c 100644 --- a/sound/soc/sof/mediatek/mt8195/mt8195.c +++ b/sound/soc/sof/mediatek/mt8195/mt8195.c @@ -574,6 +574,9 @@ static struct snd_sof_of_mach sof_mt8195_machs[] = { { .compatible = "google,tomato", .sof_tplg_filename = "sof-mt8195-mt6359-rt1019-rt5682.tplg" + }, { + .compatible = "google,dojo", + .sof_tplg_filename = "sof-mt8195-mt6359-max98390-rt5682.tplg" }, { .compatible = "mediatek,mt8195", .sof_tplg_filename = "sof-mt8195.tplg" From 4b7c3f6d04bd53f2e5b228b6821fb8f5d1ba3071 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 13:29:40 -0700 Subject: [PATCH 265/991] KVM: x86: Make x2APIC ID 100% readonly Ignore the userspace provided x2APIC ID when fixing up APIC state for KVM_SET_LAPIC, i.e. make the x2APIC fully readonly in KVM. Commit a92e2543d6a8 ("KVM: x86: use hardware-compatible format for APIC ID register"), which added the fixup, didn't intend to allow userspace to modify the x2APIC ID. In fact, that commit is when KVM first started treating the x2APIC ID as readonly, apparently to fix some race: static inline u32 kvm_apic_id(struct kvm_lapic *apic) { - return (kvm_lapic_get_reg(apic, APIC_ID) >> 24) & 0xff; + /* To avoid a race between apic_base and following APIC_ID update when + * switching to x2apic_mode, the x2apic mode returns initial x2apic id. + */ + if (apic_x2apic_mode(apic)) + return apic->vcpu->vcpu_id; + + return kvm_lapic_get_reg(apic, APIC_ID) >> 24; } Furthermore, KVM doesn't support delivering interrupts to vCPUs with a modified x2APIC ID, but KVM *does* return the modified value on a guest RDMSR and for KVM_GET_LAPIC. I.e. no remotely sane setup can actually work with a modified x2APIC ID. Making the x2APIC ID fully readonly fixes a WARN in KVM's optimized map calculation, which expects the LDR to align with the x2APIC ID. WARNING: CPU: 2 PID: 958 at arch/x86/kvm/lapic.c:331 kvm_recalculate_apic_map+0x609/0xa00 [kvm] CPU: 2 PID: 958 Comm: recalc_apic_map Not tainted 6.4.0-rc3-vanilla+ #35 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.2-1-1 04/01/2014 RIP: 0010:kvm_recalculate_apic_map+0x609/0xa00 [kvm] Call Trace: kvm_apic_set_state+0x1cf/0x5b0 [kvm] kvm_arch_vcpu_ioctl+0x1806/0x2100 [kvm] kvm_vcpu_ioctl+0x663/0x8a0 [kvm] __x64_sys_ioctl+0xb8/0xf0 do_syscall_64+0x56/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 RIP: 0033:0x7fade8b9dd6f Unfortunately, the WARN can still trigger for other CPUs than the current one by racing against KVM_SET_LAPIC, so remove it completely. Reported-by: Michal Luczaj Closes: https://lore.kernel.org/all/814baa0c-1eaa-4503-129f-059917365e80@rbox.co Reported-by: Haoyu Wu Closes: https://lore.kernel.org/all/20240126161633.62529-1-haoyuwu254@gmail.com Reported-by: syzbot+545f1326f405db4e1c3e@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/000000000000c2a6b9061cbca3c3@google.com Signed-off-by: Sean Christopherson Message-ID: <20240802202941.344889-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4915acdbfcd8d9..5bb481aefcbcdc 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -351,10 +351,8 @@ static void kvm_recalculate_logical_map(struct kvm_apic_map *new, * reversing the LDR calculation to get cluster of APICs, i.e. no * additional work is required. */ - if (apic_x2apic_mode(apic)) { - WARN_ON_ONCE(ldr != kvm_apic_calc_x2apic_ldr(kvm_x2apic_id(apic))); + if (apic_x2apic_mode(apic)) return; - } if (WARN_ON_ONCE(!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask))) { @@ -2966,18 +2964,28 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s, bool set) { if (apic_x2apic_mode(vcpu->arch.apic)) { + u32 x2apic_id = kvm_x2apic_id(vcpu->arch.apic); u32 *id = (u32 *)(s->regs + APIC_ID); u32 *ldr = (u32 *)(s->regs + APIC_LDR); u64 icr; if (vcpu->kvm->arch.x2apic_format) { - if (*id != vcpu->vcpu_id) + if (*id != x2apic_id) return -EINVAL; } else { + /* + * Ignore the userspace value when setting APIC state. + * KVM's model is that the x2APIC ID is readonly, e.g. + * KVM only supports delivering interrupts to KVM's + * version of the x2APIC ID. However, for backwards + * compatibility, don't reject attempts to set a + * mismatched ID for userspace that hasn't opted into + * x2apic_format. + */ if (set) - *id >>= 24; + *id = x2apic_id; else - *id <<= 24; + *id = x2apic_id << 24; } /* @@ -2986,7 +2994,7 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, * split to ICR+ICR2 in userspace for backwards compatibility. */ if (set) { - *ldr = kvm_apic_calc_x2apic_ldr(*id); + *ldr = kvm_apic_calc_x2apic_ldr(x2apic_id); icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) | (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32; From 238d3d63d1e27c8d9733b48f7b682fc6aba86672 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Fri, 2 Aug 2024 13:29:41 -0700 Subject: [PATCH 266/991] KVM: selftests: Add a testcase to verify x2APIC is fully readonly Add a test to verify that userspace can't change a vCPU's x2APIC ID by abusing KVM_SET_LAPIC. KVM models the x2APIC ID (and x2APIC LDR) as readonly, and silently ignores userspace attempts to change the x2APIC ID for backwards compatibility. Signed-off-by: Michal Luczaj [sean: write changelog, add to existing test] Signed-off-by: Sean Christopherson Message-ID: <20240802202941.344889-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/x86_64/xapic_state_test.c | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c index 69849acd95b0af..618cd244239004 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c @@ -184,6 +184,33 @@ static void test_apic_id(void) kvm_vm_free(vm); } +static void test_x2apic_id(void) +{ + struct kvm_lapic_state lapic = {}; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int i; + + vm = vm_create_with_one_vcpu(&vcpu, NULL); + vcpu_set_msr(vcpu, MSR_IA32_APICBASE, MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE); + + /* + * Try stuffing a modified x2APIC ID, KVM should ignore the value and + * always return the vCPU's default/readonly x2APIC ID. + */ + for (i = 0; i <= 0xff; i++) { + *(u32 *)(lapic.regs + APIC_ID) = i << 24; + *(u32 *)(lapic.regs + APIC_SPIV) = APIC_SPIV_APIC_ENABLED; + vcpu_ioctl(vcpu, KVM_SET_LAPIC, &lapic); + + vcpu_ioctl(vcpu, KVM_GET_LAPIC, &lapic); + TEST_ASSERT(*((u32 *)&lapic.regs[APIC_ID]) == vcpu->id << 24, + "x2APIC ID should be fully readonly"); + } + + kvm_vm_free(vm); +} + int main(int argc, char *argv[]) { struct xapic_vcpu x = { @@ -211,4 +238,5 @@ int main(int argc, char *argv[]) kvm_vm_free(vm); test_apic_id(); + test_x2apic_id(); } From c9b35a6f4edea698a5bb4dd8029e7104ee0a3726 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 11 Jul 2024 20:11:30 +0800 Subject: [PATCH 267/991] KVM: eventfd: Use synchronize_srcu_expedited() on shutdown When hot-unplug a device which has many queues, and guest CPU will has huge jitter, and unplugging is very slow. It turns out synchronize_srcu() in irqfd_shutdown() caused the guest jitter and unplugging latency, so replace synchronize_srcu() with synchronize_srcu_expedited(), to accelerate the unplugging, and reduce the guest OS jitter, this accelerates the VM reboot too. Signed-off-by: Li RongQing Message-ID: <20240711121130.38917-1-lirongqing@baidu.com> [Call it just once in irqfd_resampler_shutdown. - Paolo] Signed-off-by: Paolo Bonzini --- virt/kvm/eventfd.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 229570059a1bb5..992f9beb3e7d04 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -97,18 +97,19 @@ irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd) mutex_lock(&kvm->irqfds.resampler_lock); list_del_rcu(&irqfd->resampler_link); - synchronize_srcu(&kvm->irq_srcu); if (list_empty(&resampler->list)) { list_del_rcu(&resampler->link); kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier); /* - * synchronize_srcu(&kvm->irq_srcu) already called + * synchronize_srcu_expedited(&kvm->irq_srcu) already called * in kvm_unregister_irq_ack_notifier(). */ kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, resampler->notifier.gsi, 0, false); kfree(resampler); + } else { + synchronize_srcu_expedited(&kvm->irq_srcu); } mutex_unlock(&kvm->irqfds.resampler_lock); @@ -126,7 +127,7 @@ irqfd_shutdown(struct work_struct *work) u64 cnt; /* Make sure irqfd has been initialized in assign path. */ - synchronize_srcu(&kvm->irq_srcu); + synchronize_srcu_expedited(&kvm->irq_srcu); /* * Synchronize with the wait-queue and unhook ourselves to prevent @@ -384,7 +385,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) } list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list); - synchronize_srcu(&kvm->irq_srcu); + synchronize_srcu_expedited(&kvm->irq_srcu); mutex_unlock(&kvm->irqfds.resampler_lock); } @@ -523,7 +524,7 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, mutex_lock(&kvm->irq_lock); hlist_del_init_rcu(&kian->link); mutex_unlock(&kvm->irq_lock); - synchronize_srcu(&kvm->irq_srcu); + synchronize_srcu_expedited(&kvm->irq_srcu); kvm_arch_post_irq_ack_notifier_list_update(kvm); } @@ -608,7 +609,7 @@ kvm_irqfd_release(struct kvm *kvm) /* * Take note of a change in irq routing. - * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards. + * Caller must invoke synchronize_srcu_expedited(&kvm->irq_srcu) afterwards. */ void kvm_irq_routing_update(struct kvm *kvm) { From 11752c013f562a1124088a35bd314aa0e9f0e88f Mon Sep 17 00:00:00 2001 From: Jack Xiao Date: Thu, 18 Jul 2024 16:38:50 +0800 Subject: [PATCH 268/991] drm/amdgpu/mes: fix mes ring buffer overflow wait memory room until enough before writing mes packets to avoid ring buffer overflow. v2: squash in sched_hw_submission fix Fixes: de3246254156 ("drm/amdgpu: cleanup MES11 command submission") Fixes: fffe347e1478 ("drm/amdgpu: cleanup MES12 command submission") Signed-off-by: Jack Xiao Acked-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 34e087e8920e635c62e2ed6a758b0cd27f836d13) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 2 ++ drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 18 ++++++++++++++---- drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 18 ++++++++++++++---- 3 files changed, 30 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index ad49cecb20b8b2..e6344a6b0a9f6a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -212,6 +212,8 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring, */ if (ring->funcs->type == AMDGPU_RING_TYPE_KIQ) sched_hw_submission = max(sched_hw_submission, 256); + if (ring->funcs->type == AMDGPU_RING_TYPE_MES) + sched_hw_submission = 8; else if (ring == &adev->sdma.instance[0].page) sched_hw_submission = 256; diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index f9343642ae7e41..1a5ad5be33bfc9 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -168,7 +168,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, const char *op_str, *misc_op_str; unsigned long flags; u64 status_gpu_addr; - u32 status_offset; + u32 seq, status_offset; u64 *status_ptr; signed long r; int ret; @@ -196,6 +196,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, if (r) goto error_unlock_free; + seq = ++ring->fence_drv.sync_seq; + r = amdgpu_fence_wait_polling(ring, + seq - ring->fence_drv.num_fences_mask, + timeout); + if (r < 1) + goto error_undo; + api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off); api_status->api_completion_fence_addr = status_gpu_addr; api_status->api_completion_fence_value = 1; @@ -208,8 +215,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS; mes_status_pkt.api_status.api_completion_fence_addr = ring->fence_drv.gpu_addr; - mes_status_pkt.api_status.api_completion_fence_value = - ++ring->fence_drv.sync_seq; + mes_status_pkt.api_status.api_completion_fence_value = seq; amdgpu_ring_write_multiple(ring, &mes_status_pkt, sizeof(mes_status_pkt) / 4); @@ -229,7 +235,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, dev_dbg(adev->dev, "MES msg=%d was emitted\n", x_pkt->header.opcode); - r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout); + r = amdgpu_fence_wait_polling(ring, seq, timeout); if (r < 1 || !*status_ptr) { if (misc_op_str) @@ -252,6 +258,10 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, amdgpu_device_wb_free(adev, status_offset); return 0; +error_undo: + dev_err(adev->dev, "MES ring buffer is full.\n"); + amdgpu_ring_undo(ring); + error_unlock_free: spin_unlock_irqrestore(&mes->ring_lock, flags); diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index 0713bc3eb263ea..249e5a66205c27 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -154,7 +154,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, const char *op_str, *misc_op_str; unsigned long flags; u64 status_gpu_addr; - u32 status_offset; + u32 seq, status_offset; u64 *status_ptr; signed long r; int ret; @@ -182,6 +182,13 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, if (r) goto error_unlock_free; + seq = ++ring->fence_drv.sync_seq; + r = amdgpu_fence_wait_polling(ring, + seq - ring->fence_drv.num_fences_mask, + timeout); + if (r < 1) + goto error_undo; + api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off); api_status->api_completion_fence_addr = status_gpu_addr; api_status->api_completion_fence_value = 1; @@ -194,8 +201,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS; mes_status_pkt.api_status.api_completion_fence_addr = ring->fence_drv.gpu_addr; - mes_status_pkt.api_status.api_completion_fence_value = - ++ring->fence_drv.sync_seq; + mes_status_pkt.api_status.api_completion_fence_value = seq; amdgpu_ring_write_multiple(ring, &mes_status_pkt, sizeof(mes_status_pkt) / 4); @@ -215,7 +221,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, dev_dbg(adev->dev, "MES msg=%d was emitted\n", x_pkt->header.opcode); - r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout); + r = amdgpu_fence_wait_polling(ring, seq, timeout); if (r < 1 || !*status_ptr) { if (misc_op_str) @@ -238,6 +244,10 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, amdgpu_device_wb_free(adev, status_offset); return 0; +error_undo: + dev_err(adev->dev, "MES ring buffer is full.\n"); + amdgpu_ring_undo(ring); + error_unlock_free: spin_unlock_irqrestore(&mes->ring_lock, flags); From f6098641d3e1e4d4052ff9378857c831f9675f6b Mon Sep 17 00:00:00 2001 From: Hamza Mahfooz Date: Tue, 6 Aug 2024 09:55:55 -0400 Subject: [PATCH 269/991] drm/amd/display: fix s2idle entry for DCN3.5+ To be able to get to the lowest power state when suspending systems with DCN3.5+, we must be in IPS before the display hardware is put into D3cold. So, to ensure that the system always reaches the lowest power state while suspending, force systems that support IPS to enter idle optimizations before entering D3cold. Reviewed-by: Roman Li Signed-off-by: Hamza Mahfooz Signed-off-by: Alex Deucher (cherry picked from commit 237193e21b29d4aa0617ffeea3d6f49e72999708) Cc: stable@vger.kernel.org # 6.10+ --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 7e7929f24ae447..983a977632ff4e 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -2893,6 +2893,9 @@ static int dm_suspend(void *handle) hpd_rx_irq_work_suspend(dm); + if (adev->dm.dc->caps.ips_support) + dc_allow_idle_optimizations(adev->dm.dc, true); + dc_set_power_state(dm->dc, DC_ACPI_CM_POWER_STATE_D3); dc_dmub_srv_set_power_state(dm->dc->ctx->dmub_srv, DC_ACPI_CM_POWER_STATE_D3); From 0dbb81d44108a2a1004e5b485ef3fca5bc078424 Mon Sep 17 00:00:00 2001 From: Loan Chen Date: Fri, 2 Aug 2024 13:57:40 +0800 Subject: [PATCH 270/991] drm/amd/display: Enable otg synchronization logic for DCN321 [Why] Tiled display cannot synchronize properly after S3. The fix for commit 5f0c74915815 ("drm/amd/display: Fix for otg synchronization logic") is not enable in DCN321, which causes the otg is excluded from synchronization. [How] Enable otg synchronization logic in dcn321. Fixes: 5f0c74915815 ("drm/amd/display: Fix for otg synchronization logic") Cc: Mario Limonciello Cc: Alex Deucher Reviewed-by: Alvin Lee Signed-off-by: Loan Chen Signed-off-by: Tom Chung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit d6ed53712f583423db61fbb802606759e023bf7b) Cc: stable@vger.kernel.org --- .../gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c b/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c index 9a3cc0514a36ed..8e0588b1cf3054 100644 --- a/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c @@ -1778,6 +1778,9 @@ static bool dcn321_resource_construct( dc->caps.color.mpc.ogam_rom_caps.hlg = 0; dc->caps.color.mpc.ocsc = 1; + /* Use pipe context based otg sync logic */ + dc->config.use_pipe_ctx_sync_logic = true; + dc->config.dc_mode_clk_limit_support = true; dc->config.enable_windowed_mpo_odm = true; /* read VBIOS LTTPR caps */ From 338567d17627064dba63cf063459605e782f71d2 Mon Sep 17 00:00:00 2001 From: Fangzhi Zuo Date: Mon, 29 Jul 2024 10:23:03 -0400 Subject: [PATCH 271/991] drm/amd/display: Fix MST BW calculation Regression [Why & How] Revert commit 8b2cb32cf0c6 ("drm/amd/display: FEC overhead should be checked once for mst slot nums") Because causes bw calculation regression Cc: mario.limonciello@amd.com Cc: alexander.deucher@amd.com Reported-by: jirislaby@kernel.org Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3495 Closes: https://bugzilla.suse.com/show_bug.cgi?id=1228093 Reviewed-by: Wayne Lin Signed-off-by: Fangzhi Zuo Signed-off-by: Tom Chung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 12dbb3ed212fc7655fce421542a5add637f8af7a) Cc: stable@vger.kernel.org --- .../display/amdgpu_dm/amdgpu_dm_mst_types.c | 33 ++++++++++++++----- .../display/amdgpu_dm/amdgpu_dm_mst_types.h | 3 ++ 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c index 915eb2c08ece29..2e9f6da1acdcad 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c @@ -804,12 +804,25 @@ struct dsc_mst_fairness_params { }; #if defined(CONFIG_DRM_AMD_DC_FP) -static int kbps_to_peak_pbn(int kbps) +static uint16_t get_fec_overhead_multiplier(struct dc_link *dc_link) +{ + u8 link_coding_cap; + uint16_t fec_overhead_multiplier_x1000 = PBN_FEC_OVERHEAD_MULTIPLIER_8B_10B; + + link_coding_cap = dc_link_dp_mst_decide_link_encoding_format(dc_link); + if (link_coding_cap == DP_128b_132b_ENCODING) + fec_overhead_multiplier_x1000 = PBN_FEC_OVERHEAD_MULTIPLIER_128B_132B; + + return fec_overhead_multiplier_x1000; +} + +static int kbps_to_peak_pbn(int kbps, uint16_t fec_overhead_multiplier_x1000) { u64 peak_kbps = kbps; peak_kbps *= 1006; - peak_kbps = div_u64(peak_kbps, 1000); + peak_kbps *= fec_overhead_multiplier_x1000; + peak_kbps = div_u64(peak_kbps, 1000 * 1000); return (int) DIV64_U64_ROUND_UP(peak_kbps * 64, (54 * 8 * 1000)); } @@ -910,11 +923,12 @@ static int increase_dsc_bpp(struct drm_atomic_state *state, int link_timeslots_used; int fair_pbn_alloc; int ret = 0; + uint16_t fec_overhead_multiplier_x1000 = get_fec_overhead_multiplier(dc_link); for (i = 0; i < count; i++) { if (vars[i + k].dsc_enabled) { initial_slack[i] = - kbps_to_peak_pbn(params[i].bw_range.max_kbps) - vars[i + k].pbn; + kbps_to_peak_pbn(params[i].bw_range.max_kbps, fec_overhead_multiplier_x1000) - vars[i + k].pbn; bpp_increased[i] = false; remaining_to_increase += 1; } else { @@ -1010,6 +1024,7 @@ static int try_disable_dsc(struct drm_atomic_state *state, int next_index; int remaining_to_try = 0; int ret; + uint16_t fec_overhead_multiplier_x1000 = get_fec_overhead_multiplier(dc_link); for (i = 0; i < count; i++) { if (vars[i + k].dsc_enabled @@ -1039,7 +1054,7 @@ static int try_disable_dsc(struct drm_atomic_state *state, if (next_index == -1) break; - vars[next_index].pbn = kbps_to_peak_pbn(params[next_index].bw_range.stream_kbps); + vars[next_index].pbn = kbps_to_peak_pbn(params[next_index].bw_range.stream_kbps, fec_overhead_multiplier_x1000); ret = drm_dp_atomic_find_time_slots(state, params[next_index].port->mgr, params[next_index].port, @@ -1052,8 +1067,7 @@ static int try_disable_dsc(struct drm_atomic_state *state, vars[next_index].dsc_enabled = false; vars[next_index].bpp_x16 = 0; } else { - vars[next_index].pbn = kbps_to_peak_pbn( - params[next_index].bw_range.max_kbps); + vars[next_index].pbn = kbps_to_peak_pbn(params[next_index].bw_range.stream_kbps, fec_overhead_multiplier_x1000); ret = drm_dp_atomic_find_time_slots(state, params[next_index].port->mgr, params[next_index].port, @@ -1082,6 +1096,7 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state, int count = 0; int i, k, ret; bool debugfs_overwrite = false; + uint16_t fec_overhead_multiplier_x1000 = get_fec_overhead_multiplier(dc_link); memset(params, 0, sizeof(params)); @@ -1146,7 +1161,7 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state, /* Try no compression */ for (i = 0; i < count; i++) { vars[i + k].aconnector = params[i].aconnector; - vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.stream_kbps); + vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.stream_kbps, fec_overhead_multiplier_x1000); vars[i + k].dsc_enabled = false; vars[i + k].bpp_x16 = 0; ret = drm_dp_atomic_find_time_slots(state, params[i].port->mgr, params[i].port, @@ -1165,7 +1180,7 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state, /* Try max compression */ for (i = 0; i < count; i++) { if (params[i].compression_possible && params[i].clock_force_enable != DSC_CLK_FORCE_DISABLE) { - vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.min_kbps); + vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.min_kbps, fec_overhead_multiplier_x1000); vars[i + k].dsc_enabled = true; vars[i + k].bpp_x16 = params[i].bw_range.min_target_bpp_x16; ret = drm_dp_atomic_find_time_slots(state, params[i].port->mgr, @@ -1173,7 +1188,7 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state, if (ret < 0) return ret; } else { - vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.stream_kbps); + vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.stream_kbps, fec_overhead_multiplier_x1000); vars[i + k].dsc_enabled = false; vars[i + k].bpp_x16 = 0; ret = drm_dp_atomic_find_time_slots(state, params[i].port->mgr, diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.h index fa84d34b737367..600d6e22101112 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.h +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.h @@ -46,6 +46,9 @@ #define SYNAPTICS_CASCADED_HUB_ID 0x5A #define IS_SYNAPTICS_CASCADED_PANAMERA(devName, data) ((IS_SYNAPTICS_PANAMERA(devName) && ((int)data[2] == SYNAPTICS_CASCADED_HUB_ID)) ? 1 : 0) +#define PBN_FEC_OVERHEAD_MULTIPLIER_8B_10B 1031 +#define PBN_FEC_OVERHEAD_MULTIPLIER_128B_132B 1000 + enum mst_msg_ready_type { NONE_MSG_RDY_EVENT = 0, DOWN_REP_MSG_RDY_EVENT = 1, From 737222cebecbdbcdde2b69475c52bcb9ecfeb830 Mon Sep 17 00:00:00 2001 From: Melissa Wen Date: Tue, 31 Jan 2023 15:05:46 -0100 Subject: [PATCH 272/991] drm/amd/display: fix cursor offset on rotation 180 [why & how] Cursor gets clipped off in the middle of the screen with hw rotation 180. Fix a miscalculation of cursor offset when it's placed near the edges in the pipe split case. Cursor bugs with hw rotation were reported on AMD issue tracker: https://gitlab.freedesktop.org/drm/amd/-/issues/2247 The issues on rotation 270 was fixed by: https://lore.kernel.org/amd-gfx/20221118125935.4013669-22-Brian.Chang@amd.com/ that partially addressed the rotation 180 too. So, this patch is the final bits for rotation 180. Reported-by: Xaver Hugl Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/2247 Reviewed-by: Harry Wentland Fixes: 9d84c7ef8a87 ("drm/amd/display: Correct cursor position on horizontal mirror") Signed-off-by: Melissa Wen Signed-off-by: Hamza Mahfooz Signed-off-by: Tom Chung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 1fd2cf090096af8a25bf85564341cfc21cec659d) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c index ff03b1d98aa7af..1b9ac8812f5b09 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c @@ -3589,7 +3589,7 @@ void dcn10_set_cursor_position(struct pipe_ctx *pipe_ctx) (int)hubp->curs_attr.width || pos_cpy.x <= (int)hubp->curs_attr.width + pipe_ctx->plane_state->src_rect.x) { - pos_cpy.x = temp_x + viewport_width; + pos_cpy.x = 2 * viewport_width - temp_x; } } } else { From 56fb276d0244d430496f249335a44ae114dd5f54 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Thu, 1 Aug 2024 16:16:35 -0600 Subject: [PATCH 273/991] drm/amd/display: Adjust cursor position [why & how] When the commit 9d84c7ef8a87 ("drm/amd/display: Correct cursor position on horizontal mirror") was introduced, it used the wrong calculation for the position copy for X. This commit uses the correct calculation for that based on the original patch. Fixes: 9d84c7ef8a87 ("drm/amd/display: Correct cursor position on horizontal mirror") Cc: Mario Limonciello Cc: Alex Deucher Acked-by: Wayne Lin Signed-off-by: Rodrigo Siqueira Signed-off-by: Tom Chung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 8f9b23abbae5ffcd64856facd26a86b67195bc2f) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c index 1b9ac8812f5b09..14a902ff3b8ac9 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c @@ -3682,7 +3682,7 @@ void dcn10_set_cursor_position(struct pipe_ctx *pipe_ctx) (int)hubp->curs_attr.width || pos_cpy.x <= (int)hubp->curs_attr.width + pipe_ctx->plane_state->src_rect.x) { - pos_cpy.x = 2 * viewport_width - temp_x; + pos_cpy.x = temp_x + viewport_width; } } } else { From e414a304f2c5368a84f03ad34d29b89f965a33c9 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 12 Jul 2024 10:00:33 -0400 Subject: [PATCH 274/991] drm/amdgpu/jpeg2: properly set atomics vmid field This needs to be set as well if the IB uses atomics. Reviewed-by: Leo Liu Signed-off-by: Alex Deucher (cherry picked from commit 35c628774e50b3784c59e8ca7973f03bcb067132) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c index 99adf3625657e9..98aa3ccd0d2024 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c @@ -538,11 +538,11 @@ void jpeg_v2_0_dec_ring_emit_ib(struct amdgpu_ring *ring, amdgpu_ring_write(ring, PACKETJ(mmUVD_LMI_JRBC_IB_VMID_INTERNAL_OFFSET, 0, 0, PACKETJ_TYPE0)); - amdgpu_ring_write(ring, (vmid | (vmid << 4))); + amdgpu_ring_write(ring, (vmid | (vmid << 4) | (vmid << 8))); amdgpu_ring_write(ring, PACKETJ(mmUVD_LMI_JPEG_VMID_INTERNAL_OFFSET, 0, 0, PACKETJ_TYPE0)); - amdgpu_ring_write(ring, (vmid | (vmid << 4))); + amdgpu_ring_write(ring, (vmid | (vmid << 4) | (vmid << 8))); amdgpu_ring_write(ring, PACKETJ(mmUVD_LMI_JRBC_IB_64BIT_BAR_LOW_INTERNAL_OFFSET, 0, 0, PACKETJ_TYPE0)); From e6c6bd6253e792cee6c5c065e106e87b9f0d9ae9 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 12 Jul 2024 10:06:05 -0400 Subject: [PATCH 275/991] drm/amdgpu/jpeg4: properly set atomics vmid field This needs to be set as well if the IB uses atomics. Reviewed-by: Leo Liu Signed-off-by: Alex Deucher (cherry picked from commit c6c2e8b6a427d4fecc7c36cffccb908185afcab2) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c index ad524ddc9760a8..f4662920c653f6 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c @@ -782,11 +782,11 @@ void jpeg_v4_0_3_dec_ring_emit_ib(struct amdgpu_ring *ring, amdgpu_ring_write(ring, PACKETJ(regUVD_LMI_JRBC_IB_VMID_INTERNAL_OFFSET, 0, 0, PACKETJ_TYPE0)); - amdgpu_ring_write(ring, (vmid | (vmid << 4))); + amdgpu_ring_write(ring, (vmid | (vmid << 4) | (vmid << 8))); amdgpu_ring_write(ring, PACKETJ(regUVD_LMI_JPEG_VMID_INTERNAL_OFFSET, 0, 0, PACKETJ_TYPE0)); - amdgpu_ring_write(ring, (vmid | (vmid << 4))); + amdgpu_ring_write(ring, (vmid | (vmid << 4) | (vmid << 8))); amdgpu_ring_write(ring, PACKETJ(regUVD_LMI_JRBC_IB_64BIT_BAR_LOW_INTERNAL_OFFSET, 0, 0, PACKETJ_TYPE0)); From 0573a1e2ea7e35bff08944a40f1adf2bb35cea61 Mon Sep 17 00:00:00 2001 From: Bas Nieuwenhuizen Date: Tue, 6 Aug 2024 22:27:32 +0200 Subject: [PATCH 276/991] drm/amdgpu: Actually check flags for all context ops. Missing validation ... Checked libdrm and it clears all the structs, so we should be safe to just check everything. Signed-off-by: Bas Nieuwenhuizen Signed-off-by: Alex Deucher (cherry picked from commit c6b86421f1f9ddf9d706f2453159813ee39d0cf9) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index 5cb33ac99f7089..c43d1b6e5d66b2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -685,16 +685,24 @@ int amdgpu_ctx_ioctl(struct drm_device *dev, void *data, switch (args->in.op) { case AMDGPU_CTX_OP_ALLOC_CTX: + if (args->in.flags) + return -EINVAL; r = amdgpu_ctx_alloc(adev, fpriv, filp, priority, &id); args->out.alloc.ctx_id = id; break; case AMDGPU_CTX_OP_FREE_CTX: + if (args->in.flags) + return -EINVAL; r = amdgpu_ctx_free(fpriv, id); break; case AMDGPU_CTX_OP_QUERY_STATE: + if (args->in.flags) + return -EINVAL; r = amdgpu_ctx_query(adev, fpriv, id, &args->out); break; case AMDGPU_CTX_OP_QUERY_STATE2: + if (args->in.flags) + return -EINVAL; r = amdgpu_ctx_query2(adev, fpriv, id, &args->out); break; case AMDGPU_CTX_OP_GET_STABLE_PSTATE: From 278e1865b7a2124ea783b75ea8b3ee0bc2da5d85 Mon Sep 17 00:00:00 2001 From: Jack Xiao Date: Wed, 7 Aug 2024 11:43:45 +0800 Subject: [PATCH 277/991] drm/amdgpu/mes12: update mes_v12_api_def.h Update mes12 api definition. Signed-off-by: Jack Xiao Acked-by: Alex Deucher Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit 2ab5dc59177419d8a49e89585e82ff41524270fc) --- drivers/gpu/drm/amd/include/mes_v12_api_def.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/include/mes_v12_api_def.h b/drivers/gpu/drm/amd/include/mes_v12_api_def.h index 4cf2c9f30b3dc3..101e2fe962c6a6 100644 --- a/drivers/gpu/drm/amd/include/mes_v12_api_def.h +++ b/drivers/gpu/drm/amd/include/mes_v12_api_def.h @@ -97,6 +97,7 @@ enum MES_QUEUE_TYPE { MES_QUEUE_TYPE_SDMA, MES_QUEUE_TYPE_MAX, + MES_QUEUE_TYPE_SCHQ = MES_QUEUE_TYPE_MAX, }; struct MES_API_STATUS { @@ -242,8 +243,12 @@ union MESAPI_SET_HW_RESOURCES { uint32_t send_write_data : 1; uint32_t os_tdr_timeout_override : 1; uint32_t use_rs64mem_for_proc_gang_ctx : 1; + uint32_t halt_on_misaligned_access : 1; + uint32_t use_add_queue_unmap_flag_addr : 1; + uint32_t enable_mes_sch_stb_log : 1; + uint32_t limit_single_process : 1; uint32_t unmapped_doorbell_handling: 2; - uint32_t reserved : 15; + uint32_t reserved : 11; }; uint32_t uint32_all; }; From 2029b3d7e1358bcca30f74978543ba35b4bbc43d Mon Sep 17 00:00:00 2001 From: Jack Xiao Date: Wed, 7 Aug 2024 11:53:35 +0800 Subject: [PATCH 278/991] drm/amdgpu/mes: add multiple mes ring instances support Add multiple mes ring instances in mes structure to support multiple mes pipes. Signed-off-by: Jack Xiao Acked-by: Alex Deucher Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit c7d4355648ffa02a1551495b05c71ea6c884d29c) --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 4 +-- drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 5 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 4 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 4 +-- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 34 ++++++++++++------------ drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 34 ++++++++++++------------ 9 files changed, 47 insertions(+), 44 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 82452606ae6ca8..f165b9d49e2927 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -995,7 +995,7 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg, uint32_t xcc_ if (amdgpu_device_skip_hw_access(adev)) return 0; - if (adev->mes.ring.sched.ready) + if (adev->mes.ring[0].sched.ready) return amdgpu_mes_rreg(adev, reg); BUG_ON(!ring->funcs->emit_rreg); @@ -1065,7 +1065,7 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint3 if (amdgpu_device_skip_hw_access(adev)) return; - if (adev->mes.ring.sched.ready) { + if (adev->mes.ring[0].sched.ready) { amdgpu_mes_wreg(adev, reg, v); return; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c index c0265902565670..b49b3650fd6217 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c @@ -589,7 +589,8 @@ int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev) ring = adev->rings[i]; vmhub = ring->vm_hub; - if (ring == &adev->mes.ring || + if (ring == &adev->mes.ring[0] || + ring == &adev->mes.ring[1] || ring == &adev->umsch_mm.ring) continue; @@ -761,7 +762,7 @@ void amdgpu_gmc_fw_reg_write_reg_wait(struct amdgpu_device *adev, unsigned long flags; uint32_t seq; - if (adev->mes.ring.sched.ready) { + if (adev->mes.ring[0].sched.ready) { amdgpu_mes_reg_write_reg_wait(adev, reg0, reg1, ref, mask); return; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index dac88d2dd70d43..8ef53f48ce1506 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -135,9 +135,11 @@ int amdgpu_mes_init(struct amdgpu_device *adev) idr_init(&adev->mes.queue_id_idr); ida_init(&adev->mes.doorbell_ida); spin_lock_init(&adev->mes.queue_id_lock); - spin_lock_init(&adev->mes.ring_lock); mutex_init(&adev->mes.mutex_hidden); + for (i = 0; i < AMDGPU_MAX_MES_PIPES; i++) + spin_lock_init(&adev->mes.ring_lock[i]); + adev->mes.total_max_queue = AMDGPU_FENCE_MES_QUEUE_ID_MASK; adev->mes.vmid_mask_mmhub = 0xffffff00; adev->mes.vmid_mask_gfxhub = 0xffffff00; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index 2d659c612f033f..f89e3f61fe46b5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -82,8 +82,8 @@ struct amdgpu_mes { uint64_t default_process_quantum; uint64_t default_gang_quantum; - struct amdgpu_ring ring; - spinlock_t ring_lock; + struct amdgpu_ring ring[AMDGPU_MAX_MES_PIPES]; + spinlock_t ring_lock[AMDGPU_MAX_MES_PIPES]; const struct firmware *fw[AMDGPU_MAX_MES_PIPES]; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c index 111c380f929b59..b287a82e6177e9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c @@ -858,7 +858,7 @@ void amdgpu_virt_post_reset(struct amdgpu_device *adev) adev->gfx.is_poweron = false; } - adev->mes.ring.sched.ready = false; + adev->mes.ring[0].sched.ready = false; } bool amdgpu_virt_fw_load_skip_check(struct amdgpu_device *adev, uint32_t ucode_id) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c index b88a6fa173b362..2797fd84432b22 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c @@ -231,7 +231,7 @@ static void gmc_v11_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, /* This is necessary for SRIOV as well as for GFXOFF to function * properly under bare metal */ - if ((adev->gfx.kiq[0].ring.sched.ready || adev->mes.ring.sched.ready) && + if ((adev->gfx.kiq[0].ring.sched.ready || adev->mes.ring[0].sched.ready) && (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev))) { amdgpu_gmc_fw_reg_write_reg_wait(adev, req, ack, inv_req, 1 << vmid, GET_INST(GC, 0)); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c index 26efce9aa4109f..edcb5351f8cca7 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c @@ -299,7 +299,7 @@ static void gmc_v12_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, /* This is necessary for SRIOV as well as for GFXOFF to function * properly under bare metal */ - if ((adev->gfx.kiq[0].ring.sched.ready || adev->mes.ring.sched.ready) && + if ((adev->gfx.kiq[0].ring.sched.ready || adev->mes.ring[0].sched.ready) && (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev))) { struct amdgpu_vmhub *hub = &adev->vmhub[vmhub]; const unsigned eng = 17; diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 1a5ad5be33bfc9..44bdfa0b263aa0 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -162,7 +162,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, union MESAPI__QUERY_MES_STATUS mes_status_pkt; signed long timeout = 3000000; /* 3000 ms */ struct amdgpu_device *adev = mes->adev; - struct amdgpu_ring *ring = &mes->ring; + struct amdgpu_ring *ring = &mes->ring[0]; struct MES_API_STATUS *api_status; union MESAPI__MISC *x_pkt = pkt; const char *op_str, *misc_op_str; @@ -191,7 +191,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, status_ptr = (u64 *)&adev->wb.wb[status_offset]; *status_ptr = 0; - spin_lock_irqsave(&mes->ring_lock, flags); + spin_lock_irqsave(&mes->ring_lock[0], flags); r = amdgpu_ring_alloc(ring, (size + sizeof(mes_status_pkt)) / 4); if (r) goto error_unlock_free; @@ -221,7 +221,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, sizeof(mes_status_pkt) / 4); amdgpu_ring_commit(ring); - spin_unlock_irqrestore(&mes->ring_lock, flags); + spin_unlock_irqrestore(&mes->ring_lock[0], flags); op_str = mes_v11_0_get_op_string(x_pkt); misc_op_str = mes_v11_0_get_misc_op_string(x_pkt); @@ -263,7 +263,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, amdgpu_ring_undo(ring); error_unlock_free: - spin_unlock_irqrestore(&mes->ring_lock, flags); + spin_unlock_irqrestore(&mes->ring_lock[0], flags); error_wb_free: amdgpu_device_wb_free(adev, status_offset); @@ -1025,7 +1025,7 @@ static int mes_v11_0_kiq_enable_queue(struct amdgpu_device *adev) return r; } - kiq->pmf->kiq_map_queues(kiq_ring, &adev->mes.ring); + kiq->pmf->kiq_map_queues(kiq_ring, &adev->mes.ring[0]); return amdgpu_ring_test_helper(kiq_ring); } @@ -1039,7 +1039,7 @@ static int mes_v11_0_queue_init(struct amdgpu_device *adev, if (pipe == AMDGPU_MES_KIQ_PIPE) ring = &adev->gfx.kiq[0].ring; else if (pipe == AMDGPU_MES_SCHED_PIPE) - ring = &adev->mes.ring; + ring = &adev->mes.ring[0]; else BUG(); @@ -1081,7 +1081,7 @@ static int mes_v11_0_ring_init(struct amdgpu_device *adev) { struct amdgpu_ring *ring; - ring = &adev->mes.ring; + ring = &adev->mes.ring[0]; ring->funcs = &mes_v11_0_ring_funcs; @@ -1134,7 +1134,7 @@ static int mes_v11_0_mqd_sw_init(struct amdgpu_device *adev, if (pipe == AMDGPU_MES_KIQ_PIPE) ring = &adev->gfx.kiq[0].ring; else if (pipe == AMDGPU_MES_SCHED_PIPE) - ring = &adev->mes.ring; + ring = &adev->mes.ring[0]; else BUG(); @@ -1226,12 +1226,12 @@ static int mes_v11_0_sw_fini(void *handle) &adev->gfx.kiq[0].ring.mqd_gpu_addr, &adev->gfx.kiq[0].ring.mqd_ptr); - amdgpu_bo_free_kernel(&adev->mes.ring.mqd_obj, - &adev->mes.ring.mqd_gpu_addr, - &adev->mes.ring.mqd_ptr); + amdgpu_bo_free_kernel(&adev->mes.ring[0].mqd_obj, + &adev->mes.ring[0].mqd_gpu_addr, + &adev->mes.ring[0].mqd_ptr); amdgpu_ring_fini(&adev->gfx.kiq[0].ring); - amdgpu_ring_fini(&adev->mes.ring); + amdgpu_ring_fini(&adev->mes.ring[0]); if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) { mes_v11_0_free_ucode_buffers(adev, AMDGPU_MES_KIQ_PIPE); @@ -1342,9 +1342,9 @@ static int mes_v11_0_kiq_hw_init(struct amdgpu_device *adev) static int mes_v11_0_kiq_hw_fini(struct amdgpu_device *adev) { - if (adev->mes.ring.sched.ready) { - mes_v11_0_kiq_dequeue(&adev->mes.ring); - adev->mes.ring.sched.ready = false; + if (adev->mes.ring[0].sched.ready) { + mes_v11_0_kiq_dequeue(&adev->mes.ring[0]); + adev->mes.ring[0].sched.ready = false; } if (amdgpu_sriov_vf(adev)) { @@ -1362,7 +1362,7 @@ static int mes_v11_0_hw_init(void *handle) int r; struct amdgpu_device *adev = (struct amdgpu_device *)handle; - if (adev->mes.ring.sched.ready) + if (adev->mes.ring[0].sched.ready) goto out; if (!adev->enable_mes_kiq) { @@ -1407,7 +1407,7 @@ static int mes_v11_0_hw_init(void *handle) * with MES enabled. */ adev->gfx.kiq[0].ring.sched.ready = false; - adev->mes.ring.sched.ready = true; + adev->mes.ring[0].sched.ready = true; return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index 249e5a66205c27..515de65d8aa0d7 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -148,7 +148,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, union MESAPI__QUERY_MES_STATUS mes_status_pkt; signed long timeout = 3000000; /* 3000 ms */ struct amdgpu_device *adev = mes->adev; - struct amdgpu_ring *ring = &mes->ring; + struct amdgpu_ring *ring = &mes->ring[0]; struct MES_API_STATUS *api_status; union MESAPI__MISC *x_pkt = pkt; const char *op_str, *misc_op_str; @@ -177,7 +177,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, status_ptr = (u64 *)&adev->wb.wb[status_offset]; *status_ptr = 0; - spin_lock_irqsave(&mes->ring_lock, flags); + spin_lock_irqsave(&mes->ring_lock[0], flags); r = amdgpu_ring_alloc(ring, (size + sizeof(mes_status_pkt)) / 4); if (r) goto error_unlock_free; @@ -207,7 +207,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, sizeof(mes_status_pkt) / 4); amdgpu_ring_commit(ring); - spin_unlock_irqrestore(&mes->ring_lock, flags); + spin_unlock_irqrestore(&mes->ring_lock[0], flags); op_str = mes_v12_0_get_op_string(x_pkt); misc_op_str = mes_v12_0_get_misc_op_string(x_pkt); @@ -249,7 +249,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, amdgpu_ring_undo(ring); error_unlock_free: - spin_unlock_irqrestore(&mes->ring_lock, flags); + spin_unlock_irqrestore(&mes->ring_lock[0], flags); error_wb_free: amdgpu_device_wb_free(adev, status_offset); @@ -1095,7 +1095,7 @@ static int mes_v12_0_kiq_enable_queue(struct amdgpu_device *adev) return r; } - kiq->pmf->kiq_map_queues(kiq_ring, &adev->mes.ring); + kiq->pmf->kiq_map_queues(kiq_ring, &adev->mes.ring[0]); r = amdgpu_ring_test_ring(kiq_ring); if (r) { @@ -1114,7 +1114,7 @@ static int mes_v12_0_queue_init(struct amdgpu_device *adev, if (pipe == AMDGPU_MES_KIQ_PIPE) ring = &adev->gfx.kiq[0].ring; else if (pipe == AMDGPU_MES_SCHED_PIPE) - ring = &adev->mes.ring; + ring = &adev->mes.ring[0]; else BUG(); @@ -1160,7 +1160,7 @@ static int mes_v12_0_ring_init(struct amdgpu_device *adev) { struct amdgpu_ring *ring; - ring = &adev->mes.ring; + ring = &adev->mes.ring[0]; ring->funcs = &mes_v12_0_ring_funcs; @@ -1213,7 +1213,7 @@ static int mes_v12_0_mqd_sw_init(struct amdgpu_device *adev, if (pipe == AMDGPU_MES_KIQ_PIPE) ring = &adev->gfx.kiq[0].ring; else if (pipe == AMDGPU_MES_SCHED_PIPE) - ring = &adev->mes.ring; + ring = &adev->mes.ring[0]; else BUG(); @@ -1302,12 +1302,12 @@ static int mes_v12_0_sw_fini(void *handle) &adev->gfx.kiq[0].ring.mqd_gpu_addr, &adev->gfx.kiq[0].ring.mqd_ptr); - amdgpu_bo_free_kernel(&adev->mes.ring.mqd_obj, - &adev->mes.ring.mqd_gpu_addr, - &adev->mes.ring.mqd_ptr); + amdgpu_bo_free_kernel(&adev->mes.ring[0].mqd_obj, + &adev->mes.ring[0].mqd_gpu_addr, + &adev->mes.ring[0].mqd_ptr); amdgpu_ring_fini(&adev->gfx.kiq[0].ring); - amdgpu_ring_fini(&adev->mes.ring); + amdgpu_ring_fini(&adev->mes.ring[0]); if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) { mes_v12_0_free_ucode_buffers(adev, AMDGPU_MES_KIQ_PIPE); @@ -1351,7 +1351,7 @@ static void mes_v12_0_kiq_dequeue_sched(struct amdgpu_device *adev) soc21_grbm_select(adev, 0, 0, 0, 0); mutex_unlock(&adev->srbm_mutex); - adev->mes.ring.sched.ready = false; + adev->mes.ring[0].sched.ready = false; } static void mes_v12_0_kiq_setting(struct amdgpu_ring *ring) @@ -1415,9 +1415,9 @@ static int mes_v12_0_kiq_hw_init(struct amdgpu_device *adev) static int mes_v12_0_kiq_hw_fini(struct amdgpu_device *adev) { - if (adev->mes.ring.sched.ready) { + if (adev->mes.ring[0].sched.ready) { mes_v12_0_kiq_dequeue_sched(adev); - adev->mes.ring.sched.ready = false; + adev->mes.ring[0].sched.ready = false; } mes_v12_0_enable(adev, false); @@ -1430,7 +1430,7 @@ static int mes_v12_0_hw_init(void *handle) int r; struct amdgpu_device *adev = (struct amdgpu_device *)handle; - if (adev->mes.ring.sched.ready) + if (adev->mes.ring[0].sched.ready) goto out; if (!adev->enable_mes_kiq || adev->enable_uni_mes) { @@ -1482,7 +1482,7 @@ static int mes_v12_0_hw_init(void *handle) * with MES enabled. */ adev->gfx.kiq[0].ring.sched.ready = false; - adev->mes.ring.sched.ready = true; + adev->mes.ring[0].sched.ready = true; return 0; From a13d91bf3c1910212e45a69d04ad40d99878f8da Mon Sep 17 00:00:00 2001 From: Jack Xiao Date: Wed, 7 Aug 2024 13:19:59 +0800 Subject: [PATCH 279/991] drm/amdgpu/mes12: load unified mes fw on pipe0 and pipe1 Enable unified mes firmware to load on pipe0 and pipe1. Signed-off-by: Jack Xiao Acked-by: Alex Deucher Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit e69c2dd7534f3fcabf7bb801db2a7ac71e7e5da6) --- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 2 +- drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 27 +++---------------------- 2 files changed, 4 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index 8ef53f48ce1506..81bed8e8478d15 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -1501,7 +1501,7 @@ int amdgpu_mes_init_microcode(struct amdgpu_device *adev, int pipe) amdgpu_ucode_ip_version_decode(adev, GC_HWIP, ucode_prefix, sizeof(ucode_prefix)); - if (adev->enable_uni_mes && pipe == AMDGPU_MES_SCHED_PIPE) { + if (adev->enable_uni_mes) { snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_uni_mes.bin", ucode_prefix); } else if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0) && diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index 515de65d8aa0d7..28bb72d2cffdd4 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -744,16 +744,11 @@ static void mes_v12_0_enable(struct amdgpu_device *adev, bool enable) if (enable) { data = RREG32_SOC15(GC, 0, regCP_MES_CNTL); data = REG_SET_FIELD(data, CP_MES_CNTL, MES_PIPE0_RESET, 1); - data = REG_SET_FIELD(data, CP_MES_CNTL, MES_PIPE1_RESET, - (!adev->enable_uni_mes && adev->enable_mes_kiq) ? 1 : 0); + data = REG_SET_FIELD(data, CP_MES_CNTL, MES_PIPE1_RESET, 1); WREG32_SOC15(GC, 0, regCP_MES_CNTL, data); mutex_lock(&adev->srbm_mutex); for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) { - if ((!adev->enable_mes_kiq || adev->enable_uni_mes) && - pipe == AMDGPU_MES_KIQ_PIPE) - continue; - soc21_grbm_select(adev, 3, pipe, 0, 0); ucode_addr = adev->mes.uc_start_addr[pipe] >> 2; @@ -767,8 +762,7 @@ static void mes_v12_0_enable(struct amdgpu_device *adev, bool enable) /* unhalt MES and activate pipe0 */ data = REG_SET_FIELD(0, CP_MES_CNTL, MES_PIPE0_ACTIVE, 1); - data = REG_SET_FIELD(data, CP_MES_CNTL, MES_PIPE1_ACTIVE, - (!adev->enable_uni_mes && adev->enable_mes_kiq) ? 1 : 0); + data = REG_SET_FIELD(data, CP_MES_CNTL, MES_PIPE1_ACTIVE, 1); WREG32_SOC15(GC, 0, regCP_MES_CNTL, data); if (amdgpu_emu_mode) @@ -784,8 +778,7 @@ static void mes_v12_0_enable(struct amdgpu_device *adev, bool enable) data = REG_SET_FIELD(data, CP_MES_CNTL, MES_INVALIDATE_ICACHE, 1); data = REG_SET_FIELD(data, CP_MES_CNTL, MES_PIPE0_RESET, 1); - data = REG_SET_FIELD(data, CP_MES_CNTL, MES_PIPE1_RESET, - (!adev->enable_uni_mes && adev->enable_mes_kiq) ? 1 : 0); + data = REG_SET_FIELD(data, CP_MES_CNTL, MES_PIPE1_RESET, 1); data = REG_SET_FIELD(data, CP_MES_CNTL, MES_HALT, 1); WREG32_SOC15(GC, 0, regCP_MES_CNTL, data); } @@ -800,10 +793,6 @@ static void mes_v12_0_set_ucode_start_addr(struct amdgpu_device *adev) mutex_lock(&adev->srbm_mutex); for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) { - if ((!adev->enable_mes_kiq || adev->enable_uni_mes) && - pipe == AMDGPU_MES_KIQ_PIPE) - continue; - /* me=3, queue=0 */ soc21_grbm_select(adev, 3, pipe, 0, 0); @@ -1525,17 +1514,7 @@ static int mes_v12_0_early_init(void *handle) struct amdgpu_device *adev = (struct amdgpu_device *)handle; int pipe, r; - if (adev->enable_uni_mes) { - r = amdgpu_mes_init_microcode(adev, AMDGPU_MES_SCHED_PIPE); - if (!r) - return 0; - - adev->enable_uni_mes = false; - } - for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) { - if (!adev->enable_mes_kiq && pipe == AMDGPU_MES_KIQ_PIPE) - continue; r = amdgpu_mes_init_microcode(adev, pipe); if (r) return r; From 3738a7f0ddb920bde538d3f78a02edbc6ad1307e Mon Sep 17 00:00:00 2001 From: Jack Xiao Date: Wed, 7 Aug 2024 14:15:48 +0800 Subject: [PATCH 280/991] drm/amdgpu/mes12: add mes pipe switch support Add mes pipe switch to let caller choose pipe to submit packet. Signed-off-by: Jack Xiao Acked-by: Alex Deucher Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit b2dee0837a4be63e8d3e00550a9f057644f962c4) --- drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 59 ++++++++++++++------------ 1 file changed, 33 insertions(+), 26 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index 28bb72d2cffdd4..1213f35e29000b 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -142,13 +142,14 @@ static const char *mes_v12_0_get_misc_op_string(union MESAPI__MISC *x_pkt) } static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, - void *pkt, int size, - int api_status_off) + int pipe, void *pkt, int size, + int api_status_off) { union MESAPI__QUERY_MES_STATUS mes_status_pkt; signed long timeout = 3000000; /* 3000 ms */ struct amdgpu_device *adev = mes->adev; - struct amdgpu_ring *ring = &mes->ring[0]; + struct amdgpu_ring *ring = &mes->ring[pipe]; + spinlock_t *ring_lock = &mes->ring_lock[pipe]; struct MES_API_STATUS *api_status; union MESAPI__MISC *x_pkt = pkt; const char *op_str, *misc_op_str; @@ -177,7 +178,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, status_ptr = (u64 *)&adev->wb.wb[status_offset]; *status_ptr = 0; - spin_lock_irqsave(&mes->ring_lock[0], flags); + spin_lock_irqsave(ring_lock, flags); r = amdgpu_ring_alloc(ring, (size + sizeof(mes_status_pkt)) / 4); if (r) goto error_unlock_free; @@ -207,32 +208,33 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, sizeof(mes_status_pkt) / 4); amdgpu_ring_commit(ring); - spin_unlock_irqrestore(&mes->ring_lock[0], flags); + spin_unlock_irqrestore(ring_lock, flags); op_str = mes_v12_0_get_op_string(x_pkt); misc_op_str = mes_v12_0_get_misc_op_string(x_pkt); if (misc_op_str) - dev_dbg(adev->dev, "MES msg=%s (%s) was emitted\n", op_str, - misc_op_str); + dev_dbg(adev->dev, "MES(%d) msg=%s (%s) was emitted\n", + pipe, op_str, misc_op_str); else if (op_str) - dev_dbg(adev->dev, "MES msg=%s was emitted\n", op_str); + dev_dbg(adev->dev, "MES(%d) msg=%s was emitted\n", + pipe, op_str); else - dev_dbg(adev->dev, "MES msg=%d was emitted\n", - x_pkt->header.opcode); + dev_dbg(adev->dev, "MES(%d) msg=%d was emitted\n", + pipe, x_pkt->header.opcode); r = amdgpu_fence_wait_polling(ring, seq, timeout); if (r < 1 || !*status_ptr) { if (misc_op_str) - dev_err(adev->dev, "MES failed to respond to msg=%s (%s)\n", - op_str, misc_op_str); + dev_err(adev->dev, "MES(%d) failed to respond to msg=%s (%s)\n", + pipe, op_str, misc_op_str); else if (op_str) - dev_err(adev->dev, "MES failed to respond to msg=%s\n", - op_str); + dev_err(adev->dev, "MES(%d) failed to respond to msg=%s\n", + pipe, op_str); else - dev_err(adev->dev, "MES failed to respond to msg=%d\n", - x_pkt->header.opcode); + dev_err(adev->dev, "MES(%d) failed to respond to msg=%d\n", + pipe, x_pkt->header.opcode); while (halt_if_hws_hang) schedule(); @@ -249,7 +251,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, amdgpu_ring_undo(ring); error_unlock_free: - spin_unlock_irqrestore(&mes->ring_lock[0], flags); + spin_unlock_irqrestore(ring_lock, flags); error_wb_free: amdgpu_device_wb_free(adev, status_offset); @@ -321,6 +323,7 @@ static int mes_v12_0_add_hw_queue(struct amdgpu_mes *mes, mes_add_queue_pkt.gds_size = input->queue_size; return mes_v12_0_submit_pkt_and_poll_completion(mes, + AMDGPU_MES_SCHED_PIPE, &mes_add_queue_pkt, sizeof(mes_add_queue_pkt), offsetof(union MESAPI__ADD_QUEUE, api_status)); } @@ -340,6 +343,7 @@ static int mes_v12_0_remove_hw_queue(struct amdgpu_mes *mes, mes_remove_queue_pkt.gang_context_addr = input->gang_context_addr; return mes_v12_0_submit_pkt_and_poll_completion(mes, + AMDGPU_MES_SCHED_PIPE, &mes_remove_queue_pkt, sizeof(mes_remove_queue_pkt), offsetof(union MESAPI__REMOVE_QUEUE, api_status)); } @@ -365,6 +369,7 @@ static int mes_v12_0_map_legacy_queue(struct amdgpu_mes *mes, mes_add_queue_pkt.map_legacy_kq = 1; return mes_v12_0_submit_pkt_and_poll_completion(mes, + AMDGPU_MES_SCHED_PIPE, &mes_add_queue_pkt, sizeof(mes_add_queue_pkt), offsetof(union MESAPI__ADD_QUEUE, api_status)); } @@ -398,6 +403,7 @@ static int mes_v12_0_unmap_legacy_queue(struct amdgpu_mes *mes, } return mes_v12_0_submit_pkt_and_poll_completion(mes, + AMDGPU_MES_SCHED_PIPE, &mes_remove_queue_pkt, sizeof(mes_remove_queue_pkt), offsetof(union MESAPI__REMOVE_QUEUE, api_status)); } @@ -414,7 +420,7 @@ static int mes_v12_0_resume_gang(struct amdgpu_mes *mes, return 0; } -static int mes_v12_0_query_sched_status(struct amdgpu_mes *mes) +static int mes_v12_0_query_sched_status(struct amdgpu_mes *mes, int pipe) { union MESAPI__QUERY_MES_STATUS mes_status_pkt; @@ -424,7 +430,7 @@ static int mes_v12_0_query_sched_status(struct amdgpu_mes *mes) mes_status_pkt.header.opcode = MES_SCH_API_QUERY_SCHEDULER_STATUS; mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS; - return mes_v12_0_submit_pkt_and_poll_completion(mes, + return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe, &mes_status_pkt, sizeof(mes_status_pkt), offsetof(union MESAPI__QUERY_MES_STATUS, api_status)); } @@ -486,11 +492,12 @@ static int mes_v12_0_misc_op(struct amdgpu_mes *mes, } return mes_v12_0_submit_pkt_and_poll_completion(mes, + AMDGPU_MES_SCHED_PIPE, &misc_pkt, sizeof(misc_pkt), offsetof(union MESAPI__MISC, api_status)); } -static int mes_v12_0_set_hw_resources_1(struct amdgpu_mes *mes) +static int mes_v12_0_set_hw_resources_1(struct amdgpu_mes *mes, int pipe) { union MESAPI_SET_HW_RESOURCES_1 mes_set_hw_res_1_pkt; @@ -501,12 +508,12 @@ static int mes_v12_0_set_hw_resources_1(struct amdgpu_mes *mes) mes_set_hw_res_1_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS; mes_set_hw_res_1_pkt.mes_kiq_unmap_timeout = 100; - return mes_v12_0_submit_pkt_and_poll_completion(mes, + return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe, &mes_set_hw_res_1_pkt, sizeof(mes_set_hw_res_1_pkt), offsetof(union MESAPI_SET_HW_RESOURCES_1, api_status)); } -static int mes_v12_0_set_hw_resources(struct amdgpu_mes *mes) +static int mes_v12_0_set_hw_resources(struct amdgpu_mes *mes, int pipe) { int i; struct amdgpu_device *adev = mes->adev; @@ -566,7 +573,7 @@ static int mes_v12_0_set_hw_resources(struct amdgpu_mes *mes) mes_set_hw_res_pkt.event_intr_history_gpu_mc_ptr = mes->event_log_gpu_addr; } - return mes_v12_0_submit_pkt_and_poll_completion(mes, + return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe, &mes_set_hw_res_pkt, sizeof(mes_set_hw_res_pkt), offsetof(union MESAPI_SET_HW_RESOURCES, api_status)); } @@ -1446,19 +1453,19 @@ static int mes_v12_0_hw_init(void *handle) if (r) goto failure; - r = mes_v12_0_set_hw_resources(&adev->mes); + r = mes_v12_0_set_hw_resources(&adev->mes, AMDGPU_MES_SCHED_PIPE); if (r) goto failure; if (adev->enable_uni_mes) - mes_v12_0_set_hw_resources_1(&adev->mes); + mes_v12_0_set_hw_resources_1(&adev->mes, AMDGPU_MES_SCHED_PIPE); mes_v12_0_init_aggregated_doorbell(&adev->mes); /* Enable the MES to handle doorbell ring on unmapped queue */ mes_v12_0_enable_unmapped_doorbell_handling(&adev->mes, true); - r = mes_v12_0_query_sched_status(&adev->mes); + r = mes_v12_0_query_sched_status(&adev->mes, AMDGPU_MES_SCHED_PIPE); if (r) { DRM_ERROR("MES is busy\n"); goto failure; From 1097727d6d0c13eca25321fff46714fc5047d6e8 Mon Sep 17 00:00:00 2001 From: Jack Xiao Date: Wed, 7 Aug 2024 14:44:07 +0800 Subject: [PATCH 281/991] drm/amdgpu/mes12: adjust mes12 sw/hw init for multiple pipes Adjust mes12 sw/hw initiailization for both pipe0 and pipe1 enablement. The two pipes are almost identical pipe. Pipe0 behaves like schq and pipe1 like kiq, pipe0 was mapped by pipe1. Signed-off-by: Jack Xiao Acked-by: Alex Deucher Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit aa539da8aff07ab08def6490e8c9b441439e70ba) --- drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 106 +++++++++++++++---------- 1 file changed, 62 insertions(+), 44 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index 1213f35e29000b..d18ec58551937e 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -266,6 +266,8 @@ static int convert_to_mes_queue_type(int queue_type) return MES_QUEUE_TYPE_COMPUTE; else if (queue_type == AMDGPU_RING_TYPE_SDMA) return MES_QUEUE_TYPE_SDMA; + else if (queue_type == AMDGPU_RING_TYPE_MES) + return MES_QUEUE_TYPE_SCHQ; else BUG(); return -1; @@ -352,6 +354,7 @@ static int mes_v12_0_map_legacy_queue(struct amdgpu_mes *mes, struct mes_map_legacy_queue_input *input) { union MESAPI__ADD_QUEUE mes_add_queue_pkt; + int pipe; memset(&mes_add_queue_pkt, 0, sizeof(mes_add_queue_pkt)); @@ -368,8 +371,12 @@ static int mes_v12_0_map_legacy_queue(struct amdgpu_mes *mes, convert_to_mes_queue_type(input->queue_type); mes_add_queue_pkt.map_legacy_kq = 1; - return mes_v12_0_submit_pkt_and_poll_completion(mes, - AMDGPU_MES_SCHED_PIPE, + if (mes->adev->enable_uni_mes) + pipe = AMDGPU_MES_KIQ_PIPE; + else + pipe = AMDGPU_MES_SCHED_PIPE; + + return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe, &mes_add_queue_pkt, sizeof(mes_add_queue_pkt), offsetof(union MESAPI__ADD_QUEUE, api_status)); } @@ -378,6 +385,7 @@ static int mes_v12_0_unmap_legacy_queue(struct amdgpu_mes *mes, struct mes_unmap_legacy_queue_input *input) { union MESAPI__REMOVE_QUEUE mes_remove_queue_pkt; + int pipe; memset(&mes_remove_queue_pkt, 0, sizeof(mes_remove_queue_pkt)); @@ -402,8 +410,12 @@ static int mes_v12_0_unmap_legacy_queue(struct amdgpu_mes *mes, convert_to_mes_queue_type(input->queue_type); } - return mes_v12_0_submit_pkt_and_poll_completion(mes, - AMDGPU_MES_SCHED_PIPE, + if (mes->adev->enable_uni_mes) + pipe = AMDGPU_MES_KIQ_PIPE; + else + pipe = AMDGPU_MES_SCHED_PIPE; + + return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe, &mes_remove_queue_pkt, sizeof(mes_remove_queue_pkt), offsetof(union MESAPI__REMOVE_QUEUE, api_status)); } @@ -439,6 +451,7 @@ static int mes_v12_0_misc_op(struct amdgpu_mes *mes, struct mes_misc_op_input *input) { union MESAPI__MISC misc_pkt; + int pipe; memset(&misc_pkt, 0, sizeof(misc_pkt)); @@ -491,8 +504,12 @@ static int mes_v12_0_misc_op(struct amdgpu_mes *mes, return -EINVAL; } - return mes_v12_0_submit_pkt_and_poll_completion(mes, - AMDGPU_MES_SCHED_PIPE, + if (mes->adev->enable_uni_mes) + pipe = AMDGPU_MES_KIQ_PIPE; + else + pipe = AMDGPU_MES_SCHED_PIPE; + + return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe, &misc_pkt, sizeof(misc_pkt), offsetof(union MESAPI__MISC, api_status)); } @@ -1107,14 +1124,12 @@ static int mes_v12_0_queue_init(struct amdgpu_device *adev, struct amdgpu_ring *ring; int r; - if (pipe == AMDGPU_MES_KIQ_PIPE) + if (!adev->enable_uni_mes && pipe == AMDGPU_MES_KIQ_PIPE) ring = &adev->gfx.kiq[0].ring; - else if (pipe == AMDGPU_MES_SCHED_PIPE) - ring = &adev->mes.ring[0]; else - BUG(); + ring = &adev->mes.ring[pipe]; - if ((pipe == AMDGPU_MES_SCHED_PIPE) && + if ((adev->enable_uni_mes || pipe == AMDGPU_MES_SCHED_PIPE) && (amdgpu_in_reset(adev) || adev->in_suspend)) { *(ring->wptr_cpu_addr) = 0; *(ring->rptr_cpu_addr) = 0; @@ -1126,13 +1141,12 @@ static int mes_v12_0_queue_init(struct amdgpu_device *adev, return r; if (pipe == AMDGPU_MES_SCHED_PIPE) { - if (adev->enable_uni_mes) { - mes_v12_0_queue_init_register(ring); - } else { + if (adev->enable_uni_mes) + r = amdgpu_mes_map_legacy_queue(adev, ring); + else r = mes_v12_0_kiq_enable_queue(adev); - if (r) - return r; - } + if (r) + return r; } else { mes_v12_0_queue_init_register(ring); } @@ -1152,25 +1166,29 @@ static int mes_v12_0_queue_init(struct amdgpu_device *adev, return 0; } -static int mes_v12_0_ring_init(struct amdgpu_device *adev) +static int mes_v12_0_ring_init(struct amdgpu_device *adev, int pipe) { struct amdgpu_ring *ring; - ring = &adev->mes.ring[0]; + ring = &adev->mes.ring[pipe]; ring->funcs = &mes_v12_0_ring_funcs; ring->me = 3; - ring->pipe = 0; + ring->pipe = pipe; ring->queue = 0; ring->ring_obj = NULL; ring->use_doorbell = true; - ring->doorbell_index = adev->doorbell_index.mes_ring0 << 1; - ring->eop_gpu_addr = adev->mes.eop_gpu_addr[AMDGPU_MES_SCHED_PIPE]; + ring->eop_gpu_addr = adev->mes.eop_gpu_addr[pipe]; ring->no_scheduler = true; sprintf(ring->name, "mes_%d.%d.%d", ring->me, ring->pipe, ring->queue); + if (pipe == AMDGPU_MES_SCHED_PIPE) + ring->doorbell_index = adev->doorbell_index.mes_ring0 << 1; + else + ring->doorbell_index = adev->doorbell_index.mes_ring1 << 1; + return amdgpu_ring_init(adev, ring, 1024, NULL, 0, AMDGPU_RING_PRIO_DEFAULT, NULL); } @@ -1184,7 +1202,7 @@ static int mes_v12_0_kiq_ring_init(struct amdgpu_device *adev) ring = &adev->gfx.kiq[0].ring; ring->me = 3; - ring->pipe = adev->enable_uni_mes ? 0 : 1; + ring->pipe = 1; ring->queue = 0; ring->adev = NULL; @@ -1206,12 +1224,10 @@ static int mes_v12_0_mqd_sw_init(struct amdgpu_device *adev, int r, mqd_size = sizeof(struct v12_compute_mqd); struct amdgpu_ring *ring; - if (pipe == AMDGPU_MES_KIQ_PIPE) + if (!adev->enable_uni_mes && pipe == AMDGPU_MES_KIQ_PIPE) ring = &adev->gfx.kiq[0].ring; - else if (pipe == AMDGPU_MES_SCHED_PIPE) - ring = &adev->mes.ring[0]; else - BUG(); + ring = &adev->mes.ring[pipe]; if (ring->mqd_obj) return 0; @@ -1252,9 +1268,6 @@ static int mes_v12_0_sw_init(void *handle) return r; for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) { - if (!adev->enable_mes_kiq && pipe == AMDGPU_MES_KIQ_PIPE) - continue; - r = mes_v12_0_allocate_eop_buf(adev, pipe); if (r) return r; @@ -1262,18 +1275,15 @@ static int mes_v12_0_sw_init(void *handle) r = mes_v12_0_mqd_sw_init(adev, pipe); if (r) return r; - } - if (adev->enable_mes_kiq) { - r = mes_v12_0_kiq_ring_init(adev); + if (!adev->enable_uni_mes && pipe == AMDGPU_MES_KIQ_PIPE) + r = mes_v12_0_kiq_ring_init(adev); + else + r = mes_v12_0_ring_init(adev, pipe); if (r) return r; } - r = mes_v12_0_ring_init(adev); - if (r) - return r; - return 0; } @@ -1368,10 +1378,10 @@ static int mes_v12_0_kiq_hw_init(struct amdgpu_device *adev) { int r = 0; - mes_v12_0_kiq_setting(&adev->gfx.kiq[0].ring); - if (adev->enable_uni_mes) - return mes_v12_0_hw_init(adev); + mes_v12_0_kiq_setting(&adev->mes.ring[AMDGPU_MES_KIQ_PIPE]); + else + mes_v12_0_kiq_setting(&adev->gfx.kiq[0].ring); if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) { @@ -1398,6 +1408,14 @@ static int mes_v12_0_kiq_hw_init(struct amdgpu_device *adev) if (r) goto failure; + if (adev->enable_uni_mes) { + r = mes_v12_0_set_hw_resources(&adev->mes, AMDGPU_MES_KIQ_PIPE); + if (r) + goto failure; + + mes_v12_0_set_hw_resources_1(&adev->mes, AMDGPU_MES_KIQ_PIPE); + } + r = mes_v12_0_hw_init(adev); if (r) goto failure; @@ -1429,7 +1447,7 @@ static int mes_v12_0_hw_init(void *handle) if (adev->mes.ring[0].sched.ready) goto out; - if (!adev->enable_mes_kiq || adev->enable_uni_mes) { + if (!adev->enable_mes_kiq) { if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) { r = mes_v12_0_load_microcode(adev, AMDGPU_MES_SCHED_PIPE, true); @@ -1449,6 +1467,9 @@ static int mes_v12_0_hw_init(void *handle) mes_v12_0_enable(adev, true); } + /* Enable the MES to handle doorbell ring on unmapped queue */ + mes_v12_0_enable_unmapped_doorbell_handling(&adev->mes, true); + r = mes_v12_0_queue_init(adev, AMDGPU_MES_SCHED_PIPE); if (r) goto failure; @@ -1462,9 +1483,6 @@ static int mes_v12_0_hw_init(void *handle) mes_v12_0_init_aggregated_doorbell(&adev->mes); - /* Enable the MES to handle doorbell ring on unmapped queue */ - mes_v12_0_enable_unmapped_doorbell_handling(&adev->mes, true); - r = mes_v12_0_query_sched_status(&adev->mes, AMDGPU_MES_SCHED_PIPE); if (r) { DRM_ERROR("MES is busy\n"); From 7254027e1e6edbff54f5930a5f13f14ac6f1694c Mon Sep 17 00:00:00 2001 From: Jack Xiao Date: Wed, 7 Aug 2024 14:49:30 +0800 Subject: [PATCH 282/991] drm/amdgpu/mes12: configure two pipes hardware resources Configure two pipes with different hardware resources. Signed-off-by: Jack Xiao Acked-by: Alex Deucher Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit ea5d6db17a8e3635ad91e8c53faa1fdc9570fbbb) --- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 77 +++++++++++++++---------- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 12 ++-- drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 7 +-- drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 49 ++++++++-------- 4 files changed, 81 insertions(+), 64 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index 81bed8e8478d15..1cb1ec7beefed3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -165,36 +165,38 @@ int amdgpu_mes_init(struct amdgpu_device *adev) adev->mes.sdma_hqd_mask[i] = 0xfc; } - r = amdgpu_device_wb_get(adev, &adev->mes.sch_ctx_offs); - if (r) { - dev_err(adev->dev, - "(%d) ring trail_fence_offs wb alloc failed\n", r); - goto error_ids; - } - adev->mes.sch_ctx_gpu_addr = - adev->wb.gpu_addr + (adev->mes.sch_ctx_offs * 4); - adev->mes.sch_ctx_ptr = - (uint64_t *)&adev->wb.wb[adev->mes.sch_ctx_offs]; + for (i = 0; i < AMDGPU_MAX_MES_PIPES; i++) { + r = amdgpu_device_wb_get(adev, &adev->mes.sch_ctx_offs[i]); + if (r) { + dev_err(adev->dev, + "(%d) ring trail_fence_offs wb alloc failed\n", + r); + goto error; + } + adev->mes.sch_ctx_gpu_addr[i] = + adev->wb.gpu_addr + (adev->mes.sch_ctx_offs[i] * 4); + adev->mes.sch_ctx_ptr[i] = + (uint64_t *)&adev->wb.wb[adev->mes.sch_ctx_offs[i]]; - r = amdgpu_device_wb_get(adev, &adev->mes.query_status_fence_offs); - if (r) { - amdgpu_device_wb_free(adev, adev->mes.sch_ctx_offs); - dev_err(adev->dev, - "(%d) query_status_fence_offs wb alloc failed\n", r); - goto error_ids; + r = amdgpu_device_wb_get(adev, + &adev->mes.query_status_fence_offs[i]); + if (r) { + dev_err(adev->dev, + "(%d) query_status_fence_offs wb alloc failed\n", + r); + goto error; + } + adev->mes.query_status_fence_gpu_addr[i] = adev->wb.gpu_addr + + (adev->mes.query_status_fence_offs[i] * 4); + adev->mes.query_status_fence_ptr[i] = + (uint64_t *)&adev->wb.wb[adev->mes.query_status_fence_offs[i]]; } - adev->mes.query_status_fence_gpu_addr = - adev->wb.gpu_addr + (adev->mes.query_status_fence_offs * 4); - adev->mes.query_status_fence_ptr = - (uint64_t *)&adev->wb.wb[adev->mes.query_status_fence_offs]; r = amdgpu_device_wb_get(adev, &adev->mes.read_val_offs); if (r) { - amdgpu_device_wb_free(adev, adev->mes.sch_ctx_offs); - amdgpu_device_wb_free(adev, adev->mes.query_status_fence_offs); dev_err(adev->dev, "(%d) read_val_offs alloc failed\n", r); - goto error_ids; + goto error; } adev->mes.read_val_gpu_addr = adev->wb.gpu_addr + (adev->mes.read_val_offs * 4); @@ -214,10 +216,16 @@ int amdgpu_mes_init(struct amdgpu_device *adev) error_doorbell: amdgpu_mes_doorbell_free(adev); error: - amdgpu_device_wb_free(adev, adev->mes.sch_ctx_offs); - amdgpu_device_wb_free(adev, adev->mes.query_status_fence_offs); - amdgpu_device_wb_free(adev, adev->mes.read_val_offs); -error_ids: + for (i = 0; i < AMDGPU_MAX_MES_PIPES; i++) { + if (adev->mes.sch_ctx_ptr[i]) + amdgpu_device_wb_free(adev, adev->mes.sch_ctx_offs[i]); + if (adev->mes.query_status_fence_ptr[i]) + amdgpu_device_wb_free(adev, + adev->mes.query_status_fence_offs[i]); + } + if (adev->mes.read_val_ptr) + amdgpu_device_wb_free(adev, adev->mes.read_val_offs); + idr_destroy(&adev->mes.pasid_idr); idr_destroy(&adev->mes.gang_id_idr); idr_destroy(&adev->mes.queue_id_idr); @@ -228,13 +236,22 @@ int amdgpu_mes_init(struct amdgpu_device *adev) void amdgpu_mes_fini(struct amdgpu_device *adev) { + int i; + amdgpu_bo_free_kernel(&adev->mes.event_log_gpu_obj, &adev->mes.event_log_gpu_addr, &adev->mes.event_log_cpu_addr); - amdgpu_device_wb_free(adev, adev->mes.sch_ctx_offs); - amdgpu_device_wb_free(adev, adev->mes.query_status_fence_offs); - amdgpu_device_wb_free(adev, adev->mes.read_val_offs); + for (i = 0; i < AMDGPU_MAX_MES_PIPES; i++) { + if (adev->mes.sch_ctx_ptr[i]) + amdgpu_device_wb_free(adev, adev->mes.sch_ctx_offs[i]); + if (adev->mes.query_status_fence_ptr[i]) + amdgpu_device_wb_free(adev, + adev->mes.query_status_fence_offs[i]); + } + if (adev->mes.read_val_ptr) + amdgpu_device_wb_free(adev, adev->mes.read_val_offs); + amdgpu_mes_doorbell_free(adev); idr_destroy(&adev->mes.pasid_idr); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index f89e3f61fe46b5..0bc837dab578f9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -112,12 +112,12 @@ struct amdgpu_mes { uint32_t gfx_hqd_mask[AMDGPU_MES_MAX_GFX_PIPES]; uint32_t sdma_hqd_mask[AMDGPU_MES_MAX_SDMA_PIPES]; uint32_t aggregated_doorbells[AMDGPU_MES_PRIORITY_NUM_LEVELS]; - uint32_t sch_ctx_offs; - uint64_t sch_ctx_gpu_addr; - uint64_t *sch_ctx_ptr; - uint32_t query_status_fence_offs; - uint64_t query_status_fence_gpu_addr; - uint64_t *query_status_fence_ptr; + uint32_t sch_ctx_offs[AMDGPU_MAX_MES_PIPES]; + uint64_t sch_ctx_gpu_addr[AMDGPU_MAX_MES_PIPES]; + uint64_t *sch_ctx_ptr[AMDGPU_MAX_MES_PIPES]; + uint32_t query_status_fence_offs[AMDGPU_MAX_MES_PIPES]; + uint64_t query_status_fence_gpu_addr[AMDGPU_MAX_MES_PIPES]; + uint64_t *query_status_fence_ptr[AMDGPU_MAX_MES_PIPES]; uint32_t read_val_offs; uint64_t read_val_gpu_addr; uint32_t *read_val_ptr; diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 44bdfa0b263aa0..2ea8223eb969ae 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -522,9 +522,9 @@ static int mes_v11_0_set_hw_resources(struct amdgpu_mes *mes) mes_set_hw_res_pkt.vmid_mask_gfxhub = mes->vmid_mask_gfxhub; mes_set_hw_res_pkt.gds_size = adev->gds.gds_size; mes_set_hw_res_pkt.paging_vmid = 0; - mes_set_hw_res_pkt.g_sch_ctx_gpu_mc_ptr = mes->sch_ctx_gpu_addr; + mes_set_hw_res_pkt.g_sch_ctx_gpu_mc_ptr = mes->sch_ctx_gpu_addr[0]; mes_set_hw_res_pkt.query_status_fence_gpu_mc_ptr = - mes->query_status_fence_gpu_addr; + mes->query_status_fence_gpu_addr[0]; for (i = 0; i < MAX_COMPUTE_PIPES; i++) mes_set_hw_res_pkt.compute_hqd_mask[i] = @@ -1210,9 +1210,6 @@ static int mes_v11_0_sw_fini(void *handle) struct amdgpu_device *adev = (struct amdgpu_device *)handle; int pipe; - amdgpu_device_wb_free(adev, adev->mes.sch_ctx_offs); - amdgpu_device_wb_free(adev, adev->mes.query_status_fence_offs); - for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) { kfree(adev->mes.mqd_backup[pipe]); diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index d18ec58551937e..ed77d70441cfe8 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -542,27 +542,33 @@ static int mes_v12_0_set_hw_resources(struct amdgpu_mes *mes, int pipe) mes_set_hw_res_pkt.header.opcode = MES_SCH_API_SET_HW_RSRC; mes_set_hw_res_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS; - mes_set_hw_res_pkt.vmid_mask_mmhub = mes->vmid_mask_mmhub; - mes_set_hw_res_pkt.vmid_mask_gfxhub = mes->vmid_mask_gfxhub; - mes_set_hw_res_pkt.gds_size = adev->gds.gds_size; - mes_set_hw_res_pkt.paging_vmid = 0; - mes_set_hw_res_pkt.g_sch_ctx_gpu_mc_ptr = mes->sch_ctx_gpu_addr; - mes_set_hw_res_pkt.query_status_fence_gpu_mc_ptr = - mes->query_status_fence_gpu_addr; - - for (i = 0; i < MAX_COMPUTE_PIPES; i++) - mes_set_hw_res_pkt.compute_hqd_mask[i] = - mes->compute_hqd_mask[i]; - - for (i = 0; i < MAX_GFX_PIPES; i++) - mes_set_hw_res_pkt.gfx_hqd_mask[i] = mes->gfx_hqd_mask[i]; - - for (i = 0; i < MAX_SDMA_PIPES; i++) - mes_set_hw_res_pkt.sdma_hqd_mask[i] = mes->sdma_hqd_mask[i]; + if (pipe == AMDGPU_MES_SCHED_PIPE) { + mes_set_hw_res_pkt.vmid_mask_mmhub = mes->vmid_mask_mmhub; + mes_set_hw_res_pkt.vmid_mask_gfxhub = mes->vmid_mask_gfxhub; + mes_set_hw_res_pkt.gds_size = adev->gds.gds_size; + mes_set_hw_res_pkt.paging_vmid = 0; + + for (i = 0; i < MAX_COMPUTE_PIPES; i++) + mes_set_hw_res_pkt.compute_hqd_mask[i] = + mes->compute_hqd_mask[i]; + + for (i = 0; i < MAX_GFX_PIPES; i++) + mes_set_hw_res_pkt.gfx_hqd_mask[i] = + mes->gfx_hqd_mask[i]; + + for (i = 0; i < MAX_SDMA_PIPES; i++) + mes_set_hw_res_pkt.sdma_hqd_mask[i] = + mes->sdma_hqd_mask[i]; + + for (i = 0; i < AMD_PRIORITY_NUM_LEVELS; i++) + mes_set_hw_res_pkt.aggregated_doorbells[i] = + mes->aggregated_doorbells[i]; + } - for (i = 0; i < AMD_PRIORITY_NUM_LEVELS; i++) - mes_set_hw_res_pkt.aggregated_doorbells[i] = - mes->aggregated_doorbells[i]; + mes_set_hw_res_pkt.g_sch_ctx_gpu_mc_ptr = + mes->sch_ctx_gpu_addr[pipe]; + mes_set_hw_res_pkt.query_status_fence_gpu_mc_ptr = + mes->query_status_fence_gpu_addr[pipe]; for (i = 0; i < 5; i++) { mes_set_hw_res_pkt.gc_base[i] = adev->reg_offset[GC_HWIP][0][i]; @@ -1292,9 +1298,6 @@ static int mes_v12_0_sw_fini(void *handle) struct amdgpu_device *adev = (struct amdgpu_device *)handle; int pipe; - amdgpu_device_wb_free(adev, adev->mes.sch_ctx_offs); - amdgpu_device_wb_free(adev, adev->mes.query_status_fence_offs); - for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) { kfree(adev->mes.mqd_backup[pipe]); From af401543df510a73f7beb13f80cf4c541be94786 Mon Sep 17 00:00:00 2001 From: Jack Xiao Date: Wed, 7 Aug 2024 15:23:16 +0800 Subject: [PATCH 283/991] drm/amdgpu/mes12: sw/hw fini for unified mes Free memory for two pipes and unmap pipe0 via pipe1. Signed-off-by: Jack Xiao Acked-by: Alex Deucher Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit 98cae695a8ae0e4291b1fa7feef9b54fabefe885) --- drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 31 +++++++++++++++++--------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index ed77d70441cfe8..e39a58d262c94e 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -1305,18 +1305,21 @@ static int mes_v12_0_sw_fini(void *handle) &adev->mes.eop_gpu_addr[pipe], NULL); amdgpu_ucode_release(&adev->mes.fw[pipe]); - } - - amdgpu_bo_free_kernel(&adev->gfx.kiq[0].ring.mqd_obj, - &adev->gfx.kiq[0].ring.mqd_gpu_addr, - &adev->gfx.kiq[0].ring.mqd_ptr); - amdgpu_bo_free_kernel(&adev->mes.ring[0].mqd_obj, - &adev->mes.ring[0].mqd_gpu_addr, - &adev->mes.ring[0].mqd_ptr); + if (adev->enable_uni_mes || pipe == AMDGPU_MES_SCHED_PIPE) { + amdgpu_bo_free_kernel(&adev->mes.ring[pipe].mqd_obj, + &adev->mes.ring[pipe].mqd_gpu_addr, + &adev->mes.ring[pipe].mqd_ptr); + amdgpu_ring_fini(&adev->mes.ring[pipe]); + } + } - amdgpu_ring_fini(&adev->gfx.kiq[0].ring); - amdgpu_ring_fini(&adev->mes.ring[0]); + if (!adev->enable_uni_mes) { + amdgpu_bo_free_kernel(&adev->gfx.kiq[0].ring.mqd_obj, + &adev->gfx.kiq[0].ring.mqd_gpu_addr, + &adev->gfx.kiq[0].ring.mqd_ptr); + amdgpu_ring_fini(&adev->gfx.kiq[0].ring); + } if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) { mes_v12_0_free_ucode_buffers(adev, AMDGPU_MES_KIQ_PIPE); @@ -1433,7 +1436,13 @@ static int mes_v12_0_kiq_hw_init(struct amdgpu_device *adev) static int mes_v12_0_kiq_hw_fini(struct amdgpu_device *adev) { if (adev->mes.ring[0].sched.ready) { - mes_v12_0_kiq_dequeue_sched(adev); + if (adev->enable_uni_mes) + amdgpu_mes_unmap_legacy_queue(adev, + &adev->mes.ring[AMDGPU_MES_SCHED_PIPE], + RESET_QUEUES, 0, 0); + else + mes_v12_0_kiq_dequeue_sched(adev); + adev->mes.ring[0].sched.ready = false; } From 4246b1077ffcc37926868581bb818fdb49d0d065 Mon Sep 17 00:00:00 2001 From: Jack Xiao Date: Wed, 7 Aug 2024 12:03:11 +0800 Subject: [PATCH 284/991] drm/amdgpu/mes12: fix suspend issue Use mes pipe to unmap kcq and kgq. Signed-off-by: Jack Xiao Acked-by: Alex Deucher Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit f7fb9d677faf0460131bc2af15afd766d48a1f47) --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 22 ++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 27 +------------------------ 2 files changed, 23 insertions(+), 26 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index f165b9d49e2927..c770cb201e64bc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -509,6 +509,16 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id) int i, r = 0; int j; + if (adev->enable_mes) { + for (i = 0; i < adev->gfx.num_compute_rings; i++) { + j = i + xcc_id * adev->gfx.num_compute_rings; + amdgpu_mes_unmap_legacy_queue(adev, + &adev->gfx.compute_ring[j], + RESET_QUEUES, 0, 0); + } + return 0; + } + if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues) return -EINVAL; @@ -551,6 +561,18 @@ int amdgpu_gfx_disable_kgq(struct amdgpu_device *adev, int xcc_id) int i, r = 0; int j; + if (adev->enable_mes) { + if (amdgpu_gfx_is_master_xcc(adev, xcc_id)) { + for (i = 0; i < adev->gfx.num_gfx_rings; i++) { + j = i + xcc_id * adev->gfx.num_gfx_rings; + amdgpu_mes_unmap_legacy_queue(adev, + &adev->gfx.gfx_ring[j], + PREEMPT_QUEUES, 0, 0); + } + } + return 0; + } + if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues) return -EINVAL; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index 506fa80033889f..2c611b8577a7ec 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -3546,33 +3546,9 @@ static int gfx_v12_0_hw_init(void *handle) return r; } -static int gfx_v12_0_kiq_disable_kgq(struct amdgpu_device *adev) -{ - struct amdgpu_kiq *kiq = &adev->gfx.kiq[0]; - struct amdgpu_ring *kiq_ring = &kiq->ring; - int i, r = 0; - - if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues) - return -EINVAL; - - if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size * - adev->gfx.num_gfx_rings)) - return -ENOMEM; - - for (i = 0; i < adev->gfx.num_gfx_rings; i++) - kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.gfx_ring[i], - PREEMPT_QUEUES, 0, 0); - - if (adev->gfx.kiq[0].ring.sched.ready) - r = amdgpu_ring_test_helper(kiq_ring); - - return r; -} - static int gfx_v12_0_hw_fini(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; - int r; uint32_t tmp; amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); @@ -3580,8 +3556,7 @@ static int gfx_v12_0_hw_fini(void *handle) if (!adev->no_hw_access) { if (amdgpu_async_gfx_ring) { - r = gfx_v12_0_kiq_disable_kgq(adev); - if (r) + if (amdgpu_gfx_disable_kgq(adev, 0)) DRM_ERROR("KGQ disable failed\n"); } From 470516c2925493594a690bc4d05b1f4471d9f996 Mon Sep 17 00:00:00 2001 From: "David (Ming Qiang) Wu" Date: Thu, 8 Aug 2024 12:19:50 -0400 Subject: [PATCH 285/991] drm/amd/amdgpu: command submission parser for JPEG Add JPEG IB command parser to ensure registers in the command are within the JPEG IP block. Reviewed-by: Alex Deucher Signed-off-by: David (Ming Qiang) Wu Signed-off-by: Alex Deucher (cherry picked from commit a7f670d5d8e77b092404ca8a35bb0f8f89ed3117) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 3 ++ drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 61 +++++++++++++++++++++++- drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.h | 7 ++- drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c | 1 + drivers/gpu/drm/amd/amdgpu/soc15d.h | 6 +++ 5 files changed, 76 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 9aa952f258cf26..6dfdff58bffd1f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -1057,6 +1057,9 @@ static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p, r = amdgpu_ring_parse_cs(ring, p, job, ib); if (r) return r; + + if (ib->sa_bo) + ib->gpu_addr = amdgpu_sa_bo_gpu_addr(ib->sa_bo); } else { ib->ptr = (uint32_t *)kptr; r = amdgpu_ring_patch_cs_in_place(ring, p, job, ib); diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c index f4662920c653f6..6ae5a784e18746 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c @@ -23,6 +23,7 @@ #include "amdgpu.h" #include "amdgpu_jpeg.h" +#include "amdgpu_cs.h" #include "soc15.h" #include "soc15d.h" #include "jpeg_v4_0_3.h" @@ -782,7 +783,11 @@ void jpeg_v4_0_3_dec_ring_emit_ib(struct amdgpu_ring *ring, amdgpu_ring_write(ring, PACKETJ(regUVD_LMI_JRBC_IB_VMID_INTERNAL_OFFSET, 0, 0, PACKETJ_TYPE0)); - amdgpu_ring_write(ring, (vmid | (vmid << 4) | (vmid << 8))); + + if (ring->funcs->parse_cs) + amdgpu_ring_write(ring, 0); + else + amdgpu_ring_write(ring, (vmid | (vmid << 4) | (vmid << 8))); amdgpu_ring_write(ring, PACKETJ(regUVD_LMI_JPEG_VMID_INTERNAL_OFFSET, 0, 0, PACKETJ_TYPE0)); @@ -1084,6 +1089,7 @@ static const struct amdgpu_ring_funcs jpeg_v4_0_3_dec_ring_vm_funcs = { .get_rptr = jpeg_v4_0_3_dec_ring_get_rptr, .get_wptr = jpeg_v4_0_3_dec_ring_get_wptr, .set_wptr = jpeg_v4_0_3_dec_ring_set_wptr, + .parse_cs = jpeg_v4_0_3_dec_ring_parse_cs, .emit_frame_size = SOC15_FLUSH_GPU_TLB_NUM_WREG * 6 + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 8 + @@ -1248,3 +1254,56 @@ static void jpeg_v4_0_3_set_ras_funcs(struct amdgpu_device *adev) { adev->jpeg.ras = &jpeg_v4_0_3_ras; } + +/** + * jpeg_v4_0_3_dec_ring_parse_cs - command submission parser + * + * @parser: Command submission parser context + * @job: the job to parse + * @ib: the IB to parse + * + * Parse the command stream, return -EINVAL for invalid packet, + * 0 otherwise + */ +int jpeg_v4_0_3_dec_ring_parse_cs(struct amdgpu_cs_parser *parser, + struct amdgpu_job *job, + struct amdgpu_ib *ib) +{ + uint32_t i, reg, res, cond, type; + struct amdgpu_device *adev = parser->adev; + + for (i = 0; i < ib->length_dw ; i += 2) { + reg = CP_PACKETJ_GET_REG(ib->ptr[i]); + res = CP_PACKETJ_GET_RES(ib->ptr[i]); + cond = CP_PACKETJ_GET_COND(ib->ptr[i]); + type = CP_PACKETJ_GET_TYPE(ib->ptr[i]); + + if (res) /* only support 0 at the moment */ + return -EINVAL; + + switch (type) { + case PACKETJ_TYPE0: + if (cond != PACKETJ_CONDITION_CHECK0 || reg < JPEG_REG_RANGE_START || reg > JPEG_REG_RANGE_END) { + dev_err(adev->dev, "Invalid packet [0x%08x]!\n", ib->ptr[i]); + return -EINVAL; + } + break; + case PACKETJ_TYPE3: + if (cond != PACKETJ_CONDITION_CHECK3 || reg < JPEG_REG_RANGE_START || reg > JPEG_REG_RANGE_END) { + dev_err(adev->dev, "Invalid packet [0x%08x]!\n", ib->ptr[i]); + return -EINVAL; + } + break; + case PACKETJ_TYPE6: + if (ib->ptr[i] == CP_PACKETJ_NOP) + continue; + dev_err(adev->dev, "Invalid packet [0x%08x]!\n", ib->ptr[i]); + return -EINVAL; + default: + dev_err(adev->dev, "Unknown packet type %d !\n", type); + return -EINVAL; + } + } + + return 0; +} diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.h b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.h index 747a3e5f68564c..71c54b294e157e 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.h +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.h @@ -46,6 +46,9 @@ #define JRBC_DEC_EXTERNAL_REG_WRITE_ADDR 0x18000 +#define JPEG_REG_RANGE_START 0x4000 +#define JPEG_REG_RANGE_END 0x41c2 + extern const struct amdgpu_ip_block_version jpeg_v4_0_3_ip_block; void jpeg_v4_0_3_dec_ring_emit_ib(struct amdgpu_ring *ring, @@ -62,5 +65,7 @@ void jpeg_v4_0_3_dec_ring_insert_end(struct amdgpu_ring *ring); void jpeg_v4_0_3_dec_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg, uint32_t val); void jpeg_v4_0_3_dec_ring_emit_reg_wait(struct amdgpu_ring *ring, uint32_t reg, uint32_t val, uint32_t mask); - +int jpeg_v4_0_3_dec_ring_parse_cs(struct amdgpu_cs_parser *parser, + struct amdgpu_job *job, + struct amdgpu_ib *ib); #endif /* __JPEG_V4_0_3_H__ */ diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c index d694a276498a1e..f4daff90c7709d 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c @@ -646,6 +646,7 @@ static const struct amdgpu_ring_funcs jpeg_v5_0_0_dec_ring_vm_funcs = { .get_rptr = jpeg_v5_0_0_dec_ring_get_rptr, .get_wptr = jpeg_v5_0_0_dec_ring_get_wptr, .set_wptr = jpeg_v5_0_0_dec_ring_set_wptr, + .parse_cs = jpeg_v4_0_3_dec_ring_parse_cs, .emit_frame_size = SOC15_FLUSH_GPU_TLB_NUM_WREG * 6 + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 8 + diff --git a/drivers/gpu/drm/amd/amdgpu/soc15d.h b/drivers/gpu/drm/amd/amdgpu/soc15d.h index 2357ff39323f05..e74e1983da53aa 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15d.h +++ b/drivers/gpu/drm/amd/amdgpu/soc15d.h @@ -76,6 +76,12 @@ ((cond & 0xF) << 24) | \ ((type & 0xF) << 28)) +#define CP_PACKETJ_NOP 0x60000000 +#define CP_PACKETJ_GET_REG(x) ((x) & 0x3FFFF) +#define CP_PACKETJ_GET_RES(x) (((x) >> 18) & 0x3F) +#define CP_PACKETJ_GET_COND(x) (((x) >> 24) & 0xF) +#define CP_PACKETJ_GET_TYPE(x) (((x) >> 28) & 0xF) + /* Packet 3 types */ #define PACKET3_NOP 0x10 #define PACKET3_SET_BASE 0x11 From 507a2286c052919fe416b3daa0f0061d0fc702b9 Mon Sep 17 00:00:00 2001 From: Yinjie Yao Date: Fri, 9 Aug 2024 17:20:26 -0400 Subject: [PATCH 286/991] drm/amdgpu: Update kmd_fw_shared for VCN5 kmd_fw_shared changed in VCN5 Signed-off-by: Yinjie Yao Reviewed-by: Ruijing Dong Acked-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit aa02486fb18cecbaca0c4fd393d1a03f1d4c3f9a) --- drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h index 1a5439abd1a043..c87d68d4be5365 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h @@ -461,8 +461,11 @@ struct amdgpu_vcn5_fw_shared { struct amdgpu_fw_shared_unified_queue_struct sq; uint8_t pad1[8]; struct amdgpu_fw_shared_fw_logging fw_log; + uint8_t pad2[20]; struct amdgpu_fw_shared_rb_setup rb_setup; - uint8_t pad2[4]; + struct amdgpu_fw_shared_smu_interface_info smu_dpm_interface; + struct amdgpu_fw_shared_drm_key_wa drm_key_wa; + uint8_t pad3[9]; }; #define VCN_BLOCK_ENCODE_DISABLE_MASK 0x80 From 23acd1f344e8102f803119d0c8fc4df4628d694f Mon Sep 17 00:00:00 2001 From: Kenneth Feng Date: Thu, 8 Aug 2024 12:19:22 +0800 Subject: [PATCH 287/991] drm/amd/amdgpu: add HDP_SD support on gc 12.0.0/1 add HDP_SD support on gc 12.0.0/1 Signed-off-by: Kenneth Feng Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit 61cffacb3a1c590b15c0e9ff987de02d293e0dd8) --- drivers/gpu/drm/amd/amdgpu/soc24.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/soc24.c b/drivers/gpu/drm/amd/amdgpu/soc24.c index 7d641d0dadba4d..b0c3678cfb31d8 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc24.c +++ b/drivers/gpu/drm/amd/amdgpu/soc24.c @@ -406,6 +406,7 @@ static int soc24_common_early_init(void *handle) AMD_CG_SUPPORT_ATHUB_MGCG | AMD_CG_SUPPORT_ATHUB_LS | AMD_CG_SUPPORT_MC_MGCG | + AMD_CG_SUPPORT_HDP_SD | AMD_CG_SUPPORT_MC_LS; adev->pg_flags = AMD_PG_SUPPORT_VCN | AMD_PG_SUPPORT_JPEG | @@ -424,6 +425,7 @@ static int soc24_common_early_init(void *handle) AMD_CG_SUPPORT_ATHUB_MGCG | AMD_CG_SUPPORT_ATHUB_LS | AMD_CG_SUPPORT_MC_MGCG | + AMD_CG_SUPPORT_HDP_SD | AMD_CG_SUPPORT_MC_LS; adev->pg_flags = AMD_PG_SUPPORT_VCN | From 100bff23818eb61751ed05d64a7df36ce9728a4d Mon Sep 17 00:00:00 2001 From: Kyle Huey Date: Tue, 13 Aug 2024 15:17:27 +0000 Subject: [PATCH 288/991] perf/bpf: Don't call bpf_overflow_handler() for tracing events The regressing commit is new in 6.10. It assumed that anytime event->prog is set bpf_overflow_handler() should be invoked to execute the attached bpf program. This assumption is false for tracing events, and as a result the regressing commit broke bpftrace by invoking the bpf handler with garbage inputs on overflow. Prior to the regression the overflow handlers formed a chain (of length 0, 1, or 2) and perf_event_set_bpf_handler() (the !tracing case) added bpf_overflow_handler() to that chain, while perf_event_attach_bpf_prog() (the tracing case) did not. Both set event->prog. The chain of overflow handlers was replaced by a single overflow handler slot and a fixed call to bpf_overflow_handler() when appropriate. This modifies the condition there to check event->prog->type == BPF_PROG_TYPE_PERF_EVENT, restoring the previous behavior and fixing bpftrace. Signed-off-by: Kyle Huey Suggested-by: Andrii Nakryiko Reported-by: Joe Damato Closes: https://lore.kernel.org/lkml/ZpFfocvyF3KHaSzF@LQ3V64L9R2/ Fixes: f11f10bfa1ca ("perf/bpf: Call BPF handler directly, not through overflow machinery") Cc: stable@vger.kernel.org Tested-by: Joe Damato # bpftrace Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240813151727.28797-1-jdamato@fastly.com Signed-off-by: Alexei Starovoitov --- kernel/events/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index aa3450bdc2276c..c973e3c11e036f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9706,7 +9706,8 @@ static int __perf_event_overflow(struct perf_event *event, ret = __perf_event_account_interrupt(event, throttle); - if (event->prog && !bpf_overflow_handler(event, data, regs)) + if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT && + !bpf_overflow_handler(event, data, regs)) return ret; /* From faada2174c08662ae98b439c69efe3e79382c538 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 13 Aug 2024 16:35:14 +0200 Subject: [PATCH 289/991] dm persistent data: fix memory allocation failure kmalloc is unreliable when allocating more than 8 pages of memory. It may fail when there is plenty of free memory but the memory is fragmented. Zdenek Kabelac observed such failure in his tests. This commit changes kmalloc to kvmalloc - kvmalloc will fall back to vmalloc if the large allocation fails. Signed-off-by: Mikulas Patocka Reported-by: Zdenek Kabelac Reviewed-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/persistent-data/dm-space-map-metadata.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 04698fd03e6069..d48c4fafc77989 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -277,7 +277,7 @@ static void sm_metadata_destroy(struct dm_space_map *sm) { struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - kfree(smm); + kvfree(smm); } static int sm_metadata_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) @@ -772,7 +772,7 @@ struct dm_space_map *dm_sm_metadata_init(void) { struct sm_metadata *smm; - smm = kmalloc(sizeof(*smm), GFP_KERNEL); + smm = kvmalloc(sizeof(*smm), GFP_KERNEL); if (!smm) return ERR_PTR(-ENOMEM); From 3e30296b374af33cb4c12ff93df0b1e5b2d0f80b Mon Sep 17 00:00:00 2001 From: Abhinav Kumar Date: Thu, 8 Aug 2024 16:52:27 -0700 Subject: [PATCH 290/991] drm/msm: fix the highest_bank_bit for sc7180 sc7180 programs the ubwc settings as 0x1e as that would mean a highest bank bit of 14 which matches what the GPU sets as well. However, the highest_bank_bit field of the msm_mdss_data which is being used to program the SSPP's fetch configuration is programmed to a highest bank bit of 16 as 0x3 translates to 16 and not 14. Fix the highest bank bit field used for the SSPP to match the mdss and gpu settings. Fixes: 6f410b246209 ("drm/msm/mdss: populate missing data") Reviewed-by: Rob Clark Tested-by: Stephen Boyd # Trogdor.Lazor Patchwork: https://patchwork.freedesktop.org/patch/607625/ Link: https://lore.kernel.org/r/20240808235227.2701479-1-quic_abhinavk@quicinc.com Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/msm_mdss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/msm_mdss.c b/drivers/gpu/drm/msm/msm_mdss.c index d90b9471ba6ff9..faa88fd6eb4d6a 100644 --- a/drivers/gpu/drm/msm/msm_mdss.c +++ b/drivers/gpu/drm/msm/msm_mdss.c @@ -577,7 +577,7 @@ static const struct msm_mdss_data sc7180_data = { .ubwc_enc_version = UBWC_2_0, .ubwc_dec_version = UBWC_2_0, .ubwc_static = 0x1e, - .highest_bank_bit = 0x3, + .highest_bank_bit = 0x1, .reg_bus_bw = 76800, }; From f50733b45d865f91db90919f8311e2127ce5a0cb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 8 Aug 2024 11:39:08 -0700 Subject: [PATCH 291/991] exec: Fix ToCToU between perm check and set-uid/gid usage When opening a file for exec via do_filp_open(), permission checking is done against the file's metadata at that moment, and on success, a file pointer is passed back. Much later in the execve() code path, the file metadata (specifically mode, uid, and gid) is used to determine if/how to set the uid and gid. However, those values may have changed since the permissions check, meaning the execution may gain unintended privileges. For example, if a file could change permissions from executable and not set-id: ---------x 1 root root 16048 Aug 7 13:16 target to set-id and non-executable: ---S------ 1 root root 16048 Aug 7 13:16 target it is possible to gain root privileges when execution should have been disallowed. While this race condition is rare in real-world scenarios, it has been observed (and proven exploitable) when package managers are updating the setuid bits of installed programs. Such files start with being world-executable but then are adjusted to be group-exec with a set-uid bit. For example, "chmod o-x,u+s target" makes "target" executable only by uid "root" and gid "cdrom", while also becoming setuid-root: -rwxr-xr-x 1 root cdrom 16048 Aug 7 13:16 target becomes: -rwsr-xr-- 1 root cdrom 16048 Aug 7 13:16 target But racing the chmod means users without group "cdrom" membership can get the permission to execute "target" just before the chmod, and when the chmod finishes, the exec reaches brpm_fill_uid(), and performs the setuid to root, violating the expressed authorization of "only cdrom group members can setuid to root". Re-check that we still have execute permissions in case the metadata has changed. It would be better to keep a copy from the perm-check time, but until we can do that refactoring, the least-bad option is to do a full inode_permission() call (under inode lock). It is understood that this is safe against dead-locks, but hardly optimal. Reported-by: Marco Vanotti Tested-by: Marco Vanotti Suggested-by: Linus Torvalds Cc: stable@vger.kernel.org Cc: Eric Biederman Cc: Alexander Viro Cc: Christian Brauner Signed-off-by: Kees Cook --- fs/exec.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/exec.c b/fs/exec.c index a126e3d1cacb01..50e76cc633c4ba 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1692,6 +1692,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) unsigned int mode; vfsuid_t vfsuid; vfsgid_t vfsgid; + int err; if (!mnt_may_suid(file->f_path.mnt)) return; @@ -1708,12 +1709,17 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) /* Be careful if suid/sgid is set */ inode_lock(inode); - /* reload atomically mode/uid/gid now that lock held */ + /* Atomically reload and check mode/uid/gid now that lock held. */ mode = inode->i_mode; vfsuid = i_uid_into_vfsuid(idmap, inode); vfsgid = i_gid_into_vfsgid(idmap, inode); + err = inode_permission(idmap, inode, MAY_EXEC); inode_unlock(inode); + /* Did the exec bit vanish out from under us? Give up. */ + if (err) + return; + /* We ignore suid/sgid if there are no mappings for them in the ns */ if (!vfsuid_has_mapping(bprm->cred->user_ns, vfsuid) || !vfsgid_has_mapping(bprm->cred->user_ns, vfsgid)) From b739dffa5d570b411d4bdf4bb9b8dfd6b7d72305 Mon Sep 17 00:00:00 2001 From: Stefan Wiehler Date: Mon, 12 Aug 2024 12:06:51 +0200 Subject: [PATCH 292/991] of/irq: Prevent device address out-of-bounds read in interrupt map walk When of_irq_parse_raw() is invoked with a device address smaller than the interrupt parent node (from #address-cells property), KASAN detects the following out-of-bounds read when populating the initial match table (dyndbg="func of_irq_parse_* +p"): OF: of_irq_parse_one: dev=/soc@0/picasso/watchdog, index=0 OF: parent=/soc@0/pci@878000000000/gpio0@17,0, intsize=2 OF: intspec=4 OF: of_irq_parse_raw: ipar=/soc@0/pci@878000000000/gpio0@17,0, size=2 OF: -> addrsize=3 ================================================================== BUG: KASAN: slab-out-of-bounds in of_irq_parse_raw+0x2b8/0x8d0 Read of size 4 at addr ffffff81beca5608 by task bash/764 CPU: 1 PID: 764 Comm: bash Tainted: G O 6.1.67-484c613561-nokia_sm_arm64 #1 Hardware name: Unknown Unknown Product/Unknown Product, BIOS 2023.01-12.24.03-dirty 01/01/2023 Call trace: dump_backtrace+0xdc/0x130 show_stack+0x1c/0x30 dump_stack_lvl+0x6c/0x84 print_report+0x150/0x448 kasan_report+0x98/0x140 __asan_load4+0x78/0xa0 of_irq_parse_raw+0x2b8/0x8d0 of_irq_parse_one+0x24c/0x270 parse_interrupts+0xc0/0x120 of_fwnode_add_links+0x100/0x2d0 fw_devlink_parse_fwtree+0x64/0xc0 device_add+0xb38/0xc30 of_device_add+0x64/0x90 of_platform_device_create_pdata+0xd0/0x170 of_platform_bus_create+0x244/0x600 of_platform_notify+0x1b0/0x254 blocking_notifier_call_chain+0x9c/0xd0 __of_changeset_entry_notify+0x1b8/0x230 __of_changeset_apply_notify+0x54/0xe4 of_overlay_fdt_apply+0xc04/0xd94 ... The buggy address belongs to the object at ffffff81beca5600 which belongs to the cache kmalloc-128 of size 128 The buggy address is located 8 bytes inside of 128-byte region [ffffff81beca5600, ffffff81beca5680) The buggy address belongs to the physical page: page:00000000230d3d03 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1beca4 head:00000000230d3d03 order:1 compound_mapcount:0 compound_pincount:0 flags: 0x8000000000010200(slab|head|zone=2) raw: 8000000000010200 0000000000000000 dead000000000122 ffffff810000c300 raw: 0000000000000000 0000000000200020 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffffff81beca5500: 04 fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffffff81beca5580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffffff81beca5600: 00 fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ^ ffffff81beca5680: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffffff81beca5700: 00 00 00 00 00 00 fc fc fc fc fc fc fc fc fc fc ================================================================== OF: -> got it ! Prevent the out-of-bounds read by copying the device address into a buffer of sufficient size. Signed-off-by: Stefan Wiehler Link: https://lore.kernel.org/r/20240812100652.3800963-1-stefan.wiehler@nokia.com Signed-off-by: Rob Herring (Arm) --- drivers/of/irq.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/of/irq.c b/drivers/of/irq.c index c94203ce65bb32..8fd63100ba8f08 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -344,7 +344,8 @@ int of_irq_parse_one(struct device_node *device, int index, struct of_phandle_ar struct device_node *p; const __be32 *addr; u32 intsize; - int i, res; + int i, res, addr_len; + __be32 addr_buf[3] = { 0 }; pr_debug("of_irq_parse_one: dev=%pOF, index=%d\n", device, index); @@ -353,13 +354,19 @@ int of_irq_parse_one(struct device_node *device, int index, struct of_phandle_ar return of_irq_parse_oldworld(device, index, out_irq); /* Get the reg property (if any) */ - addr = of_get_property(device, "reg", NULL); + addr = of_get_property(device, "reg", &addr_len); + + /* Prevent out-of-bounds read in case of longer interrupt parent address size */ + if (addr_len > (3 * sizeof(__be32))) + addr_len = 3 * sizeof(__be32); + if (addr) + memcpy(addr_buf, addr, addr_len); /* Try the new-style interrupts-extended first */ res = of_parse_phandle_with_args(device, "interrupts-extended", "#interrupt-cells", index, out_irq); if (!res) - return of_irq_parse_raw(addr, out_irq); + return of_irq_parse_raw(addr_buf, out_irq); /* Look for the interrupt parent. */ p = of_irq_find_parent(device); @@ -389,7 +396,7 @@ int of_irq_parse_one(struct device_node *device, int index, struct of_phandle_ar /* Check if there are any interrupt-map translations to process */ - res = of_irq_parse_raw(addr, out_irq); + res = of_irq_parse_raw(addr_buf, out_irq); out: of_node_put(p); return res; From 4e91fa1ef3ce6290b4c598e54b5eb6cf134fbec8 Mon Sep 17 00:00:00 2001 From: Andi Shyti Date: Mon, 12 Aug 2024 21:40:28 +0200 Subject: [PATCH 293/991] i2c: qcom-geni: Add missing geni_icc_disable in geni_i2c_runtime_resume Add the missing geni_icc_disable() call before returning in the geni_i2c_runtime_resume() function. Commit 9ba48db9f77c ("i2c: qcom-geni: Add missing geni_icc_disable in geni_i2c_runtime_resume") by Gaosheng missed disabling the interconnect in one case. Fixes: bf225ed357c6 ("i2c: i2c-qcom-geni: Add interconnect support") Cc: Gaosheng Cui Cc: stable@vger.kernel.org # v5.9+ Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-qcom-geni.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-qcom-geni.c b/drivers/i2c/busses/i2c-qcom-geni.c index 365e37bba0f334..06e836e3e87733 100644 --- a/drivers/i2c/busses/i2c-qcom-geni.c +++ b/drivers/i2c/busses/i2c-qcom-geni.c @@ -986,8 +986,10 @@ static int __maybe_unused geni_i2c_runtime_resume(struct device *dev) return ret; ret = clk_prepare_enable(gi2c->core_clk); - if (ret) + if (ret) { + geni_icc_disable(&gi2c->se); return ret; + } ret = geni_se_resources_on(&gi2c->se); if (ret) { From 655111b838cdabdb604f3625a9ff08c5eedb11da Mon Sep 17 00:00:00 2001 From: Eugene Syromiatnikov Date: Mon, 12 Aug 2024 08:51:23 +0200 Subject: [PATCH 294/991] mptcp: correct MPTCP_SUBFLOW_ATTR_SSN_OFFSET reserved size ssn_offset field is u32 and is placed into the netlink response with nla_put_u32(), but only 2 bytes are reserved for the attribute payload in subflow_get_info_size() (even though it makes no difference in the end, as it is aligned up to 4 bytes). Supply the correct argument to the relevant nla_total_size() call to make it less confusing. Fixes: 5147dfb50832 ("mptcp: allow dumping subflow context to userspace") Signed-off-by: Eugene Syromiatnikov Reviewed-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240812065024.GA19719@asgard.redhat.com Signed-off-by: Jakub Kicinski --- net/mptcp/diag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c index 3ae46b545d2c2b..2d3efb405437d8 100644 --- a/net/mptcp/diag.c +++ b/net/mptcp/diag.c @@ -94,7 +94,7 @@ static size_t subflow_get_info_size(const struct sock *sk) nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ */ nla_total_size_64bit(8) + /* MPTCP_SUBFLOW_ATTR_MAP_SEQ */ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_MAP_SFSEQ */ - nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_SSN_OFFSET */ + nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_SSN_OFFSET */ nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_MAP_DATALEN */ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_FLAGS */ nla_total_size(1) + /* MPTCP_SUBFLOW_ATTR_ID_REM */ From a24e6e7146e361aa0855cf8ee3b2e80b8eb692e3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Aug 2024 22:40:39 -0400 Subject: [PATCH 295/991] bcachefs: delete faulty fastpath in bch2_btree_path_traverse_cached() bch2_btree_path_traverse_cached() was previously checking if it could just relock the path, which is a common idiom in path traversal. However, it was using btree_node_relock(), not btree_path_relock(); btree_path_relock() only succeeds if the path was in state BTREE_ITER_NEED_RELOCK. If the path was in state BTREE_ITER_NEED_TRAVERSE a full traversal is needed; this led to a null ptr deref in bch2_btree_path_traverse_cached(). And the short circuit check here isn't needed, since it was already done in the main bch2_btree_path_traverse_one(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index f2f2e525460b57..79954490627cca 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -497,11 +497,6 @@ int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path path->l[1].b = NULL; - if (bch2_btree_node_relock_notrace(trans, path, 0)) { - path->uptodate = BTREE_ITER_UPTODATE; - return 0; - } - int ret; do { ret = btree_path_traverse_cached_fast(trans, path); From bd864bc2d90790e00b02b17c75fb951cb4b0bb8b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Aug 2024 23:24:03 -0400 Subject: [PATCH 296/991] bcachefs: Fix bch2_trigger_alloc when upgrading from old versions bch2_trigger_alloc was assuming that the new key would always be newly created and thus always an alloc_v4 key, but - not when called from btree_gc. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index d9c5a92fa708fb..0a8a1bc9a4ac04 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -829,7 +829,19 @@ int bch2_trigger_alloc(struct btree_trans *trans, struct bch_alloc_v4 old_a_convert; const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); - struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; + + struct bch_alloc_v4 *new_a; + if (likely(new.k->type == KEY_TYPE_alloc_v4)) { + new_a = bkey_s_to_alloc_v4(new).v; + } else { + BUG_ON(!(flags & BTREE_TRIGGER_gc)); + + struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c); + ret = PTR_ERR_OR_ZERO(new_ka); + if (unlikely(ret)) + goto err; + new_a = &new_ka->v; + } if (flags & BTREE_TRIGGER_transactional) { alloc_data_type_set(new_a, new_a->data_type); From d9e615762bf2eb7459fb0f270525f8b186bce6b7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Aug 2024 04:53:12 -0400 Subject: [PATCH 297/991] bcachefs: bch2_accounting_invalid() fixup Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 046ac92b66395f..212f5392711111 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -154,7 +154,7 @@ int bch2_accounting_invalid(struct bch_fs *c, struct bkey_s_c k, "accounting key replicas entry with bad nr_required"); for (unsigned i = 0; i + 1 < acc_k.replicas.nr_devs; i++) - bkey_fsck_err_on(acc_k.replicas.devs[i] > acc_k.replicas.devs[i + 1], + bkey_fsck_err_on(acc_k.replicas.devs[i] >= acc_k.replicas.devs[i + 1], c, err, accounting_key_replicas_devs_unsorted, "accounting key replicas entry with unsorted devs"); From 486d920735325e507d965c2639ba2775b81fd329 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Aug 2024 22:47:55 -0400 Subject: [PATCH 298/991] bcachefs: disk accounting: ignore unknown types forward compat fix Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 212f5392711111..03a9de6c2e0a04 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -528,6 +528,9 @@ int bch2_gc_accounting_done(struct bch_fs *c) struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, e->pos); + if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) + continue; + u64 src_v[BCH_ACCOUNTING_MAX_COUNTERS]; u64 dst_v[BCH_ACCOUNTING_MAX_COUNTERS]; @@ -760,6 +763,12 @@ void bch2_verify_accounting_clean(struct bch_fs *c) struct bkey_s_c_accounting a = bkey_s_c_to_accounting(k); unsigned nr = bch2_accounting_counters(k.k); + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, k.k->p); + + if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) + continue; + bch2_accounting_mem_read(c, k.k->p, v, nr); if (memcmp(a.v->d, v, nr * sizeof(u64))) { @@ -775,9 +784,6 @@ void bch2_verify_accounting_clean(struct bch_fs *c) mismatch = true; } - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, a.k->p); - switch (acc_k.type) { case BCH_DISK_ACCOUNTING_persistent_reserved: base.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; From 48d6cc1b4895ada0781da11a0a483332a236ec14 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Aug 2024 01:01:35 -0400 Subject: [PATCH 299/991] bcachefs: Add missing downgrade table entry Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-downgrade.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 6c4469f53313ed..9f82d497d9e053 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -104,6 +104,7 @@ BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \ BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \ BCH_FSCK_ERR_fs_usage_replicas_wrong, \ + BCH_FSCK_ERR_accounting_replicas_not_marked, \ BCH_FSCK_ERR_bkey_version_in_future) struct upgrade_downgrade_entry { From 968feb854a86b59cc4bc72af3105989706ca2c7d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Aug 2024 16:34:28 -0400 Subject: [PATCH 300/991] bcachefs: Convert for_each_btree_node() to lockrestart_do() for_each_btree_node() now works similarly to for_each_btree_key(), where the loop body is passed as an argument to be passed to lockrestart_do(). This now calls trans_begin() on every loop iteration - which fixes an SRCU warning in backpointers fsck. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 15 +++++--------- fs/bcachefs/btree_iter.c | 1 + fs/bcachefs/btree_iter.h | 42 ++++++++++++++++++++++++-------------- fs/bcachefs/debug.c | 38 ++++++++-------------------------- 4 files changed, 42 insertions(+), 54 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 3cc02479a9828e..9edc4c5f735c57 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -763,27 +763,22 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, btree < BTREE_ID_NR && !ret; btree++) { unsigned depth = (BIT_ULL(btree) & btree_leaf_mask) ? 0 : 1; - struct btree_iter iter; - struct btree *b; if (!(BIT_ULL(btree) & btree_leaf_mask) && !(BIT_ULL(btree) & btree_interior_mask)) continue; - bch2_trans_begin(trans); - - __for_each_btree_node(trans, iter, btree, + ret = __for_each_btree_node(trans, iter, btree, btree == start.btree ? start.pos : POS_MIN, - 0, depth, BTREE_ITER_prefetch, b, ret) { + 0, depth, BTREE_ITER_prefetch, b, ({ mem_may_pin -= btree_buf_bytes(b); if (mem_may_pin <= 0) { c->btree_cache.pinned_nodes_end = *end = BBPOS(btree, b->key.k.p); - bch2_trans_iter_exit(trans, &iter); - return 0; + break; } - } - bch2_trans_iter_exit(trans, &iter); + 0; + })); } return ret; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index aa8a049071f410..2e84d22e17bdd6 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1900,6 +1900,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) goto out; } +/* Only kept for -tools */ struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter) { struct btree *b; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index c7725865309c09..dca62375d7d30f 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -600,23 +600,35 @@ void bch2_trans_srcu_unlock(struct btree_trans *); u32 bch2_trans_begin(struct btree_trans *); -/* - * XXX - * this does not handle transaction restarts from bch2_btree_iter_next_node() - * correctly - */ -#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ - _locks_want, _depth, _flags, _b, _ret) \ - for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \ - _start, _locks_want, _depth, _flags); \ - (_b) = bch2_btree_iter_peek_node_and_restart(&(_iter)), \ - !((_ret) = PTR_ERR_OR_ZERO(_b)) && (_b); \ - (_b) = bch2_btree_iter_next_node(&(_iter))) +#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ + _locks_want, _depth, _flags, _b, _do) \ +({ \ + bch2_trans_begin((_trans)); \ + \ + struct btree_iter _iter; \ + bch2_trans_node_iter_init((_trans), &_iter, (_btree_id), \ + _start, _locks_want, _depth, _flags); \ + int _ret3 = 0; \ + do { \ + _ret3 = lockrestart_do((_trans), ({ \ + struct btree *_b = bch2_btree_iter_peek_node(&_iter); \ + if (!_b) \ + break; \ + \ + PTR_ERR_OR_ZERO(_b) ?: (_do); \ + })) ?: \ + lockrestart_do((_trans), \ + PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(&_iter))); \ + } while (!_ret3); \ + \ + bch2_trans_iter_exit((_trans), &(_iter)); \ + _ret3; \ +}) #define for_each_btree_node(_trans, _iter, _btree_id, _start, \ - _flags, _b, _ret) \ - __for_each_btree_node(_trans, _iter, _btree_id, _start, \ - 0, 0, _flags, _b, _ret) + _flags, _b, _do) \ + __for_each_btree_node(_trans, _iter, _btree_id, _start, \ + 0, 0, _flags, _b, _do) static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, unsigned flags) diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index ebabab171fe5ea..45aec1afdb0e31 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -397,47 +397,27 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; - struct btree_trans *trans; - struct btree_iter iter; - struct btree *b; - ssize_t ret; i->ubuf = buf; i->size = size; i->ret = 0; - ret = flush_buf(i); + ssize_t ret = flush_buf(i); if (ret) return ret; if (bpos_eq(SPOS_MAX, i->from)) return i->ret; - trans = bch2_trans_get(i->c); -retry: - bch2_trans_begin(trans); - - for_each_btree_node(trans, iter, i->id, i->from, 0, b, ret) { - bch2_btree_node_to_text(&i->buf, i->c, b); - i->from = !bpos_eq(SPOS_MAX, b->key.k.p) - ? bpos_successor(b->key.k.p) - : b->key.k.p; - - ret = drop_locks_do(trans, flush_buf(i)); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_put(trans); - - if (!ret) - ret = flush_buf(i); + return bch2_trans_run(i->c, + for_each_btree_node(trans, iter, i->id, i->from, 0, b, ({ + bch2_btree_node_to_text(&i->buf, i->c, b); + i->from = !bpos_eq(SPOS_MAX, b->key.k.p) + ? bpos_successor(b->key.k.p) + : b->key.k.p; - return ret ?: i->ret; + drop_locks_do(trans, flush_buf(i)); + }))) ?: i->ret; } static const struct file_operations btree_format_debug_ops = { From b2f11c6f3e1fc60742673b8675c95b78447f3dae Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 Aug 2024 21:04:35 -0400 Subject: [PATCH 301/991] lib/generic-radix-tree.c: Fix rare race in __genradix_ptr_alloc() If we need to increase the tree depth, allocate a new node, and then race with another thread that increased the tree depth before us, we'll still have a preallocated node that might be used later. If we then use that node for a new non-root node, it'll still have a pointer to the old root instead of being zeroed - fix this by zeroing it in the cmpxchg failure path. Signed-off-by: Kent Overstreet --- lib/generic-radix-tree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c index aaefb9b678c8e4..fa692c86f06969 100644 --- a/lib/generic-radix-tree.c +++ b/lib/generic-radix-tree.c @@ -121,6 +121,8 @@ void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset, if ((v = cmpxchg_release(&radix->root, r, new_root)) == r) { v = new_root; new_node = NULL; + } else { + new_node->children[0] = NULL; } } From 7254555c440ff6b136aa97fb3c33fd5e0bb4fb9f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 Aug 2024 14:40:09 -0400 Subject: [PATCH 302/991] bcachefs: Add hysteresis to waiting on btree key cache flush This helps ensure key cache reclaim isn't contending with threads waiting for the key cache to be helped, and fixes a severe performance bug. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.h | 9 +++++++++ fs/bcachefs/btree_trans_commit.c | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index e6b2cd0dd2c1af..113309f6291868 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -20,6 +20,15 @@ static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) return nr_dirty > max_dirty; } +static inline bool bch2_btree_key_cache_wait_done(struct bch_fs *c) +{ + size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); + size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); + size_t max_dirty = 2048 + (nr_keys * 5) / 8; + + return nr_dirty <= max_dirty; +} + int bch2_btree_key_cache_journal_flush(struct journal *, struct journal_entry_pin *, u64); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index cca336fe46e9b4..f567bfb828508a 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -927,7 +927,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags static int journal_reclaim_wait_done(struct bch_fs *c) { int ret = bch2_journal_error(&c->journal) ?: - !bch2_btree_key_cache_must_wait(c); + bch2_btree_key_cache_wait_done(c); if (!ret) journal_reclaim_kick(&c->journal); From 790666c8ac6427ddaa00f502dc44073c1e039355 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 Aug 2024 14:31:17 -0400 Subject: [PATCH 303/991] bcachefs: Improve trans_blocked_journal_reclaim tracepoint include information about the state of the btree key cache Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.h | 9 +++++++-- fs/bcachefs/trace.c | 1 + fs/bcachefs/trace.h | 27 +++++++++++++++++++++++++-- 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index 113309f6291868..51d6289b8dee32 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -11,13 +11,18 @@ static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) return max_t(ssize_t, 0, nr_dirty - max_dirty); } -static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) +static inline ssize_t __bch2_btree_key_cache_must_wait(struct bch_fs *c) { size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); size_t max_dirty = 4096 + (nr_keys * 3) / 4; - return nr_dirty > max_dirty; + return nr_dirty - max_dirty; +} + +static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) +{ + return __bch2_btree_key_cache_must_wait(c) > 0; } static inline bool bch2_btree_key_cache_wait_done(struct bch_fs *c) diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c index dc48b52b01b49c..dfad1d06633ddb 100644 --- a/fs/bcachefs/trace.c +++ b/fs/bcachefs/trace.c @@ -4,6 +4,7 @@ #include "buckets.h" #include "btree_cache.h" #include "btree_iter.h" +#include "btree_key_cache.h" #include "btree_locking.h" #include "btree_update_interior.h" #include "keylist.h" diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index d0e6b9deb6cb4d..c62f00322d1ed6 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -988,10 +988,33 @@ TRACE_EVENT(trans_restart_split_race, __entry->u64s_remaining) ); -DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim, +TRACE_EVENT(trans_blocked_journal_reclaim, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), - TP_ARGS(trans, caller_ip) + TP_ARGS(trans, caller_ip), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + + __field(unsigned long, key_cache_nr_keys ) + __field(unsigned long, key_cache_nr_dirty ) + __field(long, must_wait ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->key_cache_nr_keys = atomic_long_read(&trans->c->btree_key_cache.nr_keys); + __entry->key_cache_nr_dirty = atomic_long_read(&trans->c->btree_key_cache.nr_dirty); + __entry->must_wait = __bch2_btree_key_cache_must_wait(trans->c); + ), + + TP_printk("%s %pS key cache keys %lu dirty %lu must_wait %li", + __entry->trans_fn, (void *) __entry->caller_ip, + __entry->key_cache_nr_keys, + __entry->key_cache_nr_dirty, + __entry->must_wait) ); TRACE_EVENT(trans_restart_journal_preres_get, From 06a8693b890c0cf7d94bf7c6f0e2adf3a3aaa346 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 Aug 2024 15:48:18 -0400 Subject: [PATCH 304/991] bcachefs: Add a time_stat for blocked on key cache flush Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/btree_trans_commit.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index eedf2d6045e73f..0c7086e00d18f5 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -447,6 +447,7 @@ BCH_DEBUG_PARAMS_DEBUG() x(blocked_journal_low_on_space) \ x(blocked_journal_low_on_pin) \ x(blocked_journal_max_in_flight) \ + x(blocked_key_cache_flush) \ x(blocked_allocate) \ x(blocked_allocate_open_bucket) \ x(blocked_write_buffer_full) \ diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index f567bfb828508a..ac0c92683aad4b 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -973,9 +973,13 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, bch2_trans_unlock(trans); trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip); + track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], true); wait_event_freezable(c->journal.reclaim_wait, (ret = journal_reclaim_wait_done(c))); + + track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], false); + if (ret < 0) break; From c99471024f24b3cbafc02bf5b112ecf34b0dbd40 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Aug 2024 23:29:46 -0400 Subject: [PATCH 305/991] bcachefs: Fix warning in __bch2_fsck_err() for trans not passed in Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 2c424435ca4aaa..70ebcca08ba2cf 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1767,6 +1767,8 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, set_btree_node_read_in_flight(b); + /* we can't pass the trans to read_done() for fsck errors, so it must be unlocked */ + bch2_trans_unlock(trans); bch2_btree_node_read(trans, b, true); if (btree_node_read_error(b)) { From d97de0d017cde0d442c3d144b4f969f43064cc0f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Aug 2024 21:31:25 -0400 Subject: [PATCH 306/991] bcachefs: Make bkey_fsck_err() a wrapper around fsck_err() bkey_fsck_err() was added as an interface that looks like fsck_err(), but previously all it did was ensure that the appropriate error counter was incremented in the superblock. This is a cleanup and bugfix patch that converts it to a wrapper around fsck_err(). This is needed to fix an issue with the upgrade path to disk_accounting_v3, where the "silent fix" error list now includes bkey_fsck errors; fsck_err() handles this in a unified way, and since we need to change printing of bkey fsck errors from the caller to the inner bkey_fsck_err() calls, this ends up being a pretty big change. Als,, rename .invalid() methods to .validate(), for clarity, while we're changing the function signature anyways (to drop the printbuf argument). Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 63 ++++++------ fs/bcachefs/alloc_background.h | 26 +++-- fs/bcachefs/backpointers.c | 8 +- fs/bcachefs/backpointers.h | 5 +- fs/bcachefs/bkey.h | 7 +- fs/bcachefs/bkey_methods.c | 109 ++++++++++----------- fs/bcachefs/bkey_methods.h | 21 ++-- fs/bcachefs/btree_io.c | 67 ++++--------- fs/bcachefs/btree_node_scan.c | 2 +- fs/bcachefs/btree_trans_commit.c | 72 +++----------- fs/bcachefs/btree_update_interior.c | 16 +--- fs/bcachefs/data_update.c | 6 +- fs/bcachefs/dirent.c | 33 ++++--- fs/bcachefs/dirent.h | 5 +- fs/bcachefs/disk_accounting.c | 13 ++- fs/bcachefs/disk_accounting.h | 5 +- fs/bcachefs/ec.c | 15 ++- fs/bcachefs/ec.h | 5 +- fs/bcachefs/errcode.h | 1 + fs/bcachefs/error.c | 22 +++++ fs/bcachefs/error.h | 39 ++++---- fs/bcachefs/extents.c | 144 ++++++++++++++-------------- fs/bcachefs/extents.h | 24 ++--- fs/bcachefs/inode.c | 77 +++++++-------- fs/bcachefs/inode.h | 24 ++--- fs/bcachefs/journal_io.c | 24 +---- fs/bcachefs/lru.c | 9 +- fs/bcachefs/lru.h | 5 +- fs/bcachefs/quota.c | 8 +- fs/bcachefs/quota.h | 5 +- fs/bcachefs/reflink.c | 19 ++-- fs/bcachefs/reflink.h | 22 ++--- fs/bcachefs/snapshot.c | 42 ++++---- fs/bcachefs/snapshot.h | 11 +-- fs/bcachefs/subvolume.c | 16 ++-- fs/bcachefs/subvolume.h | 5 +- fs/bcachefs/xattr.c | 21 ++-- fs/bcachefs/xattr.h | 5 +- 38 files changed, 448 insertions(+), 553 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 0a8a1bc9a4ac04..fd3a2522bc3ed3 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -196,75 +196,71 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) return DIV_ROUND_UP(bytes, sizeof(u64)); } -int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); int ret = 0; /* allow for unknown fields */ - bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), c, err, - alloc_v1_val_size_bad, + bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), + c, alloc_v1_val_size_bad, "incorrect value size (%zu < %u)", bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v)); fsck_err: return ret; } -int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_alloc_unpacked u; int ret = 0; - bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), c, err, - alloc_v2_unpack_error, + bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), + c, alloc_v2_unpack_error, "unpack error"); fsck_err: return ret; } -int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_alloc_unpacked u; int ret = 0; - bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), c, err, - alloc_v2_unpack_error, + bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), + c, alloc_v2_unpack_error, "unpack error"); fsck_err: return ret; } -int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err) +int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); int ret = 0; - bkey_fsck_err_on(alloc_v4_u64s_noerror(a.v) > bkey_val_u64s(k.k), c, err, - alloc_v4_val_size_bad, + bkey_fsck_err_on(alloc_v4_u64s_noerror(a.v) > bkey_val_u64s(k.k), + c, alloc_v4_val_size_bad, "bad val size (%u > %zu)", alloc_v4_u64s_noerror(a.v), bkey_val_u64s(k.k)); bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) && - BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err, - alloc_v4_backpointers_start_bad, + BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), + c, alloc_v4_backpointers_start_bad, "invalid backpointers_start"); - bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, c, err, - alloc_key_data_type_bad, + bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, + c, alloc_key_data_type_bad, "invalid data type (got %u should be %u)", a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); for (unsigned i = 0; i < 2; i++) bkey_fsck_err_on(a.v->io_time[i] > LRU_TIME_MAX, - c, err, - alloc_key_io_time_bad, + c, alloc_key_io_time_bad, "invalid io_time[%s]: %llu, max %llu", i == READ ? "read" : "write", a.v->io_time[i], LRU_TIME_MAX); @@ -282,7 +278,7 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, a.v->dirty_sectors || a.v->cached_sectors || a.v->stripe, - c, err, alloc_key_empty_but_have_data, + c, alloc_key_empty_but_have_data, "empty data type free but have data %u.%u.%u %u", stripe_sectors, a.v->dirty_sectors, @@ -296,7 +292,7 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, case BCH_DATA_parity: bkey_fsck_err_on(!a.v->dirty_sectors && !stripe_sectors, - c, err, alloc_key_dirty_sectors_0, + c, alloc_key_dirty_sectors_0, "data_type %s but dirty_sectors==0", bch2_data_type_str(a.v->data_type)); break; @@ -305,12 +301,12 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, a.v->dirty_sectors || stripe_sectors || a.v->stripe, - c, err, alloc_key_cached_inconsistency, + c, alloc_key_cached_inconsistency, "data type inconsistency"); bkey_fsck_err_on(!a.v->io_time[READ] && c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, - c, err, alloc_key_cached_but_read_time_zero, + c, alloc_key_cached_but_read_time_zero, "cached bucket with read_time == 0"); break; case BCH_DATA_stripe: @@ -513,14 +509,13 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) : 0; } -int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_bucket_gens_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { int ret = 0; - bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), c, err, - bucket_gens_val_size_bad, + bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), + c, bucket_gens_val_size_bad, "bad val size (%zu != %zu)", bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens)); fsck_err: diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 96a0444ea78ff9..260e7fa83d0518 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -240,52 +240,48 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); -int bch2_alloc_v1_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -int bch2_alloc_v2_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -int bch2_alloc_v3_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -int bch2_alloc_v4_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_alloc_v4_swab(struct bkey_s); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc ((struct bkey_ops) { \ - .key_invalid = bch2_alloc_v1_invalid, \ + .key_validate = bch2_alloc_v1_validate, \ .val_to_text = bch2_alloc_to_text, \ .trigger = bch2_trigger_alloc, \ .min_val_size = 8, \ }) #define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \ - .key_invalid = bch2_alloc_v2_invalid, \ + .key_validate = bch2_alloc_v2_validate, \ .val_to_text = bch2_alloc_to_text, \ .trigger = bch2_trigger_alloc, \ .min_val_size = 8, \ }) #define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \ - .key_invalid = bch2_alloc_v3_invalid, \ + .key_validate = bch2_alloc_v3_validate, \ .val_to_text = bch2_alloc_to_text, \ .trigger = bch2_trigger_alloc, \ .min_val_size = 16, \ }) #define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \ - .key_invalid = bch2_alloc_v4_invalid, \ + .key_validate = bch2_alloc_v4_validate, \ .val_to_text = bch2_alloc_to_text, \ .swab = bch2_alloc_v4_swab, \ .trigger = bch2_trigger_alloc, \ .min_val_size = 48, \ }) -int bch2_bucket_gens_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_bucket_gens_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \ - .key_invalid = bch2_bucket_gens_invalid, \ + .key_validate = bch2_bucket_gens_validate, \ .val_to_text = bch2_bucket_gens_to_text, \ }) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 9edc4c5f735c57..d4da6343efa9bd 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -47,9 +47,8 @@ static bool extent_matches_bp(struct bch_fs *c, return false; } -int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); @@ -68,8 +67,7 @@ int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k, bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size || !bpos_eq(bp.k->p, bp_pos), - c, err, - backpointer_bucket_offset_wrong, + c, backpointer_bucket_offset_wrong, "backpointer bucket_offset wrong"); fsck_err: return ret; diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 6021de1c5e98de..7daecadb764e3e 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -18,14 +18,13 @@ static inline u64 swab40(u64 x) ((x & 0xff00000000ULL) >> 32)); } -int bch2_backpointer_invalid(struct bch_fs *, struct bkey_s_c k, - enum bch_validate_flags, struct printbuf *); +int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k, enum bch_validate_flags); void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_backpointer_swab(struct bkey_s); #define bch2_bkey_ops_backpointer ((struct bkey_ops) { \ - .key_invalid = bch2_backpointer_invalid, \ + .key_validate = bch2_backpointer_validate, \ .val_to_text = bch2_backpointer_k_to_text, \ .swab = bch2_backpointer_swab, \ .min_val_size = 32, \ diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 936357149cf0fb..e34cb2bf329c54 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -10,9 +10,10 @@ #include "vstructs.h" enum bch_validate_flags { - BCH_VALIDATE_write = (1U << 0), - BCH_VALIDATE_commit = (1U << 1), - BCH_VALIDATE_journal = (1U << 2), + BCH_VALIDATE_write = BIT(0), + BCH_VALIDATE_commit = BIT(1), + BCH_VALIDATE_journal = BIT(2), + BCH_VALIDATE_silent = BIT(3), }; #if 0 diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 5f07cf853d0c75..88d8958281e80f 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -27,27 +27,27 @@ const char * const bch2_bkey_types[] = { NULL }; -static int deleted_key_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err) +static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { return 0; } #define bch2_bkey_ops_deleted ((struct bkey_ops) { \ - .key_invalid = deleted_key_invalid, \ + .key_validate = deleted_key_validate, \ }) #define bch2_bkey_ops_whiteout ((struct bkey_ops) { \ - .key_invalid = deleted_key_invalid, \ + .key_validate = deleted_key_validate, \ }) -static int empty_val_key_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err) +static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { int ret = 0; - bkey_fsck_err_on(bkey_val_bytes(k.k), c, err, - bkey_val_size_nonzero, + bkey_fsck_err_on(bkey_val_bytes(k.k), + c, bkey_val_size_nonzero, "incorrect value size (%zu != 0)", bkey_val_bytes(k.k)); fsck_err: @@ -55,11 +55,11 @@ static int empty_val_key_invalid(struct bch_fs *c, struct bkey_s_c k, } #define bch2_bkey_ops_error ((struct bkey_ops) { \ - .key_invalid = empty_val_key_invalid, \ + .key_validate = empty_val_key_validate, \ }) -static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err) +static int key_type_cookie_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { return 0; } @@ -73,17 +73,17 @@ static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c, } #define bch2_bkey_ops_cookie ((struct bkey_ops) { \ - .key_invalid = key_type_cookie_invalid, \ + .key_validate = key_type_cookie_validate, \ .val_to_text = key_type_cookie_to_text, \ .min_val_size = 8, \ }) #define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\ - .key_invalid = empty_val_key_invalid, \ + .key_validate = empty_val_key_validate, \ }) -static int key_type_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err) +static int key_type_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { return 0; } @@ -98,9 +98,9 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, datalen, min(datalen, 32U), d.v->data); } -#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \ - .key_invalid = key_type_inline_data_invalid, \ - .val_to_text = key_type_inline_data_to_text, \ +#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \ + .key_validate = key_type_inline_data_validate, \ + .val_to_text = key_type_inline_data_to_text, \ }) static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) @@ -110,7 +110,7 @@ static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_ } #define bch2_bkey_ops_set ((struct bkey_ops) { \ - .key_invalid = empty_val_key_invalid, \ + .key_validate = empty_val_key_validate, \ .key_merge = key_type_set_merge, \ }) @@ -123,9 +123,8 @@ const struct bkey_ops bch2_bkey_ops[] = { const struct bkey_ops bch2_bkey_null_ops = { }; -int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) return 0; @@ -133,15 +132,15 @@ int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); int ret = 0; - bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size, c, err, - bkey_val_size_too_small, + bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size, + c, bkey_val_size_too_small, "bad val size (%zu < %u)", bkey_val_bytes(k.k), ops->min_val_size); - if (!ops->key_invalid) + if (!ops->key_validate) return 0; - ret = ops->key_invalid(c, k, flags, err); + ret = ops->key_validate(c, k, flags); fsck_err: return ret; } @@ -161,18 +160,17 @@ const char *bch2_btree_node_type_str(enum btree_node_type type) return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1); } -int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, - enum btree_node_type type, - enum bch_validate_flags flags, - struct printbuf *err) +int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, + enum btree_node_type type, + enum bch_validate_flags flags) { if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) return 0; int ret = 0; - bkey_fsck_err_on(k.k->u64s < BKEY_U64s, c, err, - bkey_u64s_too_small, + bkey_fsck_err_on(k.k->u64s < BKEY_U64s, + c, bkey_u64s_too_small, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); if (type >= BKEY_TYPE_NR) @@ -180,8 +178,8 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX && (type == BKEY_TYPE_btree || (flags & BCH_VALIDATE_commit)) && - !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err, - bkey_invalid_type_for_btree, + !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), + c, bkey_invalid_type_for_btree, "invalid key type for btree %s (%s)", bch2_btree_node_type_str(type), k.k->type < KEY_TYPE_MAX @@ -189,17 +187,17 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, : "(unknown)"); if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { - bkey_fsck_err_on(k.k->size == 0, c, err, - bkey_extent_size_zero, + bkey_fsck_err_on(k.k->size == 0, + c, bkey_extent_size_zero, "size == 0"); - bkey_fsck_err_on(k.k->size > k.k->p.offset, c, err, - bkey_extent_size_greater_than_offset, + bkey_fsck_err_on(k.k->size > k.k->p.offset, + c, bkey_extent_size_greater_than_offset, "size greater than offset (%u > %llu)", k.k->size, k.k->p.offset); } else { - bkey_fsck_err_on(k.k->size, c, err, - bkey_size_nonzero, + bkey_fsck_err_on(k.k->size, + c, bkey_size_nonzero, "size != 0"); } @@ -207,12 +205,12 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_id btree = type - 1; if (btree_type_has_snapshots(btree)) { - bkey_fsck_err_on(!k.k->p.snapshot, c, err, - bkey_snapshot_zero, + bkey_fsck_err_on(!k.k->p.snapshot, + c, bkey_snapshot_zero, "snapshot == 0"); } else if (!btree_type_has_snapshot_field(btree)) { - bkey_fsck_err_on(k.k->p.snapshot, c, err, - bkey_snapshot_nonzero, + bkey_fsck_err_on(k.k->p.snapshot, + c, bkey_snapshot_nonzero, "nonzero snapshot"); } else { /* @@ -221,34 +219,33 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, */ } - bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), c, err, - bkey_at_pos_max, + bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), + c, bkey_at_pos_max, "key at POS_MAX"); } fsck_err: return ret; } -int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, +int bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, enum btree_node_type type, - enum bch_validate_flags flags, - struct printbuf *err) + enum bch_validate_flags flags) { - return __bch2_bkey_invalid(c, k, type, flags, err) ?: - bch2_bkey_val_invalid(c, k, flags, err); + return __bch2_bkey_validate(c, k, type, flags) ?: + bch2_bkey_val_validate(c, k, flags); } int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b, - struct bkey_s_c k, struct printbuf *err) + struct bkey_s_c k, enum bch_validate_flags flags) { int ret = 0; - bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key), c, err, - bkey_before_start_of_btree_node, + bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key), + c, bkey_before_start_of_btree_node, "key before start of btree node"); - bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key), c, err, - bkey_after_end_of_btree_node, + bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key), + c, bkey_after_end_of_btree_node, "key past end of btree node"); fsck_err: return ret; diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index baef0722f5fb60..3df3dd2723a128 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -14,15 +14,15 @@ extern const char * const bch2_bkey_types[]; extern const struct bkey_ops bch2_bkey_null_ops; /* - * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If + * key_validate: checks validity of @k, returns 0 if good or -EINVAL if bad. If * invalid, entire key will be deleted. * * When invalid, error string is returned via @err. @rw indicates whether key is * being read or written; more aggressive checks can be enabled when rw == WRITE. */ struct bkey_ops { - int (*key_invalid)(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err); + int (*key_validate)(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags); void (*val_to_text)(struct printbuf *, struct bch_fs *, struct bkey_s_c); void (*swab)(struct bkey_s); @@ -48,14 +48,13 @@ static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type) : &bch2_bkey_null_ops; } -int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, - enum bch_validate_flags, struct printbuf *); -int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, - enum bch_validate_flags, struct printbuf *); -int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, - struct bkey_s_c, struct printbuf *); +int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, enum btree_node_type, + enum bch_validate_flags); +int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, enum btree_node_type, + enum bch_validate_flags); +int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, struct bkey_s_c, + enum bch_validate_flags); void bch2_bpos_to_text(struct printbuf *, struct bpos); void bch2_bkey_to_text(struct printbuf *, const struct bkey *); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 70ebcca08ba2cf..56ea9a77cd4afd 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -836,14 +836,13 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, return ret; } -static int bset_key_invalid(struct bch_fs *c, struct btree *b, - struct bkey_s_c k, - bool updated_range, int rw, - struct printbuf *err) +static int bset_key_validate(struct bch_fs *c, struct btree *b, + struct bkey_s_c k, + bool updated_range, int rw) { - return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?: - (!updated_range ? bch2_bkey_in_btree_node(c, b, k, err) : 0) ?: - (rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0); + return __bch2_bkey_validate(c, k, btree_node_type(b), 0) ?: + (!updated_range ? bch2_bkey_in_btree_node(c, b, k, 0) : 0) ?: + (rw == WRITE ? bch2_bkey_val_validate(c, k, 0) : 0); } static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, @@ -858,12 +857,9 @@ static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, if (!bkeyp_u64s_valid(&b->format, k)) return false; - struct printbuf buf = PRINTBUF; struct bkey tmp; struct bkey_s u = __bkey_disassemble(b, k, &tmp); - bool ret = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b), READ, &buf); - printbuf_exit(&buf); - return ret; + return !__bch2_bkey_validate(c, u.s_c, btree_node_type(b), BCH_VALIDATE_silent); } static int validate_bset_keys(struct bch_fs *c, struct btree *b, @@ -915,19 +911,11 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, u = __bkey_disassemble(b, k, &tmp); - printbuf_reset(&buf); - if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) { - printbuf_reset(&buf); - bset_key_invalid(c, b, u.s_c, updated_range, write, &buf); - prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, u.s_c); - - btree_err(-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, k, - btree_node_bad_bkey, - "invalid bkey: %s", buf.buf); + ret = bset_key_validate(c, b, u.s_c, updated_range, write); + if (ret == -BCH_ERR_fsck_delete_bkey) goto drop_this_key; - } + if (ret) + goto fsck_err; if (write) bch2_bkey_compat(b->c.level, b->c.btree_id, version, @@ -1228,23 +1216,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, struct bkey tmp; struct bkey_s u = __bkey_disassemble(b, k, &tmp); - printbuf_reset(&buf); - - if (bch2_bkey_val_invalid(c, u.s_c, READ, &buf) || + ret = bch2_bkey_val_validate(c, u.s_c, READ); + if (ret == -BCH_ERR_fsck_delete_bkey || (bch2_inject_invalid_keys && !bversion_cmp(u.k->version, MAX_VERSION))) { - printbuf_reset(&buf); - - prt_printf(&buf, "invalid bkey: "); - bch2_bkey_val_invalid(c, u.s_c, READ, &buf); - prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, u.s_c); - - btree_err(-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, k, - btree_node_bad_bkey, - "%s", buf.buf); - btree_keys_account_key_drop(&b->nr, 0, k); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); @@ -1253,6 +1228,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, set_btree_bset_end(b, b->set); continue; } + if (ret) + goto fsck_err; if (u.k->type == KEY_TYPE_btree_ptr_v2) { struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u); @@ -1954,18 +1931,14 @@ static void btree_node_write_endio(struct bio *bio) static int validate_bset_for_write(struct bch_fs *c, struct btree *b, struct bset *i, unsigned sectors) { - struct printbuf buf = PRINTBUF; bool saw_error; - int ret; - - ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), - BKEY_TYPE_btree, WRITE, &buf); - if (ret) - bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf); - printbuf_exit(&buf); - if (ret) + int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key), + BKEY_TYPE_btree, WRITE); + if (ret) { + bch2_fs_inconsistent(c, "invalid btree node key before write"); return ret; + } ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?: validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error); diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 0011072263778b..b28c649c68389f 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -530,7 +530,7 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); printbuf_exit(&buf); - BUG_ON(bch2_bkey_invalid(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0, NULL)); + BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0)); ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k); if (ret) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index ac0c92683aad4b..1a1e9d2036da7a 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -818,50 +818,6 @@ static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); } -static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, - enum bch_validate_flags flags, - struct btree_insert_entry *i, - struct printbuf *err) -{ - struct bch_fs *c = trans->c; - - printbuf_reset(err); - prt_printf(err, "invalid bkey on insert from %s -> %ps\n", - trans->fn, (void *) i->ip_allocated); - printbuf_indent_add(err, 2); - - bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k)); - prt_newline(err); - - bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, flags, err); - bch2_print_string_as_lines(KERN_ERR, err->buf); - - bch2_inconsistent_error(c); - bch2_dump_trans_updates(trans); - - return -EINVAL; -} - -static noinline int bch2_trans_commit_journal_entry_invalid(struct btree_trans *trans, - struct jset_entry *i) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "invalid bkey on insert from %s\n", trans->fn); - printbuf_indent_add(&buf, 2); - - bch2_journal_entry_to_text(&buf, c, i); - prt_newline(&buf); - - bch2_print_string_as_lines(KERN_ERR, buf.buf); - - bch2_inconsistent_error(c); - bch2_dump_trans_updates(trans); - - return -EINVAL; -} - static int bch2_trans_commit_journal_pin_flush(struct journal *j, struct journal_entry_pin *_pin, u64 seq) { @@ -1064,20 +1020,19 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) goto out_reset; trans_for_each_update(trans, i) { - struct printbuf buf = PRINTBUF; enum bch_validate_flags invalid_flags = 0; if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; - if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), - i->bkey_type, invalid_flags, &buf))) - ret = bch2_trans_commit_bkey_invalid(trans, invalid_flags, i, &buf); - btree_insert_entry_checks(trans, i); - printbuf_exit(&buf); - - if (ret) + ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), + i->bkey_type, invalid_flags); + if (unlikely(ret)){ + bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n", + trans->fn, (void *) i->ip_allocated); return ret; + } + btree_insert_entry_checks(trans, i); } for (struct jset_entry *i = trans->journal_entries; @@ -1088,13 +1043,14 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; - if (unlikely(bch2_journal_entry_validate(c, NULL, i, - bcachefs_metadata_version_current, - CPU_BIG_ENDIAN, invalid_flags))) - ret = bch2_trans_commit_journal_entry_invalid(trans, i); - - if (ret) + ret = bch2_journal_entry_validate(c, NULL, i, + bcachefs_metadata_version_current, + CPU_BIG_ENDIAN, invalid_flags); + if (unlikely(ret)) { + bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n", + trans->fn); return ret; + } } if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) { diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index e61f9695771e1e..b3454d4619e8fb 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1364,18 +1364,10 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags))) bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); - if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert), - btree_node_type(b), WRITE, &buf) ?: - bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf)) { - printbuf_reset(&buf); - prt_printf(&buf, "inserting invalid bkey\n "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - prt_printf(&buf, "\n "); - bch2_bkey_invalid(c, bkey_i_to_s_c(insert), - btree_node_type(b), WRITE, &buf); - bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf); - - bch2_fs_inconsistent(c, "%s", buf.buf); + if (bch2_bkey_validate(c, bkey_i_to_s_c(insert), + btree_node_type(b), BCH_VALIDATE_write) ?: + bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), BCH_VALIDATE_write)) { + bch2_fs_inconsistent(c, "%s: inserting invalid bkey", __func__); dump_stack(); } diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 0087b8555ead3c..6a854c9184965e 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -250,10 +250,8 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, * it's been hard to reproduce, so this should give us some more * information when it does occur: */ - struct printbuf err = PRINTBUF; - int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err); - printbuf_exit(&err); - + int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), + BCH_VALIDATE_commit); if (invalid) { struct printbuf buf = PRINTBUF; diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index d743da89308ef7..32bfdf19289a9d 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -100,20 +100,19 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { .is_visible = dirent_is_visible, }; -int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); struct qstr d_name = bch2_dirent_get_name(d); int ret = 0; - bkey_fsck_err_on(!d_name.len, c, err, - dirent_empty_name, + bkey_fsck_err_on(!d_name.len, + c, dirent_empty_name, "empty name"); - bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len), c, err, - dirent_val_too_big, + bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len), + c, dirent_val_too_big, "value too big (%zu > %u)", bkey_val_u64s(k.k), dirent_val_u64s(d_name.len)); @@ -121,27 +120,27 @@ int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k, * Check new keys don't exceed the max length * (older keys may be larger.) */ - bkey_fsck_err_on((flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, c, err, - dirent_name_too_long, + bkey_fsck_err_on((flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, + c, dirent_name_too_long, "dirent name too big (%u > %u)", d_name.len, BCH_NAME_MAX); - bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len), c, err, - dirent_name_embedded_nul, + bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len), + c, dirent_name_embedded_nul, "dirent has stray data after name's NUL"); bkey_fsck_err_on((d_name.len == 1 && !memcmp(d_name.name, ".", 1)) || - (d_name.len == 2 && !memcmp(d_name.name, "..", 2)), c, err, - dirent_name_dot_or_dotdot, + (d_name.len == 2 && !memcmp(d_name.name, "..", 2)), + c, dirent_name_dot_or_dotdot, "invalid name"); - bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len), c, err, - dirent_name_has_slash, + bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len), + c, dirent_name_has_slash, "name with /"); bkey_fsck_err_on(d.v->d_type != DT_SUBVOL && - le64_to_cpu(d.v->d_inum) == d.k->p.inode, c, err, - dirent_to_itself, + le64_to_cpu(d.v->d_inum) == d.k->p.inode, + c, dirent_to_itself, "dirent points to own directory"); fsck_err: return ret; diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 24037e6e0a0948..8945145865c53b 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -7,12 +7,11 @@ enum bch_validate_flags; extern const struct bch_hash_desc bch2_dirent_hash_desc; -int bch2_dirent_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_dirent ((struct bkey_ops) { \ - .key_invalid = bch2_dirent_invalid, \ + .key_validate = bch2_dirent_validate, \ .val_to_text = bch2_dirent_to_text, \ .min_val_size = 16, \ }) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 03a9de6c2e0a04..f059cbffdf2307 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -126,9 +126,8 @@ static inline bool is_zero(char *start, char *end) #define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member)) -int bch2_accounting_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, k.k->p); @@ -144,18 +143,18 @@ int bch2_accounting_invalid(struct bch_fs *c, struct bkey_s_c k, break; case BCH_DISK_ACCOUNTING_replicas: bkey_fsck_err_on(!acc_k.replicas.nr_devs, - c, err, accounting_key_replicas_nr_devs_0, + c, accounting_key_replicas_nr_devs_0, "accounting key replicas entry with nr_devs=0"); bkey_fsck_err_on(acc_k.replicas.nr_required > acc_k.replicas.nr_devs || (acc_k.replicas.nr_required > 1 && acc_k.replicas.nr_required == acc_k.replicas.nr_devs), - c, err, accounting_key_replicas_nr_required_bad, + c, accounting_key_replicas_nr_required_bad, "accounting key replicas entry with bad nr_required"); for (unsigned i = 0; i + 1 < acc_k.replicas.nr_devs; i++) bkey_fsck_err_on(acc_k.replicas.devs[i] >= acc_k.replicas.devs[i + 1], - c, err, accounting_key_replicas_devs_unsorted, + c, accounting_key_replicas_devs_unsorted, "accounting key replicas entry with unsorted devs"); end = (void *) &acc_k.replicas + replicas_entry_bytes(&acc_k.replicas); @@ -178,7 +177,7 @@ int bch2_accounting_invalid(struct bch_fs *c, struct bkey_s_c k, } bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)), - c, err, accounting_key_junk_at_end, + c, accounting_key_junk_at_end, "junk at end of accounting key"); fsck_err: return ret; diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 3d3f25e08b6961..b92f8c2e305411 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -82,14 +82,13 @@ int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *, s64 *, unsigned, bool); int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool); -int bch2_accounting_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *); void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_accounting_swab(struct bkey_s); #define bch2_bkey_ops_accounting ((struct bkey_ops) { \ - .key_invalid = bch2_accounting_invalid, \ + .key_validate = bch2_accounting_validate, \ .val_to_text = bch2_accounting_to_text, \ .swab = bch2_accounting_swab, \ .min_val_size = 8, \ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 84f1cbf6497f9f..141a4c63142f5b 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -107,24 +107,23 @@ struct ec_bio { /* Stripes btree keys: */ -int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; int ret = 0; bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) || - bpos_gt(k.k->p, POS(0, U32_MAX)), c, err, - stripe_pos_bad, + bpos_gt(k.k->p, POS(0, U32_MAX)), + c, stripe_pos_bad, "stripe at bad pos"); - bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), c, err, - stripe_val_size_bad, + bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), + c, stripe_val_size_bad, "incorrect value size (%zu < %u)", bkey_val_u64s(k.k), stripe_val_u64s(s)); - ret = bch2_bkey_ptrs_invalid(c, k, flags, err); + ret = bch2_bkey_ptrs_validate(c, k, flags); fsck_err: return ret; } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 84a23eeb62495e..90962b3c013057 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -8,8 +8,7 @@ enum bch_validate_flags; -int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_stripe_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned, @@ -17,7 +16,7 @@ int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned, enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_stripe ((struct bkey_ops) { \ - .key_invalid = bch2_stripe_invalid, \ + .key_validate = bch2_stripe_validate, \ .val_to_text = bch2_stripe_to_text, \ .swab = bch2_ptr_swab, \ .trigger = bch2_trigger_stripe, \ diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index a268af3e52bfd8..ab5a7adece1042 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -166,6 +166,7 @@ x(0, journal_reclaim_would_deadlock) \ x(EINVAL, fsck) \ x(BCH_ERR_fsck, fsck_fix) \ + x(BCH_ERR_fsck, fsck_delete_bkey) \ x(BCH_ERR_fsck, fsck_ignore) \ x(BCH_ERR_fsck, fsck_errors_not_fixed) \ x(BCH_ERR_fsck, fsck_repair_unimplemented) \ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index a62b6310882003..95afa7bf20205c 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -416,6 +416,28 @@ int __bch2_fsck_err(struct bch_fs *c, return ret; } +int __bch2_bkey_fsck_err(struct bch_fs *c, + struct bkey_s_c k, + enum bch_fsck_flags flags, + enum bch_sb_error_id err, + const char *fmt, ...) +{ + struct printbuf buf = PRINTBUF; + va_list args; + + prt_str(&buf, "invalid bkey "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\n "); + va_start(args, fmt); + prt_vprintf(&buf, fmt, args); + va_end(args); + prt_str(&buf, ": delete?"); + + int ret = __bch2_fsck_err(c, NULL, flags, err, "%s", buf.buf); + printbuf_exit(&buf); + return ret; +} + void bch2_flush_fsck_errs(struct bch_fs *c) { struct fsck_err_state *s, *n; diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 995e6bba9bad80..2f1b86978f3660 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -4,6 +4,7 @@ #include #include +#include "bkey_types.h" #include "sb-errors.h" struct bch_dev; @@ -166,24 +167,30 @@ void bch2_flush_fsck_errs(struct bch_fs *); #define fsck_err_on(cond, c, _err_type, ...) \ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) -__printf(4, 0) -static inline void bch2_bkey_fsck_err(struct bch_fs *c, - struct printbuf *err_msg, - enum bch_sb_error_id err_type, - const char *fmt, ...) -{ - va_list args; +__printf(5, 6) +int __bch2_bkey_fsck_err(struct bch_fs *, + struct bkey_s_c, + enum bch_fsck_flags, + enum bch_sb_error_id, + const char *, ...); - va_start(args, fmt); - prt_vprintf(err_msg, fmt, args); - va_end(args); -} - -#define bkey_fsck_err(c, _err_msg, _err_type, ...) \ +/* + * for now, bkey fsck errors are always handled by deleting the entire key - + * this will change at some point + */ +#define bkey_fsck_err(c, _err_type, _err_msg, ...) \ do { \ - prt_printf(_err_msg, __VA_ARGS__); \ - bch2_sb_error_count(c, BCH_FSCK_ERR_##_err_type); \ - ret = -BCH_ERR_invalid_bkey; \ + if ((flags & BCH_VALIDATE_silent)) { \ + ret = -BCH_ERR_fsck_delete_bkey; \ + goto fsck_err; \ + } \ + int _ret = __bch2_bkey_fsck_err(c, k, FSCK_CAN_FIX, \ + BCH_FSCK_ERR_##_err_type, \ + _err_msg, ##__VA_ARGS__); \ + if (_ret != -BCH_ERR_fsck_fix && \ + _ret != -BCH_ERR_fsck_ignore) \ + ret = _ret; \ + ret = -BCH_ERR_fsck_delete_bkey; \ goto fsck_err; \ } while (0) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 07973198e35fb7..4419ad3e454e4f 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -171,17 +171,16 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, /* KEY_TYPE_btree_ptr: */ -int bch2_btree_ptr_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { int ret = 0; - bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, c, err, - btree_ptr_val_too_big, + bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, + c, btree_ptr_val_too_big, "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX); - ret = bch2_bkey_ptrs_invalid(c, k, flags, err); + ret = bch2_bkey_ptrs_validate(c, k, flags); fsck_err: return ret; } @@ -192,28 +191,27 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } -int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); int ret = 0; bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, - c, err, btree_ptr_v2_val_too_big, + c, btree_ptr_v2_val_too_big, "value too big (%zu > %zu)", bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p), - c, err, btree_ptr_v2_min_key_bad, + c, btree_ptr_v2_min_key_bad, "min_key > key"); if (flags & BCH_VALIDATE_write) bkey_fsck_err_on(!bp.v->sectors_written, - c, err, btree_ptr_v2_written_0, + c, btree_ptr_v2_written_0, "sectors_written == 0"); - ret = bch2_bkey_ptrs_invalid(c, k, flags, err); + ret = bch2_bkey_ptrs_validate(c, k, flags); fsck_err: return ret; } @@ -399,15 +397,14 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) /* KEY_TYPE_reservation: */ -int bch2_reservation_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); int ret = 0; - bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, c, err, - reservation_key_nr_replicas_invalid, + bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, + c, reservation_key_nr_replicas_invalid, "invalid nr_replicas (%u)", r.v->nr_replicas); fsck_err: return ret; @@ -1102,14 +1099,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, } } - -static int extent_ptr_invalid(struct bch_fs *c, - struct bkey_s_c k, - enum bch_validate_flags flags, - const struct bch_extent_ptr *ptr, - unsigned size_ondisk, - bool metadata, - struct printbuf *err) +static int extent_ptr_validate(struct bch_fs *c, + struct bkey_s_c k, + enum bch_validate_flags flags, + const struct bch_extent_ptr *ptr, + unsigned size_ondisk, + bool metadata) { int ret = 0; @@ -1128,28 +1123,27 @@ static int extent_ptr_invalid(struct bch_fs *c, struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr2) - bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, c, err, - ptr_to_duplicate_device, + bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, + c, ptr_to_duplicate_device, "multiple pointers to same device (%u)", ptr->dev); - bkey_fsck_err_on(bucket >= nbuckets, c, err, - ptr_after_last_bucket, + bkey_fsck_err_on(bucket >= nbuckets, + c, ptr_after_last_bucket, "pointer past last bucket (%llu > %llu)", bucket, nbuckets); - bkey_fsck_err_on(bucket < first_bucket, c, err, - ptr_before_first_bucket, + bkey_fsck_err_on(bucket < first_bucket, + c, ptr_before_first_bucket, "pointer before first bucket (%llu < %u)", bucket, first_bucket); - bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size, c, err, - ptr_spans_multiple_buckets, + bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size, + c, ptr_spans_multiple_buckets, "pointer spans multiple buckets (%u + %u > %u)", bucket_offset, size_ondisk, bucket_size); fsck_err: return ret; } -int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -1164,25 +1158,24 @@ int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, size_ondisk = btree_sectors(c); bkey_extent_entry_for_each(ptrs, entry) { - bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, c, err, - extent_ptrs_invalid_entry, - "invalid extent entry type (got %u, max %u)", - __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); + bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, + c, extent_ptrs_invalid_entry, + "invalid extent entry type (got %u, max %u)", + __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); bkey_fsck_err_on(bkey_is_btree_ptr(k.k) && - !extent_entry_is_ptr(entry), c, err, - btree_ptr_has_non_ptr, + !extent_entry_is_ptr(entry), + c, btree_ptr_has_non_ptr, "has non ptr field"); switch (extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: - ret = extent_ptr_invalid(c, k, flags, &entry->ptr, - size_ondisk, false, err); + ret = extent_ptr_validate(c, k, flags, &entry->ptr, size_ondisk, false); if (ret) return ret; - bkey_fsck_err_on(entry->ptr.cached && have_ec, c, err, - ptr_cached_and_erasure_coded, + bkey_fsck_err_on(entry->ptr.cached && have_ec, + c, ptr_cached_and_erasure_coded, "cached, erasure coded ptr"); if (!entry->ptr.unwritten) @@ -1199,44 +1192,50 @@ int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, case BCH_EXTENT_ENTRY_crc128: crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, c, err, - ptr_crc_uncompressed_size_too_small, + bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, + c, ptr_crc_uncompressed_size_too_small, "checksum offset + key size > uncompressed size"); - bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), c, err, - ptr_crc_csum_type_unknown, + bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), + c, ptr_crc_csum_type_unknown, "invalid checksum type"); - bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, c, err, - ptr_crc_compression_type_unknown, + bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, + c, ptr_crc_compression_type_unknown, "invalid compression type"); if (bch2_csum_type_is_encryption(crc.csum_type)) { if (nonce == UINT_MAX) nonce = crc.offset + crc.nonce; else if (nonce != crc.offset + crc.nonce) - bkey_fsck_err(c, err, ptr_crc_nonce_mismatch, + bkey_fsck_err(c, ptr_crc_nonce_mismatch, "incorrect nonce"); } - bkey_fsck_err_on(crc_since_last_ptr, c, err, - ptr_crc_redundant, + bkey_fsck_err_on(crc_since_last_ptr, + c, ptr_crc_redundant, "redundant crc entry"); crc_since_last_ptr = true; bkey_fsck_err_on(crc_is_encoded(crc) && (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && - (flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), c, err, - ptr_crc_uncompressed_size_too_big, + (flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), + c, ptr_crc_uncompressed_size_too_big, "too large encoded extent"); size_ondisk = crc.compressed_size; break; case BCH_EXTENT_ENTRY_stripe_ptr: - bkey_fsck_err_on(have_ec, c, err, - ptr_stripe_redundant, + bkey_fsck_err_on(have_ec, + c, ptr_stripe_redundant, "redundant stripe entry"); have_ec = true; break; case BCH_EXTENT_ENTRY_rebalance: { + /* + * this shouldn't be a fsck error, for forward + * compatibility; the rebalance code should just refetch + * the compression opt if it's unknown + */ +#if 0 const struct bch_extent_rebalance *r = &entry->rebalance; if (!bch2_compression_opt_valid(r->compression)) { @@ -1245,28 +1244,29 @@ int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, opt.type, opt.level); return -BCH_ERR_invalid_bkey; } +#endif break; } } } - bkey_fsck_err_on(!nr_ptrs, c, err, - extent_ptrs_no_ptrs, + bkey_fsck_err_on(!nr_ptrs, + c, extent_ptrs_no_ptrs, "no ptrs"); - bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, c, err, - extent_ptrs_too_many_ptrs, + bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, + c, extent_ptrs_too_many_ptrs, "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX); - bkey_fsck_err_on(have_written && have_unwritten, c, err, - extent_ptrs_written_and_unwritten, + bkey_fsck_err_on(have_written && have_unwritten, + c, extent_ptrs_written_and_unwritten, "extent with unwritten and written ptrs"); - bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, c, err, - extent_ptrs_unwritten, + bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, + c, extent_ptrs_unwritten, "has unwritten ptrs"); - bkey_fsck_err_on(crc_since_last_ptr, c, err, - extent_ptrs_redundant_crc, + bkey_fsck_err_on(crc_since_last_ptr, + c, extent_ptrs_redundant_crc, "redundant crc entry"); - bkey_fsck_err_on(have_ec, c, err, - extent_ptrs_redundant_stripe, + bkey_fsck_err_on(have_ec, + c, extent_ptrs_redundant_stripe, "redundant stripe entry"); fsck_err: return ret; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index facdb8a86eec84..1a6ddee48041d1 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -409,26 +409,26 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, /* KEY_TYPE_btree_ptr: */ -int bch2_btree_ptr_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_btree_ptr_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_btree_ptr_v2_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_btree_ptr_v2_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, int, struct bkey_s); #define bch2_bkey_ops_btree_ptr ((struct bkey_ops) { \ - .key_invalid = bch2_btree_ptr_invalid, \ + .key_validate = bch2_btree_ptr_validate, \ .val_to_text = bch2_btree_ptr_to_text, \ .swab = bch2_ptr_swab, \ .trigger = bch2_trigger_extent, \ }) #define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \ - .key_invalid = bch2_btree_ptr_v2_invalid, \ + .key_validate = bch2_btree_ptr_v2_validate, \ .val_to_text = bch2_btree_ptr_v2_to_text, \ .swab = bch2_ptr_swab, \ .compat = bch2_btree_ptr_v2_compat, \ @@ -441,7 +441,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); #define bch2_bkey_ops_extent ((struct bkey_ops) { \ - .key_invalid = bch2_bkey_ptrs_invalid, \ + .key_validate = bch2_bkey_ptrs_validate, \ .val_to_text = bch2_bkey_ptrs_to_text, \ .swab = bch2_ptr_swab, \ .key_normalize = bch2_extent_normalize, \ @@ -451,13 +451,13 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); /* KEY_TYPE_reservation: */ -int bch2_reservation_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_reservation_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); #define bch2_bkey_ops_reservation ((struct bkey_ops) { \ - .key_invalid = bch2_reservation_invalid, \ + .key_validate = bch2_reservation_validate, \ .val_to_text = bch2_reservation_to_text, \ .key_merge = bch2_reservation_merge, \ .trigger = bch2_trigger_reservation, \ @@ -683,8 +683,8 @@ bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *); void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_ptr_swab(struct bkey_s); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 1e20020eadd1f5..2be6be33afa3e0 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -434,100 +434,98 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) return &inode_p->inode.k_i; } -static int __bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) +static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bch_inode_unpacked unpacked; int ret = 0; - bkey_fsck_err_on(k.k->p.inode, c, err, - inode_pos_inode_nonzero, + bkey_fsck_err_on(k.k->p.inode, + c, inode_pos_inode_nonzero, "nonzero k.p.inode"); - bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, c, err, - inode_pos_blockdev_range, + bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, + c, inode_pos_blockdev_range, "fs inode in blockdev range"); - bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), c, err, - inode_unpack_error, + bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), + c, inode_unpack_error, "invalid variable length fields"); - bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, c, err, - inode_checksum_type_invalid, + bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, + c, inode_checksum_type_invalid, "invalid data checksum type (%u >= %u", unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1); bkey_fsck_err_on(unpacked.bi_compression && - !bch2_compression_opt_valid(unpacked.bi_compression - 1), c, err, - inode_compression_type_invalid, + !bch2_compression_opt_valid(unpacked.bi_compression - 1), + c, inode_compression_type_invalid, "invalid compression opt %u", unpacked.bi_compression - 1); bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) && - unpacked.bi_nlink != 0, c, err, - inode_unlinked_but_nlink_nonzero, + unpacked.bi_nlink != 0, + c, inode_unlinked_but_nlink_nonzero, "flagged as unlinked but bi_nlink != 0"); - bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), c, err, - inode_subvol_root_but_not_dir, + bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), + c, inode_subvol_root_but_not_dir, "subvolume root but not a directory"); fsck_err: return ret; } -int bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); int ret = 0; - bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err, - inode_str_hash_invalid, + bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR, + c, inode_str_hash_invalid, "invalid str hash type (%llu >= %u)", INODE_STR_HASH(inode.v), BCH_STR_HASH_NR); - ret = __bch2_inode_invalid(c, k, err); + ret = __bch2_inode_validate(c, k, flags); fsck_err: return ret; } -int bch2_inode_v2_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); int ret = 0; - bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err, - inode_str_hash_invalid, + bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, + c, inode_str_hash_invalid, "invalid str hash type (%llu >= %u)", INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); - ret = __bch2_inode_invalid(c, k, err); + ret = __bch2_inode_validate(c, k, flags); fsck_err: return ret; } -int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); int ret = 0; bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL || - INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), c, err, - inode_v3_fields_start_bad, + INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), + c, inode_v3_fields_start_bad, "invalid fields_start (got %llu, min %u max %zu)", INODEv3_FIELDS_START(inode.v), INODEv3_FIELDS_START_INITIAL, bkey_val_u64s(inode.k)); - bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err, - inode_str_hash_invalid, + bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, + c, inode_str_hash_invalid, "invalid str hash type (%llu >= %u)", INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); - ret = __bch2_inode_invalid(c, k, err); + ret = __bch2_inode_validate(c, k, flags); fsck_err: return ret; } @@ -625,14 +623,13 @@ int bch2_trigger_inode(struct btree_trans *trans, return 0; } -int bch2_inode_generation_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_inode_generation_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { int ret = 0; - bkey_fsck_err_on(k.k->p.inode, c, err, - inode_pos_inode_nonzero, + bkey_fsck_err_on(k.k->p.inode, + c, inode_pos_inode_nonzero, "nonzero k.p.inode"); fsck_err: return ret; diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index da0e4a7450990f..f1fcb4c58039a5 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -9,12 +9,12 @@ enum bch_validate_flags; extern const char * const bch2_inode_opts[]; -int bch2_inode_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -int bch2_inode_v2_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_inode_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); +int bch2_inode_v2_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); +int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned, @@ -22,21 +22,21 @@ int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned, enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_inode ((struct bkey_ops) { \ - .key_invalid = bch2_inode_invalid, \ + .key_validate = bch2_inode_validate, \ .val_to_text = bch2_inode_to_text, \ .trigger = bch2_trigger_inode, \ .min_val_size = 16, \ }) #define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \ - .key_invalid = bch2_inode_v2_invalid, \ + .key_validate = bch2_inode_v2_validate, \ .val_to_text = bch2_inode_to_text, \ .trigger = bch2_trigger_inode, \ .min_val_size = 32, \ }) #define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \ - .key_invalid = bch2_inode_v3_invalid, \ + .key_validate = bch2_inode_v3_validate, \ .val_to_text = bch2_inode_to_text, \ .trigger = bch2_trigger_inode, \ .min_val_size = 48, \ @@ -49,12 +49,12 @@ static inline bool bkey_is_inode(const struct bkey *k) k->type == KEY_TYPE_inode_v3; } -int bch2_inode_generation_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_inode_generation_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \ - .key_invalid = bch2_inode_generation_invalid, \ + .key_validate = bch2_inode_generation_validate, \ .val_to_text = bch2_inode_generation_to_text, \ .min_val_size = 8, \ }) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 7a833a3f1c63a1..7664b68e6a15ee 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -332,7 +332,6 @@ static int journal_validate_key(struct bch_fs *c, { int write = flags & BCH_VALIDATE_write; void *next = vstruct_next(entry); - struct printbuf buf = PRINTBUF; int ret = 0; if (journal_entry_err_on(!k->k.u64s, @@ -368,34 +367,21 @@ static int journal_validate_key(struct bch_fs *c, bch2_bkey_compat(level, btree_id, version, big_endian, write, NULL, bkey_to_packed(k)); - if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), - __btree_node_type(level, btree_id), write, &buf)) { - printbuf_reset(&buf); - journal_entry_err_msg(&buf, version, jset, entry); - prt_newline(&buf); - printbuf_indent_add(&buf, 2); - - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); - prt_newline(&buf); - bch2_bkey_invalid(c, bkey_i_to_s_c(k), - __btree_node_type(level, btree_id), write, &buf); - - mustfix_fsck_err(c, journal_entry_bkey_invalid, - "%s", buf.buf); - + ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), + __btree_node_type(level, btree_id), write); + if (ret == -BCH_ERR_fsck_delete_bkey) { le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); journal_entry_null_range(vstruct_next(entry), next); - - printbuf_exit(&buf); return FSCK_DELETED_KEY; } + if (ret) + goto fsck_err; if (write) bch2_bkey_compat(level, btree_id, version, big_endian, write, NULL, bkey_to_packed(k)); fsck_err: - printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 83b1586cb3710b..96f2f4f8c39787 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -10,14 +10,13 @@ #include "recovery.h" /* KEY_TYPE_lru is obsolete: */ -int bch2_lru_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_lru_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { int ret = 0; - bkey_fsck_err_on(!lru_pos_time(k.k->p), c, err, - lru_entry_at_time_0, + bkey_fsck_err_on(!lru_pos_time(k.k->p), + c, lru_entry_at_time_0, "lru entry at time=0"); fsck_err: return ret; diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index 5bd8974a7f11e5..e6a7d8241bb807 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -33,14 +33,13 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l) return BCH_LRU_read; } -int bch2_lru_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_lru_pos_to_text(struct printbuf *, struct bpos); #define bch2_bkey_ops_lru ((struct bkey_ops) { \ - .key_invalid = bch2_lru_invalid, \ + .key_validate = bch2_lru_validate, \ .val_to_text = bch2_lru_to_text, \ .min_val_size = 8, \ }) diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index a0cca8b70e0ae1..c32a05e252e2ab 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -59,13 +59,13 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = { .to_text = bch2_sb_quota_to_text, }; -int bch2_quota_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err) +int bch2_quota_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { int ret = 0; - bkey_fsck_err_on(k.k->p.inode >= QTYP_NR, c, err, - quota_type_invalid, + bkey_fsck_err_on(k.k->p.inode >= QTYP_NR, + c, quota_type_invalid, "invalid quota type (%llu >= %u)", k.k->p.inode, QTYP_NR); fsck_err: diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h index 02d37a332218a8..a62abcc5332ad3 100644 --- a/fs/bcachefs/quota.h +++ b/fs/bcachefs/quota.h @@ -8,12 +8,11 @@ enum bch_validate_flags; extern const struct bch_sb_field_ops bch_sb_field_ops_quota; -int bch2_quota_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_quota_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_quota ((struct bkey_ops) { \ - .key_invalid = bch2_quota_invalid, \ + .key_validate = bch2_quota_validate, \ .val_to_text = bch2_quota_to_text, \ .min_val_size = 32, \ }) diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 5f92715e1525a2..e59c0abb47723b 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -29,15 +29,14 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k) /* reflink pointers */ -int bch2_reflink_p_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_reflink_p_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); int ret = 0; bkey_fsck_err_on(le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad), - c, err, reflink_p_front_pad_bad, + c, reflink_p_front_pad_bad, "idx < front_pad (%llu < %u)", le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad)); fsck_err: @@ -256,11 +255,10 @@ int bch2_trigger_reflink_p(struct btree_trans *trans, /* indirect extents */ -int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { - return bch2_bkey_ptrs_invalid(c, k, flags, err); + return bch2_bkey_ptrs_validate(c, k, flags); } void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, @@ -311,9 +309,8 @@ int bch2_trigger_reflink_v(struct btree_trans *trans, /* indirect inline data */ -int bch2_indirect_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_indirect_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { return 0; } diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index e894f3a2c67ac6..51afe11d8ed65b 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -4,41 +4,37 @@ enum bch_validate_flags; -int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); +int bch2_reflink_p_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \ - .key_invalid = bch2_reflink_p_invalid, \ + .key_validate = bch2_reflink_p_validate, \ .val_to_text = bch2_reflink_p_to_text, \ .key_merge = bch2_reflink_p_merge, \ .trigger = bch2_trigger_reflink_p, \ .min_val_size = 16, \ }) -int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); +int bch2_reflink_v_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \ - .key_invalid = bch2_reflink_v_invalid, \ + .key_validate = bch2_reflink_v_validate, \ .val_to_text = bch2_reflink_v_to_text, \ .swab = bch2_ptr_swab, \ .trigger = bch2_trigger_reflink_v, \ .min_val_size = 8, \ }) -int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_indirect_inline_data_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_indirect_inline_data(struct btree_trans *, @@ -47,7 +43,7 @@ int bch2_trigger_indirect_inline_data(struct btree_trans *, enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \ - .key_invalid = bch2_indirect_inline_data_invalid, \ + .key_validate = bch2_indirect_inline_data_validate, \ .val_to_text = bch2_indirect_inline_data_to_text, \ .trigger = bch2_trigger_indirect_inline_data, \ .min_val_size = 8, \ diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 96744b1a76f5be..8b18a9b483a4df 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -31,15 +31,14 @@ void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, le32_to_cpu(t.v->root_snapshot)); } -int bch2_snapshot_tree_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { int ret = 0; bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) || - bkey_lt(k.k->p, POS(0, 1)), c, err, - snapshot_tree_pos_bad, + bkey_lt(k.k->p, POS(0, 1)), + c, snapshot_tree_pos_bad, "bad pos"); fsck_err: return ret; @@ -225,55 +224,54 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, le32_to_cpu(s.v->skip[2])); } -int bch2_snapshot_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_snapshot s; u32 i, id; int ret = 0; bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) || - bkey_lt(k.k->p, POS(0, 1)), c, err, - snapshot_pos_bad, + bkey_lt(k.k->p, POS(0, 1)), + c, snapshot_pos_bad, "bad pos"); s = bkey_s_c_to_snapshot(k); id = le32_to_cpu(s.v->parent); - bkey_fsck_err_on(id && id <= k.k->p.offset, c, err, - snapshot_parent_bad, + bkey_fsck_err_on(id && id <= k.k->p.offset, + c, snapshot_parent_bad, "bad parent node (%u <= %llu)", id, k.k->p.offset); - bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]), c, err, - snapshot_children_not_normalized, + bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]), + c, snapshot_children_not_normalized, "children not normalized"); - bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1], c, err, - snapshot_child_duplicate, + bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1], + c, snapshot_child_duplicate, "duplicate child nodes"); for (i = 0; i < 2; i++) { id = le32_to_cpu(s.v->children[i]); - bkey_fsck_err_on(id >= k.k->p.offset, c, err, - snapshot_child_bad, + bkey_fsck_err_on(id >= k.k->p.offset, + c, snapshot_child_bad, "bad child node (%u >= %llu)", id, k.k->p.offset); } if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) { bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) || - le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]), c, err, - snapshot_skiplist_not_normalized, + le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]), + c, snapshot_skiplist_not_normalized, "skiplist not normalized"); for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) { id = le32_to_cpu(s.v->skip[i]); - bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent), c, err, - snapshot_skiplist_bad, + bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent), + c, snapshot_skiplist_bad, "bad skiplist node %u", id); } } diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 31b0ee03e96288..eb5ef64221d6e1 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -5,11 +5,11 @@ enum bch_validate_flags; void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_snapshot_tree_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_snapshot_tree_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); #define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ - .key_invalid = bch2_snapshot_tree_invalid, \ + .key_validate = bch2_snapshot_tree_validate, \ .val_to_text = bch2_snapshot_tree_to_text, \ .min_val_size = 8, \ }) @@ -19,14 +19,13 @@ struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *); int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *); void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_snapshot_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_snapshot_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ - .key_invalid = bch2_snapshot_invalid, \ + .key_validate = bch2_snapshot_validate, \ .val_to_text = bch2_snapshot_to_text, \ .trigger = bch2_mark_snapshot, \ .min_val_size = 24, \ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index f56720b558626a..dbe834cb349f4a 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -207,23 +207,23 @@ int bch2_check_subvol_children(struct bch_fs *c) /* Subvolumes: */ -int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err) +int bch2_subvolume_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_subvolume subvol = bkey_s_c_to_subvolume(k); int ret = 0; bkey_fsck_err_on(bkey_lt(k.k->p, SUBVOL_POS_MIN) || - bkey_gt(k.k->p, SUBVOL_POS_MAX), c, err, - subvol_pos_bad, + bkey_gt(k.k->p, SUBVOL_POS_MAX), + c, subvol_pos_bad, "invalid pos"); - bkey_fsck_err_on(!subvol.v->snapshot, c, err, - subvol_snapshot_bad, + bkey_fsck_err_on(!subvol.v->snapshot, + c, subvol_snapshot_bad, "invalid snapshot"); - bkey_fsck_err_on(!subvol.v->inode, c, err, - subvol_inode_bad, + bkey_fsck_err_on(!subvol.v->inode, + c, subvol_inode_bad, "invalid inode"); fsck_err: return ret; diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index afa5e871efb252..a8299ba2cab2ce 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -10,15 +10,14 @@ enum bch_validate_flags; int bch2_check_subvols(struct bch_fs *); int bch2_check_subvol_children(struct bch_fs *); -int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_subvolume_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ - .key_invalid = bch2_subvolume_invalid, \ + .key_validate = bch2_subvolume_validate, \ .val_to_text = bch2_subvolume_to_text, \ .trigger = bch2_subvolume_trigger, \ .min_val_size = 16, \ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index c11bf6dacc2c76..f2b4c17a0307df 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -70,17 +70,16 @@ const struct bch_hash_desc bch2_xattr_hash_desc = { .cmp_bkey = xattr_cmp_bkey, }; -int bch2_xattr_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len, le16_to_cpu(xattr.v->x_val_len)); int ret = 0; - bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s, c, err, - xattr_val_size_too_small, + bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s, + c, xattr_val_size_too_small, "value too small (%zu < %u)", bkey_val_u64s(k.k), val_u64s); @@ -88,17 +87,17 @@ int bch2_xattr_invalid(struct bch_fs *c, struct bkey_s_c k, val_u64s = xattr_val_u64s(xattr.v->x_name_len, le16_to_cpu(xattr.v->x_val_len) + 4); - bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s, c, err, - xattr_val_size_too_big, + bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s, + c, xattr_val_size_too_big, "value too big (%zu > %u)", bkey_val_u64s(k.k), val_u64s); - bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type), c, err, - xattr_invalid_type, + bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type), + c, xattr_invalid_type, "invalid type (%u)", xattr.v->x_type); - bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len), c, err, - xattr_name_invalid_chars, + bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len), + c, xattr_name_invalid_chars, "xattr name has invalid characters"); fsck_err: return ret; diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index 1574b9eb4c8505..c188a5ad64cef6 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -6,12 +6,11 @@ extern const struct bch_hash_desc bch2_xattr_hash_desc; -int bch2_xattr_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_xattr ((struct bkey_ops) { \ - .key_invalid = bch2_xattr_invalid, \ + .key_validate = bch2_xattr_validate, \ .val_to_text = bch2_xattr_to_text, \ .min_val_size = 8, \ }) From 5132b99bb62664c02bd6c0dd62ad3fedc75294d8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Aug 2024 02:35:10 -0400 Subject: [PATCH 307/991] bcachefs: Kill __bch2_accounting_mem_mod() The next patch will be adding a disk accounting counter type which is not kept in the in-memory eytzinger tree. As prep, fold __bch2_accounting_mem_mod() into bch2_accounting_mem_mod_locked() so that we can check for that counter type and bail out without calling bpos_to_disk_accounting_pos() twice. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 4 +-- fs/bcachefs/disk_accounting.c | 4 +-- fs/bcachefs/disk_accounting.h | 54 ++++++++++++++------------------ 3 files changed, 28 insertions(+), 34 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 1a1e9d2036da7a..a0101d9c5d83f6 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -712,7 +712,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, a->k.version = journal_pos_to_bversion(&trans->journal_res, (u64 *) entry - (u64 *) trans->journal_entries); BUG_ON(bversion_zero(a->k.version)); - ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), false); + ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), false, false); if (ret) goto revert_fs_usage; } @@ -798,7 +798,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, struct bkey_s_accounting a = bkey_i_to_s_accounting(entry2->start); bch2_accounting_neg(a); - bch2_accounting_mem_mod_locked(trans, a.c, false); + bch2_accounting_mem_mod_locked(trans, a.c, false, false); bch2_accounting_neg(a); } percpu_up_read(&c->mark_lock); diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index f059cbffdf2307..e57b40623cd98b 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -566,7 +566,7 @@ int bch2_gc_accounting_done(struct bch_fs *c) struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; accounting_key_init(&k_i.k, &acc_k, src_v, nr); - bch2_accounting_mem_mod_locked(trans, bkey_i_to_s_c_accounting(&k_i.k), false); + bch2_accounting_mem_mod_locked(trans, bkey_i_to_s_c_accounting(&k_i.k), false, false); preempt_disable(); struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); @@ -595,7 +595,7 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) return 0; percpu_down_read(&c->mark_lock); - int ret = __bch2_accounting_mem_mod(c, bkey_s_c_to_accounting(k), false); + int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), false, true); percpu_up_read(&c->mark_lock); if (bch2_accounting_key_is_zero(bkey_s_c_to_accounting(k)) && diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index b92f8c2e305411..653090667aaa4d 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -106,41 +106,17 @@ static inline int accounting_pos_cmp(const void *_l, const void *_r) int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, bool); void bch2_accounting_mem_gc(struct bch_fs *); -static inline int __bch2_accounting_mem_mod(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) -{ - struct bch_accounting_mem *acc = &c->accounting; - unsigned idx; - - EBUG_ON(gc && !acc->gc_running); - - while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, &a.k->p)) >= acc->k.nr) { - int ret = bch2_accounting_mem_insert(c, a, gc); - if (ret) - return ret; - } - - struct accounting_mem_entry *e = &acc->k.data[idx]; - - EBUG_ON(bch2_accounting_counters(a.k) != e->nr_counters); - - for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++) - this_cpu_add(e->v[gc][i], a.v->d[i]); - return 0; -} - /* * Update in memory counters so they match the btree update we're doing; called * from transaction commit path */ -static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc) +static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc, bool read) { struct bch_fs *c = trans->c; + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, a.k->p); - if (!gc) { - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, a.k->p); - + if (!gc && !read) { switch (acc_k.type) { case BCH_DISK_ACCOUNTING_persistent_reserved: trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; @@ -161,13 +137,31 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, stru } } - return __bch2_accounting_mem_mod(c, a, gc); + struct bch_accounting_mem *acc = &c->accounting; + unsigned idx; + + EBUG_ON(gc && !acc->gc_running); + + while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, &a.k->p)) >= acc->k.nr) { + int ret = bch2_accounting_mem_insert(c, a, gc); + if (ret) + return ret; + } + + struct accounting_mem_entry *e = &acc->k.data[idx]; + + EBUG_ON(bch2_accounting_counters(a.k) != e->nr_counters); + + for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++) + this_cpu_add(e->v[gc][i], a.v->d[i]); + return 0; } static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc) { percpu_down_read(&trans->c->mark_lock); - int ret = bch2_accounting_mem_mod_locked(trans, a, gc); + int ret = bch2_accounting_mem_mod_locked(trans, a, gc, false); percpu_up_read(&trans->c->mark_lock); return ret; } From 58474f76a770bcc79d4b2d7232e4d6650e732b50 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Aug 2024 02:27:36 -0400 Subject: [PATCH 308/991] bcachefs: bcachefs_metadata_version_disk_accounting_inum This adds another disk accounting counter to track usage per inode number (any snapshot ID). This will be used for a couple things: - It'll give us a way to tell the user how much space a given file ista consuming in all snapshots; i.e. how much extra space it's consuming due to snapshot versioning. - It counts number of extents and total size of extents (both in btree keyspace sectors and actual disk usage), meaning it gives us average extent size: that is, it'll let us cheaply find fragmented files that should be defragmented. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 ++- fs/bcachefs/buckets.c | 14 ++++++++++++++ fs/bcachefs/disk_accounting.c | 3 +++ fs/bcachefs/disk_accounting.h | 3 +++ fs/bcachefs/disk_accounting_format.h | 8 +++++++- fs/bcachefs/sb-downgrade.c | 5 ++++- 6 files changed, 33 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index b25f863567287c..c75f2e0f32bb96 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -676,7 +676,8 @@ struct bch_sb_field_ext { x(mi_btree_bitmap, BCH_VERSION(1, 7)) \ x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \ x(disk_accounting_v2, BCH_VERSION(1, 9)) \ - x(disk_accounting_v3, BCH_VERSION(1, 10)) + x(disk_accounting_v3, BCH_VERSION(1, 10)) \ + x(disk_accounting_inum, BCH_VERSION(1, 11)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 9f7004e941ce41..b69ef4b3de6e2a 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -810,6 +810,20 @@ static int __trigger_extent(struct btree_trans *trans, ret = bch2_disk_accounting_mod(trans, &acc_btree_key, &replicas_sectors, 1, gc); if (ret) return ret; + } else { + bool insert = !(flags & BTREE_TRIGGER_overwrite); + struct disk_accounting_pos acc_inum_key = { + .type = BCH_DISK_ACCOUNTING_inum, + .inum.inum = k.k->p.inode, + }; + s64 v[3] = { + insert ? 1 : -1, + insert ? k.k->size : -((s64) k.k->size), + replicas_sectors, + }; + ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc); + if (ret) + return ret; } if (bch2_bkey_rebalance_opts(k)) { diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index e57b40623cd98b..e972e2bca546a9 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -768,6 +768,9 @@ void bch2_verify_accounting_clean(struct bch_fs *c) if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) continue; + if (acc_k.type == BCH_DISK_ACCOUNTING_inum) + continue; + bch2_accounting_mem_read(c, k.k->p, v, nr); if (memcmp(a.v->d, v, nr * sizeof(u64))) { diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 653090667aaa4d..f29fd0dd9581fb 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -116,6 +116,9 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, stru struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, a.k->p); + if (acc_k.type == BCH_DISK_ACCOUNTING_inum) + return 0; + if (!gc && !read) { switch (acc_k.type) { case BCH_DISK_ACCOUNTING_persistent_reserved: diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h index a93cf26ff4a94f..7b6e6c97e6aa6a 100644 --- a/fs/bcachefs/disk_accounting_format.h +++ b/fs/bcachefs/disk_accounting_format.h @@ -103,7 +103,8 @@ static inline bool data_type_is_hidden(enum bch_data_type type) x(compression, 4) \ x(snapshot, 5) \ x(btree, 6) \ - x(rebalance_work, 7) + x(rebalance_work, 7) \ + x(inum, 8) enum disk_accounting_type { #define x(f, nr) BCH_DISK_ACCOUNTING_##f = nr, @@ -136,6 +137,10 @@ struct bch_acct_btree { __u32 id; } __packed; +struct bch_acct_inum { + __u64 inum; +} __packed; + struct bch_acct_rebalance_work { }; @@ -152,6 +157,7 @@ struct disk_accounting_pos { struct bch_acct_snapshot snapshot; struct bch_acct_btree btree; struct bch_acct_rebalance_work rebalance_work; + struct bch_acct_inum inum; } __packed; } __packed; struct bpos _pad; diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 9f82d497d9e053..650a1f77ca4036 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -72,7 +72,10 @@ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ BCH_FSCK_ERR_accounting_key_replicas_nr_required_bad, \ BCH_FSCK_ERR_accounting_key_replicas_devs_unsorted, \ - BCH_FSCK_ERR_accounting_key_junk_at_end) + BCH_FSCK_ERR_accounting_key_junk_at_end) \ + x(disk_accounting_inum, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_accounting_mismatch) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ From 63de936b513f7a9ce559194d3269ac291f4f4662 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 21 Jul 2024 17:38:40 +0200 Subject: [PATCH 309/991] media: atomisp: Fix streaming no longer working on BYT / ISP2400 devices Commit a0821ca14bb8 ("media: atomisp: Remove test pattern generator (TPG) support") broke BYT support because it removed a seemingly unused field from struct sh_css_sp_config and a seemingly unused value from enum ia_css_input_mode. But these are part of the ABI between the kernel and firmware on ISP2400 and this part of the TPG support removal changes broke ISP2400 support. ISP2401 support was not affected because on ISP2401 only a part of struct sh_css_sp_config is used. Restore the removed field and enum value to fix this. Fixes: a0821ca14bb8 ("media: atomisp: Remove test pattern generator (TPG) support") Cc: stable@vger.kernel.org Signed-off-by: Hans de Goede Signed-off-by: Hans Verkuil --- .../media/atomisp/pci/ia_css_stream_public.h | 8 ++++++-- .../media/atomisp/pci/sh_css_internal.h | 19 ++++++++++++++++--- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/ia_css_stream_public.h b/drivers/staging/media/atomisp/pci/ia_css_stream_public.h index 961c612880833a..aad860e54d3a71 100644 --- a/drivers/staging/media/atomisp/pci/ia_css_stream_public.h +++ b/drivers/staging/media/atomisp/pci/ia_css_stream_public.h @@ -27,12 +27,16 @@ #include "ia_css_prbs.h" #include "ia_css_input_port.h" -/* Input modes, these enumerate all supported input modes. - * Note that not all ISP modes support all input modes. +/* + * Input modes, these enumerate all supported input modes. + * This enum is part of the atomisp firmware ABI and must + * NOT be changed! + * Note that not all ISP modes support all input modes. */ enum ia_css_input_mode { IA_CSS_INPUT_MODE_SENSOR, /** data from sensor */ IA_CSS_INPUT_MODE_FIFO, /** data from input-fifo */ + IA_CSS_INPUT_MODE_TPG, /** data from test-pattern generator */ IA_CSS_INPUT_MODE_PRBS, /** data from pseudo-random bit stream */ IA_CSS_INPUT_MODE_MEMORY, /** data from a frame in memory */ IA_CSS_INPUT_MODE_BUFFERED_SENSOR /** data is sent through mipi buffer */ diff --git a/drivers/staging/media/atomisp/pci/sh_css_internal.h b/drivers/staging/media/atomisp/pci/sh_css_internal.h index a2d972ea3fa084..959e7f549641c1 100644 --- a/drivers/staging/media/atomisp/pci/sh_css_internal.h +++ b/drivers/staging/media/atomisp/pci/sh_css_internal.h @@ -344,7 +344,14 @@ struct sh_css_sp_input_formatter_set { #define IA_CSS_MIPI_SIZE_CHECK_MAX_NOF_ENTRIES_PER_PORT (3) -/* SP configuration information */ +/* + * SP configuration information + * + * This struct is part of the atomisp firmware ABI and is directly copied + * to ISP DRAM by sh_css_store_sp_group_to_ddr() + * + * Do NOT change this struct's layout or remove seemingly unused fields! + */ struct sh_css_sp_config { u8 no_isp_sync; /* Signal host immediately after start */ u8 enable_raw_pool_locking; /** Enable Raw Buffer Locking for HALv3 Support */ @@ -354,6 +361,10 @@ struct sh_css_sp_config { host (true) or when they are passed to the preview/video pipe (false). */ + /* + * Note the fields below are only used on the ISP2400 not on the ISP2401, + * sh_css_store_sp_group_to_ddr() skip copying these when run on the ISP2401. + */ struct { u8 a_changed; u8 b_changed; @@ -363,11 +374,13 @@ struct sh_css_sp_config { } input_formatter; sync_generator_cfg_t sync_gen; + tpg_cfg_t tpg; prbs_cfg_t prbs; input_system_cfg_t input_circuit; u8 input_circuit_cfg_changed; - u32 mipi_sizes_for_check[N_CSI_PORTS][IA_CSS_MIPI_SIZE_CHECK_MAX_NOF_ENTRIES_PER_PORT]; - u8 enable_isys_event_queue; + u32 mipi_sizes_for_check[N_CSI_PORTS][IA_CSS_MIPI_SIZE_CHECK_MAX_NOF_ENTRIES_PER_PORT]; + /* These last 2 fields are used on both the ISP2400 and the ISP2401 */ + u8 enable_isys_event_queue; u8 disable_cont_vf; }; From a2cbb1603943281a604f5adc48079a148db5cb0d Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Thu, 8 Aug 2024 16:06:40 -0700 Subject: [PATCH 310/991] tcp: Update window clamping condition This patch is based on the discussions between Neal Cardwell and Eric Dumazet in the link https://lore.kernel.org/netdev/20240726204105.1466841-1-quic_subashab@quicinc.com/ It was correctly pointed out that tp->window_clamp would not be updated in cases where net.ipv4.tcp_moderate_rcvbuf=0 or if (copied <= tp->rcvq_space.space). While it is expected for most setups to leave the sysctl enabled, the latter condition may not end up hitting depending on the TCP receive queue size and the pattern of arriving data. The updated check should be hit only on initial MSS update from TCP_MIN_MSS to measured MSS value and subsequently if there was an update to a larger value. Fixes: 05f76b2d634e ("tcp: Adjust clamping window for applications specifying SO_RCVBUF") Signed-off-by: Sean Tranchetti Signed-off-by: Subash Abhinov Kasiviswanathan Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e2b9583ed96abc..e37488d3453f03 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -238,9 +238,14 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) */ if (unlikely(len != icsk->icsk_ack.rcv_mss)) { u64 val = (u64)skb->len << TCP_RMEM_TO_WIN_SCALE; + u8 old_ratio = tcp_sk(sk)->scaling_ratio; do_div(val, skb->truesize); tcp_sk(sk)->scaling_ratio = val ? val : 1; + + if (old_ratio != tcp_sk(sk)->scaling_ratio) + WRITE_ONCE(tcp_sk(sk)->window_clamp, + tcp_win_from_space(sk, sk->sk_rcvbuf)); } icsk->icsk_ack.rcv_mss = min_t(unsigned int, len, tcp_sk(sk)->advmss); @@ -754,7 +759,8 @@ void tcp_rcv_space_adjust(struct sock *sk) * */ - if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf)) { + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && + !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { u64 rcvwin, grow; int rcvbuf; @@ -770,22 +776,12 @@ void tcp_rcv_space_adjust(struct sock *sk) rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin), READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); - if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - if (rcvbuf > sk->sk_rcvbuf) { - WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); - - /* Make the window clamp follow along. */ - WRITE_ONCE(tp->window_clamp, - tcp_win_from_space(sk, rcvbuf)); - } - } else { - /* Make the window clamp follow along while being bounded - * by SO_RCVBUF. - */ - int clamp = tcp_win_from_space(sk, min(rcvbuf, sk->sk_rcvbuf)); + if (rcvbuf > sk->sk_rcvbuf) { + WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); - if (clamp > tp->window_clamp) - WRITE_ONCE(tp->window_clamp, clamp); + /* Make the window clamp follow along. */ + WRITE_ONCE(tp->window_clamp, + tcp_win_from_space(sk, rcvbuf)); } } tp->rcvq_space.space = copied; From c286f204ce6ba7b48e3dcba53eda7df8eaa64dd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Jos=C3=A9=20Arboleda?= Date: Tue, 13 Aug 2024 11:10:53 -0500 Subject: [PATCH 311/991] ALSA: usb-audio: Support Yamaha P-125 quirk entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds a USB quirk for the Yamaha P-125 digital piano. Signed-off-by: Juan José Arboleda Cc: Link: https://patch.msgid.link/20240813161053.70256-1-soyjuanarbol@gmail.com Signed-off-by: Takashi Iwai --- sound/usb/quirks-table.h | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/quirks-table.h b/sound/usb/quirks-table.h index f13a8d63a019a4..aaa6a515d0f8a4 100644 --- a/sound/usb/quirks-table.h +++ b/sound/usb/quirks-table.h @@ -273,6 +273,7 @@ YAMAHA_DEVICE(0x105a, NULL), YAMAHA_DEVICE(0x105b, NULL), YAMAHA_DEVICE(0x105c, NULL), YAMAHA_DEVICE(0x105d, NULL), +YAMAHA_DEVICE(0x1718, "P-125"), { USB_DEVICE(0x0499, 0x1503), .driver_info = (unsigned long) & (const struct snd_usb_audio_quirk) { From fa0db8e568787c665384430eaf2221b299b85367 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Tue, 13 Aug 2024 15:19:01 +0200 Subject: [PATCH 312/991] Revert "ata: libata-scsi: Honor the D_SENSE bit for CK_COND=1 and no error" This reverts commit 28ab9769117ca944cb6eb537af5599aa436287a4. Sense data can be in either fixed format or descriptor format. SAT-6 revision 1, "10.4.6 Control mode page", defines the D_SENSE bit: "The SATL shall support this bit as defined in SPC-5 with the following exception: if the D_ SENSE bit is set to zero (i.e., fixed format sense data), then the SATL should return fixed format sense data for ATA PASS-THROUGH commands." The libata SATL has always kept D_SENSE set to zero by default. (It is however possible to change the value using a MODE SELECT SG_IO command.) Failed ATA PASS-THROUGH commands correctly respected the D_SENSE bit, however, successful ATA PASS-THROUGH commands incorrectly returned the sense data in descriptor format (regardless of the D_SENSE bit). Commit 28ab9769117c ("ata: libata-scsi: Honor the D_SENSE bit for CK_COND=1 and no error") fixed this bug for successful ATA PASS-THROUGH commands. However, after commit 28ab9769117c ("ata: libata-scsi: Honor the D_SENSE bit for CK_COND=1 and no error"), there were bug reports that hdparm, hddtemp, and udisks were no longer working as expected. These applications incorrectly assume the returned sense data is in descriptor format, without even looking at the RESPONSE CODE field in the returned sense data (to see which format the returned sense data is in). Considering that there will be broken versions of these applications around roughly forever, we are stuck with being bug compatible with older kernels. Cc: stable@vger.kernel.org # 4.19+ Reported-by: Stephan Eisvogel Reported-by: Christian Heusel Closes: https://lore.kernel.org/linux-ide/0bf3f2f0-0fc6-4ba5-a420-c0874ef82d64@heusel.eu/ Fixes: 28ab9769117c ("ata: libata-scsi: Honor the D_SENSE bit for CK_COND=1 and no error") Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240813131900.1285842-2-cassel@kernel.org Signed-off-by: Niklas Cassel --- drivers/ata/libata-scsi.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index d6f5e25e1ed894..473e00a58a8b0c 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -951,8 +951,19 @@ static void ata_gen_passthru_sense(struct ata_queued_cmd *qc) &sense_key, &asc, &ascq); ata_scsi_set_sense(qc->dev, cmd, sense_key, asc, ascq); } else { - /* ATA PASS-THROUGH INFORMATION AVAILABLE */ - ata_scsi_set_sense(qc->dev, cmd, RECOVERED_ERROR, 0, 0x1D); + /* + * ATA PASS-THROUGH INFORMATION AVAILABLE + * + * Note: we are supposed to call ata_scsi_set_sense(), which + * respects the D_SENSE bit, instead of unconditionally + * generating the sense data in descriptor format. However, + * because hdparm, hddtemp, and udisks incorrectly assume sense + * data in descriptor format, without even looking at the + * RESPONSE CODE field in the returned sense data (to see which + * format the returned sense data is in), we are stuck with + * being bug compatible with older kernels. + */ + scsi_build_sense(cmd, 1, RECOVERED_ERROR, 0, 0x1D); } } From 829e2a23121fb36ee30ea5145c2a85199f68e2c8 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 14 Aug 2024 12:04:59 +0200 Subject: [PATCH 313/991] ALSA: hda/tas2781: Use correct endian conversion The data conversion is done rather by a wrong function. We convert to BE32, not from BE32. Although the end result must be same, this was complained by the compiler. Fix the code again and align with another similar function tas2563_apply_calib() that does already right. Fixes: 3beddef84d90 ("ALSA: hda/tas2781: fix wrong calibrated data order") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202408141630.DiDUB8Z4-lkp@intel.com/ Link: https://patch.msgid.link/20240814100500.1944-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/tas2781_hda_i2c.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/tas2781_hda_i2c.c b/sound/pci/hda/tas2781_hda_i2c.c index 7dbfc92d9d55c4..89d8235537cd3b 100644 --- a/sound/pci/hda/tas2781_hda_i2c.c +++ b/sound/pci/hda/tas2781_hda_i2c.c @@ -527,8 +527,8 @@ static void tas2781_apply_calib(struct tasdevice_priv *tas_priv) for (i = 0; i < tas_priv->ndev; i++) { for (j = 0; j < CALIB_MAX; j++) { - data = get_unaligned_be32( - &tas_priv->cali_data.data[offset]); + data = cpu_to_be32( + *(uint32_t *)&tas_priv->cali_data.data[offset]); rc = tasdevice_dev_bulk_write(tas_priv, i, TASDEVICE_REG(0, page_array[j], rgno_array[j]), (unsigned char *)&data, 4); From 73c34b0b85d46bf9c2c0b367aeaffa1e2481b136 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 29 Jul 2024 13:44:33 -0700 Subject: [PATCH 314/991] xfs: attr forks require attr, not attr2 It turns out that I misunderstood the difference between the attr and attr2 feature bits. "attr" means that at some point an attr fork was created somewhere in the filesystem. "attr2" means that inodes have variable-sized forks, but says nothing about whether or not there actually /are/ attr forks in the system. If we have an attr fork, we only need to check that attr is set. Fixes: 99d9d8d05da26 ("xfs: scrub inode block mappings") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/scrub/bmap.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 24a15bf784f11b..5ab2ac53c92002 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -938,7 +938,13 @@ xchk_bmap( } break; case XFS_ATTR_FORK: - if (!xfs_has_attr(mp) && !xfs_has_attr2(mp)) + /* + * "attr" means that an attr fork was created at some point in + * the life of this filesystem. "attr2" means that inodes have + * variable-sized data/attr fork areas. Hence we only check + * attr here. + */ + if (!xfs_has_attr(mp)) xchk_ino_set_corrupt(sc, sc->ip->i_ino); break; default: From 04d6dbb55301a29aeb9a197e6b0012cdc265f1e4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 4 Aug 2024 14:39:34 -0700 Subject: [PATCH 315/991] xfs: revert AIL TASK_KILLABLE threshold In commit 9adf40249e6c, we changed the behavior of the AIL thread to set its own task state to KILLABLE whenever the timeout value is nonzero. Unfortunately, this missed the fact that xfsaild_push will return 50ms (aka a longish sleep) when we reach the push target or the AIL becomes empty, so xfsaild goes to sleep for a long period of time in uninterruptible D state. This results in artificially high load averages because KILLABLE processes are UNINTERRUPTIBLE, which contributes to load average even though the AIL is asleep waiting for someone to interrupt it. It's not blocked on IOs or anything, but people scrap ps for processes that look like they're stuck in D state, so restore the previous threshold. Fixes: 9adf40249e6c ("xfs: AIL doesn't need manual pushing") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/xfs_trans_ail.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 0fafcc9f3dbe44..8ede9d099d1fea 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -644,7 +644,12 @@ xfsaild( set_freezable(); while (1) { - if (tout) + /* + * Long waits of 50ms or more occur when we've run out of items + * to push, so we only want uninterruptible state if we're + * actually blocked on something. + */ + if (tout && tout <= 20) set_current_state(TASK_KILLABLE|TASK_FREEZABLE); else set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); From 8d16762047c627073955b7ed171a36addaf7b1ff Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 4 Aug 2024 14:39:57 -0700 Subject: [PATCH 316/991] xfs: conditionally allow FS_XFLAG_REALTIME changes if S_DAX is set If a file has the S_DAX flag (aka fsdax access mode) set, we cannot allow users to change the realtime flag unless the datadev and rtdev both support fsdax access modes. Even if there are no extents allocated to the file, the setattr thread could be racing with another thread that has already started down the write code paths. Fixes: ba23cba9b3bdc ("fs: allow per-device dax status checking for filesystems") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/xfs_ioctl.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 4e933db75b12b9..6b13666d4e9635 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -483,6 +483,17 @@ xfs_ioctl_setattr_xflags( /* Can't change realtime flag if any extents are allocated. */ if (ip->i_df.if_nextents || ip->i_delayed_blks) return -EINVAL; + + /* + * If S_DAX is enabled on this file, we can only switch the + * device if both support fsdax. We can't update S_DAX because + * there might be other threads walking down the access paths. + */ + if (IS_DAX(VFS_I(ip)) && + (mp->m_ddev_targp->bt_daxdev == NULL || + (mp->m_rtdev_targp && + mp->m_rtdev_targp->bt_daxdev == NULL))) + return -EINVAL; } if (rtflag) { From 66155de93bcf4f2967e602a4b3bf7ebe58f34b11 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 9 Aug 2024 12:02:58 -0700 Subject: [PATCH 317/991] KVM: x86: Disallow read-only memslots for SEV-ES and SEV-SNP (and TDX) Disallow read-only memslots for SEV-{ES,SNP} VM types, as KVM can't directly emulate instructions for ES/SNP, and instead the guest must explicitly request emulation. Unless the guest explicitly requests emulation without accessing memory, ES/SNP relies on KVM creating an MMIO SPTE, with the subsequent #NPF being reflected into the guest as a #VC. But for read-only memslots, KVM deliberately doesn't create MMIO SPTEs, because except for ES/SNP, doing so requires setting reserved bits in the SPTE, i.e. the SPTE can't be readable while also generating a #VC on writes. Because KVM never creates MMIO SPTEs and jumps directly to emulation, the guest never gets a #VC. And since KVM simply resumes the guest if ES/SNP guests trigger emulation, KVM effectively puts the vCPU into an infinite #NPF loop if the vCPU attempts to write read-only memory. Disallow read-only memory for all VMs with protected state, i.e. for upcoming TDX VMs as well as ES/SNP VMs. For TDX, it's actually possible to support read-only memory, as TDX uses EPT Violation #VE to reflect the fault into the guest, e.g. KVM could configure read-only SPTEs with RX protections and SUPPRESS_VE=0. But there is no strong use case for supporting read-only memslots on TDX, e.g. the main historical usage is to emulate option ROMs, but TDX disallows executing from shared memory. And if someone comes along with a legitimate, strong use case, the restriction can always be lifted for TDX. Don't bother trying to retroactively apply the restriction to SEV-ES VMs that are created as type KVM_X86_DEFAULT_VM. Read-only memslots can't possibly work for SEV-ES, i.e. disallowing such memslots is really just means reporting an error to userspace instead of silently hanging vCPUs. Trying to deal with the ordering between KVM_SEV_INIT and memslot creation isn't worth the marginal benefit it would provide userspace. Fixes: 26c44aa9e076 ("KVM: SEV: define VM types for SEV and SEV-ES") Fixes: 1dfe571c12cf ("KVM: SEV: Add initial SEV-SNP support") Cc: Peter Gonda Cc: Michael Roth Cc: Vishal Annapurve Cc: Ackerly Tng Signed-off-by: Sean Christopherson Message-ID: <20240809190319.1710470-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 ++ include/linux/kvm_host.h | 7 +++++++ virt/kvm/kvm_main.c | 5 ++--- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 94e7b5a4fafeb5..4a68cb3eba78f8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2192,6 +2192,8 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, #define kvm_arch_has_private_mem(kvm) false #endif +#define kvm_arch_has_readonly_mem(kvm) (!(kvm)->arch.has_protected_state) + static inline u16 kvm_read_ldt(void) { u16 ldt; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 79a6b1a63027a4..b23c6d48392f7c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -715,6 +715,13 @@ static inline bool kvm_arch_has_private_mem(struct kvm *kvm) } #endif +#ifndef kvm_arch_has_readonly_mem +static inline bool kvm_arch_has_readonly_mem(struct kvm *kvm) +{ + return IS_ENABLED(CONFIG_HAVE_KVM_READONLY_MEM); +} +#endif + struct kvm_memslots { u64 generation; atomic_long_t last_used_slot; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 92901656a0d413..cb2b78e92910fb 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1578,15 +1578,14 @@ static int check_memory_region_flags(struct kvm *kvm, if (mem->flags & KVM_MEM_GUEST_MEMFD) valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES; -#ifdef CONFIG_HAVE_KVM_READONLY_MEM /* * GUEST_MEMFD is incompatible with read-only memslots, as writes to * read-only memslots have emulated MMIO, not page fault, semantics, * and KVM doesn't allow emulated MMIO for private memory. */ - if (!(mem->flags & KVM_MEM_GUEST_MEMFD)) + if (kvm_arch_has_readonly_mem(kvm) && + !(mem->flags & KVM_MEM_GUEST_MEMFD)) valid_flags |= KVM_MEM_READONLY; -#endif if (mem->flags & ~valid_flags) return -EINVAL; From f94511df53bb792e505c98662971434c7995388a Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 7 Aug 2024 11:37:31 +0100 Subject: [PATCH 318/991] arm64: uaccess: correct thinko in __get_mem_asm() In the CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y version of __get_mem_asm(), we incorrectly use _ASM_EXTABLE_##type##ACCESS_ERR() such that upon a fault the extable fixup handler writes -EFAULT into "%w0", which is the register containing 'x' (the result of the load). This was a thinko in commit: 86a6a68febfcf57b ("arm64: start using 'asm goto' for get_user() when available") Prior to that commit _ASM_EXTABLE_##type##ACCESS_ERR_ZERO() was used such that the extable fixup handler wrote -EFAULT into "%w0" (the register containing 'err'), and zero into "%w1" (the register containing 'x'). When the 'err' variable was removed, the extable entry was updated incorrectly. Writing -EFAULT to the value register is unnecessary but benign: * We never want -EFAULT in the value register, and previously this would have been zeroed in the extable fixup handler. * In __get_user_error() the value is overwritten with zero explicitly in the error path. * The asm goto outputs cannot be used when the goto label is taken, as older compilers (e.g. clang < 16.0.0) do not guarantee that asm goto outputs are usable in this path and may use a stale value rather than the value in an output register. Consequently, zeroing in the extable fixup handler is insufficient to ensure callers see zero in the error path. * The expected usage of unsafe_get_user() and get_kernel_nofault() requires that the value is not consumed in the error path. Some versions of GCC would mis-compile asm goto with outputs, and erroneously omit subsequent assignments, breaking the error path handling in __get_user_error(). This was discussed at: https://lore.kernel.org/lkml/ZpfxLrJAOF2YNqCk@J2N7QTR9R3.cambridge.arm.com/ ... and was fixed by removing support for asm goto with outputs on those broken compilers in commit: f2f6a8e887172503 ("init/Kconfig: remove CONFIG_GCC_ASM_GOTO_OUTPUT_WORKAROUND") With that out of the way, we can safely replace the usage of _ASM_EXTABLE_##type##ACCESS_ERR() with _ASM_EXTABLE_##type##ACCESS(), leaving the value register unchanged in the case a fault is taken, as was originally intended. This matches other architectures and matches our __put_mem_asm(). Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20240807103731.2498893-1-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/uaccess.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 28f665e0975a28..1aa4ecb73429fe 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -188,7 +188,7 @@ static inline void __user *__uaccess_mask_ptr(const void __user *ptr) #define __get_mem_asm(load, reg, x, addr, label, type) \ asm_goto_output( \ "1: " load " " reg "0, [%1]\n" \ - _ASM_EXTABLE_##type##ACCESS_ERR(1b, %l2, %w0) \ + _ASM_EXTABLE_##type##ACCESS(1b, %l2) \ : "=r" (x) \ : "r" (addr) : : label) #else From a21dcf0ea8566ebbe011c79d6ed08cdfea771de3 Mon Sep 17 00:00:00 2001 From: Haibo Xu Date: Mon, 5 Aug 2024 11:30:24 +0800 Subject: [PATCH 319/991] arm64: ACPI: NUMA: initialize all values of acpi_early_node_map to NUMA_NO_NODE Currently, only acpi_early_node_map[0] was initialized to NUMA_NO_NODE. To ensure all the values were properly initialized, switch to initialize all of them to NUMA_NO_NODE. Fixes: e18962491696 ("arm64: numa: rework ACPI NUMA initialization") Cc: # 4.19.x Reported-by: Andrew Jones Suggested-by: Andrew Jones Signed-off-by: Haibo Xu Reviewed-by: Anshuman Khandual Reviewed-by: Sunil V L Reviewed-by: Andrew Jones Acked-by: Catalin Marinas Acked-by: Lorenzo Pieralisi Reviewed-by: Hanjun Guo Link: https://lore.kernel.org/r/853d7f74aa243f6f5999e203246f0d1ae92d2b61.1722828421.git.haibo1.xu@intel.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/acpi_numa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/acpi_numa.c b/arch/arm64/kernel/acpi_numa.c index 0c036a9a3c338a..2465f291c7e17c 100644 --- a/arch/arm64/kernel/acpi_numa.c +++ b/arch/arm64/kernel/acpi_numa.c @@ -27,7 +27,7 @@ #include -static int acpi_early_node_map[NR_CPUS] __initdata = { NUMA_NO_NODE }; +static int acpi_early_node_map[NR_CPUS] __initdata = { [0 ... NR_CPUS - 1] = NUMA_NO_NODE }; int __init acpi_numa_get_nid(unsigned int cpu) { From 1c0e5881691a787a9399a99bff4d56ead6e75e91 Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Wed, 14 Aug 2024 10:31:13 +0200 Subject: [PATCH 320/991] KVM: SEV: uapi: fix typo in SEV_RET_INVALID_CONFIG "INVALID" is misspelt in "SEV_RET_INAVLID_CONFIG". Since this is part of the UAPI, keep the current definition and add a new one with the fix. Fix-suggested-by: Marc Zyngier Signed-off-by: Amit Shah Message-ID: <20240814083113.21622-1-amit@kernel.org> Signed-off-by: Paolo Bonzini --- include/uapi/linux/psp-sev.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h index 2289b7c76c59aa..832c15d9155bdb 100644 --- a/include/uapi/linux/psp-sev.h +++ b/include/uapi/linux/psp-sev.h @@ -51,6 +51,7 @@ typedef enum { SEV_RET_INVALID_PLATFORM_STATE, SEV_RET_INVALID_GUEST_STATE, SEV_RET_INAVLID_CONFIG, + SEV_RET_INVALID_CONFIG = SEV_RET_INAVLID_CONFIG, SEV_RET_INVALID_LEN, SEV_RET_ALREADY_OWNED, SEV_RET_INVALID_CERTIFICATE, From 2251db28edcc70b7ee8a8c6bcbaecf752b3ea5ec Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 6 Aug 2024 13:49:13 +0200 Subject: [PATCH 321/991] ASoC: codecs: wcd937x: Fix missing de-assert of reset GPIO The device never comes online from a reset/shutdown state, because the driver de-asserts reset GPIO when requesting it but then, at the end of probe() through wcd937x_reset(), leaves it asserted. Cc: stable@vger.kernel.org Fixes: 9be3ec196da4 ("ASoC: codecs: wcd937x: add wcd937x codec driver") Signed-off-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20240806114913.40022-1-krzysztof.kozlowski@linaro.org Signed-off-by: Mark Brown --- sound/soc/codecs/wcd937x.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sound/soc/codecs/wcd937x.c b/sound/soc/codecs/wcd937x.c index 13926f4b0d9f16..af296b77a723ad 100644 --- a/sound/soc/codecs/wcd937x.c +++ b/sound/soc/codecs/wcd937x.c @@ -242,10 +242,9 @@ static const struct regmap_irq_chip wcd937x_regmap_irq_chip = { static void wcd937x_reset(struct wcd937x_priv *wcd937x) { - usleep_range(20, 30); - gpiod_set_value(wcd937x->reset_gpio, 1); - + usleep_range(20, 30); + gpiod_set_value(wcd937x->reset_gpio, 0); usleep_range(20, 30); } From 57d5af2660e9443b081eeaf1c373b3ce48477828 Mon Sep 17 00:00:00 2001 From: Vignesh Raghavendra Date: Wed, 14 Aug 2024 20:42:37 +0530 Subject: [PATCH 322/991] spi: spi-cadence-quadspi: Fix OSPI NOR failures during system resume Its necessary to call pm_runtime_force_*() hooks as part of system suspend/resume calls so that the runtime_pm hooks get called. This ensures latest state of the IP is cached and restored during system sleep. This is especially true if runtime autosuspend is enabled as runtime suspend hooks may not be called at all before system sleeps. Without this patch, OSPI NOR enumeration (READ_ID) fails during resume as context saved during suspend path is inconsistent. Fixes: 078d62de433b ("spi: cadence-qspi: add system-wide suspend and resume callbacks") Signed-off-by: Vignesh Raghavendra Link: https://patch.msgid.link/20240814151237.3856184-1-vigneshr@ti.com Signed-off-by: Mark Brown --- drivers/spi/spi-cadence-quadspi.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c index 05ebb03d319fcb..d4607cb89c4840 100644 --- a/drivers/spi/spi-cadence-quadspi.c +++ b/drivers/spi/spi-cadence-quadspi.c @@ -2000,13 +2000,25 @@ static int cqspi_runtime_resume(struct device *dev) static int cqspi_suspend(struct device *dev) { struct cqspi_st *cqspi = dev_get_drvdata(dev); + int ret; - return spi_controller_suspend(cqspi->host); + ret = spi_controller_suspend(cqspi->host); + if (ret) + return ret; + + return pm_runtime_force_suspend(dev); } static int cqspi_resume(struct device *dev) { struct cqspi_st *cqspi = dev_get_drvdata(dev); + int ret; + + ret = pm_runtime_force_resume(dev); + if (ret) { + dev_err(dev, "pm_runtime_force_resume failed on resume\n"); + return ret; + } return spi_controller_resume(cqspi->host); } From 71833e79a42178d8a50b5081c98c78ace9325628 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Wed, 14 Aug 2024 13:16:49 +0100 Subject: [PATCH 323/991] i2c: Use IS_REACHABLE() for substituting empty ACPI functions Replace IS_ENABLED() with IS_REACHABLE() to substitute empty stubs for: i2c_acpi_get_i2c_resource() i2c_acpi_client_count() i2c_acpi_find_bus_speed() i2c_acpi_new_device_by_fwnode() i2c_adapter *i2c_acpi_find_adapter_by_handle() i2c_acpi_waive_d0_probe() commit f17c06c6608a ("i2c: Fix conditional for substituting empty ACPI functions") partially fixed this conditional to depend on CONFIG_I2C, but used IS_ENABLED(), which is wrong since CONFIG_I2C is tristate. CONFIG_ACPI is boolean but let's also change it to use IS_REACHABLE() to future-proof it against becoming tristate. Somehow despite testing various combinations of CONFIG_I2C and CONFIG_ACPI we missed the combination CONFIG_I2C=m, CONFIG_ACPI=y. Signed-off-by: Richard Fitzgerald Fixes: f17c06c6608a ("i2c: Fix conditional for substituting empty ACPI functions") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202408141333.gYnaitcV-lkp@intel.com/ Reviewed-by: Takashi Iwai Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 7eedd0c662dad2..377def4972985c 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -1066,7 +1066,7 @@ static inline int of_i2c_get_board_info(struct device *dev, struct acpi_resource; struct acpi_resource_i2c_serialbus; -#if IS_ENABLED(CONFIG_ACPI) && IS_ENABLED(CONFIG_I2C) +#if IS_REACHABLE(CONFIG_ACPI) && IS_REACHABLE(CONFIG_I2C) bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares, struct acpi_resource_i2c_serialbus **i2c); int i2c_acpi_client_count(struct acpi_device *adev); From 2848ff28d180bd63a95da8e5dcbcdd76c1beeb7b Mon Sep 17 00:00:00 2001 From: Mitchell Levy Date: Mon, 12 Aug 2024 13:44:12 -0700 Subject: [PATCH 324/991] x86/fpu: Avoid writing LBR bit to IA32_XSS unless supported There are two distinct CPU features related to the use of XSAVES and LBR: whether LBR is itself supported and whether XSAVES supports LBR. The LBR subsystem correctly checks both in intel_pmu_arch_lbr_init(), but the XSTATE subsystem does not. The LBR bit is only removed from xfeatures_mask_independent when LBR is not supported by the CPU, but there is no validation of XSTATE support. If XSAVES does not support LBR the write to IA32_XSS causes a #GP fault, leaving the state of IA32_XSS unchanged, i.e. zero. The fault is handled with a warning and the boot continues. Consequently the next XRSTORS which tries to restore supervisor state fails with #GP because the RFBM has zero for all supervisor features, which does not match the XCOMP_BV field. As XFEATURE_MASK_FPSTATE includes supervisor features setting up the FPU causes a #GP, which ends up in fpu_reset_from_exception_fixup(). That fails due to the same problem resulting in recursive #GPs until the kernel runs out of stack space and double faults. Prevent this by storing the supported independent features in fpu_kernel_cfg during XSTATE initialization and use that cached value for retrieving the independent feature bits to be written into IA32_XSS. [ tglx: Massaged change log ] Fixes: f0dccc9da4c0 ("x86/fpu/xstate: Support dynamic supervisor feature for LBR") Suggested-by: Thomas Gleixner Signed-off-by: Mitchell Levy Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20240812-xsave-lbr-fix-v3-1-95bac1bf62f4@gmail.com --- arch/x86/include/asm/fpu/types.h | 7 +++++++ arch/x86/kernel/fpu/xstate.c | 3 +++ arch/x86/kernel/fpu/xstate.h | 4 ++-- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index eb17f31b06d25c..de16862bf230bc 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -591,6 +591,13 @@ struct fpu_state_config { * even without XSAVE support, i.e. legacy features FP + SSE */ u64 legacy_features; + /* + * @independent_features: + * + * Features that are supported by XSAVES, but not managed as part of + * the FPU core, such as LBR + */ + u64 independent_features; }; /* FPU state configuration information */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c5a026fee5e061..1339f8328db5ad 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -788,6 +788,9 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) goto out_disable; } + fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features & + XFEATURE_MASK_INDEPENDENT; + /* * Clear XSAVE features that are disabled in the normal CPUID. */ diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 2ee0b9c53dccc4..afb404cd205911 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -62,9 +62,9 @@ static inline u64 xfeatures_mask_supervisor(void) static inline u64 xfeatures_mask_independent(void) { if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) - return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR; + return fpu_kernel_cfg.independent_features & ~XFEATURE_MASK_LBR; - return XFEATURE_MASK_INDEPENDENT; + return fpu_kernel_cfg.independent_features; } /* XSAVE/XRSTOR wrapper functions */ From 3cd740b985963f874a1a094f1969e998b9d05554 Mon Sep 17 00:00:00 2001 From: Tom Hughes Date: Tue, 6 Aug 2024 12:40:52 +0100 Subject: [PATCH 325/991] netfilter: allow ipv6 fragments to arrive on different devices Commit 264640fc2c5f4 ("ipv6: distinguish frag queues by device for multicast and link-local packets") modified the ipv6 fragment reassembly logic to distinguish frag queues by device for multicast and link-local packets but in fact only the main reassembly code limits the use of the device to those address types and the netfilter reassembly code uses the device for all packets. This means that if fragments of a packet arrive on different interfaces then netfilter will fail to reassemble them and the fragments will be expired without going any further through the filters. Fixes: 648700f76b03 ("inet: frags: use rhashtables for reassembly units") Signed-off-by: Tom Hughes Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/nf_conntrack_reasm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 6f0844c9315d18..4120e67a8ce6bb 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -154,6 +154,10 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, }; struct inet_frag_queue *q; + if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST | + IPV6_ADDR_LINKLOCAL))) + key.iif = 0; + q = inet_frag_find(nf_frag->fqdir, &key); if (!q) return NULL; From 61119394631f219e23ce98bcc3eb993a64a8ea64 Mon Sep 17 00:00:00 2001 From: Celeste Liu Date: Thu, 27 Jun 2024 22:23:39 +0800 Subject: [PATCH 326/991] riscv: entry: always initialize regs->a0 to -ENOSYS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Otherwise when the tracer changes syscall number to -1, the kernel fails to initialize a0 with -ENOSYS and subsequently fails to return the error code of the failed syscall to userspace. For example, it will break strace syscall tampering. Fixes: 52449c17bdd1 ("riscv: entry: set a0 = -ENOSYS only when syscall != -1") Reported-by: "Dmitry V. Levin" Reviewed-by: Björn Töpel Cc: stable@vger.kernel.org Signed-off-by: Celeste Liu Link: https://lore.kernel.org/r/20240627142338.5114-2-CoelacanthusHex@gmail.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/traps.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 05a16b1f0aee85..51ebfd23e00764 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -319,6 +319,7 @@ void do_trap_ecall_u(struct pt_regs *regs) regs->epc += 4; regs->orig_a0 = regs->a0; + regs->a0 = -ENOSYS; riscv_v_vstate_discard(regs); @@ -328,8 +329,7 @@ void do_trap_ecall_u(struct pt_regs *regs) if (syscall >= 0 && syscall < NR_syscalls) syscall_handler(regs, syscall); - else if (syscall != -1) - regs->a0 = -ENOSYS; + /* * Ultimately, this value will get limited by KSTACK_OFFSET_MAX(), * so the maximum stack offset is 1k bytes (10 bits). From 57d76bc51fd80824bcc0c84a5b5ec944f1b51edd Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 8 May 2024 21:19:17 +0200 Subject: [PATCH 327/991] riscv: change XIP's kernel_map.size to be size of the entire kernel With XIP kernel, kernel_map.size is set to be only the size of data part of the kernel. This is inconsistent with "normal" kernel, who sets it to be the size of the entire kernel. More importantly, XIP kernel fails to boot if CONFIG_DEBUG_VIRTUAL is enabled, because there are checks on virtual addresses with the assumption that kernel_map.size is the size of the entire kernel (these checks are in arch/riscv/mm/physaddr.c). Change XIP's kernel_map.size to be the size of the entire kernel. Signed-off-by: Nam Cao Cc: # v6.1+ Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20240508191917.2892064-1-namcao@linutronix.de Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 8b698d9609e701..eb0649a61b4c1f 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -927,7 +927,7 @@ static void __init create_kernel_page_table(pgd_t *pgdir, PMD_SIZE, PAGE_KERNEL_EXEC); /* Map the data in RAM */ - end_va = kernel_map.virt_addr + XIP_OFFSET + kernel_map.size; + end_va = kernel_map.virt_addr + kernel_map.size; for (va = kernel_map.virt_addr + XIP_OFFSET; va < end_va; va += PMD_SIZE) create_pgd_mapping(pgdir, va, kernel_map.phys_addr + (va - (kernel_map.virt_addr + XIP_OFFSET)), @@ -1096,7 +1096,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) phys_ram_base = CONFIG_PHYS_RAM_BASE; kernel_map.phys_addr = (uintptr_t)CONFIG_PHYS_RAM_BASE; - kernel_map.size = (uintptr_t)(&_end) - (uintptr_t)(&_sdata); + kernel_map.size = (uintptr_t)(&_end) - (uintptr_t)(&_start); kernel_map.va_kernel_xip_pa_offset = kernel_map.virt_addr - kernel_map.xiprom; #else From a445699879f989f6700df81f497b70bf94cc6163 Mon Sep 17 00:00:00 2001 From: Haibo Xu Date: Mon, 5 Aug 2024 11:30:23 +0800 Subject: [PATCH 328/991] RISC-V: ACPI: NUMA: initialize all values of acpi_early_node_map to NUMA_NO_NODE Currently, only acpi_early_node_map[0] was initialized to NUMA_NO_NODE. To ensure all the values were properly initialized, switch to initialize all of them to NUMA_NO_NODE. Fixes: eabd9db64ea8 ("ACPI: RISCV: Add NUMA support based on SRAT and SLIT") Reported-by: Andrew Jones Suggested-by: Andrew Jones Signed-off-by: Haibo Xu Reviewed-by: Sunil V L Reviewed-by: Andrew Jones Reviewed-by: Hanjun Guo Link: https://lore.kernel.org/r/0d362a8ae50558b95685da4c821b2ae9e8cf78be.1722828421.git.haibo1.xu@intel.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/acpi_numa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/acpi_numa.c b/arch/riscv/kernel/acpi_numa.c index 0231482d6946aa..ff95aeebee3ebc 100644 --- a/arch/riscv/kernel/acpi_numa.c +++ b/arch/riscv/kernel/acpi_numa.c @@ -28,7 +28,7 @@ #include -static int acpi_early_node_map[NR_CPUS] __initdata = { NUMA_NO_NODE }; +static int acpi_early_node_map[NR_CPUS] __initdata = { [0 ... NR_CPUS - 1] = NUMA_NO_NODE }; int __init acpi_numa_get_nid(unsigned int cpu) { From c42e2f076769c9c1bc5f3f0aa1c2032558e76647 Mon Sep 17 00:00:00 2001 From: Evan Green Date: Fri, 9 Aug 2024 14:44:43 -0700 Subject: [PATCH 329/991] RISC-V: hwprobe: Add MISALIGNED_PERF key RISCV_HWPROBE_KEY_CPUPERF_0 was mistakenly flagged as a bitmask in hwprobe_key_is_bitmask(), when in reality it was an enum value. This causes problems when used in conjunction with RISCV_HWPROBE_WHICH_CPUS, since SLOW, FAST, and EMULATED have values whose bits overlap with each other. If the caller asked for the set of CPUs that was SLOW or EMULATED, the returned set would also include CPUs that were FAST. Introduce a new hwprobe key, RISCV_HWPROBE_KEY_MISALIGNED_PERF, which returns the same values in response to a direct query (with no flags), but is properly handled as an enumerated value. As a result, SLOW, FAST, and EMULATED are all correctly treated as distinct values under the new key when queried with the WHICH_CPUS flag. Leave the old key in place to avoid disturbing applications which may have already come to rely on the key, with or without its broken behavior with respect to the WHICH_CPUS flag. Fixes: e178bf146e4b ("RISC-V: hwprobe: Introduce which-cpus flag") Signed-off-by: Evan Green Reviewed-by: Charlie Jenkins Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20240809214444.3257596-2-evan@rivosinc.com Signed-off-by: Palmer Dabbelt --- Documentation/arch/riscv/hwprobe.rst | 20 +++++++++++++------- arch/riscv/include/asm/hwprobe.h | 2 +- arch/riscv/include/uapi/asm/hwprobe.h | 1 + arch/riscv/kernel/sys_hwprobe.c | 1 + 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/Documentation/arch/riscv/hwprobe.rst b/Documentation/arch/riscv/hwprobe.rst index 3db60a0911df65..a994eed75bde03 100644 --- a/Documentation/arch/riscv/hwprobe.rst +++ b/Documentation/arch/riscv/hwprobe.rst @@ -239,8 +239,13 @@ The following keys are defined: ratified in commit 98918c844281 ("Merge pull request #1217 from riscv/zawrs") of riscv-isa-manual. -* :c:macro:`RISCV_HWPROBE_KEY_CPUPERF_0`: A bitmask that contains performance - information about the selected set of processors. +* :c:macro:`RISCV_HWPROBE_KEY_CPUPERF_0`: Deprecated. Returns similar values to + :c:macro:`RISCV_HWPROBE_KEY_MISALIGNED_SCALAR_PERF`, but the key was + mistakenly classified as a bitmask rather than a value. + +* :c:macro:`RISCV_HWPROBE_KEY_MISALIGNED_SCALAR_PERF`: An enum value describing + the performance of misaligned scalar native word accesses on the selected set + of processors. * :c:macro:`RISCV_HWPROBE_MISALIGNED_UNKNOWN`: The performance of misaligned accesses is unknown. @@ -249,12 +254,13 @@ The following keys are defined: emulated via software, either in or below the kernel. These accesses are always extremely slow. - * :c:macro:`RISCV_HWPROBE_MISALIGNED_SLOW`: Misaligned accesses are slower - than equivalent byte accesses. Misaligned accesses may be supported - directly in hardware, or trapped and emulated by software. + * :c:macro:`RISCV_HWPROBE_MISALIGNED_SLOW`: Misaligned native word + sized accesses are slower than the equivalent quantity of byte accesses. + Misaligned accesses may be supported directly in hardware, or trapped and + emulated by software. - * :c:macro:`RISCV_HWPROBE_MISALIGNED_FAST`: Misaligned accesses are faster - than equivalent byte accesses. + * :c:macro:`RISCV_HWPROBE_MISALIGNED_FAST`: Misaligned native word + sized accesses are faster than the equivalent quantity of byte accesses. * :c:macro:`RISCV_HWPROBE_MISALIGNED_UNSUPPORTED`: Misaligned accesses are not supported at all and will generate a misaligned address fault. diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h index ef01c182af2b05..ffb9484531af7e 100644 --- a/arch/riscv/include/asm/hwprobe.h +++ b/arch/riscv/include/asm/hwprobe.h @@ -8,7 +8,7 @@ #include -#define RISCV_HWPROBE_MAX_KEY 8 +#define RISCV_HWPROBE_MAX_KEY 9 static inline bool riscv_hwprobe_key_is_valid(__s64 key) { diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h index b706c8e47b027b..6357530842752a 100644 --- a/arch/riscv/include/uapi/asm/hwprobe.h +++ b/arch/riscv/include/uapi/asm/hwprobe.h @@ -82,6 +82,7 @@ struct riscv_hwprobe { #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE 6 #define RISCV_HWPROBE_KEY_HIGHEST_VIRT_ADDRESS 7 #define RISCV_HWPROBE_KEY_TIME_CSR_FREQ 8 +#define RISCV_HWPROBE_KEY_MISALIGNED_SCALAR_PERF 9 /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */ /* Flags */ diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c index 8d1b5c35d2a731..2d0f4f6a32c3fc 100644 --- a/arch/riscv/kernel/sys_hwprobe.c +++ b/arch/riscv/kernel/sys_hwprobe.c @@ -225,6 +225,7 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair, break; case RISCV_HWPROBE_KEY_CPUPERF_0: + case RISCV_HWPROBE_KEY_MISALIGNED_SCALAR_PERF: pair->value = hwprobe_misaligned(cpus); break; From 1f5288874de776412041022607513ffac74ae1a6 Mon Sep 17 00:00:00 2001 From: Evan Green Date: Fri, 9 Aug 2024 14:44:44 -0700 Subject: [PATCH 330/991] RISC-V: hwprobe: Add SCALAR to misaligned perf defines In preparation for misaligned vector performance hwprobe keys, rename the hwprobe key values associated with misaligned scalar accesses to include the term SCALAR. Leave the old defines in place to maintain source compatibility. This change is intended to be a functional no-op. Signed-off-by: Evan Green Reviewed-by: Charlie Jenkins Link: https://lore.kernel.org/r/20240809214444.3257596-3-evan@rivosinc.com Signed-off-by: Palmer Dabbelt --- Documentation/arch/riscv/hwprobe.rst | 28 ++++++++++++---------- arch/riscv/include/uapi/asm/hwprobe.h | 5 ++++ arch/riscv/kernel/sys_hwprobe.c | 10 ++++---- arch/riscv/kernel/traps_misaligned.c | 6 ++--- arch/riscv/kernel/unaligned_access_speed.c | 12 +++++----- 5 files changed, 34 insertions(+), 27 deletions(-) diff --git a/Documentation/arch/riscv/hwprobe.rst b/Documentation/arch/riscv/hwprobe.rst index a994eed75bde03..85b709257918ad 100644 --- a/Documentation/arch/riscv/hwprobe.rst +++ b/Documentation/arch/riscv/hwprobe.rst @@ -247,23 +247,25 @@ The following keys are defined: the performance of misaligned scalar native word accesses on the selected set of processors. - * :c:macro:`RISCV_HWPROBE_MISALIGNED_UNKNOWN`: The performance of misaligned - accesses is unknown. + * :c:macro:`RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN`: The performance of + misaligned scalar accesses is unknown. - * :c:macro:`RISCV_HWPROBE_MISALIGNED_EMULATED`: Misaligned accesses are - emulated via software, either in or below the kernel. These accesses are - always extremely slow. + * :c:macro:`RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED`: Misaligned scalar + accesses are emulated via software, either in or below the kernel. These + accesses are always extremely slow. - * :c:macro:`RISCV_HWPROBE_MISALIGNED_SLOW`: Misaligned native word - sized accesses are slower than the equivalent quantity of byte accesses. - Misaligned accesses may be supported directly in hardware, or trapped and - emulated by software. + * :c:macro:`RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW`: Misaligned scalar native + word sized accesses are slower than the equivalent quantity of byte + accesses. Misaligned accesses may be supported directly in hardware, or + trapped and emulated by software. - * :c:macro:`RISCV_HWPROBE_MISALIGNED_FAST`: Misaligned native word - sized accesses are faster than the equivalent quantity of byte accesses. + * :c:macro:`RISCV_HWPROBE_MISALIGNED_SCALAR_FAST`: Misaligned scalar native + word sized accesses are faster than the equivalent quantity of byte + accesses. - * :c:macro:`RISCV_HWPROBE_MISALIGNED_UNSUPPORTED`: Misaligned accesses are - not supported at all and will generate a misaligned address fault. + * :c:macro:`RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED`: Misaligned scalar + accesses are not supported at all and will generate a misaligned address + fault. * :c:macro:`RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE`: An unsigned int which represents the size of the Zicboz block in bytes. diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h index 6357530842752a..1e153cda57db85 100644 --- a/arch/riscv/include/uapi/asm/hwprobe.h +++ b/arch/riscv/include/uapi/asm/hwprobe.h @@ -83,6 +83,11 @@ struct riscv_hwprobe { #define RISCV_HWPROBE_KEY_HIGHEST_VIRT_ADDRESS 7 #define RISCV_HWPROBE_KEY_TIME_CSR_FREQ 8 #define RISCV_HWPROBE_KEY_MISALIGNED_SCALAR_PERF 9 +#define RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN 0 +#define RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED 1 +#define RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW 2 +#define RISCV_HWPROBE_MISALIGNED_SCALAR_FAST 3 +#define RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED 4 /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */ /* Flags */ diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c index 2d0f4f6a32c3fc..cea0ca2bf2a25e 100644 --- a/arch/riscv/kernel/sys_hwprobe.c +++ b/arch/riscv/kernel/sys_hwprobe.c @@ -178,13 +178,13 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus) perf = this_perf; if (perf != this_perf) { - perf = RISCV_HWPROBE_MISALIGNED_UNKNOWN; + perf = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN; break; } } if (perf == -1ULL) - return RISCV_HWPROBE_MISALIGNED_UNKNOWN; + return RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN; return perf; } @@ -192,12 +192,12 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus) static u64 hwprobe_misaligned(const struct cpumask *cpus) { if (IS_ENABLED(CONFIG_RISCV_EFFICIENT_UNALIGNED_ACCESS)) - return RISCV_HWPROBE_MISALIGNED_FAST; + return RISCV_HWPROBE_MISALIGNED_SCALAR_FAST; if (IS_ENABLED(CONFIG_RISCV_EMULATED_UNALIGNED_ACCESS) && unaligned_ctl_available()) - return RISCV_HWPROBE_MISALIGNED_EMULATED; + return RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED; - return RISCV_HWPROBE_MISALIGNED_SLOW; + return RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW; } #endif diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c index b62d5a2f4541e7..192cd5603e95f9 100644 --- a/arch/riscv/kernel/traps_misaligned.c +++ b/arch/riscv/kernel/traps_misaligned.c @@ -338,7 +338,7 @@ int handle_misaligned_load(struct pt_regs *regs) perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, addr); #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS - *this_cpu_ptr(&misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_EMULATED; + *this_cpu_ptr(&misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED; #endif if (!unaligned_enabled) @@ -532,13 +532,13 @@ static bool check_unaligned_access_emulated(int cpu) unsigned long tmp_var, tmp_val; bool misaligned_emu_detected; - *mas_ptr = RISCV_HWPROBE_MISALIGNED_UNKNOWN; + *mas_ptr = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN; __asm__ __volatile__ ( " "REG_L" %[tmp], 1(%[ptr])\n" : [tmp] "=r" (tmp_val) : [ptr] "r" (&tmp_var) : "memory"); - misaligned_emu_detected = (*mas_ptr == RISCV_HWPROBE_MISALIGNED_EMULATED); + misaligned_emu_detected = (*mas_ptr == RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED); /* * If unaligned_ctl is already set, this means that we detected that all * CPUS uses emulated misaligned access at boot time. If that changed diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c index a9a6bcb02acf11..160628a2116de4 100644 --- a/arch/riscv/kernel/unaligned_access_speed.c +++ b/arch/riscv/kernel/unaligned_access_speed.c @@ -34,9 +34,9 @@ static int check_unaligned_access(void *param) struct page *page = param; void *dst; void *src; - long speed = RISCV_HWPROBE_MISALIGNED_SLOW; + long speed = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW; - if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN) + if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) return 0; /* Make an unaligned destination buffer. */ @@ -95,14 +95,14 @@ static int check_unaligned_access(void *param) } if (word_cycles < byte_cycles) - speed = RISCV_HWPROBE_MISALIGNED_FAST; + speed = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST; ratio = div_u64((byte_cycles * 100), word_cycles); pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n", cpu, ratio / 100, ratio % 100, - (speed == RISCV_HWPROBE_MISALIGNED_FAST) ? "fast" : "slow"); + (speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST) ? "fast" : "slow"); per_cpu(misaligned_access_speed, cpu) = speed; @@ -110,7 +110,7 @@ static int check_unaligned_access(void *param) * Set the value of fast_misaligned_access of a CPU. These operations * are atomic to avoid race conditions. */ - if (speed == RISCV_HWPROBE_MISALIGNED_FAST) + if (speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST) cpumask_set_cpu(cpu, &fast_misaligned_access); else cpumask_clear_cpu(cpu, &fast_misaligned_access); @@ -188,7 +188,7 @@ static int riscv_online_cpu(unsigned int cpu) static struct page *buf; /* We are already set since the last check */ - if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN) + if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) goto exit; buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER); From ff9f065318e17a1a97981d9e535fcfc6ce5d5614 Mon Sep 17 00:00:00 2001 From: YR Yang Date: Thu, 1 Aug 2024 16:43:26 +0800 Subject: [PATCH 331/991] ASoC: mediatek: mt8188: Mark AFE_DAC_CON0 register as volatile Add AFE Control Register 0 to the volatile_register. AFE_DAC_CON0 can be modified by both the SOF and ALSA drivers. If this register is read and written in cache mode, the cached value might not reflect the actual value when the register is modified by another driver. It can cause playback or capture failures. Therefore, it is necessary to add AFE_DAC_CON0 to the list of volatile registers. Signed-off-by: YR Yang Reviewed-by: Fei Shao Reviewed-by: Trevor Wu Link: https://patch.msgid.link/20240801084326.1472-1-yr.yang@mediatek.com Signed-off-by: Mark Brown --- sound/soc/mediatek/mt8188/mt8188-afe-pcm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/mediatek/mt8188/mt8188-afe-pcm.c b/sound/soc/mediatek/mt8188/mt8188-afe-pcm.c index ccb6c1f3adc7d9..73e5c63aeec878 100644 --- a/sound/soc/mediatek/mt8188/mt8188-afe-pcm.c +++ b/sound/soc/mediatek/mt8188/mt8188-afe-pcm.c @@ -2748,6 +2748,7 @@ static bool mt8188_is_volatile_reg(struct device *dev, unsigned int reg) case AFE_ASRC12_NEW_CON9: case AFE_LRCK_CNT: case AFE_DAC_MON0: + case AFE_DAC_CON0: case AFE_DL2_CUR: case AFE_DL3_CUR: case AFE_DL6_CUR: From d1a7b382a9d3f0f3e5a80e0be2991c075fa4f618 Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Tue, 6 Aug 2024 16:43:24 +0100 Subject: [PATCH 332/991] netfilter: nfnetlink: Initialise extack before use in ACKs Add missing extack initialisation when ACKing BATCH_BEGIN and BATCH_END. Fixes: bf2ac490d28c ("netfilter: nfnetlink: Handle ACK flags for batch messages") Signed-off-by: Donald Hunter Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 4abf660c7baff0..932b3ddb34f13c 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -427,8 +427,10 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, nfnl_unlock(subsys_id); - if (nlh->nlmsg_flags & NLM_F_ACK) + if (nlh->nlmsg_flags & NLM_F_ACK) { + memset(&extack, 0, sizeof(extack)); nfnl_err_add(&err_list, nlh, 0, &extack); + } while (skb->len >= nlmsg_total_size(0)) { int msglen, type; @@ -577,6 +579,7 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, ss->abort(net, oskb, NFNL_ABORT_NONE); netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL); } else if (nlh->nlmsg_flags & NLM_F_ACK) { + memset(&extack, 0, sizeof(extack)); nfnl_err_add(&err_list, nlh, 0, &extack); } } else { From e9767137308daf906496613fd879808a07f006a2 Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Tue, 6 Aug 2024 17:16:37 +0100 Subject: [PATCH 333/991] netfilter: flowtable: initialise extack before use Fix missing initialisation of extack in flow offload. Fixes: c29f74e0df7a ("netfilter: nf_flow_table: hardware offload support") Signed-off-by: Donald Hunter Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index ff1a4e36c2b5dc..e06bc36f49fe74 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -841,8 +841,8 @@ static int nf_flow_offload_tuple(struct nf_flowtable *flowtable, struct list_head *block_cb_list) { struct flow_cls_offload cls_flow = {}; + struct netlink_ext_ack extack = {}; struct flow_block_cb *block_cb; - struct netlink_ext_ack extack; __be16 proto = ETH_P_ALL; int err, i = 0; From 7d8dc1c7be8d3509e8f5164dd5df64c8e34d7eeb Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 7 Aug 2024 21:28:41 +0200 Subject: [PATCH 334/991] netfilter: nf_queue: drop packets with cloned unconfirmed conntracks Conntrack assumes an unconfirmed entry (not yet committed to global hash table) has a refcount of 1 and is not visible to other cores. With multicast forwarding this assumption breaks down because such skbs get cloned after being picked up, i.e. ct->use refcount is > 1. Likewise, bridge netfilter will clone broad/mutlicast frames and all frames in case they need to be flood-forwarded during learning phase. For ip multicast forwarding or plain bridge flood-forward this will "work" because packets don't leave softirq and are implicitly serialized. With nfqueue this no longer holds true, the packets get queued and can be reinjected in arbitrary ways. Disable this feature, I see no other solution. After this patch, nfqueue cannot queue packets except the last multicast/broadcast packet. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter_hooks.c | 6 +++++- net/netfilter/nfnetlink_queue.c | 35 +++++++++++++++++++++++++++++++-- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 09f6a773a7080a..8f9c19d992ac5c 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -622,8 +622,12 @@ static unsigned int br_nf_local_in(void *priv, if (likely(nf_ct_is_confirmed(ct))) return NF_ACCEPT; + if (WARN_ON_ONCE(refcount_read(&nfct->use) != 1)) { + nf_reset_ct(skb); + return NF_ACCEPT; + } + WARN_ON_ONCE(skb_shared(skb)); - WARN_ON_ONCE(refcount_read(&nfct->use) != 1); /* We can't call nf_confirm here, it would create a dependency * on nf_conntrack module. diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 55e28e1da66ec6..e0716da256bf55 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -820,10 +820,41 @@ static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) static const unsigned long flags = IPS_CONFIRMED | IPS_DYING; - const struct nf_conn *ct = (void *)skb_nfct(entry->skb); + struct nf_conn *ct = (void *)skb_nfct(entry->skb); + unsigned long status; + unsigned int use; - if (ct && ((ct->status & flags) == IPS_DYING)) + if (!ct) + return false; + + status = READ_ONCE(ct->status); + if ((status & flags) == IPS_DYING) return true; + + if (status & IPS_CONFIRMED) + return false; + + /* in some cases skb_clone() can occur after initial conntrack + * pickup, but conntrack assumes exclusive skb->_nfct ownership for + * unconfirmed entries. + * + * This happens for br_netfilter and with ip multicast routing. + * We can't be solved with serialization here because one clone could + * have been queued for local delivery. + */ + use = refcount_read(&ct->ct_general.use); + if (likely(use == 1)) + return false; + + /* Can't decrement further? Exclusive ownership. */ + if (!refcount_dec_not_one(&ct->ct_general.use)) + return false; + + skb_set_nfct(entry->skb, 0); + /* No nf_ct_put(): we already decremented .use and it cannot + * drop down to 0. + */ + return true; #endif return false; } From ea2306f0330c59ac8cd6ba13193497f0a6a02684 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 8 Aug 2024 23:14:43 +0200 Subject: [PATCH 335/991] selftests: netfilter: add test for br_netfilter+conntrack+queue combination Trigger cloned skbs leaving softirq protection. This triggers splat without the preceeding change ("netfilter: nf_queue: drop packets with cloned unconfirmed conntracks"): WARNING: at net/netfilter/nf_conntrack_core.c:1198 __nf_conntrack_confirm.. because local delivery and forwarding will race for confirmation. Based on a reproducer script from Yi Chen. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- .../testing/selftests/net/netfilter/Makefile | 1 + .../net/netfilter/br_netfilter_queue.sh | 78 +++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100755 tools/testing/selftests/net/netfilter/br_netfilter_queue.sh diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile index 47945b2b3f9258..d13fb5ea3e8942 100644 --- a/tools/testing/selftests/net/netfilter/Makefile +++ b/tools/testing/selftests/net/netfilter/Makefile @@ -7,6 +7,7 @@ MNL_CFLAGS := $(shell $(HOSTPKG_CONFIG) --cflags libmnl 2>/dev/null) MNL_LDLIBS := $(shell $(HOSTPKG_CONFIG) --libs libmnl 2>/dev/null || echo -lmnl) TEST_PROGS := br_netfilter.sh bridge_brouter.sh +TEST_PROGS += br_netfilter_queue.sh TEST_PROGS += conntrack_icmp_related.sh TEST_PROGS += conntrack_ipip_mtu.sh TEST_PROGS += conntrack_tcp_unreplied.sh diff --git a/tools/testing/selftests/net/netfilter/br_netfilter_queue.sh b/tools/testing/selftests/net/netfilter/br_netfilter_queue.sh new file mode 100755 index 00000000000000..6a764d70ab06f9 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/br_netfilter_queue.sh @@ -0,0 +1,78 @@ +#!/bin/bash + +source lib.sh + +checktool "nft --version" "run test without nft tool" + +cleanup() { + cleanup_all_ns +} + +setup_ns c1 c2 c3 sender + +trap cleanup EXIT + +nf_queue_wait() +{ + grep -q "^ *$1 " "/proc/self/net/netfilter/nfnetlink_queue" +} + +port_add() { + ns="$1" + dev="$2" + a="$3" + + ip link add name "$dev" type veth peer name "$dev" netns "$ns" + + ip -net "$ns" addr add 192.168.1."$a"/24 dev "$dev" + ip -net "$ns" link set "$dev" up + + ip link set "$dev" master br0 + ip link set "$dev" up +} + +[ "${1}" != "run" ] && { unshare -n "${0}" run; exit $?; } + +ip link add br0 type bridge +ip addr add 192.168.1.254/24 dev br0 + +port_add "$c1" "c1" 1 +port_add "$c2" "c2" 2 +port_add "$c3" "c3" 3 +port_add "$sender" "sender" 253 + +ip link set br0 up + +modprobe -q br_netfilter + +sysctl net.bridge.bridge-nf-call-iptables=1 || exit 1 + +ip netns exec "$sender" ping -I sender -c1 192.168.1.1 || exit 1 +ip netns exec "$sender" ping -I sender -c1 192.168.1.2 || exit 2 +ip netns exec "$sender" ping -I sender -c1 192.168.1.3 || exit 3 + +nft -f /dev/stdin < /dev/null & + +busywait 5000 nf_queue_wait + +for i in $(seq 1 5); do conntrack -F > /dev/null 2> /dev/null; sleep 0.1 ; done & +ip netns exec "$sender" ping -I sender -f -c 50 -b 192.168.1.255 + +read t < /proc/sys/kernel/tainted +if [ "$t" -eq 0 ];then + echo PASS: kernel not tainted +else + echo ERROR: kernel is tainted + exit 1 +fi + +exit 0 From e0b6648b0446e59522819c75ba1dcb09e68d3e94 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 9 Aug 2024 15:07:30 +0200 Subject: [PATCH 336/991] netfilter: nf_tables: Audit log dump reset after the fact In theory, dumpreset may fail and invalidate the preceeding log message. Fix this and use the occasion to prepare for object reset locking, which benefits from a few unrelated changes: * Add an early call to nfnetlink_unicast if not resetting which effectively skips the audit logging but also unindents it. * Extract the table's name from the netlink attribute (which is verified via earlier table lookup) to not rely upon validity of the looked up table pointer. * Do not use local variable family, it will vanish. Fixes: 8e6cf365e1d5 ("audit: log nftables configuration change events") Signed-off-by: Phil Sutter Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 481ee78e77bcf9..4fa132715fcc29 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8055,6 +8055,7 @@ static int nf_tables_dump_obj_done(struct netlink_callback *cb) static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { + const struct nftables_pernet *nft_net = nft_pernet(info->net); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); u8 family = info->nfmsg->nfgen_family; @@ -8064,6 +8065,7 @@ static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, struct sk_buff *skb2; bool reset = false; u32 objtype; + char *buf; int err; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { @@ -8102,27 +8104,23 @@ static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET) reset = true; - if (reset) { - const struct nftables_pernet *nft_net; - char *buf; - - nft_net = nft_pernet(net); - buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, nft_net->base_seq); - - audit_log_nfcfg(buf, - family, - 1, - AUDIT_NFT_OP_OBJ_RESET, - GFP_ATOMIC); - kfree(buf); - } - err = nf_tables_fill_obj_info(skb2, net, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0, family, table, obj, reset); if (err < 0) goto err_fill_obj_info; + if (!reset) + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); + + buf = kasprintf(GFP_ATOMIC, "%.*s:%u", + nla_len(nla[NFTA_OBJ_TABLE]), + (char *)nla_data(nla[NFTA_OBJ_TABLE]), + nft_net->base_seq); + audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1, + AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC); + kfree(buf); + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); err_fill_obj_info: From 69fc3e9e90f1afc11f4015e6b75d18ab9acee348 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 9 Aug 2024 15:07:31 +0200 Subject: [PATCH 337/991] netfilter: nf_tables: Introduce nf_tables_getobj_single Outsource the reply skb preparation for non-dump getrule requests into a distinct function. Prep work for object reset locking. Signed-off-by: Phil Sutter Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 75 ++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 31 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 4fa132715fcc29..c12c9cae784d7f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8052,10 +8052,10 @@ static int nf_tables_dump_obj_done(struct netlink_callback *cb) } /* called with rcu_read_lock held */ -static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, - const struct nlattr * const nla[]) +static struct sk_buff * +nf_tables_getobj_single(u32 portid, const struct nfnl_info *info, + const struct nlattr * const nla[], bool reset) { - const struct nftables_pernet *nft_net = nft_pernet(info->net); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); u8 family = info->nfmsg->nfgen_family; @@ -8063,52 +8063,69 @@ static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, struct net *net = info->net; struct nft_object *obj; struct sk_buff *skb2; - bool reset = false; u32 objtype; - char *buf; int err; - if (info->nlh->nlmsg_flags & NLM_F_DUMP) { - struct netlink_dump_control c = { - .start = nf_tables_dump_obj_start, - .dump = nf_tables_dump_obj, - .done = nf_tables_dump_obj_done, - .module = THIS_MODULE, - .data = (void *)nla, - }; - - return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); - } - if (!nla[NFTA_OBJ_NAME] || !nla[NFTA_OBJ_TYPE]) - return -EINVAL; + return ERR_PTR(-EINVAL); table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]); - return PTR_ERR(table); + return ERR_CAST(table); } objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); obj = nft_obj_lookup(net, table, nla[NFTA_OBJ_NAME], objtype, genmask); if (IS_ERR(obj)) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]); - return PTR_ERR(obj); + return ERR_CAST(obj); } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb2) - return -ENOMEM; + return ERR_PTR(-ENOMEM); + + err = nf_tables_fill_obj_info(skb2, net, portid, + info->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0, + family, table, obj, reset); + if (err < 0) { + kfree_skb(skb2); + return ERR_PTR(err); + } + + return skb2; +} + +static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) +{ + struct nftables_pernet *nft_net = nft_pernet(info->net); + u32 portid = NETLINK_CB(skb).portid; + struct net *net = info->net; + struct sk_buff *skb2; + bool reset = false; + char *buf; + + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .start = nf_tables_dump_obj_start, + .dump = nf_tables_dump_obj, + .done = nf_tables_dump_obj_done, + .module = THIS_MODULE, + .data = (void *)nla, + }; + + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); + } if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET) reset = true; - err = nf_tables_fill_obj_info(skb2, net, NETLINK_CB(skb).portid, - info->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0, - family, table, obj, reset); - if (err < 0) - goto err_fill_obj_info; + skb2 = nf_tables_getobj_single(portid, info, nla, reset); + if (IS_ERR(skb2)) + return PTR_ERR(skb2); if (!reset) return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); @@ -8121,11 +8138,7 @@ static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC); kfree(buf); - return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); - -err_fill_obj_info: - kfree_skb(skb2); - return err; + return nfnetlink_unicast(skb2, net, portid); } static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) From bd662c4218f9648e888bebde9468146965f3f8a0 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 9 Aug 2024 15:07:32 +0200 Subject: [PATCH 338/991] netfilter: nf_tables: Add locking for NFT_MSG_GETOBJ_RESET requests Objects' dump callbacks are not concurrency-safe per-se with reset bit set. If two CPUs perform a reset at the same time, at least counter and quota objects suffer from value underrun. Prevent this by introducing dedicated locking callbacks for nfnetlink and the asynchronous dump handling to serialize access. Fixes: 43da04a593d8 ("netfilter: nf_tables: atomic dump and reset for stateful objects") Signed-off-by: Phil Sutter Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 72 ++++++++++++++++++++++++++++------- 1 file changed, 59 insertions(+), 13 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c12c9cae784d7f..0a2f7934695896 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8020,6 +8020,19 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +static int nf_tables_dumpreset_obj(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk)); + int ret; + + mutex_lock(&nft_net->commit_mutex); + ret = nf_tables_dump_obj(skb, cb); + mutex_unlock(&nft_net->commit_mutex); + + return ret; +} + static int nf_tables_dump_obj_start(struct netlink_callback *cb) { struct nft_obj_dump_ctx *ctx = (void *)cb->ctx; @@ -8036,12 +8049,18 @@ static int nf_tables_dump_obj_start(struct netlink_callback *cb) if (nla[NFTA_OBJ_TYPE]) ctx->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); - if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET) - ctx->reset = true; - return 0; } +static int nf_tables_dumpreset_obj_start(struct netlink_callback *cb) +{ + struct nft_obj_dump_ctx *ctx = (void *)cb->ctx; + + ctx->reset = true; + + return nf_tables_dump_obj_start(cb); +} + static int nf_tables_dump_obj_done(struct netlink_callback *cb) { struct nft_obj_dump_ctx *ctx = (void *)cb->ctx; @@ -8100,18 +8119,43 @@ nf_tables_getobj_single(u32 portid, const struct nfnl_info *info, static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) +{ + u32 portid = NETLINK_CB(skb).portid; + struct sk_buff *skb2; + + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .start = nf_tables_dump_obj_start, + .dump = nf_tables_dump_obj, + .done = nf_tables_dump_obj_done, + .module = THIS_MODULE, + .data = (void *)nla, + }; + + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); + } + + skb2 = nf_tables_getobj_single(portid, info, nla, false); + if (IS_ERR(skb2)) + return PTR_ERR(skb2); + + return nfnetlink_unicast(skb2, info->net, portid); +} + +static int nf_tables_getobj_reset(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) { struct nftables_pernet *nft_net = nft_pernet(info->net); u32 portid = NETLINK_CB(skb).portid; struct net *net = info->net; struct sk_buff *skb2; - bool reset = false; char *buf; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { - .start = nf_tables_dump_obj_start, - .dump = nf_tables_dump_obj, + .start = nf_tables_dumpreset_obj_start, + .dump = nf_tables_dumpreset_obj, .done = nf_tables_dump_obj_done, .module = THIS_MODULE, .data = (void *)nla, @@ -8120,16 +8164,18 @@ static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } - if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET) - reset = true; + if (!try_module_get(THIS_MODULE)) + return -EINVAL; + rcu_read_unlock(); + mutex_lock(&nft_net->commit_mutex); + skb2 = nf_tables_getobj_single(portid, info, nla, true); + mutex_unlock(&nft_net->commit_mutex); + rcu_read_lock(); + module_put(THIS_MODULE); - skb2 = nf_tables_getobj_single(portid, info, nla, reset); if (IS_ERR(skb2)) return PTR_ERR(skb2); - if (!reset) - return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); - buf = kasprintf(GFP_ATOMIC, "%.*s:%u", nla_len(nla[NFTA_OBJ_TABLE]), (char *)nla_data(nla[NFTA_OBJ_TABLE]), @@ -9421,7 +9467,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_obj_policy, }, [NFT_MSG_GETOBJ_RESET] = { - .call = nf_tables_getobj, + .call = nf_tables_getobj_reset, .type = NFNL_CB_RCU, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, From 14d069d92951a3e150c0a81f2ca3b93e54da913b Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 13 Aug 2024 09:12:53 -0700 Subject: [PATCH 339/991] i2c: tegra: Do not mark ACPI devices as irq safe On ACPI machines, the tegra i2c module encounters an issue due to a mutex being called inside a spinlock. This leads to the following bug: BUG: sleeping function called from invalid context at kernel/locking/mutex.c:585 ... Call trace: __might_sleep __mutex_lock_common mutex_lock_nested acpi_subsys_runtime_resume rpm_resume tegra_i2c_xfer The problem arises because during __pm_runtime_resume(), the spinlock &dev->power.lock is acquired before rpm_resume() is called. Later, rpm_resume() invokes acpi_subsys_runtime_resume(), which relies on mutexes, triggering the error. To address this issue, devices on ACPI are now marked as not IRQ-safe, considering the dependency of acpi_subsys_runtime_resume() on mutexes. Fixes: bd2fdedbf2ba ("i2c: tegra: Add the ACPI support") Cc: # v5.17+ Co-developed-by: Michael van der Westhuizen Signed-off-by: Michael van der Westhuizen Signed-off-by: Breno Leitao Reviewed-by: Dmitry Osipenko Reviewed-by: Andy Shevchenko Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-tegra.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c index 85b31edc558dff..1df5b42041427c 100644 --- a/drivers/i2c/busses/i2c-tegra.c +++ b/drivers/i2c/busses/i2c-tegra.c @@ -1802,9 +1802,9 @@ static int tegra_i2c_probe(struct platform_device *pdev) * domain. * * VI I2C device shouldn't be marked as IRQ-safe because VI I2C won't - * be used for atomic transfers. + * be used for atomic transfers. ACPI device is not IRQ safe also. */ - if (!IS_VI(i2c_dev)) + if (!IS_VI(i2c_dev) && !has_acpi_companion(i2c_dev->dev)) pm_runtime_irq_safe(i2c_dev->dev); pm_runtime_enable(i2c_dev->dev); From 1f7574a1f9a892dd79e0dfc03f38573e9c399ec2 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Mon, 15 Jul 2024 22:17:44 +0300 Subject: [PATCH 340/991] arm64: dts: qcom: disable GPU on x1e80100 by default The GPU on X1E80100 requires ZAP 'shader' file to be useful. Since the file is signed by the OEM keys and might be not available by default, disable the GPU node and drop the firmware name from the x1e80100.dtsi file. Devices not being fused to use OEM keys can specify generic location at `qcom/x1e80100/gen70500_zap.mbn` while enabling the GPU. The CRD and QCP were lucky enough to work with the default settings, so reenable the GPU on those platforms and provide correct firmware-name (including the SoC subdir). Fixes: 721e38301b79 ("arm64: dts: qcom: x1e80100: Add gpu support") Cc: Akhil P Oommen Reviewed-by: Konrad Dybcio Signed-off-by: Dmitry Baryshkov Reviewed-by: Caleb Connolly Reviewed-by: Akhil P Oommen Link: https://lore.kernel.org/r/20240715-x1e8-zap-name-v3-1-e7a5258c3c2e@linaro.org Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/x1e80100-crd.dts | 8 ++++++++ arch/arm64/boot/dts/qcom/x1e80100-qcp.dts | 8 ++++++++ arch/arm64/boot/dts/qcom/x1e80100.dtsi | 3 ++- 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/qcom/x1e80100-crd.dts b/arch/arm64/boot/dts/qcom/x1e80100-crd.dts index 82f34dfe409057..e17ab8251e2a55 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-crd.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-crd.dts @@ -636,6 +636,14 @@ }; }; +&gpu { + status = "okay"; + + zap-shader { + firmware-name = "qcom/x1e80100/gen70500_zap.mbn"; + }; +}; + &i2c0 { clock-frequency = <400000>; diff --git a/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts b/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts index 2dcf2a17511dbf..8098e6730ae52f 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts @@ -606,6 +606,14 @@ }; }; +&gpu { + status = "okay"; + + zap-shader { + firmware-name = "qcom/x1e80100/gen70500_zap.mbn"; + }; +}; + &lpass_tlmm { spkr_01_sd_n_active: spkr-01-sd-n-active-state { pins = "gpio12"; diff --git a/arch/arm64/boot/dts/qcom/x1e80100.dtsi b/arch/arm64/boot/dts/qcom/x1e80100.dtsi index c13811a4ef909d..29cb3dddbb6fa1 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100.dtsi +++ b/arch/arm64/boot/dts/qcom/x1e80100.dtsi @@ -3167,9 +3167,10 @@ interconnects = <&gem_noc MASTER_GFX3D 0 &mc_virt SLAVE_EBI1 0>; interconnect-names = "gfx-mem"; + status = "disabled"; + zap-shader { memory-region = <&gpu_microcode_mem>; - firmware-name = "qcom/gen70500_zap.mbn"; }; gpu_opp_table: opp-table { From dfbe93f32c12f5628bd83303e10ba63621c259ae Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Tue, 16 Jul 2024 12:35:03 +0200 Subject: [PATCH 341/991] arm64: dts: qcom: x1e80100: Fix Adreno SMMU global interrupt Fix the unfortunate off-by-one. Fixes: 721e38301b79 ("arm64: dts: qcom: x1e80100: Add gpu support") Signed-off-by: Konrad Dybcio Link: https://lore.kernel.org/r/20240716-topic-h_bits-v1-1-f6c5d3ff982c@linaro.org Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/x1e80100.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/qcom/x1e80100.dtsi b/arch/arm64/boot/dts/qcom/x1e80100.dtsi index 29cb3dddbb6fa1..cd732ef88cd8e0 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100.dtsi +++ b/arch/arm64/boot/dts/qcom/x1e80100.dtsi @@ -3301,7 +3301,7 @@ reg = <0x0 0x03da0000 0x0 0x40000>; #iommu-cells = <2>; #global-interrupts = <1>; - interrupts = , + interrupts = , , , , From 9960085a3a82c58d3323c1c20b991db6045063b0 Mon Sep 17 00:00:00 2001 From: Murali Nalajala Date: Wed, 14 Aug 2024 15:32:44 -0700 Subject: [PATCH 342/991] firmware: qcom: scm: Mark get_wq_ctx() as atomic call Currently get_wq_ctx() is wrongly configured as a standard call. When two SMC calls are in sleep and one SMC wakes up, it calls get_wq_ctx() to resume the corresponding sleeping thread. But if get_wq_ctx() is interrupted, goes to sleep and another SMC call is waiting to be allocated a waitq context, it leads to a deadlock. To avoid this get_wq_ctx() must be an atomic call and can't be a standard SMC call. Hence mark get_wq_ctx() as a fast call. Fixes: 6bf325992236 ("firmware: qcom: scm: Add wait-queue handling logic") Cc: stable@vger.kernel.org Signed-off-by: Murali Nalajala Signed-off-by: Unnathi Chalicheemala Reviewed-by: Elliot Berman Link: https://lore.kernel.org/r/20240814223244.40081-1-quic_uchalich@quicinc.com Signed-off-by: Bjorn Andersson --- drivers/firmware/qcom/qcom_scm-smc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/qcom/qcom_scm-smc.c b/drivers/firmware/qcom/qcom_scm-smc.c index dca5f3f1883bb4..2b4c2826f57251 100644 --- a/drivers/firmware/qcom/qcom_scm-smc.c +++ b/drivers/firmware/qcom/qcom_scm-smc.c @@ -73,7 +73,7 @@ int scm_get_wq_ctx(u32 *wq_ctx, u32 *flags, u32 *more_pending) struct arm_smccc_res get_wq_res; struct arm_smccc_args get_wq_ctx = {0}; - get_wq_ctx.args[0] = ARM_SMCCC_CALL_VAL(ARM_SMCCC_STD_CALL, + get_wq_ctx.args[0] = ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_64, ARM_SMCCC_OWNER_SIP, SCM_SMC_FNID(QCOM_SCM_SVC_WAITQ, QCOM_SCM_WAITQ_GET_WQ_CTX)); From 1c753d001a259d0278fe318a1ed3c8aa5f3ea09e Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 31 Jul 2024 09:44:56 +0200 Subject: [PATCH 343/991] firmware: qcom: tzmem: fix virtual-to-physical address conversion We currently only correctly convert the virtual address passed by the caller to qcom_tzmem_to_phys() if it corresponds to the base address of the chunk. If the user wants to convert some pointer at an offset relative to that base address, we'll return 0. Let's change the implementation of qcom_tzmem_to_phys(): iterate over the chunks and try to call gen_pool_virt_to_phys() just-in-time instead of trying to call it only once when creating the chunk. Fixes: 84f5a7b67b61 ("firmware: qcom: add a dedicated TrustZone buffer allocator") Reported-by: Johan Hovold Closes: https://lore.kernel.org/lkml/20240729095542.21097-1-johan+linaro@kernel.org/ Acked-by: Andrew Halaney Signed-off-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20240731-tzmem-efivars-fix-v2-1-f0e84071ec07@linaro.org Signed-off-by: Bjorn Andersson --- drivers/firmware/qcom/qcom_tzmem.c | 32 ++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/drivers/firmware/qcom/qcom_tzmem.c b/drivers/firmware/qcom/qcom_tzmem.c index 17948cfc82e76b..caedeef0059c9c 100644 --- a/drivers/firmware/qcom/qcom_tzmem.c +++ b/drivers/firmware/qcom/qcom_tzmem.c @@ -40,7 +40,6 @@ struct qcom_tzmem_pool { }; struct qcom_tzmem_chunk { - phys_addr_t paddr; size_t size; struct qcom_tzmem_pool *owner; }; @@ -385,7 +384,6 @@ void *qcom_tzmem_alloc(struct qcom_tzmem_pool *pool, size_t size, gfp_t gfp) return NULL; } - chunk->paddr = gen_pool_virt_to_phys(pool->genpool, vaddr); chunk->size = size; chunk->owner = pool; @@ -431,25 +429,37 @@ void qcom_tzmem_free(void *vaddr) EXPORT_SYMBOL_GPL(qcom_tzmem_free); /** - * qcom_tzmem_to_phys() - Map the virtual address of a TZ buffer to physical. - * @vaddr: Virtual address of the buffer allocated from a TZ memory pool. + * qcom_tzmem_to_phys() - Map the virtual address of TZ memory to physical. + * @vaddr: Virtual address of memory allocated from a TZ memory pool. * - * Can be used in any context. The address must have been returned by a call - * to qcom_tzmem_alloc(). + * Can be used in any context. The address must point to memory allocated + * using qcom_tzmem_alloc(). * - * Returns: Physical address of the buffer. + * Returns: + * Physical address mapped from the virtual or 0 if the mapping failed. */ phys_addr_t qcom_tzmem_to_phys(void *vaddr) { struct qcom_tzmem_chunk *chunk; + struct radix_tree_iter iter; + void __rcu **slot; + phys_addr_t ret; guard(spinlock_irqsave)(&qcom_tzmem_chunks_lock); - chunk = radix_tree_lookup(&qcom_tzmem_chunks, (unsigned long)vaddr); - if (!chunk) - return 0; + radix_tree_for_each_slot(slot, &qcom_tzmem_chunks, &iter, 0) { + chunk = radix_tree_deref_slot_protected(slot, + &qcom_tzmem_chunks_lock); - return chunk->paddr; + ret = gen_pool_virt_to_phys(chunk->owner->genpool, + (unsigned long)vaddr); + if (ret == -1) + continue; + + return ret; + } + + return 0; } EXPORT_SYMBOL_GPL(qcom_tzmem_to_phys); From 924fc22c282edbf93869b150d9e1b47e0b10485e Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 31 Jul 2024 09:44:57 +0200 Subject: [PATCH 344/991] firmware: qcom: qseecom: remove unused functions qseecom_scm_dev(), qseecom_dma_alloc() and qseecom_dma_free() are no longer used following the conversion to using tzmem. Remove them. Fixes: 6612103ec35a ("firmware: qcom: qseecom: convert to using the TZ allocator") Reviewed-by: Andrew Halaney Signed-off-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20240731-tzmem-efivars-fix-v2-2-f0e84071ec07@linaro.org Signed-off-by: Bjorn Andersson --- include/linux/firmware/qcom/qcom_qseecom.h | 45 ---------------------- 1 file changed, 45 deletions(-) diff --git a/include/linux/firmware/qcom/qcom_qseecom.h b/include/linux/firmware/qcom/qcom_qseecom.h index 1dc5b3b50aa9f6..3387897bf36843 100644 --- a/include/linux/firmware/qcom/qcom_qseecom.h +++ b/include/linux/firmware/qcom/qcom_qseecom.h @@ -25,51 +25,6 @@ struct qseecom_client { u32 app_id; }; -/** - * qseecom_scm_dev() - Get the SCM device associated with the QSEECOM client. - * @client: The QSEECOM client device. - * - * Returns the SCM device under which the provided QSEECOM client device - * operates. This function is intended to be used for DMA allocations. - */ -static inline struct device *qseecom_scm_dev(struct qseecom_client *client) -{ - return client->aux_dev.dev.parent->parent; -} - -/** - * qseecom_dma_alloc() - Allocate DMA memory for a QSEECOM client. - * @client: The QSEECOM client to allocate the memory for. - * @size: The number of bytes to allocate. - * @dma_handle: Pointer to where the DMA address should be stored. - * @gfp: Allocation flags. - * - * Wrapper function for dma_alloc_coherent(), allocating DMA memory usable for - * TZ/QSEECOM communication. Refer to dma_alloc_coherent() for details. - */ -static inline void *qseecom_dma_alloc(struct qseecom_client *client, size_t size, - dma_addr_t *dma_handle, gfp_t gfp) -{ - return dma_alloc_coherent(qseecom_scm_dev(client), size, dma_handle, gfp); -} - -/** - * dma_free_coherent() - Free QSEECOM DMA memory. - * @client: The QSEECOM client for which the memory has been allocated. - * @size: The number of bytes allocated. - * @cpu_addr: Virtual memory address to free. - * @dma_handle: DMA memory address to free. - * - * Wrapper function for dma_free_coherent(), freeing memory previously - * allocated with qseecom_dma_alloc(). Refer to dma_free_coherent() for - * details. - */ -static inline void qseecom_dma_free(struct qseecom_client *client, size_t size, - void *cpu_addr, dma_addr_t dma_handle) -{ - return dma_free_coherent(qseecom_scm_dev(client), size, cpu_addr, dma_handle); -} - /** * qcom_qseecom_app_send() - Send to and receive data from a given QSEE app. * @client: The QSEECOM client associated with the target app. From 6c569b77f0300f8a9960277c7094fa0f128eb811 Mon Sep 17 00:00:00 2001 From: Abhinav Jain Date: Wed, 14 Aug 2024 13:37:43 +0530 Subject: [PATCH 345/991] selftest: af_unix: Fix kselftest compilation warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change expected_buf from (const void *) to (const char *) in function __recvpair(). This change fixes the below warnings during test compilation: ``` In file included from msg_oob.c:14: msg_oob.c: In function ‘__recvpair’: ../../kselftest_harness.h:106:40: warning: format ‘%s’ expects argument of type ‘char *’,but argument 6 has type ‘const void *’ [-Wformat=] ../../kselftest_harness.h:101:17: note: in expansion of macro ‘__TH_LOG’ msg_oob.c:235:17: note: in expansion of macro ‘TH_LOG’ ../../kselftest_harness.h:106:40: warning: format ‘%s’ expects argument of type ‘char *’,but argument 6 has type ‘const void *’ [-Wformat=] ../../kselftest_harness.h:101:17: note: in expansion of macro ‘__TH_LOG’ msg_oob.c:259:25: note: in expansion of macro ‘TH_LOG’ ``` Fixes: d098d77232c3 ("selftest: af_unix: Add msg_oob.c.") Signed-off-by: Abhinav Jain Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20240814080743.1156166-1-jain.abhinav177@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/af_unix/msg_oob.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/af_unix/msg_oob.c b/tools/testing/selftests/net/af_unix/msg_oob.c index 16d0c172eaebec..535eb2c3d7d1c8 100644 --- a/tools/testing/selftests/net/af_unix/msg_oob.c +++ b/tools/testing/selftests/net/af_unix/msg_oob.c @@ -209,7 +209,7 @@ static void __sendpair(struct __test_metadata *_metadata, static void __recvpair(struct __test_metadata *_metadata, FIXTURE_DATA(msg_oob) *self, - const void *expected_buf, int expected_len, + const char *expected_buf, int expected_len, int buf_len, int flags) { int i, ret[2], recv_errno[2], expected_errno = 0; From 0863bffda1131fd2fa9c05b653ad9ee3d8db127e Mon Sep 17 00:00:00 2001 From: Griffin Kroah-Hartman Date: Wed, 14 Aug 2024 13:17:47 +0200 Subject: [PATCH 346/991] Revert "serial: 8250_omap: Set the console genpd always on if no console suspend" This reverts commit 68e6939ea9ec3d6579eadeab16060339cdeaf940. Kevin reported that this causes a crash during suspend on platforms that dont use PM domains. Link: https://lore.kernel.org/r/7ha5hgpchq.fsf@baylibre.com Cc: Thomas Richard Fixes: 68e6939ea9ec ("serial: 8250_omap: Set the console genpd always on if no console suspend") Cc: stable Reported-by: Kevin Hilman Signed-off-by: Griffin Kroah-Hartman Link: https://lore.kernel.org/r/20240814111747.82371-1-griffin@kroah.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_omap.c | 33 +++++------------------------ 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c index 1af9aed99c6512..afef1dd4ddf49c 100644 --- a/drivers/tty/serial/8250/8250_omap.c +++ b/drivers/tty/serial/8250/8250_omap.c @@ -27,7 +27,6 @@ #include #include #include -#include #include "8250.h" @@ -119,12 +118,6 @@ #define UART_OMAP_TO_L 0x26 #define UART_OMAP_TO_H 0x27 -/* - * Copy of the genpd flags for the console. - * Only used if console suspend is disabled - */ -static unsigned int genpd_flags_console; - struct omap8250_priv { void __iomem *membase; int line; @@ -1655,7 +1648,6 @@ static int omap8250_suspend(struct device *dev) { struct omap8250_priv *priv = dev_get_drvdata(dev); struct uart_8250_port *up = serial8250_get_port(priv->line); - struct generic_pm_domain *genpd = pd_to_genpd(dev->pm_domain); int err = 0; serial8250_suspend_port(priv->line); @@ -1666,19 +1658,8 @@ static int omap8250_suspend(struct device *dev) if (!device_may_wakeup(dev)) priv->wer = 0; serial_out(up, UART_OMAP_WER, priv->wer); - if (uart_console(&up->port)) { - if (console_suspend_enabled) - err = pm_runtime_force_suspend(dev); - else { - /* - * The pd shall not be powered-off (no console suspend). - * Make copy of genpd flags before to set it always on. - * The original value is restored during the resume. - */ - genpd_flags_console = genpd->flags; - genpd->flags |= GENPD_FLAG_ALWAYS_ON; - } - } + if (uart_console(&up->port) && console_suspend_enabled) + err = pm_runtime_force_suspend(dev); flush_work(&priv->qos_work); return err; @@ -1688,16 +1669,12 @@ static int omap8250_resume(struct device *dev) { struct omap8250_priv *priv = dev_get_drvdata(dev); struct uart_8250_port *up = serial8250_get_port(priv->line); - struct generic_pm_domain *genpd = pd_to_genpd(dev->pm_domain); int err; if (uart_console(&up->port) && console_suspend_enabled) { - if (console_suspend_enabled) { - err = pm_runtime_force_resume(dev); - if (err) - return err; - } else - genpd->flags = genpd_flags_console; + err = pm_runtime_force_resume(dev); + if (err) + return err; } serial8250_resume_port(priv->line); From f75c235565f90c4a17b125e47f1c68ef6b8c2bce Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Wed, 14 Aug 2024 02:09:53 -0700 Subject: [PATCH 347/991] arm64: Fix KASAN random tag seed initialization Currently, kasan_init_sw_tags() is called before setup_per_cpu_areas(), so per_cpu(prng_state, cpu) accesses the same address regardless of the value of "cpu", and the same seed value gets copied to the percpu area for every CPU. Fix this by moving the call to smp_prepare_boot_cpu(), which is the first architecture hook after setup_per_cpu_areas(). Fixes: 3c9e3aa11094 ("kasan: add tag related helper functions") Fixes: 3f41b6093823 ("kasan: fix random seed generation for tag-based mode") Signed-off-by: Samuel Holland Reviewed-by: Andrey Konovalov Link: https://lore.kernel.org/r/20240814091005.969756-1-samuel.holland@sifive.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/setup.c | 3 --- arch/arm64/kernel/smp.c | 2 ++ 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index a096e2451044d3..b22d28ec80284b 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -355,9 +355,6 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p) smp_init_cpus(); smp_build_mpidr_hash(); - /* Init percpu seeds for random tags after cpus are set up. */ - kasan_init_sw_tags(); - #ifdef CONFIG_ARM64_SW_TTBR0_PAN /* * Make sure init_thread_info.ttbr0 always generates translation diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 5e18fbcee9a20c..f01f0fd7b7feb4 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -467,6 +467,8 @@ void __init smp_prepare_boot_cpu(void) init_gic_priority_masking(); kasan_init_hw_tags(); + /* Init percpu seeds for random tags after cpus are set up. */ + kasan_init_sw_tags(); } /* From 69139d2919dd4aa9a553c8245e7c63e82613e3fc Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sun, 11 Aug 2024 19:21:53 -0700 Subject: [PATCH 348/991] vsock: fix recursive ->recvmsg calls After a vsock socket has been added to a BPF sockmap, its prot->recvmsg has been replaced with vsock_bpf_recvmsg(). Thus the following recursiion could happen: vsock_bpf_recvmsg() -> __vsock_recvmsg() -> vsock_connectible_recvmsg() -> prot->recvmsg() -> vsock_bpf_recvmsg() again We need to fix it by calling the original ->recvmsg() without any BPF sockmap logic in __vsock_recvmsg(). Fixes: 634f1a7110b4 ("vsock: support sockmap") Reported-by: syzbot+bdb4bd87b5e22058e2a4@syzkaller.appspotmail.com Tested-by: syzbot+bdb4bd87b5e22058e2a4@syzkaller.appspotmail.com Cc: Bobby Eshleman Cc: Michael S. Tsirkin Cc: Stefano Garzarella Signed-off-by: Cong Wang Acked-by: Michael S. Tsirkin Link: https://patch.msgid.link/20240812022153.86512-1-xiyou.wangcong@gmail.com Signed-off-by: Paolo Abeni --- include/net/af_vsock.h | 4 ++++ net/vmw_vsock/af_vsock.c | 50 +++++++++++++++++++++++---------------- net/vmw_vsock/vsock_bpf.c | 4 ++-- 3 files changed, 35 insertions(+), 23 deletions(-) diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 535701efc1e5ce..24d970f7a4fa22 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -230,8 +230,12 @@ struct vsock_tap { int vsock_add_tap(struct vsock_tap *vt); int vsock_remove_tap(struct vsock_tap *vt); void vsock_deliver_tap(struct sk_buff *build_skb(void *opaque), void *opaque); +int __vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags); int vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); +int __vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, + size_t len, int flags); int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 4b040285aa78c0..0ff9b2dd86bac2 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1270,25 +1270,28 @@ static int vsock_dgram_connect(struct socket *sock, return err; } +int __vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, + size_t len, int flags) +{ + struct sock *sk = sock->sk; + struct vsock_sock *vsk = vsock_sk(sk); + + return vsk->transport->dgram_dequeue(vsk, msg, len, flags); +} + int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { #ifdef CONFIG_BPF_SYSCALL + struct sock *sk = sock->sk; const struct proto *prot; -#endif - struct vsock_sock *vsk; - struct sock *sk; - sk = sock->sk; - vsk = vsock_sk(sk); - -#ifdef CONFIG_BPF_SYSCALL prot = READ_ONCE(sk->sk_prot); if (prot != &vsock_proto) return prot->recvmsg(sk, msg, len, flags, NULL); #endif - return vsk->transport->dgram_dequeue(vsk, msg, len, flags); + return __vsock_dgram_recvmsg(sock, msg, len, flags); } EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg); @@ -2174,15 +2177,12 @@ static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg, } int -vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, - int flags) +__vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) { struct sock *sk; struct vsock_sock *vsk; const struct vsock_transport *transport; -#ifdef CONFIG_BPF_SYSCALL - const struct proto *prot; -#endif int err; sk = sock->sk; @@ -2233,14 +2233,6 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, goto out; } -#ifdef CONFIG_BPF_SYSCALL - prot = READ_ONCE(sk->sk_prot); - if (prot != &vsock_proto) { - release_sock(sk); - return prot->recvmsg(sk, msg, len, flags, NULL); - } -#endif - if (sk->sk_type == SOCK_STREAM) err = __vsock_stream_recvmsg(sk, msg, len, flags); else @@ -2250,6 +2242,22 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, release_sock(sk); return err; } + +int +vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) +{ +#ifdef CONFIG_BPF_SYSCALL + struct sock *sk = sock->sk; + const struct proto *prot; + + prot = READ_ONCE(sk->sk_prot); + if (prot != &vsock_proto) + return prot->recvmsg(sk, msg, len, flags, NULL); +#endif + + return __vsock_connectible_recvmsg(sock, msg, len, flags); +} EXPORT_SYMBOL_GPL(vsock_connectible_recvmsg); static int vsock_set_rcvlowat(struct sock *sk, int val) diff --git a/net/vmw_vsock/vsock_bpf.c b/net/vmw_vsock/vsock_bpf.c index a3c97546ab84a6..c42c5cc18f3241 100644 --- a/net/vmw_vsock/vsock_bpf.c +++ b/net/vmw_vsock/vsock_bpf.c @@ -64,9 +64,9 @@ static int __vsock_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int int err; if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) - err = vsock_connectible_recvmsg(sock, msg, len, flags); + err = __vsock_connectible_recvmsg(sock, msg, len, flags); else if (sk->sk_type == SOCK_DGRAM) - err = vsock_dgram_recvmsg(sock, msg, len, flags); + err = __vsock_dgram_recvmsg(sock, msg, len, flags); else err = -EPROTOTYPE; From fde25c20f51807db340b875953cfd1cedaa392fc Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Mon, 12 Aug 2024 17:08:24 +0300 Subject: [PATCH 349/991] net: ethtool: Allow write mechanism of LPL and both LPL and EPL CMIS 5.2 standard section 9.4.2 defines four types of firmware update supported mechanism: None, only LPL, only EPL, both LPL and EPL. Currently, only LPL (Local Payload) type of write firmware block is supported. However, if the module supports both LPL and EPL the flashing process wrongly fails for no supporting LPL. Fix that, by allowing the write mechanism to be LPL or both LPL and EPL. Fixes: c4f78134d45c ("ethtool: cmis_fw_update: add a layer for supporting firmware update using CDB") Reported-by: Vladyslav Mykhaliuk Signed-off-by: Danielle Ratson Reviewed-by: Petr Machata Link: https://patch.msgid.link/20240812140824.3718826-1-danieller@nvidia.com Signed-off-by: Paolo Abeni --- net/ethtool/cmis_fw_update.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/ethtool/cmis_fw_update.c b/net/ethtool/cmis_fw_update.c index ae4b4b28a6014f..655ff5224ffa30 100644 --- a/net/ethtool/cmis_fw_update.c +++ b/net/ethtool/cmis_fw_update.c @@ -35,7 +35,10 @@ struct cmis_cdb_fw_mng_features_rpl { __be16 resv7; }; -#define CMIS_CDB_FW_WRITE_MECHANISM_LPL 0x01 +enum cmis_cdb_fw_write_mechanism { + CMIS_CDB_FW_WRITE_MECHANISM_LPL = 0x01, + CMIS_CDB_FW_WRITE_MECHANISM_BOTH = 0x11, +}; static int cmis_fw_update_fw_mng_features_get(struct ethtool_cmis_cdb *cdb, @@ -64,7 +67,8 @@ cmis_fw_update_fw_mng_features_get(struct ethtool_cmis_cdb *cdb, } rpl = (struct cmis_cdb_fw_mng_features_rpl *)args.req.payload; - if (!(rpl->write_mechanism == CMIS_CDB_FW_WRITE_MECHANISM_LPL)) { + if (!(rpl->write_mechanism == CMIS_CDB_FW_WRITE_MECHANISM_LPL || + rpl->write_mechanism == CMIS_CDB_FW_WRITE_MECHANISM_BOTH)) { ethnl_module_fw_flash_ntf_err(dev, ntf_params, "Write LPL is not supported", NULL); From 1f1b194284093d619c9fbc7e9e38b2c68d0408e8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 12 Aug 2024 15:13:22 +0100 Subject: [PATCH 350/991] net: thunder_bgx: Fix netdev structure allocation Commit 94833addfaba ("net: thunderx: Unembed netdev structure") had a go at dynamically allocating the netdev structures for the thunderx_bgx driver. This change results in my ThunderX box catching fire (to be fair, it is what it does best). The issues with this change are that: - bgx_lmac_enable() is called *after* bgx_acpi_register_phy() and bgx_init_of_phy(), both expecting netdev to be a valid pointer. - bgx_init_of_phy() populates the MAC addresses for *all* LMACs attached to a given BGX instance, and thus needs netdev for each of them to have been allocated. There is a few things to be said about how the driver mixes LMAC and BGX states which leads to this sorry state, but that's beside the point. To address this, go back to a situation where all netdev structures are allocated before the driver starts relying on them, and move the freeing of these structures to driver removal. Someone brave enough can always go and restructure the driver if they want. Fixes: 94833addfaba ("net: thunderx: Unembed netdev structure") Signed-off-by: Marc Zyngier Cc: Breno Leitao Cc: Sunil Goutham Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Reviewed-by: Simon Horman Reviewed-by: Breno Leitao Link: https://patch.msgid.link/20240812141322.1742918-1-maz@kernel.org Signed-off-by: Paolo Abeni --- .../net/ethernet/cavium/thunder/thunder_bgx.c | 30 +++++++++++++------ 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c index a40c266c37f207..608cc6af5af1c7 100644 --- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c +++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c @@ -1054,18 +1054,12 @@ static int phy_interface_mode(u8 lmac_type) static int bgx_lmac_enable(struct bgx *bgx, u8 lmacid) { - struct lmac *lmac, **priv; + struct lmac *lmac; u64 cfg; lmac = &bgx->lmac[lmacid]; lmac->bgx = bgx; - lmac->netdev = alloc_netdev_dummy(sizeof(struct lmac *)); - if (!lmac->netdev) - return -ENOMEM; - priv = netdev_priv(lmac->netdev); - *priv = lmac; - if ((lmac->lmac_type == BGX_MODE_SGMII) || (lmac->lmac_type == BGX_MODE_QSGMII) || (lmac->lmac_type == BGX_MODE_RGMII)) { @@ -1191,7 +1185,6 @@ static void bgx_lmac_disable(struct bgx *bgx, u8 lmacid) (lmac->lmac_type != BGX_MODE_10G_KR) && lmac->phydev) phy_disconnect(lmac->phydev); - free_netdev(lmac->netdev); lmac->phydev = NULL; } @@ -1653,6 +1646,23 @@ static int bgx_probe(struct pci_dev *pdev, const struct pci_device_id *ent) bgx_get_qlm_mode(bgx); + for (lmac = 0; lmac < bgx->lmac_count; lmac++) { + struct lmac *lmacp, **priv; + + lmacp = &bgx->lmac[lmac]; + lmacp->netdev = alloc_netdev_dummy(sizeof(struct lmac *)); + + if (!lmacp->netdev) { + for (int i = 0; i < lmac; i++) + free_netdev(bgx->lmac[i].netdev); + err = -ENOMEM; + goto err_enable; + } + + priv = netdev_priv(lmacp->netdev); + *priv = lmacp; + } + err = bgx_init_phy(bgx); if (err) goto err_enable; @@ -1692,8 +1702,10 @@ static void bgx_remove(struct pci_dev *pdev) u8 lmac; /* Disable all LMACs */ - for (lmac = 0; lmac < bgx->lmac_count; lmac++) + for (lmac = 0; lmac < bgx->lmac_count; lmac++) { bgx_lmac_disable(bgx, lmac); + free_netdev(bgx->lmac[lmac].netdev); + } pci_free_irq(pdev, GMPX_GMI_TX_INT, bgx); From fd45cc614b8acca5bb435ba37fe9b3f9a17fab84 Mon Sep 17 00:00:00 2001 From: Alex Bee Date: Mon, 5 Aug 2024 13:08:56 +0200 Subject: [PATCH 351/991] drm/rockchip: inno-hdmi: Fix infoframe upload HDMI analyser shows that the AVI infoframe is no being longer send. The switch to the HDMI connector api should have used the frame content which is now given in the buffer parameter, but instead still uses the (now) empty and superfluous packed_frame variable. Fix it. Fixes: 65548c8ff0ab ("drm/rockchip: inno_hdmi: Switch to HDMI connector") Signed-off-by: Alex Bee Acked-by: Maxime Ripard Signed-off-by: Heiko Stuebner Link: https://patchwork.freedesktop.org/patch/msgid/20240805110855.274140-2-knaerzche@gmail.com --- drivers/gpu/drm/rockchip/inno_hdmi.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/gpu/drm/rockchip/inno_hdmi.c b/drivers/gpu/drm/rockchip/inno_hdmi.c index 2241e53a294690..dec6913cec5b59 100644 --- a/drivers/gpu/drm/rockchip/inno_hdmi.c +++ b/drivers/gpu/drm/rockchip/inno_hdmi.c @@ -279,7 +279,6 @@ static int inno_hdmi_upload_frame(struct drm_connector *connector, const u8 *buffer, size_t len) { struct inno_hdmi *hdmi = connector_to_inno_hdmi(connector); - u8 packed_frame[HDMI_MAXIMUM_INFO_FRAME_SIZE]; ssize_t i; if (type != HDMI_INFOFRAME_TYPE_AVI) { @@ -291,8 +290,7 @@ static int inno_hdmi_upload_frame(struct drm_connector *connector, inno_hdmi_disable_frame(connector, type); for (i = 0; i < len; i++) - hdmi_writeb(hdmi, HDMI_CONTROL_PACKET_ADDR + i, - packed_frame[i]); + hdmi_writeb(hdmi, HDMI_CONTROL_PACKET_ADDR + i, buffer[i]); return 0; } From 52dd070c62e4ae2b5e7411b920e3f7a64235ecfb Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Wed, 14 Aug 2024 20:47:40 +0800 Subject: [PATCH 352/991] pmdomain: imx: wait SSAR when i.MX93 power domain on With "quiet" set in bootargs, there is power domain failure: "imx93_power_domain 44462400.power-domain: pd_off timeout: name: 44462400.power-domain, stat: 4" The current power on opertation takes ISO state as power on finished flag, but it is wrong. Before powering on operation really finishes, powering off comes and powering off will never finish because the last powering on still not finishes, so the following powering off actually not trigger hardware state machine to run. SSAR is the last step when powering on a domain, so need to wait SSAR done when powering on. Since EdgeLock Enclave(ELE) handshake is involved in the flow, enlarge the waiting time to 10ms for both on and off to avoid timeout. Cc: stable@vger.kernel.org Fixes: 0a0f7cc25d4a ("soc: imx: add i.MX93 SRC power domain driver") Reviewed-by: Jacky Bai Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/20240814124740.2778952-1-peng.fan@oss.nxp.com Signed-off-by: Ulf Hansson --- drivers/pmdomain/imx/imx93-pd.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/pmdomain/imx/imx93-pd.c b/drivers/pmdomain/imx/imx93-pd.c index 1e94b499c19bcc..d750a7dc58d212 100644 --- a/drivers/pmdomain/imx/imx93-pd.c +++ b/drivers/pmdomain/imx/imx93-pd.c @@ -20,6 +20,7 @@ #define FUNC_STAT_PSW_STAT_MASK BIT(0) #define FUNC_STAT_RST_STAT_MASK BIT(2) #define FUNC_STAT_ISO_STAT_MASK BIT(4) +#define FUNC_STAT_SSAR_STAT_MASK BIT(8) struct imx93_power_domain { struct generic_pm_domain genpd; @@ -50,7 +51,7 @@ static int imx93_pd_on(struct generic_pm_domain *genpd) writel(val, addr + MIX_SLICE_SW_CTRL_OFF); ret = readl_poll_timeout(addr + MIX_FUNC_STAT_OFF, val, - !(val & FUNC_STAT_ISO_STAT_MASK), 1, 10000); + !(val & FUNC_STAT_SSAR_STAT_MASK), 1, 10000); if (ret) { dev_err(domain->dev, "pd_on timeout: name: %s, stat: %x\n", genpd->name, val); return ret; @@ -72,7 +73,7 @@ static int imx93_pd_off(struct generic_pm_domain *genpd) writel(val, addr + MIX_SLICE_SW_CTRL_OFF); ret = readl_poll_timeout(addr + MIX_FUNC_STAT_OFF, val, - val & FUNC_STAT_PSW_STAT_MASK, 1, 1000); + val & FUNC_STAT_PSW_STAT_MASK, 1, 10000); if (ret) { dev_err(domain->dev, "pd_off timeout: name: %s, stat: %x\n", genpd->name, val); return ret; From cdc90f75387c42d64a0ed1ba03550ea9447249d4 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Tue, 13 Aug 2024 09:37:19 +0200 Subject: [PATCH 353/991] pse-core: Conditionally set current limit during PI regulator registration Fix an issue where `devm_regulator_register()` would fail for PSE controllers that do not support current limit control, such as simple GPIO-based controllers like the podl-pse-regulator. The `REGULATOR_CHANGE_CURRENT` flag and `max_uA` constraint are now conditionally set only if the `pi_set_current_limit` operation is supported. This change prevents the regulator registration routine from attempting to call `pse_pi_set_current_limit()`, which would return `-EOPNOTSUPP` and cause the registration to fail. Fixes: 4a83abcef5f4f ("net: pse-pd: Add new power limit get and set c33 features") Signed-off-by: Oleksij Rempel Reviewed-by: Kory Maincent Tested-by: Kyle Swenson Link: https://patch.msgid.link/20240813073719.2304633-1-o.rempel@pengutronix.de Signed-off-by: Paolo Abeni --- drivers/net/pse-pd/pse_core.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/pse-pd/pse_core.c b/drivers/net/pse-pd/pse_core.c index ec20953e0f825c..4f032b16a8a0a6 100644 --- a/drivers/net/pse-pd/pse_core.c +++ b/drivers/net/pse-pd/pse_core.c @@ -401,9 +401,14 @@ devm_pse_pi_regulator_register(struct pse_controller_dev *pcdev, rdesc->ops = &pse_pi_ops; rdesc->owner = pcdev->owner; - rinit_data->constraints.valid_ops_mask = REGULATOR_CHANGE_STATUS | - REGULATOR_CHANGE_CURRENT; - rinit_data->constraints.max_uA = MAX_PI_CURRENT; + rinit_data->constraints.valid_ops_mask = REGULATOR_CHANGE_STATUS; + + if (pcdev->ops->pi_set_current_limit) { + rinit_data->constraints.valid_ops_mask |= + REGULATOR_CHANGE_CURRENT; + rinit_data->constraints.max_uA = MAX_PI_CURRENT; + } + rinit_data->supply_regulator = "vpwr"; rconfig.dev = pcdev->dev; From 7965a7f32a53d9ad807ce2c53bdda69ba104974f Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 13 Aug 2024 15:39:34 +0200 Subject: [PATCH 354/991] selftests: net: lib: kill PIDs before del netns When deleting netns, it is possible to still have some tasks running, e.g. background tasks like tcpdump running in the background, not stopped because the test has been interrupted. Before deleting the netns, it is then safer to kill all attached PIDs, if any. That should reduce some noises after the end of some tests, and help with the debugging of some issues. That's why this modification is seen as a "fix". Fixes: 25ae948b4478 ("selftests/net: add lib.sh") Acked-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Acked-by: Florian Westphal Reviewed-by: Hangbin Liu Link: https://patch.msgid.link/20240813-upstream-net-20240813-selftests-net-lib-kill-v1-1-27b689b248b8@kernel.org Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/lib.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index d0219032f77300..8ee4489238ca4b 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -146,6 +146,7 @@ cleanup_ns() for ns in "$@"; do [ -z "${ns}" ] && continue + ip netns pids "${ns}" 2> /dev/null | xargs -r kill || true ip netns delete "${ns}" &> /dev/null || true if ! busywait $BUSYWAIT_TIMEOUT ip netns list \| grep -vq "^$ns$" &> /dev/null; then echo "Warn: Failed to remove namespace $ns" From 8445d9d3c03101859663d34fda747f6a50947556 Mon Sep 17 00:00:00 2001 From: Jie Wang Date: Tue, 13 Aug 2024 22:10:20 +0800 Subject: [PATCH 355/991] net: hns3: fix wrong use of semaphore up Currently, if hns3 PF or VF FLR reset failed after five times retry, the reset done process will directly release the semaphore which has already released in hclge_reset_prepare_general. This will cause down operation fail. So this patch fixes it by adding reset state judgement. The up operation is only called after successful PF FLR reset. Fixes: 8627bdedc435 ("net: hns3: refactor the precedure of PF FLR") Fixes: f28368bb4542 ("net: hns3: refactor the procedure of VF FLR") Signed-off-by: Jie Wang Signed-off-by: Jijie Shao Signed-off-by: Paolo Abeni --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 4 ++-- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 82574ce0194fba..125e04434611d3 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -11516,8 +11516,8 @@ static void hclge_reset_done(struct hnae3_ae_dev *ae_dev) dev_err(&hdev->pdev->dev, "fail to rebuild, ret=%d\n", ret); hdev->reset_type = HNAE3_NONE_RESET; - clear_bit(HCLGE_STATE_RST_HANDLING, &hdev->state); - up(&hdev->reset_sem); + if (test_and_clear_bit(HCLGE_STATE_RST_HANDLING, &hdev->state)) + up(&hdev->reset_sem); } static void hclge_clear_resetting_state(struct hclge_dev *hdev) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index 3735d2fed11f75..094a7c7b55921f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -1747,8 +1747,8 @@ static void hclgevf_reset_done(struct hnae3_ae_dev *ae_dev) ret); hdev->reset_type = HNAE3_NONE_RESET; - clear_bit(HCLGEVF_STATE_RST_HANDLING, &hdev->state); - up(&hdev->reset_sem); + if (test_and_clear_bit(HCLGEVF_STATE_RST_HANDLING, &hdev->state)) + up(&hdev->reset_sem); } static u32 hclgevf_get_fw_version(struct hnae3_handle *handle) From 30545e17eac1f50c5ef49644daf6af205100a965 Mon Sep 17 00:00:00 2001 From: Peiyang Wang Date: Tue, 13 Aug 2024 22:10:21 +0800 Subject: [PATCH 356/991] net: hns3: use the user's cfg after reset Consider the followed case that the user change speed and reset the net interface. Before the hw change speed successfully, the driver get old old speed from hw by timer task. After reset, the previous speed is config to hw. As a result, the new speed is configed successfully but lost after PF reset. The followed pictured shows more dirrectly. +------+ +----+ +----+ | USER | | PF | | HW | +---+--+ +-+--+ +-+--+ | ethtool -s 100G | | +------------------>| set speed 100G | | +--------------------->| | | set successfully | | |<---------------------+---+ | |query cfg (timer task)| | | +--------------------->| | handle speed | | return 200G | | changing event | ethtool --reset |<---------------------+ | (100G) +------------------>| cfg previous speed |<--+ | | after reset (200G) | | +--------------------->| | | +---+ | |query cfg (timer task)| | | +--------------------->| | handle speed | | return 100G | | changing event | |<---------------------+ | (200G) | | |<--+ | |query cfg (timer task)| | +--------------------->| | | return 200G | | |<---------------------+ | | | v v v This patch save new speed if hw change speed successfully, which will be used after reset successfully. Fixes: 2d03eacc0b7e ("net: hns3: Only update mac configuation when necessary") Signed-off-by: Peiyang Wang Signed-off-by: Jijie Shao Signed-off-by: Paolo Abeni --- .../hisilicon/hns3/hns3pf/hclge_main.c | 24 ++++++++++++++----- .../hisilicon/hns3/hns3pf/hclge_mdio.c | 3 +++ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 125e04434611d3..465f0d58228374 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -2653,8 +2653,17 @@ static int hclge_cfg_mac_speed_dup_h(struct hnae3_handle *handle, int speed, { struct hclge_vport *vport = hclge_get_vport(handle); struct hclge_dev *hdev = vport->back; + int ret; + + ret = hclge_cfg_mac_speed_dup(hdev, speed, duplex, lane_num); - return hclge_cfg_mac_speed_dup(hdev, speed, duplex, lane_num); + if (ret) + return ret; + + hdev->hw.mac.req_speed = speed; + hdev->hw.mac.req_duplex = duplex; + + return 0; } static int hclge_set_autoneg_en(struct hclge_dev *hdev, bool enable) @@ -2956,17 +2965,20 @@ static int hclge_mac_init(struct hclge_dev *hdev) if (!test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state)) hdev->hw.mac.duplex = HCLGE_MAC_FULL; - ret = hclge_cfg_mac_speed_dup_hw(hdev, hdev->hw.mac.speed, - hdev->hw.mac.duplex, hdev->hw.mac.lane_num); - if (ret) - return ret; - if (hdev->hw.mac.support_autoneg) { ret = hclge_set_autoneg_en(hdev, hdev->hw.mac.autoneg); if (ret) return ret; } + if (!hdev->hw.mac.autoneg) { + ret = hclge_cfg_mac_speed_dup_hw(hdev, hdev->hw.mac.req_speed, + hdev->hw.mac.req_duplex, + hdev->hw.mac.lane_num); + if (ret) + return ret; + } + mac->link = 0; if (mac->user_fec_mode & BIT(HNAE3_FEC_USER_DEF)) { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c index 85fb11de43a124..80079657afebe0 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c @@ -191,6 +191,9 @@ static void hclge_mac_adjust_link(struct net_device *netdev) if (ret) netdev_err(netdev, "failed to adjust link.\n"); + hdev->hw.mac.req_speed = (u32)speed; + hdev->hw.mac.req_duplex = (u8)duplex; + ret = hclge_cfg_flowctrl(hdev); if (ret) netdev_err(netdev, "failed to configure flow control.\n"); From be5e816d00a506719e9dbb1a9c861c5ced30a109 Mon Sep 17 00:00:00 2001 From: Jie Wang Date: Tue, 13 Aug 2024 22:10:22 +0800 Subject: [PATCH 357/991] net: hns3: fix a deadlock problem when config TC during resetting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When config TC during the reset process, may cause a deadlock, the flow is as below: pf reset start │ ▼ ...... setup tc │ │ ▼ ▼ DOWN: napi_disable() napi_disable()(skip) │ │ │ ▼ ▼ ...... ...... │ │ ▼ │ napi_enable() │ ▼ UINIT: netif_napi_del() │ ▼ ...... │ ▼ INIT: netif_napi_add() │ ▼ ...... global reset start │ │ ▼ ▼ UP: napi_enable()(skip) ...... │ │ ▼ ▼ ...... napi_disable() In reset process, the driver will DOWN the port and then UINIT, in this case, the setup tc process will UP the port before UINIT, so cause the problem. Adds a DOWN process in UINIT to fix it. Fixes: bb6b94a896d4 ("net: hns3: Add reset interface implementation in client") Signed-off-by: Jie Wang Signed-off-by: Jijie Shao Signed-off-by: Paolo Abeni --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index a5fc0209d628eb..4cbc4d069a1f36 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -5724,6 +5724,9 @@ static int hns3_reset_notify_uninit_enet(struct hnae3_handle *handle) struct net_device *netdev = handle->kinfo.netdev; struct hns3_nic_priv *priv = netdev_priv(netdev); + if (!test_bit(HNS3_NIC_STATE_DOWN, &priv->state)) + hns3_nic_net_stop(netdev); + if (!test_and_clear_bit(HNS3_NIC_STATE_INITED, &priv->state)) { netdev_warn(netdev, "already uninitialized\n"); return 0; From 86db7bfb06704ef17340eeae71c832f21cfce35c Mon Sep 17 00:00:00 2001 From: Peiyang Wang Date: Tue, 13 Aug 2024 22:10:23 +0800 Subject: [PATCH 358/991] net: hns3: void array out of bound when loop tnl_num When query reg inf of SSU, it loops tnl_num times. However, tnl_num comes from hardware and the length of array is a fixed value. To void array out of bound, make sure the loop time is not greater than the length of array Signed-off-by: Peiyang Wang Signed-off-by: Jijie Shao Signed-off-by: Paolo Abeni --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c index e132c2f0956098..cc7f46c0b35ff6 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c @@ -1598,8 +1598,7 @@ static void hclge_query_reg_info_of_ssu(struct hclge_dev *hdev) { u32 loop_para[HCLGE_MOD_MSG_PARA_ARRAY_MAX_SIZE] = {0}; struct hclge_mod_reg_common_msg msg; - u8 i, j, num; - u32 loop_time; + u8 i, j, num, loop_time; num = ARRAY_SIZE(hclge_ssu_reg_common_msg); for (i = 0; i < num; i++) { @@ -1609,7 +1608,8 @@ static void hclge_query_reg_info_of_ssu(struct hclge_dev *hdev) loop_time = 1; loop_para[0] = 0; if (msg.need_para) { - loop_time = hdev->ae_dev->dev_specs.tnl_num; + loop_time = min(hdev->ae_dev->dev_specs.tnl_num, + HCLGE_MOD_MSG_PARA_ARRAY_MAX_SIZE); for (j = 0; j < loop_time; j++) loop_para[j] = j + 1; } From 7660833d217528c8f2385528951ab820a031e4e3 Mon Sep 17 00:00:00 2001 From: Peiyang Wang Date: Tue, 13 Aug 2024 22:10:24 +0800 Subject: [PATCH 359/991] net: hns3: use correct release function during uninitialization pci_request_regions is called to apply for PCI I/O and memory resources when the driver is initialized, Therefore, when the driver is uninstalled, pci_release_regions should be used to release PCI I/O and memory resources instead of pci_release_mem_regions is used to release memory reasouces only. Signed-off-by: Peiyang Wang Signed-off-by: Jijie Shao Signed-off-by: Paolo Abeni --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 465f0d58228374..6c33195a1168f8 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -11456,7 +11456,7 @@ static void hclge_pci_uninit(struct hclge_dev *hdev) pcim_iounmap(pdev, hdev->hw.hw.io_base); pci_free_irq_vectors(pdev); - pci_release_mem_regions(pdev); + pci_release_regions(pdev); pci_disable_device(pdev); } From d33d26036a0274b472299d7dcdaa5fb34329f91b Mon Sep 17 00:00:00 2001 From: Roland Xu Date: Thu, 15 Aug 2024 10:58:13 +0800 Subject: [PATCH 360/991] rtmutex: Drop rt_mutex::wait_lock before scheduling rt_mutex_handle_deadlock() is called with rt_mutex::wait_lock held. In the good case it returns with the lock held and in the deadlock case it emits a warning and goes into an endless scheduling loop with the lock held, which triggers the 'scheduling in atomic' warning. Unlock rt_mutex::wait_lock in the dead lock case before issuing the warning and dropping into the schedule for ever loop. [ tglx: Moved unlock before the WARN(), removed the pointless comment, massaged changelog, added Fixes tag ] Fixes: 3d5c9340d194 ("rtmutex: Handle deadlock detection smarter") Signed-off-by: Roland Xu Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/ME0P300MB063599BEF0743B8FA339C2CECC802@ME0P300MB0635.AUSP300.PROD.OUTLOOK.COM --- kernel/locking/rtmutex.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 88d08eeb8bc03d..fba1229f1de669 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1644,6 +1644,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, } static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock, + struct rt_mutex_base *lock, struct rt_mutex_waiter *w) { /* @@ -1656,10 +1657,10 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock, if (build_ww_mutex() && w->ww_ctx) return; - /* - * Yell loudly and stop the task right here. - */ + raw_spin_unlock_irq(&lock->wait_lock); + WARN(1, "rtmutex deadlock detected\n"); + while (1) { set_current_state(TASK_INTERRUPTIBLE); rt_mutex_schedule(); @@ -1713,7 +1714,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, } else { __set_current_state(TASK_RUNNING); remove_waiter(lock, waiter); - rt_mutex_handle_deadlock(ret, chwalk, waiter); + rt_mutex_handle_deadlock(ret, chwalk, lock, waiter); } /* From ddeb7989a98faf8da67ac613731a0eee32667b7d Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Wed, 17 Jul 2024 07:04:28 -0700 Subject: [PATCH 361/991] drm/xe: Validate user fence during creation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fail invalid addresses during user fence creation. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Signed-off-by: Matthew Brost Reviewed-by: Thomas Hellström Reviewed-by: Nirmoy Das Link: https://patchwork.freedesktop.org/patch/msgid/20240717140429.1396820-1-matthew.brost@intel.com (cherry picked from commit 0fde907da2d5fd4da68845e96c6842497159c858) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_sync.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_sync.c b/drivers/gpu/drm/xe/xe_sync.c index c4e018aa2982fb..e8d31e01086019 100644 --- a/drivers/gpu/drm/xe/xe_sync.c +++ b/drivers/gpu/drm/xe/xe_sync.c @@ -53,14 +53,18 @@ static struct xe_user_fence *user_fence_create(struct xe_device *xe, u64 addr, u64 value) { struct xe_user_fence *ufence; + u64 __user *ptr = u64_to_user_ptr(addr); + + if (!access_ok(ptr, sizeof(ptr))) + return ERR_PTR(-EFAULT); ufence = kmalloc(sizeof(*ufence), GFP_KERNEL); if (!ufence) - return NULL; + return ERR_PTR(-ENOMEM); ufence->xe = xe; kref_init(&ufence->refcount); - ufence->addr = u64_to_user_ptr(addr); + ufence->addr = ptr; ufence->value = value; ufence->mm = current->mm; mmgrab(ufence->mm); @@ -183,8 +187,8 @@ int xe_sync_entry_parse(struct xe_device *xe, struct xe_file *xef, } else { sync->ufence = user_fence_create(xe, sync_in.addr, sync_in.timeline_value); - if (XE_IOCTL_DBG(xe, !sync->ufence)) - return -ENOMEM; + if (XE_IOCTL_DBG(xe, IS_ERR(sync->ufence))) + return PTR_ERR(sync->ufence); } break; From e98a032c0340d45c199f4eb536359f5762a8748f Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Thu, 18 Jul 2024 14:05:45 -0700 Subject: [PATCH 362/991] drm/xe: Move part of xe_file cleanup to a helper In order to make xe_file ref counted, move destruction of xe_file members to a helper. v2: Move xe_vm_close_and_put back into xe_file_close (Matt) Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Matthew Brost Reviewed-by: Lucas De Marchi Link: https://patchwork.freedesktop.org/patch/msgid/20240718210548.3580382-2-umesh.nerlige.ramappa@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 3d0c4a62cc553c6ffde4cb11620eba991e770665) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_device.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index 76109415eba61e..452dbc1b495baa 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -90,9 +90,25 @@ static int xe_file_open(struct drm_device *dev, struct drm_file *file) return 0; } +static void xe_file_destroy(struct xe_file *xef) +{ + struct xe_device *xe = xef->xe; + + xa_destroy(&xef->exec_queue.xa); + mutex_destroy(&xef->exec_queue.lock); + xa_destroy(&xef->vm.xa); + mutex_destroy(&xef->vm.lock); + + spin_lock(&xe->clients.lock); + xe->clients.count--; + spin_unlock(&xe->clients.lock); + + xe_drm_client_put(xef->client); + kfree(xef); +} + static void xe_file_close(struct drm_device *dev, struct drm_file *file) { - struct xe_device *xe = to_xe_device(dev); struct xe_file *xef = file->driver_priv; struct xe_vm *vm; struct xe_exec_queue *q; @@ -108,21 +124,12 @@ static void xe_file_close(struct drm_device *dev, struct drm_file *file) xe_exec_queue_kill(q); xe_exec_queue_put(q); } - xa_destroy(&xef->exec_queue.xa); - mutex_destroy(&xef->exec_queue.lock); mutex_lock(&xef->vm.lock); xa_for_each(&xef->vm.xa, idx, vm) xe_vm_close_and_put(vm); mutex_unlock(&xef->vm.lock); - xa_destroy(&xef->vm.xa); - mutex_destroy(&xef->vm.lock); - spin_lock(&xe->clients.lock); - xe->clients.count--; - spin_unlock(&xe->clients.lock); - - xe_drm_client_put(xef->client); - kfree(xef); + xe_file_destroy(xef); } static const struct drm_ioctl_desc xe_ioctls[] = { From d28bb0120f360e772458a7cf295d6d0ae3dc18a4 Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Thu, 18 Jul 2024 14:05:46 -0700 Subject: [PATCH 363/991] drm/xe: Add ref counting for xe_file Add ref counting for xe_file. v2: - Add kernel doc for exported functions (Matt) - Instead of xe_file_destroy, export the get/put helpers (Lucas) v3: Fixup the kernel-doc format and description (Matt, Lucas) Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20240718210548.3580382-3-umesh.nerlige.ramappa@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit ce8c161cbad43f4056451e541f7ae3471d0cca12) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_device.c | 33 ++++++++++++++++++++++++++-- drivers/gpu/drm/xe/xe_device.h | 3 +++ drivers/gpu/drm/xe/xe_device_types.h | 3 +++ 3 files changed, 37 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index 452dbc1b495baa..4178bd05701eba 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -87,11 +87,14 @@ static int xe_file_open(struct drm_device *dev, struct drm_file *file) spin_unlock(&xe->clients.lock); file->driver_priv = xef; + kref_init(&xef->refcount); + return 0; } -static void xe_file_destroy(struct xe_file *xef) +static void xe_file_destroy(struct kref *ref) { + struct xe_file *xef = container_of(ref, struct xe_file, refcount); struct xe_device *xe = xef->xe; xa_destroy(&xef->exec_queue.xa); @@ -107,6 +110,32 @@ static void xe_file_destroy(struct xe_file *xef) kfree(xef); } +/** + * xe_file_get() - Take a reference to the xe file object + * @xef: Pointer to the xe file + * + * Anyone with a pointer to xef must take a reference to the xe file + * object using this call. + * + * Return: xe file pointer + */ +struct xe_file *xe_file_get(struct xe_file *xef) +{ + kref_get(&xef->refcount); + return xef; +} + +/** + * xe_file_put() - Drop a reference to the xe file object + * @xef: Pointer to the xe file + * + * Used to drop reference to the xef object + */ +void xe_file_put(struct xe_file *xef) +{ + kref_put(&xef->refcount, xe_file_destroy); +} + static void xe_file_close(struct drm_device *dev, struct drm_file *file) { struct xe_file *xef = file->driver_priv; @@ -129,7 +158,7 @@ static void xe_file_close(struct drm_device *dev, struct drm_file *file) xe_vm_close_and_put(vm); mutex_unlock(&xef->vm.lock); - xe_file_destroy(xef); + xe_file_put(xef); } static const struct drm_ioctl_desc xe_ioctls[] = { diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h index bb07f5669dbb35..b3952718b3c1cb 100644 --- a/drivers/gpu/drm/xe/xe_device.h +++ b/drivers/gpu/drm/xe/xe_device.h @@ -170,4 +170,7 @@ static inline bool xe_device_wedged(struct xe_device *xe) void xe_device_declare_wedged(struct xe_device *xe); +struct xe_file *xe_file_get(struct xe_file *xef); +void xe_file_put(struct xe_file *xef); + #endif diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h index 3bca6d344744ac..cbc582bcc90a5f 100644 --- a/drivers/gpu/drm/xe/xe_device_types.h +++ b/drivers/gpu/drm/xe/xe_device_types.h @@ -566,6 +566,9 @@ struct xe_file { /** @client: drm client */ struct xe_drm_client *client; + + /** @refcount: ref count of this xe file */ + struct kref refcount; }; #endif From 6309f9b1fc4de2daa1293fe12a488d765e60507d Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Thu, 18 Jul 2024 14:05:47 -0700 Subject: [PATCH 364/991] drm/xe: Take a ref to xe file when user creates a VM Take a reference to xef when user creates the VM and put the reference when user destroys the VM. Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Matthew Brost Reviewed-by: Lucas De Marchi Link: https://patchwork.freedesktop.org/patch/msgid/20240718210548.3580382-4-umesh.nerlige.ramappa@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit a2387e69493df3de706f14e4573ee123d23d5d34) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_vm.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index 5b166fa03684e2..6bfcbd4e778a9e 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -1601,6 +1601,10 @@ static void vm_destroy_work_func(struct work_struct *w) XE_WARN_ON(vm->pt_root[id]); trace_xe_vm_free(vm); + + if (vm->xef) + xe_file_put(vm->xef); + kfree(vm); } @@ -1916,7 +1920,7 @@ int xe_vm_create_ioctl(struct drm_device *dev, void *data, } args->vm_id = id; - vm->xef = xef; + vm->xef = xe_file_get(xef); /* Record BO memory for VM pagetable created against client */ for_each_tile(tile, xe, id) From 817c70e2ba278e9d5360833b1137ef8855ac1728 Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Thu, 18 Jul 2024 14:05:48 -0700 Subject: [PATCH 365/991] drm/xe: Fix use after free when client stats are captured xe_file_close triggers an asynchronous queue cleanup and then frees up the xef object. Since queue cleanup flushes all pending jobs and the KMD stores client usage stats into the xef object after jobs are flushed, we see a use-after-free for the xef object. Resolve this by taking a reference to xef from xe_exec_queue. While at it, revert an earlier change that contained a partial work around for this issue. v2: - Take a ref to xef even for the VM bind queue (Matt) - Squash patches relevant to that fix and work around (Lucas) v3: Fix typo (Lucas) Fixes: ce62827bc294 ("drm/xe: Do not access xe file when updating exec queue run_ticks") Fixes: 6109f24f87d7 ("drm/xe: Add helper to accumulate exec queue runtime") Closes: https://gitlab.freedesktop.org/drm/xe/kernel/issues/1908 Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Matthew Brost Reviewed-by: Lucas De Marchi Link: https://patchwork.freedesktop.org/patch/msgid/20240718210548.3580382-5-umesh.nerlige.ramappa@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 2149ded63079449b8dddf9da38392632f155e6b5) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_drm_client.c | 5 +---- drivers/gpu/drm/xe/xe_exec_queue.c | 10 +++++++++- drivers/gpu/drm/xe/xe_exec_queue_types.h | 7 +++---- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_drm_client.c b/drivers/gpu/drm/xe/xe_drm_client.c index 6a26923fa10e0f..7ddd59908334c5 100644 --- a/drivers/gpu/drm/xe/xe_drm_client.c +++ b/drivers/gpu/drm/xe/xe_drm_client.c @@ -251,11 +251,8 @@ static void show_run_ticks(struct drm_printer *p, struct drm_file *file) /* Accumulate all the exec queues from this client */ mutex_lock(&xef->exec_queue.lock); - xa_for_each(&xef->exec_queue.xa, i, q) { + xa_for_each(&xef->exec_queue.xa, i, q) xe_exec_queue_update_run_ticks(q); - xef->run_ticks[q->class] += q->run_ticks - q->old_run_ticks; - q->old_run_ticks = q->run_ticks; - } mutex_unlock(&xef->exec_queue.lock); /* Get the total GPU cycles */ diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c index 0ba37835849b01..a39384bb9553ff 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.c +++ b/drivers/gpu/drm/xe/xe_exec_queue.c @@ -37,6 +37,10 @@ static void __xe_exec_queue_free(struct xe_exec_queue *q) { if (q->vm) xe_vm_put(q->vm); + + if (q->xef) + xe_file_put(q->xef); + kfree(q); } @@ -649,6 +653,7 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data, goto kill_exec_queue; args->exec_queue_id = id; + q->xef = xe_file_get(xef); return 0; @@ -762,6 +767,7 @@ bool xe_exec_queue_is_idle(struct xe_exec_queue *q) */ void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q) { + struct xe_file *xef; struct xe_lrc *lrc; u32 old_ts, new_ts; @@ -773,6 +779,8 @@ void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q) if (!q->vm || !q->vm->xef) return; + xef = q->vm->xef; + /* * Only sample the first LRC. For parallel submission, all of them are * scheduled together and we compensate that below by multiplying by @@ -783,7 +791,7 @@ void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q) */ lrc = q->lrc[0]; new_ts = xe_lrc_update_timestamp(lrc, &old_ts); - q->run_ticks += (new_ts - old_ts) * q->width; + xef->run_ticks[q->class] += (new_ts - old_ts) * q->width; } void xe_exec_queue_kill(struct xe_exec_queue *q) diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h index 201588ec33c359..a35ce24c97982c 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h @@ -38,6 +38,9 @@ enum xe_exec_queue_priority { * a kernel object. */ struct xe_exec_queue { + /** @xef: Back pointer to xe file if this is user created exec queue */ + struct xe_file *xef; + /** @gt: graphics tile this exec queue can submit to */ struct xe_gt *gt; /** @@ -139,10 +142,6 @@ struct xe_exec_queue { * Protected by @vm's resv. Unused if @vm == NULL. */ u64 tlb_flush_seqno; - /** @old_run_ticks: prior hw engine class run time in ticks for this exec queue */ - u64 old_run_ticks; - /** @run_ticks: hw engine class run time in ticks for this exec queue */ - u64 run_ticks; /** @lrc: logical ring context for this exec queue */ struct xe_lrc *lrc[]; }; From 64da63cd3f7d771bf8f240e72203da1f72aa3728 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Thu, 18 Jul 2024 22:31:55 +0200 Subject: [PATCH 366/991] drm/xe/vf: Fix register value lookup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We should use the number of actual entries stored in the runtime register buffer, not the maximum number of entries that this buffer can hold, otherwise bsearch() may fail and we may miss the data and wrongly report unexpected access to some registers. Fixes: 4edadc41a3a4 ("drm/xe/vf: Use register values obtained from the PF") Signed-off-by: Michal Wajdeczko Cc: Piotr Piórkowski Cc: Matt Roper Reviewed-by: Matt Roper Link: https://patchwork.freedesktop.org/patch/msgid/20240718203155.486-1-michal.wajdeczko@intel.com (cherry picked from commit ad16682db18f4414e53bba1ce0db75b08bdc4dff) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_gt_sriov_vf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c index 41e46a00c01e89..8892d6c2291ebb 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c @@ -850,7 +850,7 @@ static struct vf_runtime_reg *vf_lookup_reg(struct xe_gt *gt, u32 addr) xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt))); - return bsearch(&key, runtime->regs, runtime->regs_size, sizeof(key), + return bsearch(&key, runtime->regs, runtime->num_regs, sizeof(key), vf_runtime_reg_cmp); } From 55ea73aacfb9a92def840a7110a468c5a76caeb5 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Fri, 19 Jul 2024 10:29:05 -0700 Subject: [PATCH 367/991] drm/xe: Build PM into GuC CT layer Take PM ref when any G2H are outstanding, drop when none are outstanding. To safely ensure we have PM ref when in the GuC CT layer, a PM ref needs to be held when scheduler messages are pending too. v2: - Add outer PM protections to xe_file_close (CI) v3: - Only take PM ref 0->1 and drop on 1->0 (Matthew Auld) v4: - Add assert to G2H increment function v5: - Rebase v6: - Declare xe as local variable in xe_file_close (CI) Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Cc: Matthew Auld Cc: Rodrigo Vivi Cc: Nirmoy Das Signed-off-by: Matthew Brost Reviewed-by: Matthew Auld Reviewed-by: Nirmoy Das Link: https://patchwork.freedesktop.org/patch/msgid/20240719172905.1527927-5-matthew.brost@intel.com (cherry picked from commit d930c19fdff3109e97b610fa10943b7602efcabd) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_device.c | 5 +++++ drivers/gpu/drm/xe/xe_guc_ct.c | 10 +++++++++- drivers/gpu/drm/xe/xe_guc_submit.c | 4 ++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index 4178bd05701eba..f2f1d8ddb22138 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -138,11 +138,14 @@ void xe_file_put(struct xe_file *xef) static void xe_file_close(struct drm_device *dev, struct drm_file *file) { + struct xe_device *xe = to_xe_device(dev); struct xe_file *xef = file->driver_priv; struct xe_vm *vm; struct xe_exec_queue *q; unsigned long idx; + xe_pm_runtime_get(xe); + /* * No need for exec_queue.lock here as there is no contention for it * when FD is closing as IOCTLs presumably can't be modifying the @@ -159,6 +162,8 @@ static void xe_file_close(struct drm_device *dev, struct drm_file *file) mutex_unlock(&xef->vm.lock); xe_file_put(xef); + + xe_pm_runtime_put(xe); } static const struct drm_ioctl_desc xe_ioctls[] = { diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c index 7d2e937da1d83d..64afc90ad2c519 100644 --- a/drivers/gpu/drm/xe/xe_guc_ct.c +++ b/drivers/gpu/drm/xe/xe_guc_ct.c @@ -327,6 +327,8 @@ static void xe_guc_ct_set_state(struct xe_guc_ct *ct, xe_gt_assert(ct_to_gt(ct), ct->g2h_outstanding == 0 || state == XE_GUC_CT_STATE_STOPPED); + if (ct->g2h_outstanding) + xe_pm_runtime_put(ct_to_xe(ct)); ct->g2h_outstanding = 0; ct->state = state; @@ -495,10 +497,15 @@ static void h2g_reserve_space(struct xe_guc_ct *ct, u32 cmd_len) static void __g2h_reserve_space(struct xe_guc_ct *ct, u32 g2h_len, u32 num_g2h) { xe_gt_assert(ct_to_gt(ct), g2h_len <= ct->ctbs.g2h.info.space); + xe_gt_assert(ct_to_gt(ct), (!g2h_len && !num_g2h) || + (g2h_len && num_g2h)); if (g2h_len) { lockdep_assert_held(&ct->fast_lock); + if (!ct->g2h_outstanding) + xe_pm_runtime_get_noresume(ct_to_xe(ct)); + ct->ctbs.g2h.info.space -= g2h_len; ct->g2h_outstanding += num_g2h; } @@ -511,7 +518,8 @@ static void __g2h_release_space(struct xe_guc_ct *ct, u32 g2h_len) ct->ctbs.g2h.info.size - ct->ctbs.g2h.info.resv_space); ct->ctbs.g2h.info.space += g2h_len; - --ct->g2h_outstanding; + if (!--ct->g2h_outstanding) + xe_pm_runtime_put(ct_to_xe(ct)); } static void g2h_release_space(struct xe_guc_ct *ct, u32 g2h_len) diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index 8d7e7f4bbff716..6398629e6b4ecd 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -1393,6 +1393,8 @@ static void guc_exec_queue_process_msg(struct xe_sched_msg *msg) default: XE_WARN_ON("Unknown message type"); } + + xe_pm_runtime_put(guc_to_xe(exec_queue_to_guc(msg->private_data))); } static const struct drm_sched_backend_ops drm_sched_ops = { @@ -1482,6 +1484,8 @@ static void guc_exec_queue_kill(struct xe_exec_queue *q) static void guc_exec_queue_add_msg(struct xe_exec_queue *q, struct xe_sched_msg *msg, u32 opcode) { + xe_pm_runtime_get_noresume(guc_to_xe(exec_queue_to_guc(q))); + INIT_LIST_HEAD(&msg->link); msg->opcode = opcode; msg->private_data = q; From 4f7652dcd339aca6678084d42fda999ecb19b624 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Tue, 6 Aug 2024 20:05:16 +0200 Subject: [PATCH 368/991] drm/xe/pf: Fix VF config validation on multi-GT platforms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When validating VF config on the media GT, we may wrongly report that VF is already partially configured on it, as we consider GGTT and LMEM provisioning done on the primary GT (since both GGTT and LMEM are tile-level resources, not a GT-level). This will cause skipping a VF auto-provisioning on the media-GT and in result will block a VF from successfully initialize that GT. Fix that by considering GGTT and LMEM configurations only when checking if a VF provisioning is complete, and omit GGTT and LMEM when reporting empty/partial provisioning. Fixes: 234670cea9a2 ("drm/xe/pf: Skip fair VFs provisioning if already provisioned") Signed-off-by: Michal Wajdeczko Cc: Piotr Piórkowski Reviewed-by: Jonathan Cavitt Link: https://patchwork.freedesktop.org/patch/msgid/20240806180516.618-1-michal.wajdeczko@intel.com (cherry picked from commit 5bdacb0907c1f531995b6ba47b832ac3a0182ae9) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c index 4699b78360013c..b6f0a7299c0309 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c @@ -1927,6 +1927,7 @@ static int pf_validate_vf_config(struct xe_gt *gt, unsigned int vfid) { struct xe_gt *primary_gt = gt_to_tile(gt)->primary_gt; struct xe_device *xe = gt_to_xe(gt); + bool is_primary = !xe_gt_is_media_type(gt); bool valid_ggtt, valid_ctxs, valid_dbs; bool valid_any, valid_all; @@ -1935,13 +1936,17 @@ static int pf_validate_vf_config(struct xe_gt *gt, unsigned int vfid) valid_dbs = pf_get_vf_config_dbs(gt, vfid); /* note that GuC doorbells are optional */ - valid_any = valid_ggtt || valid_ctxs || valid_dbs; - valid_all = valid_ggtt && valid_ctxs; + valid_any = valid_ctxs || valid_dbs; + valid_all = valid_ctxs; + + /* and GGTT/LMEM is configured on primary GT only */ + valid_all = valid_all && valid_ggtt; + valid_any = valid_any || (valid_ggtt && is_primary); if (IS_DGFX(xe)) { bool valid_lmem = pf_get_vf_config_ggtt(primary_gt, vfid); - valid_any = valid_any || valid_lmem; + valid_any = valid_any || (valid_lmem && is_primary); valid_all = valid_all && valid_lmem; } From 90be4cc6f7674a1478c4c750beeee3edd14aee38 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Fri, 19 Jul 2024 10:29:02 -0700 Subject: [PATCH 369/991] drm/xe: Add xe_gt_tlb_invalidation_fence_init helper Other layers should not be touching struct xe_gt_tlb_invalidation_fence directly, add helper for initialization. v2: - Add dma_fence_get and list init to xe_gt_tlb_invalidation_fence_init Signed-off-by: Matthew Brost Reviewed-by: Nirmoy Das Link: https://patchwork.freedesktop.org/patch/msgid/20240719172905.1527927-2-matthew.brost@intel.com (cherry picked from commit a522b285c6b4b611406d59612a8d7241714d2e31) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 36 +++++++++++++++++++++ drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h | 3 ++ drivers/gpu/drm/xe/xe_pt.c | 26 +-------------- 3 files changed, 40 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c index d9359976ab8bb6..92a18a0e4acd19 100644 --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c @@ -508,3 +508,39 @@ int xe_guc_tlb_invalidation_done_handler(struct xe_guc *guc, u32 *msg, u32 len) return 0; } + +static const char * +invalidation_fence_get_driver_name(struct dma_fence *dma_fence) +{ + return "xe"; +} + +static const char * +invalidation_fence_get_timeline_name(struct dma_fence *dma_fence) +{ + return "invalidation_fence"; +} + +static const struct dma_fence_ops invalidation_fence_ops = { + .get_driver_name = invalidation_fence_get_driver_name, + .get_timeline_name = invalidation_fence_get_timeline_name, +}; + +/** + * xe_gt_tlb_invalidation_fence_init - Initialize TLB invalidation fence + * @gt: GT + * @fence: TLB invalidation fence to initialize + * + * Initialize TLB invalidation fence for use + */ +void xe_gt_tlb_invalidation_fence_init(struct xe_gt *gt, + struct xe_gt_tlb_invalidation_fence *fence) +{ + spin_lock_irq(>->tlb_invalidation.lock); + dma_fence_init(&fence->base, &invalidation_fence_ops, + >->tlb_invalidation.lock, + dma_fence_context_alloc(1), 1); + spin_unlock_irq(>->tlb_invalidation.lock); + INIT_LIST_HEAD(&fence->link); + dma_fence_get(&fence->base); +} diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h index bf3bebd9f985ba..948f4a2f52144e 100644 --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h @@ -26,4 +26,7 @@ int xe_gt_tlb_invalidation_range(struct xe_gt *gt, int xe_gt_tlb_invalidation_wait(struct xe_gt *gt, int seqno); int xe_guc_tlb_invalidation_done_handler(struct xe_guc *guc, u32 *msg, u32 len); +void xe_gt_tlb_invalidation_fence_init(struct xe_gt *gt, + struct xe_gt_tlb_invalidation_fence *fence); + #endif /* _XE_GT_TLB_INVALIDATION_ */ diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c index ade9e7a3a0adb4..4c17a1ec8f8b90 100644 --- a/drivers/gpu/drm/xe/xe_pt.c +++ b/drivers/gpu/drm/xe/xe_pt.c @@ -1115,23 +1115,6 @@ struct invalidation_fence { u32 asid; }; -static const char * -invalidation_fence_get_driver_name(struct dma_fence *dma_fence) -{ - return "xe"; -} - -static const char * -invalidation_fence_get_timeline_name(struct dma_fence *dma_fence) -{ - return "invalidation_fence"; -} - -static const struct dma_fence_ops invalidation_fence_ops = { - .get_driver_name = invalidation_fence_get_driver_name, - .get_timeline_name = invalidation_fence_get_timeline_name, -}; - static void invalidation_fence_cb(struct dma_fence *fence, struct dma_fence_cb *cb) { @@ -1170,15 +1153,8 @@ static int invalidation_fence_init(struct xe_gt *gt, trace_xe_gt_tlb_invalidation_fence_create(gt_to_xe(gt), &ifence->base); - spin_lock_irq(>->tlb_invalidation.lock); - dma_fence_init(&ifence->base.base, &invalidation_fence_ops, - >->tlb_invalidation.lock, - dma_fence_context_alloc(1), 1); - spin_unlock_irq(>->tlb_invalidation.lock); - - INIT_LIST_HEAD(&ifence->base.link); + xe_gt_tlb_invalidation_fence_init(gt, &ifence->base); - dma_fence_get(&ifence->base.base); /* Ref for caller */ ifence->fence = fence; ifence->gt = gt; ifence->start = start; From 58bfe6674467f4c037e89111e6007f25b34d8bb3 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Fri, 19 Jul 2024 10:29:03 -0700 Subject: [PATCH 370/991] drm/xe: Drop xe_gt_tlb_invalidation_wait Having two methods to wait on GT TLB invalidations is not ideal. Remove xe_gt_tlb_invalidation_wait and only use GT TLB invalidation fences. In addition to two methods being less than ideal, once GT TLB invalidations are coalesced the seqno cannot be assigned during xe_gt_tlb_invalidation_ggtt/range. Thus xe_gt_tlb_invalidation_wait would not have a seqno to wait one. A fence however can be armed and later signaled. v3: - Add explaination about coalescing to commit message v4: - Don't put dma fence if defined on stack (CI) v5: - Initialize ret to zero (CI) v6: - Use invalidation_fence_signal helper in tlb timeout (Matthew Auld) Signed-off-by: Matthew Brost Reviewed-by: Nirmoy Das Link: https://patchwork.freedesktop.org/patch/msgid/20240719172905.1527927-3-matthew.brost@intel.com (cherry picked from commit 61ac035361ae555ee5a17a7667fe96afdde3d59a) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 148 ++++++++------------ drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h | 10 +- drivers/gpu/drm/xe/xe_pt.c | 2 +- drivers/gpu/drm/xe/xe_vm.c | 30 ++-- 4 files changed, 80 insertions(+), 110 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c index 92a18a0e4acd19..c3419d4412ce85 100644 --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c @@ -17,6 +17,8 @@ #include "xe_trace.h" #include "regs/xe_guc_regs.h" +#define FENCE_STACK_BIT DMA_FENCE_FLAG_USER_BITS + /* * TLB inval depends on pending commands in the CT queue and then the real * invalidation time. Double up the time to process full CT queue @@ -33,6 +35,23 @@ static long tlb_timeout_jiffies(struct xe_gt *gt) return hw_tlb_timeout + 2 * delay; } +static void +__invalidation_fence_signal(struct xe_device *xe, struct xe_gt_tlb_invalidation_fence *fence) +{ + bool stack = test_bit(FENCE_STACK_BIT, &fence->base.flags); + + trace_xe_gt_tlb_invalidation_fence_signal(xe, fence); + dma_fence_signal(&fence->base); + if (!stack) + dma_fence_put(&fence->base); +} + +static void +invalidation_fence_signal(struct xe_device *xe, struct xe_gt_tlb_invalidation_fence *fence) +{ + list_del(&fence->link); + __invalidation_fence_signal(xe, fence); +} static void xe_gt_tlb_fence_timeout(struct work_struct *work) { @@ -54,10 +73,8 @@ static void xe_gt_tlb_fence_timeout(struct work_struct *work) xe_gt_err(gt, "TLB invalidation fence timeout, seqno=%d recv=%d", fence->seqno, gt->tlb_invalidation.seqno_recv); - list_del(&fence->link); fence->base.error = -ETIME; - dma_fence_signal(&fence->base); - dma_fence_put(&fence->base); + invalidation_fence_signal(xe, fence); } if (!list_empty(>->tlb_invalidation.pending_fences)) queue_delayed_work(system_wq, @@ -87,21 +104,6 @@ int xe_gt_tlb_invalidation_init(struct xe_gt *gt) return 0; } -static void -__invalidation_fence_signal(struct xe_device *xe, struct xe_gt_tlb_invalidation_fence *fence) -{ - trace_xe_gt_tlb_invalidation_fence_signal(xe, fence); - dma_fence_signal(&fence->base); - dma_fence_put(&fence->base); -} - -static void -invalidation_fence_signal(struct xe_device *xe, struct xe_gt_tlb_invalidation_fence *fence) -{ - list_del(&fence->link); - __invalidation_fence_signal(xe, fence); -} - /** * xe_gt_tlb_invalidation_reset - Initialize GT TLB invalidation reset * @gt: graphics tile @@ -111,7 +113,6 @@ invalidation_fence_signal(struct xe_device *xe, struct xe_gt_tlb_invalidation_fe void xe_gt_tlb_invalidation_reset(struct xe_gt *gt) { struct xe_gt_tlb_invalidation_fence *fence, *next; - struct xe_guc *guc = >->uc.guc; int pending_seqno; /* @@ -134,7 +135,6 @@ void xe_gt_tlb_invalidation_reset(struct xe_gt *gt) else pending_seqno = gt->tlb_invalidation.seqno - 1; WRITE_ONCE(gt->tlb_invalidation.seqno_recv, pending_seqno); - wake_up_all(&guc->ct.wq); list_for_each_entry_safe(fence, next, >->tlb_invalidation.pending_fences, link) @@ -165,6 +165,8 @@ static int send_tlb_invalidation(struct xe_guc *guc, int seqno; int ret; + xe_gt_assert(gt, fence); + /* * XXX: The seqno algorithm relies on TLB invalidation being processed * in order which they currently are, if that changes the algorithm will @@ -173,10 +175,8 @@ static int send_tlb_invalidation(struct xe_guc *guc, mutex_lock(&guc->ct.lock); seqno = gt->tlb_invalidation.seqno; - if (fence) { - fence->seqno = seqno; - trace_xe_gt_tlb_invalidation_fence_send(xe, fence); - } + fence->seqno = seqno; + trace_xe_gt_tlb_invalidation_fence_send(xe, fence); action[1] = seqno; ret = xe_guc_ct_send_locked(&guc->ct, action, len, G2H_LEN_DW_TLB_INVALIDATE, 1); @@ -209,7 +209,6 @@ static int send_tlb_invalidation(struct xe_guc *guc, TLB_INVALIDATION_SEQNO_MAX; if (!gt->tlb_invalidation.seqno) gt->tlb_invalidation.seqno = 1; - ret = seqno; } mutex_unlock(&guc->ct.lock); @@ -223,14 +222,16 @@ static int send_tlb_invalidation(struct xe_guc *guc, /** * xe_gt_tlb_invalidation_guc - Issue a TLB invalidation on this GT for the GuC * @gt: graphics tile + * @fence: invalidation fence which will be signal on TLB invalidation + * completion * * Issue a TLB invalidation for the GuC. Completion of TLB is asynchronous and - * caller can use seqno + xe_gt_tlb_invalidation_wait to wait for completion. + * caller can use the invalidation fence to wait for completion. * - * Return: Seqno which can be passed to xe_gt_tlb_invalidation_wait on success, - * negative error code on error. + * Return: 0 on success, negative error code on error */ -static int xe_gt_tlb_invalidation_guc(struct xe_gt *gt) +static int xe_gt_tlb_invalidation_guc(struct xe_gt *gt, + struct xe_gt_tlb_invalidation_fence *fence) { u32 action[] = { XE_GUC_ACTION_TLB_INVALIDATION, @@ -238,7 +239,7 @@ static int xe_gt_tlb_invalidation_guc(struct xe_gt *gt) MAKE_INVAL_OP(XE_GUC_TLB_INVAL_GUC), }; - return send_tlb_invalidation(>->uc.guc, NULL, action, + return send_tlb_invalidation(>->uc.guc, fence, action, ARRAY_SIZE(action)); } @@ -257,13 +258,15 @@ int xe_gt_tlb_invalidation_ggtt(struct xe_gt *gt) if (xe_guc_ct_enabled(>->uc.guc.ct) && gt->uc.guc.submission_state.enabled) { - int seqno; + struct xe_gt_tlb_invalidation_fence fence; + int ret; - seqno = xe_gt_tlb_invalidation_guc(gt); - if (seqno <= 0) - return seqno; + xe_gt_tlb_invalidation_fence_init(gt, &fence, true); + ret = xe_gt_tlb_invalidation_guc(gt, &fence); + if (ret < 0) + return ret; - xe_gt_tlb_invalidation_wait(gt, seqno); + xe_gt_tlb_invalidation_fence_wait(&fence); } else if (xe_device_uc_enabled(xe) && !xe_device_wedged(xe)) { if (IS_SRIOV_VF(xe)) return 0; @@ -290,18 +293,16 @@ int xe_gt_tlb_invalidation_ggtt(struct xe_gt *gt) * * @gt: graphics tile * @fence: invalidation fence which will be signal on TLB invalidation - * completion, can be NULL + * completion * @start: start address * @end: end address * @asid: address space id * * Issue a range based TLB invalidation if supported, if not fallback to a full - * TLB invalidation. Completion of TLB is asynchronous and caller can either use - * the invalidation fence or seqno + xe_gt_tlb_invalidation_wait to wait for - * completion. + * TLB invalidation. Completion of TLB is asynchronous and caller can use + * the invalidation fence to wait for completion. * - * Return: Seqno which can be passed to xe_gt_tlb_invalidation_wait on success, - * negative error code on error. + * Return: Negative error code on error, 0 on success */ int xe_gt_tlb_invalidation_range(struct xe_gt *gt, struct xe_gt_tlb_invalidation_fence *fence, @@ -312,11 +313,11 @@ int xe_gt_tlb_invalidation_range(struct xe_gt *gt, u32 action[MAX_TLB_INVALIDATION_LEN]; int len = 0; + xe_gt_assert(gt, fence); + /* Execlists not supported */ if (gt_to_xe(gt)->info.force_execlist) { - if (fence) - __invalidation_fence_signal(xe, fence); - + __invalidation_fence_signal(xe, fence); return 0; } @@ -382,12 +383,10 @@ int xe_gt_tlb_invalidation_range(struct xe_gt *gt, * @vma: VMA to invalidate * * Issue a range based TLB invalidation if supported, if not fallback to a full - * TLB invalidation. Completion of TLB is asynchronous and caller can either use - * the invalidation fence or seqno + xe_gt_tlb_invalidation_wait to wait for - * completion. + * TLB invalidation. Completion of TLB is asynchronous and caller can use + * the invalidation fence to wait for completion. * - * Return: Seqno which can be passed to xe_gt_tlb_invalidation_wait on success, - * negative error code on error. + * Return: Negative error code on error, 0 on success */ int xe_gt_tlb_invalidation_vma(struct xe_gt *gt, struct xe_gt_tlb_invalidation_fence *fence, @@ -400,43 +399,6 @@ int xe_gt_tlb_invalidation_vma(struct xe_gt *gt, xe_vma_vm(vma)->usm.asid); } -/** - * xe_gt_tlb_invalidation_wait - Wait for TLB to complete - * @gt: graphics tile - * @seqno: seqno to wait which was returned from xe_gt_tlb_invalidation - * - * Wait for tlb_timeout_jiffies() for a TLB invalidation to complete. - * - * Return: 0 on success, -ETIME on TLB invalidation timeout - */ -int xe_gt_tlb_invalidation_wait(struct xe_gt *gt, int seqno) -{ - struct xe_guc *guc = >->uc.guc; - int ret; - - /* Execlists not supported */ - if (gt_to_xe(gt)->info.force_execlist) - return 0; - - /* - * XXX: See above, this algorithm only works if seqno are always in - * order - */ - ret = wait_event_timeout(guc->ct.wq, - tlb_invalidation_seqno_past(gt, seqno), - tlb_timeout_jiffies(gt)); - if (!ret) { - struct drm_printer p = xe_gt_err_printer(gt); - - xe_gt_err(gt, "TLB invalidation time'd out, seqno=%d, recv=%d\n", - seqno, gt->tlb_invalidation.seqno_recv); - xe_guc_ct_print(&guc->ct, &p, true); - return -ETIME; - } - - return 0; -} - /** * xe_guc_tlb_invalidation_done_handler - TLB invalidation done handler * @guc: guc @@ -480,12 +442,7 @@ int xe_guc_tlb_invalidation_done_handler(struct xe_guc *guc, u32 *msg, u32 len) return 0; } - /* - * wake_up_all() and wait_event_timeout() already have the correct - * barriers. - */ WRITE_ONCE(gt->tlb_invalidation.seqno_recv, msg[0]); - wake_up_all(&guc->ct.wq); list_for_each_entry_safe(fence, next, >->tlb_invalidation.pending_fences, link) { @@ -530,11 +487,13 @@ static const struct dma_fence_ops invalidation_fence_ops = { * xe_gt_tlb_invalidation_fence_init - Initialize TLB invalidation fence * @gt: GT * @fence: TLB invalidation fence to initialize + * @stack: fence is stack variable * * Initialize TLB invalidation fence for use */ void xe_gt_tlb_invalidation_fence_init(struct xe_gt *gt, - struct xe_gt_tlb_invalidation_fence *fence) + struct xe_gt_tlb_invalidation_fence *fence, + bool stack) { spin_lock_irq(>->tlb_invalidation.lock); dma_fence_init(&fence->base, &invalidation_fence_ops, @@ -542,5 +501,8 @@ void xe_gt_tlb_invalidation_fence_init(struct xe_gt *gt, dma_fence_context_alloc(1), 1); spin_unlock_irq(>->tlb_invalidation.lock); INIT_LIST_HEAD(&fence->link); - dma_fence_get(&fence->base); + if (stack) + set_bit(FENCE_STACK_BIT, &fence->base.flags); + else + dma_fence_get(&fence->base); } diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h index 948f4a2f52144e..f430d5797af701 100644 --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h @@ -23,10 +23,16 @@ int xe_gt_tlb_invalidation_vma(struct xe_gt *gt, int xe_gt_tlb_invalidation_range(struct xe_gt *gt, struct xe_gt_tlb_invalidation_fence *fence, u64 start, u64 end, u32 asid); -int xe_gt_tlb_invalidation_wait(struct xe_gt *gt, int seqno); int xe_guc_tlb_invalidation_done_handler(struct xe_guc *guc, u32 *msg, u32 len); void xe_gt_tlb_invalidation_fence_init(struct xe_gt *gt, - struct xe_gt_tlb_invalidation_fence *fence); + struct xe_gt_tlb_invalidation_fence *fence, + bool stack); + +static inline void +xe_gt_tlb_invalidation_fence_wait(struct xe_gt_tlb_invalidation_fence *fence) +{ + dma_fence_wait(&fence->base, false); +} #endif /* _XE_GT_TLB_INVALIDATION_ */ diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c index 4c17a1ec8f8b90..31a751a5de3f1f 100644 --- a/drivers/gpu/drm/xe/xe_pt.c +++ b/drivers/gpu/drm/xe/xe_pt.c @@ -1153,7 +1153,7 @@ static int invalidation_fence_init(struct xe_gt *gt, trace_xe_gt_tlb_invalidation_fence_create(gt_to_xe(gt), &ifence->base); - xe_gt_tlb_invalidation_fence_init(gt, &ifence->base); + xe_gt_tlb_invalidation_fence_init(gt, &ifence->base, false); ifence->fence = fence; ifence->gt = gt; diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index 6bfcbd4e778a9e..931935ec33dbe9 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -3341,10 +3341,10 @@ int xe_vm_invalidate_vma(struct xe_vma *vma) { struct xe_device *xe = xe_vma_vm(vma)->xe; struct xe_tile *tile; + struct xe_gt_tlb_invalidation_fence fence[XE_MAX_TILES_PER_DEVICE]; u32 tile_needs_invalidate = 0; - int seqno[XE_MAX_TILES_PER_DEVICE]; u8 id; - int ret; + int ret = 0; xe_assert(xe, !xe_vma_is_null(vma)); trace_xe_vma_invalidate(vma); @@ -3369,29 +3369,31 @@ int xe_vm_invalidate_vma(struct xe_vma *vma) for_each_tile(tile, xe, id) { if (xe_pt_zap_ptes(tile, vma)) { - tile_needs_invalidate |= BIT(id); xe_device_wmb(xe); + xe_gt_tlb_invalidation_fence_init(tile->primary_gt, + &fence[id], true); + /* * FIXME: We potentially need to invalidate multiple * GTs within the tile */ - seqno[id] = xe_gt_tlb_invalidation_vma(tile->primary_gt, NULL, vma); - if (seqno[id] < 0) - return seqno[id]; - } - } - - for_each_tile(tile, xe, id) { - if (tile_needs_invalidate & BIT(id)) { - ret = xe_gt_tlb_invalidation_wait(tile->primary_gt, seqno[id]); + ret = xe_gt_tlb_invalidation_vma(tile->primary_gt, + &fence[id], vma); if (ret < 0) - return ret; + goto wait; + + tile_needs_invalidate |= BIT(id); } } +wait: + for_each_tile(tile, xe, id) + if (tile_needs_invalidate & BIT(id)) + xe_gt_tlb_invalidation_fence_wait(&fence[id]); + vma->tile_invalidated = vma->tile_mask; - return 0; + return ret; } struct xe_vm_snapshot { From f002702290fccbd473f5bb94e52f25c96917fff2 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Fri, 19 Jul 2024 10:29:04 -0700 Subject: [PATCH 371/991] drm/xe: Hold a PM ref when GT TLB invalidations are inflight Avoid GT TLB invalidation timeouts by holding a PM ref when invalidations are inflight. v2: - Drop PM ref before signaling fence (CI) v3: - Move invalidation_fence_signal helper in tlb timeout to previous patch (Matthew Auld) Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Cc: Rodrigo Vivi Cc: Nirmoy Das Signed-off-by: Matthew Brost Reviewed-by: Nirmoy Das Link: https://patchwork.freedesktop.org/patch/msgid/20240719172905.1527927-4-matthew.brost@intel.com (cherry picked from commit 0a382f9bc5dc4744a33970a5ed4df8f9c702ee9e) Requires: 46209ce5287b ("drm/xe: Add xe_gt_tlb_invalidation_fence_init helper") Requires: 0e414ab036e0 ("drm/xe: Drop xe_gt_tlb_invalidation_wait") Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 23 +++++++++++++++++-- drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h | 1 + .../gpu/drm/xe/xe_gt_tlb_invalidation_types.h | 4 ++++ drivers/gpu/drm/xe/xe_vm.c | 4 +++- 4 files changed, 29 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c index c3419d4412ce85..481d83d07367d2 100644 --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c @@ -13,6 +13,7 @@ #include "xe_guc.h" #include "xe_guc_ct.h" #include "xe_mmio.h" +#include "xe_pm.h" #include "xe_sriov.h" #include "xe_trace.h" #include "regs/xe_guc_regs.h" @@ -41,6 +42,7 @@ __invalidation_fence_signal(struct xe_device *xe, struct xe_gt_tlb_invalidation_ bool stack = test_bit(FENCE_STACK_BIT, &fence->base.flags); trace_xe_gt_tlb_invalidation_fence_signal(xe, fence); + xe_gt_tlb_invalidation_fence_fini(fence); dma_fence_signal(&fence->base); if (!stack) dma_fence_put(&fence->base); @@ -263,8 +265,10 @@ int xe_gt_tlb_invalidation_ggtt(struct xe_gt *gt) xe_gt_tlb_invalidation_fence_init(gt, &fence, true); ret = xe_gt_tlb_invalidation_guc(gt, &fence); - if (ret < 0) + if (ret < 0) { + xe_gt_tlb_invalidation_fence_fini(&fence); return ret; + } xe_gt_tlb_invalidation_fence_wait(&fence); } else if (xe_device_uc_enabled(xe) && !xe_device_wedged(xe)) { @@ -489,12 +493,15 @@ static const struct dma_fence_ops invalidation_fence_ops = { * @fence: TLB invalidation fence to initialize * @stack: fence is stack variable * - * Initialize TLB invalidation fence for use + * Initialize TLB invalidation fence for use. xe_gt_tlb_invalidation_fence_fini + * must be called if fence is not signaled. */ void xe_gt_tlb_invalidation_fence_init(struct xe_gt *gt, struct xe_gt_tlb_invalidation_fence *fence, bool stack) { + xe_pm_runtime_get_noresume(gt_to_xe(gt)); + spin_lock_irq(>->tlb_invalidation.lock); dma_fence_init(&fence->base, &invalidation_fence_ops, >->tlb_invalidation.lock, @@ -505,4 +512,16 @@ void xe_gt_tlb_invalidation_fence_init(struct xe_gt *gt, set_bit(FENCE_STACK_BIT, &fence->base.flags); else dma_fence_get(&fence->base); + fence->gt = gt; +} + +/** + * xe_gt_tlb_invalidation_fence_fini - Finalize TLB invalidation fence + * @fence: TLB invalidation fence to finalize + * + * Drop PM ref which fence took durinig init. + */ +void xe_gt_tlb_invalidation_fence_fini(struct xe_gt_tlb_invalidation_fence *fence) +{ + xe_pm_runtime_put(gt_to_xe(fence->gt)); } diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h index f430d5797af701..a84065fa324c76 100644 --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h @@ -28,6 +28,7 @@ int xe_guc_tlb_invalidation_done_handler(struct xe_guc *guc, u32 *msg, u32 len); void xe_gt_tlb_invalidation_fence_init(struct xe_gt *gt, struct xe_gt_tlb_invalidation_fence *fence, bool stack); +void xe_gt_tlb_invalidation_fence_fini(struct xe_gt_tlb_invalidation_fence *fence); static inline void xe_gt_tlb_invalidation_fence_wait(struct xe_gt_tlb_invalidation_fence *fence) diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation_types.h b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation_types.h index 934c828efe31cb..de6e825e0851e5 100644 --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation_types.h +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation_types.h @@ -8,6 +8,8 @@ #include +struct xe_gt; + /** * struct xe_gt_tlb_invalidation_fence - XE GT TLB invalidation fence * @@ -17,6 +19,8 @@ struct xe_gt_tlb_invalidation_fence { /** @base: dma fence base */ struct dma_fence base; + /** @gt: GT which fence belong to */ + struct xe_gt *gt; /** @link: link into list of pending tlb fences */ struct list_head link; /** @seqno: seqno of TLB invalidation to signal fence one */ diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index 931935ec33dbe9..c7561a56abaf20 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -3379,8 +3379,10 @@ int xe_vm_invalidate_vma(struct xe_vma *vma) */ ret = xe_gt_tlb_invalidation_vma(tile->primary_gt, &fence[id], vma); - if (ret < 0) + if (ret < 0) { + xe_gt_tlb_invalidation_fence_fini(&fence[id]); goto wait; + } tile_needs_invalidate |= BIT(id); } From af8e119f52e9c13e556be9e03f27957554a84656 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Thu, 15 Aug 2024 17:11:17 +0300 Subject: [PATCH 372/991] xhci: Fix Panther point NULL pointer deref at full-speed re-enumeration re-enumerating full-speed devices after a failed address device command can trigger a NULL pointer dereference. Full-speed devices may need to reconfigure the endpoint 0 Max Packet Size value during enumeration. Usb core calls usb_ep0_reinit() in this case, which ends up calling xhci_configure_endpoint(). On Panther point xHC the xhci_configure_endpoint() function will additionally check and reserve bandwidth in software. Other hosts do this in hardware If xHC address device command fails then a new xhci_virt_device structure is allocated as part of re-enabling the slot, but the bandwidth table pointers are not set up properly here. This triggers the NULL pointer dereference the next time usb_ep0_reinit() is called and xhci_configure_endpoint() tries to check and reserve bandwidth [46710.713538] usb 3-1: new full-speed USB device number 5 using xhci_hcd [46710.713699] usb 3-1: Device not responding to setup address. [46710.917684] usb 3-1: Device not responding to setup address. [46711.125536] usb 3-1: device not accepting address 5, error -71 [46711.125594] BUG: kernel NULL pointer dereference, address: 0000000000000008 [46711.125600] #PF: supervisor read access in kernel mode [46711.125603] #PF: error_code(0x0000) - not-present page [46711.125606] PGD 0 P4D 0 [46711.125610] Oops: Oops: 0000 [#1] PREEMPT SMP PTI [46711.125615] CPU: 1 PID: 25760 Comm: kworker/1:2 Not tainted 6.10.3_2 #1 [46711.125620] Hardware name: Gigabyte Technology Co., Ltd. [46711.125623] Workqueue: usb_hub_wq hub_event [usbcore] [46711.125668] RIP: 0010:xhci_reserve_bandwidth (drivers/usb/host/xhci.c Fix this by making sure bandwidth table pointers are set up correctly after a failed address device command, and additionally by avoiding checking for bandwidth in cases like this where no actual endpoints are added or removed, i.e. only context for default control endpoint 0 is evaluated. Reported-by: Karel Balej Closes: https://lore.kernel.org/linux-usb/D3CKQQAETH47.1MUO22RTCH2O3@matfyz.cz/ Cc: stable@vger.kernel.org Fixes: 651aaf36a7d7 ("usb: xhci: Handle USB transaction error on address command") Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20240815141117.2702314-2-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index 0a8cf6c17f827a..efdf4c228b8c0a 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -2837,7 +2837,7 @@ static int xhci_configure_endpoint(struct xhci_hcd *xhci, xhci->num_active_eps); return -ENOMEM; } - if ((xhci->quirks & XHCI_SW_BW_CHECKING) && + if ((xhci->quirks & XHCI_SW_BW_CHECKING) && !ctx_change && xhci_reserve_bandwidth(xhci, virt_dev, command->in_ctx)) { if ((xhci->quirks & XHCI_EP_LIMIT_QUIRK)) xhci_free_host_resources(xhci, ctrl_ctx); @@ -4200,8 +4200,10 @@ static int xhci_setup_device(struct usb_hcd *hcd, struct usb_device *udev, mutex_unlock(&xhci->mutex); ret = xhci_disable_slot(xhci, udev->slot_id); xhci_free_virt_device(xhci, udev->slot_id); - if (!ret) - xhci_alloc_dev(hcd, udev); + if (!ret) { + if (xhci_alloc_dev(hcd, udev) == 1) + xhci_setup_addressable_virt_dev(xhci, udev); + } kfree(command->completion); kfree(command); return -EPROTO; From 164199615ae230ace4519141285f06766d6d8036 Mon Sep 17 00:00:00 2001 From: Yuntao Liu Date: Thu, 15 Aug 2024 08:49:23 +0000 Subject: [PATCH 373/991] ASoC: amd: acp: fix module autoloading Add MODULE_DEVICE_TABLE(), so modules could be properly autoloaded based on the alias from platform_device_id table. Fixes: 9d8a7be88b336 ("ASoC: amd: acp: Add legacy sound card support for Chrome audio") Signed-off-by: Yuntao Liu Link: https://patch.msgid.link/20240815084923.756476-1-liuyuntao12@huawei.com Signed-off-by: Mark Brown --- sound/soc/amd/acp/acp-legacy-mach.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/soc/amd/acp/acp-legacy-mach.c b/sound/soc/amd/acp/acp-legacy-mach.c index 47c3b5f167f59e..0d529e32e552bd 100644 --- a/sound/soc/amd/acp/acp-legacy-mach.c +++ b/sound/soc/amd/acp/acp-legacy-mach.c @@ -227,6 +227,8 @@ static const struct platform_device_id board_ids[] = { }, { } }; +MODULE_DEVICE_TABLE(platform, board_ids); + static struct platform_driver acp_asoc_audio = { .driver = { .pm = &snd_soc_pm_ops, From 9bb5e74b2bf88fbb024bb15ded3b011e02c673be Mon Sep 17 00:00:00 2001 From: Griffin Kroah-Hartman Date: Thu, 15 Aug 2024 11:49:20 +0200 Subject: [PATCH 374/991] Revert "misc: fastrpc: Restrict untrusted app to attach to privileged PD" This reverts commit bab2f5e8fd5d2f759db26b78d9db57412888f187. Joel reported that this commit breaks userspace and stops sensors in SDM845 from working. Also breaks other qcom SoC devices running postmarketOS. Cc: stable Cc: Ekansh Gupta Cc: Dmitry Baryshkov Reported-by: Joel Selvaraj Link: https://lore.kernel.org/r/9a9f5646-a554-4b65-8122-d212bb665c81@umsystem.edu Signed-off-by: Griffin Kroah-Hartman Acked-by: Srinivas Kandagatla Fixes: bab2f5e8fd5d ("misc: fastrpc: Restrict untrusted app to attach to privileged PD") Link: https://lore.kernel.org/r/20240815094920.8242-1-griffin@kroah.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/fastrpc.c | 22 +++------------------- include/uapi/misc/fastrpc.h | 3 --- 2 files changed, 3 insertions(+), 22 deletions(-) diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c index 5204fda51da3fc..339d126414d4bf 100644 --- a/drivers/misc/fastrpc.c +++ b/drivers/misc/fastrpc.c @@ -2085,16 +2085,6 @@ static int fastrpc_req_mem_map(struct fastrpc_user *fl, char __user *argp) return err; } -static int is_attach_rejected(struct fastrpc_user *fl) -{ - /* Check if the device node is non-secure */ - if (!fl->is_secure_dev) { - dev_dbg(&fl->cctx->rpdev->dev, "untrusted app trying to attach to privileged DSP PD\n"); - return -EACCES; - } - return 0; -} - static long fastrpc_device_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2107,19 +2097,13 @@ static long fastrpc_device_ioctl(struct file *file, unsigned int cmd, err = fastrpc_invoke(fl, argp); break; case FASTRPC_IOCTL_INIT_ATTACH: - err = is_attach_rejected(fl); - if (!err) - err = fastrpc_init_attach(fl, ROOT_PD); + err = fastrpc_init_attach(fl, ROOT_PD); break; case FASTRPC_IOCTL_INIT_ATTACH_SNS: - err = is_attach_rejected(fl); - if (!err) - err = fastrpc_init_attach(fl, SENSORS_PD); + err = fastrpc_init_attach(fl, SENSORS_PD); break; case FASTRPC_IOCTL_INIT_CREATE_STATIC: - err = is_attach_rejected(fl); - if (!err) - err = fastrpc_init_create_static_process(fl, argp); + err = fastrpc_init_create_static_process(fl, argp); break; case FASTRPC_IOCTL_INIT_CREATE: err = fastrpc_init_create_process(fl, argp); diff --git a/include/uapi/misc/fastrpc.h b/include/uapi/misc/fastrpc.h index 91583690bddc52..f33d914d8f4699 100644 --- a/include/uapi/misc/fastrpc.h +++ b/include/uapi/misc/fastrpc.h @@ -8,14 +8,11 @@ #define FASTRPC_IOCTL_ALLOC_DMA_BUFF _IOWR('R', 1, struct fastrpc_alloc_dma_buf) #define FASTRPC_IOCTL_FREE_DMA_BUFF _IOWR('R', 2, __u32) #define FASTRPC_IOCTL_INVOKE _IOWR('R', 3, struct fastrpc_invoke) -/* This ioctl is only supported with secure device nodes */ #define FASTRPC_IOCTL_INIT_ATTACH _IO('R', 4) #define FASTRPC_IOCTL_INIT_CREATE _IOWR('R', 5, struct fastrpc_init_create) #define FASTRPC_IOCTL_MMAP _IOWR('R', 6, struct fastrpc_req_mmap) #define FASTRPC_IOCTL_MUNMAP _IOWR('R', 7, struct fastrpc_req_munmap) -/* This ioctl is only supported with secure device nodes */ #define FASTRPC_IOCTL_INIT_ATTACH_SNS _IO('R', 8) -/* This ioctl is only supported with secure device nodes */ #define FASTRPC_IOCTL_INIT_CREATE_STATIC _IOWR('R', 9, struct fastrpc_init_create_static) #define FASTRPC_IOCTL_MEM_MAP _IOWR('R', 10, struct fastrpc_mem_map) #define FASTRPC_IOCTL_MEM_UNMAP _IOWR('R', 11, struct fastrpc_mem_unmap) From 92e9bac18124682c4b99ede9ee3bcdd68f121e92 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 15 Aug 2024 01:04:31 +0100 Subject: [PATCH 375/991] kunit/overflow: Fix UB in overflow_allocation_test The 'device_name' array doesn't exist out of the 'overflow_allocation_test' function scope. However, it is being used as a driver name when calling 'kunit_driver_create' from 'kunit_device_register'. It produces the kernel panic with KASAN enabled. Since this variable is used in one place only, remove it and pass the device name into kunit_device_register directly as an ascii string. Signed-off-by: Ivan Orlov Reviewed-by: David Gow Link: https://lore.kernel.org/r/20240815000431.401869-1-ivan.orlov0322@gmail.com Signed-off-by: Kees Cook --- lib/overflow_kunit.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/overflow_kunit.c b/lib/overflow_kunit.c index f314a0c15a6d83..2abc78367dd110 100644 --- a/lib/overflow_kunit.c +++ b/lib/overflow_kunit.c @@ -668,7 +668,6 @@ DEFINE_TEST_ALLOC(devm_kzalloc, devm_kfree, 1, 1, 0); static void overflow_allocation_test(struct kunit *test) { - const char device_name[] = "overflow-test"; struct device *dev; int count = 0; @@ -678,7 +677,7 @@ static void overflow_allocation_test(struct kunit *test) } while (0) /* Create dummy device for devm_kmalloc()-family tests. */ - dev = kunit_device_register(test, device_name); + dev = kunit_device_register(test, "overflow-test"); KUNIT_ASSERT_FALSE_MSG(test, IS_ERR(dev), "Cannot register test device\n"); From 020925ce92990c3bf59ab2cde386ac6d9ec734ff Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 7 Aug 2024 15:05:12 -0700 Subject: [PATCH 376/991] kallsyms: Do not cleanup .llvm. suffix before sorting symbols Cleaning up the symbols causes various issues afterwards. Let's sort the list based on original name. Signed-off-by: Song Liu Fixes: 8cc32a9bbf29 ("kallsyms: strip LTO-only suffixes from promoted global functions") Reviewed-by: Masami Hiramatsu (Google) Tested-by: Masami Hiramatsu (Google) Acked-by: Petr Mladek Reviewed-by: Sami Tolvanen Reviewed-by: Luis Chamberlain Link: https://lore.kernel.org/r/20240807220513.3100483-2-song@kernel.org Signed-off-by: Kees Cook --- scripts/kallsyms.c | 31 ++----------------------------- scripts/link-vmlinux.sh | 4 ---- 2 files changed, 2 insertions(+), 33 deletions(-) diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index 0ed873491bf555..123dab0572f800 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -5,8 +5,7 @@ * This software may be used and distributed according to the terms * of the GNU General Public License, incorporated herein by reference. * - * Usage: kallsyms [--all-symbols] [--absolute-percpu] - * [--lto-clang] in.map > out.S + * Usage: kallsyms [--all-symbols] [--absolute-percpu] in.map > out.S * * Table compression uses all the unused char codes on the symbols and * maps these to the most used substrings (tokens). For instance, it might @@ -62,7 +61,6 @@ static struct sym_entry **table; static unsigned int table_size, table_cnt; static int all_symbols; static int absolute_percpu; -static int lto_clang; static int token_profit[0x10000]; @@ -73,8 +71,7 @@ static unsigned char best_table_len[256]; static void usage(void) { - fprintf(stderr, "Usage: kallsyms [--all-symbols] [--absolute-percpu] " - "[--lto-clang] in.map > out.S\n"); + fprintf(stderr, "Usage: kallsyms [--all-symbols] [--absolute-percpu] in.map > out.S\n"); exit(1); } @@ -344,25 +341,6 @@ static bool symbol_absolute(const struct sym_entry *s) return s->percpu_absolute; } -static void cleanup_symbol_name(char *s) -{ - char *p; - - /* - * ASCII[.] = 2e - * ASCII[0-9] = 30,39 - * ASCII[A-Z] = 41,5a - * ASCII[_] = 5f - * ASCII[a-z] = 61,7a - * - * As above, replacing the first '.' in ".llvm." with '\0' does not - * affect the main sorting, but it helps us with subsorting. - */ - p = strstr(s, ".llvm."); - if (p) - *p = '\0'; -} - static int compare_names(const void *a, const void *b) { int ret; @@ -526,10 +504,6 @@ static void write_src(void) output_address(relative_base); printf("\n"); - if (lto_clang) - for (i = 0; i < table_cnt; i++) - cleanup_symbol_name((char *)table[i]->sym); - sort_symbols_by_name(); output_label("kallsyms_seqs_of_names"); for (i = 0; i < table_cnt; i++) @@ -807,7 +781,6 @@ int main(int argc, char **argv) static const struct option long_options[] = { {"all-symbols", no_argument, &all_symbols, 1}, {"absolute-percpu", no_argument, &absolute_percpu, 1}, - {"lto-clang", no_argument, <o_clang, 1}, {}, }; diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index f7b2503cdba956..22d0bc8439863f 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -156,10 +156,6 @@ kallsyms() kallsymopt="${kallsymopt} --absolute-percpu" fi - if is_enabled CONFIG_LTO_CLANG; then - kallsymopt="${kallsymopt} --lto-clang" - fi - info KSYMS "${2}.S" scripts/kallsyms ${kallsymopt} "${1}" > "${2}.S" From fb6a421fb6153d97cf3058f9bd550b377b76a490 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 7 Aug 2024 15:05:13 -0700 Subject: [PATCH 377/991] kallsyms: Match symbols exactly with CONFIG_LTO_CLANG With CONFIG_LTO_CLANG=y, the compiler may add .llvm. suffix to function names to avoid duplication. APIs like kallsyms_lookup_name() and kallsyms_on_each_match_symbol() tries to match these symbol names without the .llvm. suffix, e.g., match "c_stop" with symbol c_stop.llvm.17132674095431275852. This turned out to be problematic for use cases that require exact match, for example, livepatch. Fix this by making the APIs to match symbols exactly. Also cleanup kallsyms_selftests accordingly. Signed-off-by: Song Liu Fixes: 8cc32a9bbf29 ("kallsyms: strip LTO-only suffixes from promoted global functions") Tested-by: Masami Hiramatsu (Google) Reviewed-by: Masami Hiramatsu (Google) Acked-by: Petr Mladek Reviewed-by: Sami Tolvanen Reviewed-by: Luis Chamberlain Link: https://lore.kernel.org/r/20240807220513.3100483-3-song@kernel.org Signed-off-by: Kees Cook --- kernel/kallsyms.c | 55 +++++--------------------------------- kernel/kallsyms_selftest.c | 22 +-------------- 2 files changed, 7 insertions(+), 70 deletions(-) diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index fb2c77368d187e..a9a0ca605d4a8c 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -160,38 +160,6 @@ unsigned long kallsyms_sym_address(int idx) return kallsyms_relative_base - 1 - kallsyms_offsets[idx]; } -static void cleanup_symbol_name(char *s) -{ - char *res; - - if (!IS_ENABLED(CONFIG_LTO_CLANG)) - return; - - /* - * LLVM appends various suffixes for local functions and variables that - * must be promoted to global scope as part of LTO. This can break - * hooking of static functions with kprobes. '.' is not a valid - * character in an identifier in C. Suffixes only in LLVM LTO observed: - * - foo.llvm.[0-9a-f]+ - */ - res = strstr(s, ".llvm."); - if (res) - *res = '\0'; - - return; -} - -static int compare_symbol_name(const char *name, char *namebuf) -{ - /* The kallsyms_seqs_of_names is sorted based on names after - * cleanup_symbol_name() (see scripts/kallsyms.c) if clang lto is enabled. - * To ensure correct bisection in kallsyms_lookup_names(), do - * cleanup_symbol_name(namebuf) before comparing name and namebuf. - */ - cleanup_symbol_name(namebuf); - return strcmp(name, namebuf); -} - static unsigned int get_symbol_seq(int index) { unsigned int i, seq = 0; @@ -219,7 +187,7 @@ static int kallsyms_lookup_names(const char *name, seq = get_symbol_seq(mid); off = get_symbol_offset(seq); kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); - ret = compare_symbol_name(name, namebuf); + ret = strcmp(name, namebuf); if (ret > 0) low = mid + 1; else if (ret < 0) @@ -236,7 +204,7 @@ static int kallsyms_lookup_names(const char *name, seq = get_symbol_seq(low - 1); off = get_symbol_offset(seq); kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); - if (compare_symbol_name(name, namebuf)) + if (strcmp(name, namebuf)) break; low--; } @@ -248,7 +216,7 @@ static int kallsyms_lookup_names(const char *name, seq = get_symbol_seq(high + 1); off = get_symbol_offset(seq); kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); - if (compare_symbol_name(name, namebuf)) + if (strcmp(name, namebuf)) break; high++; } @@ -407,8 +375,7 @@ static int kallsyms_lookup_buildid(unsigned long addr, if (modbuildid) *modbuildid = NULL; - ret = strlen(namebuf); - goto found; + return strlen(namebuf); } /* See if it's in a module or a BPF JITed image. */ @@ -422,8 +389,6 @@ static int kallsyms_lookup_buildid(unsigned long addr, ret = ftrace_mod_address_lookup(addr, symbolsize, offset, modname, namebuf); -found: - cleanup_symbol_name(namebuf); return ret; } @@ -450,8 +415,6 @@ const char *kallsyms_lookup(unsigned long addr, int lookup_symbol_name(unsigned long addr, char *symname) { - int res; - symname[0] = '\0'; symname[KSYM_NAME_LEN - 1] = '\0'; @@ -462,16 +425,10 @@ int lookup_symbol_name(unsigned long addr, char *symname) /* Grab name */ kallsyms_expand_symbol(get_symbol_offset(pos), symname, KSYM_NAME_LEN); - goto found; + return 0; } /* See if it's in a module. */ - res = lookup_module_symbol_name(addr, symname); - if (res) - return res; - -found: - cleanup_symbol_name(symname); - return 0; + return lookup_module_symbol_name(addr, symname); } /* Look up a kernel symbol and return it in a text buffer. */ diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c index 2f84896a7bcbdf..873f7c445488c1 100644 --- a/kernel/kallsyms_selftest.c +++ b/kernel/kallsyms_selftest.c @@ -187,31 +187,11 @@ static void test_perf_kallsyms_lookup_name(void) stat.min, stat.max, div_u64(stat.sum, stat.real_cnt)); } -static bool match_cleanup_name(const char *s, const char *name) -{ - char *p; - int len; - - if (!IS_ENABLED(CONFIG_LTO_CLANG)) - return false; - - p = strstr(s, ".llvm."); - if (!p) - return false; - - len = strlen(name); - if (p - s != len) - return false; - - return !strncmp(s, name, len); -} - static int find_symbol(void *data, const char *name, unsigned long addr) { struct test_stat *stat = (struct test_stat *)data; - if (strcmp(name, stat->name) == 0 || - (!stat->perf && match_cleanup_name(name, stat->name))) { + if (!strcmp(name, stat->name)) { stat->real_cnt++; stat->addr = addr; From aae6b81260fd9a7224f7eb4fc440d625852245bb Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 12 Aug 2024 10:43:48 -0400 Subject: [PATCH 378/991] Bluetooth: HCI: Invert LE State quirk to be opt-out rather then opt-in This inverts the LE State quirk so by default we assume the controllers would report valid states rather than invalid which is how quirks normally behave, also this would result in HCI command failing it the LE States are really broken thus exposing the controllers that are really broken in this respect. Link: https://github.com/bluez/bluez/issues/584 Fixes: 220915857e29 ("Bluetooth: Adding driver and quirk defs for multi-role LE") Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btintel.c | 10 ---------- drivers/bluetooth/btintel_pcie.c | 3 --- drivers/bluetooth/btmtksdio.c | 3 --- drivers/bluetooth/btrtl.c | 1 - drivers/bluetooth/btusb.c | 4 ++-- drivers/bluetooth/hci_qca.c | 4 ++-- drivers/bluetooth/hci_vhci.c | 2 -- include/net/bluetooth/hci.h | 17 ++++++++++------- include/net/bluetooth/hci_core.h | 2 +- net/bluetooth/hci_event.c | 2 +- 10 files changed, 16 insertions(+), 32 deletions(-) diff --git a/drivers/bluetooth/btintel.c b/drivers/bluetooth/btintel.c index 2ebc970e6573fb..7d5e4de64e3cec 100644 --- a/drivers/bluetooth/btintel.c +++ b/drivers/bluetooth/btintel.c @@ -2945,9 +2945,6 @@ static int btintel_setup_combined(struct hci_dev *hdev) INTEL_ROM_LEGACY_NO_WBS_SUPPORT)) set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks); - if (ver.hw_variant == 0x08 && ver.fw_variant == 0x22) - set_bit(HCI_QUIRK_VALID_LE_STATES, - &hdev->quirks); err = btintel_legacy_rom_setup(hdev, &ver); break; @@ -2956,7 +2953,6 @@ static int btintel_setup_combined(struct hci_dev *hdev) case 0x12: /* ThP */ case 0x13: /* HrP */ case 0x14: /* CcP */ - set_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks); fallthrough; case 0x0c: /* WsP */ /* Apply the device specific HCI quirks @@ -3048,9 +3044,6 @@ static int btintel_setup_combined(struct hci_dev *hdev) /* These variants don't seem to support LE Coded PHY */ set_bit(HCI_QUIRK_BROKEN_LE_CODED, &hdev->quirks); - /* Set Valid LE States quirk */ - set_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks); - /* Setup MSFT Extension support */ btintel_set_msft_opcode(hdev, ver.hw_variant); @@ -3076,9 +3069,6 @@ static int btintel_setup_combined(struct hci_dev *hdev) */ set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks); - /* Apply LE States quirk from solar onwards */ - set_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks); - /* Setup MSFT Extension support */ btintel_set_msft_opcode(hdev, INTEL_HW_VARIANT(ver_tlv.cnvi_bt)); diff --git a/drivers/bluetooth/btintel_pcie.c b/drivers/bluetooth/btintel_pcie.c index 0d1a0415557b2d..1c7631f22c522b 100644 --- a/drivers/bluetooth/btintel_pcie.c +++ b/drivers/bluetooth/btintel_pcie.c @@ -1180,9 +1180,6 @@ static int btintel_pcie_setup(struct hci_dev *hdev) */ set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks); - /* Apply LE States quirk from solar onwards */ - set_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks); - /* Setup MSFT Extension support */ btintel_set_msft_opcode(hdev, INTEL_HW_VARIANT(ver_tlv.cnvi_bt)); diff --git a/drivers/bluetooth/btmtksdio.c b/drivers/bluetooth/btmtksdio.c index 39d6898497a404..497e4c87f5be56 100644 --- a/drivers/bluetooth/btmtksdio.c +++ b/drivers/bluetooth/btmtksdio.c @@ -1148,9 +1148,6 @@ static int btmtksdio_setup(struct hci_dev *hdev) } } - /* Valid LE States quirk for MediaTek 7921 */ - set_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks); - break; case 0x7663: case 0x7668: diff --git a/drivers/bluetooth/btrtl.c b/drivers/bluetooth/btrtl.c index f2f37143c454d5..fd7991ea76726e 100644 --- a/drivers/bluetooth/btrtl.c +++ b/drivers/bluetooth/btrtl.c @@ -1287,7 +1287,6 @@ void btrtl_set_quirks(struct hci_dev *hdev, struct btrtl_device_info *btrtl_dev) case CHIP_ID_8852C: case CHIP_ID_8851B: case CHIP_ID_8852BT: - set_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks); set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks); /* RTL8852C needs to transmit mSBC data continuously without diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index acdba5d77694f8..51d9d4532dda4e 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -3956,8 +3956,8 @@ static int btusb_probe(struct usb_interface *intf, if (id->driver_info & BTUSB_WIDEBAND_SPEECH) set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks); - if (id->driver_info & BTUSB_VALID_LE_STATES) - set_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks); + if (!(id->driver_info & BTUSB_VALID_LE_STATES)) + set_bit(HCI_QUIRK_BROKEN_LE_STATES, &hdev->quirks); if (id->driver_info & BTUSB_DIGIANSWER) { data->cmdreq_type = USB_TYPE_VENDOR; diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index 45adc1560d949a..4b1ad7ea5b95a9 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -2474,8 +2474,8 @@ static int qca_serdev_probe(struct serdev_device *serdev) set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks); - if (data->capabilities & QCA_CAP_VALID_LE_STATES) - set_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks); + if (!(data->capabilities & QCA_CAP_VALID_LE_STATES)) + set_bit(HCI_QUIRK_BROKEN_LE_STATES, &hdev->quirks); } return 0; diff --git a/drivers/bluetooth/hci_vhci.c b/drivers/bluetooth/hci_vhci.c index c4046f8f1985ab..43e9ac5a3324e2 100644 --- a/drivers/bluetooth/hci_vhci.c +++ b/drivers/bluetooth/hci_vhci.c @@ -425,8 +425,6 @@ static int __vhci_create_device(struct vhci_data *data, __u8 opcode) if (opcode & 0x80) set_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks); - set_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks); - if (hci_register_dev(hdev) < 0) { BT_ERR("Can't register HCI device"); hci_free_dev(hdev); diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index e372a88e8c3f6a..d1d073089f384e 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -206,14 +206,17 @@ enum { */ HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, - /* When this quirk is set, the controller has validated that - * LE states reported through the HCI_LE_READ_SUPPORTED_STATES are - * valid. This mechanism is necessary as many controllers have - * been seen has having trouble initiating a connectable - * advertisement despite the state combination being reported as - * supported. + /* When this quirk is set, the LE states reported through the + * HCI_LE_READ_SUPPORTED_STATES are invalid/broken. + * + * This mechanism is necessary as many controllers have been seen has + * having trouble initiating a connectable advertisement despite the + * state combination being reported as supported. + * + * This quirk can be set before hci_register_dev is called or + * during the hdev->setup vendor callback. */ - HCI_QUIRK_VALID_LE_STATES, + HCI_QUIRK_BROKEN_LE_STATES, /* When this quirk is set, then erroneous data reporting * is ignored. This is mainly due to the fact that the HCI diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 31020891fc68cd..e449dba698f35e 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -825,7 +825,7 @@ extern struct mutex hci_cb_list_lock; } while (0) #define hci_dev_le_state_simultaneous(hdev) \ - (test_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks) && \ + (!test_bit(HCI_QUIRK_BROKEN_LE_STATES, &hdev->quirks) && \ (hdev->le_states[4] & 0x08) && /* Central */ \ (hdev->le_states[4] & 0x40) && /* Peripheral */ \ (hdev->le_states[3] & 0x10)) /* Simultaneous */ diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index d0c118c47f6c93..1c82dcdf6e8fc7 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5920,7 +5920,7 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, * while we have an existing one in peripheral role. */ if (hdev->conn_hash.le_num_peripheral > 0 && - (!test_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks) || + (test_bit(HCI_QUIRK_BROKEN_LE_STATES, &hdev->quirks) || !(hdev->le_states[3] & 0x10))) return NULL; From 932021a11805b9da4bd6abf66fe233cccd59fe0e Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 12 Aug 2024 11:22:08 -0400 Subject: [PATCH 379/991] Bluetooth: hci_core: Fix LE quote calculation Function hci_sched_le needs to update the respective counter variable inplace other the likes of hci_quote_sent would attempt to use the possible outdated value of conn->{le_cnt,acl_cnt}. Link: https://github.com/bluez/bluez/issues/915 Fixes: 73d80deb7bdf ("Bluetooth: prioritizing data over HCI") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 06da8ac13dca8e..f25a21f532aa75 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3664,19 +3664,19 @@ static void hci_sched_le(struct hci_dev *hdev) { struct hci_chan *chan; struct sk_buff *skb; - int quote, cnt, tmp; + int quote, *cnt, tmp; BT_DBG("%s", hdev->name); if (!hci_conn_num(hdev, LE_LINK)) return; - cnt = hdev->le_pkts ? hdev->le_cnt : hdev->acl_cnt; + cnt = hdev->le_pkts ? &hdev->le_cnt : &hdev->acl_cnt; - __check_timeout(hdev, cnt, LE_LINK); + __check_timeout(hdev, *cnt, LE_LINK); - tmp = cnt; - while (cnt && (chan = hci_chan_sent(hdev, LE_LINK, "e))) { + tmp = *cnt; + while (*cnt && (chan = hci_chan_sent(hdev, LE_LINK, "e))) { u32 priority = (skb_peek(&chan->data_q))->priority; while (quote-- && (skb = skb_peek(&chan->data_q))) { BT_DBG("chan %p skb %p len %d priority %u", chan, skb, @@ -3691,7 +3691,7 @@ static void hci_sched_le(struct hci_dev *hdev) hci_send_frame(hdev, skb); hdev->le_last_tx = jiffies; - cnt--; + (*cnt)--; chan->sent++; chan->conn->sent++; @@ -3701,12 +3701,7 @@ static void hci_sched_le(struct hci_dev *hdev) } } - if (hdev->le_pkts) - hdev->le_cnt = cnt; - else - hdev->acl_cnt = cnt; - - if (cnt != tmp) + if (*cnt != tmp) hci_prio_recalculate(hdev, LE_LINK); } From 28cd47f75185c4818b0fb1b46f2f02faaba96376 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 30 Aug 2023 15:08:06 -0700 Subject: [PATCH 380/991] Bluetooth: SMP: Fix assumption of Central always being Initiator SMP initiator role shall be considered the one that initiates the pairing procedure with SMP_CMD_PAIRING_REQ: BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 3, Part H page 1557: Figure 2.1: LE pairing phases Note that by sending SMP_CMD_SECURITY_REQ it doesn't change the role to be Initiator. Link: https://github.com/bluez/bluez/issues/567 Fixes: b28b4943660f ("Bluetooth: Add strict checks for allowed SMP PDUs") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/smp.c | 144 ++++++++++++++++++++++---------------------- 1 file changed, 72 insertions(+), 72 deletions(-) diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 1e7ea3a4b7ef32..4f9fdf400584e5 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -914,7 +914,7 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, * Confirms and the responder Enters the passkey. */ if (smp->method == OVERLAP) { - if (hcon->role == HCI_ROLE_MASTER) + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) smp->method = CFM_PASSKEY; else smp->method = REQ_PASSKEY; @@ -964,7 +964,7 @@ static u8 smp_confirm(struct smp_chan *smp) smp_send_cmd(smp->conn, SMP_CMD_PAIRING_CONFIRM, sizeof(cp), &cp); - if (conn->hcon->out) + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); else SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); @@ -980,7 +980,8 @@ static u8 smp_random(struct smp_chan *smp) int ret; bt_dev_dbg(conn->hcon->hdev, "conn %p %s", conn, - conn->hcon->out ? "initiator" : "responder"); + test_bit(SMP_FLAG_INITIATOR, &smp->flags) ? "initiator" : + "responder"); ret = smp_c1(smp->tk, smp->rrnd, smp->preq, smp->prsp, hcon->init_addr_type, &hcon->init_addr, @@ -994,7 +995,7 @@ static u8 smp_random(struct smp_chan *smp) return SMP_CONFIRM_FAILED; } - if (hcon->out) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { u8 stk[16]; __le64 rand = 0; __le16 ediv = 0; @@ -1256,14 +1257,15 @@ static void smp_distribute_keys(struct smp_chan *smp) rsp = (void *) &smp->prsp[1]; /* The responder sends its keys first */ - if (hcon->out && (smp->remote_key_dist & KEY_DIST_MASK)) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags) && + (smp->remote_key_dist & KEY_DIST_MASK)) { smp_allow_key_dist(smp); return; } req = (void *) &smp->preq[1]; - if (hcon->out) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { keydist = &rsp->init_key_dist; *keydist &= req->init_key_dist; } else { @@ -1432,7 +1434,7 @@ static int sc_mackey_and_ltk(struct smp_chan *smp, u8 mackey[16], u8 ltk[16]) struct hci_conn *hcon = smp->conn->hcon; u8 *na, *nb, a[7], b[7]; - if (hcon->out) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { na = smp->prnd; nb = smp->rrnd; } else { @@ -1460,7 +1462,7 @@ static void sc_dhkey_check(struct smp_chan *smp) a[6] = hcon->init_addr_type; b[6] = hcon->resp_addr_type; - if (hcon->out) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { local_addr = a; remote_addr = b; memcpy(io_cap, &smp->preq[1], 3); @@ -1539,7 +1541,7 @@ static u8 sc_passkey_round(struct smp_chan *smp, u8 smp_op) /* The round is only complete when the initiator * receives pairing random. */ - if (!hcon->out) { + if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); if (smp->passkey_round == 20) @@ -1567,7 +1569,7 @@ static u8 sc_passkey_round(struct smp_chan *smp, u8 smp_op) SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); - if (hcon->out) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); return 0; @@ -1578,7 +1580,7 @@ static u8 sc_passkey_round(struct smp_chan *smp, u8 smp_op) case SMP_CMD_PUBLIC_KEY: default: /* Initiating device starts the round */ - if (!hcon->out) + if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) return 0; bt_dev_dbg(hdev, "Starting passkey round %u", @@ -1623,7 +1625,7 @@ static int sc_user_reply(struct smp_chan *smp, u16 mgmt_op, __le32 passkey) } /* Initiator sends DHKey check first */ - if (hcon->out) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { sc_dhkey_check(smp); SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); } else if (test_and_clear_bit(SMP_FLAG_DHKEY_PENDING, &smp->flags)) { @@ -1746,7 +1748,7 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) struct smp_cmd_pairing rsp, *req = (void *) skb->data; struct l2cap_chan *chan = conn->smp; struct hci_dev *hdev = conn->hcon->hdev; - struct smp_chan *smp; + struct smp_chan *smp = chan->data; u8 key_size, auth, sec_level; int ret; @@ -1755,16 +1757,14 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) if (skb->len < sizeof(*req)) return SMP_INVALID_PARAMS; - if (conn->hcon->role != HCI_ROLE_SLAVE) + if (smp && test_bit(SMP_FLAG_INITIATOR, &smp->flags)) return SMP_CMD_NOTSUPP; - if (!chan->data) + if (!smp) { smp = smp_chan_create(conn); - else - smp = chan->data; - - if (!smp) - return SMP_UNSPECIFIED; + if (!smp) + return SMP_UNSPECIFIED; + } /* We didn't start the pairing, so match remote */ auth = req->auth_req & AUTH_REQ_MASK(hdev); @@ -1946,7 +1946,7 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) if (skb->len < sizeof(*rsp)) return SMP_INVALID_PARAMS; - if (conn->hcon->role != HCI_ROLE_MASTER) + if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) return SMP_CMD_NOTSUPP; skb_pull(skb, sizeof(*rsp)); @@ -2041,7 +2041,7 @@ static u8 sc_check_confirm(struct smp_chan *smp) if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY) return sc_passkey_round(smp, SMP_CMD_PAIRING_CONFIRM); - if (conn->hcon->out) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); @@ -2063,7 +2063,7 @@ static int fixup_sc_false_positive(struct smp_chan *smp) u8 auth; /* The issue is only observed when we're in responder role */ - if (hcon->out) + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) return SMP_UNSPECIFIED; if (hci_dev_test_flag(hdev, HCI_SC_ONLY)) { @@ -2099,7 +2099,8 @@ static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb) struct hci_dev *hdev = hcon->hdev; bt_dev_dbg(hdev, "conn %p %s", conn, - hcon->out ? "initiator" : "responder"); + test_bit(SMP_FLAG_INITIATOR, &smp->flags) ? "initiator" : + "responder"); if (skb->len < sizeof(smp->pcnf)) return SMP_INVALID_PARAMS; @@ -2121,7 +2122,7 @@ static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb) return ret; } - if (conn->hcon->out) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); @@ -2156,7 +2157,7 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb) if (!test_bit(SMP_FLAG_SC, &smp->flags)) return smp_random(smp); - if (hcon->out) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { pkax = smp->local_pk; pkbx = smp->remote_pk; na = smp->prnd; @@ -2169,7 +2170,7 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb) } if (smp->method == REQ_OOB) { - if (!hcon->out) + if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); @@ -2180,7 +2181,7 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb) if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY) return sc_passkey_round(smp, SMP_CMD_PAIRING_RANDOM); - if (hcon->out) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { u8 cfm[16]; err = smp_f4(smp->tfm_cmac, smp->remote_pk, smp->local_pk, @@ -2221,7 +2222,7 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb) return SMP_UNSPECIFIED; if (smp->method == REQ_OOB) { - if (hcon->out) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { sc_dhkey_check(smp); SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); } @@ -2295,10 +2296,27 @@ bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level, return false; } +static void smp_send_pairing_req(struct smp_chan *smp, __u8 auth) +{ + struct smp_cmd_pairing cp; + + if (smp->conn->hcon->type == ACL_LINK) + build_bredr_pairing_cmd(smp, &cp, NULL); + else + build_pairing_cmd(smp->conn, &cp, NULL, auth); + + smp->preq[0] = SMP_CMD_PAIRING_REQ; + memcpy(&smp->preq[1], &cp, sizeof(cp)); + + smp_send_cmd(smp->conn, SMP_CMD_PAIRING_REQ, sizeof(cp), &cp); + SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RSP); + + set_bit(SMP_FLAG_INITIATOR, &smp->flags); +} + static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_security_req *rp = (void *) skb->data; - struct smp_cmd_pairing cp; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; struct smp_chan *smp; @@ -2347,16 +2365,20 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) skb_pull(skb, sizeof(*rp)); - memset(&cp, 0, sizeof(cp)); - build_pairing_cmd(conn, &cp, NULL, auth); + smp_send_pairing_req(smp, auth); - smp->preq[0] = SMP_CMD_PAIRING_REQ; - memcpy(&smp->preq[1], &cp, sizeof(cp)); + return 0; +} - smp_send_cmd(conn, SMP_CMD_PAIRING_REQ, sizeof(cp), &cp); - SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RSP); +static void smp_send_security_req(struct smp_chan *smp, __u8 auth) +{ + struct smp_cmd_security_req cp; - return 0; + cp.auth_req = auth; + smp_send_cmd(smp->conn, SMP_CMD_SECURITY_REQ, sizeof(cp), &cp); + SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_REQ); + + clear_bit(SMP_FLAG_INITIATOR, &smp->flags); } int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) @@ -2427,23 +2449,11 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) authreq |= SMP_AUTH_MITM; } - if (hcon->role == HCI_ROLE_MASTER) { - struct smp_cmd_pairing cp; - - build_pairing_cmd(conn, &cp, NULL, authreq); - smp->preq[0] = SMP_CMD_PAIRING_REQ; - memcpy(&smp->preq[1], &cp, sizeof(cp)); - - smp_send_cmd(conn, SMP_CMD_PAIRING_REQ, sizeof(cp), &cp); - SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RSP); - } else { - struct smp_cmd_security_req cp; - cp.auth_req = authreq; - smp_send_cmd(conn, SMP_CMD_SECURITY_REQ, sizeof(cp), &cp); - SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_REQ); - } + if (hcon->role == HCI_ROLE_MASTER) + smp_send_pairing_req(smp, authreq); + else + smp_send_security_req(smp, authreq); - set_bit(SMP_FLAG_INITIATOR, &smp->flags); ret = 0; unlock: @@ -2694,8 +2704,6 @@ static int smp_cmd_sign_info(struct l2cap_conn *conn, struct sk_buff *skb) static u8 sc_select_method(struct smp_chan *smp) { - struct l2cap_conn *conn = smp->conn; - struct hci_conn *hcon = conn->hcon; struct smp_cmd_pairing *local, *remote; u8 local_mitm, remote_mitm, local_io, remote_io, method; @@ -2708,7 +2716,7 @@ static u8 sc_select_method(struct smp_chan *smp) * the "struct smp_cmd_pairing" from them we need to skip the * first byte which contains the opcode. */ - if (hcon->out) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { local = (void *) &smp->preq[1]; remote = (void *) &smp->prsp[1]; } else { @@ -2777,7 +2785,7 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb) /* Non-initiating device sends its public key after receiving * the key from the initiating device. */ - if (!hcon->out) { + if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { err = sc_send_public_key(smp); if (err) return err; @@ -2839,7 +2847,7 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb) } if (smp->method == REQ_OOB) { - if (hcon->out) + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); @@ -2848,7 +2856,7 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb) return 0; } - if (hcon->out) + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); if (smp->method == REQ_PASSKEY) { @@ -2863,7 +2871,7 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb) /* The Initiating device waits for the non-initiating device to * send the confirm value. */ - if (conn->hcon->out) + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) return 0; err = smp_f4(smp->tfm_cmac, smp->local_pk, smp->remote_pk, smp->prnd, @@ -2897,7 +2905,7 @@ static int smp_cmd_dhkey_check(struct l2cap_conn *conn, struct sk_buff *skb) a[6] = hcon->init_addr_type; b[6] = hcon->resp_addr_type; - if (hcon->out) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { local_addr = a; remote_addr = b; memcpy(io_cap, &smp->prsp[1], 3); @@ -2922,7 +2930,7 @@ static int smp_cmd_dhkey_check(struct l2cap_conn *conn, struct sk_buff *skb) if (crypto_memneq(check->e, e, 16)) return SMP_DHKEY_CHECK_FAILED; - if (!hcon->out) { + if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { if (test_bit(SMP_FLAG_WAIT_USER, &smp->flags)) { set_bit(SMP_FLAG_DHKEY_PENDING, &smp->flags); return 0; @@ -2934,7 +2942,7 @@ static int smp_cmd_dhkey_check(struct l2cap_conn *conn, struct sk_buff *skb) sc_add_ltk(smp); - if (hcon->out) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { hci_le_start_enc(hcon, 0, 0, smp->tk, smp->enc_key_size); hcon->enc_key_size = smp->enc_key_size; } @@ -3083,7 +3091,6 @@ static void bredr_pairing(struct l2cap_chan *chan) struct l2cap_conn *conn = chan->conn; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; - struct smp_cmd_pairing req; struct smp_chan *smp; bt_dev_dbg(hdev, "chan %p", chan); @@ -3135,14 +3142,7 @@ static void bredr_pairing(struct l2cap_chan *chan) bt_dev_dbg(hdev, "starting SMP over BR/EDR"); - /* Prepare and send the BR/EDR SMP Pairing Request */ - build_bredr_pairing_cmd(smp, &req, NULL); - - smp->preq[0] = SMP_CMD_PAIRING_REQ; - memcpy(&smp->preq[1], &req, sizeof(req)); - - smp_send_cmd(conn, SMP_CMD_PAIRING_REQ, sizeof(req), &req); - SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RSP); + smp_send_pairing_req(smp, 0x00); } static void smp_resume_cb(struct l2cap_chan *chan) From 538fd3921afac97158d4177139a0ad39f056dbb2 Mon Sep 17 00:00:00 2001 From: Griffin Kroah-Hartman Date: Thu, 15 Aug 2024 13:51:00 +0200 Subject: [PATCH 381/991] Bluetooth: MGMT: Add error handling to pair_device() hci_conn_params_add() never checks for a NULL value and could lead to a NULL pointer dereference causing a crash. Fixed by adding error handling in the function. Cc: Stable Fixes: 5157b8a503fa ("Bluetooth: Fix initializing conn_params in scan phase") Signed-off-by: Griffin Kroah-Hartman Reported-by: Yiwei Zhang Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 40d4887c7f7913..25979f4283a6ff 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3456,6 +3456,10 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, * will be kept and this function does nothing. */ p = hci_conn_params_add(hdev, &cp->addr.bdaddr, addr_type); + if (!p) { + err = -EIO; + goto unlock; + } if (p->auto_connect == HCI_AUTO_CONN_EXPLICIT) p->auto_connect = HCI_AUTO_CONN_DISABLED; From 624ab9cde26a9f150b4fd268b0f3dae3184dc40c Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Tue, 16 Jul 2024 09:06:30 -0700 Subject: [PATCH 382/991] drm/msm/adreno: Fix error return if missing firmware-name -ENODEV is used to signify that there is no zap shader for the platform, and the CPU can directly take the GPU out of secure mode. We want to use this return code when there is no zap-shader node. But not when there is, but without a firmware-name property. This case we want to treat as-if the needed fw is not found. Signed-off-by: Rob Clark Reviewed-by: Dmitry Baryshkov Reviewed-by: Akhil P Oommen Patchwork: https://patchwork.freedesktop.org/patch/604564/ --- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index 1c6626747b98fa..ecc3fc5cec2270 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -99,7 +99,7 @@ static int zap_shader_load_mdt(struct msm_gpu *gpu, const char *fwname, * was a bad idea, and is only provided for backwards * compatibility for older targets. */ - return -ENODEV; + return -ENOENT; } if (IS_ERR(fw)) { From 6486cad00a8b7f8585983408c152bbe33dda529b Mon Sep 17 00:00:00 2001 From: David Gstir Date: Wed, 17 Jul 2024 13:28:44 +0200 Subject: [PATCH 383/991] KEYS: trusted: fix DCP blob payload length assignment The DCP trusted key type uses the wrong helper function to store the blob's payload length which can lead to the wrong byte order being used in case this would ever run on big endian architectures. Fix by using correct helper function. Cc: stable@vger.kernel.org # v6.10+ Fixes: 2e8a0f40a39c ("KEYS: trusted: Introduce NXP DCP-backed trusted keys") Suggested-by: Richard Weinberger Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202405240610.fj53EK0q-lkp@intel.com/ Signed-off-by: David Gstir Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- security/keys/trusted-keys/trusted_dcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/keys/trusted-keys/trusted_dcp.c b/security/keys/trusted-keys/trusted_dcp.c index b5f81a05be3676..b0947f072a98c8 100644 --- a/security/keys/trusted-keys/trusted_dcp.c +++ b/security/keys/trusted-keys/trusted_dcp.c @@ -222,7 +222,7 @@ static int trusted_dcp_seal(struct trusted_key_payload *p, char *datablob) return ret; } - b->payload_len = get_unaligned_le32(&p->key_len); + put_unaligned_le32(p->key_len, &b->payload_len); p->blob_len = blen; return 0; } From 0e28bf61a5f9ab30be3f3b4eafb8d097e39446bb Mon Sep 17 00:00:00 2001 From: David Gstir Date: Wed, 17 Jul 2024 13:28:45 +0200 Subject: [PATCH 384/991] KEYS: trusted: dcp: fix leak of blob encryption key Trusted keys unseal the key blob on load, but keep the sealed payload in the blob field so that every subsequent read (export) will simply convert this field to hex and send it to userspace. With DCP-based trusted keys, we decrypt the blob encryption key (BEK) in the Kernel due hardware limitations and then decrypt the blob payload. BEK decryption is done in-place which means that the trusted key blob field is modified and it consequently holds the BEK in plain text. Every subsequent read of that key thus send the plain text BEK instead of the encrypted BEK to userspace. This issue only occurs when importing a trusted DCP-based key and then exporting it again. This should rarely happen as the common use cases are to either create a new trusted key and export it, or import a key blob and then just use it without exporting it again. Fix this by performing BEK decryption and encryption in a dedicated buffer. Further always wipe the plain text BEK buffer to prevent leaking the key via uninitialized memory. Cc: stable@vger.kernel.org # v6.10+ Fixes: 2e8a0f40a39c ("KEYS: trusted: Introduce NXP DCP-backed trusted keys") Signed-off-by: David Gstir Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- security/keys/trusted-keys/trusted_dcp.c | 33 +++++++++++++++--------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/security/keys/trusted-keys/trusted_dcp.c b/security/keys/trusted-keys/trusted_dcp.c index b0947f072a98c8..4edc5bbbcda3c9 100644 --- a/security/keys/trusted-keys/trusted_dcp.c +++ b/security/keys/trusted-keys/trusted_dcp.c @@ -186,20 +186,21 @@ static int do_aead_crypto(u8 *in, u8 *out, size_t len, u8 *key, u8 *nonce, return ret; } -static int decrypt_blob_key(u8 *key) +static int decrypt_blob_key(u8 *encrypted_key, u8 *plain_key) { - return do_dcp_crypto(key, key, false); + return do_dcp_crypto(encrypted_key, plain_key, false); } -static int encrypt_blob_key(u8 *key) +static int encrypt_blob_key(u8 *plain_key, u8 *encrypted_key) { - return do_dcp_crypto(key, key, true); + return do_dcp_crypto(plain_key, encrypted_key, true); } static int trusted_dcp_seal(struct trusted_key_payload *p, char *datablob) { struct dcp_blob_fmt *b = (struct dcp_blob_fmt *)p->blob; int blen, ret; + u8 plain_blob_key[AES_KEYSIZE_128]; blen = calc_blob_len(p->key_len); if (blen > MAX_BLOB_SIZE) @@ -207,30 +208,36 @@ static int trusted_dcp_seal(struct trusted_key_payload *p, char *datablob) b->fmt_version = DCP_BLOB_VERSION; get_random_bytes(b->nonce, AES_KEYSIZE_128); - get_random_bytes(b->blob_key, AES_KEYSIZE_128); + get_random_bytes(plain_blob_key, AES_KEYSIZE_128); - ret = do_aead_crypto(p->key, b->payload, p->key_len, b->blob_key, + ret = do_aead_crypto(p->key, b->payload, p->key_len, plain_blob_key, b->nonce, true); if (ret) { pr_err("Unable to encrypt blob payload: %i\n", ret); - return ret; + goto out; } - ret = encrypt_blob_key(b->blob_key); + ret = encrypt_blob_key(plain_blob_key, b->blob_key); if (ret) { pr_err("Unable to encrypt blob key: %i\n", ret); - return ret; + goto out; } put_unaligned_le32(p->key_len, &b->payload_len); p->blob_len = blen; - return 0; + ret = 0; + +out: + memzero_explicit(plain_blob_key, sizeof(plain_blob_key)); + + return ret; } static int trusted_dcp_unseal(struct trusted_key_payload *p, char *datablob) { struct dcp_blob_fmt *b = (struct dcp_blob_fmt *)p->blob; int blen, ret; + u8 plain_blob_key[AES_KEYSIZE_128]; if (b->fmt_version != DCP_BLOB_VERSION) { pr_err("DCP blob has bad version: %i, expected %i\n", @@ -248,14 +255,14 @@ static int trusted_dcp_unseal(struct trusted_key_payload *p, char *datablob) goto out; } - ret = decrypt_blob_key(b->blob_key); + ret = decrypt_blob_key(b->blob_key, plain_blob_key); if (ret) { pr_err("Unable to decrypt blob key: %i\n", ret); goto out; } ret = do_aead_crypto(b->payload, p->key, p->key_len + DCP_BLOB_AUTHLEN, - b->blob_key, b->nonce, false); + plain_blob_key, b->nonce, false); if (ret) { pr_err("Unwrap of DCP payload failed: %i\n", ret); goto out; @@ -263,6 +270,8 @@ static int trusted_dcp_unseal(struct trusted_key_payload *p, char *datablob) ret = 0; out: + memzero_explicit(plain_blob_key, sizeof(plain_blob_key)); + return ret; } From e01d48c699bbe015d887cb598e4047f08f3998a8 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 14 Aug 2024 21:26:19 +0200 Subject: [PATCH 385/991] riscv: Fix out-of-bounds when accessing Andes per hart vendor extension array The out-of-bounds access is reported by UBSAN: [ 0.000000] UBSAN: array-index-out-of-bounds in ../arch/riscv/kernel/vendor_extensions.c:41:66 [ 0.000000] index -1 is out of range for type 'riscv_isavendorinfo [32]' [ 0.000000] CPU: 0 UID: 0 PID: 0 Comm: swapper Not tainted 6.11.0-rc2ubuntu-defconfig #2 [ 0.000000] Hardware name: riscv-virtio,qemu (DT) [ 0.000000] Call Trace: [ 0.000000] [] dump_backtrace+0x32/0x40 [ 0.000000] [] show_stack+0x38/0x44 [ 0.000000] [] dump_stack_lvl+0x70/0x9c [ 0.000000] [] dump_stack+0x18/0x20 [ 0.000000] [] ubsan_epilogue+0x10/0x46 [ 0.000000] [] __ubsan_handle_out_of_bounds+0x94/0x9c [ 0.000000] [] __riscv_isa_vendor_extension_available+0x90/0x92 [ 0.000000] [] riscv_cpufeature_patch_func+0xc4/0x148 [ 0.000000] [] _apply_alternatives+0x42/0x50 [ 0.000000] [] apply_boot_alternatives+0x3c/0x100 [ 0.000000] [] setup_arch+0x85a/0x8bc [ 0.000000] [] start_kernel+0xa4/0xfb6 The dereferencing using cpu should actually not happen, so remove it. Fixes: 23c996fc2bc1 ("riscv: Extend cpufeature.c to detect vendor extensions") Signed-off-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20240814192619.276794-1-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/vendor_extensions.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/vendor_extensions.c b/arch/riscv/kernel/vendor_extensions.c index b6c1e7b5d34b36..a8126d1183412f 100644 --- a/arch/riscv/kernel/vendor_extensions.c +++ b/arch/riscv/kernel/vendor_extensions.c @@ -38,7 +38,7 @@ bool __riscv_isa_vendor_extension_available(int cpu, unsigned long vendor, unsig #ifdef CONFIG_RISCV_ISA_VENDOR_EXT_ANDES case ANDES_VENDOR_ID: bmap = &riscv_isa_vendor_ext_list_andes.all_harts_isa_bitmap; - cpu_bmap = &riscv_isa_vendor_ext_list_andes.per_hart_isa_bitmap[cpu]; + cpu_bmap = riscv_isa_vendor_ext_list_andes.per_hart_isa_bitmap; break; #endif default: From 74c2ab6d653b4c2354df65a7f7f2df1925a40a51 Mon Sep 17 00:00:00 2001 From: Su Hui Date: Thu, 8 Aug 2024 20:23:32 +0800 Subject: [PATCH 386/991] smb/client: avoid possible NULL dereference in cifs_free_subrequest() Clang static checker (scan-build) warning: cifsglob.h:line 890, column 3 Access to field 'ops' results in a dereference of a null pointer. Commit 519be989717c ("cifs: Add a tracepoint to track credits involved in R/W requests") adds a check for 'rdata->server', and let clang throw this warning about NULL dereference. When 'rdata->credits.value != 0 && rdata->server == NULL' happens, add_credits_and_wake_if() will call rdata->server->ops->add_credits(). This will cause NULL dereference problem. Add a check for 'rdata->server' to avoid NULL dereference. Cc: stable@vger.kernel.org Fixes: 69c3c023af25 ("cifs: Implement netfslib hooks") Reviewed-by: David Howells Signed-off-by: Su Hui Signed-off-by: Steve French --- fs/smb/client/file.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index b2405dd4d4d4da..45459af5044dde 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -315,7 +315,7 @@ static void cifs_free_subrequest(struct netfs_io_subrequest *subreq) #endif } - if (rdata->credits.value != 0) + if (rdata->credits.value != 0) { trace_smb3_rw_credits(rdata->rreq->debug_id, rdata->subreq.debug_index, rdata->credits.value, @@ -323,8 +323,12 @@ static void cifs_free_subrequest(struct netfs_io_subrequest *subreq) rdata->server ? rdata->server->in_flight : 0, -rdata->credits.value, cifs_trace_rw_credits_free_subreq); + if (rdata->server) + add_credits_and_wake_if(rdata->server, &rdata->credits, 0); + else + rdata->credits.value = 0; + } - add_credits_and_wake_if(rdata->server, &rdata->credits, 0); if (rdata->have_xid) free_xid(rdata->xid); } From c916ca35308d3187c9928664f9be249b22a3a701 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 3 Aug 2024 17:11:37 +0800 Subject: [PATCH 387/991] md/raid1: Fix data corruption for degraded array with slow disk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit read_balance() will avoid reading from slow disks as much as possible, however, if valid data only lands in slow disks, and a new normal disk is still in recovery, unrecovered data can be read: raid1_read_request read_balance raid1_should_read_first -> return false choose_best_rdev -> normal disk is not recovered, return -1 choose_bb_rdev -> missing the checking of recovery, return the normal disk -> read unrecovered data Root cause is that the checking of recovery is missing in choose_bb_rdev(). Hence add such checking to fix the problem. Also fix similar problem in choose_slow_rdev(). Cc: stable@vger.kernel.org Fixes: 9f3ced792203 ("md/raid1: factor out choose_bb_rdev() from read_balance()") Fixes: dfa8ecd167c1 ("md/raid1: factor out choose_slow_rdev() from read_balance()") Reported-and-tested-by: Mateusz Jończyk Closes: https://lore.kernel.org/all/9952f532-2554-44bf-b906-4880b2e88e3a@o2.pl/ Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240803091137.3197008-1-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid1.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 7acfe7c9dc8daa..761989d6790683 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -617,6 +617,12 @@ static int choose_first_rdev(struct r1conf *conf, struct r1bio *r1_bio, return -1; } +static bool rdev_in_recovery(struct md_rdev *rdev, struct r1bio *r1_bio) +{ + return !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < r1_bio->sector + r1_bio->sectors; +} + static int choose_bb_rdev(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors) { @@ -635,6 +641,7 @@ static int choose_bb_rdev(struct r1conf *conf, struct r1bio *r1_bio, rdev = conf->mirrors[disk].rdev; if (!rdev || test_bit(Faulty, &rdev->flags) || + rdev_in_recovery(rdev, r1_bio) || test_bit(WriteMostly, &rdev->flags)) continue; @@ -673,7 +680,8 @@ static int choose_slow_rdev(struct r1conf *conf, struct r1bio *r1_bio, rdev = conf->mirrors[disk].rdev; if (!rdev || test_bit(Faulty, &rdev->flags) || - !test_bit(WriteMostly, &rdev->flags)) + !test_bit(WriteMostly, &rdev->flags) || + rdev_in_recovery(rdev, r1_bio)) continue; /* there are no bad blocks, we can use this disk */ @@ -733,9 +741,7 @@ static bool rdev_readable(struct md_rdev *rdev, struct r1bio *r1_bio) if (!rdev || test_bit(Faulty, &rdev->flags)) return false; - /* still in recovery */ - if (!test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset < r1_bio->sector + r1_bio->sectors) + if (rdev_in_recovery(rdev, r1_bio)) return false; /* don't read from slow disk unless have to */ From 836bb3268db405cf9021496ac4dbc26d3e4758fe Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 15 Aug 2024 14:03:43 -0500 Subject: [PATCH 388/991] smb3: fix lock breakage for cached writes Mandatory locking is enforced for cached writes, which violates default posix semantics, and also it is enforced inconsistently. This apparently breaks recent versions of libreoffice, but can also be demonstrated by opening a file twice from the same client, locking it from handle one and writing to it from handle two (which fails, returning EACCES). Since there was already a mount option "forcemandatorylock" (which defaults to off), with this change only when the user intentionally specifies "forcemandatorylock" on mount will we break posix semantics on write to a locked range (ie we will only fail the write in this case, if the user mounts with "forcemandatorylock"). Fixes: 85160e03a79e ("CIFS: Implement caching mechanism for mandatory brlocks") Cc: stable@vger.kernel.org Cc: Pavel Shilovsky Reported-by: abartlet@samba.org Reported-by: Kevin Ottens Reviewed-by: David Howells Signed-off-by: Steve French --- fs/smb/client/file.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 45459af5044dde..06a0667f8ff201 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -2753,6 +2753,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file->f_mapping->host; struct cifsInodeInfo *cinode = CIFS_I(inode); struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); ssize_t rc; rc = netfs_start_io_write(inode); @@ -2769,12 +2770,16 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) if (rc <= 0) goto out; - if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(from), + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) && + (cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(from), server->vals->exclusive_lock_type, 0, - NULL, CIFS_WRITE_OP)) - rc = netfs_buffered_write_iter_locked(iocb, from, NULL); - else + NULL, CIFS_WRITE_OP))) { rc = -EACCES; + goto out; + } + + rc = netfs_buffered_write_iter_locked(iocb, from, NULL); + out: up_read(&cinode->lock_sem); netfs_end_io_write(inode); From 5b4f3af39b6588e8de4444d8e1ccf759b40f9414 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 8 Aug 2024 16:04:04 -0600 Subject: [PATCH 389/991] smb: smb2pdu.h: Use static_assert() to check struct sizes Commit 9f9bef9bc5c6 ("smb: smb2pdu.h: Avoid -Wflex-array-member-not-at-end warnings") introduced tagged `struct create_context_hdr`. We want to ensure that when new members need to be added to the flexible structure, they are always included within this tagged struct. So, we use `static_assert()` to ensure that the memory layout for both the flexible structure and the tagged struct is the same after any changes. Acked-by: Namjae Jeon Signed-off-by: Gustavo A. R. Silva Signed-off-by: Steve French --- fs/smb/common/smb2pdu.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h index c3ee42188d252e..c769f9dbc0b467 100644 --- a/fs/smb/common/smb2pdu.h +++ b/fs/smb/common/smb2pdu.h @@ -1216,6 +1216,8 @@ struct create_context { ); __u8 Buffer[]; } __packed; +static_assert(offsetof(struct create_context, Buffer) == sizeof(struct create_context_hdr), + "struct member likely outside of __struct_group()"); struct smb2_create_req { struct smb2_hdr hdr; From b313a8c835516bdda85025500be866ac8a74e022 Mon Sep 17 00:00:00 2001 From: Li Lingfeng Date: Thu, 15 Aug 2024 10:47:36 +0800 Subject: [PATCH 390/991] block: Fix lockdep warning in blk_mq_mark_tag_wait Lockdep reported a warning in Linux version 6.6: [ 414.344659] ================================ [ 414.345155] WARNING: inconsistent lock state [ 414.345658] 6.6.0-07439-gba2303cacfda #6 Not tainted [ 414.346221] -------------------------------- [ 414.346712] inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage. [ 414.347545] kworker/u10:3/1152 [HC0[0]:SC0[0]:HE0:SE1] takes: [ 414.349245] ffff88810edd1098 (&sbq->ws[i].wait){+.?.}-{2:2}, at: blk_mq_dispatch_rq_list+0x131c/0x1ee0 [ 414.351204] {IN-SOFTIRQ-W} state was registered at: [ 414.351751] lock_acquire+0x18d/0x460 [ 414.352218] _raw_spin_lock_irqsave+0x39/0x60 [ 414.352769] __wake_up_common_lock+0x22/0x60 [ 414.353289] sbitmap_queue_wake_up+0x375/0x4f0 [ 414.353829] sbitmap_queue_clear+0xdd/0x270 [ 414.354338] blk_mq_put_tag+0xdf/0x170 [ 414.354807] __blk_mq_free_request+0x381/0x4d0 [ 414.355335] blk_mq_free_request+0x28b/0x3e0 [ 414.355847] __blk_mq_end_request+0x242/0xc30 [ 414.356367] scsi_end_request+0x2c1/0x830 [ 414.345155] WARNING: inconsistent lock state [ 414.345658] 6.6.0-07439-gba2303cacfda #6 Not tainted [ 414.346221] -------------------------------- [ 414.346712] inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage. [ 414.347545] kworker/u10:3/1152 [HC0[0]:SC0[0]:HE0:SE1] takes: [ 414.349245] ffff88810edd1098 (&sbq->ws[i].wait){+.?.}-{2:2}, at: blk_mq_dispatch_rq_list+0x131c/0x1ee0 [ 414.351204] {IN-SOFTIRQ-W} state was registered at: [ 414.351751] lock_acquire+0x18d/0x460 [ 414.352218] _raw_spin_lock_irqsave+0x39/0x60 [ 414.352769] __wake_up_common_lock+0x22/0x60 [ 414.353289] sbitmap_queue_wake_up+0x375/0x4f0 [ 414.353829] sbitmap_queue_clear+0xdd/0x270 [ 414.354338] blk_mq_put_tag+0xdf/0x170 [ 414.354807] __blk_mq_free_request+0x381/0x4d0 [ 414.355335] blk_mq_free_request+0x28b/0x3e0 [ 414.355847] __blk_mq_end_request+0x242/0xc30 [ 414.356367] scsi_end_request+0x2c1/0x830 [ 414.356863] scsi_io_completion+0x177/0x1610 [ 414.357379] scsi_complete+0x12f/0x260 [ 414.357856] blk_complete_reqs+0xba/0xf0 [ 414.358338] __do_softirq+0x1b0/0x7a2 [ 414.358796] irq_exit_rcu+0x14b/0x1a0 [ 414.359262] sysvec_call_function_single+0xaf/0xc0 [ 414.359828] asm_sysvec_call_function_single+0x1a/0x20 [ 414.360426] default_idle+0x1e/0x30 [ 414.360873] default_idle_call+0x9b/0x1f0 [ 414.361390] do_idle+0x2d2/0x3e0 [ 414.361819] cpu_startup_entry+0x55/0x60 [ 414.362314] start_secondary+0x235/0x2b0 [ 414.362809] secondary_startup_64_no_verify+0x18f/0x19b [ 414.363413] irq event stamp: 428794 [ 414.363825] hardirqs last enabled at (428793): [] ktime_get+0x1dc/0x200 [ 414.364694] hardirqs last disabled at (428794): [] _raw_spin_lock_irq+0x47/0x50 [ 414.365629] softirqs last enabled at (428444): [] __do_softirq+0x540/0x7a2 [ 414.366522] softirqs last disabled at (428419): [] irq_exit_rcu+0x14b/0x1a0 [ 414.367425] other info that might help us debug this: [ 414.368194] Possible unsafe locking scenario: [ 414.368900] CPU0 [ 414.369225] ---- [ 414.369548] lock(&sbq->ws[i].wait); [ 414.370000] [ 414.370342] lock(&sbq->ws[i].wait); [ 414.370802] *** DEADLOCK *** [ 414.371569] 5 locks held by kworker/u10:3/1152: [ 414.372088] #0: ffff88810130e938 ((wq_completion)writeback){+.+.}-{0:0}, at: process_scheduled_works+0x357/0x13f0 [ 414.373180] #1: ffff88810201fdb8 ((work_completion)(&(&wb->dwork)->work)){+.+.}-{0:0}, at: process_scheduled_works+0x3a3/0x13f0 [ 414.374384] #2: ffffffff86ffbdc0 (rcu_read_lock){....}-{1:2}, at: blk_mq_run_hw_queue+0x637/0xa00 [ 414.375342] #3: ffff88810edd1098 (&sbq->ws[i].wait){+.?.}-{2:2}, at: blk_mq_dispatch_rq_list+0x131c/0x1ee0 [ 414.376377] #4: ffff888106205a08 (&hctx->dispatch_wait_lock){+.-.}-{2:2}, at: blk_mq_dispatch_rq_list+0x1337/0x1ee0 [ 414.378607] stack backtrace: [ 414.379177] CPU: 0 PID: 1152 Comm: kworker/u10:3 Not tainted 6.6.0-07439-gba2303cacfda #6 [ 414.380032] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 414.381177] Workqueue: writeback wb_workfn (flush-253:0) [ 414.381805] Call Trace: [ 414.382136] [ 414.382429] dump_stack_lvl+0x91/0xf0 [ 414.382884] mark_lock_irq+0xb3b/0x1260 [ 414.383367] ? __pfx_mark_lock_irq+0x10/0x10 [ 414.383889] ? stack_trace_save+0x8e/0xc0 [ 414.384373] ? __pfx_stack_trace_save+0x10/0x10 [ 414.384903] ? graph_lock+0xcf/0x410 [ 414.385350] ? save_trace+0x3d/0xc70 [ 414.385808] mark_lock.part.20+0x56d/0xa90 [ 414.386317] mark_held_locks+0xb0/0x110 [ 414.386791] ? __pfx_do_raw_spin_lock+0x10/0x10 [ 414.387320] lockdep_hardirqs_on_prepare+0x297/0x3f0 [ 414.387901] ? _raw_spin_unlock_irq+0x28/0x50 [ 414.388422] trace_hardirqs_on+0x58/0x100 [ 414.388917] _raw_spin_unlock_irq+0x28/0x50 [ 414.389422] __blk_mq_tag_busy+0x1d6/0x2a0 [ 414.389920] __blk_mq_get_driver_tag+0x761/0x9f0 [ 414.390899] blk_mq_dispatch_rq_list+0x1780/0x1ee0 [ 414.391473] ? __pfx_blk_mq_dispatch_rq_list+0x10/0x10 [ 414.392070] ? sbitmap_get+0x2b8/0x450 [ 414.392533] ? __blk_mq_get_driver_tag+0x210/0x9f0 [ 414.393095] __blk_mq_sched_dispatch_requests+0xd99/0x1690 [ 414.393730] ? elv_attempt_insert_merge+0x1b1/0x420 [ 414.394302] ? __pfx___blk_mq_sched_dispatch_requests+0x10/0x10 [ 414.394970] ? lock_acquire+0x18d/0x460 [ 414.395456] ? blk_mq_run_hw_queue+0x637/0xa00 [ 414.395986] ? __pfx_lock_acquire+0x10/0x10 [ 414.396499] blk_mq_sched_dispatch_requests+0x109/0x190 [ 414.397100] blk_mq_run_hw_queue+0x66e/0xa00 [ 414.397616] blk_mq_flush_plug_list.part.17+0x614/0x2030 [ 414.398244] ? __pfx_blk_mq_flush_plug_list.part.17+0x10/0x10 [ 414.398897] ? writeback_sb_inodes+0x241/0xcc0 [ 414.399429] blk_mq_flush_plug_list+0x65/0x80 [ 414.399957] __blk_flush_plug+0x2f1/0x530 [ 414.400458] ? __pfx___blk_flush_plug+0x10/0x10 [ 414.400999] blk_finish_plug+0x59/0xa0 [ 414.401467] wb_writeback+0x7cc/0x920 [ 414.401935] ? __pfx_wb_writeback+0x10/0x10 [ 414.402442] ? mark_held_locks+0xb0/0x110 [ 414.402931] ? __pfx_do_raw_spin_lock+0x10/0x10 [ 414.403462] ? lockdep_hardirqs_on_prepare+0x297/0x3f0 [ 414.404062] wb_workfn+0x2b3/0xcf0 [ 414.404500] ? __pfx_wb_workfn+0x10/0x10 [ 414.404989] process_scheduled_works+0x432/0x13f0 [ 414.405546] ? __pfx_process_scheduled_works+0x10/0x10 [ 414.406139] ? do_raw_spin_lock+0x101/0x2a0 [ 414.406641] ? assign_work+0x19b/0x240 [ 414.407106] ? lock_is_held_type+0x9d/0x110 [ 414.407604] worker_thread+0x6f2/0x1160 [ 414.408075] ? __kthread_parkme+0x62/0x210 [ 414.408572] ? lockdep_hardirqs_on_prepare+0x297/0x3f0 [ 414.409168] ? __kthread_parkme+0x13c/0x210 [ 414.409678] ? __pfx_worker_thread+0x10/0x10 [ 414.410191] kthread+0x33c/0x440 [ 414.410602] ? __pfx_kthread+0x10/0x10 [ 414.411068] ret_from_fork+0x4d/0x80 [ 414.411526] ? __pfx_kthread+0x10/0x10 [ 414.411993] ret_from_fork_asm+0x1b/0x30 [ 414.412489] When interrupt is turned on while a lock holding by spin_lock_irq it throws a warning because of potential deadlock. blk_mq_prep_dispatch_rq blk_mq_get_driver_tag __blk_mq_get_driver_tag __blk_mq_alloc_driver_tag blk_mq_tag_busy -> tag is already busy // failed to get driver tag blk_mq_mark_tag_wait spin_lock_irq(&wq->lock) -> lock A (&sbq->ws[i].wait) __add_wait_queue(wq, wait) -> wait queue active blk_mq_get_driver_tag __blk_mq_tag_busy -> 1) tag must be idle, which means there can't be inflight IO spin_lock_irq(&tags->lock) -> lock B (hctx->tags) spin_unlock_irq(&tags->lock) -> unlock B, turn on interrupt accidentally -> 2) context must be preempt by IO interrupt to trigger deadlock. As shown above, the deadlock is not possible in theory, but the warning still need to be fixed. Fix it by using spin_lock_irqsave to get lockB instead of spin_lock_irq. Fixes: 4f1731df60f9 ("blk-mq: fix potential io hang by wrong 'wake_batch'") Signed-off-by: Li Lingfeng Reviewed-by: Ming Lei Reviewed-by: Yu Kuai Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20240815024736.2040971-1-lilingfeng@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index cc57e2dd9a0bb3..2cafcf11ee8bee 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -38,6 +38,7 @@ static void blk_mq_update_wake_batch(struct blk_mq_tags *tags, void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { unsigned int users; + unsigned long flags; struct blk_mq_tags *tags = hctx->tags; /* @@ -56,11 +57,11 @@ void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) return; } - spin_lock_irq(&tags->lock); + spin_lock_irqsave(&tags->lock, flags); users = tags->active_queues + 1; WRITE_ONCE(tags->active_queues, users); blk_mq_update_wake_batch(tags, users); - spin_unlock_irq(&tags->lock); + spin_unlock_irqrestore(&tags->lock, flags); } /* From 9b340aeb26d50e9a9ec99599e2a39b035fac978e Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Fri, 16 Aug 2024 06:19:23 +1000 Subject: [PATCH 391/991] nouveau/firmware: use dma non-coherent allocator Currently, enabling SG_DEBUG in the kernel will cause nouveau to hit a BUG() on startup, when the iommu is enabled: kernel BUG at include/linux/scatterlist.h:187! invalid opcode: 0000 [#1] PREEMPT SMP NOPTI CPU: 7 PID: 930 Comm: (udev-worker) Not tainted 6.9.0-rc3Lyude-Test+ #30 Hardware name: MSI MS-7A39/A320M GAMING PRO (MS-7A39), BIOS 1.I0 01/22/2019 RIP: 0010:sg_init_one+0x85/0xa0 Code: 69 88 32 01 83 e1 03 f6 c3 03 75 20 a8 01 75 1e 48 09 cb 41 89 54 24 08 49 89 1c 24 41 89 6c 24 0c 5b 5d 41 5c e9 7b b9 88 00 <0f> 0b 0f 0b 0f 0b 48 8b 05 5e 46 9a 01 eb b2 66 66 2e 0f 1f 84 00 RSP: 0018:ffffa776017bf6a0 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffffa77600d87000 RCX: 000000000000002b RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffffa77680d87000 RBP: 000000000000e000 R08: 0000000000000000 R09: 0000000000000000 R10: ffff98f4c46aa508 R11: 0000000000000000 R12: ffff98f4c46aa508 R13: ffff98f4c46aa008 R14: ffffa77600d4a000 R15: ffffa77600d4a018 FS: 00007feeb5aae980(0000) GS:ffff98f5c4dc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f22cb9a4520 CR3: 00000001043ba000 CR4: 00000000003506f0 Call Trace: ? die+0x36/0x90 ? do_trap+0xdd/0x100 ? sg_init_one+0x85/0xa0 ? do_error_trap+0x65/0x80 ? sg_init_one+0x85/0xa0 ? exc_invalid_op+0x50/0x70 ? sg_init_one+0x85/0xa0 ? asm_exc_invalid_op+0x1a/0x20 ? sg_init_one+0x85/0xa0 nvkm_firmware_ctor+0x14a/0x250 [nouveau] nvkm_falcon_fw_ctor+0x42/0x70 [nouveau] ga102_gsp_booter_ctor+0xb4/0x1a0 [nouveau] r535_gsp_oneinit+0xb3/0x15f0 [nouveau] ? srso_return_thunk+0x5/0x5f ? srso_return_thunk+0x5/0x5f ? nvkm_udevice_new+0x95/0x140 [nouveau] ? srso_return_thunk+0x5/0x5f ? srso_return_thunk+0x5/0x5f ? ktime_get+0x47/0xb0 Fix this by using the non-coherent allocator instead, I think there might be a better answer to this, but it involve ripping up some of APIs using sg lists. Cc: stable@vger.kernel.org Fixes: 2541626cfb79 ("drm/nouveau/acr: use common falcon HS FW code for ACR FWs") Signed-off-by: Dave Airlie Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240815201923.632803-1-airlied@gmail.com --- drivers/gpu/drm/nouveau/nvkm/core/firmware.c | 9 ++++++--- drivers/gpu/drm/nouveau/nvkm/falcon/fw.c | 6 ++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/core/firmware.c b/drivers/gpu/drm/nouveau/nvkm/core/firmware.c index adc60b25f8e6c6..0af01a0ec60168 100644 --- a/drivers/gpu/drm/nouveau/nvkm/core/firmware.c +++ b/drivers/gpu/drm/nouveau/nvkm/core/firmware.c @@ -205,7 +205,8 @@ nvkm_firmware_dtor(struct nvkm_firmware *fw) break; case NVKM_FIRMWARE_IMG_DMA: nvkm_memory_unref(&memory); - dma_free_coherent(fw->device->dev, sg_dma_len(&fw->mem.sgl), fw->img, fw->phys); + dma_free_noncoherent(fw->device->dev, sg_dma_len(&fw->mem.sgl), + fw->img, fw->phys, DMA_TO_DEVICE); break; case NVKM_FIRMWARE_IMG_SGT: nvkm_memory_unref(&memory); @@ -236,10 +237,12 @@ nvkm_firmware_ctor(const struct nvkm_firmware_func *func, const char *name, break; case NVKM_FIRMWARE_IMG_DMA: { dma_addr_t addr; - len = ALIGN(fw->len, PAGE_SIZE); - fw->img = dma_alloc_coherent(fw->device->dev, len, &addr, GFP_KERNEL); + fw->img = dma_alloc_noncoherent(fw->device->dev, + len, &addr, + DMA_TO_DEVICE, + GFP_KERNEL); if (fw->img) { memcpy(fw->img, src, fw->len); fw->phys = addr; diff --git a/drivers/gpu/drm/nouveau/nvkm/falcon/fw.c b/drivers/gpu/drm/nouveau/nvkm/falcon/fw.c index 80a480b1217468..a1c8545f1249a1 100644 --- a/drivers/gpu/drm/nouveau/nvkm/falcon/fw.c +++ b/drivers/gpu/drm/nouveau/nvkm/falcon/fw.c @@ -89,6 +89,12 @@ nvkm_falcon_fw_boot(struct nvkm_falcon_fw *fw, struct nvkm_subdev *user, nvkm_falcon_fw_dtor_sigs(fw); } + /* after last write to the img, sync dma mappings */ + dma_sync_single_for_device(fw->fw.device->dev, + fw->fw.phys, + sg_dma_len(&fw->fw.mem.sgl), + DMA_TO_DEVICE); + FLCNFW_DBG(fw, "resetting"); fw->func->reset(fw); From b96ed2c97c791954abc881ef384e773010945aec Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 14 Aug 2024 14:25:00 +0200 Subject: [PATCH 392/991] virtio_net: move netdev_tx_reset_queue() call before RX napi enable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During suspend/resume the following BUG was hit: ------------[ cut here ]------------ kernel BUG at lib/dynamic_queue_limits.c:99! Internal error: Oops - BUG: 0 [#1] SMP ARM Modules linked in: bluetooth ecdh_generic ecc libaes CPU: 1 PID: 1282 Comm: rtcwake Not tainted 6.10.0-rc3-00732-gc8bd1f7f3e61 #15240 Hardware name: Generic DT based system PC is at dql_completed+0x270/0x2cc LR is at __free_old_xmit+0x120/0x198 pc : []    lr : []    psr: 80000013 ... Flags: Nzcv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment none Control: 10c5387d  Table: 43a4406a  DAC: 00000051 ... Process rtcwake (pid: 1282, stack limit = 0xfbc21278) Stack: (0xe0805e80 to 0xe0806000) ... Call trace:  dql_completed from __free_old_xmit+0x120/0x198  __free_old_xmit from free_old_xmit+0x44/0xe4  free_old_xmit from virtnet_poll_tx+0x88/0x1b4  virtnet_poll_tx from __napi_poll+0x2c/0x1d4  __napi_poll from net_rx_action+0x140/0x2b4  net_rx_action from handle_softirqs+0x11c/0x350  handle_softirqs from call_with_stack+0x18/0x20  call_with_stack from do_softirq+0x48/0x50  do_softirq from __local_bh_enable_ip+0xa0/0xa4  __local_bh_enable_ip from virtnet_open+0xd4/0x21c  virtnet_open from virtnet_restore+0x94/0x120  virtnet_restore from virtio_device_restore+0x110/0x1f4  virtio_device_restore from dpm_run_callback+0x3c/0x100  dpm_run_callback from device_resume+0x12c/0x2a8  device_resume from dpm_resume+0x12c/0x1e0  dpm_resume from dpm_resume_end+0xc/0x18  dpm_resume_end from suspend_devices_and_enter+0x1f0/0x72c  suspend_devices_and_enter from pm_suspend+0x270/0x2a0  pm_suspend from state_store+0x68/0xc8  state_store from kernfs_fop_write_iter+0x10c/0x1cc  kernfs_fop_write_iter from vfs_write+0x2b0/0x3dc  vfs_write from ksys_write+0x5c/0xd4  ksys_write from ret_fast_syscall+0x0/0x54 Exception stack(0xe8bf1fa8 to 0xe8bf1ff0) ... ---[ end trace 0000000000000000 ]--- After virtnet_napi_enable() is called, the following path is hit: __napi_poll() -> virtnet_poll() -> virtnet_poll_cleantx() -> netif_tx_wake_queue() That wakes the TX queue and allows skbs to be submitted and accounted by BQL counters. Then netdev_tx_reset_queue() is called that resets BQL counters and eventually leads to the BUG in dql_completed(). Move virtnet_napi_tx_enable() what does BQL counters reset before RX napi enable to avoid the issue. Reported-by: Marek Szyprowski Closes: https://lore.kernel.org/netdev/e632e378-d019-4de7-8f13-07c572ab37a9@samsung.com/ Fixes: c8bd1f7f3e61 ("virtio_net: add support for Byte Queue Limits") Tested-by: Marek Szyprowski Signed-off-by: Jiri Pirko Acked-by: Michael S. Tsirkin Acked-by: Jason Wang Link: https://patch.msgid.link/20240814122500.1710279-1-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 3f10c72743e94d..c6af189480929a 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2867,8 +2867,8 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) if (err < 0) goto err_xdp_reg_mem_model; - virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi); netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, qp_index)); + virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi); virtnet_napi_tx_enable(vi, vi->sq[qp_index].vq, &vi->sq[qp_index].napi); return 0; From c948c0973df5db9314459da621342e1170bd9e8e Mon Sep 17 00:00:00 2001 From: Pavan Chebbi Date: Wed, 14 Aug 2024 15:54:29 -0700 Subject: [PATCH 393/991] bnxt_en: Don't clear ntuple filters and rss contexts during ethtool ops The driver currently blindly deletes its cache of RSS cotexts and ntuple filters when the ethtool channel count is changing. It also deletes the ntuple filters cache when the default indirection table is changing. The core will not allow ethtool channels to drop below any that have been configured as ntuple destinations since this commit from 2022: 47f3ecf4763d ("ethtool: Fail number of channels change when it conflicts with rxnfc") So there is absolutely no need to delete the ntuple filters and RSS contexts when changing ethtool channels. It is also unnecessary to delete ntuple filters when the default RSS indirection table is changing. Remove bnxt_clear_usr_fltrs() and bnxt_clear_rss_ctxis() from the ethtool ops and change them to static functions. This bug will cause confusion to the end user and causes failure when running the rss_ctx.py selftest. Fixes: 1018319f949c ("bnxt_en: Invalidate user filters when needed") Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/netdev/20240725111912.7bc17cf6@kernel.org/ Reviewed-by: Andy Gospodarek Signed-off-by: Pavan Chebbi Signed-off-by: Michael Chan Link: https://patch.msgid.link/20240814225429.199280-1-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 4 ++-- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 2 -- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 4 ---- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index e27e1082ee33ae..04a623b3eee29e 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -5056,7 +5056,7 @@ void bnxt_del_one_usr_fltr(struct bnxt *bp, struct bnxt_filter_base *fltr) list_del_init(&fltr->list); } -void bnxt_clear_usr_fltrs(struct bnxt *bp, bool all) +static void bnxt_clear_usr_fltrs(struct bnxt *bp, bool all) { struct bnxt_filter_base *usr_fltr, *tmp; @@ -10248,7 +10248,7 @@ static void bnxt_hwrm_realloc_rss_ctx_vnic(struct bnxt *bp) } } -void bnxt_clear_rss_ctxs(struct bnxt *bp) +static void bnxt_clear_rss_ctxs(struct bnxt *bp) { struct ethtool_rxfh_context *ctx; unsigned long context; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 6bbdc718c3a701..059a6f81c1a871 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -2790,7 +2790,6 @@ void bnxt_set_ring_params(struct bnxt *); int bnxt_set_rx_skb_mode(struct bnxt *bp, bool page_mode); void bnxt_insert_usr_fltr(struct bnxt *bp, struct bnxt_filter_base *fltr); void bnxt_del_one_usr_fltr(struct bnxt *bp, struct bnxt_filter_base *fltr); -void bnxt_clear_usr_fltrs(struct bnxt *bp, bool all); int bnxt_hwrm_func_drv_rgtr(struct bnxt *bp, unsigned long *bmap, int bmap_size, bool async_only); int bnxt_hwrm_func_drv_unrgtr(struct bnxt *bp); @@ -2842,7 +2841,6 @@ int bnxt_hwrm_vnic_rss_cfg_p5(struct bnxt *bp, struct bnxt_vnic_info *vnic); int __bnxt_setup_vnic_p5(struct bnxt *bp, struct bnxt_vnic_info *vnic); void bnxt_del_one_rss_ctx(struct bnxt *bp, struct bnxt_rss_ctx *rss_ctx, bool all); -void bnxt_clear_rss_ctxs(struct bnxt *bp); int bnxt_open_nic(struct bnxt *, bool, bool); int bnxt_half_open_nic(struct bnxt *bp); void bnxt_half_close_nic(struct bnxt *bp); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 9dadc89378f02e..4cf9bf8b01b09d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -968,9 +968,6 @@ static int bnxt_set_channels(struct net_device *dev, return -EINVAL; } - bnxt_clear_usr_fltrs(bp, true); - if (BNXT_SUPPORTS_MULTI_RSS_CTX(bp)) - bnxt_clear_rss_ctxs(bp); if (netif_running(dev)) { if (BNXT_PF(bp)) { /* TODO CHIMP_FW: Send message to all VF's @@ -2000,7 +1997,6 @@ static int bnxt_set_rxfh(struct net_device *dev, bnxt_modify_rss(bp, NULL, NULL, rxfh); - bnxt_clear_usr_fltrs(bp, false); if (netif_running(bp->dev)) { bnxt_close_nic(bp, false, false); rc = bnxt_open_nic(bp, false, false); From b153b3c747003e1ce312ba205e552db4bd9e8df7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 14 Aug 2024 07:28:32 -0700 Subject: [PATCH 394/991] MAINTAINERS: add selftests to network drivers tools/testing/selftests/drivers/net/ is not listed under networking entries. Add it to NETWORKING DRIVERS. Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240814142832.3473685-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index f328373463b0d2..a964a34651f563 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15886,6 +15886,7 @@ F: include/linux/netdevice.h F: include/uapi/linux/cn_proc.h F: include/uapi/linux/if_* F: include/uapi/linux/netdevice.h +F: tools/testing/selftests/drivers/net/ X: drivers/net/wireless/ NETWORKING DRIVERS (WIRELESS) From e46bc2e7eb90a370bc27fa2fd98cb8251e7da1ec Mon Sep 17 00:00:00 2001 From: Pedro Falcato Date: Wed, 7 Aug 2024 18:33:35 +0100 Subject: [PATCH 395/991] mseal: fix is_madv_discard() is_madv_discard did its check wrong. MADV_ flags are not bitwise, they're normal sequential numbers. So, for instance: behavior & (/* ... */ | MADV_REMOVE) tagged both MADV_REMOVE and MADV_RANDOM (bit 0 set) as discard operations. As a result the kernel could erroneously block certain madvises (e.g MADV_RANDOM or MADV_HUGEPAGE) on sealed VMAs due to them sharing bits with blocked MADV operations (e.g REMOVE or WIPEONFORK). This is obviously incorrect, so use a switch statement instead. Link: https://lkml.kernel.org/r/20240807173336.2523757-1-pedro.falcato@gmail.com Link: https://lkml.kernel.org/r/20240807173336.2523757-2-pedro.falcato@gmail.com Fixes: 8be7258aad44 ("mseal: add mseal syscall") Signed-off-by: Pedro Falcato Tested-by: Jeff Xu Reviewed-by: Jeff Xu Cc: Kees Cook Cc: Liam R. Howlett Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- mm/mseal.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/mm/mseal.c b/mm/mseal.c index bf783bba8ed0b9..15bba28acc005f 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -40,9 +40,17 @@ static bool can_modify_vma(struct vm_area_struct *vma) static bool is_madv_discard(int behavior) { - return behavior & - (MADV_FREE | MADV_DONTNEED | MADV_DONTNEED_LOCKED | - MADV_REMOVE | MADV_DONTFORK | MADV_WIPEONFORK); + switch (behavior) { + case MADV_FREE: + case MADV_DONTNEED: + case MADV_DONTNEED_LOCKED: + case MADV_REMOVE: + case MADV_DONTFORK: + case MADV_WIPEONFORK: + return true; + } + + return false; } static bool is_ro_anon(struct vm_area_struct *vma) From 5f75cfbd6bb02295ddaed48adf667b6c828ce07b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 1 Aug 2024 22:47:48 +0200 Subject: [PATCH 396/991] mm/hugetlb: fix hugetlb vs. core-mm PT locking We recently made GUP's common page table walking code to also walk hugetlb VMAs without most hugetlb special-casing, preparing for the future of having less hugetlb-specific page table walking code in the codebase. Turns out that we missed one page table locking detail: page table locking for hugetlb folios that are not mapped using a single PMD/PUD. Assume we have hugetlb folio that spans multiple PTEs (e.g., 64 KiB hugetlb folios on arm64 with 4 KiB base page size). GUP, as it walks the page tables, will perform a pte_offset_map_lock() to grab the PTE table lock. However, hugetlb that concurrently modifies these page tables would actually grab the mm->page_table_lock: with USE_SPLIT_PTE_PTLOCKS, the locks would differ. Something similar can happen right now with hugetlb folios that span multiple PMDs when USE_SPLIT_PMD_PTLOCKS. This issue can be reproduced [1], for example triggering: [ 3105.936100] ------------[ cut here ]------------ [ 3105.939323] WARNING: CPU: 31 PID: 2732 at mm/gup.c:142 try_grab_folio+0x11c/0x188 [ 3105.944634] Modules linked in: [...] [ 3105.974841] CPU: 31 PID: 2732 Comm: reproducer Not tainted 6.10.0-64.eln141.aarch64 #1 [ 3105.980406] Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20240524-4.fc40 05/24/2024 [ 3105.986185] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 3105.991108] pc : try_grab_folio+0x11c/0x188 [ 3105.994013] lr : follow_page_pte+0xd8/0x430 [ 3105.996986] sp : ffff80008eafb8f0 [ 3105.999346] x29: ffff80008eafb900 x28: ffffffe8d481f380 x27: 00f80001207cff43 [ 3106.004414] x26: 0000000000000001 x25: 0000000000000000 x24: ffff80008eafba48 [ 3106.009520] x23: 0000ffff9372f000 x22: ffff7a54459e2000 x21: ffff7a546c1aa978 [ 3106.014529] x20: ffffffe8d481f3c0 x19: 0000000000610041 x18: 0000000000000001 [ 3106.019506] x17: 0000000000000001 x16: ffffffffffffffff x15: 0000000000000000 [ 3106.024494] x14: ffffb85477fdfe08 x13: 0000ffff9372ffff x12: 0000000000000000 [ 3106.029469] x11: 1fffef4a88a96be1 x10: ffff7a54454b5f0c x9 : ffffb854771b12f0 [ 3106.034324] x8 : 0008000000000000 x7 : ffff7a546c1aa980 x6 : 0008000000000080 [ 3106.038902] x5 : 00000000001207cf x4 : 0000ffff9372f000 x3 : ffffffe8d481f000 [ 3106.043420] x2 : 0000000000610041 x1 : 0000000000000001 x0 : 0000000000000000 [ 3106.047957] Call trace: [ 3106.049522] try_grab_folio+0x11c/0x188 [ 3106.051996] follow_pmd_mask.constprop.0.isra.0+0x150/0x2e0 [ 3106.055527] follow_page_mask+0x1a0/0x2b8 [ 3106.058118] __get_user_pages+0xf0/0x348 [ 3106.060647] faultin_page_range+0xb0/0x360 [ 3106.063651] do_madvise+0x340/0x598 Let's make huge_pte_lockptr() effectively use the same PT locks as any core-mm page table walker would. Add ptep_lockptr() to obtain the PTE page table lock using a pte pointer -- unfortunately we cannot convert pte_lockptr() because virt_to_page() doesn't work with kmap'ed page tables we can have with CONFIG_HIGHPTE. Handle CONFIG_PGTABLE_LEVELS correctly by checking in reverse order, such that when e.g., CONFIG_PGTABLE_LEVELS==2 with PGDIR_SIZE==P4D_SIZE==PUD_SIZE==PMD_SIZE will work as expected. Document why that works. There is one ugly case: powerpc 8xx, whereby we have an 8 MiB hugetlb folio being mapped using two PTE page tables. While hugetlb wants to take the PMD table lock, core-mm would grab the PTE table lock of one of both PTE page tables. In such corner cases, we have to make sure that both locks match, which is (fortunately!) currently guaranteed for 8xx as it does not support SMP and consequently doesn't use split PT locks. [1] https://lore.kernel.org/all/1bbfcc7f-f222-45a5-ac44-c5a1381c596d@redhat.com/ Link: https://lkml.kernel.org/r/20240801204748.99107-1-david@redhat.com Fixes: 9cb28da54643 ("mm/gup: handle hugetlb in the generic follow_page_mask code") Signed-off-by: David Hildenbrand Acked-by: Peter Xu Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Peter Xu Cc: Oscar Salvador Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 33 ++++++++++++++++++++++++++++++--- include/linux/mm.h | 11 +++++++++++ 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index c9bf68c239a011..45bf05ad5c53a1 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -944,10 +944,37 @@ static inline bool htlb_allow_alloc_fallback(int reason) static inline spinlock_t *huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte) { - if (huge_page_size(h) == PMD_SIZE) + const unsigned long size = huge_page_size(h); + + VM_WARN_ON(size == PAGE_SIZE); + + /* + * hugetlb must use the exact same PT locks as core-mm page table + * walkers would. When modifying a PTE table, hugetlb must take the + * PTE PT lock, when modifying a PMD table, hugetlb must take the PMD + * PT lock etc. + * + * The expectation is that any hugetlb folio smaller than a PMD is + * always mapped into a single PTE table and that any hugetlb folio + * smaller than a PUD (but at least as big as a PMD) is always mapped + * into a single PMD table. + * + * If that does not hold for an architecture, then that architecture + * must disable split PT locks such that all *_lockptr() functions + * will give us the same result: the per-MM PT lock. + * + * Note that with e.g., CONFIG_PGTABLE_LEVELS=2 where + * PGDIR_SIZE==P4D_SIZE==PUD_SIZE==PMD_SIZE, we'd use pud_lockptr() + * and core-mm would use pmd_lockptr(). However, in such configurations + * split PMD locks are disabled -- they don't make sense on a single + * PGDIR page table -- and the end result is the same. + */ + if (size >= PUD_SIZE) + return pud_lockptr(mm, (pud_t *) pte); + else if (size >= PMD_SIZE || IS_ENABLED(CONFIG_HIGHPTE)) return pmd_lockptr(mm, (pmd_t *) pte); - VM_BUG_ON(huge_page_size(h) == PAGE_SIZE); - return &mm->page_table_lock; + /* pte_alloc_huge() only applies with !CONFIG_HIGHPTE */ + return ptep_lockptr(mm, pte); } #ifndef hugepages_supported diff --git a/include/linux/mm.h b/include/linux/mm.h index c4b238a20b76e9..6549d0979b28f9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2920,6 +2920,13 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) return ptlock_ptr(page_ptdesc(pmd_page(*pmd))); } +static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) +{ + BUILD_BUG_ON(IS_ENABLED(CONFIG_HIGHPTE)); + BUILD_BUG_ON(MAX_PTRS_PER_PTE * sizeof(pte_t) > PAGE_SIZE); + return ptlock_ptr(virt_to_ptdesc(pte)); +} + static inline bool ptlock_init(struct ptdesc *ptdesc) { /* @@ -2944,6 +2951,10 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) { return &mm->page_table_lock; } +static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) +{ + return &mm->page_table_lock; +} static inline void ptlock_cache_init(void) {} static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} From ace0741a55e453c265cbf3d965eea7f687cd6d45 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 8 Aug 2024 21:34:34 +0000 Subject: [PATCH 397/991] mm: don't account memmap on failure Patch series "Fixes for memmap accounting", v4. Memmap accounting provides us with observability of how much memory is used for per-page metadata: i.e. "struct page"'s and "struct page_ext". It also provides with information of how much was allocated using boot allocator (i.e. not part of MemTotal), and how much was allocated using buddy allocated (i.e. part of MemTotal). This small series fixes a few problems that were discovered with the original patch. This patch (of 3): When we fail to allocate the mmemmap in alloc_vmemmap_page_list(), do not account any already-allocated pages: we're going to free all them before we return from the function. Link: https://lkml.kernel.org/r/20240809191020.1142142-1-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20240808213437.682006-1-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20240808213437.682006-2-pasha.tatashin@soleen.com Fixes: 15995a352474 ("mm: report per-page metadata information") Signed-off-by: Pasha Tatashin Reviewed-by: Fan Ni Reviewed-by: Yosry Ahmed Acked-by: David Hildenbrand Tested-by: Alison Schofield Reviewed-by: Muchun Song Acked-by: David Rientjes Cc: Dan Williams Cc: Domenico Cerasuolo Cc: Joel Granados Cc: Johannes Weiner Cc: Li Zhijian Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport Cc: Nhat Pham Cc: Sourav Panda Cc: Vlastimil Babka Cc: Yi Zhang Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 829112b0a914c9..4f51e0596197e0 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -392,13 +392,10 @@ static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, for (i = 0; i < nr_pages; i++) { page = alloc_pages_node(nid, gfp_mask, 0); - if (!page) { - mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, i); + if (!page) goto out; - } list_add(&page->lru, list); } - mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, nr_pages); return 0; From f4cb78af91e3b2b7aa76dbf8213b898fa8811b12 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 8 Aug 2024 21:34:35 +0000 Subject: [PATCH 398/991] mm: add system wide stats items category /proc/vmstat contains events and stats, events can only grow, but stats can grow and shrink. vmstat has the following: ------------------------- NR_VM_ZONE_STAT_ITEMS: per-zone stats NR_VM_NUMA_EVENT_ITEMS: per-numa events NR_VM_NODE_STAT_ITEMS: per-numa stats NR_VM_WRITEBACK_STAT_ITEMS: system-wide background-writeback and dirty-throttling tresholds. NR_VM_EVENT_ITEMS: system-wide events ------------------------- Rename NR_VM_WRITEBACK_STAT_ITEMS to NR_VM_STAT_ITEMS, to track the system-wide stats, we are going to add per-page metadata stats to this category in the next patch. Also delete unused writeback_stat_name(). Link: https://lkml.kernel.org/r/20240809191020.1142142-2-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20240808213437.682006-3-pasha.tatashin@soleen.com Fixes: 15995a352474 ("mm: report per-page metadata information") Signed-off-by: Pasha Tatashin Suggested-by: Yosry Ahmed Tested-by: Alison Schofield Acked-by: David Hildenbrand Acked-by: David Rientjes Cc: Dan Williams Cc: Domenico Cerasuolo Cc: Joel Granados Cc: Johannes Weiner Cc: Li Zhijian Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Sourav Panda Cc: Vlastimil Babka Cc: Yi Zhang Cc: Fan Ni Signed-off-by: Andrew Morton --- include/linux/vmstat.h | 15 ++++----------- mm/vmstat.c | 6 +++--- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 23cd179420363c..9ab4fa5e09b5ac 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -34,10 +34,11 @@ struct reclaim_stat { unsigned nr_lazyfree_fail; }; -enum writeback_stat_item { +/* Stat data for system wide items */ +enum vm_stat_item { NR_DIRTY_THRESHOLD, NR_DIRTY_BG_THRESHOLD, - NR_VM_WRITEBACK_STAT_ITEMS, + NR_VM_STAT_ITEMS, }; #ifdef CONFIG_VM_EVENT_COUNTERS @@ -514,21 +515,13 @@ static inline const char *lru_list_name(enum lru_list lru) return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_" } -static inline const char *writeback_stat_name(enum writeback_stat_item item) -{ - return vmstat_text[NR_VM_ZONE_STAT_ITEMS + - NR_VM_NUMA_EVENT_ITEMS + - NR_VM_NODE_STAT_ITEMS + - item]; -} - #if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG) static inline const char *vm_event_name(enum vm_event_item item) { return vmstat_text[NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + NR_VM_NODE_STAT_ITEMS + - NR_VM_WRITEBACK_STAT_ITEMS + + NR_VM_STAT_ITEMS + item]; } #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ diff --git a/mm/vmstat.c b/mm/vmstat.c index 04a1cb6cc6365a..6f8aa4766f16d8 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1257,7 +1257,7 @@ const char * const vmstat_text[] = { "pgdemote_khugepaged", "nr_memmap", "nr_memmap_boot", - /* enum writeback_stat_item counters */ + /* system-wide enum vm_stat_item counters */ "nr_dirty_threshold", "nr_dirty_background_threshold", @@ -1790,7 +1790,7 @@ static const struct seq_operations zoneinfo_op = { #define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \ NR_VM_NUMA_EVENT_ITEMS + \ NR_VM_NODE_STAT_ITEMS + \ - NR_VM_WRITEBACK_STAT_ITEMS + \ + NR_VM_STAT_ITEMS + \ (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \ NR_VM_EVENT_ITEMS : 0)) @@ -1827,7 +1827,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD, v + NR_DIRTY_THRESHOLD); - v += NR_VM_WRITEBACK_STAT_ITEMS; + v += NR_VM_STAT_ITEMS; #ifdef CONFIG_VM_EVENT_COUNTERS all_vm_events(v); From 9d85731110241fb8ca9445ea4177d816041a8825 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 8 Aug 2024 21:34:36 +0000 Subject: [PATCH 399/991] mm: don't account memmap per-node Fix invalid access to pgdat during hot-remove operation: ndctl users reported a GPF when trying to destroy a namespace: $ ndctl destroy-namespace all -r all -f Segmentation fault dmesg: Oops: general protection fault, probably for non-canonical address 0xdffffc0000005650: 0000 [#1] PREEMPT SMP KASAN PTI KASAN: probably user-memory-access in range [0x000000000002b280-0x000000000002b287] CPU: 26 UID: 0 PID: 1868 Comm: ndctl Not tainted 6.11.0-rc1 #1 Hardware name: Dell Inc. PowerEdge R640/08HT8T, BIOS 2.20.1 09/13/2023 RIP: 0010:mod_node_page_state+0x2a/0x110 cxl-test users report a GPF when trying to unload the test module: $ modrpobe -r cxl-test dmesg BUG: unable to handle page fault for address: 0000000000004200 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: Oops: 0000 [#1] PREEMPT SMP PTI CPU: 0 UID: 0 PID: 1076 Comm: modprobe Tainted: G O N 6.11.0-rc1 #197 Tainted: [O]=OOT_MODULE, [N]=TEST Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/15 RIP: 0010:mod_node_page_state+0x6/0x90 Currently, when memory is hot-plugged or hot-removed the accounting is done based on the assumption that memmap is allocated from the same node as the hot-plugged/hot-removed memory, which is not always the case. In addition, there are challenges with keeping the node id of the memory that is being remove to the time when memmap accounting is actually performed: since this is done after remove_pfn_range_from_zone(), and also after remove_memory_block_devices(). Meaning that we cannot use pgdat nor walking though memblocks to get the nid. Given all of that, account the memmap overhead system wide instead. For this we are going to be using global atomic counters, but given that memmap size is rarely modified, and normally is only modified either during early boot when there is only one CPU, or under a hotplug global mutex lock, therefore there is no need for per-cpu optimizations. Also, while we are here rename nr_memmap to nr_memmap_pages, and nr_memmap_boot to nr_memmap_boot_pages to be self explanatory that the units are in page count. [pasha.tatashin@soleen.com: address a few nits from David Hildenbrand] Link: https://lkml.kernel.org/r/20240809191020.1142142-4-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20240809191020.1142142-4-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20240808213437.682006-4-pasha.tatashin@soleen.com Fixes: 15995a352474 ("mm: report per-page metadata information") Signed-off-by: Pasha Tatashin Reported-by: Yi Zhang Closes: https://lore.kernel.org/linux-cxl/CAHj4cs9Ax1=CoJkgBGP_+sNu6-6=6v=_L-ZBZY0bVLD3wUWZQg@mail.gmail.com Reported-by: Alison Schofield Closes: https://lore.kernel.org/linux-mm/Zq0tPd2h6alFz8XF@aschofie-mobl2/#t Tested-by: Dan Williams Tested-by: Alison Schofield Acked-by: David Hildenbrand Acked-by: David Rientjes Tested-by: Yi Zhang Cc: Domenico Cerasuolo Cc: Fan Ni Cc: Joel Granados Cc: Johannes Weiner Cc: Li Zhijian Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Sourav Panda Cc: Vlastimil Babka Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 -- include/linux/vmstat.h | 7 ++++--- mm/hugetlb_vmemmap.c | 8 ++++---- mm/mm_init.c | 3 +-- mm/page_alloc.c | 1 - mm/page_ext.c | 18 ++++------------- mm/sparse-vmemmap.c | 11 ++++------ mm/sparse.c | 5 ++--- mm/vmstat.c | 46 ++++++++++++++++++++---------------------- 9 files changed, 41 insertions(+), 60 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 41458892bc8a34..1dc6248feb8324 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -220,8 +220,6 @@ enum node_stat_item { PGDEMOTE_KSWAPD, PGDEMOTE_DIRECT, PGDEMOTE_KHUGEPAGED, - NR_MEMMAP, /* page metadata allocated through buddy allocator */ - NR_MEMMAP_BOOT, /* page metadata allocated through boot allocator */ NR_VM_NODE_STAT_ITEMS }; diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 9ab4fa5e09b5ac..9eb77c9007e623 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -38,6 +38,8 @@ struct reclaim_stat { enum vm_stat_item { NR_DIRTY_THRESHOLD, NR_DIRTY_BG_THRESHOLD, + NR_MEMMAP_PAGES, /* page metadata allocated through buddy allocator */ + NR_MEMMAP_BOOT_PAGES, /* page metadata allocated through boot allocator */ NR_VM_STAT_ITEMS, }; @@ -618,7 +620,6 @@ static inline void lruvec_stat_sub_folio(struct folio *folio, lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); } -void __meminit mod_node_early_perpage_metadata(int nid, long delta); -void __meminit store_early_perpage_metadata(void); - +void memmap_boot_pages_add(long delta); +void memmap_pages_add(long delta); #endif /* _LINUX_VMSTAT_H */ diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 4f51e0596197e0..0c3f56b3578eba 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -185,11 +185,11 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end, static inline void free_vmemmap_page(struct page *page) { if (PageReserved(page)) { + memmap_boot_pages_add(-1); free_bootmem_page(page); - mod_node_page_state(page_pgdat(page), NR_MEMMAP_BOOT, -1); } else { + memmap_pages_add(-1); __free_page(page); - mod_node_page_state(page_pgdat(page), NR_MEMMAP, -1); } } @@ -341,7 +341,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end, copy_page(page_to_virt(walk.reuse_page), (void *)walk.reuse_addr); list_add(&walk.reuse_page->lru, vmemmap_pages); - mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, 1); + memmap_pages_add(1); } /* @@ -396,7 +396,7 @@ static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, goto out; list_add(&page->lru, list); } - mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, nr_pages); + memmap_pages_add(nr_pages); return 0; out: diff --git a/mm/mm_init.c b/mm/mm_init.c index 75c3bd42799b93..f9a60ffc553206 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1623,8 +1623,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) panic("Failed to allocate %ld bytes for node %d memory map\n", size, pgdat->node_id); pgdat->node_mem_map = map + offset; - mod_node_early_perpage_metadata(pgdat->node_id, - DIV_ROUND_UP(size, PAGE_SIZE)); + memmap_boot_pages_add(DIV_ROUND_UP(size, PAGE_SIZE)); pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", __func__, pgdat->node_id, (unsigned long)pgdat, (unsigned long)pgdat->node_mem_map); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 28f80daf5c0418..875d76e8684ac4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5755,7 +5755,6 @@ void __init setup_per_cpu_pageset(void) for_each_online_pgdat(pgdat) pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); - store_early_perpage_metadata(); } __meminit void zone_pcp_init(struct zone *zone) diff --git a/mm/page_ext.c b/mm/page_ext.c index c191e490c401ba..641d93f6af4c1e 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -214,8 +214,7 @@ static int __init alloc_node_page_ext(int nid) return -ENOMEM; NODE_DATA(nid)->node_page_ext = base; total_usage += table_size; - mod_node_page_state(NODE_DATA(nid), NR_MEMMAP_BOOT, - DIV_ROUND_UP(table_size, PAGE_SIZE)); + memmap_boot_pages_add(DIV_ROUND_UP(table_size, PAGE_SIZE)); return 0; } @@ -275,10 +274,8 @@ static void *__meminit alloc_page_ext(size_t size, int nid) else addr = vzalloc_node(size, nid); - if (addr) { - mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, - DIV_ROUND_UP(size, PAGE_SIZE)); - } + if (addr) + memmap_pages_add(DIV_ROUND_UP(size, PAGE_SIZE)); return addr; } @@ -323,25 +320,18 @@ static void free_page_ext(void *addr) { size_t table_size; struct page *page; - struct pglist_data *pgdat; table_size = page_ext_size * PAGES_PER_SECTION; + memmap_pages_add(-1L * (DIV_ROUND_UP(table_size, PAGE_SIZE))); if (is_vmalloc_addr(addr)) { - page = vmalloc_to_page(addr); - pgdat = page_pgdat(page); vfree(addr); } else { page = virt_to_page(addr); - pgdat = page_pgdat(page); BUG_ON(PageReserved(page)); kmemleak_free(addr); free_pages_exact(addr, table_size); } - - mod_node_page_state(pgdat, NR_MEMMAP, - -1L * (DIV_ROUND_UP(table_size, PAGE_SIZE))); - } static void __free_page_ext(unsigned long pfn) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 1dda6c53370b0a..edcc7a6b0f6f20 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -469,13 +469,10 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn, if (r < 0) return NULL; - if (system_state == SYSTEM_BOOTING) { - mod_node_early_perpage_metadata(nid, DIV_ROUND_UP(end - start, - PAGE_SIZE)); - } else { - mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, - DIV_ROUND_UP(end - start, PAGE_SIZE)); - } + if (system_state == SYSTEM_BOOTING) + memmap_boot_pages_add(DIV_ROUND_UP(end - start, PAGE_SIZE)); + else + memmap_pages_add(DIV_ROUND_UP(end - start, PAGE_SIZE)); return pfn_to_page(pfn); } diff --git a/mm/sparse.c b/mm/sparse.c index e4b830091d1373..0f018c6f9ec52f 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -463,7 +463,7 @@ static void __init sparse_buffer_init(unsigned long size, int nid) sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true); sparsemap_buf_end = sparsemap_buf + size; #ifndef CONFIG_SPARSEMEM_VMEMMAP - mod_node_early_perpage_metadata(nid, DIV_ROUND_UP(size, PAGE_SIZE)); + memmap_boot_pages_add(DIV_ROUND_UP(size, PAGE_SIZE)); #endif } @@ -643,8 +643,7 @@ static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, unsigned long start = (unsigned long) pfn_to_page(pfn); unsigned long end = start + nr_pages * sizeof(struct page); - mod_node_page_state(page_pgdat(pfn_to_page(pfn)), NR_MEMMAP, - -1L * (DIV_ROUND_UP(end - start, PAGE_SIZE))); + memmap_pages_add(-1L * (DIV_ROUND_UP(end - start, PAGE_SIZE))); vmemmap_free(start, end, altmap); } static void free_map_bootmem(struct page *memmap) diff --git a/mm/vmstat.c b/mm/vmstat.c index 6f8aa4766f16d8..e875f2a4915f5f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1033,6 +1033,24 @@ unsigned long node_page_state(struct pglist_data *pgdat, } #endif +/* + * Count number of pages "struct page" and "struct page_ext" consume. + * nr_memmap_boot_pages: # of pages allocated by boot allocator + * nr_memmap_pages: # of pages that were allocated by buddy allocator + */ +static atomic_long_t nr_memmap_boot_pages = ATOMIC_LONG_INIT(0); +static atomic_long_t nr_memmap_pages = ATOMIC_LONG_INIT(0); + +void memmap_boot_pages_add(long delta) +{ + atomic_long_add(delta, &nr_memmap_boot_pages); +} + +void memmap_pages_add(long delta) +{ + atomic_long_add(delta, &nr_memmap_pages); +} + #ifdef CONFIG_COMPACTION struct contig_page_info { @@ -1255,11 +1273,11 @@ const char * const vmstat_text[] = { "pgdemote_kswapd", "pgdemote_direct", "pgdemote_khugepaged", - "nr_memmap", - "nr_memmap_boot", /* system-wide enum vm_stat_item counters */ "nr_dirty_threshold", "nr_dirty_background_threshold", + "nr_memmap_pages", + "nr_memmap_boot_pages", #if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG) /* enum vm_event_item counters */ @@ -1827,6 +1845,8 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD, v + NR_DIRTY_THRESHOLD); + v[NR_MEMMAP_PAGES] = atomic_long_read(&nr_memmap_pages); + v[NR_MEMMAP_BOOT_PAGES] = atomic_long_read(&nr_memmap_boot_pages); v += NR_VM_STAT_ITEMS; #ifdef CONFIG_VM_EVENT_COUNTERS @@ -2285,25 +2305,3 @@ static int __init extfrag_debug_init(void) module_init(extfrag_debug_init); #endif - -/* - * Page metadata size (struct page and page_ext) in pages - */ -static unsigned long early_perpage_metadata[MAX_NUMNODES] __meminitdata; - -void __meminit mod_node_early_perpage_metadata(int nid, long delta) -{ - early_perpage_metadata[nid] += delta; -} - -void __meminit store_early_perpage_metadata(void) -{ - int nid; - struct pglist_data *pgdat; - - for_each_online_pgdat(pgdat) { - nid = pgdat->node_id; - mod_node_page_state(NODE_DATA(nid), NR_MEMMAP_BOOT, - early_perpage_metadata[nid]); - } -} From d75abd0d0bc29e6ebfebbf76d11b4067b35844af Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 6 Aug 2024 12:41:07 -0400 Subject: [PATCH 400/991] mm/memory-failure: use raw_spinlock_t in struct memory_failure_cpu The memory_failure_cpu structure is a per-cpu structure. Access to its content requires the use of get_cpu_var() to lock in the current CPU and disable preemption. The use of a regular spinlock_t for locking purpose is fine for a non-RT kernel. Since the integration of RT spinlock support into the v5.15 kernel, a spinlock_t in a RT kernel becomes a sleeping lock and taking a sleeping lock in a preemption disabled context is illegal resulting in the following kind of warning. [12135.732244] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 [12135.732248] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 270076, name: kworker/0:0 [12135.732252] preempt_count: 1, expected: 0 [12135.732255] RCU nest depth: 2, expected: 2 : [12135.732420] Hardware name: Dell Inc. PowerEdge R640/0HG0J8, BIOS 2.10.2 02/24/2021 [12135.732423] Workqueue: kacpi_notify acpi_os_execute_deferred [12135.732433] Call Trace: [12135.732436] [12135.732450] dump_stack_lvl+0x57/0x81 [12135.732461] __might_resched.cold+0xf4/0x12f [12135.732479] rt_spin_lock+0x4c/0x100 [12135.732491] memory_failure_queue+0x40/0xe0 [12135.732503] ghes_do_memory_failure+0x53/0x390 [12135.732516] ghes_do_proc.constprop.0+0x229/0x3e0 [12135.732575] ghes_proc+0xf9/0x1a0 [12135.732591] ghes_notify_hed+0x6a/0x150 [12135.732602] notifier_call_chain+0x43/0xb0 [12135.732626] blocking_notifier_call_chain+0x43/0x60 [12135.732637] acpi_ev_notify_dispatch+0x47/0x70 [12135.732648] acpi_os_execute_deferred+0x13/0x20 [12135.732654] process_one_work+0x41f/0x500 [12135.732695] worker_thread+0x192/0x360 [12135.732715] kthread+0x111/0x140 [12135.732733] ret_from_fork+0x29/0x50 [12135.732779] Fix it by using a raw_spinlock_t for locking instead. Also move the pr_err() out of the lock critical section and after put_cpu_ptr() to avoid indeterminate latency and the possibility of sleep with this call. [longman@redhat.com: don't hold percpu ref across pr_err(), per Miaohe] Link: https://lkml.kernel.org/r/20240807181130.1122660-1-longman@redhat.com Link: https://lkml.kernel.org/r/20240806164107.1044956-1-longman@redhat.com Fixes: 0f383b6dc96e ("locking/spinlock: Provide RT variant") Signed-off-by: Waiman Long Acked-by: Miaohe Lin Cc: "Huang, Ying" Cc: Juri Lelli Cc: Len Brown Cc: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton --- mm/memory-failure.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 581d3e5c911758..7066fc84f35172 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2417,7 +2417,7 @@ struct memory_failure_entry { struct memory_failure_cpu { DECLARE_KFIFO(fifo, struct memory_failure_entry, MEMORY_FAILURE_FIFO_SIZE); - spinlock_t lock; + raw_spinlock_t lock; struct work_struct work; }; @@ -2443,20 +2443,22 @@ void memory_failure_queue(unsigned long pfn, int flags) { struct memory_failure_cpu *mf_cpu; unsigned long proc_flags; + bool buffer_overflow; struct memory_failure_entry entry = { .pfn = pfn, .flags = flags, }; mf_cpu = &get_cpu_var(memory_failure_cpu); - spin_lock_irqsave(&mf_cpu->lock, proc_flags); - if (kfifo_put(&mf_cpu->fifo, entry)) + raw_spin_lock_irqsave(&mf_cpu->lock, proc_flags); + buffer_overflow = !kfifo_put(&mf_cpu->fifo, entry); + if (!buffer_overflow) schedule_work_on(smp_processor_id(), &mf_cpu->work); - else + raw_spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); + put_cpu_var(memory_failure_cpu); + if (buffer_overflow) pr_err("buffer overflow when queuing memory failure at %#lx\n", pfn); - spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); - put_cpu_var(memory_failure_cpu); } EXPORT_SYMBOL_GPL(memory_failure_queue); @@ -2469,9 +2471,9 @@ static void memory_failure_work_func(struct work_struct *work) mf_cpu = container_of(work, struct memory_failure_cpu, work); for (;;) { - spin_lock_irqsave(&mf_cpu->lock, proc_flags); + raw_spin_lock_irqsave(&mf_cpu->lock, proc_flags); gotten = kfifo_get(&mf_cpu->fifo, &entry); - spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); + raw_spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); if (!gotten) break; if (entry.flags & MF_SOFT_OFFLINE) @@ -2501,7 +2503,7 @@ static int __init memory_failure_init(void) for_each_possible_cpu(cpu) { mf_cpu = &per_cpu(memory_failure_cpu, cpu); - spin_lock_init(&mf_cpu->lock); + raw_spin_lock_init(&mf_cpu->lock); INIT_KFIFO(mf_cpu->fifo); INIT_WORK(&mf_cpu->work, memory_failure_work_func); } From 61ebe5a747da649057c37be1c37eb934b4af79ca Mon Sep 17 00:00:00 2001 From: Hailong Liu Date: Thu, 8 Aug 2024 20:19:56 +0800 Subject: [PATCH 401/991] mm/vmalloc: fix page mapping if vm_area_alloc_pages() with high order fallback to order 0 The __vmap_pages_range_noflush() assumes its argument pages** contains pages with the same page shift. However, since commit e9c3cda4d86e ("mm, vmalloc: fix high order __GFP_NOFAIL allocations"), if gfp_flags includes __GFP_NOFAIL with high order in vm_area_alloc_pages() and page allocation failed for high order, the pages** may contain two different page shifts (high order and order-0). This could lead __vmap_pages_range_noflush() to perform incorrect mappings, potentially resulting in memory corruption. Users might encounter this as follows (vmap_allow_huge = true, 2M is for PMD_SIZE): kvmalloc(2M, __GFP_NOFAIL|GFP_X) __vmalloc_node_range_noprof(vm_flags=VM_ALLOW_HUGE_VMAP) vm_area_alloc_pages(order=9) ---> order-9 allocation failed and fallback to order-0 vmap_pages_range() vmap_pages_range_noflush() __vmap_pages_range_noflush(page_shift = 21) ----> wrong mapping happens We can remove the fallback code because if a high-order allocation fails, __vmalloc_node_range_noprof() will retry with order-0. Therefore, it is unnecessary to fallback to order-0 here. Therefore, fix this by removing the fallback code. Link: https://lkml.kernel.org/r/20240808122019.3361-1-hailong.liu@oppo.com Fixes: e9c3cda4d86e ("mm, vmalloc: fix high order __GFP_NOFAIL allocations") Signed-off-by: Hailong Liu Reported-by: Tangquan Zheng Reviewed-by: Baoquan He Reviewed-by: Uladzislau Rezki (Sony) Acked-by: Barry Song Acked-by: Michal Hocko Cc: Matthew Wilcox Cc: Signed-off-by: Andrew Morton --- mm/vmalloc.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6b783baf12a143..af2de36549d608 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3584,15 +3584,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid, page = alloc_pages_noprof(alloc_gfp, order); else page = alloc_pages_node_noprof(nid, alloc_gfp, order); - if (unlikely(!page)) { - if (!nofail) - break; - - /* fall back to the zero order allocations */ - alloc_gfp |= __GFP_NOFAIL; - order = 0; - continue; - } + if (unlikely(!page)) + break; /* * Higher order allocations must be able to be treated as From 40b760cfd44566bca791c80e0720d70d75382b84 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 9 Aug 2024 10:59:04 -0400 Subject: [PATCH 402/991] mm/numa: no task_numa_fault() call if PTE is changed When handling a numa page fault, task_numa_fault() should be called by a process that restores the page table of the faulted folio to avoid duplicated stats counting. Commit b99a342d4f11 ("NUMA balancing: reduce TLB flush via delaying mapping on hint page fault") restructured do_numa_page() and did not avoid task_numa_fault() call in the second page table check after a numa migration failure. Fix it by making all !pte_same() return immediately. This issue can cause task_numa_fault() being called more than necessary and lead to unexpected numa balancing results (It is hard to tell whether the issue will cause positive or negative performance impact due to duplicated numa fault counting). Link: https://lkml.kernel.org/r/20240809145906.1513458-2-ziy@nvidia.com Fixes: b99a342d4f11 ("NUMA balancing: reduce TLB flush via delaying mapping on hint page fault") Signed-off-by: Zi Yan Reported-by: "Huang, Ying" Closes: https://lore.kernel.org/linux-mm/87zfqfw0yw.fsf@yhuang6-desk2.ccr.corp.intel.com/ Acked-by: David Hildenbrand Cc: Baolin Wang Cc: Kefeng Wang Cc: Mel Gorman Cc: Yang Shi Cc: Signed-off-by: Andrew Morton --- mm/memory.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 34f8402d2046f3..3c01d68065be21 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5295,7 +5295,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) if (unlikely(!pte_same(old_pte, vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); - goto out; + return 0; } pte = pte_modify(old_pte, vma->vm_page_prot); @@ -5358,23 +5358,19 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) if (!migrate_misplaced_folio(folio, vma, target_nid)) { nid = target_nid; flags |= TNF_MIGRATED; - } else { - flags |= TNF_MIGRATE_FAIL; - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, - vmf->address, &vmf->ptl); - if (unlikely(!vmf->pte)) - goto out; - if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { - pte_unmap_unlock(vmf->pte, vmf->ptl); - goto out; - } - goto out_map; + task_numa_fault(last_cpupid, nid, nr_pages, flags); + return 0; } -out: - if (nid != NUMA_NO_NODE) - task_numa_fault(last_cpupid, nid, nr_pages, flags); - return 0; + flags |= TNF_MIGRATE_FAIL; + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (unlikely(!vmf->pte)) + return 0; + if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + return 0; + } out_map: /* * Make it present again, depending on how arch implements @@ -5387,7 +5383,10 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte, writable); pte_unmap_unlock(vmf->pte, vmf->ptl); - goto out; + + if (nid != NUMA_NO_NODE) + task_numa_fault(last_cpupid, nid, nr_pages, flags); + return 0; } static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) From fd8c35a92910f4829b7c99841f39b1b952c259d5 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 9 Aug 2024 10:59:05 -0400 Subject: [PATCH 403/991] mm/numa: no task_numa_fault() call if PMD is changed When handling a numa page fault, task_numa_fault() should be called by a process that restores the page table of the faulted folio to avoid duplicated stats counting. Commit c5b5a3dd2c1f ("mm: thp: refactor NUMA fault handling") restructured do_huge_pmd_numa_page() and did not avoid task_numa_fault() call in the second page table check after a numa migration failure. Fix it by making all !pmd_same() return immediately. This issue can cause task_numa_fault() being called more than necessary and lead to unexpected numa balancing results (It is hard to tell whether the issue will cause positive or negative performance impact due to duplicated numa fault counting). Link: https://lkml.kernel.org/r/20240809145906.1513458-3-ziy@nvidia.com Fixes: c5b5a3dd2c1f ("mm: thp: refactor NUMA fault handling") Reported-by: "Huang, Ying" Closes: https://lore.kernel.org/linux-mm/87zfqfw0yw.fsf@yhuang6-desk2.ccr.corp.intel.com/ Signed-off-by: Zi Yan Acked-by: David Hildenbrand Cc: Baolin Wang Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Mel Gorman Cc: Yang Shi Cc: Signed-off-by: Andrew Morton --- mm/huge_memory.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f4be468e06a49a..67c86a5d64a6a9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1685,7 +1685,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) { spin_unlock(vmf->ptl); - goto out; + return 0; } pmd = pmd_modify(oldpmd, vma->vm_page_prot); @@ -1728,22 +1728,16 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) if (!migrate_misplaced_folio(folio, vma, target_nid)) { flags |= TNF_MIGRATED; nid = target_nid; - } else { - flags |= TNF_MIGRATE_FAIL; - vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); - if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) { - spin_unlock(vmf->ptl); - goto out; - } - goto out_map; - } - -out: - if (nid != NUMA_NO_NODE) task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags); + return 0; + } - return 0; - + flags |= TNF_MIGRATE_FAIL; + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) { + spin_unlock(vmf->ptl); + return 0; + } out_map: /* Restore the PMD */ pmd = pmd_modify(oldpmd, vma->vm_page_prot); @@ -1753,7 +1747,10 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); spin_unlock(vmf->ptl); - goto out; + + if (nid != NUMA_NO_NODE) + task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags); + return 0; } /* From af3b7d09a9934220a8136065a0e6985fe0b67a1b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 9 Aug 2024 15:32:30 +0300 Subject: [PATCH 404/991] selftests/mm: compaction_test: fix off by one in check_compaction() The "initial_nr_hugepages" variable is unsigned long so it takes up to 20 characters to print, plus 1 more character for the NUL terminator. Unfortunately, this buffer is not quite large enough for the terminator to fit. Also use snprintf() for a belt and suspenders approach. Link: https://lkml.kernel.org/r/87470c06-b45a-4e83-92ff-aac2e7b9c6ba@stanley.mountain Fixes: fb9293b6b015 ("selftests/mm: compaction_test: fix bogus test success and reduce probability of OOM-killer invocation") Signed-off-by: Dan Carpenter Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/compaction_test.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/mm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c index e140558e6f53f8..2c3a0eb6b22d31 100644 --- a/tools/testing/selftests/mm/compaction_test.c +++ b/tools/testing/selftests/mm/compaction_test.c @@ -89,9 +89,10 @@ int check_compaction(unsigned long mem_free, unsigned long hugepage_size, int fd, ret = -1; int compaction_index = 0; char nr_hugepages[20] = {0}; - char init_nr_hugepages[20] = {0}; + char init_nr_hugepages[24] = {0}; - sprintf(init_nr_hugepages, "%lu", initial_nr_hugepages); + snprintf(init_nr_hugepages, sizeof(init_nr_hugepages), + "%lu", initial_nr_hugepages); /* We want to test with 80% of available memory. Else, OOM killer comes in to play */ From 807174a93d24c456503692dc3f5af322ee0b640a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 9 Aug 2024 14:48:47 +0300 Subject: [PATCH 405/991] mm: fix endless reclaim on machines with unaccepted memory Unaccepted memory is considered unusable free memory, which is not counted as free on the zone watermark check. This causes get_page_from_freelist() to accept more memory to hit the high watermark, but it creates problems in the reclaim path. The reclaim path encounters a failed zone watermark check and attempts to reclaim memory. This is usually successful, but if there is little or no reclaimable memory, it can result in endless reclaim with little to no progress. This can occur early in the boot process, just after start of the init process when the only reclaimable memory is the page cache of the init executable and its libraries. Make unaccepted memory free from watermark check point of view. This way unaccepted memory will never be the trigger of memory reclaim. Accept more memory in the get_page_from_freelist() if needed. Link: https://lkml.kernel.org/r/20240809114854.3745464-2-kirill.shutemov@linux.intel.com Fixes: dcdfdd40fa82 ("mm: Add support for unaccepted memory") Signed-off-by: Kirill A. Shutemov Reported-by: Jianxiong Gao Acked-by: David Hildenbrand Tested-by: Jianxiong Gao Cc: Borislav Petkov Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Mel Gorman Cc: Mike Rapoport (Microsoft) Cc: Tom Lendacky Cc: Vlastimil Babka Cc: [6.5+] Signed-off-by: Andrew Morton --- mm/page_alloc.c | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 875d76e8684ac4..8747087acee3b3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -287,7 +287,7 @@ EXPORT_SYMBOL(nr_online_nodes); static bool page_contains_unaccepted(struct page *page, unsigned int order); static void accept_page(struct page *page, unsigned int order); -static bool try_to_accept_memory(struct zone *zone, unsigned int order); +static bool cond_accept_memory(struct zone *zone, unsigned int order); static inline bool has_unaccepted_memory(void); static bool __free_unaccepted(struct page *page); @@ -3072,9 +3072,6 @@ static inline long __zone_watermark_unusable_free(struct zone *z, if (!(alloc_flags & ALLOC_CMA)) unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES); #endif -#ifdef CONFIG_UNACCEPTED_MEMORY - unusable_free += zone_page_state(z, NR_UNACCEPTED); -#endif return unusable_free; } @@ -3368,6 +3365,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, } } + cond_accept_memory(zone, order); + /* * Detect whether the number of free pages is below high * watermark. If so, we will decrease pcp->high and free @@ -3393,10 +3392,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, gfp_mask)) { int ret; - if (has_unaccepted_memory()) { - if (try_to_accept_memory(zone, order)) - goto try_this_zone; - } + if (cond_accept_memory(zone, order)) + goto try_this_zone; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* @@ -3450,10 +3447,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, return page; } else { - if (has_unaccepted_memory()) { - if (try_to_accept_memory(zone, order)) - goto try_this_zone; - } + if (cond_accept_memory(zone, order)) + goto try_this_zone; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* Try again if zone has deferred pages */ @@ -6950,9 +6945,6 @@ static bool try_to_accept_memory_one(struct zone *zone) struct page *page; bool last; - if (list_empty(&zone->unaccepted_pages)) - return false; - spin_lock_irqsave(&zone->lock, flags); page = list_first_entry_or_null(&zone->unaccepted_pages, struct page, lru); @@ -6978,23 +6970,29 @@ static bool try_to_accept_memory_one(struct zone *zone) return true; } -static bool try_to_accept_memory(struct zone *zone, unsigned int order) +static bool cond_accept_memory(struct zone *zone, unsigned int order) { long to_accept; - int ret = false; + bool ret = false; + + if (!has_unaccepted_memory()) + return false; + + if (list_empty(&zone->unaccepted_pages)) + return false; /* How much to accept to get to high watermark? */ to_accept = high_wmark_pages(zone) - (zone_page_state(zone, NR_FREE_PAGES) - - __zone_watermark_unusable_free(zone, order, 0)); + __zone_watermark_unusable_free(zone, order, 0) - + zone_page_state(zone, NR_UNACCEPTED)); - /* Accept at least one page */ - do { + while (to_accept > 0) { if (!try_to_accept_memory_one(zone)) break; ret = true; to_accept -= MAX_ORDER_NR_PAGES; - } while (to_accept > 0); + } return ret; } @@ -7037,7 +7035,7 @@ static void accept_page(struct page *page, unsigned int order) { } -static bool try_to_accept_memory(struct zone *zone, unsigned int order) +static bool cond_accept_memory(struct zone *zone, unsigned int order) { return false; } From 7c5e8d212d7d81991a580e7de3904ea213d9a852 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 9 Aug 2024 12:56:42 +0500 Subject: [PATCH 406/991] selftests: memfd_secret: don't build memfd_secret test on unsupported arches [1] mentions that memfd_secret is only supported on arm64, riscv, x86 and x86_64 for now. It doesn't support other architectures. I found the build error on arm and decided to send the fix as it was creating noise on KernelCI: memfd_secret.c: In function 'memfd_secret': memfd_secret.c:42:24: error: '__NR_memfd_secret' undeclared (first use in this function); did you mean 'memfd_secret'? 42 | return syscall(__NR_memfd_secret, flags); | ^~~~~~~~~~~~~~~~~ | memfd_secret Hence I'm adding condition that memfd_secret should only be compiled on supported architectures. Also check in run_vmtests script if memfd_secret binary is present before executing it. Link: https://lkml.kernel.org/r/20240812061522.1933054-1-usama.anjum@collabora.com Link: https://lore.kernel.org/all/20210518072034.31572-7-rppt@kernel.org/ [1] Link: https://lkml.kernel.org/r/20240809075642.403247-1-usama.anjum@collabora.com Fixes: 76fe17ef588a ("secretmem: test: add basic selftest for memfd_secret(2)") Signed-off-by: Muhammad Usama Anjum Reviewed-by: Shuah Khan Acked-by: Mike Rapoport (Microsoft) Cc: Albert Ou Cc: James Bottomley Cc: Mike Rapoport (Microsoft) Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 2 ++ tools/testing/selftests/mm/run_vmtests.sh | 3 +++ 2 files changed, 5 insertions(+) diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 7b8a5def54a1ab..cfad627e8d94de 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -53,7 +53,9 @@ TEST_GEN_FILES += madv_populate TEST_GEN_FILES += map_fixed_noreplace TEST_GEN_FILES += map_hugetlb TEST_GEN_FILES += map_populate +ifneq (,$(filter $(ARCH),arm64 riscv riscv64 x86 x86_64)) TEST_GEN_FILES += memfd_secret +endif TEST_GEN_FILES += migration TEST_GEN_FILES += mkdirty TEST_GEN_FILES += mlock-random-test diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 03ac4f2e1cce65..36045edb10dea0 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -374,8 +374,11 @@ CATEGORY="hmm" run_test bash ./test_hmm.sh smoke # MADV_POPULATE_READ and MADV_POPULATE_WRITE tests CATEGORY="madv_populate" run_test ./madv_populate +if [ -x ./memfd_secret ] +then (echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope 2>&1) | tap_prefix CATEGORY="memfd_secret" run_test ./memfd_secret +fi # KSM KSM_MERGE_TIME_HUGE_PAGES test with size of 100 CATEGORY="ksm" run_test ./ksm_tests -H -s 100 From edb907a6133323e19311901a39dee68b1c6a2ef8 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Mon, 12 Aug 2024 14:20:17 +0800 Subject: [PATCH 407/991] crash: fix riscv64 crash memory reserve dead loop On RISCV64 Qemu machine with 512MB memory, cmdline "crashkernel=500M,high" will cause system stall as below: Zone ranges: DMA32 [mem 0x0000000080000000-0x000000009fffffff] Normal empty Movable zone start for each node Early memory node ranges node 0: [mem 0x0000000080000000-0x000000008005ffff] node 0: [mem 0x0000000080060000-0x000000009fffffff] Initmem setup node 0 [mem 0x0000000080000000-0x000000009fffffff] (stall here) commit 5d99cadf1568 ("crash: fix x86_32 crash memory reserve dead loop bug") fix this on 32-bit architecture. However, the problem is not completely solved. If `CRASH_ADDR_LOW_MAX = CRASH_ADDR_HIGH_MAX` on 64-bit architecture, for example, when system memory is equal to CRASH_ADDR_LOW_MAX on RISCV64, the following infinite loop will also occur: -> reserve_crashkernel_generic() and high is true -> alloc at [CRASH_ADDR_LOW_MAX, CRASH_ADDR_HIGH_MAX] fail -> alloc at [0, CRASH_ADDR_LOW_MAX] fail and repeatedly (because CRASH_ADDR_LOW_MAX = CRASH_ADDR_HIGH_MAX). As Catalin suggested, do not remove the ",high" reservation fallback to ",low" logic which will change arm64's kdump behavior, but fix it by skipping the above situation similar to commit d2f32f23190b ("crash: fix x86_32 crash memory reserve dead loop"). After this patch, it print: cannot allocate crashkernel (size:0x1f400000) Link: https://lkml.kernel.org/r/20240812062017.2674441-1-ruanjinjie@huawei.com Signed-off-by: Jinjie Ruan Suggested-by: Catalin Marinas Reviewed-by: Catalin Marinas Acked-by: Baoquan He Cc: Albert Ou Cc: Dave Young Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Vivek Goyal Signed-off-by: Andrew Morton --- kernel/crash_reserve.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index d3b4cd12bdd134..64d44a52c01147 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -423,7 +423,8 @@ void __init reserve_crashkernel_generic(char *cmdline, if (high && search_end == CRASH_ADDR_HIGH_MAX) { search_end = CRASH_ADDR_LOW_MAX; search_base = 0; - goto retry; + if (search_end != CRASH_ADDR_HIGH_MAX) + goto retry; } pr_warn("cannot allocate crashkernel (size:0x%llx)\n", crash_size); From a8fc28dad6d574582cdf2f7e78c73c59c623df30 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 13 Aug 2024 08:07:56 -0700 Subject: [PATCH 408/991] alloc_tag: introduce clear_page_tag_ref() helper function In several cases we are freeing pages which were not allocated using common page allocators. For such cases, in order to keep allocation accounting correct, we should clear the page tag to indicate that the page being freed is expected to not have a valid allocation tag. Introduce clear_page_tag_ref() helper function to be used for this. Link: https://lkml.kernel.org/r/20240813150758.855881-1-surenb@google.com Fixes: d224eb0287fb ("codetag: debug: mark codetags for reserved pages as empty") Signed-off-by: Suren Baghdasaryan Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Reviewed-by: Pasha Tatashin Cc: Kees Cook Cc: Kent Overstreet Cc: Sourav Panda Cc: Vlastimil Babka Cc: [6.10] Signed-off-by: Andrew Morton --- include/linux/pgalloc_tag.h | 13 +++++++++++++ mm/mm_init.c | 10 +--------- mm/page_alloc.c | 9 +-------- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 18cd0c0c73d93d..207f0c83c8e975 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -43,6 +43,18 @@ static inline void put_page_tag_ref(union codetag_ref *ref) page_ext_put(page_ext_from_codetag_ref(ref)); } +static inline void clear_page_tag_ref(struct page *page) +{ + if (mem_alloc_profiling_enabled()) { + union codetag_ref *ref = get_page_tag_ref(page); + + if (ref) { + set_codetag_empty(ref); + put_page_tag_ref(ref); + } + } +} + static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, unsigned int nr) { @@ -126,6 +138,7 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) static inline union codetag_ref *get_page_tag_ref(struct page *page) { return NULL; } static inline void put_page_tag_ref(union codetag_ref *ref) {} +static inline void clear_page_tag_ref(struct page *page) {} static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, unsigned int nr) {} static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} diff --git a/mm/mm_init.c b/mm/mm_init.c index f9a60ffc553206..adc3127573cd45 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2459,15 +2459,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, } /* pages were reserved and not allocated */ - if (mem_alloc_profiling_enabled()) { - union codetag_ref *ref = get_page_tag_ref(page); - - if (ref) { - set_codetag_empty(ref); - put_page_tag_ref(ref); - } - } - + clear_page_tag_ref(page); __free_pages_core(page, order, MEMINIT_EARLY); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8747087acee3b3..c565de8f48e9d8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5815,14 +5815,7 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char void free_reserved_page(struct page *page) { - if (mem_alloc_profiling_enabled()) { - union codetag_ref *ref = get_page_tag_ref(page); - - if (ref) { - set_codetag_empty(ref); - put_page_tag_ref(ref); - } - } + clear_page_tag_ref(page); ClearPageReserved(page); init_page_count(page); __free_page(page); From 766c163c2068b45330664fb67df67268e588a22d Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 13 Aug 2024 08:07:57 -0700 Subject: [PATCH 409/991] alloc_tag: mark pages reserved during CMA activation as not tagged During CMA activation, pages in CMA area are prepared and then freed without being allocated. This triggers warnings when memory allocation debug config (CONFIG_MEM_ALLOC_PROFILING_DEBUG) is enabled. Fix this by marking these pages not tagged before freeing them. Link: https://lkml.kernel.org/r/20240813150758.855881-2-surenb@google.com Fixes: d224eb0287fb ("codetag: debug: mark codetags for reserved pages as empty") Signed-off-by: Suren Baghdasaryan Acked-by: David Hildenbrand Cc: Kees Cook Cc: Kent Overstreet Cc: Pasha Tatashin Cc: Sourav Panda Cc: Vlastimil Babka Cc: [6.10] Signed-off-by: Andrew Morton --- mm/mm_init.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/mm_init.c b/mm/mm_init.c index adc3127573cd45..51960079875baa 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2244,6 +2244,8 @@ void __init init_cma_reserved_pageblock(struct page *page) set_pageblock_migratetype(page, MIGRATE_CMA); set_page_refcounted(page); + /* pages were reserved and not allocated */ + clear_page_tag_ref(page); __free_pages(page, pageblock_order); adjust_managed_page_count(page, pageblock_nr_pages); From 2e6506e1c4eed2676a8412231046f31e10e240da Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 29 Jul 2024 10:13:06 +0800 Subject: [PATCH 410/991] mm/migrate: fix deadlock in migrate_pages_batch() on large folios Currently, migrate_pages_batch() can lock multiple locked folios with an arbitrary order. Although folio_trylock() is used to avoid deadlock as commit 2ef7dbb26990 ("migrate_pages: try migrate in batch asynchronously firstly") mentioned, it seems try_split_folio() is still missing. It was found by compaction stress test when I explicitly enable EROFS compressed files to use large folios, which case I cannot reproduce with the same workload if large folio support is off (current mainline). Typically, filesystem reads (with locked file-backed folios) could use another bdev/meta inode to load some other I/Os (e.g. inode extent metadata or caching compressed data), so the locking order will be: file-backed folios (A) bdev/meta folios (B) The following calltrace shows the deadlock: Thread 1 takes (B) lock and tries to take folio (A) lock Thread 2 takes (A) lock and tries to take folio (B) lock [Thread 1] INFO: task stress:1824 blocked for more than 30 seconds. Tainted: G OE 6.10.0-rc7+ #6 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:stress state:D stack:0 pid:1824 tgid:1824 ppid:1822 flags:0x0000000c Call trace: __switch_to+0xec/0x138 __schedule+0x43c/0xcb0 schedule+0x54/0x198 io_schedule+0x44/0x70 folio_wait_bit_common+0x184/0x3f8 <-- folio mapping ffff00036d69cb18 index 996 (**) __folio_lock+0x24/0x38 migrate_pages_batch+0x77c/0xea0 // try_split_folio (mm/migrate.c:1486:2) // migrate_pages_batch (mm/migrate.c:1734:16) <--- LIST_HEAD(unmap_folios) has .. folio mapping 0xffff0000d184f1d8 index 1711; (*) folio mapping 0xffff0000d184f1d8 index 1712; .. migrate_pages+0xb28/0xe90 compact_zone+0xa08/0x10f0 compact_node+0x9c/0x180 sysctl_compaction_handler+0x8c/0x118 proc_sys_call_handler+0x1a8/0x280 proc_sys_write+0x1c/0x30 vfs_write+0x240/0x380 ksys_write+0x78/0x118 __arm64_sys_write+0x24/0x38 invoke_syscall+0x78/0x108 el0_svc_common.constprop.0+0x48/0xf0 do_el0_svc+0x24/0x38 el0_svc+0x3c/0x148 el0t_64_sync_handler+0x100/0x130 el0t_64_sync+0x190/0x198 [Thread 2] INFO: task stress:1825 blocked for more than 30 seconds. Tainted: G OE 6.10.0-rc7+ #6 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:stress state:D stack:0 pid:1825 tgid:1825 ppid:1822 flags:0x0000000c Call trace: __switch_to+0xec/0x138 __schedule+0x43c/0xcb0 schedule+0x54/0x198 io_schedule+0x44/0x70 folio_wait_bit_common+0x184/0x3f8 <-- folio = 0xfffffdffc6b503c0 (mapping == 0xffff0000d184f1d8 index == 1711) (*) __folio_lock+0x24/0x38 z_erofs_runqueue+0x384/0x9c0 [erofs] z_erofs_readahead+0x21c/0x350 [erofs] <-- folio mapping 0xffff00036d69cb18 range from [992, 1024] (**) read_pages+0x74/0x328 page_cache_ra_order+0x26c/0x348 ondemand_readahead+0x1c0/0x3a0 page_cache_sync_ra+0x9c/0xc0 filemap_get_pages+0xc4/0x708 filemap_read+0x104/0x3a8 generic_file_read_iter+0x4c/0x150 vfs_read+0x27c/0x330 ksys_pread64+0x84/0xd0 __arm64_sys_pread64+0x28/0x40 invoke_syscall+0x78/0x108 el0_svc_common.constprop.0+0x48/0xf0 do_el0_svc+0x24/0x38 el0_svc+0x3c/0x148 el0t_64_sync_handler+0x100/0x130 el0t_64_sync+0x190/0x198 Link: https://lkml.kernel.org/r/20240729021306.398286-1-hsiangkao@linux.alibaba.com Fixes: 5dfab109d519 ("migrate_pages: batch _unmap and _move") Signed-off-by: Gao Xiang Reviewed-by: "Huang, Ying" Acked-by: David Hildenbrand Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/migrate.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index e7296c0fb5d57a..923ea80ba7442b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1479,11 +1479,17 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, return rc; } -static inline int try_split_folio(struct folio *folio, struct list_head *split_folios) +static inline int try_split_folio(struct folio *folio, struct list_head *split_folios, + enum migrate_mode mode) { int rc; - folio_lock(folio); + if (mode == MIGRATE_ASYNC) { + if (!folio_trylock(folio)) + return -EAGAIN; + } else { + folio_lock(folio); + } rc = split_folio_to_list(folio, split_folios); folio_unlock(folio); if (!rc) @@ -1677,7 +1683,7 @@ static int migrate_pages_batch(struct list_head *from, */ if (nr_pages > 2 && !list_empty(&folio->_deferred_list)) { - if (try_split_folio(folio, split_folios) == 0) { + if (!try_split_folio(folio, split_folios, mode)) { nr_failed++; stats->nr_thp_failed += is_thp; stats->nr_thp_split += is_thp; @@ -1699,7 +1705,7 @@ static int migrate_pages_batch(struct list_head *from, if (!thp_migration_supported() && is_thp) { nr_failed++; stats->nr_thp_failed++; - if (!try_split_folio(folio, split_folios)) { + if (!try_split_folio(folio, split_folios, mode)) { stats->nr_thp_split++; stats->nr_split++; continue; @@ -1731,7 +1737,7 @@ static int migrate_pages_batch(struct list_head *from, stats->nr_thp_failed += is_thp; /* Large folio NUMA faulting doesn't split to retry. */ if (is_large && !nosplit) { - int ret = try_split_folio(folio, split_folios); + int ret = try_split_folio(folio, split_folios, mode); if (!ret) { stats->nr_thp_split += is_thp; From ad899c301c880766cc709aad277991b3ab671b66 Mon Sep 17 00:00:00 2001 From: Eli Billauer Date: Fri, 16 Aug 2024 10:01:59 +0300 Subject: [PATCH 411/991] char: xillybus: Refine workqueue handling As the wakeup work item now runs on a separate workqueue, it needs to be flushed separately along with flushing the device's workqueue. Also, move the destroy_workqueue() call to the end of the exit method, so that deinitialization is done in the opposite order of initialization. Fixes: ccbde4b128ef ("char: xillybus: Don't destroy workqueue from work item running on it") Cc: stable Signed-off-by: Eli Billauer Link: https://lore.kernel.org/r/20240816070200.50695-1-eli.billauer@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/char/xillybus/xillyusb.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/char/xillybus/xillyusb.c b/drivers/char/xillybus/xillyusb.c index 33ca0f4af3901e..e12d359194f89d 100644 --- a/drivers/char/xillybus/xillyusb.c +++ b/drivers/char/xillybus/xillyusb.c @@ -2093,9 +2093,11 @@ static int xillyusb_discovery(struct usb_interface *interface) * just after responding with the IDT, there is no reason for any * work item to be running now. To be sure that xdev->channels * is updated on anything that might run in parallel, flush the - * workqueue, which rarely does anything. + * device's workqueue and the wakeup work item. This rarely + * does anything. */ flush_workqueue(xdev->workq); + flush_work(&xdev->wakeup_workitem); xdev->num_channels = num_channels; @@ -2274,9 +2276,9 @@ static int __init xillyusb_init(void) static void __exit xillyusb_exit(void) { - destroy_workqueue(wakeup_wq); - usb_deregister(&xillyusb_driver); + + destroy_workqueue(wakeup_wq); } module_init(xillyusb_init); From 2374bf7558de915edc6ec8cb10ec3291dfab9594 Mon Sep 17 00:00:00 2001 From: Eli Billauer Date: Fri, 16 Aug 2024 10:02:00 +0300 Subject: [PATCH 412/991] char: xillybus: Check USB endpoints when probing device Ensure, as the driver probes the device, that all endpoints that the driver may attempt to access exist and are of the correct type. All XillyUSB devices must have a Bulk IN and Bulk OUT endpoint at address 1. This is verified in xillyusb_setup_base_eps(). On top of that, a XillyUSB device may have additional Bulk OUT endpoints. The information about these endpoints' addresses is deduced from a data structure (the IDT) that the driver fetches from the device while probing it. These endpoints are checked in setup_channels(). A XillyUSB device never has more than one IN endpoint, as all data towards the host is multiplexed in this single Bulk IN endpoint. This is why setup_channels() only checks OUT endpoints. Reported-by: syzbot+eac39cba052f2e750dbe@syzkaller.appspotmail.com Cc: stable Closes: https://lore.kernel.org/all/0000000000001d44a6061f7a54ee@google.com/T/ Fixes: a53d1202aef1 ("char: xillybus: Add driver for XillyUSB (Xillybus variant for USB)"). Signed-off-by: Eli Billauer Link: https://lore.kernel.org/r/20240816070200.50695-2-eli.billauer@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/char/xillybus/xillyusb.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/char/xillybus/xillyusb.c b/drivers/char/xillybus/xillyusb.c index e12d359194f89d..45771b1a3716a2 100644 --- a/drivers/char/xillybus/xillyusb.c +++ b/drivers/char/xillybus/xillyusb.c @@ -1903,6 +1903,13 @@ static const struct file_operations xillyusb_fops = { static int xillyusb_setup_base_eps(struct xillyusb_dev *xdev) { + struct usb_device *udev = xdev->udev; + + /* Verify that device has the two fundamental bulk in/out endpoints */ + if (usb_pipe_type_check(udev, usb_sndbulkpipe(udev, MSG_EP_NUM)) || + usb_pipe_type_check(udev, usb_rcvbulkpipe(udev, IN_EP_NUM))) + return -ENODEV; + xdev->msg_ep = endpoint_alloc(xdev, MSG_EP_NUM | USB_DIR_OUT, bulk_out_work, 1, 2); if (!xdev->msg_ep) @@ -1932,14 +1939,15 @@ static int setup_channels(struct xillyusb_dev *xdev, __le16 *chandesc, int num_channels) { - struct xillyusb_channel *chan; + struct usb_device *udev = xdev->udev; + struct xillyusb_channel *chan, *new_channels; int i; chan = kcalloc(num_channels, sizeof(*chan), GFP_KERNEL); if (!chan) return -ENOMEM; - xdev->channels = chan; + new_channels = chan; for (i = 0; i < num_channels; i++, chan++) { unsigned int in_desc = le16_to_cpu(*chandesc++); @@ -1968,6 +1976,15 @@ static int setup_channels(struct xillyusb_dev *xdev, */ if ((out_desc & 0x80) && i < 14) { /* Entry is valid */ + if (usb_pipe_type_check(udev, + usb_sndbulkpipe(udev, i + 2))) { + dev_err(xdev->dev, + "Missing BULK OUT endpoint %d\n", + i + 2); + kfree(new_channels); + return -ENODEV; + } + chan->writable = 1; chan->out_synchronous = !!(out_desc & 0x40); chan->out_seekable = !!(out_desc & 0x20); @@ -1977,6 +1994,7 @@ static int setup_channels(struct xillyusb_dev *xdev, } } + xdev->channels = new_channels; return 0; } From 8d019b15ddd55d6dc5685b1f51902c4aa8e01939 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Aug 2024 03:06:54 +0300 Subject: [PATCH 413/991] selftests: net: local_termination: refactor macvlan creation/deletion This will be used in other subtests as well; make new macvlan_create() and macvlan_destroy() functions. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- .../net/forwarding/local_termination.sh | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index 4b364cdf3ef0cf..36f3d577d0be89 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -247,19 +247,29 @@ bridge_destroy() ip link del br0 } -standalone() +macvlan_create() { - h1_create - h2_create + local lower=$1 - ip link add link $h2 name macvlan0 type macvlan mode private + ip link add link $lower name macvlan0 type macvlan mode private ip link set macvlan0 address $MACVLAN_ADDR ip link set macvlan0 up +} - run_test $h2 - +macvlan_destroy() +{ ip link del macvlan0 +} + +standalone() +{ + h1_create + h2_create + macvlan_create $h2 + + run_test $h2 + macvlan_destroy h2_destroy h1_destroy } @@ -268,15 +278,11 @@ bridge() { h1_create bridge_create - - ip link add link br0 name macvlan0 type macvlan mode private - ip link set macvlan0 address $MACVLAN_ADDR - ip link set macvlan0 up + macvlan_create br0 run_test br0 - ip link del macvlan0 - + macvlan_destroy bridge_destroy h1_destroy } From 4261fa35185c0112acca0496d3732c8fcfe1dcf2 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Aug 2024 03:06:55 +0300 Subject: [PATCH 414/991] selftests: net: local_termination: parameterize sending interface In future changes we will want to subject the DUT, $h2, to additional VLAN-tagged traffic. For that, we need to run the tests using $h1.100 as a sending interface, rather than the currently hardcoded $h1. Add a parameter to run_test() and modify its 2 callers to explicitly pass $h1, as was implicit before. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- .../net/forwarding/local_termination.sh | 39 ++++++++++--------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index 36f3d577d0be89..92f0e242d119da 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -104,44 +104,45 @@ mc_route_destroy() run_test() { - local rcv_if_name=$1 - local smac=$(mac_get $h1) + local send_if_name=$1; shift + local rcv_if_name=$1; shift + local smac=$(mac_get $send_if_name) local rcv_dmac=$(mac_get $rcv_if_name) tcpdump_start $rcv_if_name - mc_route_prepare $h1 + mc_route_prepare $send_if_name mc_route_prepare $rcv_if_name - send_uc_ipv4 $h1 $rcv_dmac - send_uc_ipv4 $h1 $MACVLAN_ADDR - send_uc_ipv4 $h1 $UNKNOWN_UC_ADDR1 + send_uc_ipv4 $send_if_name $rcv_dmac + send_uc_ipv4 $send_if_name $MACVLAN_ADDR + send_uc_ipv4 $send_if_name $UNKNOWN_UC_ADDR1 ip link set dev $rcv_if_name promisc on - send_uc_ipv4 $h1 $UNKNOWN_UC_ADDR2 - mc_send $h1 $UNKNOWN_IPV4_MC_ADDR2 - mc_send $h1 $UNKNOWN_IPV6_MC_ADDR2 + send_uc_ipv4 $send_if_name $UNKNOWN_UC_ADDR2 + mc_send $send_if_name $UNKNOWN_IPV4_MC_ADDR2 + mc_send $send_if_name $UNKNOWN_IPV6_MC_ADDR2 ip link set dev $rcv_if_name promisc off mc_join $rcv_if_name $JOINED_IPV4_MC_ADDR - mc_send $h1 $JOINED_IPV4_MC_ADDR + mc_send $send_if_name $JOINED_IPV4_MC_ADDR mc_leave mc_join $rcv_if_name $JOINED_IPV6_MC_ADDR - mc_send $h1 $JOINED_IPV6_MC_ADDR + mc_send $send_if_name $JOINED_IPV6_MC_ADDR mc_leave - mc_send $h1 $UNKNOWN_IPV4_MC_ADDR1 - mc_send $h1 $UNKNOWN_IPV6_MC_ADDR1 + mc_send $send_if_name $UNKNOWN_IPV4_MC_ADDR1 + mc_send $send_if_name $UNKNOWN_IPV6_MC_ADDR1 ip link set dev $rcv_if_name allmulticast on - send_uc_ipv4 $h1 $UNKNOWN_UC_ADDR3 - mc_send $h1 $UNKNOWN_IPV4_MC_ADDR3 - mc_send $h1 $UNKNOWN_IPV6_MC_ADDR3 + send_uc_ipv4 $send_if_name $UNKNOWN_UC_ADDR3 + mc_send $send_if_name $UNKNOWN_IPV4_MC_ADDR3 + mc_send $send_if_name $UNKNOWN_IPV6_MC_ADDR3 ip link set dev $rcv_if_name allmulticast off mc_route_destroy $rcv_if_name - mc_route_destroy $h1 + mc_route_destroy $send_if_name sleep 1 @@ -267,7 +268,7 @@ standalone() h2_create macvlan_create $h2 - run_test $h2 + run_test $h1 $h2 macvlan_destroy h2_destroy @@ -280,7 +281,7 @@ bridge() bridge_create macvlan_create br0 - run_test br0 + run_test $h1 br0 macvlan_destroy bridge_destroy From df7cf5cc551c7c0a92520e91e1184993784c6386 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Aug 2024 03:06:56 +0300 Subject: [PATCH 415/991] selftests: net: local_termination: parameterize test name There are upcoming tests which verify the RX filtering of a bridge (or bridge port), but under differing vlan_filtering conditions. Since we currently print $h2 (the DUT) in the log_test() output, it becomes necessary to make a further distinction between tests, to not give the user the impression that the exact same thing is run twice. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- .../net/forwarding/local_termination.sh | 38 ++++++++++--------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index 92f0e242d119da..af284edaf4014f 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -68,10 +68,11 @@ send_uc_ipv4() check_rcv() { - local if_name=$1 - local type=$2 - local pattern=$3 - local should_receive=$4 + local if_name=$1; shift + local type=$1; shift + local pattern=$1; shift + local should_receive=$1; shift + local test_name="$1"; shift local should_fail= [ $should_receive = true ] && should_fail=0 || should_fail=1 @@ -81,7 +82,7 @@ check_rcv() check_err_fail "$should_fail" "$?" "reception" - log_test "$if_name: $type" + log_test "$test_name: $type" } mc_route_prepare() @@ -106,6 +107,7 @@ run_test() { local send_if_name=$1; shift local rcv_if_name=$1; shift + local test_name="$1"; shift local smac=$(mac_get $send_if_name) local rcv_dmac=$(mac_get $rcv_if_name) @@ -150,61 +152,61 @@ run_test() check_rcv $rcv_if_name "Unicast IPv4 to primary MAC address" \ "$smac > $rcv_dmac, ethertype IPv4 (0x0800)" \ - true + true "$test_name" check_rcv $rcv_if_name "Unicast IPv4 to macvlan MAC address" \ "$smac > $MACVLAN_ADDR, ethertype IPv4 (0x0800)" \ - true + true "$test_name" xfail_on_veth $h1 \ check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address" \ "$smac > $UNKNOWN_UC_ADDR1, ethertype IPv4 (0x0800)" \ - false + false "$test_name" check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address, promisc" \ "$smac > $UNKNOWN_UC_ADDR2, ethertype IPv4 (0x0800)" \ - true + true "$test_name" xfail_on_veth $h1 \ check_rcv $rcv_if_name \ "Unicast IPv4 to unknown MAC address, allmulti" \ "$smac > $UNKNOWN_UC_ADDR3, ethertype IPv4 (0x0800)" \ - false + false "$test_name" check_rcv $rcv_if_name "Multicast IPv4 to joined group" \ "$smac > $JOINED_MACV4_MC_ADDR, ethertype IPv4 (0x0800)" \ - true + true "$test_name" xfail_on_veth $h1 \ check_rcv $rcv_if_name \ "Multicast IPv4 to unknown group" \ "$smac > $UNKNOWN_MACV4_MC_ADDR1, ethertype IPv4 (0x0800)" \ - false + false "$test_name" check_rcv $rcv_if_name "Multicast IPv4 to unknown group, promisc" \ "$smac > $UNKNOWN_MACV4_MC_ADDR2, ethertype IPv4 (0x0800)" \ - true + true "$test_name" check_rcv $rcv_if_name "Multicast IPv4 to unknown group, allmulti" \ "$smac > $UNKNOWN_MACV4_MC_ADDR3, ethertype IPv4 (0x0800)" \ - true + true "$test_name" check_rcv $rcv_if_name "Multicast IPv6 to joined group" \ "$smac > $JOINED_MACV6_MC_ADDR, ethertype IPv6 (0x86dd)" \ - true + true "$test_name" xfail_on_veth $h1 \ check_rcv $rcv_if_name "Multicast IPv6 to unknown group" \ "$smac > $UNKNOWN_MACV6_MC_ADDR1, ethertype IPv6 (0x86dd)" \ - false + false "$test_name" check_rcv $rcv_if_name "Multicast IPv6 to unknown group, promisc" \ "$smac > $UNKNOWN_MACV6_MC_ADDR2, ethertype IPv6 (0x86dd)" \ - true + true "$test_name" check_rcv $rcv_if_name "Multicast IPv6 to unknown group, allmulti" \ "$smac > $UNKNOWN_MACV6_MC_ADDR3, ethertype IPv6 (0x86dd)" \ - true + true "$test_name" tcpdump_cleanup $rcv_if_name } From 5b8e74182ed3d4f1c38c626e6120275ca9d92bee Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Aug 2024 03:06:57 +0300 Subject: [PATCH 416/991] selftests: net: local_termination: add one more test for VLAN-aware bridges The current bridge() test is for packet reception on a VLAN-unaware bridge. Some things are different enough with VLAN-aware bridges that it's worth renaming this test into vlan_unaware_bridge(), and add a new vlan_aware_bridge() test. The two will share the same implementation: bridge() becomes a common function, which receives $vlan_filtering as an argument. Rename it to test_bridge() at the same time, because just bridge() pollutes the global namespace and we cannot invoke the binary with the same name from the iproute2 package currently. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- .../net/forwarding/local_termination.sh | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index af284edaf4014f..5aa364b40e335f 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -1,7 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -ALL_TESTS="standalone bridge" +ALL_TESTS="standalone vlan_unaware_bridge vlan_aware_bridge" NUM_NETIFS=2 PING_COUNT=1 REQUIRE_MTOOLS=yes @@ -233,7 +233,9 @@ h2_destroy() bridge_create() { - ip link add br0 type bridge + local vlan_filtering=$1 + + ip link add br0 type bridge vlan_filtering $vlan_filtering ip link set br0 address $BRIDGE_ADDR ip link set br0 up @@ -277,10 +279,12 @@ standalone() h1_destroy } -bridge() +test_bridge() { + local vlan_filtering=$1 + h1_create - bridge_create + bridge_create $vlan_filtering macvlan_create br0 run_test $h1 br0 @@ -290,6 +294,16 @@ bridge() h1_destroy } +vlan_unaware_bridge() +{ + test_bridge 0 +} + +vlan_aware_bridge() +{ + test_bridge 1 +} + cleanup() { pre_cleanup From 5fea8bb009744bbb90b3f6ca41c558429ee4c849 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Aug 2024 03:06:58 +0300 Subject: [PATCH 417/991] selftests: net: local_termination: introduce new tests which capture VLAN behavior Add more coverage to the local termination selftest as follows: - 8021q upper of $h2 - 8021q upper of $h2, where $h2 is a port of a VLAN-unaware bridge - 8021q upper of $h2, where $h2 is a port of a VLAN-aware bridge - 8021q upper of VLAN-unaware br0, which is the upper of $h2 - 8021q upper of VLAN-aware br0, which is the upper of $h2 Especially the cases with traffic sent through the VLAN upper of a VLAN-aware bridge port will be immediately relevant when we will start transmitting PTP packets as an additional kind of traffic. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- .../net/forwarding/local_termination.sh | 117 ++++++++++++++++-- 1 file changed, 110 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index 5aa364b40e335f..e22c6a693bef2d 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -1,7 +1,9 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -ALL_TESTS="standalone vlan_unaware_bridge vlan_aware_bridge" +ALL_TESTS="standalone vlan_unaware_bridge vlan_aware_bridge test_vlan \ + vlan_over_vlan_unaware_bridged_port vlan_over_vlan_aware_bridged_port \ + vlan_over_vlan_unaware_bridge vlan_over_vlan_aware_bridge" NUM_NETIFS=2 PING_COUNT=1 REQUIRE_MTOOLS=yes @@ -231,6 +233,30 @@ h2_destroy() simple_if_fini $h2 $H2_IPV4/24 $H2_IPV6/64 } +h1_vlan_create() +{ + simple_if_init $h1 + vlan_create $h1 100 v$h1 $H1_IPV4/24 $H1_IPV6/64 +} + +h1_vlan_destroy() +{ + vlan_destroy $h1 100 + simple_if_fini $h1 +} + +h2_vlan_create() +{ + simple_if_init $h2 + vlan_create $h2 100 v$h2 $H2_IPV4/24 $H2_IPV6/64 +} + +h2_vlan_destroy() +{ + vlan_destroy $h2 100 + simple_if_fini $h2 +} + bridge_create() { local vlan_filtering=$1 @@ -241,14 +267,10 @@ bridge_create() ip link set $h2 master br0 ip link set $h2 up - - simple_if_init br0 $H2_IPV4/24 $H2_IPV6/64 } bridge_destroy() { - simple_if_fini br0 $H2_IPV4/24 $H2_IPV6/64 - ip link del br0 } @@ -272,7 +294,7 @@ standalone() h2_create macvlan_create $h2 - run_test $h1 $h2 + run_test $h1 $h2 "$h2" macvlan_destroy h2_destroy @@ -285,11 +307,13 @@ test_bridge() h1_create bridge_create $vlan_filtering + simple_if_init br0 $H2_IPV4/24 $H2_IPV6/64 macvlan_create br0 - run_test $h1 br0 + run_test $h1 br0 "vlan_filtering=$vlan_filtering bridge" macvlan_destroy + simple_if_fini br0 $H2_IPV4/24 $H2_IPV6/64 bridge_destroy h1_destroy } @@ -304,6 +328,85 @@ vlan_aware_bridge() test_bridge 1 } +test_vlan() +{ + h1_vlan_create + h2_vlan_create + macvlan_create $h2.100 + + run_test $h1.100 $h2.100 "VLAN upper" + + macvlan_destroy + h2_vlan_destroy + h1_vlan_destroy +} + +vlan_over_bridged_port() +{ + local vlan_filtering=$1 + + h1_vlan_create + h2_vlan_create + bridge_create $vlan_filtering + macvlan_create $h2.100 + + run_test $h1.100 $h2.100 "VLAN over vlan_filtering=$vlan_filtering bridged port" + + macvlan_destroy + bridge_destroy + h2_vlan_destroy + h1_vlan_destroy +} + +vlan_over_vlan_unaware_bridged_port() +{ + vlan_over_bridged_port 0 +} + +vlan_over_vlan_aware_bridged_port() +{ + vlan_over_bridged_port 1 +} + +vlan_over_bridge() +{ + local vlan_filtering=$1 + + h1_vlan_create + bridge_create $vlan_filtering + simple_if_init br0 + vlan_create br0 100 vbr0 $H2_IPV4/24 $H2_IPV6/64 + macvlan_create br0.100 + + if [ $vlan_filtering = 1 ]; then + bridge vlan add dev $h2 vid 100 master + bridge vlan add dev br0 vid 100 self + fi + + run_test $h1.100 br0.100 "VLAN over vlan_filtering=$vlan_filtering bridge" + + if [ $vlan_filtering = 1 ]; then + bridge vlan del dev br0 vid 100 self + bridge vlan del dev $h2 vid 100 master + fi + + macvlan_destroy + vlan_destroy br0 100 + simple_if_fini br0 + bridge_destroy + h1_vlan_destroy +} + +vlan_over_vlan_unaware_bridge() +{ + vlan_over_bridge 0 +} + +vlan_over_vlan_aware_bridge() +{ + vlan_over_bridge 1 +} + cleanup() { pre_cleanup From 9aa3749ca4a880c1a59720aab3eacf344ed8d68d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Aug 2024 03:06:59 +0300 Subject: [PATCH 418/991] selftests: net: local_termination: don't use xfail_on_veth() xfail_on_veth() for this test is an incorrect approximation which gives false positives and false negatives. When local_termination fails with "reception succeeded, but should have failed", it is because the DUT ($h2) accepts packets even when not configured as promiscuous. This is not something specific to veth; even the bridge behaves that way, but this is not captured by the xfail_on_veth test. The IFF_UNICAST_FLT flag is not explicitly exported to user space, but it can somewhat be determined from the interface's behavior. We have to create a macvlan upper with a different MAC address. This forces a dev_uc_add() call in the kernel. When the unicast filtering list is not empty, but the device doesn't support IFF_UNICAST_FLT, __dev_set_rx_mode() force-enables promiscuity on the interface, to ensure correct behavior (that the requested address is received). We can monitor the change in the promiscuity flag and infer from it whether the device supports unicast filtering. There is no equivalent thing for allmulti, unfortunately. We never know what's hiding behind a device which has allmulti=off. Whether it will actually perform RX multicast filtering of unknown traffic is a strong "maybe". The bridge driver, for example, completely ignores the flag. We'll have to keep the xfail behavior, but instead of XFAIL on just veth, always XFAIL. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- tools/testing/selftests/net/forwarding/lib.sh | 57 ++++++++++++++++++ .../net/forwarding/local_termination.sh | 58 ++++++++++++++----- 2 files changed, 99 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index ff96bb7535ff00..718d04a4f72d02 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -500,6 +500,11 @@ check_err_fail() fi } +xfail() +{ + FAIL_TO_XFAIL=yes "$@" +} + xfail_on_slow() { if [[ $KSFT_MACHINE_SLOW = yes ]]; then @@ -1113,6 +1118,39 @@ mac_get() ip -j link show dev $if_name | jq -r '.[]["address"]' } +ether_addr_to_u64() +{ + local addr="$1" + local order="$((1 << 40))" + local val=0 + local byte + + addr="${addr//:/ }" + + for byte in $addr; do + byte="0x$byte" + val=$((val + order * byte)) + order=$((order >> 8)) + done + + printf "0x%x" $val +} + +u64_to_ether_addr() +{ + local val=$1 + local byte + local i + + for ((i = 40; i >= 0; i -= 8)); do + byte=$(((val & (0xff << i)) >> i)) + printf "%02x" $byte + if [ $i -ne 0 ]; then + printf ":" + fi + done +} + ipv6_lladdr_get() { local if_name=$1 @@ -2229,3 +2267,22 @@ absval() echo $((v > 0 ? v : -v)) } + +has_unicast_flt() +{ + local dev=$1; shift + local mac_addr=$(mac_get $dev) + local tmp=$(ether_addr_to_u64 $mac_addr) + local promisc + + ip link set $dev up + ip link add link $dev name macvlan-tmp type macvlan mode private + ip link set macvlan-tmp address $(u64_to_ether_addr $((tmp + 1))) + ip link set macvlan-tmp up + + promisc=$(ip -j -d link show dev $dev | jq -r '.[].promiscuity') + + ip link del macvlan-tmp + + [[ $promisc == 1 ]] && echo "no" || echo "yes" +} diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index e22c6a693bef2d..80ea4c10d76498 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -109,9 +109,11 @@ run_test() { local send_if_name=$1; shift local rcv_if_name=$1; shift + local no_unicast_flt=$1; shift local test_name="$1"; shift local smac=$(mac_get $send_if_name) local rcv_dmac=$(mac_get $rcv_if_name) + local should_receive tcpdump_start $rcv_if_name @@ -160,26 +162,26 @@ run_test() "$smac > $MACVLAN_ADDR, ethertype IPv4 (0x0800)" \ true "$test_name" - xfail_on_veth $h1 \ - check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address" \ - "$smac > $UNKNOWN_UC_ADDR1, ethertype IPv4 (0x0800)" \ - false "$test_name" + [ $no_unicast_flt = true ] && should_receive=true || should_receive=false + check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address" \ + "$smac > $UNKNOWN_UC_ADDR1, ethertype IPv4 (0x0800)" \ + $should_receive "$test_name" check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address, promisc" \ "$smac > $UNKNOWN_UC_ADDR2, ethertype IPv4 (0x0800)" \ true "$test_name" - xfail_on_veth $h1 \ - check_rcv $rcv_if_name \ - "Unicast IPv4 to unknown MAC address, allmulti" \ - "$smac > $UNKNOWN_UC_ADDR3, ethertype IPv4 (0x0800)" \ - false "$test_name" + [ $no_unicast_flt = true ] && should_receive=true || should_receive=false + check_rcv $rcv_if_name \ + "Unicast IPv4 to unknown MAC address, allmulti" \ + "$smac > $UNKNOWN_UC_ADDR3, ethertype IPv4 (0x0800)" \ + $should_receive "$test_name" check_rcv $rcv_if_name "Multicast IPv4 to joined group" \ "$smac > $JOINED_MACV4_MC_ADDR, ethertype IPv4 (0x0800)" \ true "$test_name" - xfail_on_veth $h1 \ + xfail \ check_rcv $rcv_if_name \ "Multicast IPv4 to unknown group" \ "$smac > $UNKNOWN_MACV4_MC_ADDR1, ethertype IPv4 (0x0800)" \ @@ -197,7 +199,7 @@ run_test() "$smac > $JOINED_MACV6_MC_ADDR, ethertype IPv6 (0x86dd)" \ true "$test_name" - xfail_on_veth $h1 \ + xfail \ check_rcv $rcv_if_name "Multicast IPv6 to unknown group" \ "$smac > $UNKNOWN_MACV6_MC_ADDR1, ethertype IPv6 (0x86dd)" \ false "$test_name" @@ -290,11 +292,17 @@ macvlan_destroy() standalone() { + local no_unicast_flt=true + + if [ $(has_unicast_flt $h2) = yes ]; then + no_unicast_flt=false + fi + h1_create h2_create macvlan_create $h2 - run_test $h1 $h2 "$h2" + run_test $h1 $h2 $no_unicast_flt "$h2" macvlan_destroy h2_destroy @@ -303,6 +311,7 @@ standalone() test_bridge() { + local no_unicast_flt=true local vlan_filtering=$1 h1_create @@ -310,7 +319,7 @@ test_bridge() simple_if_init br0 $H2_IPV4/24 $H2_IPV6/64 macvlan_create br0 - run_test $h1 br0 "vlan_filtering=$vlan_filtering bridge" + run_test $h1 br0 $no_unicast_flt "vlan_filtering=$vlan_filtering bridge" macvlan_destroy simple_if_fini br0 $H2_IPV4/24 $H2_IPV6/64 @@ -330,11 +339,17 @@ vlan_aware_bridge() test_vlan() { + local no_unicast_flt=true + + if [ $(has_unicast_flt $h2) = yes ]; then + no_unicast_flt=false + fi + h1_vlan_create h2_vlan_create macvlan_create $h2.100 - run_test $h1.100 $h2.100 "VLAN upper" + run_test $h1.100 $h2.100 $no_unicast_flt "VLAN upper" macvlan_destroy h2_vlan_destroy @@ -343,14 +358,23 @@ test_vlan() vlan_over_bridged_port() { + local no_unicast_flt=true local vlan_filtering=$1 + # br_manage_promisc() will not force a single vlan_filtering port to + # promiscuous mode, so we should still expect unicast filtering to take + # place if the device can do it. + if [ $(has_unicast_flt $h2) = yes ] && [ $vlan_filtering = 1 ]; then + no_unicast_flt=false + fi + h1_vlan_create h2_vlan_create bridge_create $vlan_filtering macvlan_create $h2.100 - run_test $h1.100 $h2.100 "VLAN over vlan_filtering=$vlan_filtering bridged port" + run_test $h1.100 $h2.100 $no_unicast_flt \ + "VLAN over vlan_filtering=$vlan_filtering bridged port" macvlan_destroy bridge_destroy @@ -370,6 +394,7 @@ vlan_over_vlan_aware_bridged_port() vlan_over_bridge() { + local no_unicast_flt=true local vlan_filtering=$1 h1_vlan_create @@ -383,7 +408,8 @@ vlan_over_bridge() bridge vlan add dev br0 vid 100 self fi - run_test $h1.100 br0.100 "VLAN over vlan_filtering=$vlan_filtering bridge" + run_test $h1.100 br0.100 $no_unicast_flt \ + "VLAN over vlan_filtering=$vlan_filtering bridge" if [ $vlan_filtering = 1 ]; then bridge vlan del dev br0 vid 100 self From 237979504264912a9797dabc0db35126e705fe0d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Aug 2024 03:07:00 +0300 Subject: [PATCH 419/991] selftests: net: local_termination: add PTP frames to the mix A breakage in the felix DSA driver shows we do not have enough test coverage. More generally, it is sufficiently special that it is likely drivers will treat it differently. This is not meant to be a full PTP test, it just makes sure that PTP packets sent to the different addresses corresponding to their profiles are received correctly. The local_termination selftest seemed like the most appropriate place for this addition. PTP RX/TX in some cases makes no sense (over a bridge) and this is why $skip_ptp exists. And in others - PTP over a bridge port - the IP stack needs convincing through the available bridge netfilter hooks to leave the PTP packets alone and not stolen by the bridge rx_handler. It is safe to assume that users have that figured out already. This is a driver level test, and by using tcpdump, all that extra setup is out of scope here. send_non_ip() was an unfinished idea; written but never used. Replace it with a more generic send_raw(), and send 3 PTP packet types times 3 transports. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- .../net/forwarding/local_termination.sh | 161 ++++++++++++++++-- 1 file changed, 148 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index 80ea4c10d76498..648868f7460442 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -39,9 +39,68 @@ UNKNOWN_MACV6_MC_ADDR1="33:33:01:02:03:05" UNKNOWN_MACV6_MC_ADDR2="33:33:01:02:03:06" UNKNOWN_MACV6_MC_ADDR3="33:33:01:02:03:07" -NON_IP_MC="01:02:03:04:05:06" -NON_IP_PKT="00:04 48:45:4c:4f" -BC="ff:ff:ff:ff:ff:ff" +PTP_1588_L2_SYNC=" \ +01:1b:19:00:00:00 00:00:de:ad:be:ef 88:f7 00 02 \ +00 2c 00 00 02 00 00 00 00 00 00 00 00 00 00 00 \ +00 00 3e 37 63 ff fe cf 17 0e 00 01 00 00 00 00 \ +00 00 00 00 00 00 00 00 00 00" +PTP_1588_L2_FOLLOW_UP=" \ +01:1b:19:00:00:00 00:00:de:ad:be:ef 88:f7 08 02 \ +00 2c 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \ +00 00 3e 37 63 ff fe cf 17 0e 00 01 00 00 02 00 \ +00 00 66 83 c5 f1 17 97 ed f0" +PTP_1588_L2_PDELAY_REQ=" \ +01:80:c2:00:00:0e 00:00:de:ad:be:ef 88:f7 02 02 \ +00 36 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \ +00 00 3e 37 63 ff fe cf 17 0e 00 01 00 06 05 7f \ +00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \ +00 00 00 00" +PTP_1588_IPV4_SYNC=" \ +01:00:5e:00:01:81 00:00:de:ad:be:ef 08:00 45 00 \ +00 48 0a 9a 40 00 01 11 cb 88 c0 00 02 01 e0 00 \ +01 81 01 3f 01 3f 00 34 a3 c8 00 02 00 2c 00 00 \ +02 00 00 00 00 00 00 00 00 00 00 00 00 00 3e 37 \ +63 ff fe cf 17 0e 00 01 00 00 00 00 00 00 00 00 \ +00 00 00 00 00 00" +PTP_1588_IPV4_FOLLOW_UP=" +01:00:5e:00:01:81 00:00:de:ad:be:ef 08:00 45 00 \ +00 48 0a 9b 40 00 01 11 cb 87 c0 00 02 01 e0 00 \ +01 81 01 40 01 40 00 34 a3 c8 08 02 00 2c 00 00 \ +00 00 00 00 00 00 00 00 00 00 00 00 00 00 3e 37 \ +63 ff fe cf 17 0e 00 01 00 00 02 00 00 00 66 83 \ +c6 0f 1d 9a 61 87" +PTP_1588_IPV4_PDELAY_REQ=" \ +01:00:5e:00:00:6b 00:00:de:ad:be:ef 08:00 45 00 \ +00 52 35 a9 40 00 01 11 a1 85 c0 00 02 01 e0 00 \ +00 6b 01 3f 01 3f 00 3e a2 bc 02 02 00 36 00 00 \ +00 00 00 00 00 00 00 00 00 00 00 00 00 00 3e 37 \ +63 ff fe cf 17 0e 00 01 00 01 05 7f 00 00 00 00 \ +00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00" +PTP_1588_IPV6_SYNC=" \ +33:33:00:00:01:81 00:00:de:ad:be:ef 86:dd 60 06 \ +7c 2f 00 36 11 01 20 01 0d b8 00 01 00 00 00 00 \ +00 00 00 00 00 01 ff 0e 00 00 00 00 00 00 00 00 \ +00 00 00 00 01 81 01 3f 01 3f 00 36 2e 92 00 02 \ +00 2c 00 00 02 00 00 00 00 00 00 00 00 00 00 00 \ +00 00 3e 37 63 ff fe cf 17 0e 00 01 00 00 00 00 \ +00 00 00 00 00 00 00 00 00 00 00 00" +PTP_1588_IPV6_FOLLOW_UP=" \ +33:33:00:00:01:81 00:00:de:ad:be:ef 86:dd 60 0a \ +00 bc 00 36 11 01 20 01 0d b8 00 01 00 00 00 00 \ +00 00 00 00 00 01 ff 0e 00 00 00 00 00 00 00 00 \ +00 00 00 00 01 81 01 40 01 40 00 36 2e 92 08 02 \ +00 2c 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \ +00 00 3e 37 63 ff fe cf 17 0e 00 01 00 00 02 00 \ +00 00 66 83 c6 2a 32 09 bd 74 00 00" +PTP_1588_IPV6_PDELAY_REQ=" \ +33:33:00:00:00:6b 00:00:de:ad:be:ef 86:dd 60 0c \ +5c fd 00 40 11 01 fe 80 00 00 00 00 00 00 3c 37 \ +63 ff fe cf 17 0e ff 02 00 00 00 00 00 00 00 00 \ +00 00 00 00 00 6b 01 3f 01 3f 00 40 b4 54 02 02 \ +00 36 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \ +00 00 3e 37 63 ff fe cf 17 0e 00 01 00 01 05 7f \ +00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \ +00 00 00 00 00 00" # Disable promisc to ensure we don't receive unknown MAC DA packets export TCPDUMP_EXTRA_FLAGS="-pl" @@ -49,13 +108,15 @@ export TCPDUMP_EXTRA_FLAGS="-pl" h1=${NETIFS[p1]} h2=${NETIFS[p2]} -send_non_ip() +send_raw() { - local if_name=$1 - local smac=$2 - local dmac=$3 + local if_name=$1; shift + local pkt="$1"; shift + local smac=$(mac_get $if_name) - $MZ -q $if_name "$dmac $smac $NON_IP_PKT" + pkt="${pkt/00:00:de:ad:be:ef/$smac}" + + $MZ -q $if_name "$pkt" } send_uc_ipv4() @@ -109,6 +170,7 @@ run_test() { local send_if_name=$1; shift local rcv_if_name=$1; shift + local skip_ptp=$1; shift local no_unicast_flt=$1; shift local test_name="$1"; shift local smac=$(mac_get $send_if_name) @@ -150,6 +212,35 @@ run_test() mc_route_destroy $rcv_if_name mc_route_destroy $send_if_name + if [ $skip_ptp = false ]; then + ip maddress add 01:1b:19:00:00:00 dev $rcv_if_name + send_raw $send_if_name "$PTP_1588_L2_SYNC" + send_raw $send_if_name "$PTP_1588_L2_FOLLOW_UP" + ip maddress del 01:1b:19:00:00:00 dev $rcv_if_name + + ip maddress add 01:80:c2:00:00:0e dev $rcv_if_name + send_raw $send_if_name "$PTP_1588_L2_PDELAY_REQ" + ip maddress del 01:80:c2:00:00:0e dev $rcv_if_name + + mc_join $rcv_if_name 224.0.1.129 + send_raw $send_if_name "$PTP_1588_IPV4_SYNC" + send_raw $send_if_name "$PTP_1588_IPV4_FOLLOW_UP" + mc_leave + + mc_join $rcv_if_name 224.0.0.107 + send_raw $send_if_name "$PTP_1588_IPV4_PDELAY_REQ" + mc_leave + + mc_join $rcv_if_name ff0e::181 + send_raw $send_if_name "$PTP_1588_IPV6_SYNC" + send_raw $send_if_name "$PTP_1588_IPV6_FOLLOW_UP" + mc_leave + + mc_join $rcv_if_name ff02::6b + send_raw $send_if_name "$PTP_1588_IPV6_PDELAY_REQ" + mc_leave + fi + sleep 1 tcpdump_stop $rcv_if_name @@ -212,6 +303,44 @@ run_test() "$smac > $UNKNOWN_MACV6_MC_ADDR3, ethertype IPv6 (0x86dd)" \ true "$test_name" + if [ $skip_ptp = false ]; then + check_rcv $rcv_if_name "1588v2 over L2 transport, Sync" \ + "ethertype PTP (0x88f7).* PTPv2.* msg type : sync msg" \ + true "$test_name" + + check_rcv $rcv_if_name "1588v2 over L2 transport, Follow-Up" \ + "ethertype PTP (0x88f7).* PTPv2.* msg type : follow up msg" \ + true "$test_name" + + check_rcv $rcv_if_name "1588v2 over L2 transport, Peer Delay Request" \ + "ethertype PTP (0x88f7).* PTPv2.* msg type : peer delay req msg" \ + true "$test_name" + + check_rcv $rcv_if_name "1588v2 over IPv4, Sync" \ + "ethertype IPv4 (0x0800).* PTPv2.* msg type : sync msg" \ + true "$test_name" + + check_rcv $rcv_if_name "1588v2 over IPv4, Follow-Up" \ + "ethertype IPv4 (0x0800).* PTPv2.* msg type : follow up msg" \ + true "$test_name" + + check_rcv $rcv_if_name "1588v2 over IPv4, Peer Delay Request" \ + "ethertype IPv4 (0x0800).* PTPv2.* msg type : peer delay req msg" \ + true "$test_name" + + check_rcv $rcv_if_name "1588v2 over IPv6, Sync" \ + "ethertype IPv6 (0x86dd).* PTPv2.* msg type : sync msg" \ + true "$test_name" + + check_rcv $rcv_if_name "1588v2 over IPv6, Follow-Up" \ + "ethertype IPv6 (0x86dd).* PTPv2.* msg type : follow up msg" \ + true "$test_name" + + check_rcv $rcv_if_name "1588v2 over IPv6, Peer Delay Request" \ + "ethertype IPv6 (0x86dd).* PTPv2.* msg type : peer delay req msg" \ + true "$test_name" + fi + tcpdump_cleanup $rcv_if_name } @@ -293,6 +422,7 @@ macvlan_destroy() standalone() { local no_unicast_flt=true + local skip_ptp=false if [ $(has_unicast_flt $h2) = yes ]; then no_unicast_flt=false @@ -302,7 +432,7 @@ standalone() h2_create macvlan_create $h2 - run_test $h1 $h2 $no_unicast_flt "$h2" + run_test $h1 $h2 $skip_ptp $no_unicast_flt "$h2" macvlan_destroy h2_destroy @@ -313,13 +443,15 @@ test_bridge() { local no_unicast_flt=true local vlan_filtering=$1 + local skip_ptp=true h1_create bridge_create $vlan_filtering simple_if_init br0 $H2_IPV4/24 $H2_IPV6/64 macvlan_create br0 - run_test $h1 br0 $no_unicast_flt "vlan_filtering=$vlan_filtering bridge" + run_test $h1 br0 $skip_ptp $no_unicast_flt \ + "vlan_filtering=$vlan_filtering bridge" macvlan_destroy simple_if_fini br0 $H2_IPV4/24 $H2_IPV6/64 @@ -340,6 +472,7 @@ vlan_aware_bridge() test_vlan() { local no_unicast_flt=true + local skip_ptp=false if [ $(has_unicast_flt $h2) = yes ]; then no_unicast_flt=false @@ -349,7 +482,7 @@ test_vlan() h2_vlan_create macvlan_create $h2.100 - run_test $h1.100 $h2.100 $no_unicast_flt "VLAN upper" + run_test $h1.100 $h2.100 $skip_ptp $no_unicast_flt "VLAN upper" macvlan_destroy h2_vlan_destroy @@ -360,6 +493,7 @@ vlan_over_bridged_port() { local no_unicast_flt=true local vlan_filtering=$1 + local skip_ptp=false # br_manage_promisc() will not force a single vlan_filtering port to # promiscuous mode, so we should still expect unicast filtering to take @@ -373,7 +507,7 @@ vlan_over_bridged_port() bridge_create $vlan_filtering macvlan_create $h2.100 - run_test $h1.100 $h2.100 $no_unicast_flt \ + run_test $h1.100 $h2.100 $skip_ptp $no_unicast_flt \ "VLAN over vlan_filtering=$vlan_filtering bridged port" macvlan_destroy @@ -396,6 +530,7 @@ vlan_over_bridge() { local no_unicast_flt=true local vlan_filtering=$1 + local skip_ptp=true h1_vlan_create bridge_create $vlan_filtering @@ -408,7 +543,7 @@ vlan_over_bridge() bridge vlan add dev br0 vid 100 self fi - run_test $h1.100 br0.100 $no_unicast_flt \ + run_test $h1.100 br0.100 $skip_ptp $no_unicast_flt \ "VLAN over vlan_filtering=$vlan_filtering bridge" if [ $vlan_filtering = 1 ]; then From e29b82ef27616777e21c07dc263a8769cbdaa358 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Aug 2024 03:07:01 +0300 Subject: [PATCH 420/991] selftests: net: bridge_vlan_aware: test that other TPIDs are seen as untagged The bridge VLAN implementation w.r.t. VLAN protocol is described in merge commit 1a0b20b25732 ("Merge branch 'bridge-next'"). We are only sensitive to those VLAN tags whose TPID is equal to the bridge's vlan_protocol. Thus, an 802.1ad VLAN should be treated as 802.1Q-untagged. Add 3 tests which validate that: - 802.1ad-tagged traffic is learned into the PVID of an 802.1Q-aware bridge - Double-tagged traffic is forwarded when just the PVID of the port is present in the VLAN group of the ports - Double-tagged traffic is not forwarded when the PVID of the port is absent from the VLAN group of the ports The test passes with both veth and ocelot. Signed-off-by: Vladimir Oltean Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Signed-off-by: David S. Miller --- .../net/forwarding/bridge_vlan_aware.sh | 54 ++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh index 64bd00fe9a4f87..90f8a244ea9015 100755 --- a/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh +++ b/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh @@ -1,7 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding vlan_deletion extern_learn" +ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding vlan_deletion extern_learn other_tpid" NUM_NETIFS=4 CHECK_TC="yes" source lib.sh @@ -142,6 +142,58 @@ extern_learn() bridge fdb del de:ad:be:ef:13:37 dev $swp1 master vlan 1 &> /dev/null } +other_tpid() +{ + local mac=de:ad:be:ef:13:37 + + # Test that packets with TPID 802.1ad VID 3 + TPID 802.1Q VID 5 are + # classified as untagged by a bridge with vlan_protocol 802.1Q, and + # are processed in the PVID of the ingress port (here 1). Not VID 3, + # and not VID 5. + RET=0 + + tc qdisc add dev $h2 clsact + tc filter add dev $h2 ingress protocol all pref 1 handle 101 \ + flower dst_mac $mac action drop + ip link set $h2 promisc on + ethtool -K $h2 rx-vlan-filter off rx-vlan-stag-filter off + + $MZ -q $h1 -c 1 -b $mac -a own "88:a8 00:03 81:00 00:05 08:00 aa-aa-aa-aa-aa-aa-aa-aa-aa" + sleep 1 + + # Match on 'self' addresses as well, for those drivers which + # do not push their learned addresses to the bridge software + # database + bridge -j fdb show $swp1 | \ + jq -e ".[] | select(.mac == \"$(mac_get $h1)\") | select(.vlan == 1)" &> /dev/null + check_err $? "FDB entry was not learned when it should" + + log_test "FDB entry in PVID for VLAN-tagged with other TPID" + + RET=0 + tc -j -s filter show dev $h2 ingress \ + | jq -e ".[] | select(.options.handle == 101) \ + | select(.options.actions[0].stats.packets == 1)" &> /dev/null + check_err $? "Packet was not forwarded when it should" + log_test "Reception of VLAN with other TPID as untagged" + + bridge vlan del dev $swp1 vid 1 + + $MZ -q $h1 -c 1 -b $mac -a own "88:a8 00:03 81:00 00:05 08:00 aa-aa-aa-aa-aa-aa-aa-aa-aa" + sleep 1 + + RET=0 + tc -j -s filter show dev $h2 ingress \ + | jq -e ".[] | select(.options.handle == 101) \ + | select(.options.actions[0].stats.packets == 1)" &> /dev/null + check_err $? "Packet was forwarded when should not" + log_test "Reception of VLAN with other TPID as untagged (no PVID)" + + bridge vlan add dev $swp1 vid 1 pvid untagged + ip link set $h2 promisc off + tc qdisc del dev $h2 clsact +} + trap cleanup EXIT setup_prepare From 67c3ca2c5cfe6a50772514e3349b5e7b3b0fac03 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Aug 2024 03:07:02 +0300 Subject: [PATCH 421/991] net: mscc: ocelot: use ocelot_xmit_get_vlan_info() also for FDMA and register injection Problem description ------------------- On an NXP LS1028A (felix DSA driver) with the following configuration: - ocelot-8021q tagging protocol - VLAN-aware bridge (with STP) spanning at least swp0 and swp1 - 8021q VLAN upper interfaces on swp0 and swp1: swp0.700, swp1.700 - ptp4l on swp0.700 and swp1.700 we see that the ptp4l instances do not see each other's traffic, and they all go to the grand master state due to the ANNOUNCE_RECEIPT_TIMEOUT_EXPIRES condition. Jumping to the conclusion for the impatient ------------------------------------------- There is a zero-day bug in the ocelot switchdev driver in the way it handles VLAN-tagged packet injection. The correct logic already exists in the source code, in function ocelot_xmit_get_vlan_info() added by commit 5ca721c54d86 ("net: dsa: tag_ocelot: set the classified VLAN during xmit"). But it is used only for normal NPI-based injection with the DSA "ocelot" tagging protocol. The other injection code paths (register-based and FDMA-based) roll their own wrong logic. This affects and was noticed on the DSA "ocelot-8021q" protocol because it uses register-based injection. By moving ocelot_xmit_get_vlan_info() to a place that's common for both the DSA tagger and the ocelot switch library, it can also be called from ocelot_port_inject_frame() in ocelot.c. We need to touch the lines with ocelot_ifh_port_set()'s prototype anyway, so let's rename it to something clearer regarding what it does, and add a kernel-doc. ocelot_ifh_set_basic() should do. Investigation notes ------------------- Debugging reveals that PTP event (aka those carrying timestamps, like Sync) frames injected into swp0.700 (but also swp1.700) hit the wire with two VLAN tags: 00000000: 01 1b 19 00 00 00 00 01 02 03 04 05 81 00 02 bc ~~~~~~~~~~~ 00000010: 81 00 02 bc 88 f7 00 12 00 2c 00 00 02 00 00 00 ~~~~~~~~~~~ 00000020: 00 00 00 00 00 00 00 00 00 00 00 01 02 ff fe 03 00000030: 04 05 00 01 00 04 00 00 00 00 00 00 00 00 00 00 00000040: 00 00 The second (unexpected) VLAN tag makes felix_check_xtr_pkt() -> ptp_classify_raw() fail to see these as PTP packets at the link partner's receiving end, and return PTP_CLASS_NONE (because the BPF classifier is not written to expect 2 VLAN tags). The reason why packets have 2 VLAN tags is because the transmission code treats VLAN incorrectly. Neither ocelot switchdev, nor felix DSA, declare the NETIF_F_HW_VLAN_CTAG_TX feature. Therefore, at xmit time, all VLANs should be in the skb head, and none should be in the hwaccel area. This is done by: static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t features) { if (skb_vlan_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) skb = __vlan_hwaccel_push_inside(skb); return skb; } But ocelot_port_inject_frame() handles things incorrectly: ocelot_ifh_port_set(ifh, port, rew_op, skb_vlan_tag_get(skb)); void ocelot_ifh_port_set(struct sk_buff *skb, void *ifh, int port, u32 rew_op) { (...) if (vlan_tag) ocelot_ifh_set_vlan_tci(ifh, vlan_tag); (...) } The way __vlan_hwaccel_push_inside() pushes the tag inside the skb head is by calling: static inline void __vlan_hwaccel_clear_tag(struct sk_buff *skb) { skb->vlan_present = 0; } which does _not_ zero out skb->vlan_tci as seen by skb_vlan_tag_get(). This means that ocelot, when it calls skb_vlan_tag_get(), sees (and uses) a residual skb->vlan_tci, while the same VLAN tag is _already_ in the skb head. The trivial fix for double VLAN headers is to replace the content of ocelot_ifh_port_set() with: if (skb_vlan_tag_present(skb)) ocelot_ifh_set_vlan_tci(ifh, skb_vlan_tag_get(skb)); but this would not be correct either, because, as mentioned, vlan_hw_offload_capable() is false for us, so we'd be inserting dead code and we'd always transmit packets with VID=0 in the injection frame header. I can't actually test the ocelot switchdev driver and rely exclusively on code inspection, but I don't think traffic from 8021q uppers has ever been injected properly, and not double-tagged. Thus I'm blaming the introduction of VLAN fields in the injection header - early driver code. As hinted at in the early conclusion, what we _want_ to happen for VLAN transmission was already described once in commit 5ca721c54d86 ("net: dsa: tag_ocelot: set the classified VLAN during xmit"). ocelot_xmit_get_vlan_info() intends to ensure that if the port through which we're transmitting is under a VLAN-aware bridge, the outer VLAN tag from the skb head is stripped from there and inserted into the injection frame header (so that the packet is processed in hardware through that actual VLAN). And in all other cases, the packet is sent with VID=0 in the injection frame header, since the port is VLAN-unaware and has logic to strip this VID on egress (making it invisible to the wire). Fixes: 08d02364b12f ("net: mscc: fix the injection header") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot.c | 29 +++++++++++---- drivers/net/ethernet/mscc/ocelot_fdma.c | 2 +- include/linux/dsa/ocelot.h | 47 +++++++++++++++++++++++++ include/soc/mscc/ocelot.h | 3 +- net/dsa/tag_ocelot.c | 37 ++----------------- 5 files changed, 75 insertions(+), 43 deletions(-) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index ed2fb44500b0cb..69a4e5a90475bf 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -1193,17 +1193,34 @@ bool ocelot_can_inject(struct ocelot *ocelot, int grp) } EXPORT_SYMBOL(ocelot_can_inject); -void ocelot_ifh_port_set(void *ifh, int port, u32 rew_op, u32 vlan_tag) +/** + * ocelot_ifh_set_basic - Set basic information in Injection Frame Header + * @ifh: Pointer to Injection Frame Header memory + * @ocelot: Switch private data structure + * @port: Egress port number + * @rew_op: Egress rewriter operation for PTP + * @skb: Pointer to socket buffer (packet) + * + * Populate the Injection Frame Header with basic information for this skb: the + * analyzer bypass bit, destination port, VLAN info, egress rewriter info. + */ +void ocelot_ifh_set_basic(void *ifh, struct ocelot *ocelot, int port, + u32 rew_op, struct sk_buff *skb) { + struct ocelot_port *ocelot_port = ocelot->ports[port]; + u64 vlan_tci, tag_type; + + ocelot_xmit_get_vlan_info(skb, ocelot_port->bridge, &vlan_tci, + &tag_type); + ocelot_ifh_set_bypass(ifh, 1); ocelot_ifh_set_dest(ifh, BIT_ULL(port)); - ocelot_ifh_set_tag_type(ifh, IFH_TAG_TYPE_C); - if (vlan_tag) - ocelot_ifh_set_vlan_tci(ifh, vlan_tag); + ocelot_ifh_set_tag_type(ifh, tag_type); + ocelot_ifh_set_vlan_tci(ifh, vlan_tci); if (rew_op) ocelot_ifh_set_rew_op(ifh, rew_op); } -EXPORT_SYMBOL(ocelot_ifh_port_set); +EXPORT_SYMBOL(ocelot_ifh_set_basic); void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp, u32 rew_op, struct sk_buff *skb) @@ -1214,7 +1231,7 @@ void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp, ocelot_write_rix(ocelot, QS_INJ_CTRL_GAP_SIZE(1) | QS_INJ_CTRL_SOF, QS_INJ_CTRL, grp); - ocelot_ifh_port_set(ifh, port, rew_op, skb_vlan_tag_get(skb)); + ocelot_ifh_set_basic(ifh, ocelot, port, rew_op, skb); for (i = 0; i < OCELOT_TAG_LEN / 4; i++) ocelot_write_rix(ocelot, ifh[i], QS_INJ_WR, grp); diff --git a/drivers/net/ethernet/mscc/ocelot_fdma.c b/drivers/net/ethernet/mscc/ocelot_fdma.c index 312a468321544d..87b59cc5e44162 100644 --- a/drivers/net/ethernet/mscc/ocelot_fdma.c +++ b/drivers/net/ethernet/mscc/ocelot_fdma.c @@ -666,7 +666,7 @@ static int ocelot_fdma_prepare_skb(struct ocelot *ocelot, int port, u32 rew_op, ifh = skb_push(skb, OCELOT_TAG_LEN); skb_put(skb, ETH_FCS_LEN); memset(ifh, 0, OCELOT_TAG_LEN); - ocelot_ifh_port_set(ifh, port, rew_op, skb_vlan_tag_get(skb)); + ocelot_ifh_set_basic(ifh, ocelot, port, rew_op, skb); return 0; } diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h index dca2969015d802..6fbfbde68a37c3 100644 --- a/include/linux/dsa/ocelot.h +++ b/include/linux/dsa/ocelot.h @@ -5,6 +5,8 @@ #ifndef _NET_DSA_TAG_OCELOT_H #define _NET_DSA_TAG_OCELOT_H +#include +#include #include #include #include @@ -273,4 +275,49 @@ static inline u32 ocelot_ptp_rew_op(struct sk_buff *skb) return rew_op; } +/** + * ocelot_xmit_get_vlan_info: Determine VLAN_TCI and TAG_TYPE for injected frame + * @skb: Pointer to socket buffer + * @br: Pointer to bridge device that the port is under, if any + * @vlan_tci: + * @tag_type: + * + * If the port is under a VLAN-aware bridge, remove the VLAN header from the + * payload and move it into the DSA tag, which will make the switch classify + * the packet to the bridge VLAN. Otherwise, leave the classified VLAN at zero, + * which is the pvid of standalone ports (OCELOT_STANDALONE_PVID), although not + * of VLAN-unaware bridge ports (that would be ocelot_vlan_unaware_pvid()). + * Anyway, VID 0 is fine because it is stripped on egress for these port modes, + * and source address learning is not performed for packets injected from the + * CPU anyway, so it doesn't matter that the VID is "wrong". + */ +static inline void ocelot_xmit_get_vlan_info(struct sk_buff *skb, + struct net_device *br, + u64 *vlan_tci, u64 *tag_type) +{ + struct vlan_ethhdr *hdr; + u16 proto, tci; + + if (!br || !br_vlan_enabled(br)) { + *vlan_tci = 0; + *tag_type = IFH_TAG_TYPE_C; + return; + } + + hdr = (struct vlan_ethhdr *)skb_mac_header(skb); + br_vlan_get_proto(br, &proto); + + if (ntohs(hdr->h_vlan_proto) == proto) { + vlan_remove_tag(skb, &tci); + *vlan_tci = tci; + } else { + rcu_read_lock(); + br_vlan_get_pvid_rcu(br, &tci); + rcu_read_unlock(); + *vlan_tci = tci; + } + + *tag_type = (proto != ETH_P_8021Q) ? IFH_TAG_TYPE_S : IFH_TAG_TYPE_C; +} + #endif diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 6a37b29f4b4c7a..ed18e6bafc8d0e 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -969,7 +969,8 @@ void __ocelot_target_write_ix(struct ocelot *ocelot, enum ocelot_target target, bool ocelot_can_inject(struct ocelot *ocelot, int grp); void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp, u32 rew_op, struct sk_buff *skb); -void ocelot_ifh_port_set(void *ifh, int port, u32 rew_op, u32 vlan_tag); +void ocelot_ifh_set_basic(void *ifh, struct ocelot *ocelot, int port, + u32 rew_op, struct sk_buff *skb); int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **skb); void ocelot_drain_cpu_queue(struct ocelot *ocelot, int grp); void ocelot_ptp_rx_timestamp(struct ocelot *ocelot, struct sk_buff *skb, diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index e0e4300bfbd3fc..bf6608fc6be70e 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -8,40 +8,6 @@ #define OCELOT_NAME "ocelot" #define SEVILLE_NAME "seville" -/* If the port is under a VLAN-aware bridge, remove the VLAN header from the - * payload and move it into the DSA tag, which will make the switch classify - * the packet to the bridge VLAN. Otherwise, leave the classified VLAN at zero, - * which is the pvid of standalone and VLAN-unaware bridge ports. - */ -static void ocelot_xmit_get_vlan_info(struct sk_buff *skb, struct dsa_port *dp, - u64 *vlan_tci, u64 *tag_type) -{ - struct net_device *br = dsa_port_bridge_dev_get(dp); - struct vlan_ethhdr *hdr; - u16 proto, tci; - - if (!br || !br_vlan_enabled(br)) { - *vlan_tci = 0; - *tag_type = IFH_TAG_TYPE_C; - return; - } - - hdr = skb_vlan_eth_hdr(skb); - br_vlan_get_proto(br, &proto); - - if (ntohs(hdr->h_vlan_proto) == proto) { - vlan_remove_tag(skb, &tci); - *vlan_tci = tci; - } else { - rcu_read_lock(); - br_vlan_get_pvid_rcu(br, &tci); - rcu_read_unlock(); - *vlan_tci = tci; - } - - *tag_type = (proto != ETH_P_8021Q) ? IFH_TAG_TYPE_S : IFH_TAG_TYPE_C; -} - static void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev, __be32 ifh_prefix, void **ifh) { @@ -53,7 +19,8 @@ static void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev, u32 rew_op = 0; u64 qos_class; - ocelot_xmit_get_vlan_info(skb, dp, &vlan_tci, &tag_type); + ocelot_xmit_get_vlan_info(skb, dsa_port_bridge_dev_get(dp), &vlan_tci, + &tag_type); qos_class = netdev_get_num_tc(netdev) ? netdev_get_prio_tc_map(netdev, skb->priority) : skb->priority; From e1b9e80236c540fa85d76e2d510d1b38e1968c5d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Aug 2024 03:07:03 +0300 Subject: [PATCH 422/991] net: mscc: ocelot: fix QoS class for injected packets with "ocelot-8021q" There are 2 distinct code paths (listed below) in the source code which set up an injection header for Ocelot(-like) switches. Code path (2) lacks the QoS class and source port being set correctly. Especially the improper QoS classification is a problem for the "ocelot-8021q" alternative DSA tagging protocol, because we support tc-taprio and each packet needs to be scheduled precisely through its time slot. This includes PTP, which is normally assigned to a traffic class other than 0, but would be sent through TC 0 nonetheless. The code paths are: (1) ocelot_xmit_common() from net/dsa/tag_ocelot.c - called only by the standard "ocelot" DSA tagging protocol which uses NPI-based injection - sets up bit fields in the tag manually to account for a small difference (destination port offset) between Ocelot and Seville. Namely, ocelot_ifh_set_dest() is omitted out of ocelot_xmit_common(), because there's also seville_ifh_set_dest(). (2) ocelot_ifh_set_basic(), called by: - ocelot_fdma_prepare_skb() for FDMA transmission of the ocelot switchdev driver - ocelot_port_xmit() -> ocelot_port_inject_frame() for register-based transmission of the ocelot switchdev driver - felix_port_deferred_xmit() -> ocelot_port_inject_frame() for the DSA tagger ocelot-8021q when it must transmit PTP frames (also through register-based injection). sets the bit fields according to its own logic. The problem is that (2) doesn't call ocelot_ifh_set_qos_class(). Copying that logic from ocelot_xmit_common() fixes that. Unfortunately, although desirable, it is not easily possible to de-duplicate code paths (1) and (2), and make net/dsa/tag_ocelot.c directly call ocelot_ifh_set_basic()), because of the ocelot/seville difference. This is the "minimal" fix with some logic duplicated (but at least more consolidated). Fixes: 0a6f17c6ae21 ("net: dsa: tag_ocelot_8021q: add support for PTP timestamping") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot.c | 10 +++++++++- drivers/net/ethernet/mscc/ocelot_fdma.c | 1 - 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 69a4e5a90475bf..9301716e21d58e 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -1208,13 +1208,21 @@ void ocelot_ifh_set_basic(void *ifh, struct ocelot *ocelot, int port, u32 rew_op, struct sk_buff *skb) { struct ocelot_port *ocelot_port = ocelot->ports[port]; + struct net_device *dev = skb->dev; u64 vlan_tci, tag_type; + int qos_class; ocelot_xmit_get_vlan_info(skb, ocelot_port->bridge, &vlan_tci, &tag_type); + qos_class = netdev_get_num_tc(dev) ? + netdev_get_prio_tc_map(dev, skb->priority) : skb->priority; + + memset(ifh, 0, OCELOT_TAG_LEN); ocelot_ifh_set_bypass(ifh, 1); + ocelot_ifh_set_src(ifh, BIT_ULL(ocelot->num_phys_ports)); ocelot_ifh_set_dest(ifh, BIT_ULL(port)); + ocelot_ifh_set_qos_class(ifh, qos_class); ocelot_ifh_set_tag_type(ifh, tag_type); ocelot_ifh_set_vlan_tci(ifh, vlan_tci); if (rew_op) @@ -1225,7 +1233,7 @@ EXPORT_SYMBOL(ocelot_ifh_set_basic); void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp, u32 rew_op, struct sk_buff *skb) { - u32 ifh[OCELOT_TAG_LEN / 4] = {0}; + u32 ifh[OCELOT_TAG_LEN / 4]; unsigned int i, count, last; ocelot_write_rix(ocelot, QS_INJ_CTRL_GAP_SIZE(1) | diff --git a/drivers/net/ethernet/mscc/ocelot_fdma.c b/drivers/net/ethernet/mscc/ocelot_fdma.c index 87b59cc5e44162..00326ae8c708b0 100644 --- a/drivers/net/ethernet/mscc/ocelot_fdma.c +++ b/drivers/net/ethernet/mscc/ocelot_fdma.c @@ -665,7 +665,6 @@ static int ocelot_fdma_prepare_skb(struct ocelot *ocelot, int port, u32 rew_op, ifh = skb_push(skb, OCELOT_TAG_LEN); skb_put(skb, ETH_FCS_LEN); - memset(ifh, 0, OCELOT_TAG_LEN); ocelot_ifh_set_basic(ifh, ocelot, port, rew_op, skb); return 0; From c5e12ac3beb0dd3a718296b2d8af5528e9ab728e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Aug 2024 03:07:04 +0300 Subject: [PATCH 423/991] net: mscc: ocelot: serialize access to the injection/extraction groups As explained by Horatiu Vultur in commit 603ead96582d ("net: sparx5: Add spinlock for frame transmission from CPU") which is for a similar hardware design, multiple CPUs can simultaneously perform injection or extraction. There are only 2 register groups for injection and 2 for extraction, and the driver only uses one of each. So we'd better serialize access using spin locks, otherwise frame corruption is possible. Note that unlike in sparx5, FDMA in ocelot does not have this issue because struct ocelot_fdma_tx_ring already contains an xmit_lock. I guess this is mostly a problem for NXP LS1028A, as that is dual core. I don't think VSC7514 is. So I'm blaming the commit where LS1028A (aka the felix DSA driver) started using register-based packet injection and extraction. Fixes: 0a6f17c6ae21 ("net: dsa: tag_ocelot_8021q: add support for PTP timestamping") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/ocelot/felix.c | 11 +++++ drivers/net/ethernet/mscc/ocelot.c | 52 ++++++++++++++++++++++ drivers/net/ethernet/mscc/ocelot_vsc7514.c | 4 ++ include/soc/mscc/ocelot.h | 9 ++++ 4 files changed, 76 insertions(+) diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index e554699f06d419..8d31ff18c5c7e0 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -528,7 +528,9 @@ static int felix_tag_8021q_setup(struct dsa_switch *ds) * so we need to be careful that there are no extra frames to be * dequeued over MMIO, since we would never know to discard them. */ + ocelot_lock_xtr_grp_bh(ocelot, 0); ocelot_drain_cpu_queue(ocelot, 0); + ocelot_unlock_xtr_grp_bh(ocelot, 0); return 0; } @@ -1518,6 +1520,8 @@ static void felix_port_deferred_xmit(struct kthread_work *work) int port = xmit_work->dp->index; int retries = 10; + ocelot_lock_inj_grp(ocelot, 0); + do { if (ocelot_can_inject(ocelot, 0)) break; @@ -1526,6 +1530,7 @@ static void felix_port_deferred_xmit(struct kthread_work *work) } while (--retries); if (!retries) { + ocelot_unlock_inj_grp(ocelot, 0); dev_err(ocelot->dev, "port %d failed to inject skb\n", port); ocelot_port_purge_txtstamp_skb(ocelot, port, skb); @@ -1535,6 +1540,8 @@ static void felix_port_deferred_xmit(struct kthread_work *work) ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb); + ocelot_unlock_inj_grp(ocelot, 0); + consume_skb(skb); kfree(xmit_work); } @@ -1694,6 +1701,8 @@ static bool felix_check_xtr_pkt(struct ocelot *ocelot) if (!felix->info->quirk_no_xtr_irq) return false; + ocelot_lock_xtr_grp(ocelot, grp); + while (ocelot_read(ocelot, QS_XTR_DATA_PRESENT) & BIT(grp)) { struct sk_buff *skb; unsigned int type; @@ -1730,6 +1739,8 @@ static bool felix_check_xtr_pkt(struct ocelot *ocelot) ocelot_drain_cpu_queue(ocelot, 0); } + ocelot_unlock_xtr_grp(ocelot, grp); + return true; } diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 9301716e21d58e..f4e027a6fe955b 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -1099,6 +1099,48 @@ void ocelot_ptp_rx_timestamp(struct ocelot *ocelot, struct sk_buff *skb, } EXPORT_SYMBOL(ocelot_ptp_rx_timestamp); +void ocelot_lock_inj_grp(struct ocelot *ocelot, int grp) + __acquires(&ocelot->inj_lock) +{ + spin_lock(&ocelot->inj_lock); +} +EXPORT_SYMBOL_GPL(ocelot_lock_inj_grp); + +void ocelot_unlock_inj_grp(struct ocelot *ocelot, int grp) + __releases(&ocelot->inj_lock) +{ + spin_unlock(&ocelot->inj_lock); +} +EXPORT_SYMBOL_GPL(ocelot_unlock_inj_grp); + +void ocelot_lock_xtr_grp(struct ocelot *ocelot, int grp) + __acquires(&ocelot->inj_lock) +{ + spin_lock(&ocelot->inj_lock); +} +EXPORT_SYMBOL_GPL(ocelot_lock_xtr_grp); + +void ocelot_unlock_xtr_grp(struct ocelot *ocelot, int grp) + __releases(&ocelot->inj_lock) +{ + spin_unlock(&ocelot->inj_lock); +} +EXPORT_SYMBOL_GPL(ocelot_unlock_xtr_grp); + +void ocelot_lock_xtr_grp_bh(struct ocelot *ocelot, int grp) + __acquires(&ocelot->xtr_lock) +{ + spin_lock_bh(&ocelot->xtr_lock); +} +EXPORT_SYMBOL_GPL(ocelot_lock_xtr_grp_bh); + +void ocelot_unlock_xtr_grp_bh(struct ocelot *ocelot, int grp) + __releases(&ocelot->xtr_lock) +{ + spin_unlock_bh(&ocelot->xtr_lock); +} +EXPORT_SYMBOL_GPL(ocelot_unlock_xtr_grp_bh); + int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb) { u64 timestamp, src_port, len; @@ -1109,6 +1151,8 @@ int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb) u32 val, *buf; int err; + lockdep_assert_held(&ocelot->xtr_lock); + err = ocelot_xtr_poll_xfh(ocelot, grp, xfh); if (err) return err; @@ -1184,6 +1228,8 @@ bool ocelot_can_inject(struct ocelot *ocelot, int grp) { u32 val = ocelot_read(ocelot, QS_INJ_STATUS); + lockdep_assert_held(&ocelot->inj_lock); + if (!(val & QS_INJ_STATUS_FIFO_RDY(BIT(grp)))) return false; if (val & QS_INJ_STATUS_WMARK_REACHED(BIT(grp))) @@ -1236,6 +1282,8 @@ void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp, u32 ifh[OCELOT_TAG_LEN / 4]; unsigned int i, count, last; + lockdep_assert_held(&ocelot->inj_lock); + ocelot_write_rix(ocelot, QS_INJ_CTRL_GAP_SIZE(1) | QS_INJ_CTRL_SOF, QS_INJ_CTRL, grp); @@ -1272,6 +1320,8 @@ EXPORT_SYMBOL(ocelot_port_inject_frame); void ocelot_drain_cpu_queue(struct ocelot *ocelot, int grp) { + lockdep_assert_held(&ocelot->xtr_lock); + while (ocelot_read(ocelot, QS_XTR_DATA_PRESENT) & BIT(grp)) ocelot_read_rix(ocelot, QS_XTR_RD, grp); } @@ -2954,6 +3004,8 @@ int ocelot_init(struct ocelot *ocelot) mutex_init(&ocelot->fwd_domain_lock); spin_lock_init(&ocelot->ptp_clock_lock); spin_lock_init(&ocelot->ts_id_lock); + spin_lock_init(&ocelot->inj_lock); + spin_lock_init(&ocelot->xtr_lock); ocelot->owq = alloc_ordered_workqueue("ocelot-owq", 0); if (!ocelot->owq) diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c index 993212c3a7da6f..c09dd2e3343cba 100644 --- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c +++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c @@ -51,6 +51,8 @@ static irqreturn_t ocelot_xtr_irq_handler(int irq, void *arg) struct ocelot *ocelot = arg; int grp = 0, err; + ocelot_lock_xtr_grp(ocelot, grp); + while (ocelot_read(ocelot, QS_XTR_DATA_PRESENT) & BIT(grp)) { struct sk_buff *skb; @@ -69,6 +71,8 @@ static irqreturn_t ocelot_xtr_irq_handler(int irq, void *arg) if (err < 0) ocelot_drain_cpu_queue(ocelot, 0); + ocelot_unlock_xtr_grp(ocelot, grp); + return IRQ_HANDLED; } diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index ed18e6bafc8d0e..462c653e101746 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -813,6 +813,9 @@ struct ocelot { const u32 *const *map; struct list_head stats_regions; + spinlock_t inj_lock; + spinlock_t xtr_lock; + u32 pool_size[OCELOT_SB_NUM][OCELOT_SB_POOL_NUM]; int packet_buffer_size; int num_frame_refs; @@ -966,6 +969,12 @@ void __ocelot_target_write_ix(struct ocelot *ocelot, enum ocelot_target target, u32 val, u32 reg, u32 offset); /* Packet I/O */ +void ocelot_lock_inj_grp(struct ocelot *ocelot, int grp); +void ocelot_unlock_inj_grp(struct ocelot *ocelot, int grp); +void ocelot_lock_xtr_grp(struct ocelot *ocelot, int grp); +void ocelot_unlock_xtr_grp(struct ocelot *ocelot, int grp); +void ocelot_lock_xtr_grp_bh(struct ocelot *ocelot, int grp); +void ocelot_unlock_xtr_grp_bh(struct ocelot *ocelot, int grp); bool ocelot_can_inject(struct ocelot *ocelot, int grp); void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp, u32 rew_op, struct sk_buff *skb); From 93e4649efa964201c73b0a03c35c04a0d6fc809f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Aug 2024 03:07:05 +0300 Subject: [PATCH 424/991] net: dsa: provide a software untagging function on RX for VLAN-aware bridges Through code analysis, I realized that the ds->untag_bridge_pvid logic is contradictory - see the newly added FIXME above the kernel-doc for dsa_software_untag_vlan_unaware_bridge(). Moreover, for the Felix driver, I need something very similar, but which is actually _not_ contradictory: untag the bridge PVID on RX, but for VLAN-aware bridges. The existing logic does it for VLAN-unaware bridges. Since I don't want to change the functionality of drivers which were supposedly properly tested with the ds->untag_bridge_pvid flag, I have introduced a new one: ds->untag_vlan_aware_bridge_pvid, and I have refactored the DSA reception code into a common path for both flags. TODO: both flags should be unified under a single ds->software_vlan_untag, which users of both current flags should set. This is not something that can be carried out right away. It needs very careful examination of all drivers which make use of this functionality, since some of them actually get this wrong in the first place. For example, commit 9130c2d30c17 ("net: dsa: microchip: ksz8795: Use software untagging on CPU port") uses this in a driver which has ds->configure_vlan_while_not_filtering = true. The latter mechanism has been known for many years to be broken by design: https://lore.kernel.org/netdev/CABumfLzJmXDN_W-8Z=p9KyKUVi_HhS7o_poBkeKHS2BkAiyYpw@mail.gmail.com/ and we have the situation of 2 bugs canceling each other. There is no private VLAN, and the port follows the PVID of the VLAN-unaware bridge. So, it's kinda ok for that driver to use the ds->untag_bridge_pvid mechanism, in a broken way. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 16 +++--- net/dsa/tag.c | 5 +- net/dsa/tag.h | 135 +++++++++++++++++++++++++++++++++++----------- 3 files changed, 118 insertions(+), 38 deletions(-) diff --git a/include/net/dsa.h b/include/net/dsa.h index b06f97ae3da1b5..d7a6c2930277ea 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -403,14 +403,18 @@ struct dsa_switch { */ u32 configure_vlan_while_not_filtering:1; - /* If the switch driver always programs the CPU port as egress tagged - * despite the VLAN configuration indicating otherwise, then setting - * @untag_bridge_pvid will force the DSA receive path to pop the - * bridge's default_pvid VLAN tagged frames to offer a consistent - * behavior between a vlan_filtering=0 and vlan_filtering=1 bridge - * device. + /* Pop the default_pvid of VLAN-unaware bridge ports from tagged frames. + * DEPRECATED: Do NOT set this field in new drivers. Instead look at + * the dsa_software_vlan_untag() comments. */ u32 untag_bridge_pvid:1; + /* Pop the default_pvid of VLAN-aware bridge ports from tagged frames. + * Useful if the switch cannot preserve the VLAN tag as seen on the + * wire for user port ingress, and chooses to send all frames as + * VLAN-tagged to the CPU, including those which were originally + * untagged. + */ + u32 untag_vlan_aware_bridge_pvid:1; /* Let DSA manage the FDB entries towards the * CPU, based on the software bridge database. diff --git a/net/dsa/tag.c b/net/dsa/tag.c index 6e402d49afd3e8..79ad105902d978 100644 --- a/net/dsa/tag.c +++ b/net/dsa/tag.c @@ -105,8 +105,9 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, p = netdev_priv(skb->dev); - if (unlikely(cpu_dp->ds->untag_bridge_pvid)) { - nskb = dsa_untag_bridge_pvid(skb); + if (unlikely(cpu_dp->ds->untag_bridge_pvid || + cpu_dp->ds->untag_vlan_aware_bridge_pvid)) { + nskb = dsa_software_vlan_untag(skb); if (!nskb) { kfree_skb(skb); return 0; diff --git a/net/dsa/tag.h b/net/dsa/tag.h index f6b9c73718dfa2..d5707870906bc9 100644 --- a/net/dsa/tag.h +++ b/net/dsa/tag.h @@ -44,46 +44,81 @@ static inline struct net_device *dsa_conduit_find_user(struct net_device *dev, return NULL; } -/* If under a bridge with vlan_filtering=0, make sure to send pvid-tagged - * frames as untagged, since the bridge will not untag them. +/** + * dsa_software_untag_vlan_aware_bridge: Software untagging for VLAN-aware bridge + * @skb: Pointer to received socket buffer (packet) + * @br: Pointer to bridge upper interface of ingress port + * @vid: Parsed VID from packet + * + * The bridge can process tagged packets. Software like STP/PTP may not. The + * bridge can also process untagged packets, to the same effect as if they were + * tagged with the PVID of the ingress port. So packets tagged with the PVID of + * the bridge port must be software-untagged, to support both use cases. */ -static inline struct sk_buff *dsa_untag_bridge_pvid(struct sk_buff *skb) +static inline void dsa_software_untag_vlan_aware_bridge(struct sk_buff *skb, + struct net_device *br, + u16 vid) { - struct dsa_port *dp = dsa_user_to_port(skb->dev); - struct net_device *br = dsa_port_bridge_dev_get(dp); - struct net_device *dev = skb->dev; - struct net_device *upper_dev; - u16 vid, pvid, proto; + u16 pvid, proto; int err; - if (!br || br_vlan_enabled(br)) - return skb; - err = br_vlan_get_proto(br, &proto); if (err) - return skb; + return; - /* Move VLAN tag from data to hwaccel */ - if (!skb_vlan_tag_present(skb) && skb->protocol == htons(proto)) { - skb = skb_vlan_untag(skb); - if (!skb) - return NULL; - } + err = br_vlan_get_pvid_rcu(skb->dev, &pvid); + if (err) + return; - if (!skb_vlan_tag_present(skb)) - return skb; + if (vid == pvid && skb->vlan_proto == htons(proto)) + __vlan_hwaccel_clear_tag(skb); +} - vid = skb_vlan_tag_get_id(skb); +/** + * dsa_software_untag_vlan_unaware_bridge: Software untagging for VLAN-unaware bridge + * @skb: Pointer to received socket buffer (packet) + * @br: Pointer to bridge upper interface of ingress port + * @vid: Parsed VID from packet + * + * The bridge ignores all VLAN tags. Software like STP/PTP may not (it may run + * on the plain port, or on a VLAN upper interface). Maybe packets are coming + * to software as tagged with a driver-defined VID which is NOT equal to the + * PVID of the bridge port (since the bridge is VLAN-unaware, its configuration + * should NOT be committed to hardware). DSA needs a method for this private + * VID to be communicated by software to it, and if packets are tagged with it, + * software-untag them. Note: the private VID may be different per bridge, to + * support the FDB isolation use case. + * + * FIXME: this is currently implemented based on the broken assumption that + * the "private VID" used by the driver in VLAN-unaware mode is equal to the + * bridge PVID. It should not be, except for a coincidence; the bridge PVID is + * irrelevant to the data path in the VLAN-unaware mode. Thus, the VID that + * this function removes is wrong. + * + * All users of ds->untag_bridge_pvid should fix their drivers, if necessary, + * to make the two independent. Only then, if there still remains a need to + * strip the private VID from packets, then a new ds->ops->get_private_vid() + * API shall be introduced to communicate to DSA what this VID is, which needs + * to be stripped here. + */ +static inline void dsa_software_untag_vlan_unaware_bridge(struct sk_buff *skb, + struct net_device *br, + u16 vid) +{ + struct net_device *upper_dev; + u16 pvid, proto; + int err; - /* We already run under an RCU read-side critical section since - * we are called from netif_receive_skb_list_internal(). - */ - err = br_vlan_get_pvid_rcu(dev, &pvid); + err = br_vlan_get_proto(br, &proto); if (err) - return skb; + return; - if (vid != pvid) - return skb; + err = br_vlan_get_pvid_rcu(skb->dev, &pvid); + if (err) + return; + + if (vid != pvid || skb->vlan_proto != htons(proto)) + return; /* The sad part about attempting to untag from DSA is that we * don't know, unless we check, if the skb will end up in @@ -95,10 +130,50 @@ static inline struct sk_buff *dsa_untag_bridge_pvid(struct sk_buff *skb) * definitely keep the tag, to make sure it keeps working. */ upper_dev = __vlan_find_dev_deep_rcu(br, htons(proto), vid); - if (upper_dev) + if (!upper_dev) + __vlan_hwaccel_clear_tag(skb); +} + +/** + * dsa_software_vlan_untag: Software VLAN untagging in DSA receive path + * @skb: Pointer to socket buffer (packet) + * + * Receive path method for switches which cannot avoid tagging all packets + * towards the CPU port. Called when ds->untag_bridge_pvid (legacy) or + * ds->untag_vlan_aware_bridge_pvid is set to true. + * + * As a side effect of this method, any VLAN tag from the skb head is moved + * to hwaccel. + */ +static inline struct sk_buff *dsa_software_vlan_untag(struct sk_buff *skb) +{ + struct dsa_port *dp = dsa_user_to_port(skb->dev); + struct net_device *br = dsa_port_bridge_dev_get(dp); + u16 vid; + + /* software untagging for standalone ports not yet necessary */ + if (!br) return skb; - __vlan_hwaccel_clear_tag(skb); + /* Move VLAN tag from data to hwaccel */ + if (!skb_vlan_tag_present(skb)) { + skb = skb_vlan_untag(skb); + if (!skb) + return NULL; + } + + if (!skb_vlan_tag_present(skb)) + return skb; + + vid = skb_vlan_tag_get_id(skb); + + if (br_vlan_enabled(br)) { + if (dp->ds->untag_vlan_aware_bridge_pvid) + dsa_software_untag_vlan_aware_bridge(skb, br, vid); + } else { + if (dp->ds->untag_bridge_pvid) + dsa_software_untag_vlan_unaware_bridge(skb, br, vid); + } return skb; } From f1288fd7293b91442ad7420394c252a252ecaa30 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Aug 2024 03:07:06 +0300 Subject: [PATCH 425/991] net: dsa: felix: fix VLAN tag loss on CPU reception with ocelot-8021q There is a major design bug with ocelot-8021q, which is that it expects more of the hardware than the hardware can actually do. The short summary of the issue is that when a port is under a VLAN-aware bridge and we use this tagging protocol, VLAN upper interfaces of this port do not see RX traffic. We use VCAP ES0 (egress rewriter) rules towards the tag_8021q CPU port to encapsulate packets with an outer tag, later stripped by software, that depends on the source user port. We do this so that packets can be identified in ocelot_rcv(). To be precise, we create rules with push_outer_tag = OCELOT_ES0_TAG and push_inner_tag = 0. With this configuration, we expect the switch to keep the inner tag configuration as found in the packet (if it was untagged on user port ingress, keep it untagged, otherwise preserve the VLAN tag unmodified as the inner tag towards the tag_8021q CPU port). But this is not what happens. Instead, table "Tagging Combinations" from the user manual suggests that when the ES0 action is "PUSH_OUTER_TAG=1 and PUSH_INNER_TAG=0", there will be "no inner tag". Experimentation further clarifies what this means. It appears that this "inner tag" which is not pushed into the packet on its egress towards the CPU is none other than the classified VLAN. When the ingress user port is standalone or under a VLAN-unaware bridge, the classified VLAN is a discardable quantity: it is a fixed value - the result of ocelot_vlan_unaware_pvid()'s configuration, and actually independent of the VID from any 802.1Q header that may be in the frame. It is actually preferable to discard the "inner tag" in this case. The problem is when the ingress port is under a VLAN-aware bridge. Then, the classified VLAN is taken from the frame's 802.1Q header, with a fallback on the bridge port's PVID. It would be very good to not discard the "inner tag" here, because if we do, we break communication with any 8021q VLAN uppers that the port might have. These have a processing path outside the bridge. There seems to be nothing else we can do except to change the configuration for VCAP ES0 rules, to actually push the inner VLAN into the frame. There are 2 options for that, first is to push a fixed value specified in the rule, and second is to push a fixed value, plus (aka arithmetic +) the classified VLAN. We choose the second option, and we select that fixed value as 0. Thus, what is pushed in the inner tag is just the classified VLAN. From there, we need to perform software untagging, in the receive path, of stuff that was untagged on the wire. Fixes: 7c83a7c539ab ("net: dsa: add a second tagger for Ocelot switches based on tag_8021q") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/ocelot/felix.c | 115 +++++++++++++++++++++++++++++++-- 1 file changed, 109 insertions(+), 6 deletions(-) diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 8d31ff18c5c7e0..4a705f7333f43b 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -61,11 +61,46 @@ static int felix_cpu_port_for_conduit(struct dsa_switch *ds, return cpu_dp->index; } +/** + * felix_update_tag_8021q_rx_rule - Update VCAP ES0 tag_8021q rule after + * vlan_filtering change + * @outer_tagging_rule: Pointer to VCAP filter on which the update is performed + * @vlan_filtering: Current bridge VLAN filtering setting + * + * Source port identification for tag_8021q is done using VCAP ES0 rules on the + * CPU port(s). The ES0 tag B (inner tag from the packet) can be configured as + * either: + * - push_inner_tag=0: the inner tag is never pushed into the frame + * (and we lose info about the classified VLAN). This is + * good when the classified VLAN is a discardable quantity + * for the software RX path: it is either set to + * OCELOT_STANDALONE_PVID, or to + * ocelot_vlan_unaware_pvid(bridge). + * - push_inner_tag=1: the inner tag is always pushed. This is good when the + * classified VLAN is not a discardable quantity (the port + * is under a VLAN-aware bridge, and software needs to + * continue processing the packet in the same VLAN as the + * hardware). + * The point is that what is good for a VLAN-unaware port is not good for a + * VLAN-aware port, and vice versa. Thus, the RX tagging rules must be kept in + * sync with the VLAN filtering state of the port. + */ +static void +felix_update_tag_8021q_rx_rule(struct ocelot_vcap_filter *outer_tagging_rule, + bool vlan_filtering) +{ + if (vlan_filtering) + outer_tagging_rule->action.push_inner_tag = OCELOT_ES0_TAG; + else + outer_tagging_rule->action.push_inner_tag = OCELOT_NO_ES0_TAG; +} + /* Set up VCAP ES0 rules for pushing a tag_8021q VLAN towards the CPU such that * the tagger can perform RX source port identification. */ static int felix_tag_8021q_vlan_add_rx(struct dsa_switch *ds, int port, - int upstream, u16 vid) + int upstream, u16 vid, + bool vlan_filtering) { struct ocelot_vcap_filter *outer_tagging_rule; struct ocelot *ocelot = ds->priv; @@ -96,6 +131,14 @@ static int felix_tag_8021q_vlan_add_rx(struct dsa_switch *ds, int port, outer_tagging_rule->action.tag_a_tpid_sel = OCELOT_TAG_TPID_SEL_8021AD; outer_tagging_rule->action.tag_a_vid_sel = 1; outer_tagging_rule->action.vid_a_val = vid; + felix_update_tag_8021q_rx_rule(outer_tagging_rule, vlan_filtering); + outer_tagging_rule->action.tag_b_tpid_sel = OCELOT_TAG_TPID_SEL_8021Q; + /* Leave TAG_B_VID_SEL at 0 (Classified VID + VID_B_VAL). Since we also + * leave VID_B_VAL at 0, this makes ES0 tag B (the inner tag) equal to + * the classified VID, which we need to see in the DSA tagger's receive + * path. Note: the inner tag is only visible in the packet when pushed + * (push_inner_tag == OCELOT_ES0_TAG). + */ err = ocelot_vcap_filter_add(ocelot, outer_tagging_rule, NULL); if (err) @@ -227,6 +270,7 @@ static int felix_tag_8021q_vlan_del_tx(struct dsa_switch *ds, int port, u16 vid) static int felix_tag_8021q_vlan_add(struct dsa_switch *ds, int port, u16 vid, u16 flags) { + struct dsa_port *dp = dsa_to_port(ds, port); struct dsa_port *cpu_dp; int err; @@ -234,11 +278,12 @@ static int felix_tag_8021q_vlan_add(struct dsa_switch *ds, int port, u16 vid, * membership, which we aren't. So we don't need to add any VCAP filter * for the CPU port. */ - if (!dsa_is_user_port(ds, port)) + if (!dsa_port_is_user(dp)) return 0; dsa_switch_for_each_cpu_port(cpu_dp, ds) { - err = felix_tag_8021q_vlan_add_rx(ds, port, cpu_dp->index, vid); + err = felix_tag_8021q_vlan_add_rx(ds, port, cpu_dp->index, vid, + dsa_port_is_vlan_filtering(dp)); if (err) return err; } @@ -258,10 +303,11 @@ static int felix_tag_8021q_vlan_add(struct dsa_switch *ds, int port, u16 vid, static int felix_tag_8021q_vlan_del(struct dsa_switch *ds, int port, u16 vid) { + struct dsa_port *dp = dsa_to_port(ds, port); struct dsa_port *cpu_dp; int err; - if (!dsa_is_user_port(ds, port)) + if (!dsa_port_is_user(dp)) return 0; dsa_switch_for_each_cpu_port(cpu_dp, ds) { @@ -278,11 +324,41 @@ static int felix_tag_8021q_vlan_del(struct dsa_switch *ds, int port, u16 vid) del_tx_failed: dsa_switch_for_each_cpu_port(cpu_dp, ds) - felix_tag_8021q_vlan_add_rx(ds, port, cpu_dp->index, vid); + felix_tag_8021q_vlan_add_rx(ds, port, cpu_dp->index, vid, + dsa_port_is_vlan_filtering(dp)); return err; } +static int felix_update_tag_8021q_rx_rules(struct dsa_switch *ds, int port, + bool vlan_filtering) +{ + struct ocelot_vcap_filter *outer_tagging_rule; + struct ocelot_vcap_block *block_vcap_es0; + struct ocelot *ocelot = ds->priv; + struct dsa_port *cpu_dp; + unsigned long cookie; + int err; + + block_vcap_es0 = &ocelot->block[VCAP_ES0]; + + dsa_switch_for_each_cpu_port(cpu_dp, ds) { + cookie = OCELOT_VCAP_ES0_TAG_8021Q_RXVLAN(ocelot, port, + cpu_dp->index); + + outer_tagging_rule = ocelot_vcap_block_find_filter_by_id(block_vcap_es0, + cookie, false); + + felix_update_tag_8021q_rx_rule(outer_tagging_rule, vlan_filtering); + + err = ocelot_vcap_filter_replace(ocelot, outer_tagging_rule); + if (err) + return err; + } + + return 0; +} + static int felix_trap_get_cpu_port(struct dsa_switch *ds, const struct ocelot_vcap_filter *trap) { @@ -532,6 +608,16 @@ static int felix_tag_8021q_setup(struct dsa_switch *ds) ocelot_drain_cpu_queue(ocelot, 0); ocelot_unlock_xtr_grp_bh(ocelot, 0); + /* Problem: when using push_inner_tag=1 for ES0 tag B, we lose info + * about whether the received packets were VLAN-tagged on the wire, + * since they are always tagged on egress towards the CPU port. + * + * Since using push_inner_tag=1 is unavoidable for VLAN-aware bridges, + * we must work around the fallout by untagging in software to make + * untagged reception work more or less as expected. + */ + ds->untag_vlan_aware_bridge_pvid = true; + return 0; } @@ -556,6 +642,8 @@ static void felix_tag_8021q_teardown(struct dsa_switch *ds) ocelot_port_teardown_dsa_8021q_cpu(ocelot, dp->index); dsa_tag_8021q_unregister(ds); + + ds->untag_vlan_aware_bridge_pvid = false; } static unsigned long felix_tag_8021q_get_host_fwd_mask(struct dsa_switch *ds) @@ -1010,8 +1098,23 @@ static int felix_vlan_filtering(struct dsa_switch *ds, int port, bool enabled, struct netlink_ext_ack *extack) { struct ocelot *ocelot = ds->priv; + bool using_tag_8021q; + struct felix *felix; + int err; - return ocelot_port_vlan_filtering(ocelot, port, enabled, extack); + err = ocelot_port_vlan_filtering(ocelot, port, enabled, extack); + if (err) + return err; + + felix = ocelot_to_felix(ocelot); + using_tag_8021q = felix->tag_proto == DSA_TAG_PROTO_OCELOT_8021Q; + if (using_tag_8021q) { + err = felix_update_tag_8021q_rx_rules(ds, port, enabled); + if (err) + return err; + } + + return 0; } static int felix_vlan_add(struct dsa_switch *ds, int port, From 36dd1141be70b5966906919714dc504a24c65ddf Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Aug 2024 03:07:07 +0300 Subject: [PATCH 426/991] net: mscc: ocelot: treat 802.1ad tagged traffic as 802.1Q-untagged I was revisiting the topic of 802.1ad treatment in the Ocelot switch [0] and realized that not only is its basic VLAN classification pipeline improper for offloading vlan_protocol 802.1ad bridges, but also improper for offloading regular 802.1Q bridges already. Namely, 802.1ad-tagged traffic should be treated as VLAN-untagged by bridged ports, but this switch treats it as if it was 802.1Q-tagged with the same VID as in the 802.1ad header. This is markedly different to what the Linux bridge expects; see the "other_tpid()" function in tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh. An idea came to me that the VCAP IS1 TCAM is more powerful than I'm giving it credit for, and that it actually overwrites the classified VID before the VLAN Table lookup takes place. In other words, it can be used even to save a packet from being dropped on ingress due to VLAN membership. Add a sophisticated TCAM rule hardcoded into the driver to force the switch to behave like a Linux bridge with vlan_filtering 1 vlan_protocol 802.1Q. Regarding the lifetime of the filter: eventually the bridge will disappear, and vlan_filtering on the port will be restored to 0 for standalone mode. Then the filter will be deleted. [0]: https://lore.kernel.org/netdev/20201009122947.nvhye4hvcha3tljh@skbuf/ Fixes: 7142529f1688 ("net: mscc: ocelot: add VLAN filtering") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot.c | 188 ++++++++++++++++++++++-- drivers/net/ethernet/mscc/ocelot_vcap.c | 1 + include/soc/mscc/ocelot_vcap.h | 2 + 3 files changed, 180 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index f4e027a6fe955b..3d72aa7b130503 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -453,9 +453,158 @@ static u16 ocelot_vlan_unaware_pvid(struct ocelot *ocelot, return VLAN_N_VID - bridge_num - 1; } +/** + * ocelot_update_vlan_reclassify_rule() - Make switch aware only to bridge VLAN TPID + * + * @ocelot: Switch private data structure + * @port: Index of ingress port + * + * IEEE 802.1Q-2018 clauses "5.5 C-VLAN component conformance" and "5.6 S-VLAN + * component conformance" suggest that a C-VLAN component should only recognize + * and filter on C-Tags, and an S-VLAN component should only recognize and + * process based on C-Tags. + * + * In Linux, as per commit 1a0b20b25732 ("Merge branch 'bridge-next'"), C-VLAN + * components are largely represented by a bridge with vlan_protocol 802.1Q, + * and S-VLAN components by a bridge with vlan_protocol 802.1ad. + * + * Currently the driver only offloads vlan_protocol 802.1Q, but the hardware + * design is non-conformant, because the switch assigns each frame to a VLAN + * based on an entirely different question, as detailed in figure "Basic VLAN + * Classification Flow" from its manual and reproduced below. + * + * Set TAG_TYPE, PCP, DEI, VID to port-default values in VLAN_CFG register + * if VLAN_AWARE_ENA[port] and frame has outer tag then: + * if VLAN_INNER_TAG_ENA[port] and frame has inner tag then: + * TAG_TYPE = (Frame.InnerTPID <> 0x8100) + * Set PCP, DEI, VID to values from inner VLAN header + * else: + * TAG_TYPE = (Frame.OuterTPID <> 0x8100) + * Set PCP, DEI, VID to values from outer VLAN header + * if VID == 0 then: + * VID = VLAN_CFG.VLAN_VID + * + * Summarized, the switch will recognize both 802.1Q and 802.1ad TPIDs as VLAN + * "with equal rights", and just set the TAG_TYPE bit to 0 (if 802.1Q) or to 1 + * (if 802.1ad). It will classify based on whichever of the tags is "outer", no + * matter what TPID that may have (or "inner", if VLAN_INNER_TAG_ENA[port]). + * + * In the VLAN Table, the TAG_TYPE information is not accessible - just the + * classified VID is - so it is as if each VLAN Table entry is for 2 VLANs: + * C-VLAN X, and S-VLAN X. + * + * Whereas the Linux bridge behavior is to only filter on frames with a TPID + * equal to the vlan_protocol, and treat everything else as VLAN-untagged. + * + * Consider an ingress packet tagged with 802.1ad VID=3 and 802.1Q VID=5, + * received on a bridge vlan_filtering=1 vlan_protocol=802.1Q port. This frame + * should be treated as 802.1Q-untagged, and classified to the PVID of that + * bridge port. Not to VID=3, and not to VID=5. + * + * The VCAP IS1 TCAM has everything we need to overwrite the choices made in + * the basic VLAN classification pipeline: it can match on TAG_TYPE in the key, + * and it can modify the classified VID in the action. Thus, for each port + * under a vlan_filtering bridge, we can insert a rule in VCAP IS1 lookup 0 to + * match on 802.1ad tagged frames and modify their classified VID to the 802.1Q + * PVID of the port. This effectively makes it appear to the outside world as + * if those packets were processed as VLAN-untagged. + * + * The rule needs to be updated each time the bridge PVID changes, and needs + * to be deleted if the bridge PVID is deleted, or if the port becomes + * VLAN-unaware. + */ +static int ocelot_update_vlan_reclassify_rule(struct ocelot *ocelot, int port) +{ + unsigned long cookie = OCELOT_VCAP_IS1_VLAN_RECLASSIFY(ocelot, port); + struct ocelot_vcap_block *block_vcap_is1 = &ocelot->block[VCAP_IS1]; + struct ocelot_port *ocelot_port = ocelot->ports[port]; + const struct ocelot_bridge_vlan *pvid_vlan; + struct ocelot_vcap_filter *filter; + int err, val, pcp, dei; + bool vid_replace_ena; + u16 vid; + + pvid_vlan = ocelot_port->pvid_vlan; + vid_replace_ena = ocelot_port->vlan_aware && pvid_vlan; + + filter = ocelot_vcap_block_find_filter_by_id(block_vcap_is1, cookie, + false); + if (!vid_replace_ena) { + /* If the reclassification filter doesn't need to exist, delete + * it if it was previously installed, and exit doing nothing + * otherwise. + */ + if (filter) + return ocelot_vcap_filter_del(ocelot, filter); + + return 0; + } + + /* The reclassification rule must apply. See if it already exists + * or if it must be created. + */ + + /* Treating as VLAN-untagged means using as classified VID equal to + * the bridge PVID, and PCP/DEI set to the port default QoS values. + */ + vid = pvid_vlan->vid; + val = ocelot_read_gix(ocelot, ANA_PORT_QOS_CFG, port); + pcp = ANA_PORT_QOS_CFG_QOS_DEFAULT_VAL_X(val); + dei = !!(val & ANA_PORT_QOS_CFG_DP_DEFAULT_VAL); + + if (filter) { + bool changed = false; + + /* Filter exists, just update it */ + if (filter->action.vid != vid) { + filter->action.vid = vid; + changed = true; + } + if (filter->action.pcp != pcp) { + filter->action.pcp = pcp; + changed = true; + } + if (filter->action.dei != dei) { + filter->action.dei = dei; + changed = true; + } + + if (!changed) + return 0; + + return ocelot_vcap_filter_replace(ocelot, filter); + } + + /* Filter doesn't exist, create it */ + filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!filter) + return -ENOMEM; + + filter->key_type = OCELOT_VCAP_KEY_ANY; + filter->ingress_port_mask = BIT(port); + filter->vlan.tpid = OCELOT_VCAP_BIT_1; + filter->prio = 1; + filter->id.cookie = cookie; + filter->id.tc_offload = false; + filter->block_id = VCAP_IS1; + filter->type = OCELOT_VCAP_FILTER_OFFLOAD; + filter->lookup = 0; + filter->action.vid_replace_ena = true; + filter->action.pcp_dei_ena = true; + filter->action.vid = vid; + filter->action.pcp = pcp; + filter->action.dei = dei; + + err = ocelot_vcap_filter_add(ocelot, filter, NULL); + if (err) + kfree(filter); + + return err; +} + /* Default vlan to clasify for untagged frames (may be zero) */ -static void ocelot_port_set_pvid(struct ocelot *ocelot, int port, - const struct ocelot_bridge_vlan *pvid_vlan) +static int ocelot_port_set_pvid(struct ocelot *ocelot, int port, + const struct ocelot_bridge_vlan *pvid_vlan) { struct ocelot_port *ocelot_port = ocelot->ports[port]; u16 pvid = ocelot_vlan_unaware_pvid(ocelot, ocelot_port->bridge); @@ -475,15 +624,23 @@ static void ocelot_port_set_pvid(struct ocelot *ocelot, int port, * happens automatically), but also 802.1p traffic which gets * classified to VLAN 0, but that is always in our RX filter, so it * would get accepted were it not for this setting. + * + * Also, we only support the bridge 802.1Q VLAN protocol, so + * 802.1ad-tagged frames (carrying S-Tags) should be considered + * 802.1Q-untagged, and also dropped. */ if (!pvid_vlan && ocelot_port->vlan_aware) val = ANA_PORT_DROP_CFG_DROP_PRIO_S_TAGGED_ENA | - ANA_PORT_DROP_CFG_DROP_PRIO_C_TAGGED_ENA; + ANA_PORT_DROP_CFG_DROP_PRIO_C_TAGGED_ENA | + ANA_PORT_DROP_CFG_DROP_S_TAGGED_ENA; ocelot_rmw_gix(ocelot, val, ANA_PORT_DROP_CFG_DROP_PRIO_S_TAGGED_ENA | - ANA_PORT_DROP_CFG_DROP_PRIO_C_TAGGED_ENA, + ANA_PORT_DROP_CFG_DROP_PRIO_C_TAGGED_ENA | + ANA_PORT_DROP_CFG_DROP_S_TAGGED_ENA, ANA_PORT_DROP_CFG, port); + + return ocelot_update_vlan_reclassify_rule(ocelot, port); } static struct ocelot_bridge_vlan *ocelot_bridge_vlan_find(struct ocelot *ocelot, @@ -631,7 +788,10 @@ int ocelot_port_vlan_filtering(struct ocelot *ocelot, int port, ANA_PORT_VLAN_CFG_VLAN_POP_CNT_M, ANA_PORT_VLAN_CFG, port); - ocelot_port_set_pvid(ocelot, port, ocelot_port->pvid_vlan); + err = ocelot_port_set_pvid(ocelot, port, ocelot_port->pvid_vlan); + if (err) + return err; + ocelot_port_manage_port_tag(ocelot, port); return 0; @@ -684,9 +844,12 @@ int ocelot_vlan_add(struct ocelot *ocelot, int port, u16 vid, bool pvid, return err; /* Default ingress vlan classification */ - if (pvid) - ocelot_port_set_pvid(ocelot, port, - ocelot_bridge_vlan_find(ocelot, vid)); + if (pvid) { + err = ocelot_port_set_pvid(ocelot, port, + ocelot_bridge_vlan_find(ocelot, vid)); + if (err) + return err; + } /* Untagged egress vlan clasification */ ocelot_port_manage_port_tag(ocelot, port); @@ -712,8 +875,11 @@ int ocelot_vlan_del(struct ocelot *ocelot, int port, u16 vid) return err; /* Ingress */ - if (del_pvid) - ocelot_port_set_pvid(ocelot, port, NULL); + if (del_pvid) { + err = ocelot_port_set_pvid(ocelot, port, NULL); + if (err) + return err; + } /* Egress */ ocelot_port_manage_port_tag(ocelot, port); @@ -2607,7 +2773,7 @@ int ocelot_port_set_default_prio(struct ocelot *ocelot, int port, u8 prio) ANA_PORT_QOS_CFG, port); - return 0; + return ocelot_update_vlan_reclassify_rule(ocelot, port); } EXPORT_SYMBOL_GPL(ocelot_port_set_default_prio); diff --git a/drivers/net/ethernet/mscc/ocelot_vcap.c b/drivers/net/ethernet/mscc/ocelot_vcap.c index 73cdec5ca6a34d..5734b86aed5b53 100644 --- a/drivers/net/ethernet/mscc/ocelot_vcap.c +++ b/drivers/net/ethernet/mscc/ocelot_vcap.c @@ -695,6 +695,7 @@ static void is1_entry_set(struct ocelot *ocelot, int ix, vcap_key_bit_set(vcap, &data, VCAP_IS1_HK_L2_MC, filter->dmac_mc); vcap_key_bit_set(vcap, &data, VCAP_IS1_HK_L2_BC, filter->dmac_bc); vcap_key_bit_set(vcap, &data, VCAP_IS1_HK_VLAN_TAGGED, tag->tagged); + vcap_key_bit_set(vcap, &data, VCAP_IS1_HK_TPID, tag->tpid); vcap_key_set(vcap, &data, VCAP_IS1_HK_VID, tag->vid.value, tag->vid.mask); vcap_key_set(vcap, &data, VCAP_IS1_HK_PCP, diff --git a/include/soc/mscc/ocelot_vcap.h b/include/soc/mscc/ocelot_vcap.h index c601a4598b0da8..eb19668a06db17 100644 --- a/include/soc/mscc/ocelot_vcap.h +++ b/include/soc/mscc/ocelot_vcap.h @@ -13,6 +13,7 @@ */ #define OCELOT_VCAP_ES0_TAG_8021Q_RXVLAN(ocelot, port, upstream) ((upstream) << 16 | (port)) #define OCELOT_VCAP_IS1_TAG_8021Q_TXVLAN(ocelot, port) (port) +#define OCELOT_VCAP_IS1_VLAN_RECLASSIFY(ocelot, port) ((ocelot)->num_phys_ports + (port)) #define OCELOT_VCAP_IS2_TAG_8021Q_TXVLAN(ocelot, port) (port) #define OCELOT_VCAP_IS2_MRP_REDIRECT(ocelot, port) ((ocelot)->num_phys_ports + (port)) #define OCELOT_VCAP_IS2_MRP_TRAP(ocelot) ((ocelot)->num_phys_ports * 2) @@ -499,6 +500,7 @@ struct ocelot_vcap_key_vlan { struct ocelot_vcap_u8 pcp; /* PCP (3 bit) */ enum ocelot_vcap_bit dei; /* DEI */ enum ocelot_vcap_bit tagged; /* Tagged/untagged frame */ + enum ocelot_vcap_bit tpid; }; struct ocelot_vcap_key_etype { From 27ec3c57fcadb43c79ed05b2ea31bc18c72d798a Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Fri, 9 Aug 2024 10:11:33 +0200 Subject: [PATCH 427/991] wifi: mwifiex: duplicate static structs used in driver instances mwifiex_band_2ghz and mwifiex_band_5ghz are statically allocated, but used and modified in driver instances. Duplicate them before using them in driver instances so that different driver instances do not influence each other. This was observed on a board which has one PCIe and one SDIO mwifiex adapter. It blew up in mwifiex_setup_ht_caps(). This was called with the statically allocated struct which is modified in this function. Cc: stable@vger.kernel.org Fixes: d6bffe8bb520 ("mwifiex: support for creation of AP interface") Signed-off-by: Sascha Hauer Reviewed-by: Francesco Dolcini Acked-by: Brian Norris Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20240809-mwifiex-duplicate-static-structs-v1-1-6837b903b1a4@pengutronix.de --- .../net/wireless/marvell/mwifiex/cfg80211.c | 32 +++++++++++++++---- 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c index 155eb0fab12a48..bf35c92f91d7e9 100644 --- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c +++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c @@ -4363,11 +4363,27 @@ int mwifiex_register_cfg80211(struct mwifiex_adapter *adapter) if (ISSUPP_ADHOC_ENABLED(adapter->fw_cap_info)) wiphy->interface_modes |= BIT(NL80211_IFTYPE_ADHOC); - wiphy->bands[NL80211_BAND_2GHZ] = &mwifiex_band_2ghz; - if (adapter->config_bands & BAND_A) - wiphy->bands[NL80211_BAND_5GHZ] = &mwifiex_band_5ghz; - else + wiphy->bands[NL80211_BAND_2GHZ] = devm_kmemdup(adapter->dev, + &mwifiex_band_2ghz, + sizeof(mwifiex_band_2ghz), + GFP_KERNEL); + if (!wiphy->bands[NL80211_BAND_2GHZ]) { + ret = -ENOMEM; + goto err; + } + + if (adapter->config_bands & BAND_A) { + wiphy->bands[NL80211_BAND_5GHZ] = devm_kmemdup(adapter->dev, + &mwifiex_band_5ghz, + sizeof(mwifiex_band_5ghz), + GFP_KERNEL); + if (!wiphy->bands[NL80211_BAND_5GHZ]) { + ret = -ENOMEM; + goto err; + } + } else { wiphy->bands[NL80211_BAND_5GHZ] = NULL; + } if (adapter->drcs_enabled && ISSUPP_DRCS_ENABLED(adapter->fw_cap_info)) wiphy->iface_combinations = &mwifiex_iface_comb_ap_sta_drcs; @@ -4461,8 +4477,7 @@ int mwifiex_register_cfg80211(struct mwifiex_adapter *adapter) if (ret < 0) { mwifiex_dbg(adapter, ERROR, "%s: wiphy_register failed: %d\n", __func__, ret); - wiphy_free(wiphy); - return ret; + goto err; } if (!adapter->regd) { @@ -4504,4 +4519,9 @@ int mwifiex_register_cfg80211(struct mwifiex_adapter *adapter) adapter->wiphy = wiphy; return ret; + +err: + wiphy_free(wiphy); + + return ret; } From b9b6ee6fe258ce4d89592593efcd3d798c418859 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 13 Aug 2024 16:25:19 +0200 Subject: [PATCH 428/991] thermal: gov_bang_bang: Call __thermal_cdev_update() directly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of clearing the "updated" flag for each cooling device affected by the trip point crossing in bang_bang_control() and walking all thermal instances to run thermal_cdev_update() for all of the affected cooling devices, call __thermal_cdev_update() directly for each of them. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Kästle Reviewed-by: Zhang Rui Cc: 6.10+ # 6.10+ Link: https://patch.msgid.link/13583081.uLZWGnKmhe@rjwysocki.net --- drivers/thermal/gov_bang_bang.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/thermal/gov_bang_bang.c b/drivers/thermal/gov_bang_bang.c index 4a2e869b9538c1..b9474c6af72b51 100644 --- a/drivers/thermal/gov_bang_bang.c +++ b/drivers/thermal/gov_bang_bang.c @@ -71,12 +71,9 @@ static void bang_bang_control(struct thermal_zone_device *tz, dev_dbg(&instance->cdev->device, "target=%ld\n", instance->target); mutex_lock(&instance->cdev->lock); - instance->cdev->updated = false; /* cdev needs update */ + __thermal_cdev_update(instance->cdev); mutex_unlock(&instance->cdev->lock); } - - list_for_each_entry(instance, &tz->thermal_instances, tz_node) - thermal_cdev_update(instance->cdev); } static struct thermal_governor thermal_gov_bang_bang = { From 84248e35d9b60e03df7276627e4e91fbaf80f73d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 13 Aug 2024 16:26:42 +0200 Subject: [PATCH 429/991] thermal: gov_bang_bang: Split bang_bang_control() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the setting of the thermal instance target state from bang_bang_control() into a separate function that will be also called in a different place going forward. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Kästle Reviewed-by: Zhang Rui Cc: 6.10+ # 6.10+ Link: https://patch.msgid.link/3313587.aeNJFYEL58@rjwysocki.net --- drivers/thermal/gov_bang_bang.c | 42 ++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/drivers/thermal/gov_bang_bang.c b/drivers/thermal/gov_bang_bang.c index b9474c6af72b51..87cff3ea77a9d9 100644 --- a/drivers/thermal/gov_bang_bang.c +++ b/drivers/thermal/gov_bang_bang.c @@ -13,6 +13,27 @@ #include "thermal_core.h" +static void bang_bang_set_instance_target(struct thermal_instance *instance, + unsigned int target) +{ + if (instance->target != 0 && instance->target != 1 && + instance->target != THERMAL_NO_TARGET) + pr_debug("Unexpected state %ld of thermal instance %s in bang-bang\n", + instance->target, instance->name); + + /* + * Enable the fan when the trip is crossed on the way up and disable it + * when the trip is crossed on the way down. + */ + instance->target = target; + + dev_dbg(&instance->cdev->device, "target=%ld\n", instance->target); + + mutex_lock(&instance->cdev->lock); + __thermal_cdev_update(instance->cdev); + mutex_unlock(&instance->cdev->lock); +} + /** * bang_bang_control - controls devices associated with the given zone * @tz: thermal_zone_device @@ -54,25 +75,8 @@ static void bang_bang_control(struct thermal_zone_device *tz, tz->temperature, trip->hysteresis); list_for_each_entry(instance, &tz->thermal_instances, tz_node) { - if (instance->trip != trip) - continue; - - if (instance->target != 0 && instance->target != 1 && - instance->target != THERMAL_NO_TARGET) - pr_debug("Unexpected state %ld of thermal instance %s in bang-bang\n", - instance->target, instance->name); - - /* - * Enable the fan when the trip is crossed on the way up and - * disable it when the trip is crossed on the way down. - */ - instance->target = crossed_up; - - dev_dbg(&instance->cdev->device, "target=%ld\n", instance->target); - - mutex_lock(&instance->cdev->lock); - __thermal_cdev_update(instance->cdev); - mutex_unlock(&instance->cdev->lock); + if (instance->trip == trip) + bang_bang_set_instance_target(instance, crossed_up); } } From 5f64b4a1ab1b0412446d42e1fc2964c2cdb60b27 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 13 Aug 2024 16:27:33 +0200 Subject: [PATCH 430/991] thermal: gov_bang_bang: Add .manage() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After recent changes, the Bang-bang governor may not adjust the initial configuration of cooling devices to the actual situation. Namely, if a cooling device bound to a certain trip point starts in the "on" state and the thermal zone temperature is below the threshold of that trip point, the trip point may never be crossed on the way up in which case the state of the cooling device will never be adjusted because the thermal core will never invoke the governor's .trip_crossed() callback. [Note that there is no issue if the zone temperature is at the trip threshold or above it to start with because .trip_crossed() will be invoked then to indicate the start of thermal mitigation for the given trip.] To address this, add a .manage() callback to the Bang-bang governor and use it to ensure that all of the thermal instances managed by the governor have been initialized properly and the states of all of the cooling devices involved have been adjusted to the current zone temperature as appropriate. Fixes: 530c932bdf75 ("thermal: gov_bang_bang: Use .trip_crossed() instead of .throttle()") Link: https://lore.kernel.org/linux-pm/1bfbbae5-42b0-4c7d-9544-e98855715294@piie.net/ Cc: 6.10+ # 6.10+ Signed-off-by: Rafael J. Wysocki Acked-by: Peter Kästle Reviewed-by: Zhang Rui Link: https://patch.msgid.link/8419356.T7Z3S40VBb@rjwysocki.net --- drivers/thermal/gov_bang_bang.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/drivers/thermal/gov_bang_bang.c b/drivers/thermal/gov_bang_bang.c index 87cff3ea77a9d9..bc55e0698bfa8c 100644 --- a/drivers/thermal/gov_bang_bang.c +++ b/drivers/thermal/gov_bang_bang.c @@ -26,6 +26,7 @@ static void bang_bang_set_instance_target(struct thermal_instance *instance, * when the trip is crossed on the way down. */ instance->target = target; + instance->initialized = true; dev_dbg(&instance->cdev->device, "target=%ld\n", instance->target); @@ -80,8 +81,37 @@ static void bang_bang_control(struct thermal_zone_device *tz, } } +static void bang_bang_manage(struct thermal_zone_device *tz) +{ + const struct thermal_trip_desc *td; + struct thermal_instance *instance; + + for_each_trip_desc(tz, td) { + const struct thermal_trip *trip = &td->trip; + + if (tz->temperature >= td->threshold || + trip->temperature == THERMAL_TEMP_INVALID || + trip->type == THERMAL_TRIP_CRITICAL || + trip->type == THERMAL_TRIP_HOT) + continue; + + /* + * If the initial cooling device state is "on", but the zone + * temperature is not above the trip point, the core will not + * call bang_bang_control() until the zone temperature reaches + * the trip point temperature which may be never. In those + * cases, set the initial state of the cooling device to 0. + */ + list_for_each_entry(instance, &tz->thermal_instances, tz_node) { + if (!instance->initialized && instance->trip == trip) + bang_bang_set_instance_target(instance, 0); + } + } +} + static struct thermal_governor thermal_gov_bang_bang = { .name = "bang_bang", .trip_crossed = bang_bang_control, + .manage = bang_bang_manage, }; THERMAL_GOVERNOR_DECLARE(thermal_gov_bang_bang); From 6e6f58a170ea98e44075b761f2da42a5aec47dfb Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 13 Aug 2024 16:29:11 +0200 Subject: [PATCH 431/991] thermal: gov_bang_bang: Use governor_data to reduce overhead MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After running once, the for_each_trip_desc() loop in bang_bang_manage() is pure needless overhead because it is not going to make any changes unless a new cooling device has been bound to one of the trips in the thermal zone or the system is resuming from sleep. For this reason, make bang_bang_manage() set governor_data for the thermal zone and check it upfront to decide whether or not it needs to do anything. However, governor_data needs to be reset in some cases to let bang_bang_manage() know that it should walk the trips again, so add an .update_tz() callback to the governor and make the core additionally invoke it during system resume. To avoid affecting the other users of that callback unnecessarily, add a special notification reason for system resume, THERMAL_TZ_RESUME, and also pass it to __thermal_zone_device_update() called during system resume for consistency. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Kästle Reviewed-by: Zhang Rui Cc: 6.10+ # 6.10+ Link: https://patch.msgid.link/2285575.iZASKD2KPV@rjwysocki.net --- drivers/thermal/gov_bang_bang.c | 18 ++++++++++++++++++ drivers/thermal/thermal_core.c | 3 ++- include/linux/thermal.h | 1 + 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/gov_bang_bang.c b/drivers/thermal/gov_bang_bang.c index bc55e0698bfa8c..daed67d19efb81 100644 --- a/drivers/thermal/gov_bang_bang.c +++ b/drivers/thermal/gov_bang_bang.c @@ -86,6 +86,10 @@ static void bang_bang_manage(struct thermal_zone_device *tz) const struct thermal_trip_desc *td; struct thermal_instance *instance; + /* If the code below has run already, nothing needs to be done. */ + if (tz->governor_data) + return; + for_each_trip_desc(tz, td) { const struct thermal_trip *trip = &td->trip; @@ -107,11 +111,25 @@ static void bang_bang_manage(struct thermal_zone_device *tz) bang_bang_set_instance_target(instance, 0); } } + + tz->governor_data = (void *)true; +} + +static void bang_bang_update_tz(struct thermal_zone_device *tz, + enum thermal_notify_event reason) +{ + /* + * Let bang_bang_manage() know that it needs to walk trips after binding + * a new cdev and after system resume. + */ + if (reason == THERMAL_TZ_BIND_CDEV || reason == THERMAL_TZ_RESUME) + tz->governor_data = NULL; } static struct thermal_governor thermal_gov_bang_bang = { .name = "bang_bang", .trip_crossed = bang_bang_control, .manage = bang_bang_manage, + .update_tz = bang_bang_update_tz, }; THERMAL_GOVERNOR_DECLARE(thermal_gov_bang_bang); diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 95c399f9474418..e6669aeda1fff0 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -1728,7 +1728,8 @@ static void thermal_zone_device_resume(struct work_struct *work) thermal_debug_tz_resume(tz); thermal_zone_device_init(tz); - __thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); + thermal_governor_update_tz(tz, THERMAL_TZ_RESUME); + __thermal_zone_device_update(tz, THERMAL_TZ_RESUME); complete(&tz->resume); tz->resuming = false; diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 25fbf960b474bd..b86ddca46b9e81 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -55,6 +55,7 @@ enum thermal_notify_event { THERMAL_TZ_BIND_CDEV, /* Cooling dev is bind to the thermal zone */ THERMAL_TZ_UNBIND_CDEV, /* Cooling dev is unbind from the thermal zone */ THERMAL_INSTANCE_WEIGHT_CHANGED, /* Thermal instance weight changed */ + THERMAL_TZ_RESUME, /* Thermal zone is resuming after system sleep */ }; /** From a42db293e5983aa1508d12644f23d73f0553b32c Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Fri, 16 Aug 2024 12:33:28 +0530 Subject: [PATCH 432/991] ASoC: SOF: amd: Fix for acp init sequence When ACP is not powered on by default, acp power on sequence explicitly invoked by programming pgfsm control mask. The existing implementation checks the same PGFSM status mask and programs the same PGFSM control mask in all ACP variants which breaks acp power on sequence for ACP6.0 and ACP6.3 variants. So to fix this issue, update ACP pgfsm control mask and status mask based on acp descriptor rev field, which will vary based on acp variant. Fixes: 846aef1d7cc0 ("ASoC: SOF: amd: Add Renoir ACP HW support") Signed-off-by: Vijendar Mukunda Link: https://patch.msgid.link/20240816070328.610360-1-Vijendar.Mukunda@amd.com Signed-off-by: Mark Brown --- sound/soc/sof/amd/acp.c | 19 +++++++++++++++++-- sound/soc/sof/amd/acp.h | 7 +++++-- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/sound/soc/sof/amd/acp.c b/sound/soc/sof/amd/acp.c index d95f865669a699..85b58c8ccd0da2 100644 --- a/sound/soc/sof/amd/acp.c +++ b/sound/soc/sof/amd/acp.c @@ -433,6 +433,7 @@ static int acp_power_on(struct snd_sof_dev *sdev) const struct sof_amd_acp_desc *desc = get_chip_info(sdev->pdata); unsigned int base = desc->pgfsm_base; unsigned int val; + unsigned int acp_pgfsm_status_mask, acp_pgfsm_cntl_mask; int ret; val = snd_sof_dsp_read(sdev, ACP_DSP_BAR, base + PGFSM_STATUS_OFFSET); @@ -440,9 +441,23 @@ static int acp_power_on(struct snd_sof_dev *sdev) if (val == ACP_POWERED_ON) return 0; - if (val & ACP_PGFSM_STATUS_MASK) + switch (desc->rev) { + case 3: + case 5: + acp_pgfsm_status_mask = ACP3X_PGFSM_STATUS_MASK; + acp_pgfsm_cntl_mask = ACP3X_PGFSM_CNTL_POWER_ON_MASK; + break; + case 6: + acp_pgfsm_status_mask = ACP6X_PGFSM_STATUS_MASK; + acp_pgfsm_cntl_mask = ACP6X_PGFSM_CNTL_POWER_ON_MASK; + break; + default: + return -EINVAL; + } + + if (val & acp_pgfsm_status_mask) snd_sof_dsp_write(sdev, ACP_DSP_BAR, base + PGFSM_CONTROL_OFFSET, - ACP_PGFSM_CNTL_POWER_ON_MASK); + acp_pgfsm_cntl_mask); ret = snd_sof_dsp_read_poll_timeout(sdev, ACP_DSP_BAR, base + PGFSM_STATUS_OFFSET, val, !val, ACP_REG_POLL_INTERVAL, ACP_REG_POLL_TIMEOUT_US); diff --git a/sound/soc/sof/amd/acp.h b/sound/soc/sof/amd/acp.h index 1af86b5b28db80..61b28df8c90814 100644 --- a/sound/soc/sof/amd/acp.h +++ b/sound/soc/sof/amd/acp.h @@ -25,8 +25,11 @@ #define ACP_REG_POLL_TIMEOUT_US 2000 #define ACP_DMA_COMPLETE_TIMEOUT_US 5000 -#define ACP_PGFSM_CNTL_POWER_ON_MASK 0x01 -#define ACP_PGFSM_STATUS_MASK 0x03 +#define ACP3X_PGFSM_CNTL_POWER_ON_MASK 0x01 +#define ACP3X_PGFSM_STATUS_MASK 0x03 +#define ACP6X_PGFSM_CNTL_POWER_ON_MASK 0x07 +#define ACP6X_PGFSM_STATUS_MASK 0x0F + #define ACP_POWERED_ON 0x00 #define ACP_ASSERT_RESET 0x01 #define ACP_RELEASE_RESET 0x00 From 145082ebfcf08f4fd254c467abf4aa58b4d38505 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Mon, 12 Aug 2024 14:17:21 +0200 Subject: [PATCH 433/991] Documentation/llvm: turn make command for ccache into code block The command provided to use ccache with clang is not a literal code block. Once built, the documentation displays the '' symbols as a " character, which is wrong, and the command can not be applied as provided. Turn the command into a literal code block. Signed-off-by: Javier Carrasco Reviewed-by: Nathan Chancellor Signed-off-by: Masahiro Yamada --- Documentation/kbuild/llvm.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/kbuild/llvm.rst b/Documentation/kbuild/llvm.rst index bb5c44f8bd1c49..6dc66b4f31a7bb 100644 --- a/Documentation/kbuild/llvm.rst +++ b/Documentation/kbuild/llvm.rst @@ -126,7 +126,7 @@ Ccache ``ccache`` can be used with ``clang`` to improve subsequent builds, (though KBUILD_BUILD_TIMESTAMP_ should be set to a deterministic value between builds -in order to avoid 100% cache misses, see Reproducible_builds_ for more info): +in order to avoid 100% cache misses, see Reproducible_builds_ for more info):: KBUILD_BUILD_TIMESTAMP='' make LLVM=1 CC="ccache clang" From c2f6e16a6771eaefba6bb35f6803fe7217822d41 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 Aug 2024 13:02:55 -0400 Subject: [PATCH 434/991] bcachefs: Increase size of cuckoo hash table on too many rehashes Also, improve the calculation of the new table size, so that it can shrink when needed. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets_waiting_for_journal.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c index ec1b636ef78d07..f70eb2127d322c 100644 --- a/fs/bcachefs/buckets_waiting_for_journal.c +++ b/fs/bcachefs/buckets_waiting_for_journal.c @@ -93,7 +93,7 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, .dev_bucket = (u64) dev << 56 | bucket, .journal_seq = journal_seq, }; - size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0; + size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0, nr_rehashes_this_size = 0; int ret = 0; mutex_lock(&b->lock); @@ -106,7 +106,7 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, for (i = 0; i < size; i++) nr_elements += t->d[i].journal_seq > flushed_seq; - new_bits = t->bits + (nr_elements * 3 > size); + new_bits = ilog2(roundup_pow_of_two(nr_elements * 3)); n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL); if (!n) { @@ -115,7 +115,14 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, } retry_rehash: + if (nr_rehashes_this_size == 3) { + new_bits++; + nr_rehashes_this_size = 0; + } + nr_rehashes++; + nr_rehashes_this_size++; + bucket_table_init(n, new_bits); tmp = new; From 075cabf324c3fd790d6ba39ff9db33a30b954fe2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Aug 2024 12:31:29 -0400 Subject: [PATCH 435/991] bcachefs: Fix forgetting to pass trans to fsck_err() Reported-by: syzbot+e3938cd6d761b78750e6@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index b69ef4b3de6e2a..ad517ef744e577 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -925,7 +925,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, return PTR_ERR(a); if (a->v.data_type && type && a->v.data_type != type) { - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, bucket_metadata_type_mismatch, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", From 9482f3b05332a624508a91c2ab2cf3527328a6a4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Aug 2024 12:41:46 -0400 Subject: [PATCH 436/991] bcachefs: avoid overflowing LRU_TIME_BITS for cached data lru Reported-by: syzbot+510b0b28f8e6de64d307@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 260e7fa83d0518..fd790b03fbe198 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -150,7 +150,9 @@ static inline void alloc_data_type_set(struct bch_alloc_v4 *a, enum bch_data_typ static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a) { - return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0; + return a.data_type == BCH_DATA_cached + ? a.io_time[READ] & LRU_TIME_MAX + : 0; } #define DATA_TYPES_MOVABLE \ From 99c87fe0f584f8d778a323141504d1ba5c89a4a5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Aug 2024 12:44:49 -0400 Subject: [PATCH 437/991] bcachefs: fix incorrect i_state usage Reported-by: syzbot+95e40eae71609e40d851@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 15fc41e63b6c63..94c392abef65cd 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -193,7 +193,7 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino * only insert fully created inodes in the inode hash table. But * discard_new_inode() expects it to be set... */ - inode->v.i_flags |= I_NEW; + inode->v.i_state |= I_NEW; /* * We don't want bch2_evict_inode() to delete the inode on disk, * we just raced and had another inode in cache. Normally new From 1fc2ac428ef7d2ab9e8e19efe7ec3e58aea51bf3 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 16 Aug 2024 12:15:23 -0600 Subject: [PATCH 438/991] io_uring: fix user_data field name in comment io_uring_cqe's user_data field refers to `sqe->data`, but io_uring_sqe does not have a data field. Fix the comment to say `sqe->user_data`. Signed-off-by: Caleb Sander Mateos Link: https://github.com/axboe/liburing/pull/1206 Link: https://lore.kernel.org/r/20240816181526.3642732-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 2aaf7ee256ac42..adc2524fd8e3dc 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -421,7 +421,7 @@ enum io_uring_msg_ring_flags { * IO completion data structure (Completion Queue Entry) */ struct io_uring_cqe { - __u64 user_data; /* sqe->data submission passed back */ + __u64 user_data; /* sqe->user_data value passed back */ __s32 res; /* result code for this event */ __u32 flags; From f232de7cdb4b99adb2c7f2bc5e0b7e4e1292873b Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Thu, 15 Aug 2024 10:16:08 +0300 Subject: [PATCH 439/991] net/mlx5e: SHAMPO, Fix page leak When SHAMPO is used, a receive queue currently almost always leaks one page on shutdown. A page has MLX5E_SHAMPO_WQ_HEADER_PER_PAGE (8) headers. These headers are tracked in the SHAMPO bitmap. Each page is released when the last header index in the group is processed. During header allocation, there can be leftovers from a page that will be used in a subsequent allocation. This is normally fine, except for the following scenario (simplified a bit): 1) Allocate N new page fragments, showing only the relevant last 4 fragments: 0: new page 1: new page 2: new page 3: new page 4: page from previous allocation 5: page from previous allocation 6: page from previous allocation 7: page from previous allocation 2) NAPI processes header indices 4-7 because they are the oldest allocated. Bit 7 will be set to 0. 3) Receive queue shutdown occurs. All the remaining bits are being iterated on to release the pages. But the page assigned to header indices 0-3 will not be freed due to what happened in step 2. This patch fixes the issue by making sure that on allocation, header fragments are always allocated in groups of MLX5E_SHAMPO_WQ_HEADER_PER_PAGE so that there is never a partial page left over between allocations. A more appropriate fix would be a refactoring of mlx5e_alloc_rx_hd_mpwqe() and mlx5e_build_shampo_hd_umr(). But this refactoring is too big for net. It will be targeted for net-next. Fixes: e839ac9a89cb ("net/mlx5e: SHAMPO, Simplify header page release in teardown") Signed-off-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20240815071611.2211873-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 225da8d691fcf2..23aa555ca0ae8d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -735,6 +735,7 @@ static int mlx5e_alloc_rx_hd_mpwqe(struct mlx5e_rq *rq) ksm_entries = bitmap_find_window(shampo->bitmap, shampo->hd_per_wqe, shampo->hd_per_wq, shampo->pi); + ksm_entries = ALIGN_DOWN(ksm_entries, MLX5E_SHAMPO_WQ_HEADER_PER_PAGE); if (!ksm_entries) return 0; From 94e521937839475b83bac46e4d3ccba332e12064 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Thu, 15 Aug 2024 10:16:09 +0300 Subject: [PATCH 440/991] net/mlx5e: SHAMPO, Release in progress headers The change in the fixes tag cleaned up too much: it removed the part that was releasing header pages that were posted via UMR but haven't been acknowledged yet on the ICOSQ. This patch corrects this omission by setting the bits between pi and ci to on when shutting down a queue with SHAMPO. To be consistent with the Striding RQ code, this action is done in mlx5e_free_rx_missing_descs(). Fixes: e839ac9a89cb ("net/mlx5e: SHAMPO, Simplify header page release in teardown") Signed-off-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20240815071611.2211873-3-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 + .../net/ethernet/mellanox/mlx5/core/en_main.c | 8 ++++++ .../net/ethernet/mellanox/mlx5/core/en_rx.c | 25 +++++++++++-------- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index bb5da42edc23a0..d9e241423bc567 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -998,6 +998,7 @@ void mlx5e_build_ptys2ethtool_map(void); bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev, u8 page_shift, enum mlx5e_mpwrq_umr_mode umr_mode); +void mlx5e_shampo_fill_umr(struct mlx5e_rq *rq, int len); void mlx5e_shampo_dealloc_hd(struct mlx5e_rq *rq); void mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats); void mlx5e_fold_sw_stats64(struct mlx5e_priv *priv, struct rtnl_link_stats64 *s); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 5df904639b0ce6..583fa24a7ae980 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1236,6 +1236,14 @@ void mlx5e_free_rx_missing_descs(struct mlx5e_rq *rq) rq->mpwqe.actual_wq_head = wq->head; rq->mpwqe.umr_in_progress = 0; rq->mpwqe.umr_completed = 0; + + if (test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state)) { + struct mlx5e_shampo_hd *shampo = rq->mpwqe.shampo; + u16 len; + + len = (shampo->pi - shampo->ci) & shampo->hd_per_wq; + mlx5e_shampo_fill_umr(rq, len); + } } void mlx5e_free_rx_descs(struct mlx5e_rq *rq) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 23aa555ca0ae8d..de9d01036c2807 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -963,26 +963,31 @@ void mlx5e_free_icosq_descs(struct mlx5e_icosq *sq) sq->cc = sqcc; } -static void mlx5e_handle_shampo_hd_umr(struct mlx5e_shampo_umr umr, - struct mlx5e_icosq *sq) +void mlx5e_shampo_fill_umr(struct mlx5e_rq *rq, int len) { - struct mlx5e_channel *c = container_of(sq, struct mlx5e_channel, icosq); - struct mlx5e_shampo_hd *shampo; - /* assume 1:1 relationship between RQ and icosq */ - struct mlx5e_rq *rq = &c->rq; - int end, from, len = umr.len; + struct mlx5e_shampo_hd *shampo = rq->mpwqe.shampo; + int end, from, full_len = len; - shampo = rq->mpwqe.shampo; end = shampo->hd_per_wq; from = shampo->ci; - if (from + len > shampo->hd_per_wq) { + if (from + len > end) { len -= end - from; bitmap_set(shampo->bitmap, from, end - from); from = 0; } bitmap_set(shampo->bitmap, from, len); - shampo->ci = (shampo->ci + umr.len) & (shampo->hd_per_wq - 1); + shampo->ci = (shampo->ci + full_len) & (shampo->hd_per_wq - 1); +} + +static void mlx5e_handle_shampo_hd_umr(struct mlx5e_shampo_umr umr, + struct mlx5e_icosq *sq) +{ + struct mlx5e_channel *c = container_of(sq, struct mlx5e_channel, icosq); + /* assume 1:1 relationship between RQ and icosq */ + struct mlx5e_rq *rq = &c->rq; + + mlx5e_shampo_fill_umr(rq, umr.len); } int mlx5e_poll_ico_cq(struct mlx5e_cq *cq) From a07e953dafe5ebd88942dc861dfb06eaf055fb07 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Thu, 15 Aug 2024 10:16:10 +0300 Subject: [PATCH 441/991] net/mlx5e: XPS, Fix oversight of Multi-PF Netdev changes The offending commit overlooked the Multi-PF Netdev changes. Revert mlx5e_set_default_xps_cpumasks to incorporate Multi-PF Netdev changes. Fixes: bcee093751f8 ("net/mlx5e: Modifying channels number and updating TX queues") Signed-off-by: Carolina Jubran Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20240815071611.2211873-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 583fa24a7ae980..16b67c457b6057 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3028,15 +3028,18 @@ int mlx5e_update_tx_netdev_queues(struct mlx5e_priv *priv) static void mlx5e_set_default_xps_cpumasks(struct mlx5e_priv *priv, struct mlx5e_params *params) { - struct mlx5_core_dev *mdev = priv->mdev; - int num_comp_vectors, ix, irq; - - num_comp_vectors = mlx5_comp_vectors_max(mdev); + int ix; for (ix = 0; ix < params->num_channels; ix++) { + int num_comp_vectors, irq, vec_ix; + struct mlx5_core_dev *mdev; + + mdev = mlx5_sd_ch_ix_get_dev(priv->mdev, ix); + num_comp_vectors = mlx5_comp_vectors_max(mdev); cpumask_clear(priv->scratchpad.cpumask); + vec_ix = mlx5_sd_ch_ix_get_vec_ix(mdev, ix); - for (irq = ix; irq < num_comp_vectors; irq += params->num_channels) { + for (irq = vec_ix; irq < num_comp_vectors; irq += params->num_channels) { int cpu = mlx5_comp_vector_get_cpu(mdev, irq); cpumask_set_cpu(cpu, priv->scratchpad.cpumask); From 607e1df7bd47fe91cab85a97f57870a26d066137 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Thu, 15 Aug 2024 10:16:11 +0300 Subject: [PATCH 442/991] net/mlx5: Fix IPsec RoCE MPV trace call Prevent the call trace below from happening, by not allowing IPsec creation over a slave, if master device doesn't support IPsec. WARNING: CPU: 44 PID: 16136 at kernel/locking/rwsem.c:240 down_read+0x75/0x94 Modules linked in: esp4_offload esp4 act_mirred act_vlan cls_flower sch_ingress mlx5_vdpa vringh vhost_iotlb vdpa mst_pciconf(OE) nfsv3 nfs_acl nfs lockd grace fscache netfs xt_CHECKSUM xt_MASQUERADE xt_conntrack ipt_REJECT nf_reject_ipv4 nft_compat nft_counter nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 rfkill cuse fuse rpcrdma sunrpc rdma_ucm ib_srpt ib_isert iscsi_target_mod target_core_mod ib_umad ib_iser libiscsi scsi_transport_iscsi rdma_cm ib_ipoib iw_cm ib_cm ipmi_ssif intel_rapl_msr intel_rapl_common amd64_edac edac_mce_amd kvm_amd kvm irqbypass crct10dif_pclmul crc32_pclmul mlx5_ib ghash_clmulni_intel sha1_ssse3 dell_smbios ib_uverbs aesni_intel crypto_simd dcdbas wmi_bmof dell_wmi_descriptor cryptd pcspkr ib_core acpi_ipmi sp5100_tco ccp i2c_piix4 ipmi_si ptdma k10temp ipmi_devintf ipmi_msghandler acpi_power_meter acpi_cpufreq ext4 mbcache jbd2 sd_mod t10_pi sg mgag200 drm_kms_helper syscopyarea sysfillrect mlx5_core sysimgblt fb_sys_fops cec ahci libahci mlxfw drm pci_hyperv_intf libata tg3 sha256_ssse3 tls megaraid_sas i2c_algo_bit psample wmi dm_mirror dm_region_hash dm_log dm_mod [last unloaded: mst_pci] CPU: 44 PID: 16136 Comm: kworker/44:3 Kdump: loaded Tainted: GOE 5.15.0-20240509.el8uek.uek7_u3_update_v6.6_ipsec_bf.x86_64 #2 Hardware name: Dell Inc. PowerEdge R7525/074H08, BIOS 2.0.3 01/15/2021 Workqueue: events xfrm_state_gc_task RIP: 0010:down_read+0x75/0x94 Code: 00 48 8b 45 08 65 48 8b 14 25 80 fc 01 00 83 e0 02 48 09 d0 48 83 c8 01 48 89 45 08 5d 31 c0 89 c2 89 c6 89 c7 e9 cb 88 3b 00 <0f> 0b 48 8b 45 08 a8 01 74 b2 a8 02 75 ae 48 89 c2 48 83 ca 02 f0 RSP: 0018:ffffb26387773da8 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffffa08b658af900 RCX: 0000000000000001 RDX: 0000000000000000 RSI: ff886bc5e1366f2f RDI: 0000000000000000 RBP: ffffa08b658af940 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffffa0a9bfb31540 R13: ffffa0a9bfb37900 R14: 0000000000000000 R15: ffffa0a9bfb37905 FS: 0000000000000000(0000) GS:ffffa0a9bfb00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055a45ed814e8 CR3: 000000109038a000 CR4: 0000000000350ee0 Call Trace: ? show_trace_log_lvl+0x1d6/0x2f9 ? show_trace_log_lvl+0x1d6/0x2f9 ? mlx5_devcom_for_each_peer_begin+0x29/0x60 [mlx5_core] ? down_read+0x75/0x94 ? __warn+0x80/0x113 ? down_read+0x75/0x94 ? report_bug+0xa4/0x11d ? handle_bug+0x35/0x8b ? exc_invalid_op+0x14/0x75 ? asm_exc_invalid_op+0x16/0x1b ? down_read+0x75/0x94 ? down_read+0xe/0x94 mlx5_devcom_for_each_peer_begin+0x29/0x60 [mlx5_core] mlx5_ipsec_fs_roce_tx_destroy+0xb1/0x130 [mlx5_core] tx_destroy+0x1b/0xc0 [mlx5_core] tx_ft_put+0x53/0xc0 [mlx5_core] mlx5e_xfrm_free_state+0x45/0x90 [mlx5_core] ___xfrm_state_destroy+0x10f/0x1a2 xfrm_state_gc_task+0x81/0xa9 process_one_work+0x1f1/0x3c6 worker_thread+0x53/0x3e4 ? process_one_work.cold+0x46/0x3c kthread+0x127/0x144 ? set_kthread_struct+0x60/0x52 ret_from_fork+0x22/0x2d ---[ end trace 5ef7896144d398e1 ]--- Fixes: dfbd229abeee ("net/mlx5: Configure IPsec steering for egress RoCEv2 MPV traffic") Reviewed-by: Leon Romanovsky Signed-off-by: Patrisious Haddad Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20240815071611.2211873-5-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.c index 234cd00f71a1cb..b7d4b1a2baf2ec 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.c @@ -386,7 +386,8 @@ static int ipsec_fs_roce_tx_mpv_create(struct mlx5_core_dev *mdev, return -EOPNOTSUPP; peer_priv = mlx5_devcom_get_next_peer_data(*ipsec_roce->devcom, &tmp); - if (!peer_priv) { + if (!peer_priv || !peer_priv->ipsec) { + mlx5_core_err(mdev, "IPsec not supported on master device\n"); err = -EOPNOTSUPP; goto release_peer; } @@ -455,7 +456,8 @@ static int ipsec_fs_roce_rx_mpv_create(struct mlx5_core_dev *mdev, return -EOPNOTSUPP; peer_priv = mlx5_devcom_get_next_peer_data(*ipsec_roce->devcom, &tmp); - if (!peer_priv) { + if (!peer_priv || !peer_priv->ipsec) { + mlx5_core_err(mdev, "IPsec not supported on master device\n"); err = -EOPNOTSUPP; goto release_peer; } From 0e49d3ff12501adaafaf6fdb19699f021d1eda1c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 May 2024 23:48:58 -0400 Subject: [PATCH 443/991] bcachefs: Fix locking in __bch2_trans_mark_dev_sb() We run this in full RW mode now, so we have to guard against the superblock buffer being reallocated. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 5 +---- fs/bcachefs/buckets.c | 14 +++++++++----- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 6cbf2aa6a9479b..eb3002c4eae7b1 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -741,12 +741,9 @@ static int bch2_gc_btrees(struct bch_fs *c) static int bch2_mark_superblocks(struct bch_fs *c) { - mutex_lock(&c->sb_lock); gc_pos_set(c, gc_phase(GC_PHASE_sb)); - int ret = bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc); - mutex_unlock(&c->sb_lock); - return ret; + return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc); } static void bch2_gc_free(struct bch_fs *c) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index ad517ef744e577..be2bbd2486314f 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -915,7 +915,6 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, enum bch_data_type type, unsigned sectors) { - struct bch_fs *c = trans->c; struct btree_iter iter; int ret = 0; @@ -1046,13 +1045,18 @@ static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *ca, enum btree_iter_update_trigger_flags flags) { - struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + struct bch_fs *c = trans->c; + + mutex_lock(&c->sb_lock); + struct bch_sb_layout layout = ca->disk_sb.sb->layout; + mutex_unlock(&c->sb_lock); + u64 bucket = 0; unsigned i, bucket_sectors = 0; int ret; - for (i = 0; i < layout->nr_superblocks; i++) { - u64 offset = le64_to_cpu(layout->sb_offset[i]); + for (i = 0; i < layout.nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout.sb_offset[i]); if (offset == BCH_SB_SECTOR) { ret = bch2_trans_mark_metadata_sectors(trans, ca, @@ -1063,7 +1067,7 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *c } ret = bch2_trans_mark_metadata_sectors(trans, ca, offset, - offset + (1 << layout->sb_max_size_bits), + offset + (1 << layout.sb_max_size_bits), BCH_DATA_sb, &bucket, &bucket_sectors, flags); if (ret) return ret; From 2fa62ce91a52e704716d08f9a8eb3f9e7e04710d Mon Sep 17 00:00:00 2001 From: Yihang Li Date: Wed, 14 Aug 2024 12:01:24 +0800 Subject: [PATCH 444/991] scsi: MAINTAINERS: Update HiSilicon SAS controller driver maintainer Add Yihang Li as the maintainer of the HiSilicon SAS controller driver, replacing Xiang Chen. Signed-off-by: Yihang Li Link: https://lore.kernel.org/r/20240814040124.1376195-1-liyihang9@huawei.com Signed-off-by: Martin K. Petersen --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 42decde3832066..f96bc870a6640b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10173,7 +10173,7 @@ F: Documentation/devicetree/bindings/infiniband/hisilicon-hns-roce.txt F: drivers/infiniband/hw/hns/ HISILICON SAS Controller -M: Xiang Chen +M: Yihang Li S: Supported W: http://www.hisilicon.com F: Documentation/devicetree/bindings/scsi/hisilicon-sas.txt From f03e94f23b04c2b71c0044c1534921b3975ef10c Mon Sep 17 00:00:00 2001 From: Chaotian Jing Date: Tue, 13 Aug 2024 13:34:10 +0800 Subject: [PATCH 445/991] scsi: core: Fix the return value of scsi_logical_block_count() scsi_logical_block_count() should return the block count of a given SCSI command. The original implementation ended up shifting twice, leading to an incorrect count being returned. Fix the conversion between bytes and logical blocks. Cc: stable@vger.kernel.org Fixes: 6a20e21ae1e2 ("scsi: core: Add helper to return number of logical blocks in a request") Signed-off-by: Chaotian Jing Link: https://lore.kernel.org/r/20240813053534.7720-1-chaotian.jing@mediatek.com Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- include/scsi/scsi_cmnd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index 45c40d200154dd..8ecfb94049db51 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -234,7 +234,7 @@ static inline sector_t scsi_get_lba(struct scsi_cmnd *scmd) static inline unsigned int scsi_logical_block_count(struct scsi_cmnd *scmd) { - unsigned int shift = ilog2(scmd->device->sector_size) - SECTOR_SHIFT; + unsigned int shift = ilog2(scmd->device->sector_size); return blk_rq_bytes(scsi_cmd_to_rq(scmd)) >> shift; } From a0c9fe5eecc97680323ee83780ea3eaf440ba1b7 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 15 Aug 2024 16:37:13 +0100 Subject: [PATCH 446/991] tc-testing: don't access non-existent variable on exception Since commit 255c1c7279ab ("tc-testing: Allow test cases to be skipped") the variable test_ordinal doesn't exist in call_pre_case(). So it should not be accessed when an exception occurs. This resolves the following splat: ... During handling of the above exception, another exception occurred: Traceback (most recent call last): File ".../tdc.py", line 1028, in main() File ".../tdc.py", line 1022, in main set_operation_mode(pm, parser, args, remaining) File ".../tdc.py", line 966, in set_operation_mode catresults = test_runner_serial(pm, args, alltests) File ".../tdc.py", line 642, in test_runner_serial (index, tsr) = test_runner(pm, args, alltests) File ".../tdc.py", line 536, in test_runner res = run_one_test(pm, args, index, tidx) File ".../tdc.py", line 419, in run_one_test pm.call_pre_case(tidx) File ".../tdc.py", line 146, in call_pre_case print('test_ordinal is {}'.format(test_ordinal)) NameError: name 'test_ordinal' is not defined Fixes: 255c1c7279ab ("tc-testing: Allow test cases to be skipped") Signed-off-by: Simon Horman Acked-by: Jamal Hadi Salim Link: https://patch.msgid.link/20240815-tdc-test-ordinal-v1-1-0255c122a427@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/tc-testing/tdc.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/tc-testing/tdc.py b/tools/testing/selftests/tc-testing/tdc.py index ee349187636fc1..4f255cec0c22e7 100755 --- a/tools/testing/selftests/tc-testing/tdc.py +++ b/tools/testing/selftests/tc-testing/tdc.py @@ -143,7 +143,6 @@ def call_pre_case(self, caseinfo, *, test_skip=False): except Exception as ee: print('exception {} in call to pre_case for {} plugin'. format(ee, pgn_inst.__class__)) - print('test_ordinal is {}'.format(test_ordinal)) print('testid is {}'.format(caseinfo['id'])) raise From cd06b713a6880997ca5aecac8e33d5f9c541749e Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Fri, 16 Aug 2024 11:55:10 +0530 Subject: [PATCH 447/991] scsi: ufs: core: Add a quirk for handling broken LSDBS field in controller capabilities register 'Legacy Queue & Single Doorbell Support (LSDBS)' field in the controller capabilities register is supposed to report whether the legacy single doorbell mode is supported in the controller or not. But some controllers report '1' in this field which corresponds to 'LSDB not supported', but they indeed support LSDB. So let's add a quirk to handle those controllers. If the quirk is enabled by the controller driver, then LSDBS register field will be ignored and legacy single doorbell mode is assumed to be enabled always. Tested-by: Amit Pundir Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20240816-ufs-bug-fix-v3-1-e6fe0e18e2a3@linaro.org Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 6 +++++- include/ufs/ufshcd.h | 8 ++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 0b3d0c8e0ddaed..a6f818cdef0e77 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -2426,7 +2426,11 @@ static inline int ufshcd_hba_capabilities(struct ufs_hba *hba) * 0h: legacy single doorbell support is available * 1h: indicate that legacy single doorbell support has been removed */ - hba->lsdb_sup = !FIELD_GET(MASK_LSDB_SUPPORT, hba->capabilities); + if (!(hba->quirks & UFSHCD_QUIRK_BROKEN_LSDBS_CAP)) + hba->lsdb_sup = !FIELD_GET(MASK_LSDB_SUPPORT, hba->capabilities); + else + hba->lsdb_sup = true; + if (!hba->mcq_sup) return 0; diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index cac0cdb9a916c7..0fd2aebac72862 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -676,6 +676,14 @@ enum ufshcd_quirks { * the standard best practice for managing keys). */ UFSHCD_QUIRK_KEYS_IN_PRDT = 1 << 24, + + /* + * This quirk indicates that the controller reports the value 1 (not + * supported) in the Legacy Single DoorBell Support (LSDBS) bit of the + * Controller Capabilities register although it supports the legacy + * single doorbell mode. + */ + UFSHCD_QUIRK_BROKEN_LSDBS_CAP = 1 << 25, }; enum ufshcd_caps { From ea593e028a9cc523557b4084a61d87ae69e2f270 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Fri, 16 Aug 2024 11:55:11 +0530 Subject: [PATCH 448/991] scsi: ufs: qcom: Add UFSHCD_QUIRK_BROKEN_LSDBS_CAP for SM8550 SoC SM8550 SoC has the UFSHCI 4.0 compliant UFS controller and only supports legacy single doorbell mode without MCQ. But due to a hardware bug, it reports 1 in the 'Legacy Queue & Single Doorbell Support (LSDBS)' field of the Controller Capabilities register. This field is supposed to read as 0 if legacy single doorbell mode is supported and 1 otherwise. Starting with commit 0c60eb0cc320 ("scsi: ufs: core: Check LSDBS cap when !mcq"), ufshcd driver is now relying on the LSDBS field to decide when to use the legacy doorbell mode if MCQ is not supported. And this ends up breaking UFS on SM8550: ufshcd-qcom 1d84000.ufs: ufshcd_init: failed to initialize (legacy doorbell mode not supported) ufshcd-qcom 1d84000.ufs: error -EINVAL: Initialization failed with error -22 So use the UFSHCD_QUIRK_BROKEN_LSDBS_CAP quirk for SM8550 SoC so that the ufshcd driver could use legacy doorbell mode correctly. Fixes: 0c60eb0cc320 ("scsi: ufs: core: Check LSDBS cap when !mcq") Tested-by: Amit Pundir Reviewed-by: Bart Van Assche Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20240816-ufs-bug-fix-v3-2-e6fe0e18e2a3@linaro.org Signed-off-by: Martin K. Petersen --- drivers/ufs/host/ufs-qcom.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/ufs/host/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c index 810e637047d04b..c87fdc849c627e 100644 --- a/drivers/ufs/host/ufs-qcom.c +++ b/drivers/ufs/host/ufs-qcom.c @@ -857,6 +857,9 @@ static void ufs_qcom_advertise_quirks(struct ufs_hba *hba) if (host->hw_ver.major > 0x3) hba->quirks |= UFSHCD_QUIRK_REINIT_AFTER_MAX_GEAR_SWITCH; + + if (of_device_is_compatible(hba->dev->of_node, "qcom,sm8550-ufshc")) + hba->quirks |= UFSHCD_QUIRK_BROKEN_LSDBS_CAP; } static void ufs_qcom_set_phy_gear(struct ufs_qcom_host *host) @@ -1847,7 +1850,8 @@ static void ufs_qcom_remove(struct platform_device *pdev) } static const struct of_device_id ufs_qcom_of_match[] __maybe_unused = { - { .compatible = "qcom,ufshc"}, + { .compatible = "qcom,ufshc" }, + { .compatible = "qcom,sm8550-ufshc" }, {}, }; MODULE_DEVICE_TABLE(of, ufs_qcom_of_match); From cd612b57c3672487ae8565855eaf9e83862eccc5 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 16 Aug 2024 13:59:56 +0100 Subject: [PATCH 449/991] scsi: MAINTAINERS: Add header files to SCSI SUBSYSTEM This is part of an effort to assign a section in MAINTAINERS to header files that relate to Networking [1]. In this case the files with "net" in their name. [1] https://lore.kernel.org/netdev/20240816-net-mnt-v1-0-ef946b47ced4@kernel.org/ As part of that effort these files came up: * include/uapi/scsi/scsi_netlink_fc.h * include/uapi/scsi/scsi_netlink.h Unlike all the other matching files, these one seem to relate more closely to SCSI than Networking, so I have added them to the SCSI SUBSYSTEM section. In order to simplify things, and for consistency, I have added the entire include/uapi/scsi rather than the individual files. Signed-off-by: Simon Horman Link: https://lore.kernel.org/r/20240816-scsi-mnt-v1-1-439af8b1c28b@kernel.org Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index f96bc870a6640b..9a33ab69abab2b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20351,6 +20351,7 @@ F: Documentation/devicetree/bindings/scsi/ F: drivers/scsi/ F: drivers/ufs/ F: include/scsi/ +F: include/uapi/scsi/ SCSI TAPE DRIVER M: Kai Mäkisara From cbaac68987b8699397df29413b33bd51f5255255 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 16 Aug 2024 20:53:10 -0400 Subject: [PATCH 450/991] scsi: sd: Do not attempt to configure discard unless LBPME is set Commit f874d7210d88 ("scsi: sd: Keep the discard mode stable") attempted to address an issue where one mode of discard operation got configured prior to the device completing full discovery. Unfortunately this change assumed discard was always enabled on the device. Do not attempt to configure discard unless LBPME is enabled. Link: https://lore.kernel.org/r/20240817005325.3319384-1-martin.petersen@oracle.com Fixes: f874d7210d88 ("scsi: sd: Keep the discard mode stable") Reported-by: Chris Bainbridge Tested-by: Chris Bainbridge Tested-by: Shin'ichiro Kawasaki Tested-by: John Garry Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 699f4f9674d98f..dad3991397cf9a 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3308,6 +3308,9 @@ static void sd_read_app_tag_own(struct scsi_disk *sdkp, unsigned char *buffer) static unsigned int sd_discard_mode(struct scsi_disk *sdkp) { + if (!sdkp->lbpme) + return SD_LBP_FULL; + if (!sdkp->lbpvpd) { /* LBP VPD page not provided */ if (sdkp->max_unmap_blocks) From ab8d66d132bc8f1992d3eb6cab8d32dda6733c84 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 29 Jul 2024 16:01:57 +0200 Subject: [PATCH 451/991] soundwire: stream: fix programming slave ports for non-continous port maps Two bitmasks in 'struct sdw_slave_prop' - 'source_ports' and 'sink_ports' - define which ports to program in sdw_program_slave_port_params(). The masks are used to get the appropriate data port properties ('struct sdw_get_slave_dpn_prop') from an array. Bitmasks can be non-continuous or can start from index different than 0, thus when looking for matching port property for given port, we must iterate over mask bits, not from 0 up to number of ports. This fixes allocation and programming slave ports, when a source or sink masks start from further index. Fixes: f8101c74aa54 ("soundwire: Add Master and Slave port programming") Cc: stable@vger.kernel.org Signed-off-by: Krzysztof Kozlowski Reviewed-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20240729140157.326450-1-krzysztof.kozlowski@linaro.org Signed-off-by: Vinod Koul --- drivers/soundwire/stream.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/soundwire/stream.c b/drivers/soundwire/stream.c index 7aa4900dcf3172..f275143d7b18e4 100644 --- a/drivers/soundwire/stream.c +++ b/drivers/soundwire/stream.c @@ -1291,18 +1291,18 @@ struct sdw_dpn_prop *sdw_get_slave_dpn_prop(struct sdw_slave *slave, unsigned int port_num) { struct sdw_dpn_prop *dpn_prop; - u8 num_ports; + unsigned long mask; int i; if (direction == SDW_DATA_DIR_TX) { - num_ports = hweight32(slave->prop.source_ports); + mask = slave->prop.source_ports; dpn_prop = slave->prop.src_dpn_prop; } else { - num_ports = hweight32(slave->prop.sink_ports); + mask = slave->prop.sink_ports; dpn_prop = slave->prop.sink_dpn_prop; } - for (i = 0; i < num_ports; i++) { + for_each_set_bit(i, &mask, 32) { if (dpn_prop[i].num == port_num) return &dpn_prop[i]; } From 3c0da3d163eb32f1f91891efaade027fa9b245b9 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 6 Aug 2024 21:51:42 +0200 Subject: [PATCH 452/991] fuse: Initialize beyond-EOF page contents before setting uptodate fuse_notify_store(), unlike fuse_do_readpage(), does not enable page zeroing (because it can be used to change partial page contents). So fuse_notify_store() must be more careful to fully initialize page contents (including parts of the page that are beyond end-of-file) before marking the page uptodate. The current code can leave beyond-EOF page contents uninitialized, which makes these uninitialized page contents visible to userspace via mmap(). This is an information leak, but only affects systems which do not enable init-on-alloc (via CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y or the corresponding kernel command line parameter). Link: https://bugs.chromium.org/p/project-zero/issues/detail?id=2574 Cc: stable@kernel.org Fixes: a1d75f258230 ("fuse: add store request") Signed-off-by: Jann Horn Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 9eb191b5c4de12..7146038b2fe7d6 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1618,9 +1618,11 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, this_num = min_t(unsigned, num, PAGE_SIZE - offset); err = fuse_copy_page(cs, &page, offset, this_num, 0); - if (!err && offset == 0 && - (this_num == PAGE_SIZE || file_size == end)) + if (!PageUptodate(page) && !err && offset == 0 && + (this_num == PAGE_SIZE || file_size == end)) { + zero_user_segment(page, this_num, PAGE_SIZE); SetPageUptodate(page); + } unlock_page(page); put_page(page); From 47ac09b91befbb6a235ab620c32af719f8208399 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 18 Aug 2024 13:17:27 -0700 Subject: [PATCH 453/991] Linux 6.11-rc4 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5f417808213fa1..68ebd6d6b444df 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 11 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Baby Opossum Posse # *DOCUMENTATION* From e4be320eeca842a3d7648258ee3673f1755a5a59 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 15 Aug 2024 18:31:36 -0500 Subject: [PATCH 454/991] smb3: fix broken cached reads when posix locks Mandatory locking is enforced for cached reads, which violates default posix semantics, and also it is enforced inconsistently. This affected recent versions of libreoffice, and can be demonstrated by opening a file twice from the same client, locking it from handle one and trying to read from it from handle two (which fails, returning EACCES). There is already a mount option "forcemandatorylock" (which defaults to off), so with this change only when the user intentionally specifies "forcemandatorylock" on mount will we break posix semantics on read to a locked range (ie we will only fail in this case, if the user mounts with "forcemandatorylock"). An earlier patch fixed the write path. Fixes: 85160e03a79e ("CIFS: Implement caching mechanism for mandatory brlocks") Cc: stable@vger.kernel.org Cc: Pavel Shilovsky Reviewed-by: David Howells Reported-by: abartlet@samba.org Reported-by: Kevin Ottens Signed-off-by: Steve French --- fs/smb/client/file.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 1fc66bcf49eb45..f9b302cb8233cf 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -2912,9 +2912,7 @@ cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to) if (!CIFS_CACHE_READ(cinode)) return netfs_unbuffered_read_iter(iocb, to); - if (cap_unix(tcon->ses) && - (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && - ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) { + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) { if (iocb->ki_flags & IOCB_DIRECT) return netfs_unbuffered_read_iter(iocb, to); return netfs_buffered_read_iter(iocb, to); From dfd046d0ced19b6ff5f11ec4ceab0a83de924771 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 15 Aug 2024 08:56:35 +0900 Subject: [PATCH 455/991] ksmbd: Use unsafe_memcpy() for ntlm_negotiate rsp buffer is allocated larger than spnego_blob from smb2_allocate_rsp_buf(). Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 2df1354288e68a..3f4c56a10a86f8 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1370,7 +1370,8 @@ static int ntlm_negotiate(struct ksmbd_work *work, } sz = le16_to_cpu(rsp->SecurityBufferOffset); - memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len); + unsafe_memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len, + /* alloc is larger than blob, see smb2_allocate_rsp_buf() */); rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len); out: @@ -1453,7 +1454,9 @@ static int ntlm_authenticate(struct ksmbd_work *work, return -ENOMEM; sz = le16_to_cpu(rsp->SecurityBufferOffset); - memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len); + unsafe_memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, + spnego_blob_len, + /* alloc is larger than blob, see smb2_allocate_rsp_buf() */); rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len); kfree(spnego_blob); } From 76e98a158b207771a6c9a0de0a60522a446a3447 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 17 Aug 2024 14:03:49 +0900 Subject: [PATCH 456/991] ksmbd: fix race condition between destroy_previous_session() and smb2 operations() If there is ->PreviousSessionId field in the session setup request, The session of the previous connection should be destroyed. During this, if the smb2 operation requests in the previous session are being processed, a racy issue could happen with ksmbd_destroy_file_table(). This patch sets conn->status to KSMBD_SESS_NEED_RECONNECT to block incoming operations and waits until on-going operations are complete (i.e. idle) before desctorying the previous session. Fixes: c8efcc786146 ("ksmbd: add support for durable handles v1/v2") Cc: stable@vger.kernel.org # v6.6+ Reported-by: zdi-disclosures@trendmicro.com # ZDI-CAN-25040 Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/connection.c | 34 ++++++++++++++++++++++++++++++- fs/smb/server/connection.h | 3 ++- fs/smb/server/mgmt/user_session.c | 9 ++++++++ fs/smb/server/smb2pdu.c | 2 +- 4 files changed, 45 insertions(+), 3 deletions(-) diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index 09e1e7771592f5..7889df8112b4ee 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -165,11 +165,43 @@ void ksmbd_all_conn_set_status(u64 sess_id, u32 status) up_read(&conn_list_lock); } -void ksmbd_conn_wait_idle(struct ksmbd_conn *conn, u64 sess_id) +void ksmbd_conn_wait_idle(struct ksmbd_conn *conn) { wait_event(conn->req_running_q, atomic_read(&conn->req_running) < 2); } +int ksmbd_conn_wait_idle_sess_id(struct ksmbd_conn *curr_conn, u64 sess_id) +{ + struct ksmbd_conn *conn; + int rc, retry_count = 0, max_timeout = 120; + int rcount = 1; + +retry_idle: + if (retry_count >= max_timeout) + return -EIO; + + down_read(&conn_list_lock); + list_for_each_entry(conn, &conn_list, conns_list) { + if (conn->binding || xa_load(&conn->sessions, sess_id)) { + if (conn == curr_conn) + rcount = 2; + if (atomic_read(&conn->req_running) >= rcount) { + rc = wait_event_timeout(conn->req_running_q, + atomic_read(&conn->req_running) < rcount, + HZ); + if (!rc) { + up_read(&conn_list_lock); + retry_count++; + goto retry_idle; + } + } + } + } + up_read(&conn_list_lock); + + return 0; +} + int ksmbd_conn_write(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index 5c2845e47cf2df..5b947175c048eb 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -145,7 +145,8 @@ extern struct list_head conn_list; extern struct rw_semaphore conn_list_lock; bool ksmbd_conn_alive(struct ksmbd_conn *conn); -void ksmbd_conn_wait_idle(struct ksmbd_conn *conn, u64 sess_id); +void ksmbd_conn_wait_idle(struct ksmbd_conn *conn); +int ksmbd_conn_wait_idle_sess_id(struct ksmbd_conn *curr_conn, u64 sess_id); struct ksmbd_conn *ksmbd_conn_alloc(void); void ksmbd_conn_free(struct ksmbd_conn *conn); bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c); diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c index 162a12685d2c95..99416ce9f50183 100644 --- a/fs/smb/server/mgmt/user_session.c +++ b/fs/smb/server/mgmt/user_session.c @@ -311,6 +311,7 @@ void destroy_previous_session(struct ksmbd_conn *conn, { struct ksmbd_session *prev_sess; struct ksmbd_user *prev_user; + int err; down_write(&sessions_table_lock); down_write(&conn->session_lock); @@ -325,8 +326,16 @@ void destroy_previous_session(struct ksmbd_conn *conn, memcmp(user->passkey, prev_user->passkey, user->passkey_sz)) goto out; + ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_RECONNECT); + err = ksmbd_conn_wait_idle_sess_id(conn, id); + if (err) { + ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_NEGOTIATE); + goto out; + } + ksmbd_destroy_file_table(&prev_sess->file_table); prev_sess->state = SMB2_SESSION_EXPIRED; + ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_NEGOTIATE); ksmbd_launch_ksmbd_durable_scavenger(); out: up_write(&conn->session_lock); diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 3f4c56a10a86f8..cb7f487c96af80 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -2213,7 +2213,7 @@ int smb2_session_logoff(struct ksmbd_work *work) ksmbd_conn_unlock(conn); ksmbd_close_session_fds(work); - ksmbd_conn_wait_idle(conn, sess_id); + ksmbd_conn_wait_idle(conn); /* * Re-lookup session to validate if session is deleted From 4fdd8664c8a94411a01d11d5ed2f083f105f570a Mon Sep 17 00:00:00 2001 From: Victor Timofei Date: Fri, 16 Aug 2024 22:24:52 +0300 Subject: [PATCH 457/991] ksmbd: fix spelling mistakes in documentation There are a couple of spelling mistakes in the documentation. This patch fixes them. Signed-off-by: Victor Timofei Acked-by: Namjae Jeon Signed-off-by: Steve French --- Documentation/filesystems/smb/ksmbd.rst | 26 ++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/Documentation/filesystems/smb/ksmbd.rst b/Documentation/filesystems/smb/ksmbd.rst index 6b30e43a0d11f4..67cb68ea6e6880 100644 --- a/Documentation/filesystems/smb/ksmbd.rst +++ b/Documentation/filesystems/smb/ksmbd.rst @@ -13,7 +13,7 @@ KSMBD architecture The subset of performance related operations belong in kernelspace and the other subset which belong to operations which are not really related with performance in userspace. So, DCE/RPC management that has historically resulted -into number of buffer overflow issues and dangerous security bugs and user +into a number of buffer overflow issues and dangerous security bugs and user account management are implemented in user space as ksmbd.mountd. File operations that are related with performance (open/read/write/close etc.) in kernel space (ksmbd). This also allows for easier integration with VFS @@ -24,8 +24,8 @@ ksmbd (kernel daemon) When the server daemon is started, It starts up a forker thread (ksmbd/interface name) at initialization time and open a dedicated port 445 -for listening to SMB requests. Whenever new clients make request, Forker -thread will accept the client connection and fork a new thread for dedicated +for listening to SMB requests. Whenever new clients make a request, the Forker +thread will accept the client connection and fork a new thread for a dedicated communication channel between the client and the server. It allows for parallel processing of SMB requests(commands) from clients as well as allowing for new clients to make new connections. Each instance is named ksmbd/1~n(port number) @@ -34,12 +34,12 @@ thread can decide to pass through the commands to the user space (ksmbd.mountd), currently DCE/RPC commands are identified to be handled through the user space. To further utilize the linux kernel, it has been chosen to process the commands as workitems and to be executed in the handlers of the ksmbd-io kworker threads. -It allows for multiplexing of the handlers as the kernel take care of initiating +It allows for multiplexing of the handlers as the kernel takes care of initiating extra worker threads if the load is increased and vice versa, if the load is -decreased it destroys the extra worker threads. So, after connection is -established with client. Dedicated ksmbd/1..n(port number) takes complete +decreased it destroys the extra worker threads. So, after the connection is +established with the client. Dedicated ksmbd/1..n(port number) takes complete ownership of receiving/parsing of SMB commands. Each received command is worked -in parallel i.e., There can be multiple clients commands which are worked in +in parallel i.e., there can be multiple client commands which are worked in parallel. After receiving each command a separated kernel workitem is prepared for each command which is further queued to be handled by ksmbd-io kworkers. So, each SMB workitem is queued to the kworkers. This allows the benefit of load @@ -49,9 +49,9 @@ performance by handling client commands in parallel. ksmbd.mountd (user space daemon) -------------------------------- -ksmbd.mountd is userspace process to, transfer user account and password that +ksmbd.mountd is a userspace process to, transfer the user account and password that are registered using ksmbd.adduser (part of utils for user space). Further it -allows sharing information parameters that parsed from smb.conf to ksmbd in +allows sharing information parameters that are parsed from smb.conf to ksmbd in kernel. For the execution part it has a daemon which is continuously running and connected to the kernel interface using netlink socket, it waits for the requests (dcerpc and share/user info). It handles RPC calls (at a minimum few @@ -124,7 +124,7 @@ How to run 1. Download ksmbd-tools(https://github.com/cifsd-team/ksmbd-tools/releases) and compile them. - - Refer README(https://github.com/cifsd-team/ksmbd-tools/blob/master/README.md) + - Refer to README(https://github.com/cifsd-team/ksmbd-tools/blob/master/README.md) to know how to use ksmbd.mountd/adduser/addshare/control utils $ ./autogen.sh @@ -133,7 +133,7 @@ How to run 2. Create /usr/local/etc/ksmbd/ksmbd.conf file, add SMB share in ksmbd.conf file. - - Refer ksmbd.conf.example in ksmbd-utils, See ksmbd.conf manpage + - Refer to ksmbd.conf.example in ksmbd-utils, See ksmbd.conf manpage for details to configure shares. $ man ksmbd.conf @@ -145,7 +145,7 @@ How to run $ man ksmbd.adduser $ sudo ksmbd.adduser -a -4. Insert ksmbd.ko module after build your kernel. No need to load module +4. Insert the ksmbd.ko module after you build your kernel. No need to load the module if ksmbd is built into the kernel. - Set ksmbd in menuconfig(e.g. $ make menuconfig) @@ -175,7 +175,7 @@ Each layer 1. Enable all component prints # sudo ksmbd.control -d "all" -2. Enable one of components (smb, auth, vfs, oplock, ipc, conn, rdma) +2. Enable one of the components (smb, auth, vfs, oplock, ipc, conn, rdma) # sudo ksmbd.control -d "smb" 3. Show what prints are enabled. From 7c525dddbee71880e654ad44f3917787a4f6042c Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Fri, 16 Aug 2024 19:33:39 +0200 Subject: [PATCH 458/991] ksmbd: Replace one-element arrays with flexible-array members Replace the deprecated one-element arrays with flexible-array members in the structs filesystem_attribute_info and filesystem_device_info. There are no binary differences after this conversion. Link: https://github.com/KSPP/linux/issues/79 Signed-off-by: Thorsten Blum Reviewed-by: Gustavo A. R. Silva Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 4 ++-- fs/smb/server/smb_common.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index cb7f487c96af80..0bc9edf22ba407 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -5360,7 +5360,7 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, "NTFS", PATH_MAX, conn->local_nls, 0); len = len * 2; info->FileSystemNameLen = cpu_to_le32(len); - sz = sizeof(struct filesystem_attribute_info) - 2 + len; + sz = sizeof(struct filesystem_attribute_info) + len; rsp->OutputBufferLength = cpu_to_le32(sz); break; } @@ -5386,7 +5386,7 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, len = len * 2; info->VolumeLabelSize = cpu_to_le32(len); info->Reserved = 0; - sz = sizeof(struct filesystem_vol_info) - 2 + len; + sz = sizeof(struct filesystem_vol_info) + len; rsp->OutputBufferLength = cpu_to_le32(sz); break; } diff --git a/fs/smb/server/smb_common.h b/fs/smb/server/smb_common.h index 4a3148b0167f54..cc1d6dfe29d565 100644 --- a/fs/smb/server/smb_common.h +++ b/fs/smb/server/smb_common.h @@ -213,7 +213,7 @@ struct filesystem_attribute_info { __le32 Attributes; __le32 MaxPathNameComponentLength; __le32 FileSystemNameLen; - __le16 FileSystemName[1]; /* do not have to save this - get subset? */ + __le16 FileSystemName[]; /* do not have to save this - get subset? */ } __packed; struct filesystem_device_info { @@ -226,7 +226,7 @@ struct filesystem_vol_info { __le32 SerialNumber; __le32 VolumeLabelSize; __le16 Reserved; - __le16 VolumeLabel[1]; + __le16 VolumeLabel[]; } __packed; struct filesystem_info { From d6d539c9a7ad0655e5ad46b5e869f1b20bce8953 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 Aug 2024 19:51:13 -0400 Subject: [PATCH 459/991] bcachefs: Reallocate table when we're increasing size Fixes: c2f6e16a6771 ("bcachefs: Increase size of cuckoo hash table on too many rehashes") Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets_waiting_for_journal.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c index f70eb2127d322c..f9fb150eda706c 100644 --- a/fs/bcachefs/buckets_waiting_for_journal.c +++ b/fs/bcachefs/buckets_waiting_for_journal.c @@ -107,7 +107,7 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, nr_elements += t->d[i].journal_seq > flushed_seq; new_bits = ilog2(roundup_pow_of_two(nr_elements * 3)); - +realloc: n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL); if (!n) { ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set; @@ -118,6 +118,8 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, if (nr_rehashes_this_size == 3) { new_bits++; nr_rehashes_this_size = 0; + kvfree(n); + goto realloc; } nr_rehashes++; From d9f49c3106e404776afcf6c5682357f4fe088beb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 Aug 2024 16:41:39 -0400 Subject: [PATCH 460/991] bcachefs: fix field-spanning write warning attempts to retrofit memory safety onto C are increasingly annoying ------------[ cut here ]------------ memcpy: detected field-spanning write (size 4) of single field "&k.replicas" at fs/bcachefs/replicas.c:454 (size 3) WARNING: CPU: 5 PID: 6525 at fs/bcachefs/replicas.c:454 bch2_replicas_gc2+0x2cb/0x400 [bcachefs] bch2_replicas_gc2+0x2cb/0x400: bch2_replicas_gc2 at /home/ojab/src/bcachefs/fs/bcachefs/replicas.c:454 (discriminator 3) Modules linked in: dm_mod tun nf_conntrack_netlink nfnetlink xt_addrtype br_netfilter overlay msr sctp bcachefs lz4hc_compress lz4_compress libcrc32c xor raid6_pq lz4_decompress pps_ldisc pps_core wireguard libchacha20poly1305 chacha_x86_64 poly1305_x86_64 ip6_udp_tunnel udp_tunnel curve25519_x86_64 libcurve25519_generic libchacha sit tunnel4 ip_tunnel af_packet bridge stp llc ip6table_nat ip6table_filter ip6_tables xt_MASQUERADE xt_conntrack iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 iptable_filter ip_tables x_tables tcp_bbr sch_fq_codel efivarfs nls_iso8859_1 nls_cp437 vfat fat cdc_mbim cdc_wdm cdc_ncm cdc_ether usbnet r8152 input_leds joydev mii amdgpu mousedev hid_generic usbhid hid ath10k_pci amd_atl edac_mce_amd ath10k_core kvm_amd ath kvm mac80211 bfq crc32_pclmul crc32c_intel polyval_clmulni polyval_generic sha512_ssse3 sha256_ssse3 sha1_ssse3 snd_hda_codec_generic snd_hda_codec_hdmi snd_hda_intel snd_intel_dspcfg i2c_algo_bit drm_exec snd_hda_codec r8169 drm_suballoc_helper aesni_intel gf128mul crypto_simd amdxcp realtek mfd_core tpm_crb drm_buddy snd_hwdep mdio_devres libarc4 cryptd tpm_tis wmi_bmof cfg80211 evdev libphy snd_hda_core tpm_tis_core gpu_sched rapl xhci_pci xhci_hcd snd_pcm drm_display_helper snd_timer tpm sp5100_tco rfkill efi_pstore mpt3sas drm_ttm_helper ahci usbcore libaescfb ccp snd ttm 8250 libahci watchdog soundcore raid_class sha1_generic acpi_cpufreq k10temp 8250_base usb_common scsi_transport_sas i2c_piix4 hwmon video serial_mctrl_gpio serial_base ecdh_generic wmi rtc_cmos backlight ecc gpio_amdpt rng_core gpio_generic button CPU: 5 UID: 0 PID: 6525 Comm: bcachefs Tainted: G W 6.11.0-rc1-ojab-00058-g224bc118aec9 #6 6d5debde398d2a84851f42ab300dae32c2992027 Tainted: [W]=WARN RIP: 0010:bch2_replicas_gc2+0x2cb/0x400 [bcachefs] Code: c7 c2 60 91 d1 c1 48 89 c6 48 c7 c7 98 91 d1 c1 4c 89 14 24 44 89 5c 24 08 48 89 44 24 20 c6 05 fa 68 04 00 01 e8 05 a3 40 e4 <0f> 0b 4c 8b 14 24 44 8b 5c 24 08 48 8b 44 24 20 e9 55 fe ff ff 8b RSP: 0018:ffffb434c9263d60 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff9a8efa79cc00 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffffb434c9263de0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000005 R13: ffff9a8efa73c300 R14: ffff9a8d9e880000 R15: ffff9a8d9e8806f8 FS: 0000000000000000(0000) GS:ffff9a9410c80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000565423373090 CR3: 0000000164e30000 CR4: 00000000003506f0 Call Trace: ? __warn+0x97/0x150 ? bch2_replicas_gc2+0x2cb/0x400 [bcachefs 9803eca5e131ef28f26250ede34072d5b50d98b3] bch2_replicas_gc2+0x2cb/0x400: bch2_replicas_gc2 at /home/ojab/src/bcachefs/fs/bcachefs/replicas.c:454 (discriminator 3) ? report_bug+0x196/0x1c0 ? handle_bug+0x3c/0x70 ? exc_invalid_op+0x17/0x80 ? __wake_up_klogd.part.0+0x4c/0x80 ? asm_exc_invalid_op+0x16/0x20 ? bch2_replicas_gc2+0x2cb/0x400 [bcachefs 9803eca5e131ef28f26250ede34072d5b50d98b3] bch2_replicas_gc2+0x2cb/0x400: bch2_replicas_gc2 at /home/ojab/src/bcachefs/fs/bcachefs/replicas.c:454 (discriminator 3) ? bch2_dev_usage_read+0xa0/0xa0 [bcachefs 9803eca5e131ef28f26250ede34072d5b50d98b3] bch2_dev_usage_read+0xa0/0xa0: discard_in_flight_remove at /home/ojab/src/bcachefs/fs/bcachefs/alloc_background.c:1712 Signed-off-by: Kent Overstreet --- fs/bcachefs/replicas.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 1223b710755dac..12b1d28b7eb499 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -451,7 +451,8 @@ int bch2_replicas_gc2(struct bch_fs *c) .type = BCH_DISK_ACCOUNTING_replicas, }; - memcpy(&k.replicas, e, replicas_entry_bytes(e)); + unsafe_memcpy(&k.replicas, e, replicas_entry_bytes(e), + "embedded variable length struct"); struct bpos p = disk_accounting_pos_to_bpos(&k); From 47cdc7b14417a40af6a5d5909f1d28a5a23fc11d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 Aug 2024 17:38:43 -0400 Subject: [PATCH 461/991] bcachefs: Fix incorrect gfp flags fixes: 00488 WARNING: CPU: 9 PID: 194 at mm/page_alloc.c:4410 __alloc_pages_noprof+0x1818/0x1888 00488 Modules linked in: 00488 CPU: 9 UID: 0 PID: 194 Comm: kworker/u66:1 Not tainted 6.11.0-rc1-ktest-g18fa10d6495f #2931 00488 Hardware name: linux,dummy-virt (DT) 00488 Workqueue: writeback wb_workfn (flush-bcachefs-2) 00488 pstate: 20001005 (nzCv daif -PAN -UAO -TCO -DIT +SSBS BTYPE=--) 00488 pc : __alloc_pages_noprof+0x1818/0x1888 00488 lr : __alloc_pages_noprof+0x5f4/0x1888 00488 sp : ffffff80ccd8ed00 00488 x29: ffffff80ccd8ed00 x28: 0000000000000000 x27: dfffffc000000000 00488 x26: 0000000000000010 x25: 0000000000000002 x24: 0000000000000000 00488 x23: 0000000000000000 x22: 1ffffff0199b1dbe x21: ffffff80cc680900 00488 x20: 0000000000000000 x19: ffffff80ccd8eed0 x18: 0000000000000000 00488 x17: ffffff80cc58a010 x16: dfffffc000000000 x15: 1ffffff00474e518 00488 x14: 1ffffff00474e518 x13: 1ffffff00474e518 x12: ffffffb8104701b9 00488 x11: 1ffffff8104701b8 x10: ffffffb8104701b8 x9 : ffffffc08043cde8 00488 x8 : 00000047efb8fe48 x7 : ffffff80ccd8ee20 x6 : 0000000000048000 00488 x5 : 1ffffff810470138 x4 : 0000000000000050 x3 : 1ffffff0199b1d94 00488 x2 : ffffffb0199b1d94 x1 : 0000000000000001 x0 : ffffffc082387448 00488 Call trace: 00488 __alloc_pages_noprof+0x1818/0x1888 00488 new_slab+0x284/0x2f0 00488 ___slab_alloc+0x208/0x8e0 00488 __kmalloc_noprof+0x328/0x340 00488 __bch2_writepage+0x106c/0x1830 00488 write_cache_pages+0xa0/0xe8 due to __GFP_NOFAIL without allowing reclaim Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-buffered.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index cc33d763f7221d..184d0385167689 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -534,7 +534,7 @@ static int __bch2_writepage(struct folio *folio, if (f_sectors > w->tmp_sectors) { kfree(w->tmp); - w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), __GFP_NOFAIL); + w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), GFP_NOFS|__GFP_NOFAIL); w->tmp_sectors = f_sectors; } From 5b5c96c63d5b6e91c622611e04b2b156bbae53f5 Mon Sep 17 00:00:00 2001 From: Hongzhen Luo Date: Thu, 1 Aug 2024 19:26:22 +0800 Subject: [PATCH 462/991] erofs: simplify readdir operation - Use i_size instead of i_size_read() due to immutable fses; - Get rid of an unneeded goto since erofs_fill_dentries() also works; - Remove unnecessary lines. Signed-off-by: Hongzhen Luo Link: https://lore.kernel.org/r/20240801112622.2164029-1-hongzhen@linux.alibaba.com Reviewed-by: Gao Xiang Signed-off-by: Gao Xiang --- fs/erofs/dir.c | 35 ++++++++++++----------------------- fs/erofs/internal.h | 2 +- 2 files changed, 13 insertions(+), 24 deletions(-) diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index 2193a6710c8f66..c3b90abdee37af 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -8,19 +8,15 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, void *dentry_blk, struct erofs_dirent *de, - unsigned int nameoff, unsigned int maxsize) + unsigned int nameoff0, unsigned int maxsize) { - const struct erofs_dirent *end = dentry_blk + nameoff; + const struct erofs_dirent *end = dentry_blk + nameoff0; while (de < end) { - const char *de_name; + unsigned char d_type = fs_ftype_to_dtype(de->file_type); + unsigned int nameoff = le16_to_cpu(de->nameoff); + const char *de_name = (char *)dentry_blk + nameoff; unsigned int de_namelen; - unsigned char d_type; - - d_type = fs_ftype_to_dtype(de->file_type); - - nameoff = le16_to_cpu(de->nameoff); - de_name = (char *)dentry_blk + nameoff; /* the last dirent in the block? */ if (de + 1 >= end) @@ -52,21 +48,20 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct super_block *sb = dir->i_sb; unsigned long bsz = sb->s_blocksize; - const size_t dirsize = i_size_read(dir); - unsigned int i = erofs_blknr(sb, ctx->pos); unsigned int ofs = erofs_blkoff(sb, ctx->pos); int err = 0; bool initial = true; buf.mapping = dir->i_mapping; - while (ctx->pos < dirsize) { + while (ctx->pos < dir->i_size) { + erofs_off_t dbstart = ctx->pos - ofs; struct erofs_dirent *de; unsigned int nameoff, maxsize; - de = erofs_bread(&buf, erofs_pos(sb, i), EROFS_KMAP); + de = erofs_bread(&buf, dbstart, EROFS_KMAP); if (IS_ERR(de)) { erofs_err(sb, "fail to readdir of logical block %u of nid %llu", - i, EROFS_I(dir)->nid); + erofs_blknr(sb, dbstart), EROFS_I(dir)->nid); err = PTR_ERR(de); break; } @@ -79,25 +74,19 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) break; } - maxsize = min_t(unsigned int, dirsize - ctx->pos + ofs, bsz); - + maxsize = min_t(unsigned int, dir->i_size - dbstart, bsz); /* search dirents at the arbitrary position */ if (initial) { initial = false; - ofs = roundup(ofs, sizeof(struct erofs_dirent)); - ctx->pos = erofs_pos(sb, i) + ofs; - if (ofs >= nameoff) - goto skip_this; + ctx->pos = dbstart + ofs; } err = erofs_fill_dentries(dir, ctx, de, (void *)de + ofs, nameoff, maxsize); if (err) break; -skip_this: - ctx->pos = erofs_pos(sb, i) + maxsize; - ++i; + ctx->pos = dbstart + maxsize; ofs = 0; } erofs_put_metabuf(&buf); diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 736607675396e8..45dc15ebd870dd 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -220,7 +220,7 @@ struct erofs_buf { }; #define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL }) -#define erofs_blknr(sb, addr) ((addr) >> (sb)->s_blocksize_bits) +#define erofs_blknr(sb, addr) ((erofs_blk_t)((addr) >> (sb)->s_blocksize_bits)) #define erofs_blkoff(sb, addr) ((addr) & ((sb)->s_blocksize - 1)) #define erofs_pos(sb, blk) ((erofs_off_t)(blk) << (sb)->s_blocksize_bits) #define erofs_iblks(i) (round_up((i)->i_size, i_blocksize(i)) >> (i)->i_blkbits) From 2c534624ae70100aeea0b5800b0f3768b2fd3cf0 Mon Sep 17 00:00:00 2001 From: Hongzhen Luo Date: Tue, 6 Aug 2024 19:22:08 +0800 Subject: [PATCH 463/991] erofs: get rid of check_layout_compatibility() Simple enough to just open-code it. Signed-off-by: Hongzhen Luo Reviewed-by: Sandeep Dhavale Reviewed-by: Gao Xiang Link: https://lore.kernel.org/r/20240806112208.150323-1-hongzhen@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/super.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 32ce5b35e1dff8..6cb5c8916174be 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -108,22 +108,6 @@ static void erofs_free_inode(struct inode *inode) kmem_cache_free(erofs_inode_cachep, vi); } -static bool check_layout_compatibility(struct super_block *sb, - struct erofs_super_block *dsb) -{ - const unsigned int feature = le32_to_cpu(dsb->feature_incompat); - - EROFS_SB(sb)->feature_incompat = feature; - - /* check if current kernel meets all mandatory requirements */ - if (feature & (~EROFS_ALL_FEATURE_INCOMPAT)) { - erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel", - feature & ~EROFS_ALL_FEATURE_INCOMPAT); - return false; - } - return true; -} - /* read variable-sized metadata, offset will be aligned by 4-byte */ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, erofs_off_t *offset, int *lengthp) @@ -279,7 +263,7 @@ static int erofs_scan_devices(struct super_block *sb, static int erofs_read_superblock(struct super_block *sb) { - struct erofs_sb_info *sbi; + struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_super_block *dsb; void *data; @@ -291,9 +275,7 @@ static int erofs_read_superblock(struct super_block *sb) return PTR_ERR(data); } - sbi = EROFS_SB(sb); dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET); - ret = -EINVAL; if (le32_to_cpu(dsb->magic) != EROFS_SUPER_MAGIC_V1) { erofs_err(sb, "cannot find valid erofs superblock"); @@ -318,8 +300,12 @@ static int erofs_read_superblock(struct super_block *sb) } ret = -EINVAL; - if (!check_layout_compatibility(sb, dsb)) + sbi->feature_incompat = le32_to_cpu(dsb->feature_incompat); + if (sbi->feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT) { + erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel", + sbi->feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT); goto out; + } sbi->sb_size = 128 + dsb->sb_extslots * EROFS_SB_EXTSLOT_SIZE; if (sbi->sb_size > PAGE_SIZE - EROFS_SUPER_OFFSET) { From 7ce7c2283fa6843ab3c2adfeb83dcc504a107858 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 14 Aug 2024 12:06:19 +0200 Subject: [PATCH 464/991] Input: i8042 - add Fujitsu Lifebook E756 to i8042 quirk table Yet another quirk entry for Fujitsu laptop. Lifebook E756 requires i8041.nomux for keeping the touchpad working after suspend/resume. Link: https://bugzilla.suse.com/show_bug.cgi?id=1229056 Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20240814100630.2048-1-tiwai@suse.de Signed-off-by: Dmitry Torokhov --- drivers/input/serio/i8042-acpipnpio.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/input/serio/i8042-acpipnpio.h b/drivers/input/serio/i8042-acpipnpio.h index 5b50475ec41402..78e5c9c60b8b3a 100644 --- a/drivers/input/serio/i8042-acpipnpio.h +++ b/drivers/input/serio/i8042-acpipnpio.h @@ -626,6 +626,15 @@ static const struct dmi_system_id i8042_dmi_quirk_table[] __initconst = { }, .driver_data = (void *)(SERIO_QUIRK_NOMUX) }, + { + /* Fujitsu Lifebook E756 */ + /* https://bugzilla.suse.com/show_bug.cgi?id=1229056 */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU"), + DMI_MATCH(DMI_PRODUCT_NAME, "LIFEBOOK E756"), + }, + .driver_data = (void *)(SERIO_QUIRK_NOMUX) + }, { /* Fujitsu Lifebook E5411 */ .matches = { From 5d41eeb6725e3e24853629e5d7635e4bc45d736e Mon Sep 17 00:00:00 2001 From: Suraj Kandpal Date: Fri, 9 Aug 2024 17:11:28 +0530 Subject: [PATCH 465/991] drm/i915/hdcp: Use correct cp_irq_count We are checking cp_irq_count from the wrong hdcp structure which ends up giving timed out errors. We only increment the cp_irq_count of the primary connector's hdcp structure but here in case of multidisplay setup we end up checking the secondary connector's hdcp structure, which will not have its cp_irq_count incremented. This leads to a timed out at CP_IRQ error even though a CP_IRQ was raised. Extract it from the correct intel_hdcp structure. --v2 -Explain why it was the wrong hdcp structure [Jani] Fixes: 8c9e4f68b861 ("drm/i915/hdcp: Use per-device debugs") Signed-off-by: Suraj Kandpal Reviewed-by: Ankit Nautiyal Link: https://patchwork.freedesktop.org/patch/msgid/20240809114127.3940699-2-suraj.kandpal@intel.com (cherry picked from commit dd925902634def895690426bf10e0a8b3e56f56d) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/display/intel_dp_hdcp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp_hdcp.c b/drivers/gpu/drm/i915/display/intel_dp_hdcp.c index 2edffe62f360c3..b0101d72b9c1ae 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_hdcp.c +++ b/drivers/gpu/drm/i915/display/intel_dp_hdcp.c @@ -39,7 +39,9 @@ static u32 transcoder_to_stream_enc_status(enum transcoder cpu_transcoder) static void intel_dp_hdcp_wait_for_cp_irq(struct intel_connector *connector, int timeout) { - struct intel_hdcp *hdcp = &connector->hdcp; + struct intel_digital_port *dig_port = intel_attached_dig_port(connector); + struct intel_dp *dp = &dig_port->dp; + struct intel_hdcp *hdcp = &dp->attached_connector->hdcp; long ret; #define C (hdcp->cp_irq_count_cached != atomic_read(&hdcp->cp_irq_count)) From 3d765ae2daccc570b3f4fbcb57eb321b12cdded2 Mon Sep 17 00:00:00 2001 From: Werner Sembach Date: Thu, 4 Jan 2024 19:31:17 +0100 Subject: [PATCH 466/991] Input: i8042 - add forcenorestore quirk to leave controller untouched even on s3 On s3 resume the i8042 driver tries to restore the controller to a known state by reinitializing things, however this can confuse the controller with different effects. Mostly occasionally unresponsive keyboards after resume. These issues do not rise on s0ix resume as here the controller is assumed to preserved its state from before suspend. This patch adds a quirk for devices where the reinitialization on s3 resume is not needed and might be harmful as described above. It does this by using the s0ix resume code path at selected locations. This new quirk goes beyond what the preexisting reset=never quirk does, which only skips some reinitialization steps. Signed-off-by: Werner Sembach Cc: stable@vger.kernel.org Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20240104183118.779778-2-wse@tuxedocomputers.com Signed-off-by: Dmitry Torokhov --- drivers/input/serio/i8042-acpipnpio.h | 10 +++++++--- drivers/input/serio/i8042.c | 10 +++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/input/serio/i8042-acpipnpio.h b/drivers/input/serio/i8042-acpipnpio.h index 78e5c9c60b8b3a..00e80430108872 100644 --- a/drivers/input/serio/i8042-acpipnpio.h +++ b/drivers/input/serio/i8042-acpipnpio.h @@ -83,6 +83,7 @@ static inline void i8042_write_command(int val) #define SERIO_QUIRK_KBDRESET BIT(12) #define SERIO_QUIRK_DRITEK BIT(13) #define SERIO_QUIRK_NOPNP BIT(14) +#define SERIO_QUIRK_FORCENORESTORE BIT(15) /* Quirk table for different mainboards. Options similar or identical to i8042 * module parameters. @@ -1694,6 +1695,8 @@ static void __init i8042_check_quirks(void) if (quirks & SERIO_QUIRK_NOPNP) i8042_nopnp = true; #endif + if (quirks & SERIO_QUIRK_FORCENORESTORE) + i8042_forcenorestore = true; } #else static inline void i8042_check_quirks(void) {} @@ -1727,7 +1730,7 @@ static int __init i8042_platform_init(void) i8042_check_quirks(); - pr_debug("Active quirks (empty means none):%s%s%s%s%s%s%s%s%s%s%s%s%s\n", + pr_debug("Active quirks (empty means none):%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", i8042_nokbd ? " nokbd" : "", i8042_noaux ? " noaux" : "", i8042_nomux ? " nomux" : "", @@ -1747,10 +1750,11 @@ static int __init i8042_platform_init(void) "", #endif #ifdef CONFIG_PNP - i8042_nopnp ? " nopnp" : ""); + i8042_nopnp ? " nopnp" : "", #else - ""); + "", #endif + i8042_forcenorestore ? " forcenorestore" : ""); retval = i8042_pnp_init(); if (retval) diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c index e0fb1db653b735..8ec4872b447145 100644 --- a/drivers/input/serio/i8042.c +++ b/drivers/input/serio/i8042.c @@ -115,6 +115,10 @@ module_param_named(nopnp, i8042_nopnp, bool, 0); MODULE_PARM_DESC(nopnp, "Do not use PNP to detect controller settings"); #endif +static bool i8042_forcenorestore; +module_param_named(forcenorestore, i8042_forcenorestore, bool, 0); +MODULE_PARM_DESC(forcenorestore, "Force no restore on s3 resume, copying s2idle behaviour"); + #define DEBUG #ifdef DEBUG static bool i8042_debug; @@ -1232,7 +1236,7 @@ static int i8042_pm_suspend(struct device *dev) { int i; - if (pm_suspend_via_firmware()) + if (!i8042_forcenorestore && pm_suspend_via_firmware()) i8042_controller_reset(true); /* Set up serio interrupts for system wakeup. */ @@ -1248,7 +1252,7 @@ static int i8042_pm_suspend(struct device *dev) static int i8042_pm_resume_noirq(struct device *dev) { - if (!pm_resume_via_firmware()) + if (i8042_forcenorestore || !pm_resume_via_firmware()) i8042_interrupt(0, NULL); return 0; @@ -1271,7 +1275,7 @@ static int i8042_pm_resume(struct device *dev) * not restore the controller state to whatever it had been at boot * time, so we do not need to do anything. */ - if (!pm_suspend_via_firmware()) + if (i8042_forcenorestore || !pm_suspend_via_firmware()) return 0; /* From aaa4ca873d3da768896ffc909795359a01e853ef Mon Sep 17 00:00:00 2001 From: Werner Sembach Date: Thu, 4 Jan 2024 19:31:18 +0100 Subject: [PATCH 467/991] Input: i8042 - use new forcenorestore quirk to replace old buggy quirk combination The old quirk combination sometimes cause a laggy keyboard after boot. With the new quirk the initial issue of an unresponsive keyboard after s3 resume is also fixed, but it doesn't have the negative side effect of the sometimes laggy keyboard. Signed-off-by: Werner Sembach Cc: stable@vger.kernel.org Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20240104183118.779778-3-wse@tuxedocomputers.com Signed-off-by: Dmitry Torokhov --- drivers/input/serio/i8042-acpipnpio.h | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/drivers/input/serio/i8042-acpipnpio.h b/drivers/input/serio/i8042-acpipnpio.h index 00e80430108872..bad238f69a7afd 100644 --- a/drivers/input/serio/i8042-acpipnpio.h +++ b/drivers/input/serio/i8042-acpipnpio.h @@ -1159,18 +1159,10 @@ static const struct dmi_system_id i8042_dmi_quirk_table[] __initconst = { SERIO_QUIRK_NOLOOP | SERIO_QUIRK_NOPNP) }, { - /* - * Setting SERIO_QUIRK_NOMUX or SERIO_QUIRK_RESET_ALWAYS makes - * the keyboard very laggy for ~5 seconds after boot and - * sometimes also after resume. - * However both are required for the keyboard to not fail - * completely sometimes after boot or resume. - */ .matches = { DMI_MATCH(DMI_BOARD_NAME, "N150CU"), }, - .driver_data = (void *)(SERIO_QUIRK_NOMUX | SERIO_QUIRK_RESET_ALWAYS | - SERIO_QUIRK_NOLOOP | SERIO_QUIRK_NOPNP) + .driver_data = (void *)(SERIO_QUIRK_FORCENORESTORE) }, { .matches = { From a9aaf1ff88a8cb99a1335c9eb76de637f0cf8c10 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 13 Aug 2024 21:07:50 +0200 Subject: [PATCH 468/991] power: sequencing: request the WLAN enable GPIO as-is If the WCN module is powered up before linux boots and the ath11k driver probes at the same time as the power sequencing driver, we may end up driving the wlan-enable GPIO low in the latter, breaking the start-up of the WLAN module. Request the wlan-enable GPIO as-is so that if the WLAN module is already starting/started, we leave it alone. Fixes: 2f1630f437df ("power: pwrseq: add a driver for the PMU module on the QCom WCN chipsets") Reported-by: Stephan Gerhold Link: https://lore.kernel.org/r/20240813190751.155035-1-brgl@bgdev.pl Signed-off-by: Bartosz Golaszewski --- drivers/power/sequencing/pwrseq-qcom-wcn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/power/sequencing/pwrseq-qcom-wcn.c b/drivers/power/sequencing/pwrseq-qcom-wcn.c index 42dacfda745e4a..d786cbf1b2cd64 100644 --- a/drivers/power/sequencing/pwrseq-qcom-wcn.c +++ b/drivers/power/sequencing/pwrseq-qcom-wcn.c @@ -283,7 +283,7 @@ static int pwrseq_qcom_wcn_probe(struct platform_device *pdev) "Failed to get the Bluetooth enable GPIO\n"); ctx->wlan_gpio = devm_gpiod_get_optional(dev, "wlan-enable", - GPIOD_OUT_LOW); + GPIOD_ASIS); if (IS_ERR(ctx->wlan_gpio)) return dev_err_probe(dev, PTR_ERR(ctx->wlan_gpio), "Failed to get the WLAN enable GPIO\n"); From e080a26725fb36f535f22ea42694c60ab005fb2e Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 19 Aug 2024 10:52:07 +0800 Subject: [PATCH 469/991] erofs: allow large folios for compressed files As commit 2e6506e1c4ee ("mm/migrate: fix deadlock in migrate_pages_batch() on large folios") has landed upstream, large folios can be safely enabled for compressed inodes since all prerequisites have already landed in 6.11-rc1. Stress tests has been running on my fleet for over 20 days without any regression. Additionally, users [1] have requested it for months. Let's allow large folios for EROFS full cases upstream now for wider testing. [1] https://lore.kernel.org/r/CAGsJ_4wtE8OcpinuqVwG4jtdx6Qh5f+TON6wz+4HMCq=A2qFcA@mail.gmail.com Cc: Barry Song <21cnbao@gmail.com> Cc: Matthew Wilcox (Oracle) [ Gao Xiang: minor commit typo fixes. ] Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20240819025207.3808649-1-hsiangkao@linux.alibaba.com --- Documentation/filesystems/erofs.rst | 2 +- fs/erofs/inode.c | 18 ++++++++---------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/Documentation/filesystems/erofs.rst b/Documentation/filesystems/erofs.rst index cc4626d6ee4f83..c293f8e37468cd 100644 --- a/Documentation/filesystems/erofs.rst +++ b/Documentation/filesystems/erofs.rst @@ -75,7 +75,7 @@ Here are the main features of EROFS: - Support merging tail-end data into a special inode as fragments. - - Support large folios for uncompressed files. + - Support large folios to make use of THPs (Transparent Hugepages); - Support direct I/O on uncompressed files to avoid double caching for loop devices; diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 43c09aae2afcd1..419432be3223b7 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -257,25 +257,23 @@ static int erofs_fill_inode(struct inode *inode) goto out_unlock; } + mapping_set_large_folios(inode->i_mapping); if (erofs_inode_is_data_compressed(vi->datalayout)) { #ifdef CONFIG_EROFS_FS_ZIP DO_ONCE_LITE_IF(inode->i_blkbits != PAGE_SHIFT, erofs_info, inode->i_sb, "EXPERIMENTAL EROFS subpage compressed block support in use. Use at your own risk!"); inode->i_mapping->a_ops = &z_erofs_aops; - err = 0; - goto out_unlock; -#endif +#else err = -EOPNOTSUPP; - goto out_unlock; - } - inode->i_mapping->a_ops = &erofs_raw_access_aops; - mapping_set_large_folios(inode->i_mapping); +#endif + } else { + inode->i_mapping->a_ops = &erofs_raw_access_aops; #ifdef CONFIG_EROFS_FS_ONDEMAND - if (erofs_is_fscache_mode(inode->i_sb)) - inode->i_mapping->a_ops = &erofs_fscache_access_aops; + if (erofs_is_fscache_mode(inode->i_sb)) + inode->i_mapping->a_ops = &erofs_fscache_access_aops; #endif - + } out_unlock: erofs_put_metabuf(&buf); return err; From 56314c0d78d6f5a60c8804c517167991a879e14a Mon Sep 17 00:00:00 2001 From: John Sweeney Date: Sun, 18 Aug 2024 11:30:15 -0400 Subject: [PATCH 470/991] ALSA: hda/realtek: Enable mute/micmute LEDs on HP Laptop 14-ey0xxx HP Pavilion Plus 14-ey0xxx needs existing quirk ALC245_FIXUP_HP_X360_MUTE_LEDS to enable its mute/micmute LEDs. Signed-off-by: John Sweeney Cc: Link: https://patch.msgid.link/E1sfhrD-0007TA-HC@rmmprod05.runbox Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index d022a25635f9bf..4eafbcb40120c2 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10315,6 +10315,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8c15, "HP Spectre x360 2-in-1 Laptop 14-eu0xxx", ALC245_FIXUP_HP_SPECTRE_X360_EU0XXX), SND_PCI_QUIRK(0x103c, 0x8c16, "HP Spectre 16", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x8c17, "HP Spectre 16", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x103c, 0x8c21, "HP Pavilion Plus Laptop 14-ey0XXX", ALC245_FIXUP_HP_X360_MUTE_LEDS), SND_PCI_QUIRK(0x103c, 0x8c46, "HP EliteBook 830 G11", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8c47, "HP EliteBook 840 G11", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8c48, "HP EliteBook 860 G11", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), From 32108c22ac619c32dd6db594319e259b63bfb387 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 19 Aug 2024 10:41:53 +0200 Subject: [PATCH 471/991] ALSA: seq: Skip event type filtering for UMP events UMP events don't use the event type field, hence it's invalid to apply the filter, which may drop the events unexpectedly. Skip the event filtering for UMP events, instead. Fixes: 46397622a3fa ("ALSA: seq: Add UMP support") Cc: Link: https://patch.msgid.link/20240819084156.10286-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/seq/seq_clientmgr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/core/seq/seq_clientmgr.c b/sound/core/seq/seq_clientmgr.c index 8c4ee5066afe5b..6be548baa6df28 100644 --- a/sound/core/seq/seq_clientmgr.c +++ b/sound/core/seq/seq_clientmgr.c @@ -537,6 +537,9 @@ static struct snd_seq_client *get_event_dest_client(struct snd_seq_event *event, return NULL; if (! dest->accept_input) goto __not_avail; + if (snd_seq_ev_is_ump(event)) + return dest; /* ok - no filter checks */ + if ((dest->filter & SNDRV_SEQ_FILTER_USE_EVENT) && ! test_bit(event->type, dest->event_filter)) goto __not_avail; From 7167395a4be7930ecac6a33b4e54d7e3dd9ee209 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 15 Aug 2024 15:59:50 +0800 Subject: [PATCH 472/991] selftests: udpgro: report error when receive failed Currently, we only check the latest senders's exit code. If the receiver report failed, it is not recoreded. Fix it by checking the exit code of all the involved processes. Before: bad GRO lookup ok multiple GRO socks ./udpgso_bench_rx: recv: bad packet len, got 1452, expected 14520 ./udpgso_bench_rx: recv: bad packet len, got 1452, expected 14520 failed $ echo $? 0 After: bad GRO lookup ok multiple GRO socks ./udpgso_bench_rx: recv: bad packet len, got 1452, expected 14520 ./udpgso_bench_rx: recv: bad packet len, got 1452, expected 14520 failed $ echo $? 1 Fixes: 3327a9c46352 ("selftests: add functionals test for UDP GRO") Suggested-by: Paolo Abeni Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- tools/testing/selftests/net/udpgro.sh | 44 ++++++++++++++++----------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/net/udpgro.sh b/tools/testing/selftests/net/udpgro.sh index 11a1ebda564fd5..4659cf01e4384c 100755 --- a/tools/testing/selftests/net/udpgro.sh +++ b/tools/testing/selftests/net/udpgro.sh @@ -46,17 +46,19 @@ run_one() { local -r all="$@" local -r tx_args=${all%rx*} local -r rx_args=${all#*rx} + local ret=0 cfg_veth - ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${rx_args} && \ - echo "ok" || \ - echo "failed" & + ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${rx_args} & + local PID1=$! wait_local_port_listen ${PEER_NS} 8000 udp ./udpgso_bench_tx ${tx_args} - ret=$? - wait $(jobs -p) + check_err $? + wait ${PID1} + check_err $? + [ "$ret" -eq 0 ] && echo "ok" || echo "failed" return $ret } @@ -73,6 +75,7 @@ run_one_nat() { local -r all="$@" local -r tx_args=${all%rx*} local -r rx_args=${all#*rx} + local ret=0 if [[ ${tx_args} = *-4* ]]; then ipt_cmd=iptables @@ -93,16 +96,17 @@ run_one_nat() { # ... so that GRO will match the UDP_GRO enabled socket, but packets # will land on the 'plain' one ip netns exec "${PEER_NS}" ./udpgso_bench_rx -G ${family} -b ${addr1} -n 0 & - pid=$! - ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${family} -b ${addr2%/*} ${rx_args} && \ - echo "ok" || \ - echo "failed"& + local PID1=$! + ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${family} -b ${addr2%/*} ${rx_args} & + local PID2=$! wait_local_port_listen "${PEER_NS}" 8000 udp ./udpgso_bench_tx ${tx_args} - ret=$? - kill -INT $pid - wait $(jobs -p) + check_err $? + kill -INT ${PID1} + wait ${PID2} + check_err $? + [ "$ret" -eq 0 ] && echo "ok" || echo "failed" return $ret } @@ -111,20 +115,26 @@ run_one_2sock() { local -r all="$@" local -r tx_args=${all%rx*} local -r rx_args=${all#*rx} + local ret=0 cfg_veth ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${rx_args} -p 12345 & - ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 2000 -R 10 ${rx_args} && \ - echo "ok" || \ - echo "failed" & + local PID1=$! + ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 2000 -R 10 ${rx_args} & + local PID2=$! wait_local_port_listen "${PEER_NS}" 12345 udp ./udpgso_bench_tx ${tx_args} -p 12345 + check_err $? wait_local_port_listen "${PEER_NS}" 8000 udp ./udpgso_bench_tx ${tx_args} - ret=$? - wait $(jobs -p) + check_err $? + wait ${PID1} + check_err $? + wait ${PID2} + check_err $? + [ "$ret" -eq 0 ] && echo "ok" || echo "failed" return $ret } From d7818402b1d80347c764001583f6d63fa68c2e1a Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 15 Aug 2024 15:59:51 +0800 Subject: [PATCH 473/991] selftests: udpgro: no need to load xdp for gro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After commit d7db7775ea2e ("net: veth: do not manipulate GRO when using XDP"), there is no need to load XDP program to enable GRO. On the other hand, the current test is failed due to loading the XDP program. e.g. # selftests: net: udpgro.sh # ipv4 # no GRO ok # no GRO chk cmsg ok # GRO ./udpgso_bench_rx: recv: bad packet len, got 1472, expected 14720 # # failed [...] # bad GRO lookup ok # multiple GRO socks ./udpgso_bench_rx: recv: bad packet len, got 1452, expected 14520 # # ./udpgso_bench_rx: recv: bad packet len, got 1452, expected 14520 # # failed ok 1 selftests: net: udpgro.sh After fix, all the test passed. # ./udpgro.sh ipv4 no GRO ok [...] multiple GRO socks ok Fixes: d7db7775ea2e ("net: veth: do not manipulate GRO when using XDP") Reported-by: Yi Chen Closes: https://issues.redhat.com/browse/RHEL-53858 Reviewed-by: Toke Høiland-Jørgensen Acked-by: Paolo Abeni Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- tools/testing/selftests/net/udpgro.sh | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/tools/testing/selftests/net/udpgro.sh b/tools/testing/selftests/net/udpgro.sh index 4659cf01e4384c..d5ffd8c9172e1d 100755 --- a/tools/testing/selftests/net/udpgro.sh +++ b/tools/testing/selftests/net/udpgro.sh @@ -7,8 +7,6 @@ source net_helper.sh readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)" -BPF_FILE="xdp_dummy.bpf.o" - # set global exit status, but never reset nonzero one. check_err() { @@ -38,7 +36,7 @@ cfg_veth() { ip -netns "${PEER_NS}" addr add dev veth1 192.168.1.1/24 ip -netns "${PEER_NS}" addr add dev veth1 2001:db8::1/64 nodad ip -netns "${PEER_NS}" link set dev veth1 up - ip -n "${PEER_NS}" link set veth1 xdp object ${BPF_FILE} section xdp + ip netns exec "${PEER_NS}" ethtool -K veth1 gro on } run_one() { @@ -206,11 +204,6 @@ run_all() { return $ret } -if [ ! -f ${BPF_FILE} ]; then - echo "Missing ${BPF_FILE}. Run 'make' first" - exit -1 -fi - if [[ $# -eq 0 ]]; then run_all elif [[ $1 == "__subprocess" ]]; then From cf1e515c9a40caa8bddb920970d3257bb01c1421 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Mon, 19 Aug 2024 20:00:07 +0800 Subject: [PATCH 474/991] iommufd/selftest: Make dirty_ops static The sparse tool complains as follows: drivers/iommu/iommufd/selftest.c:277:30: warning: symbol 'dirty_ops' was not declared. Should it be static? This symbol is not used outside of selftest.c, so marks it static. Fixes: 266ce58989ba ("iommufd/selftest: Test IOMMU_HWPT_ALLOC_DIRTY_TRACKING") Link: https://patch.msgid.link/r/20240819120007.3884868-1-ruanjinjie@huawei.com Signed-off-by: Jinjie Ruan Reviewed-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/selftest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index f95e32e2913330..222cfc11ebfd00 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -273,7 +273,7 @@ static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain, return 0; } -const struct iommu_dirty_ops dirty_ops = { +static const struct iommu_dirty_ops dirty_ops = { .set_dirty_tracking = mock_domain_set_dirty_tracking, .read_and_clear_dirty = mock_domain_read_and_clear_dirty, }; From cd8e468efb4fb2742e06328a75b282c35c1abf8d Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 14 Aug 2024 21:01:57 +0200 Subject: [PATCH 475/991] ACPI: video: Add Dell UART backlight controller detection Dell All In One (AIO) models released after 2017 use a backlight controller board connected to an UART. In DSDT this uart port will be defined as: Name (_HID, "DELL0501") Name (_CID, EisaId ("PNP0501") Commit 484bae9e4d6a ("platform/x86: Add new Dell UART backlight driver") has added support for this, but I neglected to tie this into acpi_video_get_backlight_type(). Now the first AIO has turned up which has not only the DSDT bits for this, but also an actual controller attached to the UART, yet it is not using this controller for backlight control. Add support to acpi_video_get_backlight_type() for a new dell_uart backlight type. So that the existing infra to override the backlight control method on the commandline or with DMI quirks can be used. Fixes: 484bae9e4d6a ("platform/x86: Add new Dell UART backlight driver") Cc: All applicable Signed-off-by: Hans de Goede Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/20240814190159.15650-2-hdegoede@redhat.com Signed-off-by: Rafael J. Wysocki --- drivers/acpi/video_detect.c | 7 +++++++ include/acpi/video.h | 1 + 2 files changed, 8 insertions(+) diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c index c11cbe5b6eaa6c..e509dcbf309064 100644 --- a/drivers/acpi/video_detect.c +++ b/drivers/acpi/video_detect.c @@ -54,6 +54,8 @@ static void acpi_video_parse_cmdline(void) acpi_backlight_cmdline = acpi_backlight_nvidia_wmi_ec; if (!strcmp("apple_gmux", acpi_video_backlight_string)) acpi_backlight_cmdline = acpi_backlight_apple_gmux; + if (!strcmp("dell_uart", acpi_video_backlight_string)) + acpi_backlight_cmdline = acpi_backlight_dell_uart; if (!strcmp("none", acpi_video_backlight_string)) acpi_backlight_cmdline = acpi_backlight_none; } @@ -918,6 +920,7 @@ enum acpi_backlight_type __acpi_video_get_backlight_type(bool native, bool *auto static DEFINE_MUTEX(init_mutex); static bool nvidia_wmi_ec_present; static bool apple_gmux_present; + static bool dell_uart_present; static bool native_available; static bool init_done; static long video_caps; @@ -932,6 +935,7 @@ enum acpi_backlight_type __acpi_video_get_backlight_type(bool native, bool *auto &video_caps, NULL); nvidia_wmi_ec_present = nvidia_wmi_ec_supported(); apple_gmux_present = apple_gmux_detect(NULL, NULL); + dell_uart_present = acpi_dev_present("DELL0501", NULL, -1); init_done = true; } if (native) @@ -962,6 +966,9 @@ enum acpi_backlight_type __acpi_video_get_backlight_type(bool native, bool *auto if (apple_gmux_present) return acpi_backlight_apple_gmux; + if (dell_uart_present) + return acpi_backlight_dell_uart; + /* Use ACPI video if available, except when native should be preferred. */ if ((video_caps & ACPI_VIDEO_BACKLIGHT) && !(native_available && prefer_native_over_acpi_video())) diff --git a/include/acpi/video.h b/include/acpi/video.h index 3d538d4178abb1..044c463138df81 100644 --- a/include/acpi/video.h +++ b/include/acpi/video.h @@ -50,6 +50,7 @@ enum acpi_backlight_type { acpi_backlight_native, acpi_backlight_nvidia_wmi_ec, acpi_backlight_apple_gmux, + acpi_backlight_dell_uart, }; #if IS_ENABLED(CONFIG_ACPI_VIDEO) From b5f0943001339c4d324a1af10470ce0bdd79f966 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 14 Aug 2024 21:01:58 +0200 Subject: [PATCH 476/991] platform/x86: dell-uart-backlight: Use acpi_video_get_backlight_type() The dell-uart-backlight driver supports backlight control on Dell All In One (AIO) models using a backlight controller board connected to an UART. In DSDT this uart port will be defined as: Name (_HID, "DELL0501") Name (_CID, EisaId ("PNP0501") Now the first AIO has turned up which has not only the DSDT bits for this, but also an actual controller attached to the UART, yet it is not using this controller for backlight control. Use the acpi_video_get_backlight_type() function from the ACPI video-detect code to check if the dell-uart-backlight driver should actually be used. This allows reusing the existing ACPI video-detect infra to override the backlight control method on the commandline or with DMI quirks. Fixes: 484bae9e4d6a ("platform/x86: Add new Dell UART backlight driver") Cc: All applicable Signed-off-by: Hans de Goede Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/20240814190159.15650-3-hdegoede@redhat.com Signed-off-by: Rafael J. Wysocki --- drivers/platform/x86/dell/Kconfig | 1 + drivers/platform/x86/dell/dell-uart-backlight.c | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/drivers/platform/x86/dell/Kconfig b/drivers/platform/x86/dell/Kconfig index 85a78ef91182ea..309236cecd5a43 100644 --- a/drivers/platform/x86/dell/Kconfig +++ b/drivers/platform/x86/dell/Kconfig @@ -161,6 +161,7 @@ config DELL_SMO8800 config DELL_UART_BACKLIGHT tristate "Dell AIO UART Backlight driver" depends on ACPI + depends on ACPI_VIDEO depends on BACKLIGHT_CLASS_DEVICE depends on SERIAL_DEV_BUS help diff --git a/drivers/platform/x86/dell/dell-uart-backlight.c b/drivers/platform/x86/dell/dell-uart-backlight.c index 87d2a20b4cb3d8..3995f90add4568 100644 --- a/drivers/platform/x86/dell/dell-uart-backlight.c +++ b/drivers/platform/x86/dell/dell-uart-backlight.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "../serdev_helpers.h" /* The backlight controller must respond within 1 second */ @@ -332,10 +333,17 @@ struct serdev_device_driver dell_uart_bl_serdev_driver = { static int dell_uart_bl_pdev_probe(struct platform_device *pdev) { + enum acpi_backlight_type bl_type; struct serdev_device *serdev; struct device *ctrl_dev; int ret; + bl_type = acpi_video_get_backlight_type(); + if (bl_type != acpi_backlight_dell_uart) { + dev_dbg(&pdev->dev, "Not loading (ACPI backlight type = %d)\n", bl_type); + return -ENODEV; + } + ctrl_dev = get_serdev_controller("DELL0501", NULL, 0, "serial0"); if (IS_ERR(ctrl_dev)) return PTR_ERR(ctrl_dev); From 5c7bb62cb8f53de71d8ab3d619be22740da0b837 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 14 Aug 2024 21:01:59 +0200 Subject: [PATCH 477/991] ACPI: video: Add backlight=native quirk for Dell OptiPlex 7760 AIO Dell All In One (AIO) models released after 2017 may use a backlight controller board connected to an UART. In DSDT this uart port will be defined as: Name (_HID, "DELL0501") Name (_CID, EisaId ("PNP0501") The Dell OptiPlex 7760 AIO has an ACPI device for one if its UARTs with the above _HID + _CID. Loading the dell-uart-backlight driver shows that there actually is a backlight controller board attached to the UART, which reports a firmware version of "G&MX01-V15". But the backlight controller board does not actually control the backlight brightness and the GPU's native backlight control method does work. Add a quirk to use the GPU's native backlight control method on this model. Fixes: 484bae9e4d6a ("platform/x86: Add new Dell UART backlight driver") Closes: https://bugzilla.redhat.com/show_bug.cgi?id=2303936 Cc: All applicable Signed-off-by: Hans de Goede Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/20240814190159.15650-4-hdegoede@redhat.com Signed-off-by: Rafael J. Wysocki --- drivers/acpi/video_detect.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c index e509dcbf309064..674b9db7a1ef8b 100644 --- a/drivers/acpi/video_detect.c +++ b/drivers/acpi/video_detect.c @@ -823,6 +823,21 @@ static const struct dmi_system_id video_detect_dmi_table[] = { }, }, + /* + * Dell AIO (All in Ones) which advertise an UART attached backlight + * controller board in their ACPI tables (and may even have one), but + * which need native backlight control nevertheless. + */ + { + /* https://bugzilla.redhat.com/show_bug.cgi?id=2303936 */ + .callback = video_detect_force_native, + /* Dell OptiPlex 7760 AIO */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 7760 AIO"), + }, + }, + /* * Models which have nvidia-ec-wmi support, but should not use it. * Note this indicates a likely firmware bug on these models and should From f4b2a0ae1a31fd3d1b5ca18ee08319b479cf9b5f Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Wed, 24 Jul 2024 14:53:09 -0700 Subject: [PATCH 478/991] drm/xe: Fix opregion leak Being part o the display, ideally the setup and cleanup would be done by display itself. However this is a bigger refactor that needs to be done on both i915 and xe. For now, just fix the leak: unreferenced object 0xffff8881a0300008 (size 192): comm "modprobe", pid 4354, jiffies 4295647021 hex dump (first 32 bytes): 00 00 87 27 81 88 ff ff 18 80 9b 00 00 c9 ff ff ...'............ 18 81 9b 00 00 c9 ff ff 00 00 00 00 00 00 00 00 ................ backtrace (crc 99260e31): [] kmemleak_alloc+0x4b/0x80 [] kmalloc_trace_noprof+0x312/0x3d0 [] intel_opregion_setup+0x89/0x700 [xe] [] xe_display_init_noirq+0x2f/0x90 [xe] [] xe_device_probe+0x7a3/0xbf0 [xe] [] xe_pci_probe+0x333/0x5b0 [xe] [] local_pci_probe+0x48/0xb0 [] pci_device_probe+0xc8/0x280 [] really_probe+0xf8/0x390 [] __driver_probe_device+0x8a/0x170 [] driver_probe_device+0x23/0xb0 [] __driver_attach+0xc7/0x190 [] bus_for_each_dev+0x7d/0xd0 [] driver_attach+0x1e/0x30 [] bus_add_driver+0x117/0x250 Fixes: 44e694958b95 ("drm/xe/display: Implement display support") Reviewed-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20240724215309.644423-1-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 6f4e43a2f771b737d991142ec4f6d4b7ff31fbb4) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/display/xe_display.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/display/xe_display.c b/drivers/gpu/drm/xe/display/xe_display.c index 8b83dcff72e17a..ca4468c820788c 100644 --- a/drivers/gpu/drm/xe/display/xe_display.c +++ b/drivers/gpu/drm/xe/display/xe_display.c @@ -132,6 +132,7 @@ static void xe_display_fini_noirq(void *arg) return; intel_display_driver_remove_noirq(xe); + intel_opregion_cleanup(xe); } int xe_display_init_noirq(struct xe_device *xe) @@ -157,8 +158,10 @@ int xe_display_init_noirq(struct xe_device *xe) intel_display_device_info_runtime_init(xe); err = intel_display_driver_probe_noirq(xe); - if (err) + if (err) { + intel_opregion_cleanup(xe); return err; + } return devm_add_action_or_reset(xe->drm.dev, xe_display_fini_noirq, xe); } From c621f70539cae731d9749c1900cd00bb70ea5c72 Mon Sep 17 00:00:00 2001 From: Ashutosh Dixit Date: Sun, 4 Aug 2024 23:20:57 -0700 Subject: [PATCH 479/991] drm/xe/observation: Drop empty sysctl table entry An empty sysctl table entry was inadvertently left behind for observation sysctl. The breaks on 6.11 with the following errors: [ 219.654850] sysctl table check failed: dev/xe/(null) procname is null [ 219.654862] sysctl table check failed: dev/xe/(null) No proc_handler Drop the empty entry. Fixes: 63347fe031e3 ("drm/xe/uapi: Rename xe perf layer as xe observation layer") Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/2419 Signed-off-by: Ashutosh Dixit Reviewed-by: Umesh Nerlige Ramappa Link: https://patchwork.freedesktop.org/patch/msgid/20240805062057.3547560-1-ashutosh.dixit@intel.com (cherry picked from commit be1dec570b6f5a29ce9c99334c52bea94c28914b) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_observation.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_observation.c b/drivers/gpu/drm/xe/xe_observation.c index fcb584b42a7d51..a78c92a44ec2d2 100644 --- a/drivers/gpu/drm/xe/xe_observation.c +++ b/drivers/gpu/drm/xe/xe_observation.c @@ -66,7 +66,6 @@ static struct ctl_table observation_ctl_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; /** From 7090d7fc969fcc9985d7e538cfcd8a69a5f9c616 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Wed, 24 Jul 2024 08:28:31 -0700 Subject: [PATCH 480/991] drm/xe: Move VM dma-resv lock from xe_exec_queue_create to __xe_exec_queue_init The critical section which requires the VM dma-resv is the call xe_lrc_create in __xe_exec_queue_init. Move this lock to __xe_exec_queue_init holding it just around xe_lrc_create. Not only is good practice, this also fixes a locking double of the VM dma-resv in the error paths of __xe_exec_queue_init as xe_lrc_put tries to acquire this too resulting in a deadlock. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Cc: Maarten Lankhorst Signed-off-by: Matthew Brost Reviewed-by: Maarten Lankhorst Link: https://patchwork.freedesktop.org/patch/msgid/20240724152831.1848325-1-matthew.brost@intel.com (cherry picked from commit 549dd786b61cd3db903f5d94d07fc5a89ccdbeb9) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_exec_queue.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c index a39384bb9553ff..16f24f4a7062d8 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.c +++ b/drivers/gpu/drm/xe/xe_exec_queue.c @@ -105,22 +105,35 @@ static struct xe_exec_queue *__xe_exec_queue_alloc(struct xe_device *xe, static int __xe_exec_queue_init(struct xe_exec_queue *q) { + struct xe_vm *vm = q->vm; int i, err; + if (vm) { + err = xe_vm_lock(vm, true); + if (err) + return err; + } + for (i = 0; i < q->width; ++i) { q->lrc[i] = xe_lrc_create(q->hwe, q->vm, SZ_16K); if (IS_ERR(q->lrc[i])) { err = PTR_ERR(q->lrc[i]); - goto err_lrc; + goto err_unlock; } } + if (vm) + xe_vm_unlock(vm); + err = q->ops->init(q); if (err) goto err_lrc; return 0; +err_unlock: + if (vm) + xe_vm_unlock(vm); err_lrc: for (i = i - 1; i >= 0; --i) xe_lrc_put(q->lrc[i]); @@ -140,15 +153,7 @@ struct xe_exec_queue *xe_exec_queue_create(struct xe_device *xe, struct xe_vm *v if (IS_ERR(q)) return q; - if (vm) { - err = xe_vm_lock(vm, true); - if (err) - goto err_post_alloc; - } - err = __xe_exec_queue_init(q); - if (vm) - xe_vm_unlock(vm); if (err) goto err_post_alloc; From 15939ca77d4424f736e1e4953b4da2351cc9689d Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Fri, 9 Aug 2024 16:28:30 -0700 Subject: [PATCH 481/991] drm/xe: Fix tile fini sequence Only set tile->mmio.regs to NULL if not the root tile in tile_fini. The root tile mmio regs is setup ealier in MMIO init thus it should be set to NULL in mmio_fini. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Signed-off-by: Matthew Brost Reviewed-by: Lucas De Marchi Reviewed-by: Himal Prasad Ghimiray Link: https://patchwork.freedesktop.org/patch/msgid/20240809232830.3302251-1-matthew.brost@intel.com (cherry picked from commit 3396900aa273903639a1792afa4d23dc09bec291) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_mmio.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_mmio.c b/drivers/gpu/drm/xe/xe_mmio.c index f92faad4b96d3c..83122c77edd9c6 100644 --- a/drivers/gpu/drm/xe/xe_mmio.c +++ b/drivers/gpu/drm/xe/xe_mmio.c @@ -30,7 +30,8 @@ static void tiles_fini(void *arg) int id; for_each_tile(tile, xe, id) - tile->mmio.regs = NULL; + if (tile != xe_device_get_root_tile(xe)) + tile->mmio.regs = NULL; } int xe_mmio_probe_tiles(struct xe_device *xe) @@ -91,9 +92,11 @@ int xe_mmio_probe_tiles(struct xe_device *xe) static void mmio_fini(void *arg) { struct xe_device *xe = arg; + struct xe_tile *root_tile = xe_device_get_root_tile(xe); pci_iounmap(to_pci_dev(xe->drm.dev), xe->mmio.regs); xe->mmio.regs = NULL; + root_tile->mmio.regs = NULL; } int xe_mmio_init(struct xe_device *xe) From 730b72480e29f63fd644f5fa57c9d46109428953 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Wed, 14 Aug 2024 12:01:30 +0100 Subject: [PATCH 482/991] drm/xe: prevent UAF around preempt fence The fence lock is part of the queue, therefore in the current design anything locking the fence should then also hold a ref to the queue to prevent the queue from being freed. However, currently it looks like we signal the fence and then drop the queue ref, but if something is waiting on the fence, the waiter is kicked to wake up at some later point, where upon waking up it first grabs the lock before checking the fence state. But if we have already dropped the queue ref, then the lock might already be freed as part of the queue, leading to uaf. To prevent this, move the fence lock into the fence itself so we don't run into lifetime issues. Alternative might be to have device level lock, or only release the queue in the fence release callback, however that might require pushing to another worker to avoid locking issues. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") References: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/2454 References: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/2342 References: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/2020 Signed-off-by: Matthew Auld Cc: Matthew Brost Cc: # v6.8+ Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20240814110129.825847-2-matthew.auld@intel.com (cherry picked from commit 7116c35aacedc38be6d15bd21b2fc936eed0008b) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_exec_queue.c | 1 - drivers/gpu/drm/xe/xe_exec_queue_types.h | 2 -- drivers/gpu/drm/xe/xe_preempt_fence.c | 3 ++- drivers/gpu/drm/xe/xe_preempt_fence_types.h | 2 ++ 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c index 16f24f4a7062d8..9731dcd0b1bde9 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.c +++ b/drivers/gpu/drm/xe/xe_exec_queue.c @@ -643,7 +643,6 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data, if (xe_vm_in_preempt_fence_mode(vm)) { q->lr.context = dma_fence_context_alloc(1); - spin_lock_init(&q->lr.lock); err = xe_vm_add_compute_exec_queue(vm, q); if (XE_IOCTL_DBG(xe, err)) diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h index a35ce24c97982c..f6ee0ae80fd63d 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h @@ -126,8 +126,6 @@ struct xe_exec_queue { u32 seqno; /** @lr.link: link into VM's list of exec queues */ struct list_head link; - /** @lr.lock: preemption fences lock */ - spinlock_t lock; } lr; /** @ops: submission backend exec queue operations */ diff --git a/drivers/gpu/drm/xe/xe_preempt_fence.c b/drivers/gpu/drm/xe/xe_preempt_fence.c index e8b8ae5c6485e3..c453f45328b1c5 100644 --- a/drivers/gpu/drm/xe/xe_preempt_fence.c +++ b/drivers/gpu/drm/xe/xe_preempt_fence.c @@ -128,8 +128,9 @@ xe_preempt_fence_arm(struct xe_preempt_fence *pfence, struct xe_exec_queue *q, { list_del_init(&pfence->link); pfence->q = xe_exec_queue_get(q); + spin_lock_init(&pfence->lock); dma_fence_init(&pfence->base, &preempt_fence_ops, - &q->lr.lock, context, seqno); + &pfence->lock, context, seqno); return &pfence->base; } diff --git a/drivers/gpu/drm/xe/xe_preempt_fence_types.h b/drivers/gpu/drm/xe/xe_preempt_fence_types.h index b54b5c29b5331e..312c3372a49f90 100644 --- a/drivers/gpu/drm/xe/xe_preempt_fence_types.h +++ b/drivers/gpu/drm/xe/xe_preempt_fence_types.h @@ -25,6 +25,8 @@ struct xe_preempt_fence { struct xe_exec_queue *q; /** @preempt_work: work struct which issues preemption */ struct work_struct preempt_work; + /** @lock: dma-fence fence lock */ + spinlock_t lock; /** @error: preempt fence is in error state */ int error; }; From ddf6492e0e508b7c2b42c8d5a4ac82bd38ef0dd5 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Tue, 6 Aug 2024 12:50:44 +0200 Subject: [PATCH 483/991] drm/xe/display: Make display suspend/resume work on discrete We should unpin before evicting all memory, and repin after GT resume. This way, we preserve the contents of the framebuffers, and won't hang on resume due to migration engine not being restored yet. Signed-off-by: Maarten Lankhorst Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Cc: stable@vger.kernel.org # v6.8+ Reviewed-by: Uma Shankar Link: https://patchwork.freedesktop.org/patch/msgid/20240806105044.596842-3-maarten.lankhorst@linux.intel.com Signed-off-by: Maarten Lankhorst,,, (cherry picked from commit cb8f81c1753187995b7a43e79c12959f14eb32d3) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/display/xe_display.c | 23 +++++++++++++++++++++++ drivers/gpu/drm/xe/xe_pm.c | 11 ++++++----- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/xe/display/xe_display.c b/drivers/gpu/drm/xe/display/xe_display.c index ca4468c820788c..49de4e4f8a75b0 100644 --- a/drivers/gpu/drm/xe/display/xe_display.c +++ b/drivers/gpu/drm/xe/display/xe_display.c @@ -283,6 +283,27 @@ static bool suspend_to_idle(void) return false; } +static void xe_display_flush_cleanup_work(struct xe_device *xe) +{ + struct intel_crtc *crtc; + + for_each_intel_crtc(&xe->drm, crtc) { + struct drm_crtc_commit *commit; + + spin_lock(&crtc->base.commit_lock); + commit = list_first_entry_or_null(&crtc->base.commit_list, + struct drm_crtc_commit, commit_entry); + if (commit) + drm_crtc_commit_get(commit); + spin_unlock(&crtc->base.commit_lock); + + if (commit) { + wait_for_completion(&commit->cleanup_done); + drm_crtc_commit_put(commit); + } + } +} + void xe_display_pm_suspend(struct xe_device *xe, bool runtime) { bool s2idle = suspend_to_idle(); @@ -300,6 +321,8 @@ void xe_display_pm_suspend(struct xe_device *xe, bool runtime) if (!runtime) intel_display_driver_suspend(xe); + xe_display_flush_cleanup_work(xe); + intel_dp_mst_suspend(xe); intel_hpd_cancel_work(xe); diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c index de3b5df65e4816..9a3f618d22dcbf 100644 --- a/drivers/gpu/drm/xe/xe_pm.c +++ b/drivers/gpu/drm/xe/xe_pm.c @@ -91,13 +91,13 @@ int xe_pm_suspend(struct xe_device *xe) for_each_gt(gt, xe, id) xe_gt_suspend_prepare(gt); + xe_display_pm_suspend(xe, false); + /* FIXME: Super racey... */ err = xe_bo_evict_all(xe); if (err) goto err; - xe_display_pm_suspend(xe, false); - for_each_gt(gt, xe, id) { err = xe_gt_suspend(gt); if (err) { @@ -151,11 +151,11 @@ int xe_pm_resume(struct xe_device *xe) xe_irq_resume(xe); - xe_display_pm_resume(xe, false); - for_each_gt(gt, xe, id) xe_gt_resume(gt); + xe_display_pm_resume(xe, false); + err = xe_bo_restore_user(xe); if (err) goto err; @@ -363,10 +363,11 @@ int xe_pm_runtime_suspend(struct xe_device *xe) mutex_unlock(&xe->mem_access.vram_userfault.lock); if (xe->d3cold.allowed) { + xe_display_pm_suspend(xe, true); + err = xe_bo_evict_all(xe); if (err) goto out; - xe_display_pm_suspend(xe, true); } for_each_gt(gt, xe, id) { From ad614a706b1ac83b95b333f44b8f5e70bcb37dc5 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 29 Jul 2024 11:26:34 +0200 Subject: [PATCH 484/991] drm/xe/oa/uapi: Make bit masks unsigned MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When building with gcc-5: In function ‘decode_oa_format.isra.26’, inlined from ‘xe_oa_set_prop_oa_format’ at drivers/gpu/drm/xe/xe_oa.c:1664:6: ././include/linux/compiler_types.h:510:38: error: call to ‘__compiletime_assert_1336’ declared with attribute error: FIELD_GET: mask is not constant [...] ./include/linux/bitfield.h:155:3: note: in expansion of macro ‘__BF_FIELD_CHECK’ __BF_FIELD_CHECK(_mask, _reg, 0U, "FIELD_GET: "); \ ^ drivers/gpu/drm/xe/xe_oa.c:1573:18: note: in expansion of macro ‘FIELD_GET’ u32 bc_report = FIELD_GET(DRM_XE_OA_FORMAT_MASK_BC_REPORT, fmt); ^ Fixes: b6fd51c62119 ("drm/xe/oa/uapi: Define and parse OA stream properties") Signed-off-by: Geert Uytterhoeven Reviewed-by: Lucas De Marchi Link: https://patchwork.freedesktop.org/patch/msgid/20240729092634.2227611-1-geert+renesas@glider.be Signed-off-by: Lucas De Marchi (cherry picked from commit f2881dfdaaa9ec873dbd383ef5512fc31e576cbb) Signed-off-by: Rodrigo Vivi --- include/uapi/drm/xe_drm.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h index 19619d4952a863..db232a25189eba 100644 --- a/include/uapi/drm/xe_drm.h +++ b/include/uapi/drm/xe_drm.h @@ -1590,10 +1590,10 @@ enum drm_xe_oa_property_id { * b. Counter select c. Counter size and d. BC report. Also refer to the * oa_formats array in drivers/gpu/drm/xe/xe_oa.c. */ -#define DRM_XE_OA_FORMAT_MASK_FMT_TYPE (0xff << 0) -#define DRM_XE_OA_FORMAT_MASK_COUNTER_SEL (0xff << 8) -#define DRM_XE_OA_FORMAT_MASK_COUNTER_SIZE (0xff << 16) -#define DRM_XE_OA_FORMAT_MASK_BC_REPORT (0xff << 24) +#define DRM_XE_OA_FORMAT_MASK_FMT_TYPE (0xffu << 0) +#define DRM_XE_OA_FORMAT_MASK_COUNTER_SEL (0xffu << 8) +#define DRM_XE_OA_FORMAT_MASK_COUNTER_SIZE (0xffu << 16) +#define DRM_XE_OA_FORMAT_MASK_BC_REPORT (0xffu << 24) /** * @DRM_XE_OA_PROPERTY_OA_PERIOD_EXPONENT: Requests periodic OA unit From 565d121b69980637f040eb4d84289869cdaabedf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 13 Aug 2024 00:28:25 +0200 Subject: [PATCH 485/991] tcp: prevent concurrent execution of tcp_sk_exit_batch Its possible that two threads call tcp_sk_exit_batch() concurrently, once from the cleanup_net workqueue, once from a task that failed to clone a new netns. In the latter case, error unwinding calls the exit handlers in reverse order for the 'failed' netns. tcp_sk_exit_batch() calls tcp_twsk_purge(). Problem is that since commit b099ce2602d8 ("net: Batch inet_twsk_purge"), this function picks up twsk in any dying netns, not just the one passed in via exit_batch list. This means that the error unwind of setup_net() can "steal" and destroy timewait sockets belonging to the exiting netns. This allows the netns exit worker to proceed to call WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); without the expected 1 -> 0 transition, which then splats. At same time, error unwind path that is also running inet_twsk_purge() will splat as well: WARNING: .. at lib/refcount.c:31 refcount_warn_saturate+0x1ed/0x210 ... refcount_dec include/linux/refcount.h:351 [inline] inet_twsk_kill+0x758/0x9c0 net/ipv4/inet_timewait_sock.c:70 inet_twsk_deschedule_put net/ipv4/inet_timewait_sock.c:221 inet_twsk_purge+0x725/0x890 net/ipv4/inet_timewait_sock.c:304 tcp_sk_exit_batch+0x1c/0x170 net/ipv4/tcp_ipv4.c:3522 ops_exit_list+0x128/0x180 net/core/net_namespace.c:178 setup_net+0x714/0xb40 net/core/net_namespace.c:375 copy_net_ns+0x2f0/0x670 net/core/net_namespace.c:508 create_new_namespaces+0x3ea/0xb10 kernel/nsproxy.c:110 ... because refcount_dec() of tw_refcount unexpectedly dropped to 0. This doesn't seem like an actual bug (no tw sockets got lost and I don't see a use-after-free) but as erroneous trigger of debug check. Add a mutex to force strict ordering: the task that calls tcp_twsk_purge() blocks other task from doing final _dec_and_test before mutex-owner has removed all tw sockets of dying netns. Fixes: e9bd0cca09d1 ("tcp: Don't allocate tcp_death_row outside of struct netns_ipv4.") Reported-by: syzbot+8ea26396ff85d23a8929@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/0000000000003a5292061f5e4e19@google.com/ Link: https://lore.kernel.org/netdev/20240812140104.GA21559@breakpoint.cc/ Signed-off-by: Florian Westphal Reviewed-by: Kuniyuki Iwashima Reviewed-by: Jason Xing Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20240812222857.29837-1-fw@strlen.de Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_ipv4.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fd17f25ff288a4..a4e510846905e6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -97,6 +97,8 @@ static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; +static DEFINE_MUTEX(tcp_exit_batch_mutex); + static u32 tcp_v4_init_seq(const struct sk_buff *skb) { return secure_tcp_seq(ip_hdr(skb)->daddr, @@ -3514,6 +3516,16 @@ static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) { struct net *net; + /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work + * and failed setup_net error unwinding path are serialized. + * + * tcp_twsk_purge() handles twsk in any dead netns, not just those in + * net_exit_list, the thread that dismantles a particular twsk must + * do so without other thread progressing to refcount_dec_and_test() of + * tcp_death_row.tw_refcount. + */ + mutex_lock(&tcp_exit_batch_mutex); + tcp_twsk_purge(net_exit_list); list_for_each_entry(net, net_exit_list, exit_list) { @@ -3521,6 +3533,8 @@ static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); tcp_fastopen_ctx_destroy(net); } + + mutex_unlock(&tcp_exit_batch_mutex); } static struct pernet_operations __net_initdata tcp_sk_ops = { From 64b582ca88ca11400467b282d5fa3b870ded1c11 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 15 Aug 2024 16:32:27 +0000 Subject: [PATCH 486/991] block: Read max write zeroes once for __blkdev_issue_write_zeroes() As reported in [0], we may get a hang when formatting a XFS FS on a RAID0 drive. Commit 73a768d5f955 ("block: factor out a blk_write_zeroes_limit helper") changed __blkdev_issue_write_zeroes() to read the max write zeroes value in the loop. This is not safe as max write zeroes may change in value. Specifically for the case of [0], the value goes to 0, and we get an infinite loop. Lift the limit reading out of the loop. [0] https://lore.kernel.org/linux-xfs/4d31268f-310b-4220-88a2-e191c3932a82@oracle.com/T/#t Fixes: 73a768d5f955 ("block: factor out a blk_write_zeroes_limit helper") Reviewed-by: Christoph Hellwig Signed-off-by: John Garry Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240815163228.216051-2-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- block/blk-lib.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/block/blk-lib.c b/block/blk-lib.c index 9f735efa6c9459..83eb7761c2bfbd 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -111,13 +111,20 @@ static sector_t bio_write_zeroes_limit(struct block_device *bdev) (UINT_MAX >> SECTOR_SHIFT) & ~bs_mask); } +/* + * There is no reliable way for the SCSI subsystem to determine whether a + * device supports a WRITE SAME operation without actually performing a write + * to media. As a result, write_zeroes is enabled by default and will be + * disabled if a zeroing operation subsequently fails. This means that this + * queue limit is likely to change at runtime. + */ static void __blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, - struct bio **biop, unsigned flags) + struct bio **biop, unsigned flags, sector_t limit) { + while (nr_sects) { - unsigned int len = min_t(sector_t, nr_sects, - bio_write_zeroes_limit(bdev)); + unsigned int len = min(nr_sects, limit); struct bio *bio; if ((flags & BLKDEV_ZERO_KILLABLE) && @@ -141,12 +148,14 @@ static void __blkdev_issue_write_zeroes(struct block_device *bdev, static int blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp, unsigned flags) { + sector_t limit = bio_write_zeroes_limit(bdev); struct bio *bio = NULL; struct blk_plug plug; int ret = 0; blk_start_plug(&plug); - __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp, &bio, flags); + __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp, &bio, + flags, limit); if (bio) { if ((flags & BLKDEV_ZERO_KILLABLE) && fatal_signal_pending(current)) { @@ -165,7 +174,7 @@ static int blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector, * on an I/O error, in which case we'll turn any error into * "not supported" here. */ - if (ret && !bdev_write_zeroes_sectors(bdev)) + if (ret && !limit) return -EOPNOTSUPP; return ret; } @@ -265,12 +274,14 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, unsigned flags) { + sector_t limit = bio_write_zeroes_limit(bdev); + if (bdev_read_only(bdev)) return -EPERM; - if (bdev_write_zeroes_sectors(bdev)) { + if (limit) { __blkdev_issue_write_zeroes(bdev, sector, nr_sects, - gfp_mask, biop, flags); + gfp_mask, biop, flags, limit); } else { if (flags & BLKDEV_ZERO_NOFALLBACK) return -EOPNOTSUPP; From 81475beb1b5996505a39cd1d9316ce1e668932a2 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 15 Aug 2024 16:32:28 +0000 Subject: [PATCH 487/991] block: Drop NULL check in bdev_write_zeroes_sectors() Function bdev_get_queue() must not return NULL, so drop the check in bdev_write_zeroes_sectors(). Reviewed-by: Christoph Hellwig Signed-off-by: John Garry Reviewed-by: Martin K. Petersen Reviewed-by: Nitesh Shetty Link: https://lore.kernel.org/r/20240815163228.216051-3-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e85ec73a07d575..b7664d593486a8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1296,12 +1296,7 @@ bdev_max_secure_erase_sectors(struct block_device *bdev) static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev) { - struct request_queue *q = bdev_get_queue(bdev); - - if (q) - return q->limits.max_write_zeroes_sectors; - - return 0; + return bdev_get_queue(bdev)->limits.max_write_zeroes_sectors; } static inline bool bdev_nonrot(struct block_device *bdev) From de48aad2a8e80ba026ca91c383f590f0bf97b3c0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 12 Aug 2024 11:47:57 -0400 Subject: [PATCH 488/991] rpcrdma: Device kref is over-incremented on error from xa_alloc If the device's reference count is too high, the device completion callback never fires. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/ib_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/xprtrdma/ib_client.c b/net/sunrpc/xprtrdma/ib_client.c index a938c19c3490d8..4d1e9fa89573e4 100644 --- a/net/sunrpc/xprtrdma/ib_client.c +++ b/net/sunrpc/xprtrdma/ib_client.c @@ -62,9 +62,9 @@ int rpcrdma_rn_register(struct ib_device *device, if (!rd || test_bit(RPCRDMA_RD_F_REMOVING, &rd->rd_flags)) return -ENETUNREACH; - kref_get(&rd->rd_kref); if (xa_alloc(&rd->rd_xa, &rn->rn_index, rn, xa_limit_32b, GFP_KERNEL) < 0) return -ENOMEM; + kref_get(&rd->rd_kref); rn->rn_done = done; return 0; } From 6b3b023e2d0c130235c0e494f77df2a9a64ab6a2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 12 Aug 2024 11:47:58 -0400 Subject: [PATCH 489/991] rpcrdma: Use XA_FLAGS_ALLOC instead of XA_FLAGS_ALLOC1 Nit: The built-in xa_limit_32b range starts at 0, but XA_FLAGS_ALLOC1 configures the xarray's allocator to start at 1. Adopt the more conventional XA_FLAGS_ALLOC because there's no mechanical reason to skip 0. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/ib_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/xprtrdma/ib_client.c b/net/sunrpc/xprtrdma/ib_client.c index 4d1e9fa89573e4..7913d7bad23dec 100644 --- a/net/sunrpc/xprtrdma/ib_client.c +++ b/net/sunrpc/xprtrdma/ib_client.c @@ -111,7 +111,7 @@ static int rpcrdma_add_one(struct ib_device *device) return -ENOMEM; kref_init(&rd->rd_kref); - xa_init_flags(&rd->rd_xa, XA_FLAGS_ALLOC1); + xa_init_flags(&rd->rd_xa, XA_FLAGS_ALLOC); rd->rd_device = device; init_completion(&rd->rd_done); ib_set_client_data(device, &rpcrdma_ib_client, rd); From dc0112e6d8b42b39f9d283bab489a757e9d284f0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 12 Aug 2024 11:47:59 -0400 Subject: [PATCH 490/991] rpcrdma: Trace connection registration and unregistration These new trace points record xarray indices and the time of endpoint registration and unregistration, to co-ordinate with device removal events. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 36 +++++++++++++++++++++++++++++++++ net/sunrpc/xprtrdma/ib_client.c | 2 ++ 2 files changed, 38 insertions(+) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index ba2d6a0e41ccf7..a96a985c49b3d7 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -2277,6 +2277,42 @@ DEFINE_CLIENT_DEVICE_EVENT(rpcrdma_client_remove_one); DEFINE_CLIENT_DEVICE_EVENT(rpcrdma_client_wait_on); DEFINE_CLIENT_DEVICE_EVENT(rpcrdma_client_remove_one_done); +DECLARE_EVENT_CLASS(rpcrdma_client_register_class, + TP_PROTO( + const struct ib_device *device, + const struct rpcrdma_notification *rn + ), + + TP_ARGS(device, rn), + + TP_STRUCT__entry( + __string(name, device->name) + __field(void *, callback) + __field(u32, index) + ), + + TP_fast_assign( + __assign_str(name); + __entry->callback = rn->rn_done; + __entry->index = rn->rn_index; + ), + + TP_printk("device=%s index=%u done callback=%pS\n", + __get_str(name), __entry->index, __entry->callback + ) +); + +#define DEFINE_CLIENT_REGISTER_EVENT(name) \ + DEFINE_EVENT(rpcrdma_client_register_class, name, \ + TP_PROTO( \ + const struct ib_device *device, \ + const struct rpcrdma_notification *rn \ + ), \ + TP_ARGS(device, rn)) + +DEFINE_CLIENT_REGISTER_EVENT(rpcrdma_client_register); +DEFINE_CLIENT_REGISTER_EVENT(rpcrdma_client_unregister); + #endif /* _TRACE_RPCRDMA_H */ #include diff --git a/net/sunrpc/xprtrdma/ib_client.c b/net/sunrpc/xprtrdma/ib_client.c index 7913d7bad23dec..8507cd4d892170 100644 --- a/net/sunrpc/xprtrdma/ib_client.c +++ b/net/sunrpc/xprtrdma/ib_client.c @@ -66,6 +66,7 @@ int rpcrdma_rn_register(struct ib_device *device, return -ENOMEM; kref_get(&rd->rd_kref); rn->rn_done = done; + trace_rpcrdma_client_register(device, rn); return 0; } @@ -91,6 +92,7 @@ void rpcrdma_rn_unregister(struct ib_device *device, if (!rd) return; + trace_rpcrdma_client_unregister(device, rn); xa_erase(&rd->rd_xa, rn->rn_index); kref_put(&rd->rd_kref, rpcrdma_rn_release); } From 2240a50e6294214de791729e9dcba6880fa7e44e Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Sat, 17 Aug 2024 18:15:41 +0800 Subject: [PATCH 491/991] KVM: arm64: vgic-debug: Don't put unmarked LPIs If there were LPIs being mapped behind our back (i.e., between .start() and .stop()), we would put them at iter_unmark_lpis() without checking if they were actually *marked*, which is obviously not good. Switch to use the xa_for_each_marked() iterator to fix it. Cc: stable@vger.kernel.org Fixes: 85d3ccc8b75b ("KVM: arm64: vgic-debug: Use an xarray mark for debug iterator") Signed-off-by: Zenghui Yu Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240817101541.1664-1-yuzenghui@huawei.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c index bc74d06398ef16..e1397ab2072a57 100644 --- a/arch/arm64/kvm/vgic/vgic-debug.c +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -85,7 +85,7 @@ static void iter_unmark_lpis(struct kvm *kvm) struct vgic_irq *irq; unsigned long intid; - xa_for_each(&dist->lpi_xa, intid, irq) { + xa_for_each_marked(&dist->lpi_xa, intid, irq, LPI_XA_MARK_DEBUG_ITER) { xa_clear_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); vgic_put_irq(kvm, irq); } From f616506754d34bcfdbfbc7508b562e5c98461e9a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 19 Aug 2024 13:50:45 +0100 Subject: [PATCH 492/991] KVM: arm64: vgic: Don't hold config_lock while unregistering redistributors We recently moved the teardown of the vgic part of a vcpu inside a critical section guarded by the config_lock. This teardown phase involves calling into kvm_io_bus_unregister_dev(), which takes the kvm->srcu lock. However, this violates the established order where kvm->srcu is taken on a memory fault (such as an MMIO access), possibly followed by taking the config_lock if the GIC emulation requires mutual exclusion from the other vcpus. It therefore results in a bad lockdep splat, as reported by Zenghui. Fix this by moving the call to kvm_io_bus_unregister_dev() outside of the config_lock critical section. At this stage, there shouln't be any need to hold the config_lock. As an additional bonus, document the ordering between kvm->slots_lock, kvm->srcu and kvm->arch.config_lock so that I cannot pretend I didn't know about those anymore. Fixes: 9eb18136af9f ("KVM: arm64: vgic: Hold config_lock while tearing down a CPU interface") Reported-by: Zenghui Yu Signed-off-by: Marc Zyngier Reviewed-by: Zenghui Yu Tested-by: Zenghui Yu Link: https://lore.kernel.org/r/20240819125045.3474845-1-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-init.c | 9 ++++++--- arch/arm64/kvm/vgic/vgic.c | 5 +++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 41feb858ff9a5e..e7c53e8af3d165 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -417,10 +417,8 @@ static void __kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) kfree(vgic_cpu->private_irqs); vgic_cpu->private_irqs = NULL; - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { - vgic_unregister_redist_iodev(vcpu); + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; - } } void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) @@ -448,6 +446,11 @@ void kvm_vgic_destroy(struct kvm *kvm) kvm_vgic_dist_destroy(kvm); mutex_unlock(&kvm->arch.config_lock); + + if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) + kvm_for_each_vcpu(i, vcpu, kvm) + vgic_unregister_redist_iodev(vcpu); + mutex_unlock(&kvm->slots_lock); } diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 974849ea7101c6..abe29c7d85d05a 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -36,6 +36,11 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = { * we have to disable IRQs before taking this lock and everything lower * than it. * + * The config_lock has additional ordering requirements: + * kvm->slots_lock + * kvm->srcu + * kvm->arch.config_lock + * * If you need to take multiple locks, always take the upper lock first, * then the lower ones, e.g. first take the its_lock, then the irq_lock. * If you are already holding a lock and need to take a higher one, you From 27cb2b7fec2abf310e4128137979124ead920ccb Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Wed, 3 Jul 2024 13:43:38 +0100 Subject: [PATCH 493/991] drm/xe/bmg: implement Wa_16023588340 This involves enabling l2 caching of host side memory access to VRAM through the CPU BAR. The main fallout here is with display since VRAM writes from CPU can now be cached in GPU l2, and display is never coherent with caches, so needs various manual flushing. In the case of fbc we disable it due to complications in getting this to work correctly (in a later patch). Signed-off-by: Matthew Auld Cc: Jonathan Cavitt Cc: Matt Roper Cc: Lucas De Marchi Cc: Vinod Govindapillai Reviewed-by: Jonathan Cavitt Link: https://patchwork.freedesktop.org/patch/msgid/20240703124338.208220-3-matthew.auld@intel.com (cherry picked from commit 01570b446939c3538b1aa3d059837f49fa14a3ae) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/Makefile | 2 + drivers/gpu/drm/xe/display/xe_dsb_buffer.c | 8 ++++ drivers/gpu/drm/xe/display/xe_fb_pin.c | 3 ++ drivers/gpu/drm/xe/regs/xe_gt_regs.h | 8 ++++ drivers/gpu/drm/xe/xe_device.c | 30 ++++++++++++ drivers/gpu/drm/xe/xe_device.h | 1 + drivers/gpu/drm/xe/xe_gt.c | 54 ++++++++++++++++++++++ drivers/gpu/drm/xe/xe_pat.c | 11 ++++- drivers/gpu/drm/xe/xe_wa_oob.rules | 1 + 9 files changed, 117 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile index 628c245c4822eb..e97c9da451b36c 100644 --- a/drivers/gpu/drm/xe/Makefile +++ b/drivers/gpu/drm/xe/Makefile @@ -25,12 +25,14 @@ $(obj)/generated/%_wa_oob.c $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \ uses_generated_oob := \ $(obj)/xe_ggtt.o \ + $(obj)/xe_device.o \ $(obj)/xe_gsc.o \ $(obj)/xe_gt.o \ $(obj)/xe_guc.o \ $(obj)/xe_guc_ads.o \ $(obj)/xe_guc_pc.o \ $(obj)/xe_migrate.o \ + $(obj)/xe_pat.o \ $(obj)/xe_ring_ops.o \ $(obj)/xe_vm.o \ $(obj)/xe_wa.o \ diff --git a/drivers/gpu/drm/xe/display/xe_dsb_buffer.c b/drivers/gpu/drm/xe/display/xe_dsb_buffer.c index 9e860c61f4b33f..ccd0d87d438a3a 100644 --- a/drivers/gpu/drm/xe/display/xe_dsb_buffer.c +++ b/drivers/gpu/drm/xe/display/xe_dsb_buffer.c @@ -7,6 +7,8 @@ #include "intel_display_types.h" #include "intel_dsb_buffer.h" #include "xe_bo.h" +#include "xe_device.h" +#include "xe_device_types.h" #include "xe_gt.h" u32 intel_dsb_buffer_ggtt_offset(struct intel_dsb_buffer *dsb_buf) @@ -16,7 +18,10 @@ u32 intel_dsb_buffer_ggtt_offset(struct intel_dsb_buffer *dsb_buf) void intel_dsb_buffer_write(struct intel_dsb_buffer *dsb_buf, u32 idx, u32 val) { + struct xe_device *xe = dsb_buf->vma->bo->tile->xe; + iosys_map_wr(&dsb_buf->vma->bo->vmap, idx * 4, u32, val); + xe_device_l2_flush(xe); } u32 intel_dsb_buffer_read(struct intel_dsb_buffer *dsb_buf, u32 idx) @@ -26,9 +31,12 @@ u32 intel_dsb_buffer_read(struct intel_dsb_buffer *dsb_buf, u32 idx) void intel_dsb_buffer_memset(struct intel_dsb_buffer *dsb_buf, u32 idx, u32 val, size_t size) { + struct xe_device *xe = dsb_buf->vma->bo->tile->xe; + WARN_ON(idx > (dsb_buf->buf_size - size) / sizeof(*dsb_buf->cmd_buf)); iosys_map_memset(&dsb_buf->vma->bo->vmap, idx * 4, val, size); + xe_device_l2_flush(xe); } bool intel_dsb_buffer_create(struct intel_crtc *crtc, struct intel_dsb_buffer *dsb_buf, size_t size) diff --git a/drivers/gpu/drm/xe/display/xe_fb_pin.c b/drivers/gpu/drm/xe/display/xe_fb_pin.c index 423f367c7065ed..d7db44e79eaf55 100644 --- a/drivers/gpu/drm/xe/display/xe_fb_pin.c +++ b/drivers/gpu/drm/xe/display/xe_fb_pin.c @@ -10,6 +10,7 @@ #include "intel_fb.h" #include "intel_fb_pin.h" #include "xe_bo.h" +#include "xe_device.h" #include "xe_ggtt.h" #include "xe_gt.h" #include "xe_pm.h" @@ -304,6 +305,8 @@ static struct i915_vma *__xe_pin_fb_vma(const struct intel_framebuffer *fb, if (ret) goto err_unpin; + /* Ensure DPT writes are flushed */ + xe_device_l2_flush(xe); return vma; err_unpin: diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h index d44564bad00949..fd9d94174efb12 100644 --- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h @@ -80,6 +80,9 @@ #define LE_CACHEABILITY_MASK REG_GENMASK(1, 0) #define LE_CACHEABILITY(value) REG_FIELD_PREP(LE_CACHEABILITY_MASK, value) +#define XE2_GAMREQSTRM_CTRL XE_REG(0x4194) +#define CG_DIS_CNTLBUS REG_BIT(6) + #define CCS_AUX_INV XE_REG(0x4208) #define VD0_AUX_INV XE_REG(0x4218) @@ -372,6 +375,11 @@ #define XEHPC_L3CLOS_MASK(i) XE_REG_MCR(0xb194 + (i) * 8) +#define XE2_GLOBAL_INVAL XE_REG(0xb404) + +#define SCRATCH1LPFC XE_REG(0xb474) +#define EN_L3_RW_CCS_CACHE_FLUSH REG_BIT(0) + #define XE2LPM_L3SQCREG5 XE_REG_MCR(0xb658) #define XE2_TDF_CTRL XE_REG(0xb418) diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index f2f1d8ddb22138..6ce44ca2524de0 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -54,6 +54,9 @@ #include "xe_vm.h" #include "xe_vram.h" #include "xe_wait_user_fence.h" +#include "xe_wa.h" + +#include static int xe_file_open(struct drm_device *dev, struct drm_file *file) { @@ -820,6 +823,11 @@ void xe_device_td_flush(struct xe_device *xe) if (!IS_DGFX(xe) || GRAPHICS_VER(xe) < 20) return; + if (XE_WA(xe_root_mmio_gt(xe), 16023588340)) { + xe_device_l2_flush(xe); + return; + } + for_each_gt(gt, xe, id) { if (xe_gt_is_media_type(gt)) continue; @@ -843,6 +851,28 @@ void xe_device_td_flush(struct xe_device *xe) } } +void xe_device_l2_flush(struct xe_device *xe) +{ + struct xe_gt *gt; + int err; + + gt = xe_root_mmio_gt(xe); + + if (!XE_WA(gt, 16023588340)) + return; + + err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT); + if (err) + return; + + xe_mmio_write32(gt, XE2_GLOBAL_INVAL, 0x1); + + if (xe_mmio_wait32(gt, XE2_GLOBAL_INVAL, 0x1, 0x0, 150, NULL, true)) + xe_gt_err_once(gt, "Global invalidation timeout\n"); + + xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); +} + u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size) { return xe_device_has_flat_ccs(xe) ? diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h index b3952718b3c1cb..533ccfb2567a2c 100644 --- a/drivers/gpu/drm/xe/xe_device.h +++ b/drivers/gpu/drm/xe/xe_device.h @@ -162,6 +162,7 @@ u64 xe_device_canonicalize_addr(struct xe_device *xe, u64 address); u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address); void xe_device_td_flush(struct xe_device *xe); +void xe_device_l2_flush(struct xe_device *xe); static inline bool xe_device_wedged(struct xe_device *xe) { diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c index 31b2e64c70c6ab..816ecc9e294ce9 100644 --- a/drivers/gpu/drm/xe/xe_gt.c +++ b/drivers/gpu/drm/xe/xe_gt.c @@ -11,6 +11,8 @@ #include #include +#include + #include "instructions/xe_gfxpipe_commands.h" #include "instructions/xe_mi_commands.h" #include "regs/xe_gt_regs.h" @@ -95,6 +97,51 @@ void xe_gt_sanitize(struct xe_gt *gt) gt->uc.guc.submission_state.enabled = false; } +static void xe_gt_enable_host_l2_vram(struct xe_gt *gt) +{ + u32 reg; + int err; + + if (!XE_WA(gt, 16023588340)) + return; + + err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT); + if (WARN_ON(err)) + return; + + if (!xe_gt_is_media_type(gt)) { + xe_mmio_write32(gt, SCRATCH1LPFC, EN_L3_RW_CCS_CACHE_FLUSH); + reg = xe_mmio_read32(gt, XE2_GAMREQSTRM_CTRL); + reg |= CG_DIS_CNTLBUS; + xe_mmio_write32(gt, XE2_GAMREQSTRM_CTRL, reg); + } + + xe_gt_mcr_multicast_write(gt, XEHPC_L3CLOS_MASK(3), 0x3); + xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); +} + +static void xe_gt_disable_host_l2_vram(struct xe_gt *gt) +{ + u32 reg; + int err; + + if (!XE_WA(gt, 16023588340)) + return; + + if (xe_gt_is_media_type(gt)) + return; + + err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT); + if (WARN_ON(err)) + return; + + reg = xe_mmio_read32(gt, XE2_GAMREQSTRM_CTRL); + reg &= ~CG_DIS_CNTLBUS; + xe_mmio_write32(gt, XE2_GAMREQSTRM_CTRL, reg); + + xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); +} + /** * xe_gt_remove() - Clean up the GT structures before driver removal * @gt: the GT object @@ -111,6 +158,8 @@ void xe_gt_remove(struct xe_gt *gt) for (i = 0; i < XE_ENGINE_CLASS_MAX; ++i) xe_hw_fence_irq_finish(>->fence_irq[i]); + + xe_gt_disable_host_l2_vram(gt); } static void gt_reset_worker(struct work_struct *w); @@ -508,6 +557,7 @@ int xe_gt_init_hwconfig(struct xe_gt *gt) xe_gt_mcr_init_early(gt); xe_pat_init(gt); + xe_gt_enable_host_l2_vram(gt); err = xe_uc_init(>->uc); if (err) @@ -643,6 +693,8 @@ static int do_gt_restart(struct xe_gt *gt) xe_pat_init(gt); + xe_gt_enable_host_l2_vram(gt); + xe_gt_mcr_set_implicit_defaults(gt); xe_reg_sr_apply_mmio(>->reg_sr, gt); @@ -796,6 +848,8 @@ int xe_gt_suspend(struct xe_gt *gt) xe_gt_idle_disable_pg(gt); + xe_gt_disable_host_l2_vram(gt); + XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL)); xe_gt_dbg(gt, "suspended\n"); diff --git a/drivers/gpu/drm/xe/xe_pat.c b/drivers/gpu/drm/xe/xe_pat.c index 4ee32ee1cc885f..722278cc23fc5d 100644 --- a/drivers/gpu/drm/xe/xe_pat.c +++ b/drivers/gpu/drm/xe/xe_pat.c @@ -7,6 +7,8 @@ #include +#include + #include "regs/xe_reg_defs.h" #include "xe_assert.h" #include "xe_device.h" @@ -15,6 +17,7 @@ #include "xe_gt_mcr.h" #include "xe_mmio.h" #include "xe_sriov.h" +#include "xe_wa.h" #define _PAT_ATS 0x47fc #define _PAT_INDEX(index) _PICK_EVEN_2RANGES(index, 8, \ @@ -382,7 +385,13 @@ void xe_pat_init_early(struct xe_device *xe) if (GRAPHICS_VER(xe) == 20) { xe->pat.ops = &xe2_pat_ops; xe->pat.table = xe2_pat_table; - xe->pat.n_entries = ARRAY_SIZE(xe2_pat_table); + + /* Wa_16023588340. XXX: Should use XE_WA */ + if (GRAPHICS_VERx100(xe) == 2001) + xe->pat.n_entries = 28; /* Disable CLOS3 */ + else + xe->pat.n_entries = ARRAY_SIZE(xe2_pat_table); + xe->pat.idx[XE_CACHE_NONE] = 3; xe->pat.idx[XE_CACHE_WT] = 15; xe->pat.idx[XE_CACHE_WB] = 2; diff --git a/drivers/gpu/drm/xe/xe_wa_oob.rules b/drivers/gpu/drm/xe/xe_wa_oob.rules index 26066beb4f6f5d..08f7336881e32d 100644 --- a/drivers/gpu/drm/xe/xe_wa_oob.rules +++ b/drivers/gpu/drm/xe/xe_wa_oob.rules @@ -29,3 +29,4 @@ 13011645652 GRAPHICS_VERSION(2004) 22019338487 MEDIA_VERSION(2000) GRAPHICS_VERSION(2001) +16023588340 GRAPHICS_VERSION(2001) From 03a2dc84f5c4ef31ac0112b29d51ff103f7c8dd4 Mon Sep 17 00:00:00 2001 From: Ngai-Mint Kwan Date: Mon, 1 Jul 2024 11:46:37 -0700 Subject: [PATCH 494/991] drm/xe/xe2lpm: Extend Wa_16021639441 Wa_16021639441 applies to Xe2_LPM. Signed-off-by: Ngai-Mint Kwan Reviewed-by: Matt Roper Signed-off-by: Matt Roper Link: https://patchwork.freedesktop.org/patch/msgid/20240701184637.531794-1-ngai-mint.kwan@linux.intel.com (cherry picked from commit 74e3076800067c6dc0dcff5b75344cec064c20eb) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_wa.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c index c7bf0862b23188..6c52d9d02b5fd9 100644 --- a/drivers/gpu/drm/xe/xe_wa.c +++ b/drivers/gpu/drm/xe/xe_wa.c @@ -539,6 +539,16 @@ static const struct xe_rtp_entry_sr engine_was[] = { XE_RTP_ACTIONS(SET(HALF_SLICE_CHICKEN7, CLEAR_OPTIMIZATION_DISABLE)) }, + /* Xe2_LPM */ + + { XE_RTP_NAME("16021639441"), + XE_RTP_RULES(MEDIA_VERSION(2000)), + XE_RTP_ACTIONS(SET(CSFE_CHICKEN1(0), + GHWSP_CSB_REPORT_DIS | + PPHWSP_CSB_AND_TIMESTAMP_REPORT_DIS, + XE_RTP_ACTION_FLAG(ENGINE_BASE))) + }, + /* Xe2_HPM */ { XE_RTP_NAME("16021639441"), From b196e6fcc71186134b4cfe756067d87ae41b1ed9 Mon Sep 17 00:00:00 2001 From: Bommu Krishnaiah Date: Wed, 3 Jul 2024 14:37:54 +0530 Subject: [PATCH 495/991] drm/xe/xe2lpg: Extend workaround 14021402888 workaround 14021402888 also applies to Xe2_LPG. Replicate the existing entry to one specific for Xe2_LPG. Signed-off-by: Bommu Krishnaiah Cc: Tejas Upadhyay Cc: Matt Roper Cc: Himal Prasad Ghimiray Reviewed-by: Matt Roper Link: https://patchwork.freedesktop.org/patch/msgid/20240703090754.1323647-1-krishnaiah.bommu@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 56ab6986992ba143aee0bda33e15a764343e271d) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_wa.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c index 6c52d9d02b5fd9..fd009b2c68fa60 100644 --- a/drivers/gpu/drm/xe/xe_wa.c +++ b/drivers/gpu/drm/xe/xe_wa.c @@ -486,6 +486,10 @@ static const struct xe_rtp_entry_sr engine_was[] = { XE_RTP_RULES(GRAPHICS_VERSION(2004), FUNC(xe_rtp_match_first_render_or_compute)), XE_RTP_ACTIONS(SET(TDL_TSL_CHICKEN, SLM_WMTP_RESTORE)) }, + { XE_RTP_NAME("14021402888"), + XE_RTP_RULES(GRAPHICS_VERSION(2004), ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(HALF_SLICE_CHICKEN7, CLEAR_OPTIMIZATION_DISABLE)) + }, /* Xe2_HPG */ From 7e81285380743aa5759bb29a388f056c3d326a2c Mon Sep 17 00:00:00 2001 From: Tejas Upadhyay Date: Wed, 10 Jul 2024 10:57:50 +0530 Subject: [PATCH 496/991] drm/xe/xe2: Make subsequent L2 flush sequential Issuing the flush on top of an ongoing flush is not desirable. Lets use lock to make it sequential. Reviewed-by: Nirmoy Das Signed-off-by: Tejas Upadhyay Link: https://patchwork.freedesktop.org/patch/msgid/20240710052750.3031586-1-tejas.upadhyay@intel.com Signed-off-by: Nirmoy Das (cherry picked from commit 71733b8d7f50b61403f940c6c9745fb3a9b98dcb) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_device.c | 2 ++ drivers/gpu/drm/xe/xe_gt.c | 1 + drivers/gpu/drm/xe/xe_gt_types.h | 6 ++++++ 3 files changed, 9 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index 6ce44ca2524de0..c89deffffb6d06 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -865,10 +865,12 @@ void xe_device_l2_flush(struct xe_device *xe) if (err) return; + spin_lock(>->global_invl_lock); xe_mmio_write32(gt, XE2_GLOBAL_INVAL, 0x1); if (xe_mmio_wait32(gt, XE2_GLOBAL_INVAL, 0x1, 0x0, 150, NULL, true)) xe_gt_err_once(gt, "Global invalidation timeout\n"); + spin_unlock(>->global_invl_lock); xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); } diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c index 816ecc9e294ce9..b9bcbbe27705ff 100644 --- a/drivers/gpu/drm/xe/xe_gt.c +++ b/drivers/gpu/drm/xe/xe_gt.c @@ -388,6 +388,7 @@ int xe_gt_init_early(struct xe_gt *gt) xe_force_wake_init_gt(gt, gt_to_fw(gt)); xe_pcode_init(gt); + spin_lock_init(>->global_invl_lock); return 0; } diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h index 6b5e0b45efb0c9..38a0d0e178c8f7 100644 --- a/drivers/gpu/drm/xe/xe_gt_types.h +++ b/drivers/gpu/drm/xe/xe_gt_types.h @@ -362,6 +362,12 @@ struct xe_gt { */ spinlock_t mcr_lock; + /** + * @global_invl_lock: protects the register for the duration + * of a global invalidation of l2 cache + */ + spinlock_t global_invl_lock; + /** @wa_active: keep track of active workarounds */ struct { /** @wa_active.gt: bitmap with active GT workarounds */ From cbc6e98ab11bea52789d2835e45e8816c39407e1 Mon Sep 17 00:00:00 2001 From: Tejas Upadhyay Date: Tue, 9 Jul 2024 21:26:06 +0530 Subject: [PATCH 497/991] drm/xe/xe2: Add Wa_15015404425 Wa_15015404425 asks us to perform four "dummy" writes to a non-existent register offset before every real register read. Although the specific offset of the writes doesn't directly matter, the workaround suggests offset 0x130030 as a good target so that these writes will be easy to recognize and filter out in debugging traces. V5(MattR): - Avoid negating an equality comparison V4(MattR): - Use writel and remove xe_reg usage V3(MattR): - Define dummy reg local to function - Avoid tracing dummy writes - Update commit message V2: - Add WA to 8/16/32bit reads also - MattR - Corrected dummy reg address - MattR - Use for loop to avoid mental pause - JaniN Reviewed-by: Matt Roper Signed-off-by: Tejas Upadhyay Signed-off-by: Matt Roper Link: https://patchwork.freedesktop.org/patch/msgid/20240709155606.2998941-1-tejas.upadhyay@intel.com (cherry picked from commit 86c5b70a9c0c3f05f7002ef8b789460c96b54e27) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_mmio.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_mmio.c b/drivers/gpu/drm/xe/xe_mmio.c index 83122c77edd9c6..aa68cac9fdf808 100644 --- a/drivers/gpu/drm/xe/xe_mmio.c +++ b/drivers/gpu/drm/xe/xe_mmio.c @@ -124,12 +124,29 @@ int xe_mmio_init(struct xe_device *xe) return devm_add_action_or_reset(xe->drm.dev, mmio_fini, xe); } +static void mmio_flush_pending_writes(struct xe_gt *gt) +{ +#define DUMMY_REG_OFFSET 0x130030 + struct xe_tile *tile = gt_to_tile(gt); + int i; + + if (tile->xe->info.platform != XE_LUNARLAKE) + return; + + /* 4 dummy writes */ + for (i = 0; i < 4; i++) + writel(0, tile->mmio.regs + DUMMY_REG_OFFSET); +} + u8 xe_mmio_read8(struct xe_gt *gt, struct xe_reg reg) { struct xe_tile *tile = gt_to_tile(gt); u32 addr = xe_mmio_adjusted_addr(gt, reg.addr); u8 val; + /* Wa_15015404425 */ + mmio_flush_pending_writes(gt); + val = readb((reg.ext ? tile->mmio_ext.regs : tile->mmio.regs) + addr); trace_xe_reg_rw(gt, false, addr, val, sizeof(val)); @@ -142,6 +159,9 @@ u16 xe_mmio_read16(struct xe_gt *gt, struct xe_reg reg) u32 addr = xe_mmio_adjusted_addr(gt, reg.addr); u16 val; + /* Wa_15015404425 */ + mmio_flush_pending_writes(gt); + val = readw((reg.ext ? tile->mmio_ext.regs : tile->mmio.regs) + addr); trace_xe_reg_rw(gt, false, addr, val, sizeof(val)); @@ -163,6 +183,9 @@ u32 xe_mmio_read32(struct xe_gt *gt, struct xe_reg reg) u32 addr = xe_mmio_adjusted_addr(gt, reg.addr); u32 val; + /* Wa_15015404425 */ + mmio_flush_pending_writes(gt); + if (!reg.vf && IS_SRIOV_VF(gt_to_xe(gt))) val = xe_gt_sriov_vf_read32(gt, reg); else From f5cb1275c8ce56c7583cb323cfa08a820a7ef6b4 Mon Sep 17 00:00:00 2001 From: Daniele Ceraolo Spurio Date: Wed, 7 Aug 2024 16:53:32 -0700 Subject: [PATCH 498/991] drm/xe: fix WA 14018094691 This WA is applied while initializing the media GT, but it a primary GT WA (because it modifies a register on the primary GT), so the XE_WA macro is returning false even when the WA should be applied. Fix this by using the primary GT in the macro. Note that this WA only applies to PXP and we don't yet support that in Xe, so there are no negative effects to this bug, which is why we didn't see any errors in testing. v2: use the primary GT in the macro instead of marking the WA as platform-wide (Lucas, Matt). Signed-off-by: Daniele Ceraolo Spurio Cc: Matt Roper Cc: Lucas De Marchi Reviewed-by: Matt Roper Reviewed-by: Lucas De Marchi Link: https://patchwork.freedesktop.org/patch/msgid/20240807235333.1370915-1-daniele.ceraolospurio@intel.com (cherry picked from commit e422c0bfd9e47e399e86bcc483f49d8b54064fc2) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_gsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gsc.c b/drivers/gpu/drm/xe/xe_gsc.c index f8239a13fa2b88..77ce44e845c5f0 100644 --- a/drivers/gpu/drm/xe/xe_gsc.c +++ b/drivers/gpu/drm/xe/xe_gsc.c @@ -260,7 +260,7 @@ static int gsc_upload_and_init(struct xe_gsc *gsc) struct xe_tile *tile = gt_to_tile(gt); int ret; - if (XE_WA(gt, 14018094691)) { + if (XE_WA(tile->primary_gt, 14018094691)) { ret = xe_force_wake_get(gt_to_fw(tile->primary_gt), XE_FORCEWAKE_ALL); /* @@ -278,7 +278,7 @@ static int gsc_upload_and_init(struct xe_gsc *gsc) ret = gsc_upload(gsc); - if (XE_WA(gt, 14018094691)) + if (XE_WA(tile->primary_gt, 14018094691)) xe_force_wake_put(gt_to_fw(tile->primary_gt), XE_FORCEWAKE_ALL); if (ret) From 8776b0234e1d008d8f19b26f6c3af1cfa6187070 Mon Sep 17 00:00:00 2001 From: Tejas Upadhyay Date: Mon, 12 Aug 2024 19:11:17 +0530 Subject: [PATCH 499/991] drm/xe/xe2hpg: Add Wa_14021821874 Wa_14021821874 applies to xe2_hpg V2(Himal): - Use space after define Cc: Matt Roper Reviewed-by: Himal Prasad Ghimiray Signed-off-by: Tejas Upadhyay Reviewed-by: Matt Roper Signed-off-by: Matt Roper Link: https://patchwork.freedesktop.org/patch/msgid/20240812134117.813670-1-tejas.upadhyay@intel.com (cherry picked from commit 21ff3a16e92e2fa4f906a61d148aca1423c58298) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/regs/xe_gt_regs.h | 1 + drivers/gpu/drm/xe/xe_wa.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h index fd9d94174efb12..3c286504005862 100644 --- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h @@ -437,6 +437,7 @@ #define DIS_FIX_EOT1_FLUSH REG_BIT(9) #define TDL_TSL_CHICKEN XE_REG_MCR(0xe4c4, XE_REG_OPTION_MASKED) +#define STK_ID_RESTRICT REG_BIT(12) #define SLM_WMTP_RESTORE REG_BIT(11) #define ROW_CHICKEN XE_REG_MCR(0xe4f0, XE_REG_OPTION_MASKED) diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c index fd009b2c68fa60..e648265d081be2 100644 --- a/drivers/gpu/drm/xe/xe_wa.c +++ b/drivers/gpu/drm/xe/xe_wa.c @@ -542,6 +542,10 @@ static const struct xe_rtp_entry_sr engine_was[] = { XE_RTP_RULES(GRAPHICS_VERSION(2001), ENGINE_CLASS(RENDER)), XE_RTP_ACTIONS(SET(HALF_SLICE_CHICKEN7, CLEAR_OPTIMIZATION_DISABLE)) }, + { XE_RTP_NAME("14021821874"), + XE_RTP_RULES(GRAPHICS_VERSION(2001), FUNC(xe_rtp_match_first_render_or_compute)), + XE_RTP_ACTIONS(SET(TDL_TSL_CHICKEN, STK_ID_RESTRICT)) + }, /* Xe2_LPM */ From 8636a5c29be1f05b5162a5c82c874338b6717759 Mon Sep 17 00:00:00 2001 From: Daniele Ceraolo Spurio Date: Fri, 9 Aug 2024 16:12:35 -0700 Subject: [PATCH 500/991] drm/xe: use devm instead of drmm for managed bo The BO cleanup touches the GGTT and therefore requires the HW to be available, so we need to use devm instead of drmm. Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/1160 Signed-off-by: Daniele Ceraolo Spurio Cc: Lucas De Marchi Cc: Matthew Auld Reviewed-by: Matthew Auld Reviewed-by: Lucas De Marchi Link: https://patchwork.freedesktop.org/patch/msgid/20240809231237.1503796-2-daniele.ceraolospurio@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 8d3a2d3d766a823c7510cdc17e6ff7c042c63b61) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_bo.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index 31192d983d9e19..261d3d6c8a9315 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -1575,7 +1575,7 @@ struct xe_bo *xe_bo_create_from_data(struct xe_device *xe, struct xe_tile *tile, return bo; } -static void __xe_bo_unpin_map_no_vm(struct drm_device *drm, void *arg) +static void __xe_bo_unpin_map_no_vm(void *arg) { xe_bo_unpin_map_no_vm(arg); } @@ -1590,7 +1590,7 @@ struct xe_bo *xe_managed_bo_create_pin_map(struct xe_device *xe, struct xe_tile if (IS_ERR(bo)) return bo; - ret = drmm_add_action_or_reset(&xe->drm, __xe_bo_unpin_map_no_vm, bo); + ret = devm_add_action_or_reset(xe->drm.dev, __xe_bo_unpin_map_no_vm, bo); if (ret) return ERR_PTR(ret); @@ -1638,7 +1638,7 @@ int xe_managed_bo_reinit_in_vram(struct xe_device *xe, struct xe_tile *tile, str if (IS_ERR(bo)) return PTR_ERR(bo); - drmm_release_action(&xe->drm, __xe_bo_unpin_map_no_vm, *src); + devm_release_action(xe->drm.dev, __xe_bo_unpin_map_no_vm, *src); *src = bo; return 0; From a06a7b3429e2548a28bb661f17347b8ffe4a8a15 Mon Sep 17 00:00:00 2001 From: Daniele Ceraolo Spurio Date: Thu, 15 Aug 2024 16:05:40 -0700 Subject: [PATCH 501/991] drm/xe/uc: Use devm to register cleanup that includes exec_queues Exec_queue cleanup requires HW access, so we need to use devm instead of drmm for it. Signed-off-by: Daniele Ceraolo Spurio Cc: John Harrison Cc: Alan Previn Reviewed-by: Matthew Brost Reviewed-by: Lucas De Marchi Reviewed-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20240815230541.3828206-2-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 5a891a0e69f134f53cc91b409f38e5ea1cafaf0a) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_gsc.c | 4 ++-- drivers/gpu/drm/xe/xe_guc_submit.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gsc.c b/drivers/gpu/drm/xe/xe_gsc.c index 77ce44e845c5f0..2a612652bb1381 100644 --- a/drivers/gpu/drm/xe/xe_gsc.c +++ b/drivers/gpu/drm/xe/xe_gsc.c @@ -437,7 +437,7 @@ int xe_gsc_init(struct xe_gsc *gsc) return ret; } -static void free_resources(struct drm_device *drm, void *arg) +static void free_resources(void *arg) { struct xe_gsc *gsc = arg; @@ -501,7 +501,7 @@ int xe_gsc_init_post_hwconfig(struct xe_gsc *gsc) gsc->q = q; gsc->wq = wq; - err = drmm_add_action_or_reset(&xe->drm, free_resources, gsc); + err = devm_add_action_or_reset(xe->drm.dev, free_resources, gsc); if (err) return err; diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index 6398629e6b4ecd..77b0f0d8f7297e 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -284,7 +284,7 @@ static void guc_submit_fini(struct drm_device *drm, void *arg) free_submit_wq(guc); } -static void guc_submit_wedged_fini(struct drm_device *drm, void *arg) +static void guc_submit_wedged_fini(void *arg) { struct xe_guc *guc = arg; struct xe_exec_queue *q; @@ -877,7 +877,7 @@ void xe_guc_submit_wedge(struct xe_guc *guc) xe_gt_assert(guc_to_gt(guc), guc_to_xe(guc)->wedged.mode); - err = drmm_add_action_or_reset(&guc_to_xe(guc)->drm, + err = devm_add_action_or_reset(guc_to_xe(guc)->drm.dev, guc_submit_wedged_fini, guc); if (err) { drm_err(&xe->drm, "Failed to register xe_guc_submit clean-up on wedged.mode=2. Although device is wedged.\n"); From 4e870e6bbec5c41c0d8b253282dca9465fbf5044 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 19 Aug 2024 17:04:42 -0700 Subject: [PATCH 502/991] Input: himax_hx83112b - fix incorrect size when reading product ID We need to read a u32 value (4 bytes), not size of a pointer to that value. Also, himax_read_mcu() wrapper is an overkill, remove it and use himax_bus_read() directly. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202408200301.Ujpj7Vov-lkp@intel.com/ Fixes: 0944829d491e ("Input: himax_hx83112b - implement MCU register reading") Tested-by: Felix Kaechele Link: https://lore.kernel.org/r/ZsPdmtfC54R7JVxR@google.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/himax_hx83112b.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/input/touchscreen/himax_hx83112b.c b/drivers/input/touchscreen/himax_hx83112b.c index 9ed3bccde4ac4a..896a145ddb2bc2 100644 --- a/drivers/input/touchscreen/himax_hx83112b.c +++ b/drivers/input/touchscreen/himax_hx83112b.c @@ -130,17 +130,6 @@ static int himax_bus_read(struct himax_ts_data *ts, u32 address, void *dst, return 0; } -static int himax_read_mcu(struct himax_ts_data *ts, u32 address, u32 *dst) -{ - int error; - - error = himax_bus_read(ts, address, dst, sizeof(dst)); - if (error) - return error; - - return 0; -} - static void himax_reset(struct himax_ts_data *ts) { gpiod_set_value_cansleep(ts->gpiod_rst, 1); @@ -160,7 +149,8 @@ static int himax_read_product_id(struct himax_ts_data *ts, u32 *product_id) { int error; - error = himax_read_mcu(ts, HIMAX_REG_ADDR_ICID, product_id); + error = himax_bus_read(ts, HIMAX_REG_ADDR_ICID, product_id, + sizeof(*product_id)); if (error) return error; From ce335db0621648472f9bb4b7191eb2e13a5793cf Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Fri, 16 Aug 2024 18:29:17 +0800 Subject: [PATCH 503/991] net: mctp: test: Use correct skb for route input check In the MCTP route input test, we're routing one skb, then (when delivery is expected) checking the resulting routed skb. However, we're currently checking the original skb length, rather than the routed skb. Check the routed skb instead; the original will have been freed at this point. Fixes: 8892c0490779 ("mctp: Add route input to socket tests") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/kernel-janitors/4ad204f0-94cf-46c5-bdab-49592addf315@kili.mountain/ Signed-off-by: Jeremy Kerr Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240816-mctp-kunit-skb-fix-v1-1-3c367ac89c27@codeconstruct.com.au Signed-off-by: Jakub Kicinski --- net/mctp/test/route-test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mctp/test/route-test.c b/net/mctp/test/route-test.c index 77e5dd42225802..8551dab1d1e698 100644 --- a/net/mctp/test/route-test.c +++ b/net/mctp/test/route-test.c @@ -366,7 +366,7 @@ static void mctp_test_route_input_sk(struct kunit *test) skb2 = skb_recv_datagram(sock->sk, MSG_DONTWAIT, &rc); KUNIT_EXPECT_NOT_ERR_OR_NULL(test, skb2); - KUNIT_EXPECT_EQ(test, skb->len, 1); + KUNIT_EXPECT_EQ(test, skb2->len, 1); skb_free_datagram(sock->sk, skb2); From 807067bf014d4a3ae2cc55bd3de16f22a01eb580 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 15 Aug 2024 15:04:37 -0700 Subject: [PATCH 504/991] kcm: Serialise kcm_sendmsg() for the same socket. syzkaller reported UAF in kcm_release(). [0] The scenario is 1. Thread A builds a skb with MSG_MORE and sets kcm->seq_skb. 2. Thread A resumes building skb from kcm->seq_skb but is blocked by sk_stream_wait_memory() 3. Thread B calls sendmsg() concurrently, finishes building kcm->seq_skb and puts the skb to the write queue 4. Thread A faces an error and finally frees skb that is already in the write queue 5. kcm_release() does double-free the skb in the write queue When a thread is building a MSG_MORE skb, another thread must not touch it. Let's add a per-sk mutex and serialise kcm_sendmsg(). [0]: BUG: KASAN: slab-use-after-free in __skb_unlink include/linux/skbuff.h:2366 [inline] BUG: KASAN: slab-use-after-free in __skb_dequeue include/linux/skbuff.h:2385 [inline] BUG: KASAN: slab-use-after-free in __skb_queue_purge_reason include/linux/skbuff.h:3175 [inline] BUG: KASAN: slab-use-after-free in __skb_queue_purge include/linux/skbuff.h:3181 [inline] BUG: KASAN: slab-use-after-free in kcm_release+0x170/0x4c8 net/kcm/kcmsock.c:1691 Read of size 8 at addr ffff0000ced0fc80 by task syz-executor329/6167 CPU: 1 PID: 6167 Comm: syz-executor329 Tainted: G B 6.8.0-rc5-syzkaller-g9abbc24128bc #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/25/2024 Call trace: dump_backtrace+0x1b8/0x1e4 arch/arm64/kernel/stacktrace.c:291 show_stack+0x2c/0x3c arch/arm64/kernel/stacktrace.c:298 __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xd0/0x124 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:377 [inline] print_report+0x178/0x518 mm/kasan/report.c:488 kasan_report+0xd8/0x138 mm/kasan/report.c:601 __asan_report_load8_noabort+0x20/0x2c mm/kasan/report_generic.c:381 __skb_unlink include/linux/skbuff.h:2366 [inline] __skb_dequeue include/linux/skbuff.h:2385 [inline] __skb_queue_purge_reason include/linux/skbuff.h:3175 [inline] __skb_queue_purge include/linux/skbuff.h:3181 [inline] kcm_release+0x170/0x4c8 net/kcm/kcmsock.c:1691 __sock_release net/socket.c:659 [inline] sock_close+0xa4/0x1e8 net/socket.c:1421 __fput+0x30c/0x738 fs/file_table.c:376 ____fput+0x20/0x30 fs/file_table.c:404 task_work_run+0x230/0x2e0 kernel/task_work.c:180 exit_task_work include/linux/task_work.h:38 [inline] do_exit+0x618/0x1f64 kernel/exit.c:871 do_group_exit+0x194/0x22c kernel/exit.c:1020 get_signal+0x1500/0x15ec kernel/signal.c:2893 do_signal+0x23c/0x3b44 arch/arm64/kernel/signal.c:1249 do_notify_resume+0x74/0x1f4 arch/arm64/kernel/entry-common.c:148 exit_to_user_mode_prepare arch/arm64/kernel/entry-common.c:169 [inline] exit_to_user_mode arch/arm64/kernel/entry-common.c:178 [inline] el0_svc+0xac/0x168 arch/arm64/kernel/entry-common.c:713 el0t_64_sync_handler+0x84/0xfc arch/arm64/kernel/entry-common.c:730 el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:598 Allocated by task 6166: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x40/0x78 mm/kasan/common.c:68 kasan_save_alloc_info+0x70/0x84 mm/kasan/generic.c:626 unpoison_slab_object mm/kasan/common.c:314 [inline] __kasan_slab_alloc+0x74/0x8c mm/kasan/common.c:340 kasan_slab_alloc include/linux/kasan.h:201 [inline] slab_post_alloc_hook mm/slub.c:3813 [inline] slab_alloc_node mm/slub.c:3860 [inline] kmem_cache_alloc_node+0x204/0x4c0 mm/slub.c:3903 __alloc_skb+0x19c/0x3d8 net/core/skbuff.c:641 alloc_skb include/linux/skbuff.h:1296 [inline] kcm_sendmsg+0x1d3c/0x2124 net/kcm/kcmsock.c:783 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] sock_sendmsg+0x220/0x2c0 net/socket.c:768 splice_to_socket+0x7cc/0xd58 fs/splice.c:889 do_splice_from fs/splice.c:941 [inline] direct_splice_actor+0xec/0x1d8 fs/splice.c:1164 splice_direct_to_actor+0x438/0xa0c fs/splice.c:1108 do_splice_direct_actor fs/splice.c:1207 [inline] do_splice_direct+0x1e4/0x304 fs/splice.c:1233 do_sendfile+0x460/0xb3c fs/read_write.c:1295 __do_sys_sendfile64 fs/read_write.c:1362 [inline] __se_sys_sendfile64 fs/read_write.c:1348 [inline] __arm64_sys_sendfile64+0x160/0x3b4 fs/read_write.c:1348 __invoke_syscall arch/arm64/kernel/syscall.c:37 [inline] invoke_syscall+0x98/0x2b8 arch/arm64/kernel/syscall.c:51 el0_svc_common+0x130/0x23c arch/arm64/kernel/syscall.c:136 do_el0_svc+0x48/0x58 arch/arm64/kernel/syscall.c:155 el0_svc+0x54/0x168 arch/arm64/kernel/entry-common.c:712 el0t_64_sync_handler+0x84/0xfc arch/arm64/kernel/entry-common.c:730 el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:598 Freed by task 6167: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x40/0x78 mm/kasan/common.c:68 kasan_save_free_info+0x5c/0x74 mm/kasan/generic.c:640 poison_slab_object+0x124/0x18c mm/kasan/common.c:241 __kasan_slab_free+0x3c/0x78 mm/kasan/common.c:257 kasan_slab_free include/linux/kasan.h:184 [inline] slab_free_hook mm/slub.c:2121 [inline] slab_free mm/slub.c:4299 [inline] kmem_cache_free+0x15c/0x3d4 mm/slub.c:4363 kfree_skbmem+0x10c/0x19c __kfree_skb net/core/skbuff.c:1109 [inline] kfree_skb_reason+0x240/0x6f4 net/core/skbuff.c:1144 kfree_skb include/linux/skbuff.h:1244 [inline] kcm_release+0x104/0x4c8 net/kcm/kcmsock.c:1685 __sock_release net/socket.c:659 [inline] sock_close+0xa4/0x1e8 net/socket.c:1421 __fput+0x30c/0x738 fs/file_table.c:376 ____fput+0x20/0x30 fs/file_table.c:404 task_work_run+0x230/0x2e0 kernel/task_work.c:180 exit_task_work include/linux/task_work.h:38 [inline] do_exit+0x618/0x1f64 kernel/exit.c:871 do_group_exit+0x194/0x22c kernel/exit.c:1020 get_signal+0x1500/0x15ec kernel/signal.c:2893 do_signal+0x23c/0x3b44 arch/arm64/kernel/signal.c:1249 do_notify_resume+0x74/0x1f4 arch/arm64/kernel/entry-common.c:148 exit_to_user_mode_prepare arch/arm64/kernel/entry-common.c:169 [inline] exit_to_user_mode arch/arm64/kernel/entry-common.c:178 [inline] el0_svc+0xac/0x168 arch/arm64/kernel/entry-common.c:713 el0t_64_sync_handler+0x84/0xfc arch/arm64/kernel/entry-common.c:730 el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:598 The buggy address belongs to the object at ffff0000ced0fc80 which belongs to the cache skbuff_head_cache of size 240 The buggy address is located 0 bytes inside of freed 240-byte region [ffff0000ced0fc80, ffff0000ced0fd70) The buggy address belongs to the physical page: page:00000000d35f4ae4 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x10ed0f flags: 0x5ffc00000000800(slab|node=0|zone=2|lastcpupid=0x7ff) page_type: 0xffffffff() raw: 05ffc00000000800 ffff0000c1cbf640 fffffdffc3423100 dead000000000004 raw: 0000000000000000 00000000000c000c 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff0000ced0fb80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff0000ced0fc00: fb fb fb fb fb fb fc fc fc fc fc fc fc fc fc fc >ffff0000ced0fc80: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff0000ced0fd00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fc fc ffff0000ced0fd80: fc fc fc fc fc fc fc fc fa fb fb fb fb fb fb fb Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module") Reported-by: syzbot+b72d86aa5df17ce74c60@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=b72d86aa5df17ce74c60 Tested-by: syzbot+b72d86aa5df17ce74c60@syzkaller.appspotmail.com Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20240815220437.69511-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/kcm.h | 1 + net/kcm/kcmsock.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/include/net/kcm.h b/include/net/kcm.h index 90279e5e09a5c0..441e993be634ce 100644 --- a/include/net/kcm.h +++ b/include/net/kcm.h @@ -70,6 +70,7 @@ struct kcm_sock { struct work_struct tx_work; struct list_head wait_psock_list; struct sk_buff *seq_skb; + struct mutex tx_mutex; u32 tx_stopped : 1; /* Don't use bit fields here, these are set under different locks */ diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 2f191e50d4fc94..d4118c796290e5 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -755,6 +755,7 @@ static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) !(msg->msg_flags & MSG_MORE) : !!(msg->msg_flags & MSG_EOR); int err = -EPIPE; + mutex_lock(&kcm->tx_mutex); lock_sock(sk); /* Per tcp_sendmsg this should be in poll */ @@ -926,6 +927,7 @@ static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) KCM_STATS_ADD(kcm->stats.tx_bytes, copied); release_sock(sk); + mutex_unlock(&kcm->tx_mutex); return copied; out_error: @@ -951,6 +953,7 @@ static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) sk->sk_write_space(sk); release_sock(sk); + mutex_unlock(&kcm->tx_mutex); return err; } @@ -1204,6 +1207,7 @@ static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux) spin_unlock_bh(&mux->lock); INIT_WORK(&kcm->tx_work, kcm_tx_work); + mutex_init(&kcm->tx_mutex); spin_lock_bh(&mux->rx_lock); kcm_rcv_ready(kcm); From 2102bdac67b55bf2d1df4ff757bced74e94a5f74 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 Aug 2024 21:03:07 -0400 Subject: [PATCH 505/991] bcachefs: Extra debug for data move path We don't have sufficient information to debug: https://github.com/koverstreet/bcachefs/issues/726 - print out durability of extent ptrs, when non default - print the number of replicas we need in data_update_to_text() Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 3 +++ fs/bcachefs/extents.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 6a854c9184965e..1ca628e93e87f1 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -475,6 +475,9 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, bch2_compression_opt_to_text(out, background_compression(*io_opts)); prt_newline(out); + prt_str(out, "opts.replicas:\t"); + prt_u64(out, io_opts->data_replicas); + prt_str(out, "extra replicas:\t"); prt_u64(out, data_opts->extra_replicas); } diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 4419ad3e454e4f..9406f82fc2550b 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1017,6 +1017,8 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc prt_printf(out, "ptr: %u:%llu:%u gen %u", ptr->dev, b, offset, ptr->gen); + if (ca->mi.durability != 1) + prt_printf(out, " d=%u", ca->mi.durability); if (ptr->cached) prt_str(out, " cached"); if (ptr->unwritten) From 50f2b98dc83de7809a5c5bf0ccf9af2e75c37c13 Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Tue, 13 Aug 2024 10:59:08 +0100 Subject: [PATCH 506/991] MIPS: cevt-r4k: Don't call get_c0_compare_int if timer irq is installed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This avoids warning: [ 0.118053] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:283 Caused by get_c0_compare_int on secondary CPU. We also skipped saving IRQ number to struct clock_event_device *cd as it's never used by clockevent core, as per comments it's only meant for "non CPU local devices". Reported-by: Serge Semin Closes: https://lore.kernel.org/linux-mips/6szkkqxpsw26zajwysdrwplpjvhl5abpnmxgu2xuj3dkzjnvsf@4daqrz4mf44k/ Signed-off-by: Jiaxun Yang Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Serge Semin Tested-by: Serge Semin Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/cevt-r4k.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/mips/kernel/cevt-r4k.c b/arch/mips/kernel/cevt-r4k.c index 368e8475870f08..5f6e9e2ebbdbb8 100644 --- a/arch/mips/kernel/cevt-r4k.c +++ b/arch/mips/kernel/cevt-r4k.c @@ -303,13 +303,6 @@ int r4k_clockevent_init(void) if (!c0_compare_int_usable()) return -ENXIO; - /* - * With vectored interrupts things are getting platform specific. - * get_c0_compare_int is a hook to allow a platform to return the - * interrupt number of its liking. - */ - irq = get_c0_compare_int(); - cd = &per_cpu(mips_clockevent_device, cpu); cd->name = "MIPS"; @@ -320,7 +313,6 @@ int r4k_clockevent_init(void) min_delta = calculate_min_delta(); cd->rating = 300; - cd->irq = irq; cd->cpumask = cpumask_of(cpu); cd->set_next_event = mips_next_event; cd->event_handler = mips_event_handler; @@ -332,6 +324,13 @@ int r4k_clockevent_init(void) cp0_timer_irq_installed = 1; + /* + * With vectored interrupts things are getting platform specific. + * get_c0_compare_int is a hook to allow a platform to return the + * interrupt number of its liking. + */ + irq = get_c0_compare_int(); + if (request_irq(irq, c0_compare_interrupt, flags, "timer", c0_compare_interrupt)) pr_err("Failed to request irq %d (timer)\n", irq); From 1eacdd71b3436b54d5fc8218c4bb0187d92a6892 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 20 Aug 2024 09:54:30 +0200 Subject: [PATCH 507/991] netfilter: nft_counter: Disable BH in nft_counter_offload_stats(). The sequence counter nft_counter_seq is a per-CPU counter. There is no lock associated with it. nft_counter_do_eval() is using the same counter and disables BH which suggest that it can be invoked from a softirq. This in turn means that nft_counter_offload_stats(), which disables only preemption, can be interrupted by nft_counter_do_eval() leading to two writer for one seqcount_t. This can lead to loosing stats or reading statistics while they are updated. Disable BH during stats update in nft_counter_offload_stats() to ensure one writer at a time. Fixes: b72920f6e4a9d ("netfilter: nftables: counter hardware offload support") Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index 291ed2026367ec..16f40b503d3798 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -265,7 +265,7 @@ static void nft_counter_offload_stats(struct nft_expr *expr, struct nft_counter *this_cpu; seqcount_t *myseq; - preempt_disable(); + local_bh_disable(); this_cpu = this_cpu_ptr(priv->counter); myseq = this_cpu_ptr(&nft_counter_seq); @@ -273,7 +273,7 @@ static void nft_counter_offload_stats(struct nft_expr *expr, this_cpu->packets += stats->pkts; this_cpu->bytes += stats->bytes; write_seqcount_end(myseq); - preempt_enable(); + local_bh_enable(); } void nft_counter_init_seqcount(void) From a0b39e2dc7017ac667b70bdeee5293e410fab2fb Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 20 Aug 2024 09:54:31 +0200 Subject: [PATCH 508/991] netfilter: nft_counter: Synchronize nft_counter_reset() against reader. nft_counter_reset() resets the counter by subtracting the previously retrieved value from the counter. This is a write operation on the counter and as such it requires to be performed with a write sequence of nft_counter_seq to serialize against its possible reader. Update the packets/ bytes within write-sequence of nft_counter_seq. Fixes: d84701ecbcd6a ("netfilter: nft_counter: rework atomic dump and reset") Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_counter.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index 16f40b503d3798..eab0dc66bee6bd 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -107,11 +107,16 @@ static void nft_counter_reset(struct nft_counter_percpu_priv *priv, struct nft_counter *total) { struct nft_counter *this_cpu; + seqcount_t *myseq; local_bh_disable(); this_cpu = this_cpu_ptr(priv->counter); + myseq = this_cpu_ptr(&nft_counter_seq); + + write_seqcount_begin(myseq); this_cpu->packets -= total->packets; this_cpu->bytes -= total->bytes; + write_seqcount_end(myseq); local_bh_enable(); } From 46ee21e9f59205e54943dfe51b2dc8a9352ca37d Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 16 Aug 2024 09:36:26 -0700 Subject: [PATCH 509/991] platform/x86: ISST: Fix return value on last invalid resource MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When only the last resource is invalid, tpmi_sst_dev_add() is returing error even if there are other valid resources before. This function should return error when there are no valid resources. Here tpmi_sst_dev_add() is returning "ret" variable. But this "ret" variable contains the failure status of last call to sst_main(), which failed for the invalid resource. But there may be other valid resources before the last entry. To address this, do not update "ret" variable for sst_main() return status. If there are no valid resources, it is already checked for by !inst below the loop and -ENODEV is returned. Fixes: 9d1d36268f3d ("platform/x86: ISST: Support partitioned systems") Signed-off-by: Srinivas Pandruvada Cc: stable@vger.kernel.org # 6.10+ Link: https://lore.kernel.org/r/20240816163626.415762-1-srinivas.pandruvada@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c b/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c index 7fa360073f6ef4..4045823071091a 100644 --- a/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c +++ b/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c @@ -1549,8 +1549,7 @@ int tpmi_sst_dev_add(struct auxiliary_device *auxdev) goto unlock_free; } - ret = sst_main(auxdev, &pd_info[i]); - if (ret) { + if (sst_main(auxdev, &pd_info[i])) { /* * This entry is not valid, hardware can partially * populate dies. In this case MMIO will have 0xFFs. From 4b3e33fcc38f7750604b065c55a43e94c5bc3145 Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Thu, 15 Aug 2024 17:14:16 +0200 Subject: [PATCH 510/991] ip6_tunnel: Fix broken GRO GRO code checks for matching layer 2 headers to see, if packet belongs to the same flow and because ip6 tunnel set dev->hard_header_len this check fails in cases, where it shouldn't. To fix this don't set hard_header_len, but use needed_headroom like ipv4/ip_tunnel.c does. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Thomas Bogendoerfer Link: https://patch.msgid.link/20240815151419.109864-1-tbogendoerfer@suse.de Signed-off-by: Paolo Abeni --- net/ipv6/ip6_tunnel.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 9dee0c12795540..87dfb565a9f81b 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1507,7 +1507,8 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) tdev = __dev_get_by_index(t->net, p->link); if (tdev) { - dev->hard_header_len = tdev->hard_header_len + t_hlen; + dev->needed_headroom = tdev->hard_header_len + + tdev->needed_headroom + t_hlen; mtu = min_t(unsigned int, tdev->mtu, IP6_MAX_MTU); mtu = mtu - t_hlen; @@ -1731,7 +1732,9 @@ ip6_tnl_siocdevprivate(struct net_device *dev, struct ifreq *ifr, int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) { struct ip6_tnl *tnl = netdev_priv(dev); + int t_hlen; + t_hlen = tnl->hlen + sizeof(struct ipv6hdr); if (tnl->parms.proto == IPPROTO_IPV6) { if (new_mtu < IPV6_MIN_MTU) return -EINVAL; @@ -1740,10 +1743,10 @@ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) return -EINVAL; } if (tnl->parms.proto == IPPROTO_IPV6 || tnl->parms.proto == 0) { - if (new_mtu > IP6_MAX_MTU - dev->hard_header_len) + if (new_mtu > IP6_MAX_MTU - dev->hard_header_len - t_hlen) return -EINVAL; } else { - if (new_mtu > IP_MAX_MTU - dev->hard_header_len) + if (new_mtu > IP_MAX_MTU - dev->hard_header_len - t_hlen) return -EINVAL; } WRITE_ONCE(dev->mtu, new_mtu); @@ -1887,12 +1890,11 @@ ip6_tnl_dev_init_gen(struct net_device *dev) t_hlen = t->hlen + sizeof(struct ipv6hdr); dev->type = ARPHRD_TUNNEL6; - dev->hard_header_len = LL_MAX_HEADER + t_hlen; dev->mtu = ETH_DATA_LEN - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; dev->min_mtu = ETH_MIN_MTU; - dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len; + dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len - t_hlen; netdev_hold(dev, &t->dev_tracker, GFP_KERNEL); netdev_lockdep_set_classes(dev); From 6275c7bc8dd07644ea8142a1773d826800f0f3f7 Mon Sep 17 00:00:00 2001 From: Ben Whitten Date: Sun, 11 Aug 2024 22:22:11 +0100 Subject: [PATCH 511/991] mmc: dw_mmc: allow biu and ciu clocks to defer Fix a race condition if the clock provider comes up after mmc is probed, this causes mmc to fail without retrying. When given the DEFER error from the clk source, pass it on up the chain. Fixes: f90a0612f0e1 ("mmc: dw_mmc: lookup for optional biu and ciu clocks") Signed-off-by: Ben Whitten Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240811212212.123255-1-ben.whitten@gmail.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/dw_mmc.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index 2333ef4893ee0d..e9f6e4e622901a 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -3299,6 +3299,10 @@ int dw_mci_probe(struct dw_mci *host) host->biu_clk = devm_clk_get(host->dev, "biu"); if (IS_ERR(host->biu_clk)) { dev_dbg(host->dev, "biu clock not available\n"); + ret = PTR_ERR(host->biu_clk); + if (ret == -EPROBE_DEFER) + return ret; + } else { ret = clk_prepare_enable(host->biu_clk); if (ret) { @@ -3310,6 +3314,10 @@ int dw_mci_probe(struct dw_mci *host) host->ciu_clk = devm_clk_get(host->dev, "ciu"); if (IS_ERR(host->ciu_clk)) { dev_dbg(host->dev, "ciu clock not available\n"); + ret = PTR_ERR(host->ciu_clk); + if (ret == -EPROBE_DEFER) + goto err_clk_biu; + host->bus_hz = host->pdata->bus_hz; } else { ret = clk_prepare_enable(host->ciu_clk); From ea72ce5da22806d5713f3ffb39a6d5ae73841f93 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 14 Aug 2024 00:29:36 +0200 Subject: [PATCH 512/991] x86/kaslr: Expose and use the end of the physical memory address space iounmap() on x86 occasionally fails to unmap because the provided valid ioremap address is not below high_memory. It turned out that this happens due to KASLR. KASLR uses the full address space between PAGE_OFFSET and vaddr_end to randomize the starting points of the direct map, vmalloc and vmemmap regions. It thereby limits the size of the direct map by using the installed memory size plus an extra configurable margin for hot-plug memory. This limitation is done to gain more randomization space because otherwise only the holes between the direct map, vmalloc, vmemmap and vaddr_end would be usable for randomizing. The limited direct map size is not exposed to the rest of the kernel, so the memory hot-plug and resource management related code paths still operate under the assumption that the available address space can be determined with MAX_PHYSMEM_BITS. request_free_mem_region() allocates from (1 << MAX_PHYSMEM_BITS) - 1 downwards. That means the first allocation happens past the end of the direct map and if unlucky this address is in the vmalloc space, which causes high_memory to become greater than VMALLOC_START and consequently causes iounmap() to fail for valid ioremap addresses. MAX_PHYSMEM_BITS cannot be changed for that because the randomization does not align with address bit boundaries and there are other places which actually require to know the maximum number of address bits. All remaining usage sites of MAX_PHYSMEM_BITS have been analyzed and found to be correct. Cure this by exposing the end of the direct map via PHYSMEM_END and use that for the memory hot-plug and resource management related places instead of relying on MAX_PHYSMEM_BITS. In the KASLR case PHYSMEM_END maps to a variable which is initialized by the KASLR initialization and otherwise it is based on MAX_PHYSMEM_BITS as before. To prevent future hickups add a check into add_pages() to catch callers trying to add memory above PHYSMEM_END. Fixes: 0483e1fa6e09 ("x86/mm: Implement ASLR for kernel memory regions") Reported-by: Max Ramanouski Reported-by: Alistair Popple Signed-off-by: Thomas Gleixner Tested-By: Max Ramanouski Tested-by: Alistair Popple Reviewed-by: Dan Williams Reviewed-by: Alistair Popple Reviewed-by: Kees Cook Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/87ed6soy3z.ffs@tglx --- arch/x86/include/asm/page_64.h | 1 + arch/x86/include/asm/pgtable_64_types.h | 4 ++++ arch/x86/mm/init_64.c | 4 ++++ arch/x86/mm/kaslr.c | 32 ++++++++++++++++++++----- include/linux/mm.h | 4 ++++ kernel/resource.c | 6 ++--- mm/memory_hotplug.c | 2 +- mm/sparse.c | 2 +- 8 files changed, 43 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index af4302d79b59bd..f3d257c452254d 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -17,6 +17,7 @@ extern unsigned long phys_base; extern unsigned long page_offset_base; extern unsigned long vmalloc_base; extern unsigned long vmemmap_base; +extern unsigned long physmem_end; static __always_inline unsigned long __phys_addr_nodebug(unsigned long x) { diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 9053dfe9fa03f9..a98e53491a4e6e 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -140,6 +140,10 @@ extern unsigned int ptrs_per_p4d; # define VMEMMAP_START __VMEMMAP_BASE_L4 #endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */ +#ifdef CONFIG_RANDOMIZE_MEMORY +# define PHYSMEM_END physmem_end +#endif + /* * End of the region for which vmalloc page tables are pre-allocated. * For non-KMSAN builds, this is the same as VMALLOC_END. diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index d8dbeac8b206dd..ff253648706fa9 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -958,8 +958,12 @@ static void update_end_of_memory_vars(u64 start, u64 size) int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, struct mhp_params *params) { + unsigned long end = ((start_pfn + nr_pages) << PAGE_SHIFT) - 1; int ret; + if (WARN_ON_ONCE(end > PHYSMEM_END)) + return -ERANGE; + ret = __add_pages(nid, start_pfn, nr_pages, params); WARN_ON_ONCE(ret); diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 37db264866b648..230f1dee4f0954 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -47,13 +47,24 @@ static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE; */ static __initdata struct kaslr_memory_region { unsigned long *base; + unsigned long *end; unsigned long size_tb; } kaslr_regions[] = { - { &page_offset_base, 0 }, - { &vmalloc_base, 0 }, - { &vmemmap_base, 0 }, + { + .base = &page_offset_base, + .end = &physmem_end, + }, + { + .base = &vmalloc_base, + }, + { + .base = &vmemmap_base, + }, }; +/* The end of the possible address space for physical memory */ +unsigned long physmem_end __ro_after_init; + /* Get size in bytes used by the memory region */ static inline unsigned long get_padding(struct kaslr_memory_region *region) { @@ -82,6 +93,8 @@ void __init kernel_randomize_memory(void) BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE); BUILD_BUG_ON(vaddr_end > __START_KERNEL_map); + /* Preset the end of the possible address space for physical memory */ + physmem_end = ((1ULL << MAX_PHYSMEM_BITS) - 1); if (!kaslr_memory_enabled()) return; @@ -128,11 +141,18 @@ void __init kernel_randomize_memory(void) vaddr += entropy; *kaslr_regions[i].base = vaddr; + /* Calculate the end of the region */ + vaddr += get_padding(&kaslr_regions[i]); /* - * Jump the region and add a minimum padding based on - * randomization alignment. + * KASLR trims the maximum possible size of the + * direct-map. Update the physmem_end boundary. + * No rounding required as the region starts + * PUD aligned and size is in units of TB. */ - vaddr += get_padding(&kaslr_regions[i]); + if (kaslr_regions[i].end) + *kaslr_regions[i].end = __pa_nodebug(vaddr - 1); + + /* Add a minimum padding based on randomization alignment. */ vaddr = round_up(vaddr + 1, PUD_SIZE); remain_entropy -= entropy; } diff --git a/include/linux/mm.h b/include/linux/mm.h index c4b238a20b76e9..b3864156eaa4e1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -97,6 +97,10 @@ extern const int mmap_rnd_compat_bits_max; extern int mmap_rnd_compat_bits __read_mostly; #endif +#ifndef PHYSMEM_END +# define PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1) +#endif + #include #include diff --git a/kernel/resource.c b/kernel/resource.c index 14777afb0a99e8..a83040fde236fb 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1826,8 +1826,7 @@ static resource_size_t gfr_start(struct resource *base, resource_size_t size, if (flags & GFR_DESCENDING) { resource_size_t end; - end = min_t(resource_size_t, base->end, - (1ULL << MAX_PHYSMEM_BITS) - 1); + end = min_t(resource_size_t, base->end, PHYSMEM_END); return end - size + 1; } @@ -1844,8 +1843,7 @@ static bool gfr_continue(struct resource *base, resource_size_t addr, * @size did not wrap 0. */ return addr > addr - size && - addr <= min_t(resource_size_t, base->end, - (1ULL << MAX_PHYSMEM_BITS) - 1); + addr <= min_t(resource_size_t, base->end, PHYSMEM_END); } static resource_size_t gfr_next(resource_size_t addr, resource_size_t size, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 66267c26ca1bbf..951878ab627a87 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1681,7 +1681,7 @@ struct range __weak arch_get_mappable_range(void) struct range mhp_get_pluggable_range(bool need_mapping) { - const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1; + const u64 max_phys = PHYSMEM_END; struct range mhp_range; if (need_mapping) { diff --git a/mm/sparse.c b/mm/sparse.c index e4b830091d1373..0c3bff882033c5 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -129,7 +129,7 @@ static inline int sparse_early_nid(struct mem_section *section) static void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, unsigned long *end_pfn) { - unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT); + unsigned long max_sparsemem_pfn = (PHYSMEM_END + 1) >> PAGE_SHIFT; /* * Sanity checks - do not allow an architecture to pass From a1e627af32ed60713941cbfc8075d44cad07f6dd Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 20 Aug 2024 11:44:08 +0300 Subject: [PATCH 513/991] mmc: mmc_test: Fix NULL dereference on allocation failure If the "test->highmem = alloc_pages()" allocation fails then calling __free_pages(test->highmem) will result in a NULL dereference. Also change the error code to -ENOMEM instead of returning success. Fixes: 2661081f5ab9 ("mmc_test: highmem tests") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/8c90be28-67b4-4b0d-a105-034dc72a0b31@stanley.mountain Signed-off-by: Ulf Hansson --- drivers/mmc/core/mmc_test.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/mmc/core/mmc_test.c b/drivers/mmc/core/mmc_test.c index 8f7f587a0025b3..b7f627a9fdeab9 100644 --- a/drivers/mmc/core/mmc_test.c +++ b/drivers/mmc/core/mmc_test.c @@ -3125,13 +3125,13 @@ static ssize_t mtf_test_write(struct file *file, const char __user *buf, test->buffer = kzalloc(BUFFER_SIZE, GFP_KERNEL); #ifdef CONFIG_HIGHMEM test->highmem = alloc_pages(GFP_KERNEL | __GFP_HIGHMEM, BUFFER_ORDER); + if (!test->highmem) { + count = -ENOMEM; + goto free_test_buffer; + } #endif -#ifdef CONFIG_HIGHMEM - if (test->buffer && test->highmem) { -#else if (test->buffer) { -#endif mutex_lock(&mmc_test_lock); mmc_test_run(test, testcase); mutex_unlock(&mmc_test_lock); @@ -3139,6 +3139,7 @@ static ssize_t mtf_test_write(struct file *file, const char __user *buf, #ifdef CONFIG_HIGHMEM __free_pages(test->highmem, BUFFER_ORDER); +free_test_buffer: #endif kfree(test->buffer); kfree(test); From 783bf5d09f86b9736605f3e01a3472e55ef98ff8 Mon Sep 17 00:00:00 2001 From: Carlos Song Date: Tue, 20 Aug 2024 15:06:58 +0800 Subject: [PATCH 514/991] spi: spi-fsl-lpspi: limit PRESCALE bit in TCR register Referring to the errata ERR051608 of I.MX93, LPSPI TCR[PRESCALE] can only be configured to be 0 or 1, other values are not valid and will cause LPSPI to not work. Add the prescale limitation for LPSPI in I.MX93. Other platforms are not affected. Signed-off-by: Carlos Song Link: https://patch.msgid.link/20240820070658.672127-1-carlos.song@nxp.com Signed-off-by: Mark Brown --- drivers/spi/spi-fsl-lpspi.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-fsl-lpspi.c b/drivers/spi/spi-fsl-lpspi.c index be261ac09df823..350c5d91d869f6 100644 --- a/drivers/spi/spi-fsl-lpspi.c +++ b/drivers/spi/spi-fsl-lpspi.c @@ -82,6 +82,10 @@ #define TCR_RXMSK BIT(19) #define TCR_TXMSK BIT(18) +struct fsl_lpspi_devtype_data { + u8 prescale_max; +}; + struct lpspi_config { u8 bpw; u8 chip_select; @@ -119,10 +123,25 @@ struct fsl_lpspi_data { bool usedma; struct completion dma_rx_completion; struct completion dma_tx_completion; + + const struct fsl_lpspi_devtype_data *devtype_data; +}; + +/* + * ERR051608 fixed or not: + * https://www.nxp.com/docs/en/errata/i.MX93_1P87f.pdf + */ +static struct fsl_lpspi_devtype_data imx93_lpspi_devtype_data = { + .prescale_max = 1, +}; + +static struct fsl_lpspi_devtype_data imx7ulp_lpspi_devtype_data = { + .prescale_max = 8, }; static const struct of_device_id fsl_lpspi_dt_ids[] = { - { .compatible = "fsl,imx7ulp-spi", }, + { .compatible = "fsl,imx7ulp-spi", .data = &imx7ulp_lpspi_devtype_data,}, + { .compatible = "fsl,imx93-spi", .data = &imx93_lpspi_devtype_data,}, { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, fsl_lpspi_dt_ids); @@ -297,9 +316,11 @@ static int fsl_lpspi_set_bitrate(struct fsl_lpspi_data *fsl_lpspi) { struct lpspi_config config = fsl_lpspi->config; unsigned int perclk_rate, scldiv, div; + u8 prescale_max; u8 prescale; perclk_rate = clk_get_rate(fsl_lpspi->clk_per); + prescale_max = fsl_lpspi->devtype_data->prescale_max; if (!config.speed_hz) { dev_err(fsl_lpspi->dev, @@ -315,7 +336,7 @@ static int fsl_lpspi_set_bitrate(struct fsl_lpspi_data *fsl_lpspi) div = DIV_ROUND_UP(perclk_rate, config.speed_hz); - for (prescale = 0; prescale < 8; prescale++) { + for (prescale = 0; prescale < prescale_max; prescale++) { scldiv = div / (1 << prescale) - 2; if (scldiv < 256) { fsl_lpspi->config.prescale = prescale; @@ -822,6 +843,7 @@ static int fsl_lpspi_init_rpm(struct fsl_lpspi_data *fsl_lpspi) static int fsl_lpspi_probe(struct platform_device *pdev) { + const struct fsl_lpspi_devtype_data *devtype_data; struct fsl_lpspi_data *fsl_lpspi; struct spi_controller *controller; struct resource *res; @@ -830,6 +852,10 @@ static int fsl_lpspi_probe(struct platform_device *pdev) u32 temp; bool is_target; + devtype_data = of_device_get_match_data(&pdev->dev); + if (!devtype_data) + return -ENODEV; + is_target = of_property_read_bool((&pdev->dev)->of_node, "spi-slave"); if (is_target) controller = devm_spi_alloc_target(&pdev->dev, @@ -848,6 +874,7 @@ static int fsl_lpspi_probe(struct platform_device *pdev) fsl_lpspi->is_target = is_target; fsl_lpspi->is_only_cs1 = of_property_read_bool((&pdev->dev)->of_node, "fsl,spi-only-use-cs1-sel"); + fsl_lpspi->devtype_data = devtype_data; init_completion(&fsl_lpspi->xfer_done); From fc59b9a5f7201b9f7272944596113a82cc7773d5 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 16 Aug 2024 14:48:10 +0300 Subject: [PATCH 515/991] bonding: fix bond_ipsec_offload_ok return type Fix the return type which should be bool. Fixes: 955b785ec6b3 ("bonding: fix suspicious RCU usage in bond_ipsec_offload_ok()") Signed-off-by: Nikolay Aleksandrov Reviewed-by: Hangbin Liu Signed-off-by: Paolo Abeni --- drivers/net/bonding/bond_main.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 1cd92c12e7824c..85b5868deeeaf6 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -599,34 +599,28 @@ static bool bond_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *xs) struct net_device *real_dev; struct slave *curr_active; struct bonding *bond; - int err; + bool ok = false; bond = netdev_priv(bond_dev); rcu_read_lock(); curr_active = rcu_dereference(bond->curr_active_slave); real_dev = curr_active->dev; - if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) { - err = false; + if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) goto out; - } - if (!xs->xso.real_dev) { - err = false; + if (!xs->xso.real_dev) goto out; - } if (!real_dev->xfrmdev_ops || !real_dev->xfrmdev_ops->xdo_dev_offload_ok || - netif_is_bond_master(real_dev)) { - err = false; + netif_is_bond_master(real_dev)) goto out; - } - err = real_dev->xfrmdev_ops->xdo_dev_offload_ok(skb, xs); + ok = real_dev->xfrmdev_ops->xdo_dev_offload_ok(skb, xs); out: rcu_read_unlock(); - return err; + return ok; } static const struct xfrmdev_ops bond_xfrmdev_ops = { From 95c90e4ad89d493a7a14fa200082e466e2548f9d Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 16 Aug 2024 14:48:11 +0300 Subject: [PATCH 516/991] bonding: fix null pointer deref in bond_ipsec_offload_ok We must check if there is an active slave before dereferencing the pointer. Fixes: 18cb261afd7b ("bonding: support hardware encryption offload to slaves") Signed-off-by: Nikolay Aleksandrov Reviewed-by: Hangbin Liu Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- drivers/net/bonding/bond_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 85b5868deeeaf6..65ddb71eebcda2 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -604,6 +604,8 @@ static bool bond_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *xs) bond = netdev_priv(bond_dev); rcu_read_lock(); curr_active = rcu_dereference(bond->curr_active_slave); + if (!curr_active) + goto out; real_dev = curr_active->dev; if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) From f8cde9805981c50d0c029063dc7d82821806fc44 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 16 Aug 2024 14:48:12 +0300 Subject: [PATCH 517/991] bonding: fix xfrm real_dev null pointer dereference We shouldn't set real_dev to NULL because packets can be in transit and xfrm might call xdo_dev_offload_ok() in parallel. All callbacks assume real_dev is set. Example trace: kernel: BUG: unable to handle page fault for address: 0000000000001030 kernel: bond0: (slave eni0np1): making interface the new active one kernel: #PF: supervisor write access in kernel mode kernel: #PF: error_code(0x0002) - not-present page kernel: PGD 0 P4D 0 kernel: Oops: 0002 [#1] PREEMPT SMP kernel: CPU: 4 PID: 2237 Comm: ping Not tainted 6.7.7+ #12 kernel: Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-2.fc40 04/01/2014 kernel: RIP: 0010:nsim_ipsec_offload_ok+0xc/0x20 [netdevsim] kernel: bond0: (slave eni0np1): bond_ipsec_add_sa_all: failed to add SA kernel: Code: e0 0f 0b 48 83 7f 38 00 74 de 0f 0b 48 8b 47 08 48 8b 37 48 8b 78 40 e9 b2 e5 9a d7 66 90 0f 1f 44 00 00 48 8b 86 80 02 00 00 <83> 80 30 10 00 00 01 b8 01 00 00 00 c3 0f 1f 80 00 00 00 00 0f 1f kernel: bond0: (slave eni0np1): making interface the new active one kernel: RSP: 0018:ffffabde81553b98 EFLAGS: 00010246 kernel: bond0: (slave eni0np1): bond_ipsec_add_sa_all: failed to add SA kernel: kernel: RAX: 0000000000000000 RBX: ffff9eb404e74900 RCX: ffff9eb403d97c60 kernel: RDX: ffffffffc090de10 RSI: ffff9eb404e74900 RDI: ffff9eb3c5de9e00 kernel: RBP: ffff9eb3c0a42000 R08: 0000000000000010 R09: 0000000000000014 kernel: R10: 7974203030303030 R11: 3030303030303030 R12: 0000000000000000 kernel: R13: ffff9eb3c5de9e00 R14: ffffabde81553cc8 R15: ffff9eb404c53000 kernel: FS: 00007f2a77a3ad00(0000) GS:ffff9eb43bd00000(0000) knlGS:0000000000000000 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 kernel: CR2: 0000000000001030 CR3: 00000001122ab000 CR4: 0000000000350ef0 kernel: bond0: (slave eni0np1): making interface the new active one kernel: Call Trace: kernel: kernel: ? __die+0x1f/0x60 kernel: bond0: (slave eni0np1): bond_ipsec_add_sa_all: failed to add SA kernel: ? page_fault_oops+0x142/0x4c0 kernel: ? do_user_addr_fault+0x65/0x670 kernel: ? kvm_read_and_reset_apf_flags+0x3b/0x50 kernel: bond0: (slave eni0np1): making interface the new active one kernel: ? exc_page_fault+0x7b/0x180 kernel: ? asm_exc_page_fault+0x22/0x30 kernel: ? nsim_bpf_uninit+0x50/0x50 [netdevsim] kernel: bond0: (slave eni0np1): bond_ipsec_add_sa_all: failed to add SA kernel: ? nsim_ipsec_offload_ok+0xc/0x20 [netdevsim] kernel: bond0: (slave eni0np1): making interface the new active one kernel: bond_ipsec_offload_ok+0x7b/0x90 [bonding] kernel: xfrm_output+0x61/0x3b0 kernel: bond0: (slave eni0np1): bond_ipsec_add_sa_all: failed to add SA kernel: ip_push_pending_frames+0x56/0x80 Fixes: 18cb261afd7b ("bonding: support hardware encryption offload to slaves") Signed-off-by: Nikolay Aleksandrov Reviewed-by: Hangbin Liu Signed-off-by: Paolo Abeni --- drivers/net/bonding/bond_main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 65ddb71eebcda2..f74bacf071fca0 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -582,7 +582,6 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) } else { slave->dev->xfrmdev_ops->xdo_dev_state_delete(ipsec->xs); } - ipsec->xs->xso.real_dev = NULL; } spin_unlock_bh(&bond->ipsec_lock); rcu_read_unlock(); From c4c5c5d2ef40a9f67a9241dc5422eac9ffe19547 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 16 Aug 2024 14:48:13 +0300 Subject: [PATCH 518/991] bonding: fix xfrm state handling when clearing active slave If the active slave is cleared manually the xfrm state is not flushed. This leads to xfrm add/del imbalance and adding the same state multiple times. For example when the device cannot handle anymore states we get: [ 1169.884811] bond0: (slave eni0np1): bond_ipsec_add_sa_all: failed to add SA because it's filled with the same state after multiple active slave clearings. This change also has a few nice side effects: user-space gets a notification for the change, the old device gets its mac address and promisc/mcast adjusted properly. Fixes: 18cb261afd7b ("bonding: support hardware encryption offload to slaves") Signed-off-by: Nikolay Aleksandrov Reviewed-by: Hangbin Liu Signed-off-by: Paolo Abeni --- drivers/net/bonding/bond_options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index bc80fb6397dcd0..95d59a18c0223f 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -936,7 +936,7 @@ static int bond_option_active_slave_set(struct bonding *bond, /* check to see if we are clearing active */ if (!slave_dev) { netdev_dbg(bond->dev, "Clearing current active slave\n"); - RCU_INIT_POINTER(bond->curr_active_slave, NULL); + bond_change_active_slave(bond, NULL); bond_select_active_slave(bond); } else { struct slave *old_active = rtnl_dereference(bond->curr_active_slave); From 4d936f10ff80274841537a26d1fbfe9984de0ef9 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Tue, 20 Aug 2024 09:18:50 +0530 Subject: [PATCH 519/991] irqchip/sifive-plic: Probe plic driver early for Allwinner D1 platform The latest Linux RISC-V no longer boots on the Allwinner D1 platform because the sun4i_timer driver fails to get an interrupt from PLIC due to the recent conversion of the PLIC to a platform driver. Converting the sun4i timer to a platform driver does not work either because the D1 does not have a SBI timer available so early boot hangs. See the 'Closes:' link for deeper analysis. The real fix requires enabling the SBI time extension in the platform firmware (OpenSBI) and convert sun4i_timer into platform driver. Unfortunately, the real fix involves changing multiple places and can't be achieved in a short duration and aside of that requires users to update firmware. As a work-around, retrofit PLIC probing such that the PLIC is probed early only for the Allwinner D1 platform and probed as a regular platform driver for rest of the RISC-V platforms. In the process, partially revert some of the previous changes because the PLIC device pointer is not available in all probing paths. Fixes: e306a894bd51 ("irqchip/sifive-plic: Chain to parent IRQ after handlers are ready") Fixes: 8ec99b033147 ("irqchip/sifive-plic: Convert PLIC driver into a platform driver") Suggested-by: Thomas Gleixner Signed-off-by: Anup Patel Signed-off-by: Thomas Gleixner Tested-by: Samuel Holland Tested-by: Emil Renner Berthing Tested-by: Charlie Jenkins Reviewed-by: Samuel Holland Reviewed-by: Charlie Jenkins Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20240820034850.3189912-1-apatel@ventanamicro.com Closes: https://lore.kernel.org/lkml/20240814145642.344485-1-emil.renner.berthing@canonical.com/ --- drivers/irqchip/irq-sifive-plic.c | 115 ++++++++++++++++++------------ 1 file changed, 71 insertions(+), 44 deletions(-) diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c index 9e22f7e378f502..4d9ea718086d30 100644 --- a/drivers/irqchip/irq-sifive-plic.c +++ b/drivers/irqchip/irq-sifive-plic.c @@ -3,6 +3,7 @@ * Copyright (C) 2017 SiFive * Copyright (C) 2018 Christoph Hellwig */ +#define pr_fmt(fmt) "riscv-plic: " fmt #include #include #include @@ -63,7 +64,7 @@ #define PLIC_QUIRK_EDGE_INTERRUPT 0 struct plic_priv { - struct device *dev; + struct fwnode_handle *fwnode; struct cpumask lmask; struct irq_domain *irqdomain; void __iomem *regs; @@ -378,8 +379,8 @@ static void plic_handle_irq(struct irq_desc *desc) int err = generic_handle_domain_irq(handler->priv->irqdomain, hwirq); if (unlikely(err)) { - dev_warn_ratelimited(handler->priv->dev, - "can't find mapping for hwirq %lu\n", hwirq); + pr_warn_ratelimited("%pfwP: can't find mapping for hwirq %lu\n", + handler->priv->fwnode, hwirq); } } @@ -408,7 +409,8 @@ static int plic_starting_cpu(unsigned int cpu) enable_percpu_irq(plic_parent_irq, irq_get_trigger_type(plic_parent_irq)); else - dev_warn(handler->priv->dev, "cpu%d: parent irq not available\n", cpu); + pr_warn("%pfwP: cpu%d: parent irq not available\n", + handler->priv->fwnode, cpu); plic_set_threshold(handler, PLIC_ENABLE_THRESHOLD); return 0; @@ -424,38 +426,36 @@ static const struct of_device_id plic_match[] = { {} }; -static int plic_parse_nr_irqs_and_contexts(struct platform_device *pdev, +static int plic_parse_nr_irqs_and_contexts(struct fwnode_handle *fwnode, u32 *nr_irqs, u32 *nr_contexts) { - struct device *dev = &pdev->dev; int rc; /* * Currently, only OF fwnode is supported so extend this * function for ACPI support. */ - if (!is_of_node(dev->fwnode)) + if (!is_of_node(fwnode)) return -EINVAL; - rc = of_property_read_u32(to_of_node(dev->fwnode), "riscv,ndev", nr_irqs); + rc = of_property_read_u32(to_of_node(fwnode), "riscv,ndev", nr_irqs); if (rc) { - dev_err(dev, "riscv,ndev property not available\n"); + pr_err("%pfwP: riscv,ndev property not available\n", fwnode); return rc; } - *nr_contexts = of_irq_count(to_of_node(dev->fwnode)); + *nr_contexts = of_irq_count(to_of_node(fwnode)); if (WARN_ON(!(*nr_contexts))) { - dev_err(dev, "no PLIC context available\n"); + pr_err("%pfwP: no PLIC context available\n", fwnode); return -EINVAL; } return 0; } -static int plic_parse_context_parent(struct platform_device *pdev, u32 context, +static int plic_parse_context_parent(struct fwnode_handle *fwnode, u32 context, u32 *parent_hwirq, int *parent_cpu) { - struct device *dev = &pdev->dev; struct of_phandle_args parent; unsigned long hartid; int rc; @@ -464,10 +464,10 @@ static int plic_parse_context_parent(struct platform_device *pdev, u32 context, * Currently, only OF fwnode is supported so extend this * function for ACPI support. */ - if (!is_of_node(dev->fwnode)) + if (!is_of_node(fwnode)) return -EINVAL; - rc = of_irq_parse_one(to_of_node(dev->fwnode), context, &parent); + rc = of_irq_parse_one(to_of_node(fwnode), context, &parent); if (rc) return rc; @@ -480,48 +480,55 @@ static int plic_parse_context_parent(struct platform_device *pdev, u32 context, return 0; } -static int plic_probe(struct platform_device *pdev) +static int plic_probe(struct fwnode_handle *fwnode) { int error = 0, nr_contexts, nr_handlers = 0, cpu, i; - struct device *dev = &pdev->dev; unsigned long plic_quirks = 0; struct plic_handler *handler; u32 nr_irqs, parent_hwirq; struct plic_priv *priv; irq_hw_number_t hwirq; + void __iomem *regs; - if (is_of_node(dev->fwnode)) { + if (is_of_node(fwnode)) { const struct of_device_id *id; - id = of_match_node(plic_match, to_of_node(dev->fwnode)); + id = of_match_node(plic_match, to_of_node(fwnode)); if (id) plic_quirks = (unsigned long)id->data; + + regs = of_iomap(to_of_node(fwnode), 0); + if (!regs) + return -ENOMEM; + } else { + return -ENODEV; } - error = plic_parse_nr_irqs_and_contexts(pdev, &nr_irqs, &nr_contexts); + error = plic_parse_nr_irqs_and_contexts(fwnode, &nr_irqs, &nr_contexts); if (error) - return error; + goto fail_free_regs; - priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); - if (!priv) - return -ENOMEM; + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) { + error = -ENOMEM; + goto fail_free_regs; + } - priv->dev = dev; + priv->fwnode = fwnode; priv->plic_quirks = plic_quirks; priv->nr_irqs = nr_irqs; + priv->regs = regs; - priv->regs = devm_platform_ioremap_resource(pdev, 0); - if (WARN_ON(!priv->regs)) - return -EIO; - - priv->prio_save = devm_bitmap_zalloc(dev, nr_irqs, GFP_KERNEL); - if (!priv->prio_save) - return -ENOMEM; + priv->prio_save = bitmap_zalloc(nr_irqs, GFP_KERNEL); + if (!priv->prio_save) { + error = -ENOMEM; + goto fail_free_priv; + } for (i = 0; i < nr_contexts; i++) { - error = plic_parse_context_parent(pdev, i, &parent_hwirq, &cpu); + error = plic_parse_context_parent(fwnode, i, &parent_hwirq, &cpu); if (error) { - dev_warn(dev, "hwirq for context%d not found\n", i); + pr_warn("%pfwP: hwirq for context%d not found\n", fwnode, i); continue; } @@ -543,7 +550,7 @@ static int plic_probe(struct platform_device *pdev) } if (cpu < 0) { - dev_warn(dev, "Invalid cpuid for context %d\n", i); + pr_warn("%pfwP: Invalid cpuid for context %d\n", fwnode, i); continue; } @@ -554,7 +561,7 @@ static int plic_probe(struct platform_device *pdev) */ handler = per_cpu_ptr(&plic_handlers, cpu); if (handler->present) { - dev_warn(dev, "handler already present for context %d.\n", i); + pr_warn("%pfwP: handler already present for context %d.\n", fwnode, i); plic_set_threshold(handler, PLIC_DISABLE_THRESHOLD); goto done; } @@ -568,8 +575,8 @@ static int plic_probe(struct platform_device *pdev) i * CONTEXT_ENABLE_SIZE; handler->priv = priv; - handler->enable_save = devm_kcalloc(dev, DIV_ROUND_UP(nr_irqs, 32), - sizeof(*handler->enable_save), GFP_KERNEL); + handler->enable_save = kcalloc(DIV_ROUND_UP(nr_irqs, 32), + sizeof(*handler->enable_save), GFP_KERNEL); if (!handler->enable_save) goto fail_cleanup_contexts; done: @@ -581,7 +588,7 @@ static int plic_probe(struct platform_device *pdev) nr_handlers++; } - priv->irqdomain = irq_domain_add_linear(to_of_node(dev->fwnode), nr_irqs + 1, + priv->irqdomain = irq_domain_add_linear(to_of_node(fwnode), nr_irqs + 1, &plic_irqdomain_ops, priv); if (WARN_ON(!priv->irqdomain)) goto fail_cleanup_contexts; @@ -619,13 +626,13 @@ static int plic_probe(struct platform_device *pdev) } } - dev_info(dev, "mapped %d interrupts with %d handlers for %d contexts.\n", - nr_irqs, nr_handlers, nr_contexts); + pr_info("%pfwP: mapped %d interrupts with %d handlers for %d contexts.\n", + fwnode, nr_irqs, nr_handlers, nr_contexts); return 0; fail_cleanup_contexts: for (i = 0; i < nr_contexts; i++) { - if (plic_parse_context_parent(pdev, i, &parent_hwirq, &cpu)) + if (plic_parse_context_parent(fwnode, i, &parent_hwirq, &cpu)) continue; if (parent_hwirq != RV_IRQ_EXT || cpu < 0) continue; @@ -634,17 +641,37 @@ static int plic_probe(struct platform_device *pdev) handler->present = false; handler->hart_base = NULL; handler->enable_base = NULL; + kfree(handler->enable_save); handler->enable_save = NULL; handler->priv = NULL; } - return -ENOMEM; + bitmap_free(priv->prio_save); +fail_free_priv: + kfree(priv); +fail_free_regs: + iounmap(regs); + return error; +} + +static int plic_platform_probe(struct platform_device *pdev) +{ + return plic_probe(pdev->dev.fwnode); } static struct platform_driver plic_driver = { .driver = { .name = "riscv-plic", .of_match_table = plic_match, + .suppress_bind_attrs = true, }, - .probe = plic_probe, + .probe = plic_platform_probe, }; builtin_platform_driver(plic_driver); + +static int __init plic_early_probe(struct device_node *node, + struct device_node *parent) +{ + return plic_probe(&node->fwnode); +} + +IRQCHIP_DECLARE(riscv, "allwinner,sun20i-d1-plic", plic_early_probe); From f97fd458763a4801d04dbb4a79d9ca6282d293ec Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 18 Aug 2024 18:16:25 +0100 Subject: [PATCH 520/991] irqchip/gic-v4: Fix ordering between vmapp and vpe locks The recently established lock ordering mandates that the per-VM vmapp_lock is acquired before taking the per-VPE lock. As it turns out, its_vpe_set_affinity() takes the VPE lock, and then calls into its_send_vmovp(), which itself takes the vmapp lock. Obviously, this is a lock order violation. As its_send_vmovp() is only called from its_vpe_set_affinity(), hoist the vmapp locking from the former into the latter, restoring the expected order. Fixes: f0eb154c39471 ("irqchip/gic-v4: Substitute vmovp_lock for a per-VM lock") Reported-by: Zhou Wang Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240818171625.3030584-1-maz@kernel.org --- drivers/irqchip/irq-gic-v3-its.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 9b34596b3542ed..fdec478ba5e70a 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -1329,12 +1329,6 @@ static void its_send_vmovp(struct its_vpe *vpe) return; } - /* - * Protect against concurrent updates of the mapping state on - * individual VMs. - */ - guard(raw_spinlock_irqsave)(&vpe->its_vm->vmapp_lock); - /* * Yet another marvel of the architecture. If using the * its_list "feature", we need to make sure that all ITSs @@ -3824,7 +3818,14 @@ static int its_vpe_set_affinity(struct irq_data *d, * protect us, and that we must ensure nobody samples vpe->col_idx * during the update, hence the lock below which must also be * taken on any vLPI handling path that evaluates vpe->col_idx. + * + * Finally, we must protect ourselves against concurrent updates of + * the mapping state on this VM should the ITS list be in use (see + * the shortcut in its_send_vmovp() otherewise). */ + if (its_list_map) + raw_spin_lock(&vpe->its_vm->vmapp_lock); + from = vpe_to_cpuid_lock(vpe, &flags); table_mask = gic_data_rdist_cpu(from)->vpe_table_mask; @@ -3854,6 +3855,9 @@ static int its_vpe_set_affinity(struct irq_data *d, irq_data_update_effective_affinity(d, cpumask_of(cpu)); vpe_to_cpuid_unlock(vpe, flags); + if (its_list_map) + raw_spin_unlock(&vpe->its_vm->vmapp_lock); + return IRQ_SET_MASK_OK_DONE; } From efe81b7bdf7d882d0ce3d183f1571321046da8f1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 20 Aug 2024 11:42:40 +0300 Subject: [PATCH 521/991] irqchip/riscv-aplic: Fix an IS_ERR() vs NULL bug in probe() The devm_platform_ioremap_resource() function doesn't return NULL, it returns error pointers. Fix the error handling to match. Fixes: 2333df5ae51e ("irqchip: Add RISC-V advanced PLIC driver for direct-mode") Signed-off-by: Dan Carpenter Signed-off-by: Thomas Gleixner Reviewed-by: Jinjie Ruan Reviewed-by: Anup Patel Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/a5a628d6-81d8-4933-81a8-64aad4743ec4@stanley.mountain --- drivers/irqchip/irq-riscv-aplic-main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-riscv-aplic-main.c b/drivers/irqchip/irq-riscv-aplic-main.c index 28dd175b576408..981fad6fb8f71f 100644 --- a/drivers/irqchip/irq-riscv-aplic-main.c +++ b/drivers/irqchip/irq-riscv-aplic-main.c @@ -175,9 +175,9 @@ static int aplic_probe(struct platform_device *pdev) /* Map the MMIO registers */ regs = devm_platform_ioremap_resource(pdev, 0); - if (!regs) { + if (IS_ERR(regs)) { dev_err(dev, "failed map MMIO registers\n"); - return -ENOMEM; + return PTR_ERR(regs); } /* From c5af2c90ba5629f0424a8d315f75fb8d91713c3c Mon Sep 17 00:00:00 2001 From: Ma Ke Date: Tue, 20 Aug 2024 17:28:43 +0800 Subject: [PATCH 522/991] irqchip/gic-v2m: Fix refcount leak in gicv2m_of_init() gicv2m_of_init() fails to perform an of_node_put() when of_address_to_resource() fails, leading to a refcount leak. Address this by moving the error handling path outside of the loop and making it common to all failure modes. Fixes: 4266ab1a8ff5 ("irqchip/gic-v2m: Refactor to prepare for ACPI support") Signed-off-by: Ma Ke Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20240820092843.1219933-1-make24@iscas.ac.cn --- drivers/irqchip/irq-gic-v2m.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c index 51af63c046edb0..be35c5349986aa 100644 --- a/drivers/irqchip/irq-gic-v2m.c +++ b/drivers/irqchip/irq-gic-v2m.c @@ -407,12 +407,12 @@ static int __init gicv2m_of_init(struct fwnode_handle *parent_handle, ret = gicv2m_init_one(&child->fwnode, spi_start, nr_spis, &res, 0); - if (ret) { - of_node_put(child); + if (ret) break; - } } + if (ret && child) + of_node_put(child); if (!ret) ret = gicv2m_allocate_domains(parent); if (ret) From 50b2143356e888777fc5bca023c39f34f404613a Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 7 Aug 2024 12:53:24 +0200 Subject: [PATCH 523/991] ice: fix page reuse when PAGE_SIZE is over 8k Architectures that have PAGE_SIZE >= 8192 such as arm64 should act the same as x86 currently, meaning reuse of a page should only take place when no one else is busy with it. Do two things independently of underlying PAGE_SIZE: - store the page count under ice_rx_buf::pgcnt - then act upon its value vs ice_rx_buf::pagecnt_bias when making the decision regarding page reuse Fixes: 2b245cb29421 ("ice: Implement transmit and NAPI support") Signed-off-by: Maciej Fijalkowski Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_txrx.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 8d25b698126982..50211188c1a7aa 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -837,16 +837,15 @@ ice_can_reuse_rx_page(struct ice_rx_buf *rx_buf) if (!dev_page_is_reusable(page)) return false; -#if (PAGE_SIZE < 8192) /* if we are only owner of page we can reuse it */ if (unlikely(rx_buf->pgcnt - pagecnt_bias > 1)) return false; -#else +#if (PAGE_SIZE >= 8192) #define ICE_LAST_OFFSET \ (SKB_WITH_OVERHEAD(PAGE_SIZE) - ICE_RXBUF_2048) if (rx_buf->page_offset > ICE_LAST_OFFSET) return false; -#endif /* PAGE_SIZE < 8192) */ +#endif /* PAGE_SIZE >= 8192) */ /* If we have drained the page fragment pool we need to update * the pagecnt_bias and page count so that we fully restock the @@ -949,12 +948,7 @@ ice_get_rx_buf(struct ice_rx_ring *rx_ring, const unsigned int size, struct ice_rx_buf *rx_buf; rx_buf = &rx_ring->rx_buf[ntc]; - rx_buf->pgcnt = -#if (PAGE_SIZE < 8192) - page_count(rx_buf->page); -#else - 0; -#endif + rx_buf->pgcnt = page_count(rx_buf->page); prefetchw(rx_buf->page); if (!size) From b966ad832942b5a11e002f9b5ef102b08425b84a Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 7 Aug 2024 12:53:25 +0200 Subject: [PATCH 524/991] ice: fix ICE_LAST_OFFSET formula For bigger PAGE_SIZE archs, ice driver works on 3k Rx buffers. Therefore, ICE_LAST_OFFSET should take into account ICE_RXBUF_3072, not ICE_RXBUF_2048. Fixes: 7237f5b0dba4 ("ice: introduce legacy Rx flag") Suggested-by: Luiz Capitulino Signed-off-by: Maciej Fijalkowski Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_txrx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 50211188c1a7aa..4b690952bb403c 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -842,7 +842,7 @@ ice_can_reuse_rx_page(struct ice_rx_buf *rx_buf) return false; #if (PAGE_SIZE >= 8192) #define ICE_LAST_OFFSET \ - (SKB_WITH_OVERHEAD(PAGE_SIZE) - ICE_RXBUF_2048) + (SKB_WITH_OVERHEAD(PAGE_SIZE) - ICE_RXBUF_3072) if (rx_buf->page_offset > ICE_LAST_OFFSET) return false; #endif /* PAGE_SIZE >= 8192) */ From d53d4dcce69be5773e2d0878c9899ebfbf58c393 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 7 Aug 2024 12:53:26 +0200 Subject: [PATCH 525/991] ice: fix truesize operations for PAGE_SIZE >= 8192 When working on multi-buffer packet on arch that has PAGE_SIZE >= 8192, truesize is calculated and stored in xdp_buff::frame_sz per each processed Rx buffer. This means that frame_sz will contain the truesize based on last received buffer, but commit 1dc1a7e7f410 ("ice: Centrallize Rx buffer recycling") assumed this value will be constant for each buffer, which breaks the page recycling scheme and mess up the way we update the page::page_offset. To fix this, let us work on constant truesize when PAGE_SIZE >= 8192 instead of basing this on size of a packet read from Rx descriptor. This way we can simplify the code and avoid calculating truesize per each received frame and on top of that when using xdp_update_skb_shared_info(), current formula for truesize update will be valid. This means ice_rx_frame_truesize() can be removed altogether. Furthermore, first call to it within ice_clean_rx_irq() for 4k PAGE_SIZE was redundant as xdp_buff::frame_sz is initialized via xdp_init_buff() in ice_vsi_cfg_rxq(). This should have been removed at the point where xdp_buff struct started to be a member of ice_rx_ring and it was no longer a stack based variable. There are two fixes tags as my understanding is that the first one exposed us to broken truesize and page_offset handling and then second introduced broken skb_shared_info update in ice_{construct,build}_skb(). Reported-and-tested-by: Luiz Capitulino Closes: https://lore.kernel.org/netdev/8f9e2a5c-fd30-4206-9311-946a06d031bb@redhat.com/ Fixes: 1dc1a7e7f410 ("ice: Centrallize Rx buffer recycling") Fixes: 2fba7dc5157b ("ice: Add support for XDP multi-buffer on Rx side") Signed-off-by: Maciej Fijalkowski Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_base.c | 21 ++++++++++++++- drivers/net/ethernet/intel/ice/ice_txrx.c | 33 ----------------------- 2 files changed, 20 insertions(+), 34 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index 1facf179a96fd6..f448d3a8456425 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -512,6 +512,25 @@ static void ice_xsk_pool_fill_cb(struct ice_rx_ring *ring) xsk_pool_fill_cb(ring->xsk_pool, &desc); } +/** + * ice_get_frame_sz - calculate xdp_buff::frame_sz + * @rx_ring: the ring being configured + * + * Return frame size based on underlying PAGE_SIZE + */ +static unsigned int ice_get_frame_sz(struct ice_rx_ring *rx_ring) +{ + unsigned int frame_sz; + +#if (PAGE_SIZE >= 8192) + frame_sz = rx_ring->rx_buf_len; +#else + frame_sz = ice_rx_pg_size(rx_ring) / 2; +#endif + + return frame_sz; +} + /** * ice_vsi_cfg_rxq - Configure an Rx queue * @ring: the ring being configured @@ -576,7 +595,7 @@ static int ice_vsi_cfg_rxq(struct ice_rx_ring *ring) } } - xdp_init_buff(&ring->xdp, ice_rx_pg_size(ring) / 2, &ring->xdp_rxq); + xdp_init_buff(&ring->xdp, ice_get_frame_sz(ring), &ring->xdp_rxq); ring->xdp.data = NULL; ring->xdp_ext.pkt_ctx = &ring->pkt_ctx; err = ice_setup_rx_ctx(ring); diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 4b690952bb403c..c9bc3f1add5d37 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -521,30 +521,6 @@ int ice_setup_rx_ring(struct ice_rx_ring *rx_ring) return -ENOMEM; } -/** - * ice_rx_frame_truesize - * @rx_ring: ptr to Rx ring - * @size: size - * - * calculate the truesize with taking into the account PAGE_SIZE of - * underlying arch - */ -static unsigned int -ice_rx_frame_truesize(struct ice_rx_ring *rx_ring, const unsigned int size) -{ - unsigned int truesize; - -#if (PAGE_SIZE < 8192) - truesize = ice_rx_pg_size(rx_ring) / 2; /* Must be power-of-2 */ -#else - truesize = rx_ring->rx_offset ? - SKB_DATA_ALIGN(rx_ring->rx_offset + size) + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) : - SKB_DATA_ALIGN(size); -#endif - return truesize; -} - /** * ice_run_xdp - Executes an XDP program on initialized xdp_buff * @rx_ring: Rx ring @@ -1154,11 +1130,6 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) bool failure; u32 first; - /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */ -#if (PAGE_SIZE < 8192) - xdp->frame_sz = ice_rx_frame_truesize(rx_ring, 0); -#endif - xdp_prog = READ_ONCE(rx_ring->xdp_prog); if (xdp_prog) { xdp_ring = rx_ring->xdp_ring; @@ -1217,10 +1188,6 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) hard_start = page_address(rx_buf->page) + rx_buf->page_offset - offset; xdp_prepare_buff(xdp, hard_start, offset, size, !!offset); -#if (PAGE_SIZE > 4096) - /* At larger PAGE_SIZE, frame_sz depend on len size */ - xdp->frame_sz = ice_rx_frame_truesize(rx_ring, size); -#endif xdp_buff_clear_frags_flag(xdp); } else if (ice_add_xdp_frag(rx_ring, xdp, rx_buf, size)) { break; From 503ab6ee40fc103ea55cc9e50bb879e571d65aac Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Mon, 19 Aug 2024 09:17:42 +0200 Subject: [PATCH 526/991] ice: use internal pf id instead of function number Use always the same pf id in devlink port number. When doing pass-through the PF to VM bus info func number can be any value. Fixes: 2ae0aa4758b0 ("ice: Move devlink port to PF/VF struct") Reviewed-by: Wojciech Drewek Suggested-by: Jiri Pirko Signed-off-by: Michal Swiatkowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/devlink/devlink_port.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/devlink/devlink_port.c b/drivers/net/ethernet/intel/ice/devlink/devlink_port.c index 00fed5a61d62a9..62ef8e2fb5f1ba 100644 --- a/drivers/net/ethernet/intel/ice/devlink/devlink_port.c +++ b/drivers/net/ethernet/intel/ice/devlink/devlink_port.c @@ -337,7 +337,7 @@ int ice_devlink_create_pf_port(struct ice_pf *pf) return -EIO; attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL; - attrs.phys.port_number = pf->hw.bus.func; + attrs.phys.port_number = pf->hw.pf_id; /* As FW supports only port split options for whole device, * set port split options only for first PF. @@ -455,7 +455,7 @@ int ice_devlink_create_vf_port(struct ice_vf *vf) return -EINVAL; attrs.flavour = DEVLINK_PORT_FLAVOUR_PCI_VF; - attrs.pci_vf.pf = pf->hw.bus.func; + attrs.pci_vf.pf = pf->hw.pf_id; attrs.pci_vf.vf = vf->vf_id; ice_devlink_set_switch_id(pf, &attrs.switch_id); From c50e7475961c36ec4d21d60af055b32f9436b431 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 17 Aug 2024 09:52:46 +0300 Subject: [PATCH 527/991] dpaa2-switch: Fix error checking in dpaa2_switch_seed_bp() The dpaa2_switch_add_bufs() function returns the number of bufs that it was able to add. It returns BUFS_PER_CMD (7) for complete success or a smaller number if there are not enough pages available. However, the error checking is looking at the total number of bufs instead of the number which were added on this iteration. Thus the error checking only works correctly for the first iteration through the loop and subsequent iterations are always counted as a success. Fix this by checking only the bufs added in the current iteration. Fixes: 0b1b71370458 ("staging: dpaa2-switch: handle Rx path on control interface") Signed-off-by: Dan Carpenter Reviewed-by: Simon Horman Reviewed-by: Ioana Ciornei Tested-by: Ioana Ciornei Link: https://patch.msgid.link/eec27f30-b43f-42b6-b8ee-04a6f83423b6@stanley.mountain Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c index a71f848adc0545..a293b08f36d46d 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c @@ -2638,13 +2638,14 @@ static int dpaa2_switch_refill_bp(struct ethsw_core *ethsw) static int dpaa2_switch_seed_bp(struct ethsw_core *ethsw) { - int *count, i; + int *count, ret, i; for (i = 0; i < DPAA2_ETHSW_NUM_BUFS; i += BUFS_PER_CMD) { + ret = dpaa2_switch_add_bufs(ethsw, ethsw->bpid); count = ðsw->buf_count; - *count += dpaa2_switch_add_bufs(ethsw, ethsw->bpid); + *count += ret; - if (unlikely(*count < BUFS_PER_CMD)) + if (unlikely(ret < BUFS_PER_CMD)) return -ENOMEM; } From 80a1e7b83bb1834b5568a3872e64c05795d88f31 Mon Sep 17 00:00:00 2001 From: Nikolay Kuratov Date: Mon, 19 Aug 2024 10:54:08 +0300 Subject: [PATCH 528/991] cxgb4: add forgotten u64 ivlan cast before shift It is done everywhere in cxgb4 code, e.g. in is_filter_exact_match() There is no reason it should not be done here Found by Linux Verification Center (linuxtesting.org) with SVACE Signed-off-by: Nikolay Kuratov Cc: stable@vger.kernel.org Fixes: 12b276fbf6e0 ("cxgb4: add support to create hash filters") Reviewed-by: Simon Horman Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20240819075408.92378-1-kniv@yandex-team.ru Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c index 786ceae3448875..dd9e68465e6978 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c @@ -1244,7 +1244,8 @@ static u64 hash_filter_ntuple(struct ch_filter_specification *fs, * in the Compressed Filter Tuple. */ if (tp->vlan_shift >= 0 && fs->mask.ivlan) - ntuple |= (FT_VLAN_VLD_F | fs->val.ivlan) << tp->vlan_shift; + ntuple |= (u64)(FT_VLAN_VLD_F | + fs->val.ivlan) << tp->vlan_shift; if (tp->port_shift >= 0 && fs->mask.iport) ntuple |= (u64)fs->val.iport << tp->port_shift; From 8aba27c4a5020abdf60149239198297f88338a8d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 16 Aug 2024 17:20:34 +0200 Subject: [PATCH 529/991] igb: cope with large MAX_SKB_FRAGS Sabrina reports that the igb driver does not cope well with large MAX_SKB_FRAG values: setting MAX_SKB_FRAG to 45 causes payload corruption on TX. An easy reproducer is to run ssh to connect to the machine. With MAX_SKB_FRAGS=17 it works, with MAX_SKB_FRAGS=45 it fails. This has been reported originally in https://bugzilla.redhat.com/show_bug.cgi?id=2265320 The root cause of the issue is that the driver does not take into account properly the (possibly large) shared info size when selecting the ring layout, and will try to fit two packets inside the same 4K page even when the 1st fraglist will trump over the 2nd head. Address the issue by checking if 2K buffers are insufficient. Fixes: 3948b05950fd ("net: introduce a config option to tweak MAX_SKB_FRAGS") Reported-by: Jan Tluka Reported-by: Jirka Hladky Reported-by: Sabrina Dubroca Tested-by: Sabrina Dubroca Tested-by: Corinna Vinschen Signed-off-by: Paolo Abeni Signed-off-by: Corinna Vinschen Link: https://patch.msgid.link/20240816152034.1453285-1-vinschen@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/igb/igb_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 11be39f435f38d..33a42b4c21e0b6 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -4808,6 +4808,7 @@ static void igb_set_rx_buffer_len(struct igb_adapter *adapter, #if (PAGE_SIZE < 8192) if (adapter->max_frame_size > IGB_MAX_FRAME_BUILD_SKB || + IGB_2K_TOO_SMALL_WITH_PADDING || rd32(E1000_RCTL) & E1000_RCTL_SBP) set_ring_uses_large_buffer(rx_ring); #endif From 6efea5135417ae8194485d1d05ea79a21cf1a11c Mon Sep 17 00:00:00 2001 From: Martin Whitaker Date: Sat, 17 Aug 2024 10:41:41 +0100 Subject: [PATCH 530/991] net: dsa: microchip: fix PTP config failure when using multiple ports When performing the port_hwtstamp_set operation, ptp_schedule_worker() will be called if hardware timestamoing is enabled on any of the ports. When using multiple ports for PTP, port_hwtstamp_set is executed for each port. When called for the first time ptp_schedule_worker() returns 0. On subsequent calls it returns 1, indicating the worker is already scheduled. Currently the ksz driver treats 1 as an error and fails to complete the port_hwtstamp_set operation, thus leaving the timestamping configuration for those ports unchanged. This patch fixes this by ignoring the ptp_schedule_worker() return value. Cc: stable@vger.kernel.org Link: https://lore.kernel.org/7aae307a-35ca-4209-a850-7b2749d40f90@martin-whitaker.me.uk Fixes: bb01ad30570b0 ("net: dsa: microchip: ptp: manipulating absolute time using ptp hw clock") Signed-off-by: Martin Whitaker Reviewed-by: Andrew Lunn Acked-by: Arun Ramadoss Link: https://patch.msgid.link/20240817094141.3332-1-foss@martin-whitaker.me.uk Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/ksz_ptp.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/dsa/microchip/ksz_ptp.c b/drivers/net/dsa/microchip/ksz_ptp.c index f0bd46e5d4ec0f..050f17c43ef60d 100644 --- a/drivers/net/dsa/microchip/ksz_ptp.c +++ b/drivers/net/dsa/microchip/ksz_ptp.c @@ -266,7 +266,6 @@ static int ksz_ptp_enable_mode(struct ksz_device *dev) struct ksz_port *prt; struct dsa_port *dp; bool tag_en = false; - int ret; dsa_switch_for_each_user_port(dp, dev->ds) { prt = &dev->ports[dp->index]; @@ -277,9 +276,7 @@ static int ksz_ptp_enable_mode(struct ksz_device *dev) } if (tag_en) { - ret = ptp_schedule_worker(ptp_data->clock, 0); - if (ret) - return ret; + ptp_schedule_worker(ptp_data->clock, 0); } else { ptp_cancel_worker_sync(ptp_data->clock); } From 528876d867a23b5198022baf2e388052ca67c952 Mon Sep 17 00:00:00 2001 From: Joseph Huang Date: Mon, 19 Aug 2024 19:52:50 -0400 Subject: [PATCH 531/991] net: dsa: mv88e6xxx: Fix out-of-bound access If an ATU violation was caused by a CPU Load operation, the SPID could be larger than DSA_MAX_PORTS (the size of mv88e6xxx_chip.ports[] array). Fixes: 75c05a74e745 ("net: dsa: mv88e6xxx: Fix counting of ATU violations") Signed-off-by: Joseph Huang Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20240819235251.1331763-1-Joseph.Huang@garmin.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/global1_atu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/mv88e6xxx/global1_atu.c b/drivers/net/dsa/mv88e6xxx/global1_atu.c index ce3b3690c3c057..c47f068f56b32a 100644 --- a/drivers/net/dsa/mv88e6xxx/global1_atu.c +++ b/drivers/net/dsa/mv88e6xxx/global1_atu.c @@ -457,7 +457,8 @@ static irqreturn_t mv88e6xxx_g1_atu_prob_irq_thread_fn(int irq, void *dev_id) trace_mv88e6xxx_atu_full_violation(chip->dev, spid, entry.portvec, entry.mac, fid); - chip->ports[spid].atu_full_violation++; + if (spid < ARRAY_SIZE(chip->ports)) + chip->ports[spid].atu_full_violation++; } return IRQ_HANDLED; From c07ff8592d57ed258afee5a5e04991a48dbaf382 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 19 Aug 2024 10:56:45 -0700 Subject: [PATCH 532/991] netem: fix return value if duplicate enqueue fails There is a bug in netem_enqueue() introduced by commit 5845f706388a ("net: netem: fix skb length BUG_ON in __skb_to_sgvec") that can lead to a use-after-free. This commit made netem_enqueue() always return NET_XMIT_SUCCESS when a packet is duplicated, which can cause the parent qdisc's q.qlen to be mistakenly incremented. When this happens qlen_notify() may be skipped on the parent during destruction, leaving a dangling pointer for some classful qdiscs like DRR. There are two ways for the bug happen: - If the duplicated packet is dropped by rootq->enqueue() and then the original packet is also dropped. - If rootq->enqueue() sends the duplicated packet to a different qdisc and the original packet is dropped. In both cases NET_XMIT_SUCCESS is returned even though no packets are enqueued at the netem qdisc. The fix is to defer the enqueue of the duplicate packet until after the original packet has been guaranteed to return NET_XMIT_SUCCESS. Fixes: 5845f706388a ("net: netem: fix skb length BUG_ON in __skb_to_sgvec") Reported-by: Budimir Markovic Signed-off-by: Stephen Hemminger Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240819175753.5151-1-stephen@networkplumber.org Signed-off-by: Jakub Kicinski --- net/sched/sch_netem.c | 47 ++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index edc72962ae63a7..0f8d581438c392 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -446,12 +446,10 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct netem_sched_data *q = qdisc_priv(sch); /* We don't fill cb now as skb_unshare() may invalidate it */ struct netem_skb_cb *cb; - struct sk_buff *skb2; + struct sk_buff *skb2 = NULL; struct sk_buff *segs = NULL; unsigned int prev_len = qdisc_pkt_len(skb); int count = 1; - int rc = NET_XMIT_SUCCESS; - int rc_drop = NET_XMIT_DROP; /* Do not fool qdisc_drop_all() */ skb->prev = NULL; @@ -480,19 +478,11 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, skb_orphan_partial(skb); /* - * If we need to duplicate packet, then re-insert at top of the - * qdisc tree, since parent queuer expects that only one - * skb will be queued. + * If we need to duplicate packet, then clone it before + * original is modified. */ - if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) { - struct Qdisc *rootq = qdisc_root_bh(sch); - u32 dupsave = q->duplicate; /* prevent duplicating a dup... */ - - q->duplicate = 0; - rootq->enqueue(skb2, rootq, to_free); - q->duplicate = dupsave; - rc_drop = NET_XMIT_SUCCESS; - } + if (count > 1) + skb2 = skb_clone(skb, GFP_ATOMIC); /* * Randomized packet corruption. @@ -504,7 +494,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (skb_is_gso(skb)) { skb = netem_segment(skb, sch, to_free); if (!skb) - return rc_drop; + goto finish_segs; + segs = skb->next; skb_mark_not_on_list(skb); qdisc_skb_cb(skb)->pkt_len = skb->len; @@ -530,7 +521,24 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* re-link segs, so that qdisc_drop_all() frees them all */ skb->next = segs; qdisc_drop_all(skb, sch, to_free); - return rc_drop; + if (skb2) + __qdisc_drop(skb2, to_free); + return NET_XMIT_DROP; + } + + /* + * If doing duplication then re-insert at top of the + * qdisc tree, since parent queuer expects that only one + * skb will be queued. + */ + if (skb2) { + struct Qdisc *rootq = qdisc_root_bh(sch); + u32 dupsave = q->duplicate; /* prevent duplicating a dup... */ + + q->duplicate = 0; + rootq->enqueue(skb2, rootq, to_free); + q->duplicate = dupsave; + skb2 = NULL; } qdisc_qstats_backlog_inc(sch, skb); @@ -601,9 +609,12 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, } finish_segs: + if (skb2) + __qdisc_drop(skb2, to_free); + if (segs) { unsigned int len, last_len; - int nb; + int rc, nb; len = skb ? skb->len : 0; nb = skb ? 1 : 0; From 0005e01e1e875c5e27130c5e2ed0189749d1e08a Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 20 Aug 2024 16:56:19 +0800 Subject: [PATCH 533/991] erofs: fix out-of-bound access when z_erofs_gbuf_growsize() partially fails If z_erofs_gbuf_growsize() partially fails on a global buffer due to memory allocation failure or fault injection (as reported by syzbot [1]), new pages need to be freed by comparing to the existing pages to avoid memory leaks. However, the old gbuf->pages[] array may not be large enough, which can lead to null-ptr-deref or out-of-bound access. Fix this by checking against gbuf->nrpages in advance. [1] https://lore.kernel.org/r/000000000000f7b96e062018c6e3@google.com Reported-by: syzbot+242ee56aaa9585553766@syzkaller.appspotmail.com Fixes: d6db47e571dc ("erofs: do not use pagepool in z_erofs_gbuf_growsize()") Cc: # 6.10+ Reviewed-by: Chunhai Guo Reviewed-by: Sandeep Dhavale Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20240820085619.1375963-1-hsiangkao@linux.alibaba.com --- fs/erofs/zutil.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c index 9b53883e5caf8a..37afe202484091 100644 --- a/fs/erofs/zutil.c +++ b/fs/erofs/zutil.c @@ -111,7 +111,8 @@ int z_erofs_gbuf_growsize(unsigned int nrpages) out: if (i < z_erofs_gbuf_count && tmp_pages) { for (j = 0; j < nrpages; ++j) - if (tmp_pages[j] && tmp_pages[j] != gbuf->pages[j]) + if (tmp_pages[j] && (j >= gbuf->nrpages || + tmp_pages[j] != gbuf->pages[j])) __free_page(tmp_pages[j]); kfree(tmp_pages); } From e255683c06df572ead96db5efb5d21be30c0efaa Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 19 Aug 2024 21:45:19 +0200 Subject: [PATCH 534/991] mptcp: pm: re-using ID of unused removed ADD_ADDR If no subflow is attached to the 'signal' endpoint that is being removed, the addr ID will not be marked as available again. Mark the linked ID as available when removing the address entry from the list to cover this case. Fixes: b6c08380860b ("mptcp: remove addr and subflow in PM netlink") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240819-net-mptcp-pm-reusing-id-v1-1-38035d40de5b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 4cae2aa7be5cbc..26f0329e16bbf6 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1431,7 +1431,10 @@ static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, ret = remove_anno_list_by_saddr(msk, addr); if (ret || force) { spin_lock_bh(&msk->pm.lock); - msk->pm.add_addr_signaled -= ret; + if (ret) { + __set_bit(addr->id, msk->pm.id_avail_bitmap); + msk->pm.add_addr_signaled--; + } mptcp_pm_remove_addr(msk, &list); spin_unlock_bh(&msk->pm.lock); } From a13d5aad4dd9a309eecdc33cfd75045bd5f376a3 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 19 Aug 2024 21:45:20 +0200 Subject: [PATCH 535/991] selftests: mptcp: join: check re-using ID of unused ADD_ADDR This test extends "delete re-add signal" to validate the previous commit. An extra address is announced by the server, but this address cannot be used by the client. The result is that no subflow will be established to this address. Later, the server will delete this extra endpoint, and set a new one, with a valid address, but re-using the same ID. Before the previous commit, the server would not have been able to announce this new address. While at it, extra checks have been added to validate the expected numbers of MPJ, ADD_ADDR and RM_ADDR. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. Fixes: b6c08380860b ("mptcp: remove addr and subflow in PM netlink") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240819-net-mptcp-pm-reusing-id-v1-2-38035d40de5b@kernel.org Signed-off-by: Jakub Kicinski --- .../testing/selftests/net/mptcp/mptcp_join.sh | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 9ea6d698e9d382..25077ccf31d275 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3601,9 +3601,11 @@ endpoint_tests() # remove and re-add if reset "delete re-add signal" && mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then - pm_nl_set_limits $ns1 1 1 - pm_nl_set_limits $ns2 1 1 + pm_nl_set_limits $ns1 0 2 + pm_nl_set_limits $ns2 2 2 pm_nl_add_endpoint $ns1 10.0.2.1 id 1 flags signal + # broadcast IP: no packet for this address will be received on ns1 + pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal test_linkfail=4 speed=20 \ run_tests $ns1 $ns2 10.0.1.1 & local tests_pid=$! @@ -3615,15 +3617,21 @@ endpoint_tests() chk_mptcp_info subflows 1 subflows 1 pm_nl_del_endpoint $ns1 1 10.0.2.1 + pm_nl_del_endpoint $ns1 2 224.0.0.1 sleep 0.5 chk_subflow_nr "after delete" 1 chk_mptcp_info subflows 0 subflows 0 - pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + pm_nl_add_endpoint $ns1 10.0.2.1 id 1 flags signal + pm_nl_add_endpoint $ns1 10.0.3.1 id 2 flags signal wait_mpj $ns2 - chk_subflow_nr "after re-add" 2 - chk_mptcp_info subflows 1 subflows 1 + chk_subflow_nr "after re-add" 3 + chk_mptcp_info subflows 2 subflows 2 mptcp_lib_kill_wait $tests_pid + + chk_join_nr 3 3 3 + chk_add_nr 4 4 + chk_rm_nr 2 1 invert fi } From edd8b5d868a4d459f3065493001e293901af758d Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 19 Aug 2024 21:45:21 +0200 Subject: [PATCH 536/991] mptcp: pm: re-using ID of unused removed subflows If no subflow is attached to the 'subflow' endpoint that is being removed, the addr ID will not be marked as available again. Mark the linked ID as available when removing the 'subflow' endpoint if no subflow is attached to it. While at it, the local_addr_used counter is decremented if the ID was marked as being used to reflect the reality, but also to allow adding new endpoints after that. Fixes: b6c08380860b ("mptcp: remove addr and subflow in PM netlink") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240819-net-mptcp-pm-reusing-id-v1-3-38035d40de5b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 26f0329e16bbf6..8b232a210a066e 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1469,8 +1469,17 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, remove_subflow = lookup_subflow_by_saddr(&msk->conn_list, addr); mptcp_pm_remove_anno_addr(msk, addr, remove_subflow && !(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)); - if (remove_subflow) + + if (remove_subflow) { mptcp_pm_remove_subflow(msk, &list); + } else if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { + /* If the subflow has been used, but now closed */ + spin_lock_bh(&msk->pm.lock); + if (!__test_and_set_bit(entry->addr.id, msk->pm.id_avail_bitmap)) + msk->pm.local_addr_used--; + spin_unlock_bh(&msk->pm.lock); + } + release_sock(sk); next: From 65fb58afa341ad68e71e5c4d816b407e6a683a66 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 19 Aug 2024 21:45:22 +0200 Subject: [PATCH 537/991] selftests: mptcp: join: check re-using ID of closed subflow This test extends "delete and re-add" to validate the previous commit. A new 'subflow' endpoint is added, but the subflow request will be rejected. The result is that no subflow will be established from this address. Later, the endpoint is removed and re-added after having cleared the firewall rule. Before the previous commit, the client would not have been able to create this new subflow. While at it, extra checks have been added to validate the expected numbers of MPJ and RM_ADDR. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. Fixes: b6c08380860b ("mptcp: remove addr and subflow in PM netlink") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240819-net-mptcp-pm-reusing-id-v1-4-38035d40de5b@kernel.org Signed-off-by: Jakub Kicinski --- .../testing/selftests/net/mptcp/mptcp_join.sh | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 25077ccf31d275..fbb0174145ad32 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -436,9 +436,10 @@ reset_with_tcp_filter() local ns="${!1}" local src="${2}" local target="${3}" + local chain="${4:-INPUT}" if ! ip netns exec "${ns}" ${iptables} \ - -A INPUT \ + -A "${chain}" \ -s "${src}" \ -p tcp \ -j "${target}"; then @@ -3571,10 +3572,10 @@ endpoint_tests() mptcp_lib_kill_wait $tests_pid fi - if reset "delete and re-add" && + if reset_with_tcp_filter "delete and re-add" ns2 10.0.3.2 REJECT OUTPUT && mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then - pm_nl_set_limits $ns1 1 1 - pm_nl_set_limits $ns2 1 1 + pm_nl_set_limits $ns1 0 2 + pm_nl_set_limits $ns2 0 2 pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow test_linkfail=4 speed=20 \ run_tests $ns1 $ns2 10.0.1.1 & @@ -3591,11 +3592,27 @@ endpoint_tests() chk_subflow_nr "after delete" 1 chk_mptcp_info subflows 0 subflows 0 - pm_nl_add_endpoint $ns2 10.0.2.2 dev ns2eth2 flags subflow + pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow wait_mpj $ns2 chk_subflow_nr "after re-add" 2 chk_mptcp_info subflows 1 subflows 1 + + pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow + wait_attempt_fail $ns2 + chk_subflow_nr "after new reject" 2 + chk_mptcp_info subflows 1 subflows 1 + + ip netns exec "${ns2}" ${iptables} -D OUTPUT -s "10.0.3.2" -p tcp -j REJECT + pm_nl_del_endpoint $ns2 3 10.0.3.2 + pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow + wait_mpj $ns2 + chk_subflow_nr "after no reject" 3 + chk_mptcp_info subflows 2 subflows 2 + mptcp_lib_kill_wait $tests_pid + + chk_join_nr 3 3 3 + chk_rm_nr 1 1 fi # remove and re-add From ef34a6ea0cab1800f4b3c9c3c2cefd5091e03379 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 19 Aug 2024 21:45:23 +0200 Subject: [PATCH 538/991] mptcp: pm: re-using ID of unused flushed subflows If no subflows are attached to the 'subflow' endpoints that are being flushed, the corresponding addr IDs will not be marked as available again. Mark all ID as being available when flushing all the 'subflow' endpoints, and reset local_addr_used counter to cover these cases. Note that mptcp_pm_remove_addrs_and_subflows() helper is only called for flushing operations, not to remove a specific set of addresses and subflows. Fixes: 06faa2271034 ("mptcp: remove multi addresses and subflows in PM") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240819-net-mptcp-pm-reusing-id-v1-5-38035d40de5b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 8b232a210a066e..2c26696b820e8a 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1623,8 +1623,15 @@ static void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, mptcp_pm_remove_addr(msk, &alist); spin_unlock_bh(&msk->pm.lock); } + if (slist.nr) mptcp_pm_remove_subflow(msk, &slist); + + /* Reset counters: maybe some subflows have been removed before */ + spin_lock_bh(&msk->pm.lock); + bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); + msk->pm.local_addr_used = 0; + spin_unlock_bh(&msk->pm.lock); } static void mptcp_nl_remove_addrs_list(struct net *net, From e06959e9eebdfea4654390f53b65cff57691872e Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 19 Aug 2024 21:45:24 +0200 Subject: [PATCH 539/991] selftests: mptcp: join: test for flush/re-add endpoints After having flushed endpoints that didn't cause the creation of new subflows, it is important to check endpoints can be re-created, re-using previously used IDs. Before the previous commit, the client would not have been able to re-create the subflow that was previously rejected. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. Fixes: 06faa2271034 ("mptcp: remove multi addresses and subflows in PM") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240819-net-mptcp-pm-reusing-id-v1-6-38035d40de5b@kernel.org Signed-off-by: Jakub Kicinski --- .../testing/selftests/net/mptcp/mptcp_join.sh | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index fbb0174145ad32..f609c02c6123e6 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3651,6 +3651,36 @@ endpoint_tests() chk_rm_nr 2 1 invert fi + # flush and re-add + if reset_with_tcp_filter "flush re-add" ns2 10.0.3.2 REJECT OUTPUT && + mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then + pm_nl_set_limits $ns1 0 2 + pm_nl_set_limits $ns2 1 2 + # broadcast IP: no packet for this address will be received on ns1 + pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal + pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow + test_linkfail=4 speed=20 \ + run_tests $ns1 $ns2 10.0.1.1 & + local tests_pid=$! + + wait_attempt_fail $ns2 + chk_subflow_nr "before flush" 1 + chk_mptcp_info subflows 0 subflows 0 + + pm_nl_flush_endpoint $ns2 + pm_nl_flush_endpoint $ns1 + wait_rm_addr $ns2 0 + ip netns exec "${ns2}" ${iptables} -D OUTPUT -s "10.0.3.2" -p tcp -j REJECT + pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow + wait_mpj $ns2 + pm_nl_add_endpoint $ns1 10.0.3.1 id 2 flags signal + wait_mpj $ns2 + mptcp_lib_kill_wait $tests_pid + + chk_join_nr 2 2 2 + chk_add_nr 2 2 + chk_rm_nr 1 0 invert + fi } # [$1: error message] From f448451aa62d54be16acb0034223c17e0d12bc69 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 19 Aug 2024 21:45:25 +0200 Subject: [PATCH 540/991] mptcp: pm: remove mptcp_pm_remove_subflow() This helper is confusing. It is in pm.c, but it is specific to the in-kernel PM and it cannot be used by the userspace one. Also, it simply calls one in-kernel specific function with the PM lock, while the similar mptcp_pm_remove_addr() helper requires the PM lock. What's left is the pr_debug(), which is not that useful, because a similar one is present in the only function called by this helper: mptcp_pm_nl_rm_subflow_received() After these modifications, this helper can be marked as 'static', and the lock can be taken only once in mptcp_pm_flush_addrs_and_subflows(). Note that it is not a bug fix, but it will help backporting the following commits. Fixes: 0ee4261a3681 ("mptcp: implement mptcp_pm_remove_subflow") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240819-net-mptcp-pm-reusing-id-v1-7-38035d40de5b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 10 ---------- net/mptcp/pm_netlink.c | 16 +++++++--------- net/mptcp/protocol.h | 3 --- 3 files changed, 7 insertions(+), 22 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 23bb89c94e90d6..925123e99889bd 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -60,16 +60,6 @@ int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_ return 0; } -int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list) -{ - pr_debug("msk=%p, rm_list_nr=%d", msk, rm_list->nr); - - spin_lock_bh(&msk->pm.lock); - mptcp_pm_nl_rm_subflow_received(msk, rm_list); - spin_unlock_bh(&msk->pm.lock); - return 0; -} - /* path manager event handlers */ void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 2c26696b820e8a..44fc1c5959ac8d 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -857,8 +857,8 @@ static void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk) mptcp_pm_nl_rm_addr_or_subflow(msk, &msk->pm.rm_list_rx, MPTCP_MIB_RMADDR); } -void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, - const struct mptcp_rm_list *rm_list) +static void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, + const struct mptcp_rm_list *rm_list) { mptcp_pm_nl_rm_addr_or_subflow(msk, rm_list, MPTCP_MIB_RMSUBFLOW); } @@ -1471,7 +1471,9 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, !(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)); if (remove_subflow) { - mptcp_pm_remove_subflow(msk, &list); + spin_lock_bh(&msk->pm.lock); + mptcp_pm_nl_rm_subflow_received(msk, &list); + spin_unlock_bh(&msk->pm.lock); } else if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { /* If the subflow has been used, but now closed */ spin_lock_bh(&msk->pm.lock); @@ -1617,18 +1619,14 @@ static void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, alist.ids[alist.nr++] = entry->addr.id; } + spin_lock_bh(&msk->pm.lock); if (alist.nr) { - spin_lock_bh(&msk->pm.lock); msk->pm.add_addr_signaled -= alist.nr; mptcp_pm_remove_addr(msk, &alist); - spin_unlock_bh(&msk->pm.lock); } - if (slist.nr) - mptcp_pm_remove_subflow(msk, &slist); - + mptcp_pm_nl_rm_subflow_received(msk, &slist); /* Reset counters: maybe some subflows have been removed before */ - spin_lock_bh(&msk->pm.lock); bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); msk->pm.local_addr_used = 0; spin_unlock_bh(&msk->pm.lock); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 60c6b073d65fe5..a1c1b0ff1ce1c8 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1026,7 +1026,6 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool echo); int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); -int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list); void mptcp_free_local_addr_list(struct mptcp_sock *msk); @@ -1133,8 +1132,6 @@ static inline u8 subflow_get_local_id(const struct mptcp_subflow_context *subflo void __init mptcp_pm_nl_init(void); void mptcp_pm_nl_work(struct mptcp_sock *msk); -void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, - const struct mptcp_rm_list *rm_list); unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk); From 322ea3778965da72862cca2a0c50253aacf65fe6 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 19 Aug 2024 21:45:26 +0200 Subject: [PATCH 541/991] mptcp: pm: only mark 'subflow' endp as available Adding the following warning ... WARN_ON_ONCE(msk->pm.local_addr_used == 0) ... before decrementing the local_addr_used counter helped to find a bug when running the "remove single address" subtest from the mptcp_join.sh selftests. Removing a 'signal' endpoint will trigger the removal of all subflows linked to this endpoint via mptcp_pm_nl_rm_addr_or_subflow() with rm_type == MPTCP_MIB_RMSUBFLOW. This will decrement the local_addr_used counter, which is wrong in this case because this counter is linked to 'subflow' endpoints, and here it is a 'signal' endpoint that is being removed. Now, the counter is decremented, only if the ID is being used outside of mptcp_pm_nl_rm_addr_or_subflow(), only for 'subflow' endpoints, and if the ID is not 0 -- local_addr_used is not taking into account these ones. This marking of the ID as being available, and the decrement is done no matter if a subflow using this ID is currently available, because the subflow could have been closed before. Fixes: 06faa2271034 ("mptcp: remove multi addresses and subflows in PM") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240819-net-mptcp-pm-reusing-id-v1-8-38035d40de5b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 44fc1c5959ac8d..4cf7cc851f80a5 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -833,10 +833,10 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, if (rm_type == MPTCP_MIB_RMSUBFLOW) __MPTCP_INC_STATS(sock_net(sk), rm_type); } - if (rm_type == MPTCP_MIB_RMSUBFLOW) - __set_bit(rm_id ? rm_id : msk->mpc_endpoint_id, msk->pm.id_avail_bitmap); - else if (rm_type == MPTCP_MIB_RMADDR) + + if (rm_type == MPTCP_MIB_RMADDR) __MPTCP_INC_STATS(sock_net(sk), rm_type); + if (!removed) continue; @@ -846,8 +846,6 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, if (rm_type == MPTCP_MIB_RMADDR) { msk->pm.add_addr_accepted--; WRITE_ONCE(msk->pm.accept_addr, true); - } else if (rm_type == MPTCP_MIB_RMSUBFLOW) { - msk->pm.local_addr_used--; } } } @@ -1441,6 +1439,14 @@ static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, return ret; } +static void __mark_subflow_endp_available(struct mptcp_sock *msk, u8 id) +{ + /* If it was marked as used, and not ID 0, decrement local_addr_used */ + if (!__test_and_set_bit(id ? : msk->mpc_endpoint_id, msk->pm.id_avail_bitmap) && + id && !WARN_ON_ONCE(msk->pm.local_addr_used == 0)) + msk->pm.local_addr_used--; +} + static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, const struct mptcp_pm_addr_entry *entry) { @@ -1474,11 +1480,11 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, spin_lock_bh(&msk->pm.lock); mptcp_pm_nl_rm_subflow_received(msk, &list); spin_unlock_bh(&msk->pm.lock); - } else if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { - /* If the subflow has been used, but now closed */ + } + + if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { spin_lock_bh(&msk->pm.lock); - if (!__test_and_set_bit(entry->addr.id, msk->pm.id_avail_bitmap)) - msk->pm.local_addr_used--; + __mark_subflow_endp_available(msk, list.ids[0]); spin_unlock_bh(&msk->pm.lock); } @@ -1516,6 +1522,7 @@ static int mptcp_nl_remove_id_zero_address(struct net *net, spin_lock_bh(&msk->pm.lock); mptcp_pm_remove_addr(msk, &list); mptcp_pm_nl_rm_subflow_received(msk, &list); + __mark_subflow_endp_available(msk, 0); spin_unlock_bh(&msk->pm.lock); release_sock(sk); @@ -1917,6 +1924,7 @@ static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk, spin_lock_bh(&msk->pm.lock); mptcp_pm_nl_rm_subflow_received(msk, &list); + __mark_subflow_endp_available(msk, list.ids[0]); mptcp_pm_create_subflow_or_signal_addr(msk); spin_unlock_bh(&msk->pm.lock); } From 1c1f721375989579e46741f59523e39ec9b2a9bd Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 19 Aug 2024 21:45:27 +0200 Subject: [PATCH 542/991] mptcp: pm: only decrement add_addr_accepted for MPJ req Adding the following warning ... WARN_ON_ONCE(msk->pm.add_addr_accepted == 0) ... before decrementing the add_addr_accepted counter helped to find a bug when running the "remove single subflow" subtest from the mptcp_join.sh selftest. Removing a 'subflow' endpoint will first trigger a RM_ADDR, then the subflow closure. Before this patch, and upon the reception of the RM_ADDR, the other peer will then try to decrement this add_addr_accepted. That's not correct because the attached subflows have not been created upon the reception of an ADD_ADDR. A way to solve that is to decrement the counter only if the attached subflow was an MP_JOIN to a remote id that was not 0, and initiated by the host receiving the RM_ADDR. Fixes: d0876b2284cf ("mptcp: add the incoming RM_ADDR support") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240819-net-mptcp-pm-reusing-id-v1-9-38035d40de5b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 4cf7cc851f80a5..882781571c7b4a 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -829,7 +829,7 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, mptcp_close_ssk(sk, ssk, subflow); spin_lock_bh(&msk->pm.lock); - removed = true; + removed |= subflow->request_join; if (rm_type == MPTCP_MIB_RMSUBFLOW) __MPTCP_INC_STATS(sock_net(sk), rm_type); } @@ -843,7 +843,11 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, if (!mptcp_pm_is_kernel(msk)) continue; - if (rm_type == MPTCP_MIB_RMADDR) { + if (rm_type == MPTCP_MIB_RMADDR && rm_id && + !WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) { + /* Note: if the subflow has been closed before, this + * add_addr_accepted counter will not be decremented. + */ msk->pm.add_addr_accepted--; WRITE_ONCE(msk->pm.accept_addr, true); } From 0137a3c7c2ea3f9df8ebfc65d78b4ba712a187bb Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 19 Aug 2024 21:45:28 +0200 Subject: [PATCH 543/991] mptcp: pm: check add_addr_accept_max before accepting new ADD_ADDR The limits might have changed in between, it is best to check them before accepting new ADD_ADDR. Fixes: d0876b2284cf ("mptcp: add the incoming RM_ADDR support") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240819-net-mptcp-pm-reusing-id-v1-10-38035d40de5b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 882781571c7b4a..28a9a37261461d 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -848,8 +848,8 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, /* Note: if the subflow has been closed before, this * add_addr_accepted counter will not be decremented. */ - msk->pm.add_addr_accepted--; - WRITE_ONCE(msk->pm.accept_addr, true); + if (--msk->pm.add_addr_accepted < mptcp_pm_get_add_addr_accept_max(msk)) + WRITE_ONCE(msk->pm.accept_addr, true); } } } From ca6e55a703ca2894611bb5c5bca8bfd2290fd91e Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 19 Aug 2024 21:45:29 +0200 Subject: [PATCH 544/991] mptcp: pm: only in-kernel cannot have entries with ID 0 The ID 0 is specific per MPTCP connections. The per netns entries cannot have this special ID 0 then. But that's different for the userspace PM where the entries are per connection, they can then use this special ID 0. Fixes: f40be0db0b76 ("mptcp: unify pm get_flags_and_ifindex_by_id") Cc: stable@vger.kernel.org Acked-by: Geliang Tang Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240819-net-mptcp-pm-reusing-id-v1-11-38035d40de5b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 3 --- net/mptcp/pm_netlink.c | 4 ++++ 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 925123e99889bd..3e6e0f5510bb13 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -434,9 +434,6 @@ int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id *flags = 0; *ifindex = 0; - if (!id) - return 0; - if (mptcp_pm_is_userspace(msk)) return mptcp_userspace_pm_get_flags_and_ifindex_by_id(msk, id, flags, ifindex); return mptcp_pm_nl_get_flags_and_ifindex_by_id(msk, id, flags, ifindex); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 28a9a37261461d..d0a80f537fc3e6 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1395,6 +1395,10 @@ int mptcp_pm_nl_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int struct sock *sk = (struct sock *)msk; struct net *net = sock_net(sk); + /* No entries with ID 0 */ + if (id == 0) + return 0; + rcu_read_lock(); entry = __lookup_addr_by_id(pm_nl_get_pernet(net), id); if (entry) { From 09355f7abb9fbfc1a240be029837921ea417bf4f Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 19 Aug 2024 21:45:30 +0200 Subject: [PATCH 545/991] mptcp: pm: fullmesh: select the right ID later When reacting upon the reception of an ADD_ADDR, the in-kernel PM first looks for fullmesh endpoints. If there are some, it will pick them, using their entry ID. It should set the ID 0 when using the endpoint corresponding to the initial subflow, it is a special case imposed by the MPTCP specs. Note that msk->mpc_endpoint_id might not be set when receiving the first ADD_ADDR from the server. So better to compare the addresses. Fixes: 1a0d6136c5f0 ("mptcp: local addresses fullmesh") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240819-net-mptcp-pm-reusing-id-v1-12-38035d40de5b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index d0a80f537fc3e6..a2e37ab1c40fc6 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -636,6 +636,7 @@ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, { struct sock *sk = (struct sock *)msk; struct mptcp_pm_addr_entry *entry; + struct mptcp_addr_info mpc_addr; struct pm_nl_pernet *pernet; unsigned int subflows_max; int i = 0; @@ -643,6 +644,8 @@ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, pernet = pm_nl_get_pernet_from_msk(msk); subflows_max = mptcp_pm_get_subflows_max(msk); + mptcp_local_address((struct sock_common *)msk, &mpc_addr); + rcu_read_lock(); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH)) @@ -653,7 +656,13 @@ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, if (msk->pm.subflows < subflows_max) { msk->pm.subflows++; - addrs[i++] = entry->addr; + addrs[i] = entry->addr; + + /* Special case for ID0: set the correct ID */ + if (mptcp_addresses_equal(&entry->addr, &mpc_addr, entry->addr.port)) + addrs[i].id = 0; + + i++; } } rcu_read_unlock(); From 4878f9f8421f4587bee7b232c1c8a9d3a7d4d782 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 19 Aug 2024 21:45:31 +0200 Subject: [PATCH 546/991] selftests: mptcp: join: validate fullmesh endp on 1st sf This case was not covered, and the wrong ID was set before the previous commit. The rest is not modified, it is just that it will increase the code coverage. The right address ID can be verified by looking at the packet traces. We could automate that using Netfilter with some cBPF code for example, but that's always a bit cryptic. Packetdrill seems better fitted for that. Fixes: 4f49d63352da ("selftests: mptcp: add fullmesh testcases") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240819-net-mptcp-pm-reusing-id-v1-13-38035d40de5b@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index f609c02c6123e6..89e553e0e0c2ef 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3059,6 +3059,7 @@ fullmesh_tests() pm_nl_set_limits $ns1 1 3 pm_nl_set_limits $ns2 1 3 pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + pm_nl_add_endpoint $ns2 10.0.1.2 flags subflow,fullmesh fullmesh=1 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 3 3 3 From 48e50dcbcbaaf713d82bf2da5c16aeced94ad07d Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 19 Aug 2024 21:45:32 +0200 Subject: [PATCH 547/991] mptcp: pm: avoid possible UaF when selecting endp select_local_address() and select_signal_address() both select an endpoint entry from the list inside an RCU protected section, but return a reference to it, to be read later on. If the entry is dereferenced after the RCU unlock, reading info could cause a Use-after-Free. A simple solution is to copy the required info while inside the RCU protected section to avoid any risk of UaF later. The address ID might need to be modified later to handle the ID0 case later, so a copy seems OK to deal with. Reported-by: Paolo Abeni Closes: https://lore.kernel.org/45cd30d3-7710-491c-ae4d-a1368c00beb1@redhat.com Fixes: 01cacb00b35c ("mptcp: add netlink-based PM") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240819-net-mptcp-pm-reusing-id-v1-14-38035d40de5b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 64 ++++++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 30 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index a2e37ab1c40fc6..3e4ad801786f2b 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -143,11 +143,13 @@ static bool lookup_subflow_by_daddr(const struct list_head *list, return false; } -static struct mptcp_pm_addr_entry * +static bool select_local_address(const struct pm_nl_pernet *pernet, - const struct mptcp_sock *msk) + const struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *new_entry) { - struct mptcp_pm_addr_entry *entry, *ret = NULL; + struct mptcp_pm_addr_entry *entry; + bool found = false; msk_owned_by_me(msk); @@ -159,17 +161,21 @@ select_local_address(const struct pm_nl_pernet *pernet, if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap)) continue; - ret = entry; + *new_entry = *entry; + found = true; break; } rcu_read_unlock(); - return ret; + + return found; } -static struct mptcp_pm_addr_entry * -select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk) +static bool +select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *new_entry) { - struct mptcp_pm_addr_entry *entry, *ret = NULL; + struct mptcp_pm_addr_entry *entry; + bool found = false; rcu_read_lock(); /* do not keep any additional per socket state, just signal @@ -184,11 +190,13 @@ select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk) if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) continue; - ret = entry; + *new_entry = *entry; + found = true; break; } rcu_read_unlock(); - return ret; + + return found; } unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk) @@ -512,9 +520,10 @@ __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) { - struct mptcp_pm_addr_entry *local, *signal_and_subflow = NULL; struct sock *sk = (struct sock *)msk; + struct mptcp_pm_addr_entry local; unsigned int add_addr_signal_max; + bool signal_and_subflow = false; unsigned int local_addr_max; struct pm_nl_pernet *pernet; unsigned int subflows_max; @@ -565,23 +574,22 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) if (msk->pm.addr_signal & BIT(MPTCP_ADD_ADDR_SIGNAL)) return; - local = select_signal_address(pernet, msk); - if (!local) + if (!select_signal_address(pernet, msk, &local)) goto subflow; /* If the alloc fails, we are on memory pressure, not worth * continuing, and trying to create subflows. */ - if (!mptcp_pm_alloc_anno_list(msk, &local->addr)) + if (!mptcp_pm_alloc_anno_list(msk, &local.addr)) return; - __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); + __clear_bit(local.addr.id, msk->pm.id_avail_bitmap); msk->pm.add_addr_signaled++; - mptcp_pm_announce_addr(msk, &local->addr, false); + mptcp_pm_announce_addr(msk, &local.addr, false); mptcp_pm_nl_addr_send_ack(msk); - if (local->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) - signal_and_subflow = local; + if (local.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) + signal_and_subflow = true; } subflow: @@ -592,26 +600,22 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) bool fullmesh; int i, nr; - if (signal_and_subflow) { - local = signal_and_subflow; - signal_and_subflow = NULL; - } else { - local = select_local_address(pernet, msk); - if (!local) - break; - } + if (signal_and_subflow) + signal_and_subflow = false; + else if (!select_local_address(pernet, msk, &local)) + break; - fullmesh = !!(local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH); + fullmesh = !!(local.flags & MPTCP_PM_ADDR_FLAG_FULLMESH); msk->pm.local_addr_used++; - __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); - nr = fill_remote_addresses_vec(msk, &local->addr, fullmesh, addrs); + __clear_bit(local.addr.id, msk->pm.id_avail_bitmap); + nr = fill_remote_addresses_vec(msk, &local.addr, fullmesh, addrs); if (nr == 0) continue; spin_unlock_bh(&msk->pm.lock); for (i = 0; i < nr; i++) - __mptcp_subflow_connect(sk, &local->addr, &addrs[i]); + __mptcp_subflow_connect(sk, &local.addr, &addrs[i]); spin_lock_bh(&msk->pm.lock); } mptcp_pm_nl_check_work_pending(msk); From 0b43312902d165c4c8429cd49e8c91479f52b7c4 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 13 Aug 2024 13:51:48 +0800 Subject: [PATCH 548/991] drm/amdgpu: fixing rlc firmware loading failure issue Skip rlc firmware validation to ignore firmware header size mismatch issues. This restores the workaround added in commit 849e133c973c ("drm/amdgpu: Fix the null pointer when load rlc firmware") Fixes: 3af2c80ae2f5 ("drm/amdgpu: refine gfx10 firmware loading") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3551 Signed-off-by: Yang Wang Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 89ec85d16eb8110d88c273d1d34f1fe5a70ba8cc) --- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index 2957702fca0c66..e444e621ddaa01 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -4116,6 +4116,7 @@ static void gfx_v10_0_check_gfxoff_flag(struct amdgpu_device *adev) static int gfx_v10_0_init_microcode(struct amdgpu_device *adev) { + char fw_name[53]; char ucode_prefix[30]; const char *wks = ""; int err; @@ -4149,8 +4150,8 @@ static int gfx_v10_0_init_microcode(struct amdgpu_device *adev) amdgpu_gfx_cp_init_microcode(adev, AMDGPU_UCODE_ID_CP_CE); if (!amdgpu_sriov_vf(adev)) { - err = amdgpu_ucode_request(adev, &adev->gfx.rlc_fw, - "amdgpu/%s_rlc.bin", ucode_prefix); + snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_rlc.bin", ucode_prefix); + err = request_firmware(&adev->gfx.rlc_fw, fw_name, adev->dev); if (err) goto out; From e3e4bf58bad1576ac732a1429f53e3d4bfb82b4b Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 14 Aug 2024 10:28:24 -0400 Subject: [PATCH 549/991] drm/amdgpu/sdma5.2: limit wptr workaround to sdma 5.2.1 The workaround seems to cause stability issues on other SDMA 5.2.x IPs. Fixes: a03ebf116303 ("drm/amdgpu/sdma5.2: Update wptr registers as well as doorbell") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3556 Acked-by: Ruijing Dong Signed-off-by: Alex Deucher (cherry picked from commit 2dc3851ef7d9c5439ea8e9623fc36878f3b40649) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c index af1e90159ce36c..2e72d445415f9e 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c @@ -176,14 +176,16 @@ static void sdma_v5_2_ring_set_wptr(struct amdgpu_ring *ring) DRM_DEBUG("calling WDOORBELL64(0x%08x, 0x%016llx)\n", ring->doorbell_index, ring->wptr << 2); WDOORBELL64(ring->doorbell_index, ring->wptr << 2); - /* SDMA seems to miss doorbells sometimes when powergating kicks in. - * Updating the wptr directly will wake it. This is only safe because - * we disallow gfxoff in begin_use() and then allow it again in end_use(). - */ - WREG32(sdma_v5_2_get_reg_offset(adev, ring->me, mmSDMA0_GFX_RB_WPTR), - lower_32_bits(ring->wptr << 2)); - WREG32(sdma_v5_2_get_reg_offset(adev, ring->me, mmSDMA0_GFX_RB_WPTR_HI), - upper_32_bits(ring->wptr << 2)); + if (amdgpu_ip_version(adev, SDMA0_HWIP, 0) == IP_VERSION(5, 2, 1)) { + /* SDMA seems to miss doorbells sometimes when powergating kicks in. + * Updating the wptr directly will wake it. This is only safe because + * we disallow gfxoff in begin_use() and then allow it again in end_use(). + */ + WREG32(sdma_v5_2_get_reg_offset(adev, ring->me, mmSDMA0_GFX_RB_WPTR), + lower_32_bits(ring->wptr << 2)); + WREG32(sdma_v5_2_get_reg_offset(adev, ring->me, mmSDMA0_GFX_RB_WPTR_HI), + upper_32_bits(ring->wptr << 2)); + } } else { DRM_DEBUG("Not using doorbell -- " "mmSDMA%i_GFX_RB_WPTR == 0x%08x " From c99769bceab4ecb6a067b9af11f9db281eea3e2a Mon Sep 17 00:00:00 2001 From: Candice Li Date: Thu, 15 Aug 2024 11:37:28 +0800 Subject: [PATCH 550/991] drm/amdgpu: Validate TA binary size Add TA binary size validation to avoid OOB write. Signed-off-by: Candice Li Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit c0a04e3570d72aaf090962156ad085e37c62e442) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c index 0c856005df6b95..38face981c3e38 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c @@ -166,6 +166,9 @@ static ssize_t ta_if_load_debugfs_write(struct file *fp, const char *buf, size_t if (ret) return -EFAULT; + if (ta_bin_len > PSP_1_MEG) + return -EINVAL; + copy_pos += sizeof(uint32_t); ta_bin = kzalloc(ta_bin_len, GFP_KERNEL); From 9cead81eff635e3b3cbce51b40228f3bdc6f2b8c Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 19 Aug 2024 11:14:29 -0400 Subject: [PATCH 551/991] drm/amdgpu: fix eGPU hotplug regression The driver needs to wait for the on board firmware to finish its initialization before probing the card. Commit 959056982a9b ("drm/amdgpu: Fix discovery initialization failure during pci rescan") switched from using msleep() to using usleep_range() which seems to have caused init failures on some navi1x boards. Switch back to msleep(). Fixes: 959056982a9b ("drm/amdgpu: Fix discovery initialization failure during pci rescan") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3559 Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3500 Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher Cc: Ma Jun (cherry picked from commit c69b07f7bbc905022491c45097923d3487479529) Cc: stable@vger.kernel.org # 6.10.x --- drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c index ac108fca64fe62..7b561e8e3cafc0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c @@ -278,7 +278,7 @@ static int amdgpu_discovery_read_binary_from_mem(struct amdgpu_device *adev, msg = RREG32(mmMP0_SMN_C2PMSG_33); if (msg & 0x80000000) break; - usleep_range(1000, 1100); + msleep(1); } } From 822c8020aebcf5804a143b891e34f29873fee5e2 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 20 Aug 2024 13:03:58 +1000 Subject: [PATCH 552/991] ata: pata_macio: Fix DMA table overflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kolbjørn and Jonáš reported that their 32-bit PowerMacs were crashing in pata-macio since commit 09fe2bfa6b83 ("ata: pata_macio: Fix max_segment_size with PAGE_SIZE == 64K"). For example: kernel BUG at drivers/ata/pata_macio.c:544! Oops: Exception in kernel mode, sig: 5 [#1] BE PAGE_SIZE=4K MMU=Hash SMP NR_CPUS=2 DEBUG_PAGEALLOC PowerMac ... NIP pata_macio_qc_prep+0xf4/0x190 LR pata_macio_qc_prep+0xfc/0x190 Call Trace: 0xc1421660 (unreliable) ata_qc_issue+0x14c/0x2d4 __ata_scsi_queuecmd+0x200/0x53c ata_scsi_queuecmd+0x50/0xe0 scsi_queue_rq+0x788/0xb1c __blk_mq_issue_directly+0x58/0xf4 blk_mq_plug_issue_direct+0x8c/0x1b4 blk_mq_flush_plug_list.part.0+0x584/0x5e0 __blk_flush_plug+0xf8/0x194 __submit_bio+0x1b8/0x2e0 submit_bio_noacct_nocheck+0x230/0x304 btrfs_work_helper+0x200/0x338 process_one_work+0x1a8/0x338 worker_thread+0x364/0x4c0 kthread+0x100/0x104 start_kernel_thread+0x10/0x14 That commit increased max_segment_size to 64KB, with the justification that the SCSI core was already using that size when PAGE_SIZE == 64KB, and that there was existing logic to split over-sized requests. However with a sufficiently large request, the splitting logic causes each sg to be split into two commands in the DMA table, leading to overflow of the DMA table, triggering the BUG_ON(). With default settings the bug doesn't trigger, because the request size is limited by max_sectors_kb == 1280, however max_sectors_kb can be increased, and apparently some distros do that by default using udev rules. Fix the bug for 4KB kernels by reverting to the old max_segment_size. For 64KB kernels the sg_tablesize needs to be halved, to allow for the possibility that each sg will be split into two. Fixes: 09fe2bfa6b83 ("ata: pata_macio: Fix max_segment_size with PAGE_SIZE == 64K") Cc: stable@vger.kernel.org # v6.10+ Reported-by: Kolbjørn Barmen Closes: https://lore.kernel.org/all/62d248bb-e97a-25d2-bcf2-9160c518cae5@kolla.no/ Reported-by: Jonáš Vidra Closes: https://lore.kernel.org/all/3b6441b8-06e6-45da-9e55-f92f2c86933e@ufal.mff.cuni.cz/ Tested-by: Kolbjørn Barmen Signed-off-by: Michael Ellerman Signed-off-by: Damien Le Moal --- drivers/ata/pata_macio.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/drivers/ata/pata_macio.c b/drivers/ata/pata_macio.c index 1b85e8bf4ef91b..1cb8d24b088f8e 100644 --- a/drivers/ata/pata_macio.c +++ b/drivers/ata/pata_macio.c @@ -208,6 +208,19 @@ static const char* macio_ata_names[] = { /* Don't let a DMA segment go all the way to 64K */ #define MAX_DBDMA_SEG 0xff00 +#ifdef CONFIG_PAGE_SIZE_64KB +/* + * The SCSI core requires the segment size to cover at least a page, so + * for 64K page size kernels it must be at least 64K. However the + * hardware can't handle 64K, so pata_macio_qc_prep() will split large + * requests. To handle the split requests the tablesize must be halved. + */ +#define PATA_MACIO_MAX_SEGMENT_SIZE SZ_64K +#define PATA_MACIO_SG_TABLESIZE (MAX_DCMDS / 2) +#else +#define PATA_MACIO_MAX_SEGMENT_SIZE MAX_DBDMA_SEG +#define PATA_MACIO_SG_TABLESIZE MAX_DCMDS +#endif /* * Wait 1s for disk to answer on IDE bus after a hard reset @@ -912,16 +925,10 @@ static int pata_macio_do_resume(struct pata_macio_priv *priv) static const struct scsi_host_template pata_macio_sht = { __ATA_BASE_SHT(DRV_NAME), - .sg_tablesize = MAX_DCMDS, + .sg_tablesize = PATA_MACIO_SG_TABLESIZE, /* We may not need that strict one */ .dma_boundary = ATA_DMA_BOUNDARY, - /* - * The SCSI core requires the segment size to cover at least a page, so - * for 64K page size kernels this must be at least 64K. However the - * hardware can't handle 64K, so pata_macio_qc_prep() will split large - * requests. - */ - .max_segment_size = SZ_64K, + .max_segment_size = PATA_MACIO_MAX_SEGMENT_SIZE, .device_configure = pata_macio_device_configure, .sdev_groups = ata_common_sdev_groups, .can_queue = ATA_DEF_QUEUE, From d4bc0a264fb482b019c84fbc7202dd3cab059087 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 20 Aug 2024 13:04:07 +1000 Subject: [PATCH 553/991] ata: pata_macio: Use WARN instead of BUG The overflow/underflow conditions in pata_macio_qc_prep() should never happen. But if they do there's no need to kill the system entirely, a WARN and failing the IO request should be sufficient and might allow the system to keep running. Signed-off-by: Michael Ellerman Signed-off-by: Damien Le Moal --- drivers/ata/pata_macio.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/ata/pata_macio.c b/drivers/ata/pata_macio.c index 1cb8d24b088f8e..f2f36e55a1f4d2 100644 --- a/drivers/ata/pata_macio.c +++ b/drivers/ata/pata_macio.c @@ -554,7 +554,8 @@ static enum ata_completion_errors pata_macio_qc_prep(struct ata_queued_cmd *qc) while (sg_len) { /* table overflow should never happen */ - BUG_ON (pi++ >= MAX_DCMDS); + if (WARN_ON_ONCE(pi >= MAX_DCMDS)) + return AC_ERR_SYSTEM; len = (sg_len < MAX_DBDMA_SEG) ? sg_len : MAX_DBDMA_SEG; table->command = cpu_to_le16(write ? OUTPUT_MORE: INPUT_MORE); @@ -566,11 +567,13 @@ static enum ata_completion_errors pata_macio_qc_prep(struct ata_queued_cmd *qc) addr += len; sg_len -= len; ++table; + ++pi; } } /* Should never happen according to Tejun */ - BUG_ON(!pi); + if (WARN_ON_ONCE(!pi)) + return AC_ERR_SYSTEM; /* Convert the last command to an input/output */ table--; From e0ee967630c8ee67bb47a5b38d235cd5a8789c48 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 20 Aug 2024 18:31:58 -0600 Subject: [PATCH 554/991] io_uring/kbuf: sanitize peek buffer setup Harden the buffer peeking a bit, by adding a sanity check for it having a valid size. Outside of that, arg->max_len is a size_t, though it's only ever set to a 32-bit value (as it's governed by MAX_RW_COUNT). Bump our needed check to a size_t so we know it fits. Finally, cap the calculated needed iov value to the PEEK_MAX_IMPORT, which is the maximum number of segments that should be peeked. Fixes: 35c8711c8fc4 ("io_uring/kbuf: add helpers for getting/peeking multiple buffers") Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index c95dc1736dd930..1af2bd56af44ac 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -218,10 +218,13 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, buf = io_ring_head_to_buf(br, head, bl->mask); if (arg->max_len) { - int needed; + u32 len = READ_ONCE(buf->len); + size_t needed; - needed = (arg->max_len + buf->len - 1) / buf->len; - needed = min(needed, PEEK_MAX_IMPORT); + if (unlikely(!len)) + return -ENOBUFS; + needed = (arg->max_len + len - 1) / len; + needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT); if (nr_avail > needed) nr_avail = needed; } From 91191a6e50a2ff752da244493171037663536768 Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Wed, 21 Aug 2024 12:47:11 +0000 Subject: [PATCH 555/991] ALSA: hda: cs35l56: Don't use the device index as a calibration index The HDA driver cannot assume that the order that the devices are specified in the cirrus,dev-index matches the order of calibration entries. Only a calibration entry with a matching silicon id will be used. Fixes: cfa43aaa7948 ("ALSA: hda: cs35l56: Apply amp calibration from EFI data") Signed-off-by: Simon Trimmer Link: https://patch.msgid.link/20240821124711.44325-1-simont@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/cs35l56_hda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/cs35l56_hda.c b/sound/pci/hda/cs35l56_hda.c index a9dfd62637cf4c..e3ac0e23ae3211 100644 --- a/sound/pci/hda/cs35l56_hda.c +++ b/sound/pci/hda/cs35l56_hda.c @@ -1003,7 +1003,7 @@ int cs35l56_hda_common_probe(struct cs35l56_hda *cs35l56, int hid, int id) goto err; } - cs35l56->base.cal_index = cs35l56->index; + cs35l56->base.cal_index = -1; cs35l56_init_cs_dsp(&cs35l56->base, &cs35l56->cs_dsp); cs35l56->cs_dsp.client_ops = &cs35l56_hda_client_ops; From 3568affcddd68743e25aa3ec1647d9b82797757b Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Tue, 20 Aug 2024 13:29:30 -0700 Subject: [PATCH 556/991] soc: qcom: pmic_glink: Fix race during initialization As pointed out by Stephen Boyd it is possible that during initialization of the pmic_glink child drivers, the protection-domain notifiers fires, and the associated work is scheduled, before the client registration returns and as a result the local "client" pointer has been initialized. The outcome of this is a NULL pointer dereference as the "client" pointer is blindly dereferenced. Timeline provided by Stephen: CPU0 CPU1 ---- ---- ucsi->client = NULL; devm_pmic_glink_register_client() client->pdr_notify(client->priv, pg->client_state) pmic_glink_ucsi_pdr_notify() schedule_work(&ucsi->register_work) pmic_glink_ucsi_register() ucsi_register() pmic_glink_ucsi_read_version() pmic_glink_ucsi_read() pmic_glink_ucsi_read() pmic_glink_send(ucsi->client) ucsi->client = client // Too late! This code is identical across the altmode, battery manager and usci child drivers. Resolve this by splitting the allocation of the "client" object and the registration thereof into two operations. This only happens if the protection domain registry is populated at the time of registration, which by the introduction of commit '1ebcde047c54 ("soc: qcom: add pd-mapper implementation")' became much more likely. Reported-by: Amit Pundir Closes: https://lore.kernel.org/all/CAMi1Hd2_a7TjA7J9ShrAbNOd_CoZ3D87twmO5t+nZxC9sX18tA@mail.gmail.com/ Reported-by: Johan Hovold Closes: https://lore.kernel.org/all/ZqiyLvP0gkBnuekL@hovoldconsulting.com/ Reported-by: Stephen Boyd Closes: https://lore.kernel.org/all/CAE-0n52JgfCBWiFQyQWPji8cq_rCsviBpW-m72YitgNfdaEhQg@mail.gmail.com/ Fixes: 58ef4ece1e41 ("soc: qcom: pmic_glink: Introduce base PMIC GLINK driver") Cc: stable@vger.kernel.org Reviewed-by: Heikki Krogerus Reviewed-by: Neil Armstrong Tested-by: Amit Pundir Reviewed-by: Johan Hovold Acked-by: Sebastian Reichel Tested-by: Johan Hovold Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20240820-pmic-glink-v6-11-races-v3-1-eec53c750a04@quicinc.com Signed-off-by: Bjorn Andersson --- drivers/power/supply/qcom_battmgr.c | 16 +++++++++------ drivers/soc/qcom/pmic_glink.c | 28 +++++++++++++++++---------- drivers/soc/qcom/pmic_glink_altmode.c | 17 ++++++++++------ drivers/usb/typec/ucsi/ucsi_glink.c | 16 +++++++++------ include/linux/soc/qcom/pmic_glink.h | 11 ++++++----- 5 files changed, 55 insertions(+), 33 deletions(-) diff --git a/drivers/power/supply/qcom_battmgr.c b/drivers/power/supply/qcom_battmgr.c index 46f36dcb185c36..7cf19a39d98605 100644 --- a/drivers/power/supply/qcom_battmgr.c +++ b/drivers/power/supply/qcom_battmgr.c @@ -1385,12 +1385,16 @@ static int qcom_battmgr_probe(struct auxiliary_device *adev, "failed to register wireless charing power supply\n"); } - battmgr->client = devm_pmic_glink_register_client(dev, - PMIC_GLINK_OWNER_BATTMGR, - qcom_battmgr_callback, - qcom_battmgr_pdr_notify, - battmgr); - return PTR_ERR_OR_ZERO(battmgr->client); + battmgr->client = devm_pmic_glink_client_alloc(dev, PMIC_GLINK_OWNER_BATTMGR, + qcom_battmgr_callback, + qcom_battmgr_pdr_notify, + battmgr); + if (IS_ERR(battmgr->client)) + return PTR_ERR(battmgr->client); + + pmic_glink_client_register(battmgr->client); + + return 0; } static const struct auxiliary_device_id qcom_battmgr_id_table[] = { diff --git a/drivers/soc/qcom/pmic_glink.c b/drivers/soc/qcom/pmic_glink.c index 9ebc0ba359477a..53b176d04fbdfe 100644 --- a/drivers/soc/qcom/pmic_glink.c +++ b/drivers/soc/qcom/pmic_glink.c @@ -66,15 +66,14 @@ static void _devm_pmic_glink_release_client(struct device *dev, void *res) spin_unlock_irqrestore(&pg->client_lock, flags); } -struct pmic_glink_client *devm_pmic_glink_register_client(struct device *dev, - unsigned int id, - void (*cb)(const void *, size_t, void *), - void (*pdr)(void *, int), - void *priv) +struct pmic_glink_client *devm_pmic_glink_client_alloc(struct device *dev, + unsigned int id, + void (*cb)(const void *, size_t, void *), + void (*pdr)(void *, int), + void *priv) { struct pmic_glink_client *client; struct pmic_glink *pg = dev_get_drvdata(dev->parent); - unsigned long flags; client = devres_alloc(_devm_pmic_glink_release_client, sizeof(*client), GFP_KERNEL); if (!client) @@ -85,6 +84,18 @@ struct pmic_glink_client *devm_pmic_glink_register_client(struct device *dev, client->cb = cb; client->pdr_notify = pdr; client->priv = priv; + INIT_LIST_HEAD(&client->node); + + devres_add(dev, client); + + return client; +} +EXPORT_SYMBOL_GPL(devm_pmic_glink_client_alloc); + +void pmic_glink_client_register(struct pmic_glink_client *client) +{ + struct pmic_glink *pg = client->pg; + unsigned long flags; mutex_lock(&pg->state_lock); spin_lock_irqsave(&pg->client_lock, flags); @@ -95,11 +106,8 @@ struct pmic_glink_client *devm_pmic_glink_register_client(struct device *dev, spin_unlock_irqrestore(&pg->client_lock, flags); mutex_unlock(&pg->state_lock); - devres_add(dev, client); - - return client; } -EXPORT_SYMBOL_GPL(devm_pmic_glink_register_client); +EXPORT_SYMBOL_GPL(pmic_glink_client_register); int pmic_glink_send(struct pmic_glink_client *client, void *data, size_t len) { diff --git a/drivers/soc/qcom/pmic_glink_altmode.c b/drivers/soc/qcom/pmic_glink_altmode.c index 1e0808b3cb93ea..463b1c5288318d 100644 --- a/drivers/soc/qcom/pmic_glink_altmode.c +++ b/drivers/soc/qcom/pmic_glink_altmode.c @@ -520,12 +520,17 @@ static int pmic_glink_altmode_probe(struct auxiliary_device *adev, return ret; } - altmode->client = devm_pmic_glink_register_client(dev, - altmode->owner_id, - pmic_glink_altmode_callback, - pmic_glink_altmode_pdr_notify, - altmode); - return PTR_ERR_OR_ZERO(altmode->client); + altmode->client = devm_pmic_glink_client_alloc(dev, + altmode->owner_id, + pmic_glink_altmode_callback, + pmic_glink_altmode_pdr_notify, + altmode); + if (IS_ERR(altmode->client)) + return PTR_ERR(altmode->client); + + pmic_glink_client_register(altmode->client); + + return 0; } static const struct auxiliary_device_id pmic_glink_altmode_id_table[] = { diff --git a/drivers/usb/typec/ucsi/ucsi_glink.c b/drivers/usb/typec/ucsi/ucsi_glink.c index 16c328497e0b8d..f6f4fae4039934 100644 --- a/drivers/usb/typec/ucsi/ucsi_glink.c +++ b/drivers/usb/typec/ucsi/ucsi_glink.c @@ -367,12 +367,16 @@ static int pmic_glink_ucsi_probe(struct auxiliary_device *adev, ucsi->port_orientation[port] = desc; } - ucsi->client = devm_pmic_glink_register_client(dev, - PMIC_GLINK_OWNER_USBC, - pmic_glink_ucsi_callback, - pmic_glink_ucsi_pdr_notify, - ucsi); - return PTR_ERR_OR_ZERO(ucsi->client); + ucsi->client = devm_pmic_glink_client_alloc(dev, PMIC_GLINK_OWNER_USBC, + pmic_glink_ucsi_callback, + pmic_glink_ucsi_pdr_notify, + ucsi); + if (IS_ERR(ucsi->client)) + return PTR_ERR(ucsi->client); + + pmic_glink_client_register(ucsi->client); + + return 0; } static void pmic_glink_ucsi_remove(struct auxiliary_device *adev) diff --git a/include/linux/soc/qcom/pmic_glink.h b/include/linux/soc/qcom/pmic_glink.h index fd124aa18c81a4..7cddf10277528e 100644 --- a/include/linux/soc/qcom/pmic_glink.h +++ b/include/linux/soc/qcom/pmic_glink.h @@ -23,10 +23,11 @@ struct pmic_glink_hdr { int pmic_glink_send(struct pmic_glink_client *client, void *data, size_t len); -struct pmic_glink_client *devm_pmic_glink_register_client(struct device *dev, - unsigned int id, - void (*cb)(const void *, size_t, void *), - void (*pdr)(void *, int), - void *priv); +struct pmic_glink_client *devm_pmic_glink_client_alloc(struct device *dev, + unsigned int id, + void (*cb)(const void *, size_t, void *), + void (*pdr)(void *, int), + void *priv); +void pmic_glink_client_register(struct pmic_glink_client *client); #endif From 11bb2ffb679399f99041540cf662409905179e3a Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Tue, 20 Aug 2024 13:29:31 -0700 Subject: [PATCH 557/991] usb: typec: ucsi: Move unregister out of atomic section Commit '9329933699b3 ("soc: qcom: pmic_glink: Make client-lock non-sleeping")' moved the pmic_glink client list under a spinlock, as it is accessed by the rpmsg/glink callback, which in turn is invoked from IRQ context. This means that ucsi_unregister() is now called from atomic context, which isn't feasible as it's expecting a sleepable context. An effort is under way to get GLINK to invoke its callbacks in a sleepable context, but until then lets schedule the unregistration. A side effect of this is that ucsi_unregister() can now happen after the remote processor, and thereby the communication link with it, is gone. pmic_glink_send() is amended with a check to avoid the resulting NULL pointer dereference. This does however result in the user being informed about this error by the following entry in the kernel log: ucsi_glink.pmic_glink_ucsi pmic_glink.ucsi.0: failed to send UCSI write request: -5 Fixes: 9329933699b3 ("soc: qcom: pmic_glink: Make client-lock non-sleeping") Cc: stable@vger.kernel.org Reviewed-by: Heikki Krogerus Reviewed-by: Neil Armstrong Reviewed-by: Dmitry Baryshkov Tested-by: Amit Pundir Reviewed-by: Johan Hovold Tested-by: Johan Hovold Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20240820-pmic-glink-v6-11-races-v3-2-eec53c750a04@quicinc.com Signed-off-by: Bjorn Andersson --- drivers/soc/qcom/pmic_glink.c | 10 +++++++++- drivers/usb/typec/ucsi/ucsi_glink.c | 27 ++++++++++++++++++++++----- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/drivers/soc/qcom/pmic_glink.c b/drivers/soc/qcom/pmic_glink.c index 53b176d04fbdfe..b218460219b76f 100644 --- a/drivers/soc/qcom/pmic_glink.c +++ b/drivers/soc/qcom/pmic_glink.c @@ -112,8 +112,16 @@ EXPORT_SYMBOL_GPL(pmic_glink_client_register); int pmic_glink_send(struct pmic_glink_client *client, void *data, size_t len) { struct pmic_glink *pg = client->pg; + int ret; - return rpmsg_send(pg->ept, data, len); + mutex_lock(&pg->state_lock); + if (!pg->ept) + ret = -ECONNRESET; + else + ret = rpmsg_send(pg->ept, data, len); + mutex_unlock(&pg->state_lock); + + return ret; } EXPORT_SYMBOL_GPL(pmic_glink_send); diff --git a/drivers/usb/typec/ucsi/ucsi_glink.c b/drivers/usb/typec/ucsi/ucsi_glink.c index f6f4fae4039934..6aace19d595bcd 100644 --- a/drivers/usb/typec/ucsi/ucsi_glink.c +++ b/drivers/usb/typec/ucsi/ucsi_glink.c @@ -68,6 +68,9 @@ struct pmic_glink_ucsi { struct work_struct notify_work; struct work_struct register_work; + spinlock_t state_lock; + bool ucsi_registered; + bool pd_running; u8 read_buf[UCSI_BUF_SIZE]; }; @@ -244,8 +247,20 @@ static void pmic_glink_ucsi_notify(struct work_struct *work) static void pmic_glink_ucsi_register(struct work_struct *work) { struct pmic_glink_ucsi *ucsi = container_of(work, struct pmic_glink_ucsi, register_work); + unsigned long flags; + bool pd_running; - ucsi_register(ucsi->ucsi); + spin_lock_irqsave(&ucsi->state_lock, flags); + pd_running = ucsi->pd_running; + spin_unlock_irqrestore(&ucsi->state_lock, flags); + + if (!ucsi->ucsi_registered && pd_running) { + ucsi_register(ucsi->ucsi); + ucsi->ucsi_registered = true; + } else if (ucsi->ucsi_registered && !pd_running) { + ucsi_unregister(ucsi->ucsi); + ucsi->ucsi_registered = false; + } } static void pmic_glink_ucsi_callback(const void *data, size_t len, void *priv) @@ -269,11 +284,12 @@ static void pmic_glink_ucsi_callback(const void *data, size_t len, void *priv) static void pmic_glink_ucsi_pdr_notify(void *priv, int state) { struct pmic_glink_ucsi *ucsi = priv; + unsigned long flags; - if (state == SERVREG_SERVICE_STATE_UP) - schedule_work(&ucsi->register_work); - else if (state == SERVREG_SERVICE_STATE_DOWN) - ucsi_unregister(ucsi->ucsi); + spin_lock_irqsave(&ucsi->state_lock, flags); + ucsi->pd_running = (state == SERVREG_SERVICE_STATE_UP); + spin_unlock_irqrestore(&ucsi->state_lock, flags); + schedule_work(&ucsi->register_work); } static void pmic_glink_ucsi_destroy(void *data) @@ -320,6 +336,7 @@ static int pmic_glink_ucsi_probe(struct auxiliary_device *adev, INIT_WORK(&ucsi->register_work, pmic_glink_ucsi_register); init_completion(&ucsi->read_ack); init_completion(&ucsi->write_ack); + spin_lock_init(&ucsi->state_lock); mutex_init(&ucsi->lock); ucsi->ucsi = ucsi_create(dev, &pmic_glink_ucsi_ops); From ad51126037a43c05f5f4af5eb262734e3e88ca59 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Tue, 20 Aug 2024 13:29:32 -0700 Subject: [PATCH 558/991] soc: qcom: pmic_glink: Actually communicate when remote goes down When the pmic_glink state is UP and we either receive a protection- domain (PD) notification indicating that the PD is going down, or that the whole remoteproc is going down, it's expected that the pmic_glink client instances are notified that their function has gone DOWN. This is not what the code does, which results in the client state either not updating, or being wrong in many cases. So let's fix the conditions. Fixes: 58ef4ece1e41 ("soc: qcom: pmic_glink: Introduce base PMIC GLINK driver") Cc: stable@vger.kernel.org Reviewed-by: Heikki Krogerus Reviewed-by: Neil Armstrong Reviewed-by: Dmitry Baryshkov Tested-by: Amit Pundir Reviewed-by: Johan Hovold Tested-by: Johan Hovold Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20240820-pmic-glink-v6-11-races-v3-3-eec53c750a04@quicinc.com Signed-off-by: Bjorn Andersson --- drivers/soc/qcom/pmic_glink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/soc/qcom/pmic_glink.c b/drivers/soc/qcom/pmic_glink.c index b218460219b76f..9606222993fd78 100644 --- a/drivers/soc/qcom/pmic_glink.c +++ b/drivers/soc/qcom/pmic_glink.c @@ -191,7 +191,7 @@ static void pmic_glink_state_notify_clients(struct pmic_glink *pg) if (pg->pdr_state == SERVREG_SERVICE_STATE_UP && pg->ept) new_state = SERVREG_SERVICE_STATE_UP; } else { - if (pg->pdr_state == SERVREG_SERVICE_STATE_UP && pg->ept) + if (pg->pdr_state == SERVREG_SERVICE_STATE_DOWN || !pg->ept) new_state = SERVREG_SERVICE_STATE_DOWN; } From 8342009efa2a5e75dce56173d7de026bcc6666d8 Mon Sep 17 00:00:00 2001 From: Richard Acayan Date: Mon, 29 Jul 2024 21:38:35 -0400 Subject: [PATCH 559/991] firmware: qcom: tzmem: disable sdm670 platform The Pixel 3a returns 4291821499 (-3145797 or 0xFFCFFFBB) when attempting to load the GPU firmware if tzmem is allowed. Disable it on SDM670 so the GPU can successfully probe. Signed-off-by: Richard Acayan Acked-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20240730013834.41840-2-mailingradian@gmail.com Signed-off-by: Bjorn Andersson --- drivers/firmware/qcom/qcom_tzmem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/firmware/qcom/qcom_tzmem.c b/drivers/firmware/qcom/qcom_tzmem.c index caedeef0059c9c..92b3651782355f 100644 --- a/drivers/firmware/qcom/qcom_tzmem.c +++ b/drivers/firmware/qcom/qcom_tzmem.c @@ -77,6 +77,7 @@ static bool qcom_tzmem_using_shm_bridge; /* List of machines that are known to not support SHM bridge correctly. */ static const char *const qcom_tzmem_blacklist[] = { "qcom,sc8180x", + "qcom,sdm670", /* failure in GPU firmware loading */ "qcom,sdm845", /* reset in rmtfs memory assignment */ "qcom,sm8150", /* reset in rmtfs memory assignment */ NULL From a3ca27c405faad584af6e8e38cdafe5be73230a1 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 9 Aug 2024 08:47:15 +0200 Subject: [PATCH 560/991] s390/mm: Prevent lowcore vs identity mapping overlap The identity mapping position in virtual memory is randomized together with the kernel mapping. That position can never overlap with the lowcore even when the lowcore is relocated. Prevent overlapping with the lowcore to allow independent positioning of the identity mapping. With the current value of the alternative lowcore address of 0x70000 the overlap could happen in case the identity mapping is placed at zero. This is a prerequisite for uncoupling of randomization base of kernel image and identity mapping in virtual memory. Acked-by: Vasily Gorbik Signed-off-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/kernel/setup.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 4ec99f73fa27e1..a3fea683b22706 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -734,7 +734,23 @@ static void __init memblock_add_physmem_info(void) } /* - * Reserve memory used for lowcore/command line/kernel image. + * Reserve memory used for lowcore. + */ +static void __init reserve_lowcore(void) +{ + void *lowcore_start = get_lowcore(); + void *lowcore_end = lowcore_start + sizeof(struct lowcore); + void *start, *end; + + if ((void *)__identity_base < lowcore_end) { + start = max(lowcore_start, (void *)__identity_base); + end = min(lowcore_end, (void *)(__identity_base + ident_map_size)); + memblock_reserve(__pa(start), __pa(end)); + } +} + +/* + * Reserve memory used for absolute lowcore/command line/kernel image. */ static void __init reserve_kernel(void) { @@ -918,6 +934,7 @@ void __init setup_arch(char **cmdline_p) /* Do some memory reservations *before* memory is added to memblock */ reserve_pgtables(); + reserve_lowcore(); reserve_kernel(); reserve_initrd(); reserve_certificate_list(); From 32db401965f165f7c44447d0508097f070c8f576 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 9 Aug 2024 08:47:16 +0200 Subject: [PATCH 561/991] s390/mm: Pin identity mapping base to zero SIE instruction performs faster when the virtual address of SIE block matches the physical one. Pin the identity mapping base to zero for the benefit of SIE and other instructions that have similar performance impact. Still, randomize the base when DEBUG_VM kernel configuration option is enabled. Suggested-by: Vasily Gorbik Reviewed-by: Christian Borntraeger Signed-off-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/Kconfig | 13 +++++++++++++ arch/s390/boot/startup.c | 3 ++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index a822f952f64a9c..c60e699e99f5b3 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -604,6 +604,19 @@ config RANDOMIZE_BASE as a security feature that deters exploit attempts relying on knowledge of the location of kernel internals. +config RANDOMIZE_IDENTITY_BASE + bool "Randomize the address of the identity mapping base" + depends on RANDOMIZE_BASE + default DEBUG_VM + help + The identity mapping base address is pinned to zero by default. + Allow randomization of that base to expose otherwise missed + notion of physical and virtual addresses of data structures. + That does not have any impact on the base address at which the + kernel image is loaded. + + If unsure, say N + config KERNEL_IMAGE_BASE hex "Kernel image base address" range 0x100000 0x1FFFFFE0000000 if !KASAN diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index ce232552bc1c38..cff34744b5a9bf 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -341,7 +341,8 @@ static unsigned long setup_kernel_memory_layout(unsigned long kernel_size) BUILD_BUG_ON(MAX_DCSS_ADDR > (1UL << MAX_PHYSMEM_BITS)); max_mappable = max(ident_map_size, MAX_DCSS_ADDR); max_mappable = min(max_mappable, vmemmap_start); - __identity_base = round_down(vmemmap_start - max_mappable, rte_size); + if (IS_ENABLED(CONFIG_RANDOMIZE_IDENTITY_BASE)) + __identity_base = round_down(vmemmap_start - max_mappable, rte_size); return asce_limit; } From b4f5bd60d558f6ba451d7e76aa05782c07a182a3 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Tue, 6 Aug 2024 12:06:23 +0200 Subject: [PATCH 562/991] s390/ap: Refine AP bus bindings complete processing With the rework of the AP bus scan and the introduction of a bindings complete completion also the timing until the userspace finally receives a AP bus binding complete uevent had increased. Unfortunately this event triggers some important jobs for preparation of KVM guests, for example the modification of card/queue masks to reassign AP resources to the alternate AP queue device driver (vfio_ap) which is the precondition for building mediated devices which may be a precondition for starting KVM guests using AP resources. This small fix now triggers the check for binding complete each time an AP device driver has registered. With this patch the bindings complete may be posted up to 30s earlier as there is no need to wait for the next AP bus scan any more. Fixes: 778412ab915d ("s390/ap: rearm APQNs bindings complete completion") Signed-off-by: Harald Freudenberger Reviewed-by: Holger Dengler Cc: stable@vger.kernel.org Acked-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- drivers/s390/crypto/ap_bus.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index 0998b17ecb37e2..f9f682f194154d 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -971,11 +971,16 @@ int ap_driver_register(struct ap_driver *ap_drv, struct module *owner, char *name) { struct device_driver *drv = &ap_drv->driver; + int rc; drv->bus = &ap_bus_type; drv->owner = owner; drv->name = name; - return driver_register(drv); + rc = driver_register(drv); + + ap_check_bindings_complete(); + + return rc; } EXPORT_SYMBOL(ap_driver_register); From c158ceb826068a8bbe3c9e78df420f47ba53c8a8 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Tue, 20 Aug 2024 15:59:34 -0700 Subject: [PATCH 563/991] soc: qcom: pd-mapper: Fix singleton refcount The Qualcomm pd-mapper is a refcounted singleton, but the refcount is never incremented, which means the as soon as any remoteproc instance stops the count will hit 0. At this point the pd-mapper QMI service is stopped, leaving firmware without access to the PD information. Stopping any other remoteproc instances will result in a use-after-free, which best case manifest itself as a refcount underflow: refcount_t: underflow; use-after-free. WARNING: CPU: 1 PID: 354 at lib/refcount.c:87 refcount_dec_and_mutex_lock+0xc4/0x148 ... Call trace: refcount_dec_and_mutex_lock+0xc4/0x148 qcom_pdm_remove+0x40/0x118 [qcom_pd_mapper] ... Fix this by incrementing the refcount, so that the pd-mapper is only torn down when the last remoteproc stops, as intended. Fixes: 1ebcde047c54 ("soc: qcom: add pd-mapper implementation") Signed-off-by: Bjorn Andersson Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20240820-pd-mapper-refcount-fix-v1-1-03ea65c0309b@quicinc.com Signed-off-by: Bjorn Andersson --- drivers/soc/qcom/qcom_pd_mapper.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/soc/qcom/qcom_pd_mapper.c b/drivers/soc/qcom/qcom_pd_mapper.c index 9afa09c3920e10..2228595a3dc5a4 100644 --- a/drivers/soc/qcom/qcom_pd_mapper.c +++ b/drivers/soc/qcom/qcom_pd_mapper.c @@ -635,6 +635,8 @@ static int qcom_pdm_probe(struct auxiliary_device *auxdev, ret = PTR_ERR(data); else __qcom_pdm_data = data; + } else { + refcount_inc(&__qcom_pdm_data->refcnt); } auxiliary_set_drvdata(auxdev, __qcom_pdm_data); From 0e9fdab1e8df490354562187cdbb8dec643eae2c Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Wed, 21 Aug 2024 14:19:54 +0800 Subject: [PATCH 564/991] ASoC: allow module autoloading for table db1200_pids Add MODULE_DEVICE_TABLE(), so modules could be properly autoloaded based on the alias from platform_device_id table. Signed-off-by: Hongbo Li Link: https://patch.msgid.link/20240821061955.2273782-2-lihongbo22@huawei.com Signed-off-by: Mark Brown --- sound/soc/au1x/db1200.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/au1x/db1200.c b/sound/soc/au1x/db1200.c index 83a75a38705b4e..81abe2e184024e 100644 --- a/sound/soc/au1x/db1200.c +++ b/sound/soc/au1x/db1200.c @@ -44,6 +44,7 @@ static const struct platform_device_id db1200_pids[] = { }, {}, }; +MODULE_DEVICE_TABLE(platform, db1200_pids); /*------------------------- AC97 PART ---------------------------*/ From 5f7c98b7519a3a847d9182bd99d57ea250032ca1 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Wed, 21 Aug 2024 14:19:55 +0800 Subject: [PATCH 565/991] ASoC: allow module autoloading for table board_ids Add MODULE_DEVICE_TABLE(), so modules could be properly autoloaded based on the alias from platform_device_id table. Signed-off-by: Hongbo Li Link: https://patch.msgid.link/20240821061955.2273782-3-lihongbo22@huawei.com Signed-off-by: Mark Brown --- sound/soc/amd/acp/acp-sof-mach.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/soc/amd/acp/acp-sof-mach.c b/sound/soc/amd/acp/acp-sof-mach.c index fc59ea34e687ad..b3a702dcd9911a 100644 --- a/sound/soc/amd/acp/acp-sof-mach.c +++ b/sound/soc/amd/acp/acp-sof-mach.c @@ -158,6 +158,8 @@ static const struct platform_device_id board_ids[] = { }, { } }; +MODULE_DEVICE_TABLE(platform, board_ids); + static struct platform_driver acp_asoc_audio = { .driver = { .name = "sof_mach", From a6f78359ac75f24cac3c1bdd753c49c1877bcd82 Mon Sep 17 00:00:00 2001 From: Stuart Summers Date: Sat, 17 Aug 2024 02:47:30 +0000 Subject: [PATCH 566/991] drm/xe: Fix missing workqueue destroy in xe_gt_pagefault On driver reload we never free up the memory for the pagefault and access counter workqueues. Add those destroy calls here. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Signed-off-by: Stuart Summers Reviewed-by: Rodrigo Vivi Signed-off-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/c9a951505271dc3a7aee76de7656679f69c11518.1723862633.git.stuart.summers@intel.com (cherry picked from commit 7586fc52b14e0b8edd0d1f8a434e0de2078b7b2b) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_gt_pagefault.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c index 9292d54688684e..b2a7fa55bd1815 100644 --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c @@ -382,6 +382,18 @@ static void pf_queue_work_func(struct work_struct *w) static void acc_queue_work_func(struct work_struct *w); +static void pagefault_fini(void *arg) +{ + struct xe_gt *gt = arg; + struct xe_device *xe = gt_to_xe(gt); + + if (!xe->info.has_usm) + return; + + destroy_workqueue(gt->usm.acc_wq); + destroy_workqueue(gt->usm.pf_wq); +} + int xe_gt_pagefault_init(struct xe_gt *gt) { struct xe_device *xe = gt_to_xe(gt); @@ -409,10 +421,12 @@ int xe_gt_pagefault_init(struct xe_gt *gt) gt->usm.acc_wq = alloc_workqueue("xe_gt_access_counter_work_queue", WQ_UNBOUND | WQ_HIGHPRI, NUM_ACC_QUEUE); - if (!gt->usm.acc_wq) + if (!gt->usm.acc_wq) { + destroy_workqueue(gt->usm.pf_wq); return -ENOMEM; + } - return 0; + return devm_add_action_or_reset(xe->drm.dev, pagefault_fini, gt); } void xe_gt_pagefault_reset(struct xe_gt *gt) From dd3e840a33b57b92812fbec26273b3f0b4eb5ae3 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Thu, 15 Aug 2024 12:35:22 -0700 Subject: [PATCH 567/991] drm/xe: Drop HW fence pointer to HW fence ctx The HW fence ctx objects are not ref counted rather tied to the life of an LRC object. HW fences reference the HW fence ctx, HW fences can outlive LRCs thus resulting in UAF. Drop the HW fence pointer to HW fence ctx rather just store what is needed directly in HW fence. v2: - Fix typo in commit (Ashutosh) - Use snprintf (Ashutosh) Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Signed-off-by: Matthew Brost Reviewed-by: Ashutosh Dixit Link: https://patchwork.freedesktop.org/patch/msgid/20240815193522.16008-1-matthew.brost@intel.com (cherry picked from commit 60db6f540af9f93144d5039140aa2ed17171d168) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_hw_fence.c | 9 +++++---- drivers/gpu/drm/xe/xe_hw_fence_types.h | 7 +++++-- drivers/gpu/drm/xe/xe_trace.h | 2 +- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_hw_fence.c b/drivers/gpu/drm/xe/xe_hw_fence.c index 45a9789cf50191..0b4f12be3692ab 100644 --- a/drivers/gpu/drm/xe/xe_hw_fence.c +++ b/drivers/gpu/drm/xe/xe_hw_fence.c @@ -148,20 +148,20 @@ static const char *xe_hw_fence_get_driver_name(struct dma_fence *dma_fence) { struct xe_hw_fence *fence = to_xe_hw_fence(dma_fence); - return dev_name(gt_to_xe(fence->ctx->gt)->drm.dev); + return dev_name(fence->xe->drm.dev); } static const char *xe_hw_fence_get_timeline_name(struct dma_fence *dma_fence) { struct xe_hw_fence *fence = to_xe_hw_fence(dma_fence); - return fence->ctx->name; + return fence->name; } static bool xe_hw_fence_signaled(struct dma_fence *dma_fence) { struct xe_hw_fence *fence = to_xe_hw_fence(dma_fence); - struct xe_device *xe = gt_to_xe(fence->ctx->gt); + struct xe_device *xe = fence->xe; u32 seqno = xe_map_rd(xe, &fence->seqno_map, 0, u32); return dma_fence->error || @@ -253,7 +253,8 @@ void xe_hw_fence_init(struct dma_fence *fence, struct xe_hw_fence_ctx *ctx, struct xe_hw_fence *hw_fence = container_of(fence, typeof(*hw_fence), dma); - hw_fence->ctx = ctx; + hw_fence->xe = gt_to_xe(ctx->gt); + snprintf(hw_fence->name, sizeof(hw_fence->name), "%s", ctx->name); hw_fence->seqno_map = seqno_map; INIT_LIST_HEAD(&hw_fence->irq_link); diff --git a/drivers/gpu/drm/xe/xe_hw_fence_types.h b/drivers/gpu/drm/xe/xe_hw_fence_types.h index b33c4956e8ea0e..364a61f4bfda98 100644 --- a/drivers/gpu/drm/xe/xe_hw_fence_types.h +++ b/drivers/gpu/drm/xe/xe_hw_fence_types.h @@ -12,6 +12,7 @@ #include #include +struct xe_device; struct xe_gt; /** @@ -61,8 +62,10 @@ struct xe_hw_fence_ctx { struct xe_hw_fence { /** @dma: base dma fence for hardware fence context */ struct dma_fence dma; - /** @ctx: hardware fence context */ - struct xe_hw_fence_ctx *ctx; + /** @xe: Xe device for hw fence driver name */ + struct xe_device *xe; + /** @name: name of hardware fence context */ + char name[MAX_FENCE_NAME_LEN]; /** @seqno_map: I/O map for seqno */ struct iosys_map seqno_map; /** @irq_link: Link in struct xe_hw_fence_irq.pending */ diff --git a/drivers/gpu/drm/xe/xe_trace.h b/drivers/gpu/drm/xe/xe_trace.h index baba14fb1e32e6..01837f6f609f52 100644 --- a/drivers/gpu/drm/xe/xe_trace.h +++ b/drivers/gpu/drm/xe/xe_trace.h @@ -309,7 +309,7 @@ DECLARE_EVENT_CLASS(xe_hw_fence, TP_ARGS(fence), TP_STRUCT__entry( - __string(dev, __dev_name_gt(fence->ctx->gt)) + __string(dev, __dev_name_xe(fence->xe)) __field(u64, ctx) __field(u32, seqno) __field(struct xe_hw_fence *, fence) From 9e7f30563677fbeff62d368d5d2a5ac7aaa9746a Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Tue, 20 Aug 2024 13:23:09 -0700 Subject: [PATCH 568/991] drm/xe: Free job before xe_exec_queue_put Free job depends on job->vm being valid, the last xe_exec_queue_put can destroy the VM. Prevent UAF by freeing job before xe_exec_queue_put. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Signed-off-by: Matthew Brost Reviewed-by: Nirmoy Das Reviewed-by: Jagmeet Randhawa Link: https://patchwork.freedesktop.org/patch/msgid/20240820202309.1260755-1-matthew.brost@intel.com (cherry picked from commit 32a42c93b74c8ca6d0915ea3eba21bceff53042f) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_sched_job.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c index 44d534e362cd39..9628f9deb3c012 100644 --- a/drivers/gpu/drm/xe/xe_sched_job.c +++ b/drivers/gpu/drm/xe/xe_sched_job.c @@ -171,12 +171,13 @@ void xe_sched_job_destroy(struct kref *ref) struct xe_sched_job *job = container_of(ref, struct xe_sched_job, refcount); struct xe_device *xe = job_to_xe(job); + struct xe_exec_queue *q = job->q; xe_sched_job_free_fences(job); - xe_exec_queue_put(job->q); dma_fence_put(job->fence); drm_sched_job_cleanup(&job->drm); job_free(job); + xe_exec_queue_put(q); xe_pm_runtime_put(xe); } From 57df60e1f981fa8c288a49012a4bbb02ae0ecdbc Mon Sep 17 00:00:00 2001 From: Yang Ruibin <11162571@vivo.com> Date: Wed, 21 Aug 2024 03:59:33 -0400 Subject: [PATCH 569/991] thermal/debugfs: Fix the NULL vs IS_ERR() confusion in debugfs_create_dir() The debugfs_create_dir() return value is never NULL, it is either a valid pointer or an error one. Use IS_ERR() to check it. Fixes: 7ef01f228c9f ("thermal/debugfs: Add thermal debugfs information for mitigation episodes") Fixes: 755113d76786 ("thermal/debugfs: Add thermal cooling device debugfs information") Signed-off-by: Yang Ruibin <11162571@vivo.com> Link: https://patch.msgid.link/20240821075934.12145-1-11162571@vivo.com [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/thermal/thermal_debugfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/thermal/thermal_debugfs.c b/drivers/thermal/thermal_debugfs.c index 7dd67bf4857198..939d3e5f181771 100644 --- a/drivers/thermal/thermal_debugfs.c +++ b/drivers/thermal/thermal_debugfs.c @@ -178,11 +178,11 @@ struct thermal_debugfs { void thermal_debug_init(void) { d_root = debugfs_create_dir("thermal", NULL); - if (!d_root) + if (IS_ERR(d_root)) return; d_cdev = debugfs_create_dir("cooling_devices", d_root); - if (!d_cdev) + if (IS_ERR(d_cdev)) return; d_tz = debugfs_create_dir("thermal_zones", d_root); @@ -202,7 +202,7 @@ static struct thermal_debugfs *thermal_debugfs_add_id(struct dentry *d, int id) snprintf(ids, IDSLENGTH, "%d", id); thermal_dbg->d_top = debugfs_create_dir(ids, d); - if (!thermal_dbg->d_top) { + if (IS_ERR(thermal_dbg->d_top)) { kfree(thermal_dbg); return NULL; } From 92764e8822d4e7f8efb5ad959fac195a7f8ea0c6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 14 Aug 2024 21:38:21 +0100 Subject: [PATCH 570/991] netfs, ceph: Partially revert "netfs: Replace PG_fscache by setting folio->private and marking dirty" This partially reverts commit 2ff1e97587f4d398686f52c07afde3faf3da4e5c. In addition to reverting the removal of PG_private_2 wrangling from the buffered read code[1][2], the removal of the waits for PG_private_2 from netfs_release_folio() and netfs_invalidate_folio() need reverting too. It also adds a wait into ceph_evict_inode() to wait for netfs read and copy-to-cache ops to complete. Fixes: 2ff1e97587f4 ("netfs: Replace PG_fscache by setting folio->private and marking dirty") Signed-off-by: David Howells Link: https://lore.kernel.org/r/3575457.1722355300@warthog.procyon.org.uk [1] Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8e5ced7804cb9184c4a23f8054551240562a8eda [2] Link: https://lore.kernel.org/r/20240814203850.2240469-2-dhowells@redhat.com cc: Max Kellermann cc: Ilya Dryomov cc: Xiubo Li cc: Jeff Layton cc: Matthew Wilcox cc: ceph-devel@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org Signed-off-by: Christian Brauner --- fs/ceph/inode.c | 1 + fs/netfs/misc.c | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 71cd70514efa55..4a8eec46254b16 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -695,6 +695,7 @@ void ceph_evict_inode(struct inode *inode) percpu_counter_dec(&mdsc->metric.total_inodes); + netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); if (inode->i_state & I_PINNING_NETFS_WB) ceph_fscache_unuse_cookie(inode, true); diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 83e644bd518f3f..554a1a4615adee 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -101,6 +101,8 @@ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) _enter("{%lx},%zx,%zx", folio->index, offset, length); + folio_wait_private_2(folio); /* [DEPRECATED] */ + if (!folio_test_private(folio)) return; @@ -165,6 +167,11 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp) if (folio_test_private(folio)) return false; + if (unlikely(folio_test_private_2(folio))) { /* [DEPRECATED] */ + if (current_is_kswapd() || !(gfp & __GFP_FS)) + return false; + folio_wait_private_2(folio); + } fscache_note_page_release(netfs_i_cookie(ctx)); return true; } From 524b2c6dc80d735be9ebcd2decffe2889baab65d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 15 Aug 2024 14:39:33 +0200 Subject: [PATCH 571/991] romfs: fix romfs_read_folio() Add the correct offset to folio_zero_tail(). Fixes: d86f2de026c5 ("romfs: Convert romfs_read_folio() to use a folio") Reported-by: Greg Ungerer Link: https://lore.kernel.org/r/Zr0GTnPHfeA0P8nb@casper.infradead.org Signed-off-by: Christian Brauner --- fs/romfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 68758b6fed942e..0addcc849ff2ca 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -126,7 +126,7 @@ static int romfs_read_folio(struct file *file, struct folio *folio) } } - buf = folio_zero_tail(folio, fillsize, buf); + buf = folio_zero_tail(folio, fillsize, buf + fillsize); kunmap_local(buf); folio_end_read(folio, ret == 0); return ret; From 232590ea7fc125986a526e03081b98e5783f70d2 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 19 Aug 2024 10:38:23 +0200 Subject: [PATCH 572/991] Revert "pidfd: prevent creation of pidfds for kthreads" This reverts commit 3b5bbe798b2451820e74243b738268f51901e7d0. Eric reported that systemd-shutdown gets broken by blocking the creating of pidfds for kthreads as older versions seems to rely on being able to create a pidfd for any process in /proc. Reported-by: Eric Biggers Link: https://lore.kernel.org/r/20240818035818.GA1929@sol.localdomain Signed-off-by: Christian Brauner --- kernel/fork.c | 25 +++---------------------- 1 file changed, 3 insertions(+), 22 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 18bdc87209d057..cc760491f20127 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2053,23 +2053,10 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re */ int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) { - if (!pid) - return -EINVAL; - - scoped_guard(rcu) { - struct task_struct *tsk; - - if (flags & PIDFD_THREAD) - tsk = pid_task(pid, PIDTYPE_PID); - else - tsk = pid_task(pid, PIDTYPE_TGID); - if (!tsk) - return -EINVAL; + bool thread = flags & PIDFD_THREAD; - /* Don't create pidfds for kernel threads for now. */ - if (tsk->flags & PF_KTHREAD) - return -EINVAL; - } + if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID)) + return -EINVAL; return __pidfd_prepare(pid, flags, ret); } @@ -2416,12 +2403,6 @@ __latent_entropy struct task_struct *copy_process( if (clone_flags & CLONE_PIDFD) { int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0; - /* Don't create pidfds for kernel threads for now. */ - if (args->kthread) { - retval = -EINVAL; - goto bad_fork_free_pid; - } - /* Note that no task has been attached to @pid yet. */ retval = __pidfd_prepare(pid, flags, &pidfile); if (retval < 0) From 5d6a6c7454ebaefba518e334750b05700131923b Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 8 Aug 2024 12:00:57 +0530 Subject: [PATCH 573/991] PCI: qcom-ep: Disable MHI RAM data parity error interrupt for SA8775P SoC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SA8775P SoC has support for the hardware parity check feature on the MHI RAM (entity that holds MHI registers, etc.) But due to a hardware bug in the parity check logic, the data parity error interrupt is getting generated all the time when using MHI. So the hardware team has suggested disabling the parity check error to work around the hardware bug. Mask the parity error interrupt in PARF_INT_ALL_5_MASK register. Fixes: 58d0d3e032b3 ("PCI: qcom-ep: Add support for SA8775P SOC") Link: https://lore.kernel.org/linux-pci/20240808063057.7394-1-manivannan.sadhasivam@linaro.org Signed-off-by: Manivannan Sadhasivam Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/dwc/pcie-qcom-ep.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/pci/controller/dwc/pcie-qcom-ep.c b/drivers/pci/controller/dwc/pcie-qcom-ep.c index 236229f66c808e..a9b263f749b6aa 100644 --- a/drivers/pci/controller/dwc/pcie-qcom-ep.c +++ b/drivers/pci/controller/dwc/pcie-qcom-ep.c @@ -58,6 +58,7 @@ #define PARF_DEBUG_CNT_AUX_CLK_IN_L1SUB_L2 0xc88 #define PARF_DEVICE_TYPE 0x1000 #define PARF_BDF_TO_SID_CFG 0x2c00 +#define PARF_INT_ALL_5_MASK 0x2dcc /* PARF_INT_ALL_{STATUS/CLEAR/MASK} register fields */ #define PARF_INT_ALL_LINK_DOWN BIT(1) @@ -127,6 +128,9 @@ /* PARF_CFG_BITS register fields */ #define PARF_CFG_BITS_REQ_EXIT_L1SS_MSI_LTR_EN BIT(1) +/* PARF_INT_ALL_5_MASK fields */ +#define PARF_INT_ALL_5_MHI_RAM_DATA_PARITY_ERR BIT(0) + /* ELBI registers */ #define ELBI_SYS_STTS 0x08 #define ELBI_CS2_ENABLE 0xa4 @@ -158,10 +162,12 @@ enum qcom_pcie_ep_link_status { * struct qcom_pcie_ep_cfg - Per SoC config struct * @hdma_support: HDMA support on this SoC * @override_no_snoop: Override NO_SNOOP attribute in TLP to enable cache snooping + * @disable_mhi_ram_parity_check: Disable MHI RAM data parity error check */ struct qcom_pcie_ep_cfg { bool hdma_support; bool override_no_snoop; + bool disable_mhi_ram_parity_check; }; /** @@ -480,6 +486,12 @@ static int qcom_pcie_perst_deassert(struct dw_pcie *pci) PARF_INT_ALL_LINK_UP | PARF_INT_ALL_EDMA; writel_relaxed(val, pcie_ep->parf + PARF_INT_ALL_MASK); + if (pcie_ep->cfg && pcie_ep->cfg->disable_mhi_ram_parity_check) { + val = readl_relaxed(pcie_ep->parf + PARF_INT_ALL_5_MASK); + val &= ~PARF_INT_ALL_5_MHI_RAM_DATA_PARITY_ERR; + writel_relaxed(val, pcie_ep->parf + PARF_INT_ALL_5_MASK); + } + ret = dw_pcie_ep_init_registers(&pcie_ep->pci.ep); if (ret) { dev_err(dev, "Failed to complete initialization: %d\n", ret); @@ -901,6 +913,7 @@ static void qcom_pcie_ep_remove(struct platform_device *pdev) static const struct qcom_pcie_ep_cfg cfg_1_34_0 = { .hdma_support = true, .override_no_snoop = true, + .disable_mhi_ram_parity_check = true, }; static const struct of_device_id qcom_pcie_ep_match[] = { From db1ec60fba4a995975dc1dc837b408db0d666801 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Mon, 22 Jul 2024 18:41:28 +0530 Subject: [PATCH 574/991] PCI: qcom: Use OPP only if the platform supports it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With commit 5b6272e0efd5 ("PCI: qcom: Add OPP support to scale performance"), OPP was used to control the interconnect and power domains if the platform supported OPP. Also to maintain the backward compatibility with platforms not supporting OPP but just ICC, the above mentioned commit assumed that if ICC was not available on the platform, it would resort to OPP. Unfortunately, some old platforms don't support either ICC or OPP. On those platforms, resorting to OPP in the absence of ICC throws below errors from OPP core during suspend and resume: qcom-pcie 1c08000.pcie: dev_pm_opp_set_opp: device opp doesn't exist qcom-pcie 1c08000.pcie: _find_key: OPP table not found (-19) Also, it doesn't make sense to invoke the OPP APIs when OPP is not supported by the platform at all. Add a "use_pm_opp" flag to identify whether OPP is supported and use it to control invoking the OPP APIs. Fixes: 5b6272e0efd5 ("PCI: qcom: Add OPP support to scale performance") Link: https://lore.kernel.org/linux-pci/20240722131128.32470-1-manivannan.sadhasivam@linaro.org Signed-off-by: Manivannan Sadhasivam Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas Reviewed-by: Mayank Rana --- drivers/pci/controller/dwc/pcie-qcom.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c index 0180edf3310ec3..6f953e32d99070 100644 --- a/drivers/pci/controller/dwc/pcie-qcom.c +++ b/drivers/pci/controller/dwc/pcie-qcom.c @@ -261,6 +261,7 @@ struct qcom_pcie { const struct qcom_pcie_cfg *cfg; struct dentry *debugfs; bool suspended; + bool use_pm_opp; }; #define to_qcom_pcie(x) dev_get_drvdata((x)->dev) @@ -1433,7 +1434,7 @@ static void qcom_pcie_icc_opp_update(struct qcom_pcie *pcie) dev_err(pci->dev, "Failed to set bandwidth for PCIe-MEM interconnect path: %d\n", ret); } - } else { + } else if (pcie->use_pm_opp) { freq_mbps = pcie_dev_speed_mbps(pcie_link_speed[speed]); if (freq_mbps < 0) return; @@ -1592,6 +1593,8 @@ static int qcom_pcie_probe(struct platform_device *pdev) max_freq); goto err_pm_runtime_put; } + + pcie->use_pm_opp = true; } else { /* Skip ICC init if OPP is supported as it is handled by OPP */ ret = qcom_pcie_icc_init(pcie); @@ -1683,7 +1686,7 @@ static int qcom_pcie_suspend_noirq(struct device *dev) if (ret) dev_err(dev, "Failed to disable CPU-PCIe interconnect path: %d\n", ret); - if (!pcie->icc_mem) + if (pcie->use_pm_opp) dev_pm_opp_set_opp(pcie->pci->dev, NULL); } return ret; From b128ed5ab27330deeeaf51ea8bb69f1442a96f7f Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 19 Aug 2024 17:06:21 +0200 Subject: [PATCH 575/991] udp: fix receiving fraglist GSO packets When assembling fraglist GSO packets, udp4_gro_complete does not set skb->csum_start, which makes the extra validation in __udp_gso_segment fail. Fixes: 89add40066f9 ("net: drop bad gso csum_start and offset in virtio_net_hdr") Signed-off-by: Felix Fietkau Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20240819150621.59833-1-nbd@nbd.name Signed-off-by: Jakub Kicinski --- net/ipv4/udp_offload.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index b254a5dadfcf3f..d842303587af93 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -279,7 +279,8 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, return ERR_PTR(-EINVAL); if (unlikely(skb_checksum_start(gso_skb) != - skb_transport_header(gso_skb))) + skb_transport_header(gso_skb) && + !(skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST))) return ERR_PTR(-EINVAL); /* We don't know if egress device can segment and checksum the packet From f8669d7b5f5d2d88959456ae9123d8bb6fdc1ebe Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 20 Aug 2024 12:53:47 +0200 Subject: [PATCH 576/991] selftests: mlxsw: ethtool_lanes: Source ethtool lib from correct path Source the ethtool library from the correct path and avoid the following error: ./ethtool_lanes.sh: line 14: ./../../../net/forwarding/ethtool_lib.sh: No such file or directory Fixes: 40d269c000bd ("selftests: forwarding: Move several selftests") Signed-off-by: Ido Schimmel Signed-off-by: Petr Machata Reviewed-by: Simon Horman Link: https://patch.msgid.link/2112faff02e536e1ac14beb4c2be09c9574b90ae.1724150067.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh b/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh index 877cd6df94a10c..fe905a7f34b3c6 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh @@ -2,6 +2,7 @@ # SPDX-License-Identifier: GPL-2.0 lib_dir=$(dirname $0)/../../../net/forwarding +ethtool_lib_dir=$(dirname $0)/../hw ALL_TESTS=" autoneg @@ -11,7 +12,7 @@ ALL_TESTS=" NUM_NETIFS=2 : ${TIMEOUT:=30000} # ms source $lib_dir/lib.sh -source $lib_dir/ethtool_lib.sh +source $ethtool_lib_dir/ethtool_lib.sh setup_prepare() { From 007d4271a5f10638cba6f0b99698557ef30014b9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Aug 2024 16:20:53 +0000 Subject: [PATCH 577/991] netpoll: do not export netpoll_poll_[disable|enable]() netpoll_poll_disable() and netpoll_poll_enable() are only used from core networking code, there is no need to export them. Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240820162053.3870927-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/netpoll.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 55bcacf67df3b6..d657b042d5a048 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -228,7 +228,6 @@ void netpoll_poll_disable(struct net_device *dev) down(&ni->dev_lock); srcu_read_unlock(&netpoll_srcu, idx); } -EXPORT_SYMBOL(netpoll_poll_disable); void netpoll_poll_enable(struct net_device *dev) { @@ -239,7 +238,6 @@ void netpoll_poll_enable(struct net_device *dev) up(&ni->dev_lock); rcu_read_unlock(); } -EXPORT_SYMBOL(netpoll_poll_enable); static void refill_skbs(void) { From faa389b2fbaaec7fd27a390b4896139f9da662e3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Aug 2024 16:08:57 +0000 Subject: [PATCH 578/991] ipv6: prevent UAF in ip6_send_skb() syzbot reported an UAF in ip6_send_skb() [1] After ip6_local_out() has returned, we no longer can safely dereference rt, unless we hold rcu_read_lock(). A similar issue has been fixed in commit a688caa34beb ("ipv6: take rcu lock in rawv6_send_hdrinc()") Another potential issue in ip6_finish_output2() is handled in a separate patch. [1] BUG: KASAN: slab-use-after-free in ip6_send_skb+0x18d/0x230 net/ipv6/ip6_output.c:1964 Read of size 8 at addr ffff88806dde4858 by task syz.1.380/6530 CPU: 1 UID: 0 PID: 6530 Comm: syz.1.380 Not tainted 6.11.0-rc3-syzkaller-00306-gdf6cbc62cc9b #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/06/2024 Call Trace: __dump_stack lib/dump_stack.c:93 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:119 print_address_description mm/kasan/report.c:377 [inline] print_report+0x169/0x550 mm/kasan/report.c:488 kasan_report+0x143/0x180 mm/kasan/report.c:601 ip6_send_skb+0x18d/0x230 net/ipv6/ip6_output.c:1964 rawv6_push_pending_frames+0x75c/0x9e0 net/ipv6/raw.c:588 rawv6_sendmsg+0x19c7/0x23c0 net/ipv6/raw.c:926 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x1a6/0x270 net/socket.c:745 sock_write_iter+0x2dd/0x400 net/socket.c:1160 do_iter_readv_writev+0x60a/0x890 vfs_writev+0x37c/0xbb0 fs/read_write.c:971 do_writev+0x1b1/0x350 fs/read_write.c:1018 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f936bf79e79 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f936cd7f038 EFLAGS: 00000246 ORIG_RAX: 0000000000000014 RAX: ffffffffffffffda RBX: 00007f936c115f80 RCX: 00007f936bf79e79 RDX: 0000000000000001 RSI: 0000000020000040 RDI: 0000000000000004 RBP: 00007f936bfe7916 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 00007f936c115f80 R15: 00007fff2860a7a8 Allocated by task 6530: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 unpoison_slab_object mm/kasan/common.c:312 [inline] __kasan_slab_alloc+0x66/0x80 mm/kasan/common.c:338 kasan_slab_alloc include/linux/kasan.h:201 [inline] slab_post_alloc_hook mm/slub.c:3988 [inline] slab_alloc_node mm/slub.c:4037 [inline] kmem_cache_alloc_noprof+0x135/0x2a0 mm/slub.c:4044 dst_alloc+0x12b/0x190 net/core/dst.c:89 ip6_blackhole_route+0x59/0x340 net/ipv6/route.c:2670 make_blackhole net/xfrm/xfrm_policy.c:3120 [inline] xfrm_lookup_route+0xd1/0x1c0 net/xfrm/xfrm_policy.c:3313 ip6_dst_lookup_flow+0x13e/0x180 net/ipv6/ip6_output.c:1257 rawv6_sendmsg+0x1283/0x23c0 net/ipv6/raw.c:898 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x1a6/0x270 net/socket.c:745 ____sys_sendmsg+0x525/0x7d0 net/socket.c:2597 ___sys_sendmsg net/socket.c:2651 [inline] __sys_sendmsg+0x2b0/0x3a0 net/socket.c:2680 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 45: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 kasan_save_free_info+0x40/0x50 mm/kasan/generic.c:579 poison_slab_object+0xe0/0x150 mm/kasan/common.c:240 __kasan_slab_free+0x37/0x60 mm/kasan/common.c:256 kasan_slab_free include/linux/kasan.h:184 [inline] slab_free_hook mm/slub.c:2252 [inline] slab_free mm/slub.c:4473 [inline] kmem_cache_free+0x145/0x350 mm/slub.c:4548 dst_destroy+0x2ac/0x460 net/core/dst.c:124 rcu_do_batch kernel/rcu/tree.c:2569 [inline] rcu_core+0xafd/0x1830 kernel/rcu/tree.c:2843 handle_softirqs+0x2c4/0x970 kernel/softirq.c:554 __do_softirq kernel/softirq.c:588 [inline] invoke_softirq kernel/softirq.c:428 [inline] __irq_exit_rcu+0xf4/0x1c0 kernel/softirq.c:637 irq_exit_rcu+0x9/0x30 kernel/softirq.c:649 instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1043 [inline] sysvec_apic_timer_interrupt+0xa6/0xc0 arch/x86/kernel/apic/apic.c:1043 asm_sysvec_apic_timer_interrupt+0x1a/0x20 arch/x86/include/asm/idtentry.h:702 Last potentially related work creation: kasan_save_stack+0x3f/0x60 mm/kasan/common.c:47 __kasan_record_aux_stack+0xac/0xc0 mm/kasan/generic.c:541 __call_rcu_common kernel/rcu/tree.c:3106 [inline] call_rcu+0x167/0xa70 kernel/rcu/tree.c:3210 refdst_drop include/net/dst.h:263 [inline] skb_dst_drop include/net/dst.h:275 [inline] nf_ct_frag6_queue net/ipv6/netfilter/nf_conntrack_reasm.c:306 [inline] nf_ct_frag6_gather+0xb9a/0x2080 net/ipv6/netfilter/nf_conntrack_reasm.c:485 ipv6_defrag+0x2c8/0x3c0 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c:67 nf_hook_entry_hookfn include/linux/netfilter.h:154 [inline] nf_hook_slow+0xc3/0x220 net/netfilter/core.c:626 nf_hook include/linux/netfilter.h:269 [inline] __ip6_local_out+0x6fa/0x800 net/ipv6/output_core.c:143 ip6_local_out+0x26/0x70 net/ipv6/output_core.c:153 ip6_send_skb+0x112/0x230 net/ipv6/ip6_output.c:1959 rawv6_push_pending_frames+0x75c/0x9e0 net/ipv6/raw.c:588 rawv6_sendmsg+0x19c7/0x23c0 net/ipv6/raw.c:926 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x1a6/0x270 net/socket.c:745 sock_write_iter+0x2dd/0x400 net/socket.c:1160 do_iter_readv_writev+0x60a/0x890 Fixes: 0625491493d9 ("ipv6: ip6_push_pending_frames() should increment IPSTATS_MIB_OUTDISCARDS") Signed-off-by: Eric Dumazet Reported-by: syzbot Reviewed-by: David Ahern Link: https://patch.msgid.link/20240820160859.3786976-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_output.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index ab504d31f0cdd8..f7b53effc80f8b 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1956,6 +1956,7 @@ int ip6_send_skb(struct sk_buff *skb) struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); int err; + rcu_read_lock(); err = ip6_local_out(net, skb->sk, skb); if (err) { if (err > 0) @@ -1965,6 +1966,7 @@ int ip6_send_skb(struct sk_buff *skb) IPSTATS_MIB_OUTDISCARDS); } + rcu_read_unlock(); return err; } From da273b377ae0d9bd255281ed3c2adb228321687b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Aug 2024 16:08:58 +0000 Subject: [PATCH 579/991] ipv6: fix possible UAF in ip6_finish_output2() If skb_expand_head() returns NULL, skb has been freed and associated dst/idev could also have been freed. We need to hold rcu_read_lock() to make sure the dst and associated idev are alive. Fixes: 5796015fa968 ("ipv6: allocate enough headroom in ip6_finish_output2()") Signed-off-by: Eric Dumazet Cc: Vasily Averin Reviewed-by: David Ahern Link: https://patch.msgid.link/20240820160859.3786976-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_output.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index f7b53effc80f8b..1b9ebee7308f02 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -70,11 +70,15 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * /* Be paranoid, rather than too clever. */ if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) { + /* Make sure idev stays alive */ + rcu_read_lock(); skb = skb_expand_head(skb, hh_len); if (!skb) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); + rcu_read_unlock(); return -ENOMEM; } + rcu_read_unlock(); } hdr = ipv6_hdr(skb); From 2d5ff7e339d04622d8282661df36151906d0e1c7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Aug 2024 16:08:59 +0000 Subject: [PATCH 580/991] ipv6: prevent possible UAF in ip6_xmit() If skb_expand_head() returns NULL, skb has been freed and the associated dst/idev could also have been freed. We must use rcu_read_lock() to prevent a possible UAF. Fixes: 0c9f227bee11 ("ipv6: use skb_expand_head in ip6_xmit") Signed-off-by: Eric Dumazet Cc: Vasily Averin Reviewed-by: David Ahern Link: https://patch.msgid.link/20240820160859.3786976-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_output.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 1b9ebee7308f02..f26841f1490f5c 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -287,11 +287,15 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, head_room += opt->opt_nflen + opt->opt_flen; if (unlikely(head_room > skb_headroom(skb))) { + /* Make sure idev stays alive */ + rcu_read_lock(); skb = skb_expand_head(skb, head_room); if (!skb) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); + rcu_read_unlock(); return -ENOBUFS; } + rcu_read_unlock(); } if (opt) { From 8baeef7616d5194045c5a6b97fd1246b87c55b13 Mon Sep 17 00:00:00 2001 From: Somnath Kotur Date: Tue, 20 Aug 2024 13:34:15 -0700 Subject: [PATCH 581/991] bnxt_en: Fix double DMA unmapping for XDP_REDIRECT Remove the dma_unmap_page_attrs() call in the driver's XDP_REDIRECT code path. This should have been removed when we let the page pool handle the DMA mapping. This bug causes the warning: WARNING: CPU: 7 PID: 59 at drivers/iommu/dma-iommu.c:1198 iommu_dma_unmap_page+0xd5/0x100 CPU: 7 PID: 59 Comm: ksoftirqd/7 Tainted: G W 6.8.0-1010-gcp #11-Ubuntu Hardware name: Dell Inc. PowerEdge R7525/0PYVT1, BIOS 2.15.2 04/02/2024 RIP: 0010:iommu_dma_unmap_page+0xd5/0x100 Code: 89 ee 48 89 df e8 cb f2 69 ff 48 83 c4 08 5b 41 5c 41 5d 41 5e 41 5f 5d 31 c0 31 d2 31 c9 31 f6 31 ff 45 31 c0 e9 ab 17 71 00 <0f> 0b 48 83 c4 08 5b 41 5c 41 5d 41 5e 41 5f 5d 31 c0 31 d2 31 c9 RSP: 0018:ffffab1fc0597a48 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff99ff838280c8 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffffab1fc0597a78 R08: 0000000000000002 R09: ffffab1fc0597c1c R10: ffffab1fc0597cd3 R11: ffff99ffe375acd8 R12: 00000000e65b9000 R13: 0000000000000050 R14: 0000000000001000 R15: 0000000000000002 FS: 0000000000000000(0000) GS:ffff9a06efb80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000565c34c37210 CR3: 00000005c7e3e000 CR4: 0000000000350ef0 ? show_regs+0x6d/0x80 ? __warn+0x89/0x150 ? iommu_dma_unmap_page+0xd5/0x100 ? report_bug+0x16a/0x190 ? handle_bug+0x51/0xa0 ? exc_invalid_op+0x18/0x80 ? iommu_dma_unmap_page+0xd5/0x100 ? iommu_dma_unmap_page+0x35/0x100 dma_unmap_page_attrs+0x55/0x220 ? bpf_prog_4d7e87c0d30db711_xdp_dispatcher+0x64/0x9f bnxt_rx_xdp+0x237/0x520 [bnxt_en] bnxt_rx_pkt+0x640/0xdd0 [bnxt_en] __bnxt_poll_work+0x1a1/0x3d0 [bnxt_en] bnxt_poll+0xaa/0x1e0 [bnxt_en] __napi_poll+0x33/0x1e0 net_rx_action+0x18a/0x2f0 Fixes: 578fcfd26e2a ("bnxt_en: Let the page pool manage the DMA mapping") Reviewed-by: Andy Gospodarek Reviewed-by: Kalesh AP Signed-off-by: Somnath Kotur Signed-off-by: Michael Chan Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20240820203415.168178-1-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index 345681d5007e35..f88b641533fcc5 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -297,11 +297,6 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons, * redirect is coming from a frame received by the * bnxt_en driver. */ - rx_buf = &rxr->rx_buf_ring[cons]; - mapping = rx_buf->mapping - bp->rx_dma_offset; - dma_unmap_page_attrs(&pdev->dev, mapping, - BNXT_RX_PAGE_SIZE, bp->rx_dir, - DMA_ATTR_WEAK_ORDERING); /* if we are unable to allocate a new buffer, abort and reuse */ if (bnxt_alloc_rx_data(bp, rxr, rxr->rx_prod, GFP_ATOMIC)) { From 3f53d050416e88122d53aabbadb1fede998004da Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Aug 2024 12:22:23 -0400 Subject: [PATCH 582/991] bcachefs: bch2_data_update_init() cleanup Factor out some helpers - this function has gotten much too big. Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 179 +++++++++++++++++++++----------------- 1 file changed, 101 insertions(+), 78 deletions(-) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 1ca628e93e87f1..5f49f4953b19d0 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -20,6 +20,76 @@ #include "subvolume.h" #include "trace.h" +static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + bkey_for_each_ptr(ptrs, ptr) + bch2_dev_put(bch2_dev_have_ref(c, ptr->dev)); +} + +static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + bkey_for_each_ptr(ptrs, ptr) { + if (!bch2_dev_tryget(c, ptr->dev)) { + bkey_for_each_ptr(ptrs, ptr2) { + if (ptr2 == ptr) + break; + bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev)); + } + return false; + } + } + return true; +} + +static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); + + bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); + } +} + +static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); + + if (ctxt) { + bool locked; + + move_ctxt_wait_event(ctxt, + (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) || + list_empty(&ctxt->ios)); + + if (!locked) + bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0); + } else { + if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) { + bkey_for_each_ptr(ptrs, ptr2) { + if (ptr2 == ptr) + break; + + bucket = PTR_BUCKET_POS(ca, ptr2); + bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); + } + return false; + } + } + } + return true; +} + static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k) { if (trace_move_extent_finish_enabled()) { @@ -355,17 +425,11 @@ void bch2_data_update_read_done(struct data_update *m, void bch2_data_update_exit(struct data_update *update) { struct bch_fs *c = update->op.c; - struct bkey_ptrs_c ptrs = - bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k)); - - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - if (c->opts.nocow_enabled) - bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(ca, ptr), 0); - bch2_dev_put(ca); - } + struct bkey_s_c k = bkey_i_to_s_c(update->k.k); + if (c->opts.nocow_enabled) + bkey_nocow_unlock(c, k); + bkey_put_dev_refs(c, k); bch2_bkey_buf_exit(&update->k, c); bch2_disk_reservation_put(c, &update->op.res); bch2_bio_free_pages_pool(c, &update->op.wbio.bio); @@ -546,7 +610,6 @@ int bch2_data_update_init(struct btree_trans *trans, const union bch_extent_entry *entry; struct extent_ptr_decoded p; unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; - unsigned ptrs_locked = 0; int ret = 0; /* @@ -557,6 +620,15 @@ int bch2_data_update_init(struct btree_trans *trans, if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot))) return -BCH_ERR_data_update_done; + if (!bkey_get_dev_refs(c, k)) + return -BCH_ERR_data_update_done; + + if (c->opts.nocow_enabled && + !bkey_nocow_lock(c, ctxt, k)) { + bkey_put_dev_refs(c, k); + return -BCH_ERR_nocow_lock_blocked; + } + bch2_bkey_buf_init(&m->k); bch2_bkey_buf_reassemble(&m->k, c, k); m->btree_id = btree_id; @@ -578,40 +650,24 @@ int bch2_data_update_init(struct btree_trans *trans, m->op.compression_opt = background_compression(io_opts); m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; - bkey_for_each_ptr(ptrs, ptr) { - if (!bch2_dev_tryget(c, ptr->dev)) { - bkey_for_each_ptr(ptrs, ptr2) { - if (ptr2 == ptr) - break; - bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev)); - } - return -BCH_ERR_data_update_done; - } - } - unsigned durability_have = 0, durability_removing = 0; i = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bch_dev *ca = bch2_dev_have_ref(c, p.ptr.dev); - struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); - bool locked; - - rcu_read_lock(); - if (((1U << i) & m->data_opts.rewrite_ptrs)) { - BUG_ON(p.ptr.cached); - - if (crc_is_compressed(p.crc)) - reserve_sectors += k.k->size; - - m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p); - durability_removing += bch2_extent_ptr_desired_durability(c, &p); - } else if (!p.ptr.cached && - !((1U << i) & m->data_opts.kill_ptrs)) { - bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); - durability_have += bch2_extent_ptr_durability(c, &p); + if (!p.ptr.cached) { + rcu_read_lock(); + if (BIT(i) & m->data_opts.rewrite_ptrs) { + if (crc_is_compressed(p.crc)) + reserve_sectors += k.k->size; + + m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p); + durability_removing += bch2_extent_ptr_desired_durability(c, &p); + } else if (!(BIT(i) & m->data_opts.kill_ptrs)) { + bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); + durability_have += bch2_extent_ptr_durability(c, &p); + } + rcu_read_unlock(); } - rcu_read_unlock(); /* * op->csum_type is normally initialized from the fs/file's @@ -626,24 +682,6 @@ int bch2_data_update_init(struct btree_trans *trans, if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) m->op.incompressible = true; - if (c->opts.nocow_enabled) { - if (ctxt) { - move_ctxt_wait_event(ctxt, - (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, - bucket, 0)) || - list_empty(&ctxt->ios)); - - if (!locked) - bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0); - } else { - if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) { - ret = -BCH_ERR_nocow_lock_blocked; - goto err; - } - } - ptrs_locked |= (1U << i); - } - i++; } @@ -664,7 +702,7 @@ int bch2_data_update_init(struct btree_trans *trans, /* if iter == NULL, it's just a promote */ if (iter) ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts); - goto done; + goto out; } m->op.nr_replicas = min(durability_removing, durability_required) + @@ -684,8 +722,7 @@ int bch2_data_update_init(struct btree_trans *trans, bch2_data_update_to_text(&buf, m); WARN(1, "trying to move an extent, but nr_replicas=0\n%s", buf.buf); printbuf_exit(&buf); - ret = -BCH_ERR_data_update_done; - goto done; + goto out; } m->op.nr_replicas_required = m->op.nr_replicas; @@ -696,30 +733,16 @@ int bch2_data_update_init(struct btree_trans *trans, ? 0 : BCH_DISK_RESERVATION_NOFAIL); if (ret) - goto err; + goto out; } if (bkey_extent_is_unwritten(k)) { bch2_update_unwritten_extent(trans, m); - goto done; + goto out; } return 0; -err: - i = 0; - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bch_dev *ca = bch2_dev_have_ref(c, p.ptr.dev); - struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); - if ((1U << i) & ptrs_locked) - bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); - bch2_dev_put(ca); - i++; - } - - bch2_bkey_buf_exit(&m->k, c); - bch2_bio_free_pages_pool(c, &m->op.wbio.bio); - return ret; -done: +out: bch2_data_update_exit(m); return ret ?: -BCH_ERR_data_update_done; } From 8cc0e50614520c6c609c6ae32a65d0591b7865a1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Aug 2024 13:13:39 -0400 Subject: [PATCH 583/991] bcachefs: Fix "trying to move an extent, but nr_replicas=0" data_update_init() does a bunch of complicated stuff to decide how many replicas to add, since we only want to increase an extent's durability on an explicit rereplicate, but extent pointers may be on devices with different durability settings. There was a corner case when evacuating a device that had been set to durability=0 after data had been written to it, and extents on that device had already been rereplicated - then evacuate only needs to drop pointers on that device, not move them. So the assert for !m->op.nr_replicas was spurious; this was a perfectly legitimate case that needed to be handled. Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 5f49f4953b19d0..65176d51b502e6 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -695,16 +695,6 @@ int bch2_data_update_init(struct btree_trans *trans, * Increasing replication is an explicit operation triggered by * rereplicate, currently, so that users don't get an unexpected -ENOSPC */ - if (!(m->data_opts.write_flags & BCH_WRITE_CACHED) && - !durability_required) { - m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs; - m->data_opts.rewrite_ptrs = 0; - /* if iter == NULL, it's just a promote */ - if (iter) - ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts); - goto out; - } - m->op.nr_replicas = min(durability_removing, durability_required) + m->data_opts.extra_replicas; @@ -716,17 +706,22 @@ int bch2_data_update_init(struct btree_trans *trans, if (!(durability_have + durability_removing)) m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1); - if (!m->op.nr_replicas) { - struct printbuf buf = PRINTBUF; + m->op.nr_replicas_required = m->op.nr_replicas; - bch2_data_update_to_text(&buf, m); - WARN(1, "trying to move an extent, but nr_replicas=0\n%s", buf.buf); - printbuf_exit(&buf); + /* + * It might turn out that we don't need any new replicas, if the + * replicas or durability settings have been changed since the extent + * was written: + */ + if (!m->op.nr_replicas) { + m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs; + m->data_opts.rewrite_ptrs = 0; + /* if iter == NULL, it's just a promote */ + if (iter) + ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts); goto out; } - m->op.nr_replicas_required = m->op.nr_replicas; - if (reserve_sectors) { ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, m->data_opts.extra_replicas From 548e7f51679bf0ec3cdc2027d780c5d06a2a7ac6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Aug 2024 13:24:26 -0400 Subject: [PATCH 584/991] bcachefs: setting bcachefs_effective.* xattrs is a noop bcachefs_effective.* xattrs show the options inherited from parent directories (as well as explicitly set); this namespace is not for setting bcachefs options. Change the .set() handler to a noop so that if e.g. rsync is copying xattrs it'll do the right thing, and only copy xattrs in the bcachefs.* namespace. We don't want to return an error, because that will cause rsync to bail out or get spammy. Signed-off-by: Kent Overstreet --- fs/bcachefs/xattr.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index f2b4c17a0307df..331f944d73dc93 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -612,10 +612,20 @@ static int bch2_xattr_bcachefs_get_effective( name, buffer, size, true); } +/* Noop - xattrs in the bcachefs_effective namespace are inherited */ +static int bch2_xattr_bcachefs_set_effective(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *dentry, struct inode *vinode, + const char *name, const void *value, + size_t size, int flags) +{ + return 0; +} + static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { .prefix = "bcachefs_effective.", .get = bch2_xattr_bcachefs_get_effective, - .set = bch2_xattr_bcachefs_set, + .set = bch2_xattr_bcachefs_set_effective, }; #endif /* NO_BCACHEFS_FS */ From 49203a6b9d12bfd1a223a67847a631a78f1cd782 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Aug 2024 15:08:12 -0400 Subject: [PATCH 585/991] bcachefs: Fix failure to relock in btree_node_get() discovered by new trans->locked asserts Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index f5d85b50b6f2f7..cc778d7e769e75 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -974,6 +974,10 @@ static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btr bch2_btree_node_wait_on_read(b); + ret = bch2_trans_relock(trans); + if (ret) + return ERR_PTR(ret); + /* * should_be_locked is not set on this path yet, so we need to * relock it specifically: From e150a7e89c4727176d07f5a0a8966fc2af05821c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Aug 2024 20:18:34 -0400 Subject: [PATCH 586/991] bcachefs: Fix bch2_trigger_alloc assert On testing on an old mangled filesystem, we missed a case. Fixes: bd864bc2d907 ("bcachefs: Fix bch2_trigger_alloc when upgrading from old versions") Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index fd3a2522bc3ed3..488d0710f7b997 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -829,7 +829,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, if (likely(new.k->type == KEY_TYPE_alloc_v4)) { new_a = bkey_s_to_alloc_v4(new).v; } else { - BUG_ON(!(flags & BTREE_TRIGGER_gc)); + BUG_ON(!(flags & (BTREE_TRIGGER_gc|BTREE_TRIGGER_check_repair))); struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c); ret = PTR_ERR_OR_ZERO(new_ka); From c2a503f3e98e191d86738f5438a3a2b69575c830 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Aug 2024 20:38:49 -0400 Subject: [PATCH 587/991] bcachefs: Fix bch2_bucket_gens_init() Comparing the wrong bpos - this was missed because normally bucket_gens_init() runs on brand new filesystems, but this bug caused it to overwrite bucket_gens keys with 0s when upgrading ancient filesystems. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 488d0710f7b997..ac933142aedab9 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -556,7 +556,7 @@ int bch2_bucket_gens_init(struct bch_fs *c) struct bpos pos = alloc_gens_pos(iter.pos, &offset); int ret2 = 0; - if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) { + if (have_bucket_gens_key && !bkey_eq(g.k.p, pos)) { ret2 = bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); if (ret2) From b8db1bd8020d5fecb3bf46cd8b954a657c20ba14 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 19 Aug 2024 16:13:16 -0400 Subject: [PATCH 588/991] bcachefs: fix time_stats_to_text() Fixes: 7423330e30ab ("bcachefs: prt_printf() now respects \r\n\t") Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 138320eaa2ad39..1b8554460af47e 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -416,7 +416,6 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats printbuf_tabstop_push(out, TABSTOP_SIZE + 2); prt_printf(out, "\tsince mount\r\trecent\r\n"); - prt_printf(out, "recent"); printbuf_tabstops_reset(out); printbuf_tabstop_push(out, out->indent + 20); From cecc328240609df17395dfd0ea03cc813d8be36d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Aug 2024 11:25:39 -0400 Subject: [PATCH 589/991] bcachefs: fix missing bch2_err_str() Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 9138944c5ae698..267c2336b1155d 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2469,8 +2469,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino : bch2_inode_unpack(inode_k, &inode); if (ret) { /* Should have been caught in dirents pass */ - if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "error looking up parent directory: %i", ret); + bch_err_msg(c, ret, "error looking up parent directory"); break; } From 1dceae4cc12aa6389d9a8706f0d2a94d1679e79d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Aug 2024 12:10:33 -0400 Subject: [PATCH 590/991] bcachefs: unlock_long() before resort in journal replay Fix another SRCU splat - this one pretty harmless. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index d89eb43c5ce951..11368dfa96b282 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -322,6 +322,7 @@ int bch2_journal_replay(struct bch_fs *c) } } + bch2_trans_unlock_long(trans); /* * Now, replay any remaining keys in the order in which they appear in * the journal, unpinning those journal entries as we go: From 3c5d0b72a8e8c19c960e8fefb7463067e58b6bc4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 19 Aug 2024 15:22:55 -0400 Subject: [PATCH 591/991] bcachefs: fix failure to relock in bch2_btree_node_mem_alloc() We weren't always so strict about trans->locked state - but now we are, and new assertions are shaking some bugs out. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 17 +++++++++++ fs/bcachefs/btree_cache.h | 2 ++ fs/bcachefs/btree_update_interior.c | 46 ++++++++++++++++------------- 3 files changed, 44 insertions(+), 21 deletions(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index cc778d7e769e75..063725ecb2b388 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -159,6 +159,16 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) return b; } +void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b) +{ + mutex_lock(&c->btree_cache.lock); + list_move(&b->list, &c->btree_cache.freeable); + mutex_unlock(&c->btree_cache.lock); + + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); +} + /* Btree in memory cache - hash table */ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) @@ -736,6 +746,13 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea start_time); memalloc_nofs_restore(flags); + + int ret = bch2_trans_relock(trans); + if (unlikely(ret)) { + bch2_btree_node_to_freelist(c, b); + return ERR_PTR(ret); + } + return b; err: mutex_lock(&bc->lock); diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index c0eb87a057ccb9..f8206400712720 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -12,6 +12,8 @@ struct btree_iter; void bch2_recalc_btree_reserve(struct bch_fs *); +void bch2_btree_node_to_freelist(struct bch_fs *, struct btree *); + void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index b3454d4619e8fb..8fd112026e7a34 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -317,6 +317,12 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, : 0; int ret; + b = bch2_btree_node_mem_alloc(trans, interior_node); + if (IS_ERR(b)) + return b; + + BUG_ON(b->ob.nr); + mutex_lock(&c->btree_reserve_cache_lock); if (c->btree_reserve_cache_nr > nr_reserve) { struct btree_alloc *a = @@ -325,10 +331,9 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, obs = a->ob; bkey_copy(&tmp.k, &a->k); mutex_unlock(&c->btree_reserve_cache_lock); - goto mem_alloc; + goto out; } mutex_unlock(&c->btree_reserve_cache_lock); - retry: ret = bch2_alloc_sectors_start_trans(trans, c->opts.metadata_target ?: @@ -341,7 +346,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, c->opts.metadata_replicas_required), watermark, 0, cl, &wp); if (unlikely(ret)) - return ERR_PTR(ret); + goto err; if (wp->sectors_free < btree_sectors(c)) { struct open_bucket *ob; @@ -360,19 +365,16 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, bch2_open_bucket_get(c, wp, &obs); bch2_alloc_sectors_done(c, wp); -mem_alloc: - b = bch2_btree_node_mem_alloc(trans, interior_node); - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - - /* we hold cannibalize_lock: */ - BUG_ON(IS_ERR(b)); - BUG_ON(b->ob.nr); - +out: bkey_copy(&b->key, &tmp.k); b->ob = obs; + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); return b; +err: + bch2_btree_node_to_freelist(c, b); + return ERR_PTR(ret); } static struct btree *bch2_btree_node_alloc(struct btree_update *as, @@ -2439,6 +2441,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite } new_hash = bch2_btree_node_mem_alloc(trans, false); + ret = PTR_ERR_OR_ZERO(new_hash); + if (ret) + goto err; } path->intent_ref++; @@ -2446,14 +2451,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite commit_flags, skip_triggers); --path->intent_ref; - if (new_hash) { - mutex_lock(&c->btree_cache.lock); - list_move(&new_hash->list, &c->btree_cache.freeable); - mutex_unlock(&c->btree_cache.lock); - - six_unlock_write(&new_hash->c.lock); - six_unlock_intent(&new_hash->c.lock); - } + if (new_hash) + bch2_btree_node_to_freelist(c, new_hash); +err: closure_sync(&cl); bch2_btree_cache_cannibalize_unlock(trans); return ret; @@ -2522,6 +2522,10 @@ int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id b = bch2_btree_node_mem_alloc(trans, false); bch2_btree_cache_cannibalize_unlock(trans); + ret = PTR_ERR_OR_ZERO(b); + if (ret) + return ret; + set_btree_node_fake(b); set_btree_node_need_rewrite(b); b->c.level = level; @@ -2553,7 +2557,7 @@ int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level) { - bch2_trans_run(c, bch2_btree_root_alloc_fake_trans(trans, id, level)); + bch2_trans_run(c, lockrestart_do(trans, bch2_btree_root_alloc_fake_trans(trans, id, level))); } static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as) From 5dbfc4ef72f15508882aff58c307b8425cf037a8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Aug 2024 15:04:15 -0400 Subject: [PATCH 592/991] bcachefs: fix failure to relock in btree_node_fill() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 063725ecb2b388..e52a06d3418ccd 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -873,6 +873,10 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, bch2_btree_node_read(trans, b, sync); + int ret = bch2_trans_relock(trans); + if (ret) + return ERR_PTR(ret); + if (!sync) return NULL; From 6575b8c9877c3dd1f7db1d0d61bd250a0bf18b6d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Aug 2024 19:31:20 -0400 Subject: [PATCH 593/991] bcachefs: Fix locking in bch2_ioc_setlabel() Fixes: 7a254053a590 ("bcachefs: support FS_IOC_SETFSLABEL") Reported-by: syzbot+7e9efdfec27fbde0141d@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-ioctl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index aea8132d2c40e5..99c7fe987c74ff 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -328,9 +328,8 @@ static int bch2_ioc_setlabel(struct bch_fs *c, mutex_lock(&c->sb_lock); strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE); - mutex_unlock(&c->sb_lock); - ret = bch2_write_super(c); + mutex_unlock(&c->sb_lock); mnt_drop_write_file(file); return ret; From cab18be6957b6af8cbe3502fd5f6d7b9f02ccceb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Aug 2024 20:49:07 -0400 Subject: [PATCH 594/991] bcachefs: Fix replay_now_at() assert Journal replay, in the slowpath where we insert keys in journal order, was inserting keys in the wrong order; keys from early repair come last. Reported-by: syzbot+2c4fcb257ce2b6a29d0e@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 11368dfa96b282..36de1c6fe8c36e 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -241,7 +241,13 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) const struct journal_key *l = *((const struct journal_key **)_l); const struct journal_key *r = *((const struct journal_key **)_r); - return cmp_int(l->journal_seq, r->journal_seq); + /* + * Map 0 to U64_MAX, so that keys with journal_seq === 0 come last + * + * journal_seq == 0 means that the key comes from early repair, and + * should be inserted last so as to avoid overflowing the journal + */ + return cmp_int(l->journal_seq - 1, r->journal_seq - 1); } int bch2_journal_replay(struct bch_fs *c) From bdbdd4759f081ca2d0a5d9e8af21d742ffaf8439 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Aug 2024 21:10:45 -0400 Subject: [PATCH 595/991] bcachefs: Fix missing validation in bch2_sb_journal_v2_validate() Reported-by: syzbot+47ecc948aadfb2ab3efc@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_sb.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c index db80e506e3abee..62b910f2fb27cd 100644 --- a/fs/bcachefs/journal_sb.c +++ b/fs/bcachefs/journal_sb.c @@ -104,6 +104,7 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); int ret = -BCH_ERR_invalid_sb_journal; + u64 sum = 0; unsigned nr; unsigned i; struct u64_range *b; @@ -119,6 +120,15 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f for (i = 0; i < nr; i++) { b[i].start = le64_to_cpu(journal->d[i].start); b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr); + + if (b[i].end <= b[i].start) { + prt_printf(err, "journal buckets entry with bad nr: %llu+%llu", + le64_to_cpu(journal->d[i].start), + le64_to_cpu(journal->d[i].nr)); + goto err; + } + + sum += le64_to_cpu(journal->d[i].nr); } sort(b, nr, sizeof(*b), u64_range_cmp, NULL); @@ -148,6 +158,11 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f } } + if (sum > UINT_MAX) { + prt_printf(err, "too many journal buckets: %llu > %u", sum, UINT_MAX); + goto err; + } + ret = 0; err: kfree(b); From 06f67437ab356e3140f51aea272d33ce28421f66 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Aug 2024 22:06:44 -0400 Subject: [PATCH 596/991] fs/super.c: improve get_tree() error message seeing an odd bug where we fail to correctly return an error from .get_tree(): https://syzkaller.appspot.com/bug?extid=c0360e8367d6d8d04a66 we need to be able to distinguish between accidently returning a positive error (as implied by the log) and no error. Cc: David Howells Signed-off-by: Kent Overstreet --- fs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/super.c b/fs/super.c index 38d72a3cf6fcf8..b7913b55debc1f 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1802,8 +1802,8 @@ int vfs_get_tree(struct fs_context *fc) return error; if (!fc->root) { - pr_err("Filesystem %s get_tree() didn't set fc->root\n", - fc->fs_type->name); + pr_err("Filesystem %s get_tree() didn't set fc->root, returned %i\n", + fc->fs_type->name, error); /* We don't know what the locking state of the superblock is - * if there is a superblock. */ From 7f2de6947f92cfa4be8e5eaa1237e962bb8ee65f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Aug 2024 22:27:45 -0400 Subject: [PATCH 597/991] bcachefs: Fix warning in bch2_fs_journal_stop() j->last_empty_seq needs to match j->seq when the journal is empty Reported-by: syzbot+4093905737cf289b6b38@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 649e3a01608af4..f5f7db50ca310c 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1260,7 +1260,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) } if (!had_entries) - j->last_empty_seq = cur_seq; + j->last_empty_seq = cur_seq - 1; /* to match j->seq */ spin_lock(&j->lock); From 8ed823b19214e403ca485532f48c0e02035021ae Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Aug 2024 22:57:56 -0400 Subject: [PATCH 598/991] bcachefs: Fix compat issue with old alloc_v4 keys we allow new fields to be added to existing key types, and new versions should treat them as being zeroed; this was not handled in alloc_v4_validate. Reported-by: syzbot+3b2968fa4953885dd66a@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 50 ++++++++++++++------------- fs/bcachefs/alloc_background_format.h | 1 + fs/bcachefs/btree_iter.h | 9 +++++ 3 files changed, 36 insertions(+), 24 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index ac933142aedab9..dc97b1f8bc084a 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -240,71 +240,73 @@ int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k, int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, enum bch_validate_flags flags) { - struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); + struct bch_alloc_v4 a; int ret = 0; - bkey_fsck_err_on(alloc_v4_u64s_noerror(a.v) > bkey_val_u64s(k.k), + bkey_val_copy(&a, bkey_s_c_to_alloc_v4(k)); + + bkey_fsck_err_on(alloc_v4_u64s_noerror(&a) > bkey_val_u64s(k.k), c, alloc_v4_val_size_bad, "bad val size (%u > %zu)", - alloc_v4_u64s_noerror(a.v), bkey_val_u64s(k.k)); + alloc_v4_u64s_noerror(&a), bkey_val_u64s(k.k)); - bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) && - BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), + bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(&a) && + BCH_ALLOC_V4_NR_BACKPOINTERS(&a), c, alloc_v4_backpointers_start_bad, "invalid backpointers_start"); - bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, + bkey_fsck_err_on(alloc_data_type(a, a.data_type) != a.data_type, c, alloc_key_data_type_bad, "invalid data type (got %u should be %u)", - a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); + a.data_type, alloc_data_type(a, a.data_type)); for (unsigned i = 0; i < 2; i++) - bkey_fsck_err_on(a.v->io_time[i] > LRU_TIME_MAX, + bkey_fsck_err_on(a.io_time[i] > LRU_TIME_MAX, c, alloc_key_io_time_bad, "invalid io_time[%s]: %llu, max %llu", i == READ ? "read" : "write", - a.v->io_time[i], LRU_TIME_MAX); + a.io_time[i], LRU_TIME_MAX); - unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(a.v) * sizeof(u64) > + unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(&a) * sizeof(u64) > offsetof(struct bch_alloc_v4, stripe_sectors) - ? a.v->stripe_sectors + ? a.stripe_sectors : 0; - switch (a.v->data_type) { + switch (a.data_type) { case BCH_DATA_free: case BCH_DATA_need_gc_gens: case BCH_DATA_need_discard: bkey_fsck_err_on(stripe_sectors || - a.v->dirty_sectors || - a.v->cached_sectors || - a.v->stripe, + a.dirty_sectors || + a.cached_sectors || + a.stripe, c, alloc_key_empty_but_have_data, "empty data type free but have data %u.%u.%u %u", stripe_sectors, - a.v->dirty_sectors, - a.v->cached_sectors, - a.v->stripe); + a.dirty_sectors, + a.cached_sectors, + a.stripe); break; case BCH_DATA_sb: case BCH_DATA_journal: case BCH_DATA_btree: case BCH_DATA_user: case BCH_DATA_parity: - bkey_fsck_err_on(!a.v->dirty_sectors && + bkey_fsck_err_on(!a.dirty_sectors && !stripe_sectors, c, alloc_key_dirty_sectors_0, "data_type %s but dirty_sectors==0", - bch2_data_type_str(a.v->data_type)); + bch2_data_type_str(a.data_type)); break; case BCH_DATA_cached: - bkey_fsck_err_on(!a.v->cached_sectors || - a.v->dirty_sectors || + bkey_fsck_err_on(!a.cached_sectors || + a.dirty_sectors || stripe_sectors || - a.v->stripe, + a.stripe, c, alloc_key_cached_inconsistency, "data type inconsistency"); - bkey_fsck_err_on(!a.v->io_time[READ] && + bkey_fsck_err_on(!a.io_time[READ] && c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, c, alloc_key_cached_but_read_time_zero, "cached bucket with read_time == 0"); diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h index 47d9d006502cb1..f754a2951d8aab 100644 --- a/fs/bcachefs/alloc_background_format.h +++ b/fs/bcachefs/alloc_background_format.h @@ -69,6 +69,7 @@ struct bch_alloc_v4 { __u64 io_time[2]; __u32 stripe; __u32 nr_external_backpointers; + /* end of fields in original version of alloc_v4 */ __u64 fragmentation_lru; __u32 stripe_sectors; __u32 pad; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index dca62375d7d30f..222b7ce8a901e3 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -569,6 +569,15 @@ static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans, bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \ _btree_id, _pos, _flags, KEY_TYPE_##_type)) +#define bkey_val_copy(_dst_v, _src_k) \ +do { \ + unsigned b = min_t(unsigned, sizeof(*_dst_v), \ + bkey_val_bytes(_src_k.k)); \ + memcpy(_dst_v, _src_k.v, b); \ + if (b < sizeof(*_dst_v)) \ + memset((void *) (_dst_v) + b, 0, sizeof(*_dst_v) - b); \ +} while (0) + static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, unsigned btree_id, struct bpos pos, unsigned flags, unsigned type, From 0b50b7313ef2494926df30ce8e2ce284f1b847fc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Aug 2024 23:21:52 -0400 Subject: [PATCH 599/991] bcachefs: Fix refcounting in discard path bch_dev->io_ref does not protect against the filesystem going away; bch_fs->writes does. Thus the filesystem write ref needs to be the last ref we release. Reported-by: syzbot+9e0404b505e604f67e41@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index dc97b1f8bc084a..ba46f1c1d78aad 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1874,26 +1874,26 @@ static void bch2_do_discards_work(struct work_struct *work) trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); - bch2_write_ref_put(c, BCH_WRITE_REF_discard); percpu_ref_put(&ca->io_ref); + bch2_write_ref_put(c, BCH_WRITE_REF_discard); } void bch2_dev_do_discards(struct bch_dev *ca) { struct bch_fs *c = ca->fs; - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard)) return; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard)) - goto put_ioref; + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + goto put_write_ref; if (queue_work(c->write_ref_wq, &ca->discard_work)) return; - bch2_write_ref_put(c, BCH_WRITE_REF_discard); -put_ioref: percpu_ref_put(&ca->io_ref); +put_write_ref: + bch2_write_ref_put(c, BCH_WRITE_REF_discard); } void bch2_do_discards(struct bch_fs *c) From dedb2fe37574857c84e9598b9f5272505dedf7af Mon Sep 17 00:00:00 2001 From: Yuesong Li Date: Thu, 22 Aug 2024 14:21:58 +0800 Subject: [PATCH 600/991] bcachefs: Fix double assignment in check_dirent_to_subvol() ret was assigned twice in check_dirent_to_subvol(). Reported by cocci. Signed-off-by: Yuesong Li Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 267c2336b1155d..6801c37ee803dc 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2006,7 +2006,6 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * if (ret) { bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum); ret = -BCH_ERR_fsck_repair_unimplemented; - ret = 0; goto err; } From 87313ac1f134d6ee1e7c858da8bdea9147b537a9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 Aug 2024 02:13:02 -0400 Subject: [PATCH 601/991] bcachefs: clear path->should_be_locked in bch2_btree_key_cache_drop() bch2_btree_key_cache_drop() evicts the key cache entry - it's used when we're doing an update that bypasses the key cache, because for cache coherency reasons a key can't be in the key cache unless it also exists in the btree - i.e. creates have to bypass the cache. After evicting, the path no longer points to a key cache key, and relock() will always fail if should_be_locked is true. Prep for improving path->should_be_locked assertions Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 79954490627cca..9b3ec2a3b8cecf 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -726,6 +726,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + path->should_be_locked = false; } static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, From 9b82ff1362f50914c8292902e07be98a9f59d33d Mon Sep 17 00:00:00 2001 From: Kailang Yang Date: Thu, 22 Aug 2024 10:54:19 +0800 Subject: [PATCH 602/991] ALSA: hda/realtek - Fixed ALC256 headphone no sound Dell platform, plug headphone or headset, it had a chance to get no sound from headphone. Replace depop procedure will solve this issue. Signed-off-by: Kailang Yang Link: https://lore.kernel.org/bb8e2de30d294dc287944efa0667685a@realtek.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 50 ++++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 13 deletions(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 4eafbcb40120c2..bf098c6fedb510 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -4930,6 +4930,30 @@ static void alc269_fixup_hp_line1_mic1_led(struct hda_codec *codec, } } +static void alc_hp_mute_disable(struct hda_codec *codec, unsigned int delay) +{ + if (delay <= 0) + delay = 75; + snd_hda_codec_write(codec, 0x21, 0, + AC_VERB_SET_AMP_GAIN_MUTE, AMP_OUT_MUTE); + msleep(delay); + snd_hda_codec_write(codec, 0x21, 0, + AC_VERB_SET_PIN_WIDGET_CONTROL, 0x0); + msleep(delay); +} + +static void alc_hp_enable_unmute(struct hda_codec *codec, unsigned int delay) +{ + if (delay <= 0) + delay = 75; + snd_hda_codec_write(codec, 0x21, 0, + AC_VERB_SET_PIN_WIDGET_CONTROL, PIN_OUT); + msleep(delay); + snd_hda_codec_write(codec, 0x21, 0, + AC_VERB_SET_AMP_GAIN_MUTE, AMP_OUT_UNMUTE); + msleep(delay); +} + static const struct coef_fw alc225_pre_hsmode[] = { UPDATE_COEF(0x4a, 1<<8, 0), UPDATE_COEFEX(0x57, 0x05, 1<<14, 0), @@ -5031,6 +5055,7 @@ static void alc_headset_mode_unplugged(struct hda_codec *codec) case 0x10ec0236: case 0x10ec0256: case 0x19e58326: + alc_hp_mute_disable(codec, 75); alc_process_coef_fw(codec, coef0256); break; case 0x10ec0234: @@ -5302,6 +5327,7 @@ static void alc_headset_mode_default(struct hda_codec *codec) alc_write_coef_idx(codec, 0x45, 0xc089); msleep(50); alc_process_coef_fw(codec, coef0256); + alc_hp_enable_unmute(codec, 75); break; case 0x10ec0234: case 0x10ec0274: @@ -5399,6 +5425,7 @@ static void alc_headset_mode_ctia(struct hda_codec *codec) case 0x10ec0256: case 0x19e58326: alc_process_coef_fw(codec, coef0256); + alc_hp_enable_unmute(codec, 75); break; case 0x10ec0234: case 0x10ec0274: @@ -5514,6 +5541,7 @@ static void alc_headset_mode_omtp(struct hda_codec *codec) case 0x10ec0256: case 0x19e58326: alc_process_coef_fw(codec, coef0256); + alc_hp_enable_unmute(codec, 75); break; case 0x10ec0234: case 0x10ec0274: @@ -5619,25 +5647,21 @@ static void alc_determine_headset_type(struct hda_codec *codec) alc_write_coef_idx(codec, 0x06, 0x6104); alc_write_coefex_idx(codec, 0x57, 0x3, 0x09a3); - snd_hda_codec_write(codec, 0x21, 0, - AC_VERB_SET_AMP_GAIN_MUTE, AMP_OUT_MUTE); - msleep(80); - snd_hda_codec_write(codec, 0x21, 0, - AC_VERB_SET_PIN_WIDGET_CONTROL, 0x0); - alc_process_coef_fw(codec, coef0255); msleep(300); val = alc_read_coef_idx(codec, 0x46); is_ctia = (val & 0x0070) == 0x0070; - + if (!is_ctia) { + alc_write_coef_idx(codec, 0x45, 0xe089); + msleep(100); + val = alc_read_coef_idx(codec, 0x46); + if ((val & 0x0070) == 0x0070) + is_ctia = false; + else + is_ctia = true; + } alc_write_coefex_idx(codec, 0x57, 0x3, 0x0da3); alc_update_coefex_idx(codec, 0x57, 0x5, 1<<14, 0); - - snd_hda_codec_write(codec, 0x21, 0, - AC_VERB_SET_PIN_WIDGET_CONTROL, PIN_OUT); - msleep(80); - snd_hda_codec_write(codec, 0x21, 0, - AC_VERB_SET_AMP_GAIN_MUTE, AMP_OUT_UNMUTE); break; case 0x10ec0234: case 0x10ec0274: From 1d8c3c23a6bc1527e253b305b4b68c03d833b824 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 22 Aug 2024 07:17:09 +0000 Subject: [PATCH 603/991] KVM: arm64: Ensure canonical IPA is hugepage-aligned when handling fault Zenghui reports that VMs backed by hugetlb pages are no longer booting after commit fd276e71d1e7 ("KVM: arm64: nv: Handle shadow stage 2 page faults"). Support for shadow stage-2 MMUs introduced the concept of a fault IPA and canonical IPA to stage-2 fault handling. These are identical in the non-nested case, as the hardware stage-2 context is always that of the canonical IPA space. Both addresses need to be hugepage-aligned when preparing to install a hugepage mapping to ensure that KVM uses the correct GFN->PFN translation and installs that at the correct IPA for the current stage-2. And now I'm feeling thirsty after all this talk of IPAs... Fixes: fd276e71d1e7 ("KVM: arm64: nv: Handle shadow stage 2 page faults") Reported-by: Zenghui Yu Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240822071710.2291690-1-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/mmu.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 6981b1bc094686..a509b63bd4dd50 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1540,8 +1540,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, vma_pagesize = min(vma_pagesize, (long)max_map_size); } - if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) + /* + * Both the canonical IPA and fault IPA must be hugepage-aligned to + * ensure we find the right PFN and lay down the mapping in the right + * place. + */ + if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) { fault_ipa &= ~(vma_pagesize - 1); + ipa &= ~(vma_pagesize - 1); + } gfn = ipa >> PAGE_SHIFT; mte_allowed = kvm_vma_mte_allowed(vma); From 6e95097b6bb20f0021180b150f41ad9962dcdcc9 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Tue, 13 Aug 2024 13:44:47 +0300 Subject: [PATCH 604/991] MAINTAINERS: Mark UVC gadget driver as orphan I haven't had time to maintain the UVC gadget driver for a long while. Dan Scally confirmed he is also in a similar -ENOTIME situation with no short term hope of fixing that. Being listed as maintainers doesn't help progress, so mark the driver as orphan to reflect the current state. Signed-off-by: Laurent Pinchart Acked-by: Daniel Scally Link: https://lore.kernel.org/r/20240813104447.25821-1-laurent.pinchart@ideasonboard.com Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index f328373463b0d2..2b193c9a44ee24 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23821,10 +23821,8 @@ F: drivers/media/usb/uvc/ F: include/uapi/linux/uvcvideo.h USB WEBCAM GADGET -M: Laurent Pinchart -M: Daniel Scally L: linux-usb@vger.kernel.org -S: Maintained +S: Orphan F: drivers/usb/gadget/function/*uvc* F: drivers/usb/gadget/legacy/webcam.c F: include/uapi/linux/usb/g_uvc.h From 3e6245ebe7ef341639e9a7e402b3ade8ad45a19f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 20 Aug 2024 11:03:38 +0100 Subject: [PATCH 605/991] KVM: arm64: Make ICC_*SGI*_EL1 undef in the absence of a vGICv3 On a system with a GICv3, if a guest hasn't been configured with GICv3 and that the host is not capable of GICv2 emulation, a write to any of the ICC_*SGI*_EL1 registers is trapped to EL2. We therefore try to emulate the SGI access, only to hit a NULL pointer as no private interrupt is allocated (no GIC, remember?). The obvious fix is to give the guest what it deserves, in the shape of a UNDEF exception. Reported-by: Alexander Potapenko Signed-off-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240820100349.3544850-2-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/sys_regs.c | 6 ++++++ arch/arm64/kvm/vgic/vgic.h | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index c90324060436b2..31e49da867ffc3 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -33,6 +33,7 @@ #include #include "sys_regs.h" +#include "vgic/vgic.h" #include "trace.h" @@ -435,6 +436,11 @@ static bool access_gic_sgi(struct kvm_vcpu *vcpu, { bool g1; + if (!kvm_has_gicv3(vcpu->kvm)) { + kvm_inject_undefined(vcpu); + return false; + } + if (!p->is_write) return read_from_write_only(vcpu, p, r); diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index ba8f790431bd3a..8532bfe3fed40c 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -346,4 +346,11 @@ void vgic_v4_configure_vsgis(struct kvm *kvm); void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val); int vgic_v4_request_vpe_irq(struct kvm_vcpu *vcpu, int irq); +static inline bool kvm_has_gicv3(struct kvm *kvm) +{ + return (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) && + irqchip_in_kernel(kvm) && + kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3); +} + #endif From 14e497183df28c006603cc67fd3797a537eef7b9 Mon Sep 17 00:00:00 2001 From: Selvarasu Ganesan Date: Thu, 15 Aug 2024 12:18:31 +0530 Subject: [PATCH 606/991] usb: dwc3: core: Prevent USB core invalid event buffer address access This commit addresses an issue where the USB core could access an invalid event buffer address during runtime suspend, potentially causing SMMU faults and other memory issues in Exynos platforms. The problem arises from the following sequence. 1. In dwc3_gadget_suspend, there is a chance of a timeout when moving the USB core to the halt state after clearing the run/stop bit by software. 2. In dwc3_core_exit, the event buffer is cleared regardless of the USB core's status, which may lead to an SMMU faults and other memory issues. if the USB core tries to access the event buffer address. To prevent this hardware quirk on Exynos platforms, this commit ensures that the event buffer address is not cleared by software when the USB core is active during runtime suspend by checking its status before clearing the buffer address. Cc: stable Signed-off-by: Selvarasu Ganesan Acked-by: Thinh Nguyen Link: https://lore.kernel.org/r/20240815064836.1491-1-selvarasu.g@samsung.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/core.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c index 734de2a8bd212a..ccc3895dbd7f9f 100644 --- a/drivers/usb/dwc3/core.c +++ b/drivers/usb/dwc3/core.c @@ -564,9 +564,17 @@ int dwc3_event_buffers_setup(struct dwc3 *dwc) void dwc3_event_buffers_cleanup(struct dwc3 *dwc) { struct dwc3_event_buffer *evt; + u32 reg; if (!dwc->ev_buf) return; + /* + * Exynos platforms may not be able to access event buffer if the + * controller failed to halt on dwc3_core_exit(). + */ + reg = dwc3_readl(dwc->regs, DWC3_DSTS); + if (!(reg & DWC3_DSTS_DEVCTRLHLT)) + return; evt = dwc->ev_buf; From 0497a356d3c498221eb0c1edc1e8985816092f12 Mon Sep 17 00:00:00 2001 From: Pawel Laszczak Date: Tue, 20 Aug 2024 08:21:19 +0000 Subject: [PATCH 607/991] usb: cdnsp: fix incorrect index in cdnsp_get_hw_deq function Patch fixes the incorrect "stream_id" table index instead of "ep_index" used in cdnsp_get_hw_deq function. Fixes: 3d82904559f4 ("usb: cdnsp: cdns3 Add main part of Cadence USBSSP DRD Driver") cc: stable@vger.kernel.org Signed-off-by: Pawel Laszczak Reviewed-by: Peter Chen Link: https://lore.kernel.org/r/PH7PR07MB95381F2182688811D5C711CEDD8D2@PH7PR07MB9538.namprd07.prod.outlook.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/cdns3/cdnsp-ring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/cdns3/cdnsp-ring.c b/drivers/usb/cdns3/cdnsp-ring.c index 02f297f5637d75..a60c0cb991cd1b 100644 --- a/drivers/usb/cdns3/cdnsp-ring.c +++ b/drivers/usb/cdns3/cdnsp-ring.c @@ -402,7 +402,7 @@ static u64 cdnsp_get_hw_deq(struct cdnsp_device *pdev, struct cdnsp_stream_ctx *st_ctx; struct cdnsp_ep *pep; - pep = &pdev->eps[stream_id]; + pep = &pdev->eps[ep_index]; if (pep->ep_state & EP_HAS_STREAMS) { st_ctx = &pep->stream_info.stream_ctx_array[stream_id]; From 0b00583ecacb0b51712a5ecd34cf7e6684307c67 Mon Sep 17 00:00:00 2001 From: Ian Ray Date: Wed, 14 Aug 2024 10:29:05 +0300 Subject: [PATCH 608/991] cdc-acm: Add DISABLE_ECHO quirk for GE HealthCare UI Controller USB_DEVICE(0x1901, 0x0006) may send data before cdc_acm is ready, which may be misinterpreted in the default N_TTY line discipline. Signed-off-by: Ian Ray Acked-by: Oliver Neuku Cc: stable Link: https://lore.kernel.org/r/20240814072905.2501-1-ian.ray@gehealthcare.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-acm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index 0e7439dba8fe8c..0c1b69d944ca45 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -1761,6 +1761,9 @@ static const struct usb_device_id acm_ids[] = { { USB_DEVICE(0x11ca, 0x0201), /* VeriFone Mx870 Gadget Serial */ .driver_info = SINGLE_RX_URB, }, + { USB_DEVICE(0x1901, 0x0006), /* GE Healthcare Patient Monitor UI Controller */ + .driver_info = DISABLE_ECHO, /* DISABLE ECHO in termios flag */ + }, { USB_DEVICE(0x1965, 0x0018), /* Uniden UBC125XLT */ .driver_info = NO_UNION_NORMAL, /* has no union descriptor */ }, From b52a07e07dead777517af3cbda851bb2cc157c9d Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Wed, 14 Aug 2024 19:25:37 +0800 Subject: [PATCH 609/991] usb: gadget: uvc: queue pump work in uvcg_video_enable() Since commit "6acba0345b68 usb:gadget:uvc Do not use worker thread to pump isoc usb requests", pump work could only be queued in uvc_video_complete() and uvc_v4l2_qbuf(). If VIDIOC_QBUF is executed before VIDIOC_STREAMON, we can only depend on uvc_video_complete() to queue pump work. However, this requires some free requests in req_ready list. If req_ready list is empty all the time, pump work will never be queued and video datas will never be pumped to usb controller. Actually, this situation could happen when run uvc-gadget with static image: $ ./uvc-gadget -i 1080p.jpg uvc.0 When capture image from this device, the user app will always block there. The issue is uvc driver has queued video buffer before streamon, but the req_ready list is empty all the time after streamon. This will queue pump work in uvcg_video_enable() to fill some request to req_ready list so the uvc device could work properly. Fixes: 6acba0345b68 ("usb:gadget:uvc Do not use worker thread to pump isoc usb requests") Cc: stable@vger.kernel.org Signed-off-by: Xu Yang Link: https://lore.kernel.org/r/20240814112537.2608949-1-xu.yang_2@nxp.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/uvc_video.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/gadget/function/uvc_video.c b/drivers/usb/gadget/function/uvc_video.c index d41f5f31dadd58..a9edd60fbbf779 100644 --- a/drivers/usb/gadget/function/uvc_video.c +++ b/drivers/usb/gadget/function/uvc_video.c @@ -753,6 +753,7 @@ int uvcg_video_enable(struct uvc_video *video) video->req_int_count = 0; uvc_video_ep_queue_initial_requests(video); + queue_work(video->async_wq, &video->pump); return ret; } From 5b235693ed2a1e4963625717a1598becf97759cc Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Thu, 15 Aug 2024 13:31:31 +0200 Subject: [PATCH 610/991] dt-bindings: usb: microchip,usb2514: Fix reference USB device schema An USB hub is not a HCD, but an USB device. Fix the referenced schema accordingly. Fixes: bfbf2e4b77e2 ("dt-bindings: usb: Document the Microchip USB2514 hub") Cc: stable@vger.kernel.org Reviewed-by: Krzysztof Kozlowski Signed-off-by: Alexander Stein Link: https://lore.kernel.org/r/20240815113132.372542-1-alexander.stein@ew.tq-group.com Signed-off-by: Greg Kroah-Hartman --- .../devicetree/bindings/usb/microchip,usb2514.yaml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/usb/microchip,usb2514.yaml b/Documentation/devicetree/bindings/usb/microchip,usb2514.yaml index 245e8c3ce66993..b14e6f37b2987c 100644 --- a/Documentation/devicetree/bindings/usb/microchip,usb2514.yaml +++ b/Documentation/devicetree/bindings/usb/microchip,usb2514.yaml @@ -10,7 +10,7 @@ maintainers: - Fabio Estevam allOf: - - $ref: usb-hcd.yaml# + - $ref: usb-device.yaml# properties: compatible: @@ -36,6 +36,13 @@ required: - compatible - reg +patternProperties: + "^.*@[0-9a-f]{1,2}$": + description: The hard wired USB devices + type: object + $ref: /schemas/usb/usb-device.yaml + additionalProperties: true + unevaluatedProperties: false examples: From 2aa765a43817ec8add990f83c8e54a9a5d87aa9c Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 16 Aug 2024 09:54:08 +0200 Subject: [PATCH 611/991] usb: dwc3: omap: add missing depopulate in probe error path Depopulate device in probe error paths to fix leak of children resources. Fixes: ee249b455494 ("usb: dwc3: omap: remove IRQ_NOAUTOEN used with shared irq") Cc: stable@vger.kernel.org Acked-by: Thinh Nguyen Signed-off-by: Krzysztof Kozlowski Reviewed-by: Radhey Shyam Pandey Link: https://lore.kernel.org/r/20240816075409.23080-1-krzysztof.kozlowski@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-omap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/dwc3-omap.c b/drivers/usb/dwc3/dwc3-omap.c index d5c77db4daa920..2a11fc0ee84f1e 100644 --- a/drivers/usb/dwc3/dwc3-omap.c +++ b/drivers/usb/dwc3/dwc3-omap.c @@ -522,11 +522,13 @@ static int dwc3_omap_probe(struct platform_device *pdev) if (ret) { dev_err(dev, "failed to request IRQ #%d --> %d\n", omap->irq, ret); - goto err1; + goto err2; } dwc3_omap_enable_irqs(omap); return 0; +err2: + of_platform_depopulate(dev); err1: pm_runtime_put_sync(dev); pm_runtime_disable(dev); From 16f2a21d9d7e48e1af02654fe3d926c0ce6cb3e5 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 16 Aug 2024 09:54:09 +0200 Subject: [PATCH 612/991] usb: dwc3: xilinx: add missing depopulate in probe error path Depopulate device in probe error paths to fix leak of children resources. Fixes: 53b5ff83d893 ("usb: dwc3: xilinx: improve error handling for PM APIs") Cc: stable@vger.kernel.org Reviewed-by: Radhey Shyam Pandey Signed-off-by: Krzysztof Kozlowski Acked-by: Thinh Nguyen Link: https://lore.kernel.org/r/20240816075409.23080-2-krzysztof.kozlowski@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-xilinx.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/dwc3-xilinx.c b/drivers/usb/dwc3/dwc3-xilinx.c index bb4d894c16e949..f1298b1b4f8491 100644 --- a/drivers/usb/dwc3/dwc3-xilinx.c +++ b/drivers/usb/dwc3/dwc3-xilinx.c @@ -327,9 +327,14 @@ static int dwc3_xlnx_probe(struct platform_device *pdev) goto err_pm_set_suspended; pm_suspend_ignore_children(dev, false); - return pm_runtime_resume_and_get(dev); + ret = pm_runtime_resume_and_get(dev); + if (ret < 0) + goto err_pm_set_suspended; + + return 0; err_pm_set_suspended: + of_platform_depopulate(dev); pm_runtime_set_suspended(dev); err_clk_put: From 4f83cae0edb2b13aabb82e8a4852092844d320aa Mon Sep 17 00:00:00 2001 From: Luca Weiss Date: Sun, 18 Aug 2024 22:21:01 +0200 Subject: [PATCH 613/991] usb: typec: fsa4480: Relax CHIP_ID check Some FSA4480-compatible chips like the OCP96011 used on Fairphone 5 return 0x00 from the CHIP_ID register. Handle that gracefully and only fail probe when the I2C read has failed. With this the dev_dbg will print 0 but otherwise continue working. [ 0.251581] fsa4480 1-0042: Found FSA4480 v0.0 (Vendor ID = 0) Cc: stable@vger.kernel.org Fixes: e885f5f1f2b4 ("usb: typec: fsa4480: Check if the chip is really there") Signed-off-by: Luca Weiss Reviewed-by: Heikki Krogerus Reviewed-by: Neil Armstrong Link: https://lore.kernel.org/r/20240818-fsa4480-chipid-fix-v1-1-17c239435cf7@fairphone.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/mux/fsa4480.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/typec/mux/fsa4480.c b/drivers/usb/typec/mux/fsa4480.c index cd235339834b0b..f71dba8bf07c98 100644 --- a/drivers/usb/typec/mux/fsa4480.c +++ b/drivers/usb/typec/mux/fsa4480.c @@ -274,7 +274,7 @@ static int fsa4480_probe(struct i2c_client *client) return dev_err_probe(dev, PTR_ERR(fsa->regmap), "failed to initialize regmap\n"); ret = regmap_read(fsa->regmap, FSA4480_DEVICE_ID, &val); - if (ret || !val) + if (ret) return dev_err_probe(dev, -ENODEV, "FSA4480 not found\n"); dev_dbg(dev, "Found FSA4480 v%lu.%lu (Vendor ID = %lu)\n", From 3a8839bbb86da7968a792123ed2296d063871a52 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Tue, 20 Aug 2024 19:01:27 +0800 Subject: [PATCH 614/991] usb: core: sysfs: Unmerge @usb3_hardware_lpm_attr_group in remove_power_attributes() Device attribute group @usb3_hardware_lpm_attr_group is merged by add_power_attributes(), but it is not unmerged explicitly, fixed by unmerging it in remove_power_attributes(). Fixes: 655fe4effe0f ("usbcore: add sysfs support to xHCI usb3 hardware LPM") Cc: stable@vger.kernel.org Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20240820-sysfs_fix-v2-1-a9441487077e@quicinc.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/sysfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/core/sysfs.c b/drivers/usb/core/sysfs.c index d83231d6736ac6..61b6d978892c79 100644 --- a/drivers/usb/core/sysfs.c +++ b/drivers/usb/core/sysfs.c @@ -670,6 +670,7 @@ static int add_power_attributes(struct device *dev) static void remove_power_attributes(struct device *dev) { + sysfs_unmerge_group(&dev->kobj, &usb3_hardware_lpm_attr_group); sysfs_unmerge_group(&dev->kobj, &usb2_hardware_lpm_attr_group); sysfs_unmerge_group(&dev->kobj, &power_attr_group); } From 72fca8371f205d654f95b09cd023a71fd5307041 Mon Sep 17 00:00:00 2001 From: Michael Grzeschik Date: Thu, 15 Aug 2024 08:40:29 +0200 Subject: [PATCH 615/991] usb: dwc3: ep0: Don't reset resource alloc flag (including ep0) The DWC3_EP_RESOURCE_ALLOCATED flag ensures that the resource of an endpoint is only assigned once. Unless the endpoint is reset, don't clear this flag. Otherwise we may set endpoint resource again, which prevents the driver from initiate transfer after handling a STALL or endpoint halt to the control endpoint. Commit f2e0eee47038 ("usb: dwc3: ep0: Don't reset resource alloc flag") was fixing the initial issue, but did this only for physical ep1. Since the function dwc3_ep0_stall_and_restart is resetting the flags for both physical endpoints, this also has to be done for ep0. Cc: stable@vger.kernel.org Fixes: b311048c174d ("usb: dwc3: gadget: Rewrite endpoint allocation flow") Acked-by: Thinh Nguyen Signed-off-by: Michael Grzeschik Link: https://lore.kernel.org/r/20240814-dwc3hwep0reset-v2-1-29e1d7d923ea@pengutronix.de Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/ep0.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/ep0.c b/drivers/usb/dwc3/ep0.c index d96ffbe520397a..c9533a99e47c89 100644 --- a/drivers/usb/dwc3/ep0.c +++ b/drivers/usb/dwc3/ep0.c @@ -232,7 +232,8 @@ void dwc3_ep0_stall_and_restart(struct dwc3 *dwc) /* stall is always issued on EP0 */ dep = dwc->eps[0]; __dwc3_gadget_ep_set_halt(dep, 1, false); - dep->flags = DWC3_EP_ENABLED; + dep->flags &= DWC3_EP_RESOURCE_ALLOCATED; + dep->flags |= DWC3_EP_ENABLED; dwc->delayed_status = false; if (!list_empty(&dep->pending_list)) { From ddfcfeba891064b88bb844208b43bef2ef970f0c Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 14 Aug 2024 11:39:56 +0200 Subject: [PATCH 616/991] usb: dwc3: st: fix probed platform device ref count on probe error path The probe function never performs any paltform device allocation, thus error path "undo_platform_dev_alloc" is entirely bogus. It drops the reference count from the platform device being probed. If error path is triggered, this will lead to unbalanced device reference counts and premature release of device resources, thus possible use-after-free when releasing remaining devm-managed resources. Fixes: f83fca0707c6 ("usb: dwc3: add ST dwc3 glue layer to manage dwc3 HC") Cc: stable@vger.kernel.org Signed-off-by: Krzysztof Kozlowski Acked-by: Thinh Nguyen Reviewed-by: Patrice Chotard Link: https://lore.kernel.org/r/20240814093957.37940-1-krzysztof.kozlowski@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-st.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/usb/dwc3/dwc3-st.c b/drivers/usb/dwc3/dwc3-st.c index 211360eee95a0f..a9cb04043f08ed 100644 --- a/drivers/usb/dwc3/dwc3-st.c +++ b/drivers/usb/dwc3/dwc3-st.c @@ -219,10 +219,8 @@ static int st_dwc3_probe(struct platform_device *pdev) dwc3_data->regmap = regmap; res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "syscfg-reg"); - if (!res) { - ret = -ENXIO; - goto undo_platform_dev_alloc; - } + if (!res) + return -ENXIO; dwc3_data->syscfg_reg_off = res->start; @@ -233,8 +231,7 @@ static int st_dwc3_probe(struct platform_device *pdev) devm_reset_control_get_exclusive(dev, "powerdown"); if (IS_ERR(dwc3_data->rstc_pwrdn)) { dev_err(&pdev->dev, "could not get power controller\n"); - ret = PTR_ERR(dwc3_data->rstc_pwrdn); - goto undo_platform_dev_alloc; + return PTR_ERR(dwc3_data->rstc_pwrdn); } /* Manage PowerDown */ @@ -300,8 +297,6 @@ static int st_dwc3_probe(struct platform_device *pdev) reset_control_assert(dwc3_data->rstc_rst); undo_powerdown: reset_control_assert(dwc3_data->rstc_pwrdn); -undo_platform_dev_alloc: - platform_device_put(pdev); return ret; } From cd4897bfd14f6a5388b21ba45a066541a0425199 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 14 Aug 2024 11:39:57 +0200 Subject: [PATCH 617/991] usb: dwc3: st: add missing depopulate in probe error path Depopulate device in probe error paths to fix leak of children resources. Fixes: f83fca0707c6 ("usb: dwc3: add ST dwc3 glue layer to manage dwc3 HC") Cc: stable@vger.kernel.org Signed-off-by: Krzysztof Kozlowski Reviewed-by: Patrice Chotard Acked-by: Thinh Nguyen Link: https://lore.kernel.org/r/20240814093957.37940-2-krzysztof.kozlowski@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-st.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/dwc3-st.c b/drivers/usb/dwc3/dwc3-st.c index a9cb04043f08ed..c8c7cd0c179693 100644 --- a/drivers/usb/dwc3/dwc3-st.c +++ b/drivers/usb/dwc3/dwc3-st.c @@ -266,7 +266,7 @@ static int st_dwc3_probe(struct platform_device *pdev) if (!child_pdev) { dev_err(dev, "failed to find dwc3 core device\n"); ret = -ENODEV; - goto err_node_put; + goto depopulate; } dwc3_data->dr_mode = usb_get_dr_mode(&child_pdev->dev); @@ -282,6 +282,7 @@ static int st_dwc3_probe(struct platform_device *pdev) ret = st_dwc3_drd_init(dwc3_data); if (ret) { dev_err(dev, "drd initialisation failed\n"); + of_platform_depopulate(dev); goto undo_softreset; } @@ -291,6 +292,8 @@ static int st_dwc3_probe(struct platform_device *pdev) platform_set_drvdata(pdev, dwc3_data); return 0; +depopulate: + of_platform_depopulate(dev); err_node_put: of_node_put(child); undo_softreset: From 6ea14ccb60c8ab829349979b22b58a941ec4a3ee Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 13 Aug 2024 12:39:46 +0200 Subject: [PATCH 618/991] netfilter: flowtable: validate vlan header Ensure there is sufficient room to access the protocol field of the VLAN header, validate it once before the flowtable lookup. ===================================================== BUG: KMSAN: uninit-value in nf_flow_offload_inet_hook+0x45a/0x5f0 net/netfilter/nf_flow_table_inet.c:32 nf_flow_offload_inet_hook+0x45a/0x5f0 net/netfilter/nf_flow_table_inet.c:32 nf_hook_entry_hookfn include/linux/netfilter.h:154 [inline] nf_hook_slow+0xf4/0x400 net/netfilter/core.c:626 nf_hook_ingress include/linux/netfilter_netdev.h:34 [inline] nf_ingress net/core/dev.c:5440 [inline] Fixes: 4cd91f7c290f ("netfilter: flowtable: add vlan support") Reported-by: syzbot+8407d9bb88cd4c6bf61a@syzkaller.appspotmail.com Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_inet.c | 3 +++ net/netfilter/nf_flow_table_ip.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c index 88787b45e30d6b..8b541a08034206 100644 --- a/net/netfilter/nf_flow_table_inet.c +++ b/net/netfilter/nf_flow_table_inet.c @@ -17,6 +17,9 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb, switch (skb->protocol) { case htons(ETH_P_8021Q): + if (!pskb_may_pull(skb, skb_mac_offset(skb) + sizeof(*veth))) + return NF_ACCEPT; + veth = (struct vlan_ethhdr *)skb_mac_header(skb); proto = veth->h_vlan_encapsulated_proto; break; diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index c2c005234dcd38..98edcaa37b38d4 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -281,6 +281,9 @@ static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, switch (skb->protocol) { case htons(ETH_P_8021Q): + if (!pskb_may_pull(skb, skb_mac_offset(skb) + sizeof(*veth))) + return false; + veth = (struct vlan_ethhdr *)skb_mac_header(skb); if (veth->h_vlan_encapsulated_proto == proto) { *offset += VLAN_HLEN; From 1fa7b099d60ad64f559bd3b8e3f0d94b2e015514 Mon Sep 17 00:00:00 2001 From: Kailang Yang Date: Thu, 22 Aug 2024 16:46:56 +0800 Subject: [PATCH 619/991] ALSA: hda/realtek - FIxed ALC285 headphone no sound Dell platform with ALC215 ALC285 ALC289 ALC225 ALC295 ALC299, plug headphone or headset. It had a chance to get no sound from headphone. Replace depop procedure will solve this issue. Signed-off-by: Kailang Yang Link: https://lore.kernel.org/d0de1b03fd174520945dde216d765223@realtek.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index bf098c6fedb510..b5cc3417138c74 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -5090,6 +5090,7 @@ static void alc_headset_mode_unplugged(struct hda_codec *codec) case 0x10ec0295: case 0x10ec0289: case 0x10ec0299: + alc_hp_mute_disable(codec, 75); alc_process_coef_fw(codec, alc225_pre_hsmode); alc_process_coef_fw(codec, coef0225); break; @@ -5315,6 +5316,7 @@ static void alc_headset_mode_default(struct hda_codec *codec) case 0x10ec0299: alc_process_coef_fw(codec, alc225_pre_hsmode); alc_process_coef_fw(codec, coef0225); + alc_hp_enable_unmute(codec, 75); break; case 0x10ec0255: alc_process_coef_fw(codec, coef0255); @@ -5474,6 +5476,7 @@ static void alc_headset_mode_ctia(struct hda_codec *codec) alc_process_coef_fw(codec, coef0225_2); else alc_process_coef_fw(codec, coef0225_1); + alc_hp_enable_unmute(codec, 75); break; case 0x10ec0867: alc_update_coefex_idx(codec, 0x57, 0x5, 1<<14, 0); @@ -5579,6 +5582,7 @@ static void alc_headset_mode_omtp(struct hda_codec *codec) case 0x10ec0289: case 0x10ec0299: alc_process_coef_fw(codec, coef0225); + alc_hp_enable_unmute(codec, 75); break; } codec_dbg(codec, "Headset jack set to Nokia-style headset mode.\n"); @@ -5738,12 +5742,6 @@ static void alc_determine_headset_type(struct hda_codec *codec) case 0x10ec0295: case 0x10ec0289: case 0x10ec0299: - snd_hda_codec_write(codec, 0x21, 0, - AC_VERB_SET_AMP_GAIN_MUTE, AMP_OUT_MUTE); - msleep(80); - snd_hda_codec_write(codec, 0x21, 0, - AC_VERB_SET_PIN_WIDGET_CONTROL, 0x0); - alc_process_coef_fw(codec, alc225_pre_hsmode); alc_update_coef_idx(codec, 0x67, 0xf000, 0x1000); val = alc_read_coef_idx(codec, 0x45); @@ -5760,15 +5758,19 @@ static void alc_determine_headset_type(struct hda_codec *codec) val = alc_read_coef_idx(codec, 0x46); is_ctia = (val & 0x00f0) == 0x00f0; } + if (!is_ctia) { + alc_update_coef_idx(codec, 0x45, 0x3f<<10, 0x38<<10); + alc_update_coef_idx(codec, 0x49, 3<<8, 1<<8); + msleep(100); + val = alc_read_coef_idx(codec, 0x46); + if ((val & 0x00f0) == 0x00f0) + is_ctia = false; + else + is_ctia = true; + } alc_update_coef_idx(codec, 0x4a, 7<<6, 7<<6); alc_update_coef_idx(codec, 0x4a, 3<<4, 3<<4); alc_update_coef_idx(codec, 0x67, 0xf000, 0x3000); - - snd_hda_codec_write(codec, 0x21, 0, - AC_VERB_SET_PIN_WIDGET_CONTROL, PIN_OUT); - msleep(80); - snd_hda_codec_write(codec, 0x21, 0, - AC_VERB_SET_AMP_GAIN_MUTE, AMP_OUT_UNMUTE); break; case 0x10ec0867: is_ctia = true; From f2916c83d746eb99f50f42c15cf4c47c2ea5f3b3 Mon Sep 17 00:00:00 2001 From: Mengyuan Lou Date: Tue, 20 Aug 2024 11:04:25 +0800 Subject: [PATCH 620/991] net: ngbe: Fix phy mode set to external phy The MAC only has add the TX delay and it can not be modified. MAC and PHY are both set the TX delay cause transmission problems. So just disable TX delay in PHY, when use rgmii to attach to external phy, set PHY_INTERFACE_MODE_RGMII_RXID to phy drivers. And it is does not matter to internal phy. Fixes: bc2426d74aa3 ("net: ngbe: convert phylib to phylink") Signed-off-by: Mengyuan Lou Cc: stable@vger.kernel.org # 6.3+ Reviewed-by: Jacob Keller Link: https://patch.msgid.link/E6759CF1387CF84C+20240820030425.93003-1-mengyuanlou@net-swift.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/wangxun/ngbe/ngbe_mdio.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/wangxun/ngbe/ngbe_mdio.c b/drivers/net/ethernet/wangxun/ngbe/ngbe_mdio.c index ec54b18c5fe73b..a5e9b779c44d01 100644 --- a/drivers/net/ethernet/wangxun/ngbe/ngbe_mdio.c +++ b/drivers/net/ethernet/wangxun/ngbe/ngbe_mdio.c @@ -124,8 +124,12 @@ static int ngbe_phylink_init(struct wx *wx) MAC_SYM_PAUSE | MAC_ASYM_PAUSE; config->mac_managed_pm = true; - phy_mode = PHY_INTERFACE_MODE_RGMII_ID; - __set_bit(PHY_INTERFACE_MODE_RGMII_ID, config->supported_interfaces); + /* The MAC only has add the Tx delay and it can not be modified. + * So just disable TX delay in PHY, and it is does not matter to + * internal phy. + */ + phy_mode = PHY_INTERFACE_MODE_RGMII_RXID; + __set_bit(PHY_INTERFACE_MODE_RGMII_RXID, config->supported_interfaces); phylink = phylink_create(config, NULL, phy_mode, &ngbe_mac_ops); if (IS_ERR(phylink)) From a2f5c505b4378cd6fc7c4a44ff3665ccef2037db Mon Sep 17 00:00:00 2001 From: Sava Jakovljev Date: Wed, 21 Aug 2024 04:16:57 +0200 Subject: [PATCH 621/991] net: phy: realtek: Fix setting of PHY LEDs Mode B bit on RTL8211F The current implementation incorrectly sets the mode bit of the PHY chip. Bit 15 (RTL8211F_LEDCR_MODE) should not be shifted together with the configuration nibble of a LED- it should be set independently of the index of the LED being configured. As a consequence, the RTL8211F LED control is actually operating in Mode A. Fix the error by or-ing final register value to write with a const-value of RTL8211F_LEDCR_MODE, thus setting Mode bit explicitly. Fixes: 17784801d888 ("net: phy: realtek: Add support for PHY LEDs on RTL8211F") Signed-off-by: Sava Jakovljev Reviewed-by: Marek Vasut Link: https://patch.msgid.link/PAWP192MB21287372F30C4E55B6DF6158C38E2@PAWP192MB2128.EURP192.PROD.OUTLOOK.COM Signed-off-by: Paolo Abeni --- drivers/net/phy/realtek.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/realtek.c b/drivers/net/phy/realtek.c index 87865918dab6d0..25e5bfbb6f89b8 100644 --- a/drivers/net/phy/realtek.c +++ b/drivers/net/phy/realtek.c @@ -555,7 +555,7 @@ static int rtl8211f_led_hw_control_set(struct phy_device *phydev, u8 index, unsigned long rules) { const u16 mask = RTL8211F_LEDCR_MASK << (RTL8211F_LEDCR_SHIFT * index); - u16 reg = RTL8211F_LEDCR_MODE; /* Mode B */ + u16 reg = 0; if (index >= RTL8211F_LED_COUNT) return -EINVAL; @@ -575,6 +575,7 @@ static int rtl8211f_led_hw_control_set(struct phy_device *phydev, u8 index, } reg <<= RTL8211F_LEDCR_SHIFT * index; + reg |= RTL8211F_LEDCR_MODE; /* Mode B */ return phy_modify_paged(phydev, 0xd04, RTL8211F_LEDCR, mask, reg); } From af688a99eb1fc7ef69774665d61e6be51cea627a Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Wed, 21 Aug 2024 12:35:58 +0530 Subject: [PATCH 622/991] octeontx2-af: Fix CPT AF register offset calculation Some CPT AF registers are per LF and others are global. Translation of PF/VF local LF slot number to actual LF slot number is required only for accessing perf LF registers. CPT AF global registers access do not require any LF slot number. Also, there is no reason CPT PF/VF to know actual lf's register offset. Without this fix microcode loading will fail, VFs cannot be created and hardware is not usable. Fixes: bc35e28af789 ("octeontx2-af: replace cpt slot with lf id on reg write") Signed-off-by: Bharat Bhushan Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240821070558.1020101-1-bbhushan2@marvell.com Signed-off-by: Paolo Abeni --- .../ethernet/marvell/octeontx2/af/rvu_cpt.c | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cpt.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cpt.c index 3e09d228581470..daf4b951e90591 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cpt.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cpt.c @@ -632,7 +632,9 @@ int rvu_mbox_handler_cpt_inline_ipsec_cfg(struct rvu *rvu, return ret; } -static bool is_valid_offset(struct rvu *rvu, struct cpt_rd_wr_reg_msg *req) +static bool validate_and_update_reg_offset(struct rvu *rvu, + struct cpt_rd_wr_reg_msg *req, + u64 *reg_offset) { u64 offset = req->reg_offset; int blkaddr, num_lfs, lf; @@ -663,6 +665,11 @@ static bool is_valid_offset(struct rvu *rvu, struct cpt_rd_wr_reg_msg *req) if (lf < 0) return false; + /* Translate local LF's offset to global CPT LF's offset to + * access LFX register. + */ + *reg_offset = (req->reg_offset & 0xFF000) + (lf << 3); + return true; } else if (!(req->hdr.pcifunc & RVU_PFVF_FUNC_MASK)) { /* Registers that can be accessed from PF */ @@ -697,7 +704,7 @@ int rvu_mbox_handler_cpt_rd_wr_register(struct rvu *rvu, struct cpt_rd_wr_reg_msg *rsp) { u64 offset = req->reg_offset; - int blkaddr, lf; + int blkaddr; blkaddr = validate_and_get_cpt_blkaddr(req->blkaddr); if (blkaddr < 0) @@ -708,18 +715,10 @@ int rvu_mbox_handler_cpt_rd_wr_register(struct rvu *rvu, !is_cpt_vf(rvu, req->hdr.pcifunc)) return CPT_AF_ERR_ACCESS_DENIED; - if (!is_valid_offset(rvu, req)) + if (!validate_and_update_reg_offset(rvu, req, &offset)) return CPT_AF_ERR_ACCESS_DENIED; - /* Translate local LF used by VFs to global CPT LF */ - lf = rvu_get_lf(rvu, &rvu->hw->block[blkaddr], req->hdr.pcifunc, - (offset & 0xFFF) >> 3); - - /* Translate local LF's offset to global CPT LF's offset */ - offset &= 0xFF000; - offset += lf << 3; - - rsp->reg_offset = offset; + rsp->reg_offset = req->reg_offset; rsp->ret_val = req->ret_val; rsp->is_write = req->is_write; From 9a8fc292dd93b93db30e01c94c0da4c944852f28 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 22 Aug 2024 14:30:53 +0300 Subject: [PATCH 623/991] spi: pxa2xx: Do not override dev->platform_data on probe The platform_data field may be supplied by legacy board code. In other cases we override it, and module remove and probe cycle will crash the kernel since it will carry a stale pointer. Fix this by supplying a third argument to the pxa2xx_spi_probe() and avoid overriding dev->platform_data. Reported-by: Hao Ma Fixes: cc160697a576 ("spi: pxa2xx: Convert PCI driver to use spi-pxa2xx code directly") Fixes: 3d8f037fbcab ("spi: pxa2xx: Move platform driver to a separate file") Fixes: 20ade9b9771c ("spi: pxa2xx: Extract pxa2xx_spi_platform_*() callbacks") Signed-off-by: Andy Shevchenko Link: https://patch.msgid.link/20240822113408.750831-2-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- drivers/spi/spi-pxa2xx-pci.c | 2 +- drivers/spi/spi-pxa2xx-platform.c | 6 ++---- drivers/spi/spi-pxa2xx.c | 5 ++--- drivers/spi/spi-pxa2xx.h | 3 ++- 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/spi/spi-pxa2xx-pci.c b/drivers/spi/spi-pxa2xx-pci.c index 616d032f1a89a2..c98bb214b6ae02 100644 --- a/drivers/spi/spi-pxa2xx-pci.c +++ b/drivers/spi/spi-pxa2xx-pci.c @@ -297,7 +297,7 @@ static int pxa2xx_spi_pci_probe(struct pci_dev *dev, return ret; ssp->irq = pci_irq_vector(dev, 0); - return pxa2xx_spi_probe(&dev->dev, ssp); + return pxa2xx_spi_probe(&dev->dev, ssp, pdata); } static void pxa2xx_spi_pci_remove(struct pci_dev *dev) diff --git a/drivers/spi/spi-pxa2xx-platform.c b/drivers/spi/spi-pxa2xx-platform.c index 98a8ceb7db6feb..f9504cddc7ba78 100644 --- a/drivers/spi/spi-pxa2xx-platform.c +++ b/drivers/spi/spi-pxa2xx-platform.c @@ -63,7 +63,7 @@ static struct ssp_device *pxa2xx_spi_ssp_request(struct platform_device *pdev) ssp = pxa_ssp_request(pdev->id, pdev->name); if (!ssp) - return ssp; + return NULL; status = devm_add_action_or_reset(&pdev->dev, pxa2xx_spi_ssp_release, ssp); if (status) @@ -148,8 +148,6 @@ static int pxa2xx_spi_platform_probe(struct platform_device *pdev) platform_info = pxa2xx_spi_init_pdata(pdev); if (IS_ERR(platform_info)) return dev_err_probe(dev, PTR_ERR(platform_info), "missing platform data\n"); - - dev->platform_data = platform_info; } ssp = pxa2xx_spi_ssp_request(pdev); @@ -158,7 +156,7 @@ static int pxa2xx_spi_platform_probe(struct platform_device *pdev) if (!ssp) ssp = &platform_info->ssp; - return pxa2xx_spi_probe(dev, ssp); + return pxa2xx_spi_probe(dev, ssp, platform_info); } static void pxa2xx_spi_platform_remove(struct platform_device *pdev) diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c index 16b96eb176cd9d..e3a95adc527939 100644 --- a/drivers/spi/spi-pxa2xx.c +++ b/drivers/spi/spi-pxa2xx.c @@ -1277,16 +1277,15 @@ static size_t pxa2xx_spi_max_dma_transfer_size(struct spi_device *spi) return MAX_DMA_LEN; } -int pxa2xx_spi_probe(struct device *dev, struct ssp_device *ssp) +int pxa2xx_spi_probe(struct device *dev, struct ssp_device *ssp, + struct pxa2xx_spi_controller *platform_info) { - struct pxa2xx_spi_controller *platform_info; struct spi_controller *controller; struct driver_data *drv_data; const struct lpss_config *config; int status; u32 tmp; - platform_info = dev_get_platdata(dev); if (platform_info->is_target) controller = devm_spi_alloc_target(dev, sizeof(*drv_data)); else diff --git a/drivers/spi/spi-pxa2xx.h b/drivers/spi/spi-pxa2xx.h index a470d3d634d34d..447be036938483 100644 --- a/drivers/spi/spi-pxa2xx.h +++ b/drivers/spi/spi-pxa2xx.h @@ -132,7 +132,8 @@ extern void pxa2xx_spi_dma_stop(struct driver_data *drv_data); extern int pxa2xx_spi_dma_setup(struct driver_data *drv_data); extern void pxa2xx_spi_dma_release(struct driver_data *drv_data); -int pxa2xx_spi_probe(struct device *dev, struct ssp_device *ssp); +int pxa2xx_spi_probe(struct device *dev, struct ssp_device *ssp, + struct pxa2xx_spi_controller *platform_info); void pxa2xx_spi_remove(struct device *dev); extern const struct dev_pm_ops pxa2xx_spi_pm_ops; From e17465f78eb92ebb4be17e35d6c0584406f643a0 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 22 Aug 2024 14:30:54 +0300 Subject: [PATCH 624/991] spi: pxa2xx: Move PM runtime handling to the glue drivers PCI and platform buses have different defaults for runtime PM. In particular PCI probe is assumed to be called when PM runtime is enabled by the PCI core. In this case if we try enable it again the PM runtime complaints with pxa2xx_spi_pci 0000:00:07.0: Unbalanced pm_runtime_enable! Fix this by moving PM runtime handling from the SPI PXA2xx core to the glue drivers. Fixes: cc160697a576 ("spi: pxa2xx: Convert PCI driver to use spi-pxa2xx code directly") Fixes: 3d8f037fbcab ("spi: pxa2xx: Move platform driver to a separate file") Fixes: 20ade9b9771c ("spi: pxa2xx: Extract pxa2xx_spi_platform_*() callbacks") Signed-off-by: Andy Shevchenko Link: https://patch.msgid.link/20240822113408.750831-3-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- drivers/spi/spi-pxa2xx-pci.c | 15 ++++++++++++++- drivers/spi/spi-pxa2xx-platform.c | 22 ++++++++++++++++++++-- drivers/spi/spi-pxa2xx.c | 15 +-------------- 3 files changed, 35 insertions(+), 17 deletions(-) diff --git a/drivers/spi/spi-pxa2xx-pci.c b/drivers/spi/spi-pxa2xx-pci.c index c98bb214b6ae02..cc8dcf782399e9 100644 --- a/drivers/spi/spi-pxa2xx-pci.c +++ b/drivers/spi/spi-pxa2xx-pci.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -297,11 +298,23 @@ static int pxa2xx_spi_pci_probe(struct pci_dev *dev, return ret; ssp->irq = pci_irq_vector(dev, 0); - return pxa2xx_spi_probe(&dev->dev, ssp, pdata); + ret = pxa2xx_spi_probe(&dev->dev, ssp, pdata); + if (ret) + return ret; + + pm_runtime_set_autosuspend_delay(&dev->dev, 50); + pm_runtime_use_autosuspend(&dev->dev); + pm_runtime_put_autosuspend(&dev->dev); + pm_runtime_allow(&dev->dev); + + return 0; } static void pxa2xx_spi_pci_remove(struct pci_dev *dev) { + pm_runtime_forbid(&dev->dev); + pm_runtime_get_noresume(&dev->dev); + pxa2xx_spi_remove(&dev->dev); } diff --git a/drivers/spi/spi-pxa2xx-platform.c b/drivers/spi/spi-pxa2xx-platform.c index f9504cddc7ba78..595af9fa4e0f89 100644 --- a/drivers/spi/spi-pxa2xx-platform.c +++ b/drivers/spi/spi-pxa2xx-platform.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -142,6 +143,7 @@ static int pxa2xx_spi_platform_probe(struct platform_device *pdev) struct pxa2xx_spi_controller *platform_info; struct device *dev = &pdev->dev; struct ssp_device *ssp; + int ret; platform_info = dev_get_platdata(dev); if (!platform_info) { @@ -156,12 +158,28 @@ static int pxa2xx_spi_platform_probe(struct platform_device *pdev) if (!ssp) ssp = &platform_info->ssp; - return pxa2xx_spi_probe(dev, ssp, platform_info); + pm_runtime_set_autosuspend_delay(dev, 50); + pm_runtime_use_autosuspend(dev); + pm_runtime_set_active(dev); + pm_runtime_enable(dev); + + ret = pxa2xx_spi_probe(dev, ssp, platform_info); + if (ret) + pm_runtime_disable(dev); + + return ret; } static void pxa2xx_spi_platform_remove(struct platform_device *pdev) { - pxa2xx_spi_remove(&pdev->dev); + struct device *dev = &pdev->dev; + + pm_runtime_get_sync(dev); + + pxa2xx_spi_remove(dev); + + pm_runtime_put_noidle(dev); + pm_runtime_disable(dev); } static const struct acpi_device_id pxa2xx_spi_acpi_match[] = { diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c index e3a95adc527939..bf1f34b0ffc8eb 100644 --- a/drivers/spi/spi-pxa2xx.c +++ b/drivers/spi/spi-pxa2xx.c @@ -1449,24 +1449,16 @@ int pxa2xx_spi_probe(struct device *dev, struct ssp_device *ssp, } } - pm_runtime_set_autosuspend_delay(dev, 50); - pm_runtime_use_autosuspend(dev); - pm_runtime_set_active(dev); - pm_runtime_enable(dev); - /* Register with the SPI framework */ dev_set_drvdata(dev, drv_data); status = spi_register_controller(controller); if (status) { dev_err_probe(dev, status, "problem registering SPI controller\n"); - goto out_error_pm_runtime_enabled; + goto out_error_clock_enabled; } return status; -out_error_pm_runtime_enabled: - pm_runtime_disable(dev); - out_error_clock_enabled: clk_disable_unprepare(ssp->clk); @@ -1483,8 +1475,6 @@ void pxa2xx_spi_remove(struct device *dev) struct driver_data *drv_data = dev_get_drvdata(dev); struct ssp_device *ssp = drv_data->ssp; - pm_runtime_get_sync(dev); - spi_unregister_controller(drv_data->controller); /* Disable the SSP at the peripheral and SOC level */ @@ -1495,9 +1485,6 @@ void pxa2xx_spi_remove(struct device *dev) if (drv_data->controller_info->enable_dma) pxa2xx_spi_dma_release(drv_data); - pm_runtime_put_noidle(dev); - pm_runtime_disable(dev); - /* Release IRQ */ free_irq(ssp->irq, drv_data); } From bff980d8d9ca537fd5f3c0e9a99876c1e3713e81 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 22 Aug 2024 12:57:25 +0100 Subject: [PATCH 625/991] ASoC: cs-amp-lib-test: Force test calibration blob entries to be valid For a normal calibration blob the calTarget values must be non-zero and unique, and the calTime values must be non-zero. Don't rely on get_random_bytes() to be random enough to guarantee this. Force the calTarget and calTime values to be valid while retaining randomness in the values. Signed-off-by: Richard Fitzgerald Fixes: 177862317a98 ("ASoC: cs-amp-lib: Add KUnit test for calibration helpers") Link: https://patch.msgid.link/20240822115725.259568-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs-amp-lib-test.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sound/soc/codecs/cs-amp-lib-test.c b/sound/soc/codecs/cs-amp-lib-test.c index 15f991b2e16e27..8169ec88a8ba8b 100644 --- a/sound/soc/codecs/cs-amp-lib-test.c +++ b/sound/soc/codecs/cs-amp-lib-test.c @@ -38,6 +38,7 @@ static void cs_amp_lib_test_init_dummy_cal_blob(struct kunit *test, int num_amps { struct cs_amp_lib_test_priv *priv = test->priv; unsigned int blob_size; + int i; blob_size = offsetof(struct cirrus_amp_efi_data, data) + sizeof(struct cirrus_amp_cal_data) * num_amps; @@ -49,6 +50,14 @@ static void cs_amp_lib_test_init_dummy_cal_blob(struct kunit *test, int num_amps priv->cal_blob->count = num_amps; get_random_bytes(priv->cal_blob->data, sizeof(struct cirrus_amp_cal_data) * num_amps); + + /* Ensure all timestamps are non-zero to mark the entry valid. */ + for (i = 0; i < num_amps; i++) + priv->cal_blob->data[i].calTime[0] |= 1; + + /* Ensure that all UIDs are non-zero and unique. */ + for (i = 0; i < num_amps; i++) + *(u8 *)&priv->cal_blob->data[i].calTarget[0] = i + 1; } static u64 cs_amp_lib_test_get_target_uid(struct kunit *test) From 1ac66c4960e1c735eb6edfd3e6d52bebb2aa347e Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 21 Aug 2024 09:46:44 +0100 Subject: [PATCH 626/991] MAINTAINERS: Add sonet.h to ATM section of MAINTAINERS This is part of an effort to assign a section in MAINTAINERS to header files that relate to Networking. In this case the files with "net" in their name. It seems that sonet.h is included in ATM related source files, and thus that ATM is the most relevant section for these files. Cc: Chas Williams <3chas3@gmail.com> Signed-off-by: Simon Horman Signed-off-by: Paolo Abeni --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index a964a34651f563..c682203915a2a9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3504,7 +3504,9 @@ S: Maintained W: http://linux-atm.sourceforge.net F: drivers/atm/ F: include/linux/atm* +F: include/linux/sonet.h F: include/uapi/linux/atm* +F: include/uapi/linux/sonet.h ATMEL MACB ETHERNET DRIVER M: Nicolas Ferre From eb208fecd77d898709c25af680487289fd5f3e16 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 21 Aug 2024 09:46:45 +0100 Subject: [PATCH 627/991] MAINTAINERS: Add net_tstamp.h to SOCKET TIMESTAMPING section This is part of an effort to assign a section in MAINTAINERS to header files that relate to Networking. In this case the files with "net" in their name. Cc: Richard Cochran Cc: Willem de Bruijn Signed-off-by: Simon Horman Acked-by: Willem de Bruijn Signed-off-by: Paolo Abeni --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index c682203915a2a9..e5b9a4d9bc21dd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21057,6 +21057,7 @@ SOCKET TIMESTAMPING M: Willem de Bruijn S: Maintained F: Documentation/networking/timestamping.rst +F: include/linux/net_tstamp.h F: include/uapi/linux/net_tstamp.h F: tools/testing/selftests/net/so_txtime.c From 8cb0a938d90b25f123fcb2e24bbda9eaabd79c9e Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 21 Aug 2024 09:46:46 +0100 Subject: [PATCH 628/991] MAINTAINERS: Add limited globs for Networking headers This aims to add limited globs to improve the coverage of header files in the NETWORKING DRIVERS and NETWORKING [GENERAL] sections. It is done so in a minimal way to exclude overlap with other sections. And so as not to require "X" entries to exclude files otherwise matched by these new globs. While imperfect, due to it's limited nature, this does extend coverage of header files by these sections. And aims to automatically cover new files that seem very likely belong to these sections. The include/linux/netdev* glob (both sections) + Subsumes the entries for: - include/linux/netdevice.h + Extends the sections to cover - include/linux/netdevice_xmit.h - include/linux/netdev_features.h The include/uapi/linux/netdev* globs: (both sections) + Subsumes the entries for: - include/linux/netdevice.h + Extends the sections to cover - include/linux/netdev.h The include/linux/skbuff* glob (NETWORKING [GENERAL] section only): + Subsumes the entry for: - include/linux/skbuff.h + Extends the section to cover - include/linux/skbuff_ref.h A include/uapi/linux/net_* glob was not added to the NETWORKING [GENERAL] section. Although it would subsume the entry for include/uapi/linux/net_namespace.h, which is fine, it would also extend coverage to: - include/uapi/linux/net_dropmon.h, which belongs to the NETWORK DROP MONITOR section - include/uapi/linux/net_tstamp.h which, as per an earlier patch in this series, belongs to the SOCKET TIMESTAMPING section Signed-off-by: Simon Horman Signed-off-by: Paolo Abeni --- MAINTAINERS | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index e5b9a4d9bc21dd..03d571b131eb0f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15884,10 +15884,10 @@ F: include/linux/fddidevice.h F: include/linux/hippidevice.h F: include/linux/if_* F: include/linux/inetdevice.h -F: include/linux/netdevice.h +F: include/linux/netdev* F: include/uapi/linux/cn_proc.h F: include/uapi/linux/if_* -F: include/uapi/linux/netdevice.h +F: include/uapi/linux/netdev* F: tools/testing/selftests/drivers/net/ X: drivers/net/wireless/ @@ -15940,13 +15940,13 @@ F: include/linux/framer/framer.h F: include/linux/in.h F: include/linux/indirect_call_wrapper.h F: include/linux/net.h -F: include/linux/netdevice.h -F: include/linux/skbuff.h +F: include/linux/netdev* +F: include/linux/skbuff* F: include/net/ F: include/uapi/linux/in.h F: include/uapi/linux/net.h F: include/uapi/linux/net_namespace.h -F: include/uapi/linux/netdevice.h +F: include/uapi/linux/netdev* F: lib/net_utils.c F: lib/random32.c F: net/ From f2d20c9b97f0df64841b89fa1ad3e9c92f7377ae Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 21 Aug 2024 09:46:47 +0100 Subject: [PATCH 629/991] MAINTAINERS: Add header files to NETWORKING sections This is part of an effort to assign a section in MAINTAINERS to header files that relate to Networking. In this case the files with "net" or "skbuff" in their name. This patch adds a number of such files to the NETWORKING DRIVERS and NETWORKING [GENERAL] sections. Signed-off-by: Simon Horman Signed-off-by: Paolo Abeni --- MAINTAINERS | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 03d571b131eb0f..798f1ffcbbaaf4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15879,13 +15879,16 @@ F: drivers/net/ F: include/dt-bindings/net/ F: include/linux/cn_proc.h F: include/linux/etherdevice.h +F: include/linux/ethtool_netlink.h F: include/linux/fcdevice.h F: include/linux/fddidevice.h F: include/linux/hippidevice.h F: include/linux/if_* F: include/linux/inetdevice.h F: include/linux/netdev* +F: include/linux/platform_data/wiznet.h F: include/uapi/linux/cn_proc.h +F: include/uapi/linux/ethtool_netlink.h F: include/uapi/linux/if_* F: include/uapi/linux/netdev* F: tools/testing/selftests/drivers/net/ @@ -15939,14 +15942,28 @@ F: include/linux/framer/framer-provider.h F: include/linux/framer/framer.h F: include/linux/in.h F: include/linux/indirect_call_wrapper.h +F: include/linux/inet.h +F: include/linux/inet_diag.h F: include/linux/net.h F: include/linux/netdev* +F: include/linux/netlink.h +F: include/linux/netpoll.h +F: include/linux/rtnetlink.h +F: include/linux/seq_file_net.h F: include/linux/skbuff* F: include/net/ +F: include/uapi/linux/genetlink.h +F: include/uapi/linux/hsr_netlink.h F: include/uapi/linux/in.h +F: include/uapi/linux/inet_diag.h +F: include/uapi/linux/nbd-netlink.h F: include/uapi/linux/net.h F: include/uapi/linux/net_namespace.h +F: include/uapi/linux/netconf.h F: include/uapi/linux/netdev* +F: include/uapi/linux/netlink.h +F: include/uapi/linux/netlink_diag.h +F: include/uapi/linux/rtnetlink.h F: lib/net_utils.c F: lib/random32.c F: net/ From 46097a92662496394628cb41138e681d6074cce7 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 21 Aug 2024 09:46:48 +0100 Subject: [PATCH 630/991] MAINTAINERS: Mark JME Network Driver as Odd Fixes This driver only appears to have received sporadic clean-ups, typically part of some tree-wide activity, and fixes for quite some time. And according to the maintainer, Guo-Fu Tseng, the device has been EOLed for a long time (see Link). Accordingly, it seems appropriate to mark this driver as odd fixes. Cc: Moon Yeounsu Cc: Guo-Fu Tseng Link: https://lore.kernel.org/netdev/20240805003139.M94125@cooldavid.org/ Signed-off-by: Simon Horman Signed-off-by: Paolo Abeni --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 798f1ffcbbaaf4..0c94ec0ca47823 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11995,7 +11995,7 @@ F: fs/jfs/ JME NETWORK DRIVER M: Guo-Fu Tseng L: netdev@vger.kernel.org -S: Maintained +S: Odd Fixes F: drivers/net/ethernet/jme.* JOURNALLING FLASH FILE SYSTEM V2 (JFFS2) From 3e878fe5a0b139838a65f50a3df3caf3299dbc24 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 Aug 2024 03:57:39 -0400 Subject: [PATCH 631/991] bcachefs: add missing inode_walker_exit() fix a small leak Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 6801c37ee803dc..83bd31b44aad0b 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2215,6 +2215,8 @@ int bch2_check_xattrs(struct bch_fs *c) NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_xattr(trans, &iter, k, &hash_info, &inode))); + + inode_walker_exit(&inode); bch_err_fn(c, ret); return ret; } From a592cdf5164d3feb821085df71f63e70e8b8b08c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 19 Aug 2024 16:41:00 -0400 Subject: [PATCH 632/991] bcachefs: don't use rht_bucket() in btree_key_cache_scan() rht_bucket() does strange complicated things when a rehash is in progress. Instead, just skip scanning when a rehash is in progress: scanning is going to be more expensive (many more empty slots to cover), and some sort of infinite loop is being observed Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 9b3ec2a3b8cecf..fda7998734cbc1 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -778,6 +778,20 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, rcu_read_lock(); tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); + + /* + * Scanning is expensive while a rehash is in progress - most elements + * will be on the new hashtable, if it's in progress + * + * A rehash could still start while we're scanning - that's ok, we'll + * still see most elements. + */ + if (unlikely(tbl->nest)) { + rcu_read_unlock(); + srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); + return SHRINK_STOP; + } + if (bc->shrink_iter >= tbl->size) bc->shrink_iter = 0; start = bc->shrink_iter; @@ -785,7 +799,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, do { struct rhash_head *pos, *next; - pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter)); + pos = rht_ptr_rcu(&tbl->buckets[bc->shrink_iter]); while (!rht_is_a_nulls(pos)) { next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter); @@ -866,12 +880,22 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) while (atomic_long_read(&bc->nr_keys)) { rcu_read_lock(); tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); - if (tbl) + if (tbl) { + if (tbl->nest) { + /* wait for in progress rehash */ + rcu_read_unlock(); + mutex_lock(&bc->table.mutex); + mutex_unlock(&bc->table.mutex); + rcu_read_lock(); + continue; + } for (i = 0; i < tbl->size; i++) - rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { + while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) { + ck = container_of(pos, struct bkey_cached, hash); bkey_cached_evict(bc, ck); list_add(&ck->list, &items); } + } rcu_read_unlock(); } From ce61b605a00502c59311d0a4b1f58d62b48272d0 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Tue, 20 Aug 2024 22:07:38 +0900 Subject: [PATCH 633/991] ksmbd: the buffer of smb2 query dir response has at least 1 byte When STATUS_NO_MORE_FILES status is set to smb2 query dir response, ->StructureSize is set to 9, which mean buffer has 1 byte. This issue occurs because ->Buffer[1] in smb2_query_directory_rsp to flex-array. Fixes: eb3e28c1e89b ("smb3: Replace smb2pdu 1-element arrays with flex-arrays") Cc: stable@vger.kernel.org # v6.1+ Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 0bc9edf22ba407..e9204180919e35 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -4409,7 +4409,8 @@ int smb2_query_dir(struct ksmbd_work *work) rsp->OutputBufferLength = cpu_to_le32(0); rsp->Buffer[0] = 0; rc = ksmbd_iov_pin_rsp(work, (void *)rsp, - sizeof(struct smb2_query_directory_rsp)); + offsetof(struct smb2_query_directory_rsp, Buffer) + + 1); if (rc) goto err_out; } else { From 2186a116538a715b20e15f84fdd3545e5fe0a39b Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Thu, 22 Aug 2024 08:20:50 +0000 Subject: [PATCH 634/991] smb/server: fix return value of smb2_open() In most error cases, error code is not returned in smb2_open(), __process_request() will not print error message. Fix this by returning the correct value at the end of smb2_open(). Signed-off-by: ChenXiaoSong Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index e9204180919e35..55d4e69bd9c99c 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -3713,7 +3713,7 @@ int smb2_open(struct ksmbd_work *work) kfree(name); kfree(lc); - return 0; + return rc; } static int readdir_info_level_struct_sz(int info_level) From 4e8771a3666c8f216eefd6bd2fd50121c6c437db Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Thu, 22 Aug 2024 08:20:51 +0000 Subject: [PATCH 635/991] smb/server: fix potential null-ptr-deref of lease_ctx_info in smb2_open() null-ptr-deref will occur when (req_op_level == SMB2_OPLOCK_LEVEL_LEASE) and parse_lease_state() return NULL. Fix this by check if 'lease_ctx_info' is NULL. Additionally, remove the redundant parentheses in parse_durable_handle_context(). Signed-off-by: ChenXiaoSong Signed-off-by: Steve French --- fs/smb/server/oplock.c | 2 +- fs/smb/server/smb2pdu.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index a8f52c4ebbdadd..e546ffa57b55ab 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -1510,7 +1510,7 @@ void create_lease_buf(u8 *rbuf, struct lease *lease) * parse_lease_state() - parse lease context containted in file open request * @open_req: buffer containing smb2 file open(create) request * - * Return: oplock state, -ENOENT if create lease context not found + * Return: allocated lease context object on success, otherwise NULL */ struct lease_ctx_info *parse_lease_state(void *open_req) { diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 55d4e69bd9c99c..5d170ab0817d5a 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -2770,8 +2770,8 @@ static int parse_durable_handle_context(struct ksmbd_work *work, } } - if (((lc && (lc->req_state & SMB2_LEASE_HANDLE_CACHING_LE)) || - req_op_level == SMB2_OPLOCK_LEVEL_BATCH)) { + if ((lc && (lc->req_state & SMB2_LEASE_HANDLE_CACHING_LE)) || + req_op_level == SMB2_OPLOCK_LEVEL_BATCH) { dh_info->CreateGuid = durable_v2_blob->CreateGuid; dh_info->persistent = @@ -2791,8 +2791,8 @@ static int parse_durable_handle_context(struct ksmbd_work *work, goto out; } - if (((lc && (lc->req_state & SMB2_LEASE_HANDLE_CACHING_LE)) || - req_op_level == SMB2_OPLOCK_LEVEL_BATCH)) { + if ((lc && (lc->req_state & SMB2_LEASE_HANDLE_CACHING_LE)) || + req_op_level == SMB2_OPLOCK_LEVEL_BATCH) { ksmbd_debug(SMB, "Request for durable open\n"); dh_info->type = dh_idx; } @@ -3414,7 +3414,7 @@ int smb2_open(struct ksmbd_work *work) goto err_out1; } } else { - if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE) { + if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE && lc) { if (S_ISDIR(file_inode(filp)->i_mode)) { lc->req_state &= ~SMB2_LEASE_WRITE_CACHING_LE; lc->is_dir = true; From 0dd771b7d60b8281f10f6721783c60716d22075f Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Thu, 22 Aug 2024 08:20:52 +0000 Subject: [PATCH 636/991] smb/server: remove useless assignment of 'file_present' in smb2_open() The variable is already true here. Signed-off-by: ChenXiaoSong Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 5d170ab0817d5a..cd23517d964069 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -3096,7 +3096,6 @@ int smb2_open(struct ksmbd_work *work) goto err_out; } - file_present = true; idmap = mnt_idmap(path.mnt); } else { if (rc != -ENOENT) From 2b7e0573a49064d9c94c114b4471327cd96ae39c Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Thu, 22 Aug 2024 08:20:54 +0000 Subject: [PATCH 637/991] smb/server: update misguided comment of smb2_allocate_rsp_buf() smb2_allocate_rsp_buf() will return other error code except -ENOMEM. Signed-off-by: ChenXiaoSong Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index cd23517d964069..20846a4d3031fb 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -519,7 +519,7 @@ int init_smb2_rsp_hdr(struct ksmbd_work *work) * smb2_allocate_rsp_buf() - allocate smb2 response buffer * @work: smb work containing smb request buffer * - * Return: 0 on success, otherwise -ENOMEM + * Return: 0 on success, otherwise error */ int smb2_allocate_rsp_buf(struct ksmbd_work *work) { From bb4485562f5907708f1c218b5d70dce04165d1e1 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 22 Aug 2024 14:35:44 +0100 Subject: [PATCH 638/991] ASoC: cs-amp-lib: Ignore empty UEFI calibration entries If the timestamp of a calibration entry is 0 it is an unused entry and must be ignored. Some end-products reserve EFI space for calibration entries by shipping with a zero-filled EFI file. When searching the file for calibration data the driver must skip the empty entries. The timestamp of a valid entry is always non-zero. Signed-off-by: Richard Fitzgerald Fixes: 1cad8725f2b9 ("ASoC: cs-amp-lib: Add helpers for factory calibration data") Link: https://patch.msgid.link/20240822133544.304421-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs-amp-lib.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sound/soc/codecs/cs-amp-lib.c b/sound/soc/codecs/cs-amp-lib.c index 605964af8afad4..51b128c8067188 100644 --- a/sound/soc/codecs/cs-amp-lib.c +++ b/sound/soc/codecs/cs-amp-lib.c @@ -182,6 +182,10 @@ static int _cs_amp_get_efi_calibration_data(struct device *dev, u64 target_uid, for (i = 0; i < efi_data->count; ++i) { u64 cal_target = cs_amp_cal_target_u64(&efi_data->data[i]); + /* Skip empty entries */ + if (!efi_data->data[i].calTime[0] && !efi_data->data[i].calTime[1]) + continue; + /* Skip entries with unpopulated silicon ID */ if (cal_target == 0) continue; @@ -193,7 +197,8 @@ static int _cs_amp_get_efi_calibration_data(struct device *dev, u64 target_uid, } } - if (!cal && (amp_index >= 0) && (amp_index < efi_data->count)) { + if (!cal && (amp_index >= 0) && (amp_index < efi_data->count) && + (efi_data->data[amp_index].calTime[0] || efi_data->data[amp_index].calTime[1])) { u64 cal_target = cs_amp_cal_target_u64(&efi_data->data[amp_index]); /* From d7fd2941ae9a67423d1c7bee985f240e4686634f Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 21 Aug 2024 18:55:06 +0200 Subject: [PATCH 639/991] s390/boot: Avoid possible physmem_info segment corruption When physical memory for the kernel image is allocated it does not consider extra memory required for offsetting the image start to match it with the lower 20 bits of KASLR virtual base address. That might lead to kernel access beyond its memory range. Suggested-by: Vasily Gorbik Fixes: 693d41f7c938 ("s390/mm: Restore mapping of kernel image using large pages") Signed-off-by: Alexander Gordeev Acked-by: Vasily Gorbik Signed-off-by: Vasily Gorbik --- arch/s390/boot/startup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index cff34744b5a9bf..d69f1dfd3b1e73 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -457,9 +457,9 @@ void startup_kernel(void) */ kaslr_large_page_offset = __kaslr_offset & ~_SEGMENT_MASK; if (kaslr_enabled()) { - unsigned long end = ident_map_size - kaslr_large_page_offset; + unsigned long size = kernel_size + kaslr_large_page_offset; - __kaslr_offset_phys = randomize_within_range(kernel_size, _SEGMENT_SIZE, 0, end); + __kaslr_offset_phys = randomize_within_range(size, _SEGMENT_SIZE, 0, ident_map_size); } if (!__kaslr_offset_phys) __kaslr_offset_phys = nokaslr_offset_phys; From 1642285e511c2a40b14e87a41aa8feace6123036 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 21 Aug 2024 18:55:07 +0200 Subject: [PATCH 640/991] s390/boot: Fix KASLR base offset off by __START_KERNEL bytes Symbol offsets to the KASLR base do not match symbol address in the vmlinux image. That is the result of setting the KASLR base to the beginning of .text section as result of an optimization. Revert that optimization and allocate virtual memory for the whole kernel image including __START_KERNEL bytes as per the linker script. That allows keeping the semantics of the KASLR base offset in sync with other architectures. Rename __START_KERNEL to TEXT_OFFSET, since it represents the offset of the .text section within the kernel image, rather than a virtual address. Still skip mapping TEXT_OFFSET bytes to save memory on pgtables and provoke exceptions in case an attempt to access this area is made, as no kernel symbol may reside there. In case CONFIG_KASAN is enabled the location counter might exceed the value of TEXT_OFFSET, while the decompressor linker script forcefully resets it to TEXT_OFFSET, which leads to a sections overlap link failure. Use MAX() expression to avoid that. Reported-by: Omar Sandoval Closes: https://lore.kernel.org/linux-s390/ZnS8dycxhtXBZVky@telecaster.dhcp.thefacebook.com/ Fixes: 56b1069c40c7 ("s390/boot: Rework deployment of the kernel image") Signed-off-by: Alexander Gordeev Acked-by: Vasily Gorbik Signed-off-by: Vasily Gorbik --- arch/s390/boot/startup.c | 55 ++++++++++++++++++---------------- arch/s390/boot/vmem.c | 14 +++++++-- arch/s390/boot/vmlinux.lds.S | 7 ++++- arch/s390/include/asm/page.h | 3 +- arch/s390/kernel/vmlinux.lds.S | 2 +- arch/s390/tools/relocs.c | 2 +- 6 files changed, 52 insertions(+), 31 deletions(-) diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index d69f1dfd3b1e73..c73b5118ad429a 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -162,7 +162,7 @@ static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, loc = (long)*reloc + phys_offset; if (loc < min_addr || loc > max_addr) error("64-bit relocation outside of kernel!\n"); - *(u64 *)loc += offset - __START_KERNEL; + *(u64 *)loc += offset; } } @@ -177,7 +177,7 @@ static void kaslr_adjust_got(unsigned long offset) */ for (entry = (u64 *)vmlinux.got_start; entry < (u64 *)vmlinux.got_end; entry++) { if (*entry) - *entry += offset - __START_KERNEL; + *entry += offset; } } @@ -252,7 +252,7 @@ static unsigned long setup_kernel_memory_layout(unsigned long kernel_size) vmemmap_size = SECTION_ALIGN_UP(pages) * sizeof(struct page); /* choose kernel address space layout: 4 or 3 levels. */ - BUILD_BUG_ON(!IS_ALIGNED(__START_KERNEL, THREAD_SIZE)); + BUILD_BUG_ON(!IS_ALIGNED(TEXT_OFFSET, THREAD_SIZE)); BUILD_BUG_ON(!IS_ALIGNED(__NO_KASLR_START_KERNEL, THREAD_SIZE)); BUILD_BUG_ON(__NO_KASLR_END_KERNEL > _REGION1_SIZE); vsize = get_vmem_size(ident_map_size, vmemmap_size, vmalloc_size, _REGION3_SIZE); @@ -389,31 +389,25 @@ static void kaslr_adjust_vmlinux_info(long offset) #endif } -static void fixup_vmlinux_info(void) -{ - vmlinux.entry -= __START_KERNEL; - kaslr_adjust_vmlinux_info(-__START_KERNEL); -} - void startup_kernel(void) { - unsigned long kernel_size = vmlinux.image_size + vmlinux.bss_size; - unsigned long nokaslr_offset_phys, kaslr_large_page_offset; - unsigned long amode31_lma = 0; + unsigned long vmlinux_size = vmlinux.image_size + vmlinux.bss_size; + unsigned long nokaslr_text_lma, text_lma = 0, amode31_lma = 0; + unsigned long kernel_size = TEXT_OFFSET + vmlinux_size; + unsigned long kaslr_large_page_offset; unsigned long max_physmem_end; unsigned long asce_limit; unsigned long safe_addr; psw_t psw; - fixup_vmlinux_info(); setup_lpp(); /* * Non-randomized kernel physical start address must be _SEGMENT_SIZE * aligned (see blow). */ - nokaslr_offset_phys = ALIGN(mem_safe_offset(), _SEGMENT_SIZE); - safe_addr = PAGE_ALIGN(nokaslr_offset_phys + kernel_size); + nokaslr_text_lma = ALIGN(mem_safe_offset(), _SEGMENT_SIZE); + safe_addr = PAGE_ALIGN(nokaslr_text_lma + vmlinux_size); /* * Reserve decompressor memory together with decompression heap, @@ -457,16 +451,27 @@ void startup_kernel(void) */ kaslr_large_page_offset = __kaslr_offset & ~_SEGMENT_MASK; if (kaslr_enabled()) { - unsigned long size = kernel_size + kaslr_large_page_offset; + unsigned long size = vmlinux_size + kaslr_large_page_offset; - __kaslr_offset_phys = randomize_within_range(size, _SEGMENT_SIZE, 0, ident_map_size); + text_lma = randomize_within_range(size, _SEGMENT_SIZE, TEXT_OFFSET, ident_map_size); } - if (!__kaslr_offset_phys) - __kaslr_offset_phys = nokaslr_offset_phys; - __kaslr_offset_phys |= kaslr_large_page_offset; + if (!text_lma) + text_lma = nokaslr_text_lma; + text_lma |= kaslr_large_page_offset; + + /* + * [__kaslr_offset_phys..__kaslr_offset_phys + TEXT_OFFSET] region is + * never accessed via the kernel image mapping as per the linker script: + * + * . = TEXT_OFFSET; + * + * Therefore, this region could be used for something else and does + * not need to be reserved. See how it is skipped in setup_vmem(). + */ + __kaslr_offset_phys = text_lma - TEXT_OFFSET; kaslr_adjust_vmlinux_info(__kaslr_offset_phys); - physmem_reserve(RR_VMLINUX, __kaslr_offset_phys, kernel_size); - deploy_kernel((void *)__kaslr_offset_phys); + physmem_reserve(RR_VMLINUX, text_lma, vmlinux_size); + deploy_kernel((void *)text_lma); /* vmlinux decompression is done, shrink reserved low memory */ physmem_reserve(RR_DECOMPRESSOR, 0, (unsigned long)_decompressor_end); @@ -489,7 +494,7 @@ void startup_kernel(void) amode31_lma = randomize_within_range(vmlinux.amode31_size, PAGE_SIZE, amode31_min, SZ_2G); } if (!amode31_lma) - amode31_lma = __kaslr_offset_phys - vmlinux.amode31_size; + amode31_lma = text_lma - vmlinux.amode31_size; physmem_reserve(RR_AMODE31, amode31_lma, vmlinux.amode31_size); /* @@ -505,8 +510,8 @@ void startup_kernel(void) * - copy_bootdata() must follow setup_vmem() to propagate changes * to bootdata made by setup_vmem() */ - clear_bss_section(__kaslr_offset_phys); - kaslr_adjust_relocs(__kaslr_offset_phys, __kaslr_offset_phys + vmlinux.image_size, + clear_bss_section(text_lma); + kaslr_adjust_relocs(text_lma, text_lma + vmlinux.image_size, __kaslr_offset, __kaslr_offset_phys); kaslr_adjust_got(__kaslr_offset); setup_vmem(__kaslr_offset, __kaslr_offset + kernel_size, asce_limit); diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c index 2847cc059ab7ac..145035f84a0e3e 100644 --- a/arch/s390/boot/vmem.c +++ b/arch/s390/boot/vmem.c @@ -90,7 +90,7 @@ static void kasan_populate_shadow(unsigned long kernel_start, unsigned long kern } memgap_start = end; } - kasan_populate(kernel_start, kernel_end, POPULATE_KASAN_MAP_SHADOW); + kasan_populate(kernel_start + TEXT_OFFSET, kernel_end, POPULATE_KASAN_MAP_SHADOW); kasan_populate(0, (unsigned long)__identity_va(0), POPULATE_KASAN_ZERO_SHADOW); kasan_populate(AMODE31_START, AMODE31_END, POPULATE_KASAN_ZERO_SHADOW); if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) { @@ -475,7 +475,17 @@ void setup_vmem(unsigned long kernel_start, unsigned long kernel_end, unsigned l (unsigned long)__identity_va(end), POPULATE_IDENTITY); } - pgtable_populate(kernel_start, kernel_end, POPULATE_KERNEL); + + /* + * [kernel_start..kernel_start + TEXT_OFFSET] region is never + * accessed as per the linker script: + * + * . = TEXT_OFFSET; + * + * Therefore, skip mapping TEXT_OFFSET bytes to prevent access to + * [__kaslr_offset_phys..__kaslr_offset_phys + TEXT_OFFSET] region. + */ + pgtable_populate(kernel_start + TEXT_OFFSET, kernel_end, POPULATE_KERNEL); pgtable_populate(AMODE31_START, AMODE31_END, POPULATE_DIRECT); pgtable_populate(__abs_lowcore, __abs_lowcore + sizeof(struct lowcore), POPULATE_ABS_LOWCORE); diff --git a/arch/s390/boot/vmlinux.lds.S b/arch/s390/boot/vmlinux.lds.S index a750711d44c863..66670212a36118 100644 --- a/arch/s390/boot/vmlinux.lds.S +++ b/arch/s390/boot/vmlinux.lds.S @@ -109,7 +109,12 @@ SECTIONS #ifdef CONFIG_KERNEL_UNCOMPRESSED . = ALIGN(PAGE_SIZE); . += AMODE31_SIZE; /* .amode31 section */ - . = ALIGN(1 << 20); /* _SEGMENT_SIZE */ + + /* + * Make sure the location counter is not less than TEXT_OFFSET. + * _SEGMENT_SIZE is not available, use ALIGN(1 << 20) instead. + */ + . = MAX(TEXT_OFFSET, ALIGN(1 << 20)); #else . = ALIGN(8); #endif diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 06416b3f94f595..16e4caa931f1f3 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -279,8 +279,9 @@ static inline unsigned long virt_to_pfn(const void *kaddr) #define AMODE31_SIZE (3 * PAGE_SIZE) #define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) -#define __START_KERNEL 0x100000 #define __NO_KASLR_START_KERNEL CONFIG_KERNEL_IMAGE_BASE #define __NO_KASLR_END_KERNEL (__NO_KASLR_START_KERNEL + KERNEL_IMAGE_SIZE) +#define TEXT_OFFSET 0x100000 + #endif /* _S390_PAGE_H */ diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index e67cd409b85871..ae5d0a9d6911bc 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -39,7 +39,7 @@ PHDRS { SECTIONS { - . = __START_KERNEL; + . = TEXT_OFFSET; .text : { _stext = .; /* Start of text section */ _text = .; /* Text and read-only data */ diff --git a/arch/s390/tools/relocs.c b/arch/s390/tools/relocs.c index a74dbd5c9896a7..30a732c808f35e 100644 --- a/arch/s390/tools/relocs.c +++ b/arch/s390/tools/relocs.c @@ -280,7 +280,7 @@ static int do_reloc(struct section *sec, Elf_Rel *rel) case R_390_GOTOFF64: break; case R_390_64: - add_reloc(&relocs64, offset - ehdr.e_entry); + add_reloc(&relocs64, offset); break; default: die("Unsupported relocation type: %d\n", r_type); From 15179cf2806f91685410e598f82813a7fcf90f6c Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 16 Aug 2024 16:47:39 -0500 Subject: [PATCH 641/991] smb3: fix problem unloading module due to leaked refcount on shutdown The shutdown ioctl can leak a refcount on the tlink which can prevent rmmod (unloading the cifs.ko) module from working. Found while debugging xfstest generic/043 Fixes: 69ca1f57555f ("smb3: add dynamic tracepoints for shutdown ioctl") Reviewed-by: Meetakshi Setiya Reviewed-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/connect.c | 3 +++ fs/smb/client/ioctl.c | 2 ++ fs/smb/client/link.c | 1 + 3 files changed, 6 insertions(+) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index d2307162a2de15..c1c14274930ac6 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -4194,6 +4194,9 @@ tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink) * * If one doesn't exist then insert a new tcon_link struct into the tree and * try to construct a new one. + * + * REMEMBER to call cifs_put_tlink() after successful calls to cifs_sb_tlink, + * to avoid refcount issues */ struct tcon_link * cifs_sb_tlink(struct cifs_sb_info *cifs_sb) diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c index 44dbaf9929a4e3..9bb5c869f4db7e 100644 --- a/fs/smb/client/ioctl.c +++ b/fs/smb/client/ioctl.c @@ -229,9 +229,11 @@ static int cifs_shutdown(struct super_block *sb, unsigned long arg) shutdown_good: trace_smb3_shutdown_done(flags, tcon->tid); + cifs_put_tlink(tlink); return 0; shutdown_out_err: trace_smb3_shutdown_err(rc, flags, tcon->tid); + cifs_put_tlink(tlink); return rc; } diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c index d86da949a91905..80099bbb333b08 100644 --- a/fs/smb/client/link.c +++ b/fs/smb/client/link.c @@ -588,6 +588,7 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) { rc = PTR_ERR(tlink); + /* BB could be clearer if skipped put_tlink on error here, but harmless */ goto symlink_exit; } pTcon = tlink_tcon(tlink); From ec686804117a0421cf31d54427768aaf93aa0069 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Wed, 21 Aug 2024 00:45:03 -0300 Subject: [PATCH 642/991] smb: client: ignore unhandled reparse tags Just ignore reparse points that the client can't parse rather than bailing out and not opening the file or directory. Reported-by: Marc <1marc1@gmail.com> Closes: https://lore.kernel.org/r/CAMHwNVv-B+Q6wa0FEXrAuzdchzcJRsPKDDRrNaYZJd6X-+iJzw@mail.gmail.com Fixes: 539aad7f14da ("smb: client: introduce ->parse_reparse_point()") Tested-by: Anthony Nandaa (Microsoft) Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/reparse.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index 689d8a506d4593..48c27581ec511c 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -378,6 +378,8 @@ int parse_reparse_point(struct reparse_data_buffer *buf, u32 plen, struct cifs_sb_info *cifs_sb, bool unicode, struct cifs_open_info_data *data) { + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + data->reparse.buf = buf; /* See MS-FSCC 2.1.2 */ @@ -394,12 +396,13 @@ int parse_reparse_point(struct reparse_data_buffer *buf, case IO_REPARSE_TAG_LX_FIFO: case IO_REPARSE_TAG_LX_CHR: case IO_REPARSE_TAG_LX_BLK: - return 0; + break; default: - cifs_dbg(VFS, "%s: unhandled reparse tag: 0x%08x\n", - __func__, le32_to_cpu(buf->ReparseTag)); - return -EOPNOTSUPP; + cifs_tcon_dbg(VFS | ONCE, "unhandled reparse tag: 0x%08x\n", + le32_to_cpu(buf->ReparseTag)); + break; } + return 0; } int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, From 8fb4ac1cee88a57e7a56faba49b408a41a4af4db Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 18 Aug 2024 16:07:11 +0900 Subject: [PATCH 643/991] kbuild: fix typos "prequisites" to "prerequisites" This typo in scripts/Makefile.build has been present for more than 20 years. It was accidentally copy-pasted to other scripts/Makefile.* files. Fix them all. Signed-off-by: Masahiro Yamada Reviewed-by: Nathan Chancellor --- scripts/Makefile.build | 2 +- scripts/Makefile.modfinal | 2 +- scripts/Makefile.vmlinux | 2 +- scripts/Makefile.vmlinux_o | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index efacca63c89767..a5ac8ed1936fe8 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -487,7 +487,7 @@ $(subdir-ym): need-modorder=$(if $(filter $@/modules.order, $(subdir-modorder)),1) \ $(filter $@/%, $(single-subdir-goals)) -# Add FORCE to the prequisites of a target to force it to be always rebuilt. +# Add FORCE to the prerequisites of a target to force it to be always rebuilt. # --------------------------------------------------------------------------- PHONY += FORCE diff --git a/scripts/Makefile.modfinal b/scripts/Makefile.modfinal index 1fa98b5e952b4f..306a6bb86e4dc8 100644 --- a/scripts/Makefile.modfinal +++ b/scripts/Makefile.modfinal @@ -62,7 +62,7 @@ endif targets += $(modules:%.o=%.ko) $(modules:%.o=%.mod.o) -# Add FORCE to the prequisites of a target to force it to be always rebuilt. +# Add FORCE to the prerequisites of a target to force it to be always rebuilt. # --------------------------------------------------------------------------- PHONY += FORCE diff --git a/scripts/Makefile.vmlinux b/scripts/Makefile.vmlinux index 49946cb968440c..5ceecbed31eb77 100644 --- a/scripts/Makefile.vmlinux +++ b/scripts/Makefile.vmlinux @@ -33,7 +33,7 @@ targets += vmlinux vmlinux: scripts/link-vmlinux.sh vmlinux.o $(KBUILD_LDS) FORCE +$(call if_changed_dep,link_vmlinux) -# Add FORCE to the prequisites of a target to force it to be always rebuilt. +# Add FORCE to the prerequisites of a target to force it to be always rebuilt. # --------------------------------------------------------------------------- PHONY += FORCE diff --git a/scripts/Makefile.vmlinux_o b/scripts/Makefile.vmlinux_o index 6de297916ce680..d64070b6b4bce0 100644 --- a/scripts/Makefile.vmlinux_o +++ b/scripts/Makefile.vmlinux_o @@ -87,7 +87,7 @@ targets += modules.builtin modules.builtin: modules.builtin.modinfo FORCE $(call if_changed,modules_builtin) -# Add FORCE to the prequisites of a target to force it to be always rebuilt. +# Add FORCE to the prerequisites of a target to force it to be always rebuilt. # --------------------------------------------------------------------------- PHONY += FORCE From f58bab6fd4063913bd8321e99874b8239e9ba726 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 22 Aug 2024 14:47:01 -0400 Subject: [PATCH 644/991] nfsd: ensure that nfsd4_fattr_args.context is zeroed out If nfsd4_encode_fattr4 ends up doing a "goto out" before we get to checking for the security label, then args.context will be set to uninitialized junk on the stack, which we'll then try to free. Initialize it early. Fixes: f59388a579c6 ("NFSD: Add nfsd4_encode_fattr4_sec_label()") Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 42b41d55d4edf8..43ccf6119cf128 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3545,6 +3545,9 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, args.dentry = dentry; args.ignore_crossmnt = (ignore_crossmnt != 0); args.acl = NULL; +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL + args.context = NULL; +#endif /* * Make a local copy of the attribute bitmap that can be modified. @@ -3617,7 +3620,6 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, args.contextsupport = false; #ifdef CONFIG_NFSD_V4_SECURITY_LABEL - args.context = NULL; if ((attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) || attrmask[0] & FATTR4_WORD0_SUPPORTED_ATTRS) { if (exp->ex_flags & NFSEXP_SECURITY_LABEL) From afc954fd223ded70b1fa000767e2531db55cce58 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 14 Aug 2024 21:58:21 +0200 Subject: [PATCH 645/991] thermal: of: Fix OF node leak in thermal_of_trips_init() error path Terminating for_each_child_of_node() loop requires dropping OF node reference, so bailing out after thermal_of_populate_trip() error misses this. Solve the OF node reference leak with scoped for_each_child_of_node_scoped(). Fixes: d0c75fa2c17f ("thermal/of: Initialize trip points separately") Cc: All applicable Signed-off-by: Krzysztof Kozlowski Reviewed-by: Chen-Yu Tsai Reviewed-by: Daniel Lezcano Link: https://patch.msgid.link/20240814195823.437597-1-krzysztof.kozlowski@linaro.org Signed-off-by: Rafael J. Wysocki --- drivers/thermal/thermal_of.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/thermal_of.c b/drivers/thermal/thermal_of.c index aa34b6e82e268b..30f8d6e70484c8 100644 --- a/drivers/thermal/thermal_of.c +++ b/drivers/thermal/thermal_of.c @@ -125,7 +125,7 @@ static int thermal_of_populate_trip(struct device_node *np, static struct thermal_trip *thermal_of_trips_init(struct device_node *np, int *ntrips) { struct thermal_trip *tt; - struct device_node *trips, *trip; + struct device_node *trips; int ret, count; trips = of_get_child_by_name(np, "trips"); @@ -150,7 +150,7 @@ static struct thermal_trip *thermal_of_trips_init(struct device_node *np, int *n *ntrips = count; count = 0; - for_each_child_of_node(trips, trip) { + for_each_child_of_node_scoped(trips, trip) { ret = thermal_of_populate_trip(trip, &tt[count++]); if (ret) goto out_kfree; From 662b52b761bfe0ba970e5823759798faf809b896 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 14 Aug 2024 21:58:22 +0200 Subject: [PATCH 646/991] thermal: of: Fix OF node leak in thermal_of_zone_register() thermal_of_zone_register() calls of_thermal_zone_find() which will iterate over OF nodes with for_each_available_child_of_node() to find matching thermal zone node. When it finds such, it exits the loop and returns the node. Prematurely ending for_each_available_child_of_node() loops requires dropping OF node reference, thus success of of_thermal_zone_find() means that caller must drop the reference. Fixes: 3fd6d6e2b4e8 ("thermal/of: Rework the thermal device tree initialization") Cc: All applicable Signed-off-by: Krzysztof Kozlowski Reviewed-by: Chen-Yu Tsai Reviewed-by: Daniel Lezcano Link: https://patch.msgid.link/20240814195823.437597-2-krzysztof.kozlowski@linaro.org Signed-off-by: Rafael J. Wysocki --- drivers/thermal/thermal_of.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/thermal_of.c b/drivers/thermal/thermal_of.c index 30f8d6e70484c8..b08a9b64718d3a 100644 --- a/drivers/thermal/thermal_of.c +++ b/drivers/thermal/thermal_of.c @@ -491,7 +491,8 @@ static struct thermal_zone_device *thermal_of_zone_register(struct device_node * trips = thermal_of_trips_init(np, &ntrips); if (IS_ERR(trips)) { pr_err("Failed to find trip points for %pOFn id=%d\n", sensor, id); - return ERR_CAST(trips); + ret = PTR_ERR(trips); + goto out_of_node_put; } ret = thermal_of_monitor_init(np, &delay, &pdelay); @@ -519,6 +520,7 @@ static struct thermal_zone_device *thermal_of_zone_register(struct device_node * goto out_kfree_trips; } + of_node_put(np); kfree(trips); ret = thermal_zone_device_enable(tz); @@ -533,6 +535,8 @@ static struct thermal_zone_device *thermal_of_zone_register(struct device_node * out_kfree_trips: kfree(trips); +out_of_node_put: + of_node_put(np); return ERR_PTR(ret); } From c0a1ef9c5be72ff28a5413deb1b3e1a066593c13 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 14 Aug 2024 21:58:23 +0200 Subject: [PATCH 647/991] thermal: of: Fix OF node leak in of_thermal_zone_find() error paths Terminating for_each_available_child_of_node() loop requires dropping OF node reference, so bailing out on errors misses this. Solve the OF node reference leak with scoped for_each_available_child_of_node_scoped(). Fixes: 3fd6d6e2b4e8 ("thermal/of: Rework the thermal device tree initialization") Cc: Signed-off-by: Krzysztof Kozlowski Reviewed-by: Chen-Yu Tsai Reviewed-by: Daniel Lezcano Link: https://patch.msgid.link/20240814195823.437597-3-krzysztof.kozlowski@linaro.org Signed-off-by: Rafael J. Wysocki --- drivers/thermal/thermal_of.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/thermal/thermal_of.c b/drivers/thermal/thermal_of.c index b08a9b64718d3a..1f252692815a18 100644 --- a/drivers/thermal/thermal_of.c +++ b/drivers/thermal/thermal_of.c @@ -184,14 +184,14 @@ static struct device_node *of_thermal_zone_find(struct device_node *sensor, int * Search for each thermal zone, a defined sensor * corresponding to the one passed as parameter */ - for_each_available_child_of_node(np, tz) { + for_each_available_child_of_node_scoped(np, child) { int count, i; - count = of_count_phandle_with_args(tz, "thermal-sensors", + count = of_count_phandle_with_args(child, "thermal-sensors", "#thermal-sensor-cells"); if (count <= 0) { - pr_err("%pOFn: missing thermal sensor\n", tz); + pr_err("%pOFn: missing thermal sensor\n", child); tz = ERR_PTR(-EINVAL); goto out; } @@ -200,18 +200,19 @@ static struct device_node *of_thermal_zone_find(struct device_node *sensor, int int ret; - ret = of_parse_phandle_with_args(tz, "thermal-sensors", + ret = of_parse_phandle_with_args(child, "thermal-sensors", "#thermal-sensor-cells", i, &sensor_specs); if (ret < 0) { - pr_err("%pOFn: Failed to read thermal-sensors cells: %d\n", tz, ret); + pr_err("%pOFn: Failed to read thermal-sensors cells: %d\n", child, ret); tz = ERR_PTR(ret); goto out; } if ((sensor == sensor_specs.np) && id == (sensor_specs.args_count ? sensor_specs.args[0] : 0)) { - pr_debug("sensor %pOFn id=%d belongs to %pOFn\n", sensor, id, tz); + pr_debug("sensor %pOFn id=%d belongs to %pOFn\n", sensor, id, child); + tz = no_free_ptr(child); goto out; } } From 591940e22e287fb64ac07be275e343d860cb72d6 Mon Sep 17 00:00:00 2001 From: Steve Wilkins Date: Fri, 9 Aug 2024 14:47:44 +0100 Subject: [PATCH 648/991] firmware: microchip: fix incorrect error report of programming:timeout on success After successfully programming the SPI flash with an MFPS auto update image, the error sysfs attribute reports programming:timeout. This is caused by an incorrect check on the return value from wait_for_completion_timeout() in mpfs_auto_update_poll_complete(). Fixes: ec5b0f1193ad ("firmware: microchip: add PolarFire SoC Auto Update support") Signed-off-by: Steve Wilkins Signed-off-by: Conor Dooley --- drivers/firmware/microchip/mpfs-auto-update.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/microchip/mpfs-auto-update.c b/drivers/firmware/microchip/mpfs-auto-update.c index 30de47895b1ce7..9ca5ee58edbdf8 100644 --- a/drivers/firmware/microchip/mpfs-auto-update.c +++ b/drivers/firmware/microchip/mpfs-auto-update.c @@ -166,7 +166,7 @@ static enum fw_upload_err mpfs_auto_update_poll_complete(struct fw_upload *fw_up */ ret = wait_for_completion_timeout(&priv->programming_complete, msecs_to_jiffies(AUTO_UPDATE_TIMEOUT_MS)); - if (ret) + if (!ret) return FW_UPLOAD_ERR_TIMEOUT; return FW_UPLOAD_ERR_NONE; From 4ae738dfef2c0323752ab81786e2d298c9939321 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Thu, 22 Aug 2024 11:40:55 -0400 Subject: [PATCH 649/991] net: xilinx: axienet: Always disable promiscuous mode If promiscuous mode is disabled when there are fewer than four multicast addresses, then it will not be reflected in the hardware. Fix this by always clearing the promiscuous mode flag even when we program multicast addresses. Fixes: 8a3b7a252dca ("drivers/net/ethernet/xilinx: added Xilinx AXI Ethernet driver") Signed-off-by: Sean Anderson Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240822154059.1066595-2-sean.anderson@linux.dev Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 02fdf66e07faa3..163d052480071d 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -450,6 +450,10 @@ static void axienet_set_multicast_list(struct net_device *ndev) } else if (!netdev_mc_empty(ndev)) { struct netdev_hw_addr *ha; + reg = axienet_ior(lp, XAE_FMI_OFFSET); + reg &= ~XAE_FMI_PM_MASK; + axienet_iow(lp, XAE_FMI_OFFSET, reg); + i = 0; netdev_for_each_mc_addr(ha, ndev) { if (i >= XAE_MULTICAST_CAM_TABLE_NUM) From 797a68c9de0f5a5447baf4bd3bb9c10a3993435b Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Thu, 22 Aug 2024 11:40:56 -0400 Subject: [PATCH 650/991] net: xilinx: axienet: Fix dangling multicast addresses If a multicast address is removed but there are still some multicast addresses, that address would remain programmed into the frame filter. Fix this by explicitly setting the enable bit for each filter. Fixes: 8a3b7a252dca ("drivers/net/ethernet/xilinx: added Xilinx AXI Ethernet driver") Signed-off-by: Sean Anderson Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240822154059.1066595-3-sean.anderson@linux.dev Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/xilinx/xilinx_axienet.h | 1 + .../net/ethernet/xilinx/xilinx_axienet_main.c | 21 ++++++++----------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet.h b/drivers/net/ethernet/xilinx/xilinx_axienet.h index c7d9221fafdcb1..09c9f9787180bf 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet.h +++ b/drivers/net/ethernet/xilinx/xilinx_axienet.h @@ -170,6 +170,7 @@ #define XAE_UAW0_OFFSET 0x00000700 /* Unicast address word 0 */ #define XAE_UAW1_OFFSET 0x00000704 /* Unicast address word 1 */ #define XAE_FMI_OFFSET 0x00000708 /* Frame Filter Control */ +#define XAE_FFE_OFFSET 0x0000070C /* Frame Filter Enable */ #define XAE_AF0_OFFSET 0x00000710 /* Address Filter 0 */ #define XAE_AF1_OFFSET 0x00000714 /* Address Filter 1 */ diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 163d052480071d..9aeb7b9f3ae408 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -432,7 +432,7 @@ static int netdev_set_mac_address(struct net_device *ndev, void *p) */ static void axienet_set_multicast_list(struct net_device *ndev) { - int i; + int i = 0; u32 reg, af0reg, af1reg; struct axienet_local *lp = netdev_priv(ndev); @@ -454,7 +454,6 @@ static void axienet_set_multicast_list(struct net_device *ndev) reg &= ~XAE_FMI_PM_MASK; axienet_iow(lp, XAE_FMI_OFFSET, reg); - i = 0; netdev_for_each_mc_addr(ha, ndev) { if (i >= XAE_MULTICAST_CAM_TABLE_NUM) break; @@ -473,6 +472,7 @@ static void axienet_set_multicast_list(struct net_device *ndev) axienet_iow(lp, XAE_FMI_OFFSET, reg); axienet_iow(lp, XAE_AF0_OFFSET, af0reg); axienet_iow(lp, XAE_AF1_OFFSET, af1reg); + axienet_iow(lp, XAE_FFE_OFFSET, 1); i++; } } else { @@ -480,18 +480,15 @@ static void axienet_set_multicast_list(struct net_device *ndev) reg &= ~XAE_FMI_PM_MASK; axienet_iow(lp, XAE_FMI_OFFSET, reg); - - for (i = 0; i < XAE_MULTICAST_CAM_TABLE_NUM; i++) { - reg = axienet_ior(lp, XAE_FMI_OFFSET) & 0xFFFFFF00; - reg |= i; - - axienet_iow(lp, XAE_FMI_OFFSET, reg); - axienet_iow(lp, XAE_AF0_OFFSET, 0); - axienet_iow(lp, XAE_AF1_OFFSET, 0); - } - dev_info(&ndev->dev, "Promiscuous mode disabled.\n"); } + + for (; i < XAE_MULTICAST_CAM_TABLE_NUM; i++) { + reg = axienet_ior(lp, XAE_FMI_OFFSET) & 0xFFFFFF00; + reg |= i; + axienet_iow(lp, XAE_FMI_OFFSET, reg); + axienet_iow(lp, XAE_FFE_OFFSET, 0); + } } /** From 57fb67783c4011581882f32e656d738da1f82042 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 21 Aug 2024 20:32:52 +0800 Subject: [PATCH 651/991] net: ovs: fix ovs_drop_reasons error There is something wrong with ovs_drop_reasons. ovs_drop_reasons[0] is "OVS_DROP_LAST_ACTION", but OVS_DROP_LAST_ACTION == __OVS_DROP_REASON + 1, which means that ovs_drop_reasons[1] should be "OVS_DROP_LAST_ACTION". And as Adrian tested, without the patch, adding flow to drop packets results in: drop at: do_execute_actions+0x197/0xb20 [openvsw (0xffffffffc0db6f97) origin: software input port ifindex: 8 timestamp: Tue Aug 20 10:19:17 2024 859853461 nsec protocol: 0x800 length: 98 original length: 98 drop reason: OVS_DROP_ACTION_ERROR With the patch, the same results in: drop at: do_execute_actions+0x197/0xb20 [openvsw (0xffffffffc0db6f97) origin: software input port ifindex: 8 timestamp: Tue Aug 20 10:16:13 2024 475856608 nsec protocol: 0x800 length: 98 original length: 98 drop reason: OVS_DROP_LAST_ACTION Fix this by initializing ovs_drop_reasons with index. Fixes: 9d802da40b7c ("net: openvswitch: add last-action drop reason") Signed-off-by: Menglong Dong Tested-by: Adrian Moreno Reviewed-by: Adrian Moreno Link: https://patch.msgid.link/20240821123252.186305-1-dongml2@chinatelecom.cn Signed-off-by: Jakub Kicinski --- net/openvswitch/datapath.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 99d72543abd3aa..78d9961fcd446d 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -2706,7 +2706,7 @@ static struct pernet_operations ovs_net_ops = { }; static const char * const ovs_drop_reasons[] = { -#define S(x) (#x), +#define S(x) [(x) & ~SKB_DROP_REASON_SUBSYS_MASK] = (#x), OVS_DROP_REASONS(S) #undef S }; From 0124fb0ebf3b0ef89892d42147c9387be3105318 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Wed, 21 Aug 2024 11:13:37 +0200 Subject: [PATCH 652/991] s390/iucv: Fix vargs handling in iucv_alloc_device() iucv_alloc_device() gets a format string and a varying number of arguments. This is incorrectly forwarded by calling dev_set_name() with the format string and a va_list, while dev_set_name() expects also a varying number of arguments. Symptoms: Corrupted iucv device names, which can result in log messages like: sysfs: cannot create duplicate filename '/devices/iucv/hvc_iucv1827699952' Fixes: 4452e8ef8c36 ("s390/iucv: Provide iucv_alloc_device() / iucv_release_device()") Link: https://bugzilla.suse.com/show_bug.cgi?id=1228425 Signed-off-by: Alexandra Winter Reviewed-by: Thorsten Winkler Reviewed-by: Przemek Kitszel Link: https://patch.msgid.link/20240821091337.3627068-1-wintera@linux.ibm.com Signed-off-by: Jakub Kicinski --- net/iucv/iucv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index 1e42e13ad24e36..d3e9efab7f4bd2 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -86,13 +86,15 @@ struct device *iucv_alloc_device(const struct attribute_group **attrs, { struct device *dev; va_list vargs; + char buf[20]; int rc; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) goto out_error; va_start(vargs, fmt); - rc = dev_set_name(dev, fmt, vargs); + vsnprintf(buf, sizeof(buf), fmt, vargs); + rc = dev_set_name(dev, "%s", buf); va_end(vargs); if (rc) goto out_error; From a54a93d0e3599b05856971734e15418ac551a14c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 13 Aug 2024 09:35:27 +0800 Subject: [PATCH 653/991] nvme: move stopping keep-alive into nvme_uninit_ctrl() Commit 4733b65d82bd ("nvme: start keep-alive after admin queue setup") moves starting keep-alive from nvme_start_ctrl() into nvme_init_ctrl_finish(), but don't move stopping keep-alive into nvme_uninit_ctrl(), so keep-alive work can be started and keep pending after failing to start controller, finally use-after-free is triggered if nvme host driver is unloaded. This patch fixes kernel panic when running nvme/004 in case that connection failure is triggered, by moving stopping keep-alive into nvme_uninit_ctrl(). This way is reasonable because keep-alive is now started in nvme_init_ctrl_finish(). Fixes: 3af755a46881 ("nvme: move nvme_stop_keep_alive() back to original position") Cc: Hannes Reinecke Cc: Mark O'Donovan Reported-by: Changhui Zhong Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Signed-off-by: Ming Lei Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 33fa01c599adda..0dc8bcc664f223 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4612,7 +4612,6 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl) { nvme_mpath_stop(ctrl); nvme_auth_stop(ctrl); - nvme_stop_keep_alive(ctrl); nvme_stop_failfast_work(ctrl); flush_work(&ctrl->async_event_work); cancel_work_sync(&ctrl->fw_act_work); @@ -4648,6 +4647,7 @@ EXPORT_SYMBOL_GPL(nvme_start_ctrl); void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) { + nvme_stop_keep_alive(ctrl); nvme_hwmon_exit(ctrl); nvme_fault_inject_fini(&ctrl->fault_inject); dev_pm_qos_hide_latency_tolerance(ctrl->device); From fe01751347359862c65c715d51c0b3f4fa8ee2f0 Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Wed, 14 Aug 2024 19:26:50 +0530 Subject: [PATCH 654/991] nvme: Remove unused field The "name" field in struct nvme_ctrl is unsued so removing it. This would help save 12 bytes of space for each nvme_ctrl instance created. Signed-off-by: Nilay Shroff Reviewed-by: Kanchan Joshi Signed-off-by: Keith Busch --- drivers/nvme/host/nvme.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ae5314d32943e8..da57947130cc7a 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -301,7 +301,6 @@ struct nvme_ctrl { struct opal_dev *opal_dev; - char name[12]; u16 cntlid; u16 mtfa; From 67d95303c84732c2e1de5730756281f648dbefaf Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 13 Aug 2024 15:21:13 +0530 Subject: [PATCH 655/991] cpufreq: amd-pstate: Fix uninitialized variable in amd_pstate_cpu_boost_update() Smatch complains that "ret" could be uninitialized: drivers/cpufreq/amd-pstate.c:734 amd_pstate_cpu_boost_update() error: uninitialized symbol 'ret'. This seems like it probably is a real issue. Initialize "ret" to zero to be safe. Fixes: c8c68c38b56f ("cpufreq: amd-pstate: initialize core precision boost state") Signed-off-by: Dan Carpenter Reviewed-by: Perry Yuan Acked-by: Gautham R. Shenoy Link: https://lore.kernel.org/lkml/7ff53543-6c04-48a0-8d99-7dc010b93b3a@stanley.mountain/T/ Signed-off-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 68c616b572f22c..358bd88cd0c5cf 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -692,7 +692,7 @@ static int amd_pstate_cpu_boost_update(struct cpufreq_policy *policy, bool on) struct amd_cpudata *cpudata = policy->driver_data; struct cppc_perf_ctrls perf_ctrls; u32 highest_perf, nominal_perf, nominal_freq, max_freq; - int ret; + int ret = 0; highest_perf = READ_ONCE(cpudata->highest_perf); nominal_perf = READ_ONCE(cpudata->nominal_perf); From 0d8584d288a9b4132e945d76bcc04395d158b2e7 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Tue, 13 Aug 2024 15:21:14 +0530 Subject: [PATCH 656/991] cpufreq/amd-pstate: Use topology_logical_package_id() instead of logical_die_id() After the commit 63edbaa48a57 ("x86/cpu/topology: Add support for the AMD 0x80000026 leaf"), the topolgy_logical_die_id() function returns the logical Core Chiplet Die (CCD) ID instead of the logical socket ID. Since this is currently used to set MSR_AMD_CPPC_ENABLE, which needs to be set on any one of the threads of the socket, it is prudent to use topology_logical_package_id() in place of topology_logical_die_id(). Fixes: 63edbaa48a57 ("x86/cpu/topology: Add support for the AMD 0x80000026 leaf") cc: stable@vger.kernel.org # 6.10 Signed-off-by: Gautham R. Shenoy Tested-by: Dhananjay Ugwekar Link: https://lore.kernel.org/lkml/20240801124509.3650-1-Dhananjay.Ugwekar@amd.com/ Signed-off-by: Dhananjay Ugwekar Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 358bd88cd0c5cf..89bda7a2bb8d13 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -321,7 +321,7 @@ static inline int pstate_enable(bool enable) return 0; for_each_present_cpu(cpu) { - unsigned long logical_id = topology_logical_die_id(cpu); + unsigned long logical_id = topology_logical_package_id(cpu); if (test_bit(logical_id, &logical_proc_id_mask)) continue; From 5e51224d2afbda57f33f47485871ee5532145e18 Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Tue, 20 Aug 2024 14:33:15 +0000 Subject: [PATCH 657/991] smb/client: fix typo: GlobalMid_Sem -> GlobalMid_Lock The comments have typos, fix that to not confuse readers. Signed-off-by: ChenXiaoSong Reviewed-by: Namjae Jeon --- fs/smb/client/cifsfs.c | 6 +++--- fs/smb/client/cifsglob.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 2c4b357d85e22c..d89485235425ae 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -75,9 +75,9 @@ unsigned int sign_CIFS_PDUs = 1; /* * Global transaction id (XID) information */ -unsigned int GlobalCurrentXid; /* protected by GlobalMid_Sem */ -unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */ -unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */ +unsigned int GlobalCurrentXid; /* protected by GlobalMid_Lock */ +unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Lock */ +unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Lock */ spinlock_t GlobalMid_Lock; /* protects above & list operations on midQ entries */ /* diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 5c9b3e6cd95f20..7ebe80a25d0457 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -2017,9 +2017,9 @@ extern spinlock_t cifs_tcp_ses_lock; /* * Global transaction id (XID) information */ -extern unsigned int GlobalCurrentXid; /* protected by GlobalMid_Sem */ -extern unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */ -extern unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */ +extern unsigned int GlobalCurrentXid; /* protected by GlobalMid_Lock */ +extern unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Lock */ +extern unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Lock */ extern spinlock_t GlobalMid_Lock; /* protects above & list operations on midQ entries */ /* From cb78f9b7d0c0c9f86d8c0ac9c46b8b684d8785a9 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 15 Aug 2024 10:18:41 -0400 Subject: [PATCH 658/991] nfs: fix the fetch of FATTR4_OPEN_ARGUMENTS The client doesn't properly request FATTR4_OPEN_ARGUMENTS in the initial SERVER_CAPS getattr. Add FATTR4_WORD2_OPEN_ARGUMENTS to the initial request. Fixes: 707f13b3d081 (NFSv4: Add support for the FATTR4_OPEN_ARGUMENTS attribute) Signed-off-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8883016c551cec..06df74362e947e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3931,7 +3931,8 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f FATTR4_WORD0_CASE_INSENSITIVE | FATTR4_WORD0_CASE_PRESERVING; if (minorversion) - bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT; + bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT | + FATTR4_WORD2_OPEN_ARGUMENTS; status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == 0) { From 95832998fb6edc50d4f2f6a958d9f90142d4be48 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 21 Aug 2024 08:28:25 -0400 Subject: [PATCH 659/991] nfs: fix bitmap decoder to handle a 3rd word It only decodes the first two words at this point. Have it decode the third word as well. Without this, the client doesn't send delegated timestamps in the CB_GETATTR response. With this change we also need to expand the on-stack bitmap in decode_recallany_args to 3 elements, in case the server sends a larger bitmap than expected. Fixes: 43df7110f4a9 ("NFSv4: Add CB_GETATTR support for delegated attributes") Signed-off-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- fs/nfs/callback_xdr.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 29c49a7e5fe1c1..6df77f008d3fad 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -118,7 +118,9 @@ static __be32 decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) if (likely(attrlen > 0)) bitmap[0] = ntohl(*p++); if (attrlen > 1) - bitmap[1] = ntohl(*p); + bitmap[1] = ntohl(*p++); + if (attrlen > 2) + bitmap[2] = ntohl(*p); return 0; } @@ -446,7 +448,7 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp, void *argp) { struct cb_recallanyargs *args = argp; - uint32_t bitmap[2]; + uint32_t bitmap[3]; __be32 *p, status; p = xdr_inline_decode(xdr, 4); From a017ad1313fc91bdf235097fd0a02f673fc7bb11 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 21 Aug 2024 14:05:00 -0400 Subject: [PATCH 660/991] NFSv4: Add missing rescheduling points in nfs_client_return_marked_delegations We're seeing reports of soft lockups when iterating through the loops, so let's add rescheduling points. Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- fs/nfs/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index cbbd4866b0b7a4..97b386032b717a 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -228,6 +229,7 @@ static int __nfs_list_for_each_server(struct list_head *head, ret = fn(server, data); if (ret) goto out; + cond_resched(); rcu_read_lock(); } rcu_read_unlock(); From d72b7963115bea971a28eaa2cb76722c023f9fdf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 21 Aug 2024 14:05:01 -0400 Subject: [PATCH 661/991] NFSv4: Fix clearing of layout segments in layoutreturn Make sure that we clear the layout segments in cases where we see a fatal error, and also in the case where the layout is invalid. Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 9 ++++++--- fs/nfs/pnfs.c | 5 ++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 06df74362e947e..b8ffbe52ba15a5 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -9998,6 +9998,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) fallthrough; default: task->tk_status = 0; + lrp->res.lrs_present = 0; fallthrough; case 0: break; @@ -10011,9 +10012,11 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) task->tk_status = 0; break; case -NFS4ERR_DELAY: - if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN) - break; - goto out_restart; + if (nfs4_async_handle_error(task, server, NULL, NULL) == + -EAGAIN) + goto out_restart; + lrp->res.lrs_present = 0; + break; } return; out_restart: diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index aa698481bec8d1..0d16b383a45262 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1284,10 +1284,9 @@ void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, LIST_HEAD(freeme); spin_lock(&inode->i_lock); - if (!pnfs_layout_is_valid(lo) || - !nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) + if (!nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) goto out_unlock; - if (stateid) { + if (stateid && pnfs_layout_is_valid(lo)) { u32 seq = be32_to_cpu(arg_stateid->seqid); pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq); From f92214e4c312f6ea9d78650cc6291d200f17abb6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 21 Aug 2024 14:05:02 -0400 Subject: [PATCH 662/991] NFS: Avoid unnecessary rescanning of the per-server delegation list If the call to nfs_delegation_grab_inode() fails, we will not have dropped any locks that require us to rescan the list. Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- fs/nfs/delegation.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index d5edb3b3eeef0d..20cb2008f9e469 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -647,6 +647,9 @@ static int nfs_server_return_marked_delegations(struct nfs_server *server, prev = delegation; continue; } + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) + continue; if (prev) { struct inode *tmp = nfs_delegation_grab_inode(prev); @@ -657,12 +660,6 @@ static int nfs_server_return_marked_delegations(struct nfs_server *server, } } - inode = nfs_delegation_grab_inode(delegation); - if (inode == NULL) { - rcu_read_unlock(); - iput(to_put); - goto restart; - } delegation = nfs_start_delegation_return_locked(NFS_I(inode)); rcu_read_unlock(); @@ -1184,7 +1181,6 @@ static int nfs_server_reap_unclaimed_delegations(struct nfs_server *server, struct inode *inode; restart: rcu_read_lock(); -restart_locked: list_for_each_entry_rcu(delegation, &server->delegations, super_list) { if (test_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags) || @@ -1195,7 +1191,7 @@ static int nfs_server_reap_unclaimed_delegations(struct nfs_server *server, continue; inode = nfs_delegation_grab_inode(delegation); if (inode == NULL) - goto restart_locked; + continue; delegation = nfs_start_delegation_return_locked(NFS_I(inode)); rcu_read_unlock(); if (delegation != NULL) { @@ -1318,7 +1314,6 @@ static int nfs_server_reap_expired_delegations(struct nfs_server *server, restart: rcu_read_lock(); -restart_locked: list_for_each_entry_rcu(delegation, &server->delegations, super_list) { if (test_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags) || @@ -1330,7 +1325,7 @@ static int nfs_server_reap_expired_delegations(struct nfs_server *server, continue; inode = nfs_delegation_grab_inode(delegation); if (inode == NULL) - goto restart_locked; + continue; spin_lock(&delegation->lock); cred = get_cred_rcu(delegation->cred); nfs4_stateid_copy(&stateid, &delegation->stateid); From 979b581e4c69257acab1af415ddad6b2d78a2fa5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 21 Aug 2024 17:53:39 +0000 Subject: [PATCH 663/991] pktgen: use cpus_read_lock() in pg_net_init() I have seen the WARN_ON(smp_processor_id() != cpu) firing in pktgen_thread_worker() during tests. We must use cpus_read_lock()/cpus_read_unlock() around the for_each_online_cpu(cpu) loop. While we are at it use WARN_ON_ONCE() to avoid a possible syslog flood. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20240821175339.1191779-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/pktgen.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index ea55a758a475ab..197a50ef8e2e1b 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3654,7 +3654,7 @@ static int pktgen_thread_worker(void *arg) struct pktgen_dev *pkt_dev = NULL; int cpu = t->cpu; - WARN_ON(smp_processor_id() != cpu); + WARN_ON_ONCE(smp_processor_id() != cpu); init_waitqueue_head(&t->queue); complete(&t->start_done); @@ -3989,6 +3989,7 @@ static int __net_init pg_net_init(struct net *net) goto remove; } + cpus_read_lock(); for_each_online_cpu(cpu) { int err; @@ -3997,6 +3998,7 @@ static int __net_init pg_net_init(struct net *net) pr_warn("Cannot create thread for cpu %d (%d)\n", cpu, err); } + cpus_read_unlock(); if (list_empty(&pn->pktgen_threads)) { pr_err("Initialization failed for all threads\n"); From 3417c9574e368f0330637505f00d3814ca8854d2 Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Tue, 20 Aug 2024 23:51:31 -0700 Subject: [PATCH 664/991] scsi: lpfc: Fix overflow build issue Build failed while enabling "CONFIG_GCOV_KERNEL=y" and "CONFIG_GCOV_PROFILE_ALL=y" with following error: BUILDSTDERR: drivers/scsi/lpfc/lpfc_bsg.c: In function 'lpfc_get_cgnbuf_info': BUILDSTDERR: ./include/linux/fortify-string.h:114:33: error: '__builtin_memcpy' accessing 18446744073709551615 bytes at offsets 0 and 0 overlaps 9223372036854775807 bytes at offset -9223372036854775808 [-Werror=restrict] BUILDSTDERR: 114 | #define __underlying_memcpy __builtin_memcpy BUILDSTDERR: | ^ BUILDSTDERR: ./include/linux/fortify-string.h:637:9: note: in expansion of macro '__underlying_memcpy' BUILDSTDERR: 637 | __underlying_##op(p, q, __fortify_size); \ BUILDSTDERR: | ^~~~~~~~~~~~~ BUILDSTDERR: ./include/linux/fortify-string.h:682:26: note: in expansion of macro '__fortify_memcpy_chk' BUILDSTDERR: 682 | #define memcpy(p, q, s) __fortify_memcpy_chk(p, q, s, \ BUILDSTDERR: | ^~~~~~~~~~~~~~~~~~~~ BUILDSTDERR: drivers/scsi/lpfc/lpfc_bsg.c:5468:9: note: in expansion of macro 'memcpy' BUILDSTDERR: 5468 | memcpy(cgn_buff, cp, cinfosz); BUILDSTDERR: | ^~~~~~ This happens from the commit 06bb7fc0feee ("kbuild: turn on -Wrestrict by default"). Address this issue by using size_t type. Signed-off-by: Sherry Yang Link: https://lore.kernel.org/r/20240821065131.1180791-1-sherry.yang@oracle.com Reviewed-by: Justin Tee Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_bsg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/lpfc/lpfc_bsg.c b/drivers/scsi/lpfc/lpfc_bsg.c index 4156419c52c78a..4756a3f825310c 100644 --- a/drivers/scsi/lpfc/lpfc_bsg.c +++ b/drivers/scsi/lpfc/lpfc_bsg.c @@ -5410,7 +5410,7 @@ lpfc_get_cgnbuf_info(struct bsg_job *job) struct get_cgnbuf_info_req *cgnbuf_req; struct lpfc_cgn_info *cp; uint8_t *cgn_buff; - int size, cinfosz; + size_t size, cinfosz; int rc = 0; if (job->request_len < sizeof(struct fc_bsg_request) + From 919ddf8336f0b84c0453bac583808c9f165a85c2 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 22 Aug 2024 00:51:42 +0200 Subject: [PATCH 665/991] scsi: aacraid: Fix double-free on probe failure aac_probe_one() calls hardware-specific init functions through the aac_driver_ident::init pointer, all of which eventually call down to aac_init_adapter(). If aac_init_adapter() fails after allocating memory for aac_dev::queues, it frees the memory but does not clear that member. After the hardware-specific init function returns an error, aac_probe_one() goes down an error path that frees the memory pointed to by aac_dev::queues, resulting.in a double-free. Reported-by: Michael Gordon Link: https://bugs.debian.org/1075855 Fixes: 8e0c5ebde82b ("[SCSI] aacraid: Newer adapter communication iterface support") Signed-off-by: Ben Hutchings Link: https://lore.kernel.org/r/ZsZvfqlQMveoL5KQ@decadent.org.uk Signed-off-by: Martin K. Petersen --- drivers/scsi/aacraid/comminit.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/aacraid/comminit.c b/drivers/scsi/aacraid/comminit.c index bd99c5492b7d49..0f64b024430376 100644 --- a/drivers/scsi/aacraid/comminit.c +++ b/drivers/scsi/aacraid/comminit.c @@ -642,6 +642,7 @@ struct aac_dev *aac_init_adapter(struct aac_dev *dev) if (aac_comm_init(dev)<0){ kfree(dev->queues); + dev->queues = NULL; return NULL; } /* @@ -649,6 +650,7 @@ struct aac_dev *aac_init_adapter(struct aac_dev *dev) */ if (aac_fib_setup(dev) < 0) { kfree(dev->queues); + dev->queues = NULL; return NULL; } From 4f9eedfa27ae5806ed10906bcceee7bae49c8941 Mon Sep 17 00:00:00 2001 From: Yihang Li Date: Mon, 19 Aug 2024 17:09:34 +0800 Subject: [PATCH 666/991] scsi: sd: Ignore command SYNCHRONIZE CACHE error if format in progress If formatting a suspended disk (such as formatting with different DIF type), the disk will be resuming first, and then the format command will submit to the disk through SG_IO ioctl. When the disk is processing the format command, the system does not submit other commands to the disk. Therefore, the system attempts to suspend the disk again and sends the SYNCHRONIZE CACHE command. However, the SYNCHRONIZE CACHE command will fail because the disk is in the formatting process. This will cause the runtime_status of the disk to error and it is difficult for user to recover it. Error info like: [ 669.925325] sd 6:0:6:0: [sdg] Synchronizing SCSI cache [ 670.202371] sd 6:0:6:0: [sdg] Synchronize Cache(10) failed: Result: hostbyte=0x00 driverbyte=DRIVER_OK [ 670.216300] sd 6:0:6:0: [sdg] Sense Key : 0x2 [current] [ 670.221860] sd 6:0:6:0: [sdg] ASC=0x4 ASCQ=0x4 To solve the issue, ignore the error and return success/0 when format is in progress. Cc: stable@vger.kernel.org Signed-off-by: Yihang Li Link: https://lore.kernel.org/r/20240819090934.2130592-1-liyihang9@huawei.com Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index dad3991397cf9a..9db86943d04cf5 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1823,13 +1823,15 @@ static int sd_sync_cache(struct scsi_disk *sdkp) (sshdr.asc == 0x74 && sshdr.ascq == 0x71)) /* drive is password locked */ /* this is no error here */ return 0; + /* - * This drive doesn't support sync and there's not much - * we can do because this is called during shutdown - * or suspend so just return success so those operations - * can proceed. + * If a format is in progress or if the drive does not + * support sync, there is not much we can do because + * this is called during shutdown or suspend so just + * return success so those operations can proceed. */ - if (sshdr.sense_key == ILLEGAL_REQUEST) + if ((sshdr.asc == 0x04 && sshdr.ascq == 0x04) || + sshdr.sense_key == ILLEGAL_REQUEST) return 0; } From b58b133e680b20d219940e0fdb6f6132c2b60f38 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Fri, 16 Aug 2024 10:49:06 +0000 Subject: [PATCH 667/991] iommu: Handle iommu faults for a bad iopf setup The iommu_report_device_fault function was updated to return void while assuming that drivers only need to call iommu_report_device_fault() for reporting an iopf. This implementation causes following problems: 1. The drivers rely on the core code to call it's page_reponse, however, when a fault is received and no fault capable domain is attached / iopf_param is NULL, the ops->page_response is NOT called causing the device to stall in case the fault type was PAGE_REQ. 2. The arm_smmu_v3 driver relies on the returned value to log errors returning void from iommu_report_device_fault causes these events to be missed while logging. Modify the iommu_report_device_fault function to return -EINVAL for cases where no fault capable domain is attached or iopf_param was NULL and calls back to the driver (ops->page_response) in case the fault type was IOMMU_FAULT_PAGE_REQ. The returned value can be used by the drivers to log the fault/event as needed. Reported-by: Kunkun Jiang Closes: https://lore.kernel.org/all/6147caf0-b9a0-30ca-795e-a1aa502a5c51@huawei.com/ Fixes: 3dfa64aecbaf ("iommu: Make iommu_report_device_fault() return void") Signed-off-by: Jason Gunthorpe Signed-off-by: Pranjal Shrivastava Reviewed-by: Jason Gunthorpe Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/20240816104906.1010626-1-praan@google.com Signed-off-by: Joerg Roedel --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 2 +- drivers/iommu/io-pgfault.c | 121 ++++++++++++++------ include/linux/iommu.h | 5 +- 3 files changed, 87 insertions(+), 41 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index a31460f9f3d421..ed2b106e02dd10 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1777,7 +1777,7 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) goto out_unlock; } - iommu_report_device_fault(master->dev, &fault_evt); + ret = iommu_report_device_fault(master->dev, &fault_evt); out_unlock: mutex_unlock(&smmu->streams_mutex); return ret; diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 81e9cc6e3164a4..4674e618797c15 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -115,6 +115,59 @@ static struct iopf_group *iopf_group_alloc(struct iommu_fault_param *iopf_param, return group; } +static struct iommu_attach_handle *find_fault_handler(struct device *dev, + struct iopf_fault *evt) +{ + struct iommu_fault *fault = &evt->fault; + struct iommu_attach_handle *attach_handle; + + if (fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) { + attach_handle = iommu_attach_handle_get(dev->iommu_group, + fault->prm.pasid, 0); + if (IS_ERR(attach_handle)) { + const struct iommu_ops *ops = dev_iommu_ops(dev); + + if (!ops->user_pasid_table) + return NULL; + /* + * The iommu driver for this device supports user- + * managed PASID table. Therefore page faults for + * any PASID should go through the NESTING domain + * attached to the device RID. + */ + attach_handle = iommu_attach_handle_get( + dev->iommu_group, IOMMU_NO_PASID, + IOMMU_DOMAIN_NESTED); + if (IS_ERR(attach_handle)) + return NULL; + } + } else { + attach_handle = iommu_attach_handle_get(dev->iommu_group, + IOMMU_NO_PASID, 0); + + if (IS_ERR(attach_handle)) + return NULL; + } + + if (!attach_handle->domain->iopf_handler) + return NULL; + + return attach_handle; +} + +static void iopf_error_response(struct device *dev, struct iopf_fault *evt) +{ + const struct iommu_ops *ops = dev_iommu_ops(dev); + struct iommu_fault *fault = &evt->fault; + struct iommu_page_response resp = { + .pasid = fault->prm.pasid, + .grpid = fault->prm.grpid, + .code = IOMMU_PAGE_RESP_INVALID + }; + + ops->page_response(dev, evt, &resp); +} + /** * iommu_report_device_fault() - Report fault event to device driver * @dev: the device @@ -153,24 +206,39 @@ static struct iopf_group *iopf_group_alloc(struct iommu_fault_param *iopf_param, * handling framework should guarantee that the iommu domain could only be * freed after the device has stopped generating page faults (or the iommu * hardware has been set to block the page faults) and the pending page faults - * have been flushed. + * have been flushed. In case no page fault handler is attached or no iopf params + * are setup, then the ops->page_response() is called to complete the evt. + * + * Returns 0 on success, or an error in case of a bad/failed iopf setup. */ -void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) +int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) { + struct iommu_attach_handle *attach_handle; struct iommu_fault *fault = &evt->fault; struct iommu_fault_param *iopf_param; struct iopf_group abort_group = {}; struct iopf_group *group; + attach_handle = find_fault_handler(dev, evt); + if (!attach_handle) + goto err_bad_iopf; + + /* + * Something has gone wrong if a fault capable domain is attached but no + * iopf_param is setup + */ iopf_param = iopf_get_dev_fault_param(dev); if (WARN_ON(!iopf_param)) - return; + goto err_bad_iopf; if (!(fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { - report_partial_fault(iopf_param, fault); + int ret; + + ret = report_partial_fault(iopf_param, fault); iopf_put_dev_fault_param(iopf_param); /* A request that is not the last does not need to be ack'd */ - return; + + return ret; } /* @@ -185,38 +253,7 @@ void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) if (group == &abort_group) goto err_abort; - if (fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) { - group->attach_handle = iommu_attach_handle_get(dev->iommu_group, - fault->prm.pasid, - 0); - if (IS_ERR(group->attach_handle)) { - const struct iommu_ops *ops = dev_iommu_ops(dev); - - if (!ops->user_pasid_table) - goto err_abort; - - /* - * The iommu driver for this device supports user- - * managed PASID table. Therefore page faults for - * any PASID should go through the NESTING domain - * attached to the device RID. - */ - group->attach_handle = - iommu_attach_handle_get(dev->iommu_group, - IOMMU_NO_PASID, - IOMMU_DOMAIN_NESTED); - if (IS_ERR(group->attach_handle)) - goto err_abort; - } - } else { - group->attach_handle = - iommu_attach_handle_get(dev->iommu_group, IOMMU_NO_PASID, 0); - if (IS_ERR(group->attach_handle)) - goto err_abort; - } - - if (!group->attach_handle->domain->iopf_handler) - goto err_abort; + group->attach_handle = attach_handle; /* * On success iopf_handler must call iopf_group_response() and @@ -225,7 +262,7 @@ void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) if (group->attach_handle->domain->iopf_handler(group)) goto err_abort; - return; + return 0; err_abort: dev_warn_ratelimited(dev, "iopf with pasid %d aborted\n", @@ -235,6 +272,14 @@ void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) __iopf_free_group(group); else iopf_free_group(group); + + return 0; + +err_bad_iopf: + if (fault->type == IOMMU_FAULT_PAGE_REQ) + iopf_error_response(dev, evt); + + return -EINVAL; } EXPORT_SYMBOL_GPL(iommu_report_device_fault); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 04cbdae0052eb2..bd722f47363520 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1563,7 +1563,7 @@ struct iopf_queue *iopf_queue_alloc(const char *name); void iopf_queue_free(struct iopf_queue *queue); int iopf_queue_discard_partial(struct iopf_queue *queue); void iopf_free_group(struct iopf_group *group); -void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt); +int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt); void iopf_group_response(struct iopf_group *group, enum iommu_page_response_code status); #else @@ -1601,9 +1601,10 @@ static inline void iopf_free_group(struct iopf_group *group) { } -static inline void +static inline int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) { + return -ENODEV; } static inline void iopf_group_response(struct iopf_group *group, From 82b8000c28b56b014ce52a1f1581bef4af148681 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 19 Aug 2024 11:09:43 +0200 Subject: [PATCH 668/991] net: drop special comment style As we discussed in the room at netdevconf earlier this week, drop the requirement for special comment style for netdev. For checkpatch, the general check accepts both right now, so simply drop the special request there as well. Acked-by: Stephen Hemminger Signed-off-by: Johannes Berg Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- Documentation/process/coding-style.rst | 12 ------------ Documentation/process/maintainer-netdev.rst | 17 ----------------- scripts/checkpatch.pl | 10 ---------- 3 files changed, 39 deletions(-) diff --git a/Documentation/process/coding-style.rst b/Documentation/process/coding-style.rst index 04f6aa377a5db1..8e30c8f7697d5e 100644 --- a/Documentation/process/coding-style.rst +++ b/Documentation/process/coding-style.rst @@ -629,18 +629,6 @@ The preferred style for long (multi-line) comments is: * with beginning and ending almost-blank lines. */ -For files in net/ and drivers/net/ the preferred style for long (multi-line) -comments is a little different. - -.. code-block:: c - - /* The preferred comment style for files in net/ and drivers/net - * looks like this. - * - * It is nearly the same as the generally preferred comment style, - * but there is no initial almost-blank line. - */ - It's also important to comment data, whether they are basic types or derived types. To this end, use just one data declaration per line (no commas for multiple data declarations). This leaves you room for a small comment on each diff --git a/Documentation/process/maintainer-netdev.rst b/Documentation/process/maintainer-netdev.rst index fe8616397d63be..30d24eecdaaa97 100644 --- a/Documentation/process/maintainer-netdev.rst +++ b/Documentation/process/maintainer-netdev.rst @@ -355,23 +355,6 @@ just do it. As a result, a sequence of smaller series gets merged quicker and with better review coverage. Re-posting large series also increases the mailing list traffic. -Multi-line comments -~~~~~~~~~~~~~~~~~~~ - -Comment style convention is slightly different for networking and most of -the tree. Instead of this:: - - /* - * foobar blah blah blah - * another line of text - */ - -it is requested that you make it look like this:: - - /* foobar blah blah blah - * another line of text - */ - Local variable ordering ("reverse xmas tree", "RCS") ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 39032224d504f1..4427572b24771d 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -4015,16 +4015,6 @@ sub process { } } -# Block comment styles -# Networking with an initial /* - if ($realfile =~ m@^(drivers/net/|net/)@ && - $prevrawline =~ /^\+[ \t]*\/\*[ \t]*$/ && - $rawline =~ /^\+[ \t]*\*/ && - $realline > 3) { # Do not warn about the initial copyright comment block after SPDX-License-Identifier - WARN("NETWORKING_BLOCK_COMMENT_STYLE", - "networking block comments don't use an empty /* line, use /* Comment...\n" . $hereprev); - } - # Block comments use * on subsequent lines if ($prevline =~ /$;[ \t]*$/ && #ends in comment $prevrawline =~ /^\+.*?\/\*/ && #starting /* From 71c8e2a7c822ee557b07d9bb49028dd269c87b2e Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 22 Aug 2024 11:23:08 +0100 Subject: [PATCH 669/991] irqchip/gic-v3: Init SRE before poking sysregs The GICv3 driver pokes GICv3 system registers in gic_prio_init() before gic_cpu_sys_reg_init() ensures that GICv3 system registers have been enabled by writing to ICC_SRE_EL1.SRE. On arm64 this is benign as has_useable_gicv3_cpuif() runs earlier during cpufeature detection, and this enables the GICv3 system registers. On 32-bit arm when booting on an FVP using the boot-wrapper, the accesses in gic_prio_init() end up being UNDEFINED and crashes the kernel during boot. This is a regression introduced by the addition of gic_prio_init(). Fix this by factoring out the SRE initialization into a new function and calling it early in the three paths where SRE may not have been initialized: (1) gic_init_bases(), before the primary CPU pokes GICv3 sysregs in gic_prio_init(). (2) gic_starting_cpu(), before secondary CPUs initialize GICv3 sysregs in gic_cpu_init(). (3) gic_cpu_pm_notifier(), before CPUs re-initialize GICv3 sysregs in gic_cpu_sys_reg_init(). Fixes: d447bf09a4013541 ("irqchip/gic-v3: Detect GICD_CTRL.DS and SCR_EL3.FIQ earlier") Signed-off-by: Mark Rutland Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Cc: stable@vger.kernel.org --- drivers/irqchip/irq-gic-v3.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index c19083bfb94323..74f21e03d4a374 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -1154,14 +1154,8 @@ static void gic_update_rdist_properties(void) gic_data.rdists.has_vpend_valid_dirty ? "Valid+Dirty " : ""); } -static void gic_cpu_sys_reg_init(void) +static void gic_cpu_sys_reg_enable(void) { - int i, cpu = smp_processor_id(); - u64 mpidr = gic_cpu_to_affinity(cpu); - u64 need_rss = MPIDR_RS(mpidr); - bool group0; - u32 pribits; - /* * Need to check that the SRE bit has actually been set. If * not, it means that SRE is disabled at EL2. We're going to @@ -1172,6 +1166,16 @@ static void gic_cpu_sys_reg_init(void) if (!gic_enable_sre()) pr_err("GIC: unable to set SRE (disabled at EL2), panic ahead\n"); +} + +static void gic_cpu_sys_reg_init(void) +{ + int i, cpu = smp_processor_id(); + u64 mpidr = gic_cpu_to_affinity(cpu); + u64 need_rss = MPIDR_RS(mpidr); + bool group0; + u32 pribits; + pribits = gic_get_pribits(); group0 = gic_has_group0(); @@ -1333,6 +1337,7 @@ static int gic_check_rdist(unsigned int cpu) static int gic_starting_cpu(unsigned int cpu) { + gic_cpu_sys_reg_enable(); gic_cpu_init(); if (gic_dist_supports_lpis()) @@ -1498,6 +1503,7 @@ static int gic_cpu_pm_notifier(struct notifier_block *self, if (cmd == CPU_PM_EXIT) { if (gic_dist_security_disabled()) gic_enable_redist(true); + gic_cpu_sys_reg_enable(); gic_cpu_sys_reg_init(); } else if (cmd == CPU_PM_ENTER && gic_dist_security_disabled()) { gic_write_grpen1(0); @@ -2070,6 +2076,7 @@ static int __init gic_init_bases(phys_addr_t dist_phys_base, gic_update_rdist_properties(); + gic_cpu_sys_reg_enable(); gic_prio_init(); gic_dist_init(); gic_cpu_init(); From 996b37da1e0f51314d4186b326742c2a95a9f0dd Mon Sep 17 00:00:00 2001 From: Ed Tsai Date: Mon, 8 Jul 2024 15:22:06 +0800 Subject: [PATCH 670/991] backing-file: convert to using fops->splice_write Filesystems may define their own splice write. Therefore, use the file fops instead of invoking iter_file_splice_write() directly. Signed-off-by: Ed Tsai Link: https://lore.kernel.org/r/20240708072208.25244-1-ed.tsai@mediatek.com Fixes: 5ca73468612d ("fuse: implement splice read/write passthrough") Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/backing-file.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/backing-file.c b/fs/backing-file.c index afb557446c27c7..8860dac58c37e1 100644 --- a/fs/backing-file.c +++ b/fs/backing-file.c @@ -303,13 +303,16 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING))) return -EIO; + if (!out->f_op->splice_write) + return -EINVAL; + ret = file_remove_privs(ctx->user_file); if (ret) return ret; old_cred = override_creds(ctx->cred); file_start_write(out); - ret = iter_file_splice_write(pipe, out, ppos, len, flags); + ret = out->f_op->splice_write(pipe, out, ppos, len, flags); file_end_write(out); revert_creds(old_cred); From 880799fc7a3a127c43143935c1a8767d77c19cae Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Fri, 23 Aug 2024 12:07:12 +0200 Subject: [PATCH 671/991] irqchip/irq-msi-lib: Check for NULL ops in msi_lib_irq_domain_select() The irq_domain passed to msi_lib_irq_domain_select() may not have msi_parent_ops set. There is a NULL pointer check for it, but unfortunately there is a dereference of the parent ops pointer before that. Move the NULL pointer test before the first use of that pointer. This was found on a MacchiatoBin (Marvell Armada 8K SoC), which uses the irq-mvebu-sei driver. Fixes: 72e257c6f058 ("irqchip: Provide irq-msi-lib") Signed-off-by: Maxime Chevallier Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240823100733.1900666-1-maxime.chevallier@bootlin.com Closes: https://lore.kernel.org/all/20240821165034.1af97bad@fedora-3.home/ --- drivers/irqchip/irq-msi-lib.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-msi-lib.c b/drivers/irqchip/irq-msi-lib.c index b5b90003311a18..d8e29fc0d4068b 100644 --- a/drivers/irqchip/irq-msi-lib.c +++ b/drivers/irqchip/irq-msi-lib.c @@ -128,6 +128,9 @@ int msi_lib_irq_domain_select(struct irq_domain *d, struct irq_fwspec *fwspec, const struct msi_parent_ops *ops = d->msi_parent_ops; u32 busmask = BIT(bus_token); + if (!ops) + return 0; + if (fwspec->fwnode != d->fwnode || fwspec->param_count != 0) return 0; @@ -135,6 +138,6 @@ int msi_lib_irq_domain_select(struct irq_domain *d, struct irq_fwspec *fwspec, if (bus_token == ops->bus_select_token) return 1; - return ops && !!(ops->bus_select_mask & busmask); + return !!(ops->bus_select_mask & busmask); } EXPORT_SYMBOL_GPL(msi_lib_irq_domain_select); From ba7b6633e9afa6b5a788efd533c4bdc6fb1c606d Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Thu, 22 Aug 2024 15:23:56 +0530 Subject: [PATCH 672/991] platform/x86/amd/pmc: Fix SMU command submission path on new AMD platform MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 426463d94d45 ("platform/x86/amd/pmc: Send OS_HINT command for new AMD platform") was introduced to enable sending mailbox commands to PMFW on newer platforms. However, it was later discovered that the commit did not configure the correct message port ID (i.e., S2D or PMC). Without this configuration, all command submissions to PMFW are treated as invalid, leading to command failures. To address this issue, the CPU ID association for the new platform needs to be added in amd_pmc_get_ip_info(). This ensures that the correct SMU port IDs are selected. Fixes: 426463d94d45 ("platform/x86/amd/pmc: Send OS_HINT command for new AMD platform") Co-developed-by: Sanket Goswami Signed-off-by: Sanket Goswami Signed-off-by: Shyam Sundar S K Reviewed-by: Mario Limonciello Link: https://lore.kernel.org/r/20240822095357.395808-1-Shyam-sundar.S-k@amd.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmc/pmc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/amd/pmc/pmc.c b/drivers/platform/x86/amd/pmc/pmc.c index c3e51f0a5c33e2..f0d389cf1ecb4f 100644 --- a/drivers/platform/x86/amd/pmc/pmc.c +++ b/drivers/platform/x86/amd/pmc/pmc.c @@ -359,6 +359,7 @@ static void amd_pmc_get_ip_info(struct amd_pmc_dev *dev) dev->smu_msg = 0x538; break; case PCI_DEVICE_ID_AMD_1AH_M20H_ROOT: + case PCI_DEVICE_ID_AMD_1AH_M60H_ROOT: dev->num_ips = 22; dev->s2d_msg_id = 0xDE; dev->smu_msg = 0x938; From a24cd5cfd1d07712a9f192401af638e3c6cc1491 Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Thu, 22 Aug 2024 15:23:57 +0530 Subject: [PATCH 673/991] platform/x86/amd/pmc: Extend support for PMC features on new AMD platform MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PMC driver has capability to get the idle mask values and STB data from the PMFW. Extend this support to the platforms that belong to family 1Ah model 60h series. Co-developed-by: Sanket Goswami Signed-off-by: Sanket Goswami Signed-off-by: Shyam Sundar S K Reviewed-by: Mario Limonciello Link: https://lore.kernel.org/r/20240822095357.395808-2-Shyam-sundar.S-k@amd.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmc/pmc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/platform/x86/amd/pmc/pmc.c b/drivers/platform/x86/amd/pmc/pmc.c index f0d389cf1ecb4f..bbb8edb62e009f 100644 --- a/drivers/platform/x86/amd/pmc/pmc.c +++ b/drivers/platform/x86/amd/pmc/pmc.c @@ -598,6 +598,7 @@ static int amd_pmc_idlemask_read(struct amd_pmc_dev *pdev, struct device *dev, val = amd_pmc_reg_read(pdev, AMD_PMC_SCRATCH_REG_YC); break; case PCI_DEVICE_ID_AMD_1AH_M20H_ROOT: + case PCI_DEVICE_ID_AMD_1AH_M60H_ROOT: val = amd_pmc_reg_read(pdev, AMD_PMC_SCRATCH_REG_1AH); break; default: @@ -631,6 +632,7 @@ static bool amd_pmc_is_stb_supported(struct amd_pmc_dev *dev) case AMD_CPU_ID_CB: case AMD_CPU_ID_PS: case PCI_DEVICE_ID_AMD_1AH_M20H_ROOT: + case PCI_DEVICE_ID_AMD_1AH_M60H_ROOT: return true; default: return false; From 8af174ea863c72f25ce31cee3baad8a301c0cf0f Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Wed, 21 Aug 2024 13:42:29 -0700 Subject: [PATCH 674/991] net: mana: Fix race of mana_hwc_post_rx_wqe and new hwc response The mana_hwc_rx_event_handler() / mana_hwc_handle_resp() calls complete(&ctx->comp_event) before posting the wqe back. It's possible that other callers, like mana_create_txq(), start the next round of mana_hwc_send_request() before the posting of wqe. And if the HW is fast enough to respond, it can hit no_wqe error on the HW channel, then the response message is lost. The mana driver may fail to create queues and open, because of waiting for the HW response and timed out. Sample dmesg: [ 528.610840] mana 39d4:00:02.0: HWC: Request timed out! [ 528.614452] mana 39d4:00:02.0: Failed to send mana message: -110, 0x0 [ 528.618326] mana 39d4:00:02.0 enP14804s2: Failed to create WQ object: -110 To fix it, move posting of rx wqe before complete(&ctx->comp_event). Cc: stable@vger.kernel.org Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)") Signed-off-by: Haiyang Zhang Reviewed-by: Long Li Signed-off-by: David S. Miller --- .../net/ethernet/microsoft/mana/hw_channel.c | 62 ++++++++++--------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.c b/drivers/net/ethernet/microsoft/mana/hw_channel.c index cafded2f938294..a00f915c51881f 100644 --- a/drivers/net/ethernet/microsoft/mana/hw_channel.c +++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c @@ -52,9 +52,33 @@ static int mana_hwc_verify_resp_msg(const struct hwc_caller_ctx *caller_ctx, return 0; } +static int mana_hwc_post_rx_wqe(const struct hwc_wq *hwc_rxq, + struct hwc_work_request *req) +{ + struct device *dev = hwc_rxq->hwc->dev; + struct gdma_sge *sge; + int err; + + sge = &req->sge; + sge->address = (u64)req->buf_sge_addr; + sge->mem_key = hwc_rxq->msg_buf->gpa_mkey; + sge->size = req->buf_len; + + memset(&req->wqe_req, 0, sizeof(struct gdma_wqe_request)); + req->wqe_req.sgl = sge; + req->wqe_req.num_sge = 1; + req->wqe_req.client_data_unit = 0; + + err = mana_gd_post_and_ring(hwc_rxq->gdma_wq, &req->wqe_req, NULL); + if (err) + dev_err(dev, "Failed to post WQE on HWC RQ: %d\n", err); + return err; +} + static void mana_hwc_handle_resp(struct hw_channel_context *hwc, u32 resp_len, - const struct gdma_resp_hdr *resp_msg) + struct hwc_work_request *rx_req) { + const struct gdma_resp_hdr *resp_msg = rx_req->buf_va; struct hwc_caller_ctx *ctx; int err; @@ -62,6 +86,7 @@ static void mana_hwc_handle_resp(struct hw_channel_context *hwc, u32 resp_len, hwc->inflight_msg_res.map)) { dev_err(hwc->dev, "hwc_rx: invalid msg_id = %u\n", resp_msg->response.hwc_msg_id); + mana_hwc_post_rx_wqe(hwc->rxq, rx_req); return; } @@ -75,30 +100,13 @@ static void mana_hwc_handle_resp(struct hw_channel_context *hwc, u32 resp_len, memcpy(ctx->output_buf, resp_msg, resp_len); out: ctx->error = err; - complete(&ctx->comp_event); -} - -static int mana_hwc_post_rx_wqe(const struct hwc_wq *hwc_rxq, - struct hwc_work_request *req) -{ - struct device *dev = hwc_rxq->hwc->dev; - struct gdma_sge *sge; - int err; - - sge = &req->sge; - sge->address = (u64)req->buf_sge_addr; - sge->mem_key = hwc_rxq->msg_buf->gpa_mkey; - sge->size = req->buf_len; - memset(&req->wqe_req, 0, sizeof(struct gdma_wqe_request)); - req->wqe_req.sgl = sge; - req->wqe_req.num_sge = 1; - req->wqe_req.client_data_unit = 0; + /* Must post rx wqe before complete(), otherwise the next rx may + * hit no_wqe error. + */ + mana_hwc_post_rx_wqe(hwc->rxq, rx_req); - err = mana_gd_post_and_ring(hwc_rxq->gdma_wq, &req->wqe_req, NULL); - if (err) - dev_err(dev, "Failed to post WQE on HWC RQ: %d\n", err); - return err; + complete(&ctx->comp_event); } static void mana_hwc_init_event_handler(void *ctx, struct gdma_queue *q_self, @@ -235,14 +243,12 @@ static void mana_hwc_rx_event_handler(void *ctx, u32 gdma_rxq_id, return; } - mana_hwc_handle_resp(hwc, rx_oob->tx_oob_data_size, resp); + mana_hwc_handle_resp(hwc, rx_oob->tx_oob_data_size, rx_req); - /* Do no longer use 'resp', because the buffer is posted to the HW - * in the below mana_hwc_post_rx_wqe(). + /* Can no longer use 'resp', because the buffer is posted to the HW + * in mana_hwc_handle_resp() above. */ resp = NULL; - - mana_hwc_post_rx_wqe(hwc_rxq, rx_req); } static void mana_hwc_tx_event_handler(void *ctx, u32 gdma_txq_id, From c358a809cb58af944d496944391a240e02f5837a Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 20 Aug 2024 09:46:00 -0400 Subject: [PATCH 675/991] Revert "drm/ttm: increase ttm pre-fault value to PMD size" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 0ddd2ae586d28e521d37393364d989ce118802e0. This patch causes sluggishness and stuttering in graphical apps. Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3564 Link: https://www.spinics.net/lists/dri-devel/msg457005.html Signed-off-by: Alex Deucher Cc: Zhu Lingshan Cc: Christian König Reviewed-by: Christian König Signed-off-by: Alex Deucher Link: https://patchwork.freedesktop.org/patch/msgid/20240820134600.1909370-1-alexander.deucher@amd.com --- include/drm/ttm/ttm_bo.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/drm/ttm/ttm_bo.h b/include/drm/ttm/ttm_bo.h index ef0f52f56ebc63..6ccf96c91f3ae3 100644 --- a/include/drm/ttm/ttm_bo.h +++ b/include/drm/ttm/ttm_bo.h @@ -39,11 +39,7 @@ #include "ttm_device.h" /* Default number of pre-faulted pages in the TTM fault handler */ -#if CONFIG_PGTABLE_LEVELS > 2 -#define TTM_BO_VM_NUM_PREFAULT (1 << (PMD_SHIFT - PAGE_SHIFT)) -#else #define TTM_BO_VM_NUM_PREFAULT 16 -#endif struct iosys_map; From 752f387faaae0ae2e84d3f496922524785e77d60 Mon Sep 17 00:00:00 2001 From: Thomas Blocher Date: Wed, 31 Jul 2024 01:16:26 +0200 Subject: [PATCH 676/991] pinctrl: at91: make it work with current gpiolib pinctrl-at91 currently does not support the gpio-groups devicetree property and has no pin-range. Because of this at91 gpios stopped working since patch commit 2ab73c6d8323fa1e ("gpio: Support GPIO controllers without pin-ranges") This was discussed in the patches commit fc328a7d1fcce263 ("gpio: Revert regression in sysfs-gpio (gpiolib.c)") commit 56e337f2cf132632 ("Revert "gpio: Revert regression in sysfs-gpio (gpiolib.c)"") As a workaround manually set pin-range via gpiochip_add_pin_range() until a) pinctrl-at91 is reworked to support devicetree gpio-groups b) another solution as mentioned in commit 56e337f2cf132632 ("Revert "gpio: Revert regression in sysfs-gpio (gpiolib.c)"") is found Signed-off-by: Thomas Blocher Link: https://lore.kernel.org/5b992862-355d-f0de-cd3d-ff99e67a4ff1@ek-dev.de Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-at91.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/pinctrl/pinctrl-at91.c b/drivers/pinctrl/pinctrl-at91.c index b3c3f5fb2e2ec7..93ab277d9943cf 100644 --- a/drivers/pinctrl/pinctrl-at91.c +++ b/drivers/pinctrl/pinctrl-at91.c @@ -1403,8 +1403,11 @@ static int at91_pinctrl_probe(struct platform_device *pdev) /* We will handle a range of GPIO pins */ for (i = 0; i < gpio_banks; i++) - if (gpio_chips[i]) + if (gpio_chips[i]) { pinctrl_add_gpio_range(info->pctl, &gpio_chips[i]->range); + gpiochip_add_pin_range(&gpio_chips[i]->chip, dev_name(info->pctl->dev), 0, + gpio_chips[i]->range.pin_base, gpio_chips[i]->range.npins); + } dev_info(dev, "initialized AT91 pinctrl driver\n"); From 1c38a62f15e595346a1106025722869e87ffe044 Mon Sep 17 00:00:00 2001 From: Ma Ke Date: Thu, 8 Aug 2024 12:13:55 +0800 Subject: [PATCH 677/991] pinctrl: single: fix potential NULL dereference in pcs_get_function() pinmux_generic_get_function() can return NULL and the pointer 'function' was dereferenced without checking against NULL. Add checking of pointer 'function' in pcs_get_function(). Found by code review. Cc: stable@vger.kernel.org Fixes: 571aec4df5b7 ("pinctrl: single: Use generic pinmux helpers for managing functions") Signed-off-by: Ma Ke Link: https://lore.kernel.org/20240808041355.2766009-1-make24@iscas.ac.cn Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-single.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pinctrl/pinctrl-single.c b/drivers/pinctrl/pinctrl-single.c index 4c6bfabb6bd7d8..4da3c3f422b691 100644 --- a/drivers/pinctrl/pinctrl-single.c +++ b/drivers/pinctrl/pinctrl-single.c @@ -345,6 +345,8 @@ static int pcs_get_function(struct pinctrl_dev *pctldev, unsigned pin, return -ENOTSUPP; fselector = setting->func; function = pinmux_generic_get_function(pctldev, fselector); + if (!function) + return -EINVAL; *func = function->data; if (!(*func)) { dev_err(pcs->dev, "%s could not find function%i\n", From 166bf8af91225576f85208a31eaedbadd182d1ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=ADcolas=20F=2E=20R=2E=20A=2E=20Prado?= Date: Thu, 8 Aug 2024 19:27:09 -0400 Subject: [PATCH 678/991] pinctrl: mediatek: common-v2: Fix broken bias-disable for PULL_PU_PD_RSEL_TYPE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Despite its name, commit fed74d75277d ("pinctrl: mediatek: common-v2: Fix bias-disable for PULL_PU_PD_RSEL_TYPE") actually broke bias-disable for PULL_PU_PD_RSEL_TYPE. mtk_pinconf_bias_set_combo() tries every bias method supported by the pin until one succeeds. For PULL_PU_PD_RSEL_TYPE pins, before the breaking commit, mtk_pinconf_bias_set_rsel() would be called first to try and set the RSEL value (as well as PU and PD), and if that failed, the only other valid option was that bias-disable was specified, which would then be handled by calling mtk_pinconf_bias_set_pu_pd() and disabling both PU and PD. The breaking commit misunderstood this logic and added an early "return 0" in mtk_pinconf_bias_set_rsel(). The result was that in the bias-disable case, the bias was left unchanged, since by returning success, mtk_pinconf_bias_set_combo() no longer tried calling mtk_pinconf_bias_set_pu_pd() to disable the bias. Since the logic for configuring bias-disable on PULL_PU_PD_RSEL_TYPE pins required mtk_pinconf_bias_set_rsel() to fail first, in that case, an error was printed to the log, eg: mt8195-pinctrl 10005000.pinctrl: Not support rsel value 0 Ohm for pin = 29 (GPIO29) This is what the breaking commit actually got rid of, and likely part of the reason why that commit was thought to be fixing functionality, while in reality it was breaking it. Instead of simply reverting that commit, restore the functionality but in a way that avoids the error from being printed and makes the code less confusing: * Return 0 explicitly if a bias method was successful * Introduce an extra function mtk_pinconf_bias_set_pu_pd_rsel() that calls both mtk_pinconf_bias_set_rsel() (only if needed) and mtk_pinconf_bias_set_pu_pd() * And analogously for the corresponding getters Fixes: fed74d75277d ("pinctrl: mediatek: common-v2: Fix bias-disable for PULL_PU_PD_RSEL_TYPE") Signed-off-by: Nícolas F. R. A. Prado Link: https://lore.kernel.org/20240808-mtk-rsel-bias-disable-fix-v1-1-1b4e85bf596c@collabora.com Signed-off-by: Linus Walleij --- .../pinctrl/mediatek/pinctrl-mtk-common-v2.c | 55 ++++++++++--------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.c b/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.c index b7921b59eb7b15..54301fbba524af 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.c +++ b/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.c @@ -709,32 +709,35 @@ static int mtk_pinconf_bias_set_rsel(struct mtk_pinctrl *hw, { int err, rsel_val; - if (!pullup && arg == MTK_DISABLE) - return 0; - if (hw->rsel_si_unit) { /* find pin rsel_index from pin_rsel array*/ err = mtk_hw_pin_rsel_lookup(hw, desc, pullup, arg, &rsel_val); if (err) - goto out; + return err; } else { - if (arg < MTK_PULL_SET_RSEL_000 || - arg > MTK_PULL_SET_RSEL_111) { - err = -EINVAL; - goto out; - } + if (arg < MTK_PULL_SET_RSEL_000 || arg > MTK_PULL_SET_RSEL_111) + return -EINVAL; rsel_val = arg - MTK_PULL_SET_RSEL_000; } - err = mtk_hw_set_value(hw, desc, PINCTRL_PIN_REG_RSEL, rsel_val); - if (err) - goto out; + return mtk_hw_set_value(hw, desc, PINCTRL_PIN_REG_RSEL, rsel_val); +} - err = mtk_pinconf_bias_set_pu_pd(hw, desc, pullup, MTK_ENABLE); +static int mtk_pinconf_bias_set_pu_pd_rsel(struct mtk_pinctrl *hw, + const struct mtk_pin_desc *desc, + u32 pullup, u32 arg) +{ + u32 enable = arg == MTK_DISABLE ? MTK_DISABLE : MTK_ENABLE; + int err; -out: - return err; + if (arg != MTK_DISABLE) { + err = mtk_pinconf_bias_set_rsel(hw, desc, pullup, arg); + if (err) + return err; + } + + return mtk_pinconf_bias_set_pu_pd(hw, desc, pullup, enable); } int mtk_pinconf_bias_set_combo(struct mtk_pinctrl *hw, @@ -750,22 +753,22 @@ int mtk_pinconf_bias_set_combo(struct mtk_pinctrl *hw, try_all_type = MTK_PULL_TYPE_MASK; if (try_all_type & MTK_PULL_RSEL_TYPE) { - err = mtk_pinconf_bias_set_rsel(hw, desc, pullup, arg); + err = mtk_pinconf_bias_set_pu_pd_rsel(hw, desc, pullup, arg); if (!err) - return err; + return 0; } if (try_all_type & MTK_PULL_PU_PD_TYPE) { err = mtk_pinconf_bias_set_pu_pd(hw, desc, pullup, arg); if (!err) - return err; + return 0; } if (try_all_type & MTK_PULL_PULLSEL_TYPE) { err = mtk_pinconf_bias_set_pullsel_pullen(hw, desc, pullup, arg); if (!err) - return err; + return 0; } if (try_all_type & MTK_PULL_PUPD_R1R0_TYPE) @@ -803,9 +806,9 @@ static int mtk_rsel_get_si_unit(struct mtk_pinctrl *hw, return 0; } -static int mtk_pinconf_bias_get_rsel(struct mtk_pinctrl *hw, - const struct mtk_pin_desc *desc, - u32 *pullup, u32 *enable) +static int mtk_pinconf_bias_get_pu_pd_rsel(struct mtk_pinctrl *hw, + const struct mtk_pin_desc *desc, + u32 *pullup, u32 *enable) { int pu, pd, rsel, err; @@ -939,22 +942,22 @@ int mtk_pinconf_bias_get_combo(struct mtk_pinctrl *hw, try_all_type = MTK_PULL_TYPE_MASK; if (try_all_type & MTK_PULL_RSEL_TYPE) { - err = mtk_pinconf_bias_get_rsel(hw, desc, pullup, enable); + err = mtk_pinconf_bias_get_pu_pd_rsel(hw, desc, pullup, enable); if (!err) - return err; + return 0; } if (try_all_type & MTK_PULL_PU_PD_TYPE) { err = mtk_pinconf_bias_get_pu_pd(hw, desc, pullup, enable); if (!err) - return err; + return 0; } if (try_all_type & MTK_PULL_PULLSEL_TYPE) { err = mtk_pinconf_bias_get_pullsel_pullen(hw, desc, pullup, enable); if (!err) - return err; + return 0; } if (try_all_type & MTK_PULL_PUPD_R1R0_TYPE) From 9983a9cd4d429dc9ca01770083c4c1f366214b65 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 2 Jul 2024 12:15:14 -0500 Subject: [PATCH 679/991] cpufreq/amd-pstate-ut: Don't check for highest perf matching on prefcore If a system is using preferred cores the highest perf will be inconsistent as it can change from system events. Skip the checks for it. Fixes: e571a5e2068e ("cpufreq: amd-pstate: Update amd-pstate preferred core ranking dynamically") Reviewed-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate-ut.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c index 66b73c308ce672..b7318669485e4b 100644 --- a/drivers/cpufreq/amd-pstate-ut.c +++ b/drivers/cpufreq/amd-pstate-ut.c @@ -160,14 +160,17 @@ static void amd_pstate_ut_check_perf(u32 index) lowest_perf = AMD_CPPC_LOWEST_PERF(cap1); } - if ((highest_perf != READ_ONCE(cpudata->highest_perf)) || - (nominal_perf != READ_ONCE(cpudata->nominal_perf)) || + if (highest_perf != READ_ONCE(cpudata->highest_perf) && !cpudata->hw_prefcore) { + pr_err("%s cpu%d highest=%d %d highest perf doesn't match\n", + __func__, cpu, highest_perf, cpudata->highest_perf); + goto skip_test; + } + if ((nominal_perf != READ_ONCE(cpudata->nominal_perf)) || (lowest_nonlinear_perf != READ_ONCE(cpudata->lowest_nonlinear_perf)) || (lowest_perf != READ_ONCE(cpudata->lowest_perf))) { amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; - pr_err("%s cpu%d highest=%d %d nominal=%d %d lowest_nonlinear=%d %d lowest=%d %d, they should be equal!\n", - __func__, cpu, highest_perf, cpudata->highest_perf, - nominal_perf, cpudata->nominal_perf, + pr_err("%s cpu%d nominal=%d %d lowest_nonlinear=%d %d lowest=%d %d, they should be equal!\n", + __func__, cpu, nominal_perf, cpudata->nominal_perf, lowest_nonlinear_perf, cpudata->lowest_nonlinear_perf, lowest_perf, cpudata->lowest_perf); goto skip_test; From d3692d95cc4d88114b070ee63cffc976f00f207f Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Fri, 9 Aug 2024 02:22:04 +0200 Subject: [PATCH 680/991] pinctrl: qcom: x1e80100: Fix special pin offsets Remove the erroneus 0x100000 offset to prevent the boards from crashing on pin state setting, as well as for the intended state changes to take effect. Fixes: 05e4941d97ef ("pinctrl: qcom: Add X1E80100 pinctrl driver") Signed-off-by: Konrad Dybcio Reviewed-by: Abel Vesa Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/20240809-topic-h_sdc-v1-1-bb421532c531@quicinc.com Signed-off-by: Linus Walleij --- drivers/pinctrl/qcom/pinctrl-x1e80100.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/pinctrl/qcom/pinctrl-x1e80100.c b/drivers/pinctrl/qcom/pinctrl-x1e80100.c index 6cd4d10e6fd6fa..65ed933f05ce17 100644 --- a/drivers/pinctrl/qcom/pinctrl-x1e80100.c +++ b/drivers/pinctrl/qcom/pinctrl-x1e80100.c @@ -1805,10 +1805,10 @@ static const struct msm_pingroup x1e80100_groups[] = { [235] = PINGROUP(235, aon_cci, qdss_gpio, _, _, _, _, _, _, _), [236] = PINGROUP(236, aon_cci, qdss_gpio, _, _, _, _, _, _, _), [237] = PINGROUP(237, _, _, _, _, _, _, _, _, _), - [238] = UFS_RESET(ufs_reset, 0x1f9000), - [239] = SDC_QDSD_PINGROUP(sdc2_clk, 0x1f2000, 14, 6), - [240] = SDC_QDSD_PINGROUP(sdc2_cmd, 0x1f2000, 11, 3), - [241] = SDC_QDSD_PINGROUP(sdc2_data, 0x1f2000, 9, 0), + [238] = UFS_RESET(ufs_reset, 0xf9000), + [239] = SDC_QDSD_PINGROUP(sdc2_clk, 0xf2000, 14, 6), + [240] = SDC_QDSD_PINGROUP(sdc2_cmd, 0xf2000, 11, 3), + [241] = SDC_QDSD_PINGROUP(sdc2_data, 0xf2000, 9, 0), }; static const struct msm_gpio_wakeirq_map x1e80100_pdc_map[] = { From 639766ca10d1e218e257ae7eabe76814bae6ab89 Mon Sep 17 00:00:00 2001 From: Hal Feng Date: Mon, 12 Aug 2024 15:01:08 +0800 Subject: [PATCH 681/991] pinctrl: starfive: jh7110: Correct the level trigger configuration of iev register A mistake was made in level trigger register configuration. Correct it. Fixes: 447976ab62c5 ("pinctrl: starfive: Add StarFive JH7110 sys controller driver") Signed-off-by: Hal Feng Link: https://lore.kernel.org/20240812070108.100923-1-hal.feng@starfivetech.com Signed-off-by: Linus Walleij --- drivers/pinctrl/starfive/pinctrl-starfive-jh7110.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/starfive/pinctrl-starfive-jh7110.c b/drivers/pinctrl/starfive/pinctrl-starfive-jh7110.c index 4ce080caa2338e..1d0d6c224c104e 100644 --- a/drivers/pinctrl/starfive/pinctrl-starfive-jh7110.c +++ b/drivers/pinctrl/starfive/pinctrl-starfive-jh7110.c @@ -793,12 +793,12 @@ static int jh7110_irq_set_type(struct irq_data *d, unsigned int trigger) case IRQ_TYPE_LEVEL_HIGH: irq_type = 0; /* 0: level triggered */ edge_both = 0; /* 0: ignored */ - polarity = mask; /* 1: high level */ + polarity = 0; /* 0: high level */ break; case IRQ_TYPE_LEVEL_LOW: irq_type = 0; /* 0: level triggered */ edge_both = 0; /* 0: ignored */ - polarity = 0; /* 0: low level */ + polarity = mask; /* 1: low level */ break; default: return -EINVAL; From a204501e1743d695ca2930ed25a2be9f8ced96d3 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 23 Aug 2024 11:51:08 -0400 Subject: [PATCH 682/991] nfsd: prevent panic for nfsv4.0 closed files in nfs4_show_open Prior to commit 3f29cc82a84c ("nfsd: split sc_status out of sc_type") states_show() relied on sc_type field to be of valid type before calling into a subfunction to show content of a particular stateid. From that commit, we split the validity of the stateid into sc_status and no longer changed sc_type to 0 while unhashing the stateid. This resulted in kernel oopsing for nfsv4.0 opens that stay around and in nfs4_show_open() would derefence sc_file which was NULL. Instead, for closed open stateids forgo displaying information that relies of having a valid sc_file. To reproduce: mount the server with 4.0, read and close a file and then on the server cat /proc/fs/nfsd/clients/2/states [ 513.590804] Call trace: [ 513.590925] _raw_spin_lock+0xcc/0x160 [ 513.591119] nfs4_show_open+0x78/0x2c0 [nfsd] [ 513.591412] states_show+0x44c/0x488 [nfsd] [ 513.591681] seq_read_iter+0x5d8/0x760 [ 513.591896] seq_read+0x188/0x208 [ 513.592075] vfs_read+0x148/0x470 [ 513.592241] ksys_read+0xcc/0x178 Fixes: 3f29cc82a84c ("nfsd: split sc_status out of sc_type") Signed-off-by: Olga Kornievskaia Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a20c2c9d7d457f..dafff707e23a46 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2789,15 +2789,18 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) deny & NFS4_SHARE_ACCESS_READ ? "r" : "-", deny & NFS4_SHARE_ACCESS_WRITE ? "w" : "-"); - spin_lock(&nf->fi_lock); - file = find_any_file_locked(nf); - if (file) { - nfs4_show_superblock(s, file); - seq_puts(s, ", "); - nfs4_show_fname(s, file); - seq_puts(s, ", "); - } - spin_unlock(&nf->fi_lock); + if (nf) { + spin_lock(&nf->fi_lock); + file = find_any_file_locked(nf); + if (file) { + nfs4_show_superblock(s, file); + seq_puts(s, ", "); + nfs4_show_fname(s, file); + seq_puts(s, ", "); + } + spin_unlock(&nf->fi_lock); + } else + seq_puts(s, "closed, "); nfs4_show_owner(s, oo); if (st->sc_status & SC_STATUS_ADMIN_REVOKED) seq_puts(s, ", admin-revoked"); From 7eff3453cbd7e0bfc7524d59694119b5ca844778 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 5 Jul 2024 09:15:08 +0800 Subject: [PATCH 683/991] ovl: pass string to ovl_parse_layer() So it can be used for parsing the Opt_lowerdir. Signed-off-by: Zhihao Cheng Link: https://lore.kernel.org/r/20240705011510.794025-2-chengzhihao1@huawei.com Signed-off-by: Christian Brauner --- fs/overlayfs/params.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index 4860fcc4611bb7..52e3860973b7ee 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -365,10 +365,9 @@ static void ovl_add_layer(struct fs_context *fc, enum ovl_opt layer, } } -static int ovl_parse_layer(struct fs_context *fc, struct fs_parameter *param, - enum ovl_opt layer) +static int ovl_parse_layer(struct fs_context *fc, const char *layer_name, enum ovl_opt layer) { - char *name = kstrdup(param->string, GFP_KERNEL); + char *name = kstrdup(layer_name, GFP_KERNEL); bool upper = (layer == Opt_upperdir || layer == Opt_workdir); struct path path; int err; @@ -582,7 +581,7 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_datadir_add: case Opt_upperdir: case Opt_workdir: - err = ovl_parse_layer(fc, param, opt); + err = ovl_parse_layer(fc, param->string, opt); break; case Opt_default_permissions: config->default_permissions = true; From ca76ac36bb6068866feca185045e7edf2a8f392f Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Fri, 5 Jul 2024 09:15:09 +0800 Subject: [PATCH 684/991] ovl: fix wrong lowerdir number check for parameter Opt_lowerdir The max count of lowerdir is OVL_MAX_STACK[500], which is broken by commit 37f32f526438("ovl: fix memory leak in ovl_parse_param()") for parameter Opt_lowerdir. Since commit 819829f0319a("ovl: refactor layer parsing helpers") and commit 24e16e385f22("ovl: add support for appending lowerdirs one by one") added check ovl_mount_dir_check() in function ovl_parse_param_lowerdir(), the 'ctx->nr' should be smaller than OVL_MAX_STACK, after commit 37f32f526438("ovl: fix memory leak in ovl_parse_param()") is applied, the 'ctx->nr' is updated before the check ovl_mount_dir_check(), which leads the max count of lowerdir to become 499 for parameter Opt_lowerdir. Fix it by replacing lower layers parsing code with the existing helper function ovl_parse_layer(). Fixes: 37f32f526438 ("ovl: fix memory leak in ovl_parse_param()") Signed-off-by: Zhihao Cheng Link: https://lore.kernel.org/r/20240705011510.794025-3-chengzhihao1@huawei.com Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/overlayfs/params.c | 40 +++++++--------------------------------- 1 file changed, 7 insertions(+), 33 deletions(-) diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index 52e3860973b7ee..8dd834c7f291c9 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -353,6 +353,8 @@ static void ovl_add_layer(struct fs_context *fc, enum ovl_opt layer, case Opt_datadir_add: ctx->nr_data++; fallthrough; + case Opt_lowerdir: + fallthrough; case Opt_lowerdir_add: WARN_ON(ctx->nr >= ctx->capacity); l = &ctx->lower[ctx->nr++]; @@ -375,7 +377,7 @@ static int ovl_parse_layer(struct fs_context *fc, const char *layer_name, enum o if (!name) return -ENOMEM; - if (upper) + if (upper || layer == Opt_lowerdir) err = ovl_mount_dir(name, &path); else err = ovl_mount_dir_noesc(name, &path); @@ -431,7 +433,6 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) { int err; struct ovl_fs_context *ctx = fc->fs_private; - struct ovl_fs_context_layer *l; char *dup = NULL, *iter; ssize_t nr_lower, nr; bool data_layer = false; @@ -471,35 +472,11 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) goto out_err; } - if (nr_lower > ctx->capacity) { - err = -ENOMEM; - l = krealloc_array(ctx->lower, nr_lower, sizeof(*ctx->lower), - GFP_KERNEL_ACCOUNT); - if (!l) - goto out_err; - - ctx->lower = l; - ctx->capacity = nr_lower; - } - iter = dup; - l = ctx->lower; - for (nr = 0; nr < nr_lower; nr++, l++) { - ctx->nr++; - memset(l, 0, sizeof(*l)); - - err = ovl_mount_dir(iter, &l->path); + for (nr = 0; nr < nr_lower; nr++) { + err = ovl_parse_layer(fc, iter, Opt_lowerdir); if (err) - goto out_put; - - err = ovl_mount_dir_check(fc, &l->path, Opt_lowerdir, iter, false); - if (err) - goto out_put; - - err = -ENOMEM; - l->name = kstrdup(iter, GFP_KERNEL_ACCOUNT); - if (!l->name) - goto out_put; + goto out_err; if (data_layer) ctx->nr_data++; @@ -517,7 +494,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) */ if (ctx->nr_data > 0) { pr_err("regular lower layers cannot follow data lower layers"); - goto out_put; + goto out_err; } data_layer = false; @@ -531,9 +508,6 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) kfree(dup); return 0; -out_put: - ovl_reset_lowerdirs(ctx); - out_err: kfree(dup); From 441e36ef5b347d9ab4f54f7b54853266be687556 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Fri, 5 Jul 2024 09:15:10 +0800 Subject: [PATCH 685/991] ovl: ovl_parse_param_lowerdir: Add missed '\n' for pr_err Add '\n' for pr_err in function ovl_parse_param_lowerdir(), which ensures that error message is displayed at once. Fixes: b36a5780cb44 ("ovl: modify layer parameter parsing") Signed-off-by: Zhihao Cheng Link: https://lore.kernel.org/r/20240705011510.794025-4-chengzhihao1@huawei.com Signed-off-by: Christian Brauner --- fs/overlayfs/params.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index 8dd834c7f291c9..d0568c09134126 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -449,7 +449,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) return 0; if (*name == ':') { - pr_err("cannot append lower layer"); + pr_err("cannot append lower layer\n"); return -EINVAL; } @@ -493,7 +493,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) * there are no data layers. */ if (ctx->nr_data > 0) { - pr_err("regular lower layers cannot follow data lower layers"); + pr_err("regular lower layers cannot follow data lower layers\n"); goto out_err; } From eb9e749c0182affafadfbe5ded4503c4b5a9b57c Mon Sep 17 00:00:00 2001 From: Kiran K Date: Thu, 18 Jul 2024 20:18:04 +0530 Subject: [PATCH 686/991] Bluetooth: btintel: Allow configuring drive strength of BRI BRI (Bluetooth Radio Interface) traffic from CNVr to CNVi was found causing cross talk step errors to WiFi. To avoid this potential issue OEM platforms can replace BRI resistor to adjust the BRI response line drive strength. During the *setup*, driver reads the drive strength value from uefi variable and passes it to the controller via vendor specific command with opcode 0xfc0a. dmesg: .. [21.982720] Bluetooth: hci0: Bootloader timestamp 2023.33 buildtype 1 build 45995 [21.984250] Bluetooth: hci0: Found device firmware: intel/ibt-0190-0291-iml.sfi [21.984255] Bluetooth: hci0: Boot Address: 0x30099000 [21.984256] Bluetooth: hci0: Firmware Version: 160-24.24 [22.011501] Bluetooth: hci0: Waiting for firmware download to complete [22.011518] Bluetooth: hci0: Firmware loaded in 26624 usecs [22.011584] Bluetooth: hci0: Waiting for device to boot [22.013546] Bluetooth: hci0: Malformed MSFT vendor event: 0x02 [22.013552] Bluetooth: hci0: Device booted in 1967 usecs ... [22.013792] Bluetooth: hci0: dsbr: enable: 0x01 value: 0x0b ... [22.015027] Bluetooth: hci0: Found device firmware: intel/ibt-0190-0291.sfi [22.015041] Bluetooth: hci0: Boot Address: 0x10000800 [22.015043] Bluetooth: hci0: Firmware Version: 160-24.24 [22.395821] Bluetooth: BNEP (Ethernet Emulation) ver 1.3 [22.395828] Bluetooth: BNEP filters: protocol multicast ... Signed-off-by: Kiran K Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btintel.c | 124 ++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) diff --git a/drivers/bluetooth/btintel.c b/drivers/bluetooth/btintel.c index 7d5e4de64e3cec..1ccbb515751533 100644 --- a/drivers/bluetooth/btintel.c +++ b/drivers/bluetooth/btintel.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -26,6 +27,8 @@ #define ECDSA_OFFSET 644 #define ECDSA_HEADER_LEN 320 +#define BTINTEL_EFI_DSBR L"UefiCnvCommonDSBR" + enum { DSM_SET_WDISABLE2_DELAY = 1, DSM_SET_RESET_METHOD = 3, @@ -2616,6 +2619,120 @@ static u8 btintel_classify_pkt_type(struct hci_dev *hdev, struct sk_buff *skb) return hci_skb_pkt_type(skb); } +/* + * UefiCnvCommonDSBR UEFI variable provides information from the OEM platforms + * if they have replaced the BRI (Bluetooth Radio Interface) resistor to + * overcome the potential STEP errors on their designs. Based on the + * configauration, bluetooth firmware shall adjust the BRI response line drive + * strength. The below structure represents DSBR data. + * struct { + * u8 header; + * u32 dsbr; + * } __packed; + * + * header - defines revision number of the structure + * dsbr - defines drive strength BRI response + * bit0 + * 0 - instructs bluetooth firmware to use default values + * 1 - instructs bluetooth firmware to override default values + * bit3:1 + * Reserved + * bit7:4 + * DSBR override values (only if bit0 is set. Default value is 0xF + * bit31:7 + * Reserved + * Expected values for dsbr field: + * 1. 0xF1 - indicates that the resistor on board is 33 Ohm + * 2. 0x00 or 0xB1 - indicates that the resistor on board is 10 Ohm + * 3. Non existing UEFI variable or invalid (none of the above) - indicates + * that the resistor on board is 10 Ohm + * Even if uefi variable is not present, driver shall send 0xfc0a command to + * firmware to use default values. + * + */ +static int btintel_uefi_get_dsbr(u32 *dsbr_var) +{ + struct btintel_dsbr { + u8 header; + u32 dsbr; + } __packed data; + + efi_status_t status; + unsigned long data_size = 0; + efi_guid_t guid = EFI_GUID(0xe65d8884, 0xd4af, 0x4b20, 0x8d, 0x03, + 0x77, 0x2e, 0xcc, 0x3d, 0xa5, 0x31); + + if (!IS_ENABLED(CONFIG_EFI)) + return -EOPNOTSUPP; + + if (!efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE)) + return -EOPNOTSUPP; + + status = efi.get_variable(BTINTEL_EFI_DSBR, &guid, NULL, &data_size, + NULL); + + if (status != EFI_BUFFER_TOO_SMALL || !data_size) + return -EIO; + + status = efi.get_variable(BTINTEL_EFI_DSBR, &guid, NULL, &data_size, + &data); + + if (status != EFI_SUCCESS) + return -ENXIO; + + *dsbr_var = data.dsbr; + return 0; +} + +static int btintel_set_dsbr(struct hci_dev *hdev, struct intel_version_tlv *ver) +{ + struct btintel_dsbr_cmd { + u8 enable; + u8 dsbr; + } __packed; + + struct btintel_dsbr_cmd cmd; + struct sk_buff *skb; + u8 status; + u32 dsbr; + bool apply_dsbr; + int err; + + /* DSBR command needs to be sent for BlazarI + B0 step product after + * downloading IML image. + */ + apply_dsbr = (ver->img_type == BTINTEL_IMG_IML && + ((ver->cnvi_top & 0xfff) == BTINTEL_CNVI_BLAZARI) && + INTEL_CNVX_TOP_STEP(ver->cnvi_top) == 0x01); + + if (!apply_dsbr) + return 0; + + dsbr = 0; + err = btintel_uefi_get_dsbr(&dsbr); + if (err < 0) + bt_dev_dbg(hdev, "Error reading efi: %ls (%d)", + BTINTEL_EFI_DSBR, err); + + cmd.enable = dsbr & BIT(0); + cmd.dsbr = dsbr >> 4 & 0xF; + + bt_dev_info(hdev, "dsbr: enable: 0x%2.2x value: 0x%2.2x", cmd.enable, + cmd.dsbr); + + skb = __hci_cmd_sync(hdev, 0xfc0a, sizeof(cmd), &cmd, HCI_CMD_TIMEOUT); + if (IS_ERR(skb)) + return -bt_to_errno(PTR_ERR(skb)); + + status = skb->data[0]; + kfree_skb(skb); + + if (status) + return -bt_to_errno(status); + + return 0; +} + int btintel_bootloader_setup_tlv(struct hci_dev *hdev, struct intel_version_tlv *ver) { @@ -2650,6 +2767,13 @@ int btintel_bootloader_setup_tlv(struct hci_dev *hdev, if (err) return err; + /* set drive strength of BRI response */ + err = btintel_set_dsbr(hdev, ver); + if (err) { + bt_dev_err(hdev, "Failed to send dsbr command (%d)", err); + return err; + } + /* If image type returned is BTINTEL_IMG_IML, then controller supports * intermediate loader image */ From 35237475384ab3622f63c3c09bdf6af6dacfe9c3 Mon Sep 17 00:00:00 2001 From: Neeraj Sanjay Kale Date: Fri, 16 Aug 2024 15:51:13 +0530 Subject: [PATCH 687/991] Bluetooth: btnxpuart: Fix random crash seen while removing driver This fixes the random kernel crash seen while removing the driver, when running the load/unload test over multiple iterations. 1) modprobe btnxpuart 2) hciconfig hci0 reset 3) hciconfig (check hci0 interface up with valid BD address) 4) modprobe -r btnxpuart Repeat steps 1 to 4 The ps_wakeup() call in btnxpuart_close() schedules the psdata->work(), which gets scheduled after module is removed, causing a kernel crash. This hidden issue got highlighted after enabling Power Save by default in 4183a7be7700 (Bluetooth: btnxpuart: Enable Power Save feature on startup) The new ps_cleanup() deasserts UART break immediately while closing serdev device, cancels any scheduled ps_work and destroys the ps_lock mutex. [ 85.884604] Unable to handle kernel paging request at virtual address ffffd4a61638f258 [ 85.884624] Mem abort info: [ 85.884625] ESR = 0x0000000086000007 [ 85.884628] EC = 0x21: IABT (current EL), IL = 32 bits [ 85.884633] SET = 0, FnV = 0 [ 85.884636] EA = 0, S1PTW = 0 [ 85.884638] FSC = 0x07: level 3 translation fault [ 85.884642] swapper pgtable: 4k pages, 48-bit VAs, pgdp=0000000041dd0000 [ 85.884646] [ffffd4a61638f258] pgd=1000000095fff003, p4d=1000000095fff003, pud=100000004823d003, pmd=100000004823e003, pte=0000000000000000 [ 85.884662] Internal error: Oops: 0000000086000007 [#1] PREEMPT SMP [ 85.890932] Modules linked in: algif_hash algif_skcipher af_alg overlay fsl_jr_uio caam_jr caamkeyblob_desc caamhash_desc caamalg_desc crypto_engine authenc libdes crct10dif_ce polyval_ce polyval_generic snd_soc_imx_spdif snd_soc_imx_card snd_soc_ak5558 snd_soc_ak4458 caam secvio error snd_soc_fsl_spdif snd_soc_fsl_micfil snd_soc_fsl_sai snd_soc_fsl_utils gpio_ir_recv rc_core fuse [last unloaded: btnxpuart(O)] [ 85.927297] CPU: 1 PID: 67 Comm: kworker/1:3 Tainted: G O 6.1.36+g937b1be4345a #1 [ 85.936176] Hardware name: FSL i.MX8MM EVK board (DT) [ 85.936182] Workqueue: events 0xffffd4a61638f380 [ 85.936198] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 85.952817] pc : 0xffffd4a61638f258 [ 85.952823] lr : 0xffffd4a61638f258 [ 85.952827] sp : ffff8000084fbd70 [ 85.952829] x29: ffff8000084fbd70 x28: 0000000000000000 x27: 0000000000000000 [ 85.963112] x26: ffffd4a69133f000 x25: ffff4bf1c8540990 x24: ffff4bf215b87305 [ 85.963119] x23: ffff4bf215b87300 x22: ffff4bf1c85409d0 x21: ffff4bf1c8540970 [ 85.977382] x20: 0000000000000000 x19: ffff4bf1c8540880 x18: 0000000000000000 [ 85.977391] x17: 0000000000000000 x16: 0000000000000133 x15: 0000ffffe2217090 [ 85.977399] x14: 0000000000000001 x13: 0000000000000133 x12: 0000000000000139 [ 85.977407] x11: 0000000000000001 x10: 0000000000000a60 x9 : ffff8000084fbc50 [ 85.977417] x8 : ffff4bf215b7d000 x7 : ffff4bf215b83b40 x6 : 00000000000003e8 [ 85.977424] x5 : 00000000410fd030 x4 : 0000000000000000 x3 : 0000000000000000 [ 85.977432] x2 : 0000000000000000 x1 : ffff4bf1c4265880 x0 : 0000000000000000 [ 85.977443] Call trace: [ 85.977446] 0xffffd4a61638f258 [ 85.977451] 0xffffd4a61638f3e8 [ 85.977455] process_one_work+0x1d4/0x330 [ 85.977464] worker_thread+0x6c/0x430 [ 85.977471] kthread+0x108/0x10c [ 85.977476] ret_from_fork+0x10/0x20 [ 85.977488] Code: bad PC value [ 85.977491] ---[ end trace 0000000000000000 ]--- Preset since v6.9.11 Fixes: 86d55f124b52 ("Bluetooth: btnxpuart: Deasset UART break before closing serdev device") Signed-off-by: Neeraj Sanjay Kale Reviewed-by: Paul Menzel Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btnxpuart.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/bluetooth/btnxpuart.c b/drivers/bluetooth/btnxpuart.c index 31d3dd90b6720b..ad1ec6f3685a76 100644 --- a/drivers/bluetooth/btnxpuart.c +++ b/drivers/bluetooth/btnxpuart.c @@ -449,6 +449,23 @@ static bool ps_wakeup(struct btnxpuart_dev *nxpdev) return false; } +static void ps_cleanup(struct btnxpuart_dev *nxpdev) +{ + struct ps_data *psdata = &nxpdev->psdata; + u8 ps_state; + + mutex_lock(&psdata->ps_lock); + ps_state = psdata->ps_state; + mutex_unlock(&psdata->ps_lock); + + if (ps_state != PS_STATE_AWAKE) + ps_control(psdata->hdev, PS_STATE_AWAKE); + + ps_cancel_timer(nxpdev); + cancel_work_sync(&psdata->work); + mutex_destroy(&psdata->ps_lock); +} + static int send_ps_cmd(struct hci_dev *hdev, void *data) { struct btnxpuart_dev *nxpdev = hci_get_drvdata(hdev); @@ -1363,7 +1380,6 @@ static int btnxpuart_close(struct hci_dev *hdev) { struct btnxpuart_dev *nxpdev = hci_get_drvdata(hdev); - ps_wakeup(nxpdev); serdev_device_close(nxpdev->serdev); skb_queue_purge(&nxpdev->txq); if (!IS_ERR_OR_NULL(nxpdev->rx_skb)) { @@ -1516,8 +1532,8 @@ static void nxp_serdev_remove(struct serdev_device *serdev) nxpdev->new_baudrate = nxpdev->fw_init_baudrate; nxp_set_baudrate_cmd(hdev, NULL); } - ps_cancel_timer(nxpdev); } + ps_cleanup(nxpdev); hci_unregister_dev(hdev); hci_free_dev(hdev); } From 18b3256db76bd1130965acd99fbd38f87c3e6950 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 21 Aug 2024 14:41:52 -0400 Subject: [PATCH 688/991] Bluetooth: hci_core: Fix not handling hibernation actions This fixes not handling hibernation actions on suspend notifier so they are treated in the same way as regular suspend actions. Fixes: 9952d90ea288 ("Bluetooth: Handle PM_SUSPEND_PREPARE and PM_POST_SUSPEND") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index f25a21f532aa75..d6976db02c06c7 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2406,10 +2406,16 @@ static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action, /* To avoid a potential race with hci_unregister_dev. */ hci_dev_hold(hdev); - if (action == PM_SUSPEND_PREPARE) + switch (action) { + case PM_HIBERNATION_PREPARE: + case PM_SUSPEND_PREPARE: ret = hci_suspend_dev(hdev); - else if (action == PM_POST_SUSPEND) + break; + case PM_POST_HIBERNATION: + case PM_POST_SUSPEND: ret = hci_resume_dev(hdev); + break; + } if (ret) bt_dev_err(hdev, "Suspend notifier action (%lu) failed: %d", From 740f2e2791b98e47288b3814c83a3f566518fed2 Mon Sep 17 00:00:00 2001 From: Pawel Laszczak Date: Wed, 21 Aug 2024 06:07:42 +0000 Subject: [PATCH 689/991] usb: cdnsp: fix for Link TRB with TC Stop Endpoint command on LINK TRB with TC bit set to 1 causes that internal cycle bit can have incorrect state after command complete. In consequence empty transfer ring can be incorrectly detected when EP is resumed. NOP TRB before LINK TRB avoid such scenario. Stop Endpoint command is then on NOP TRB and internal cycle bit is not changed and have correct value. Fixes: 3d82904559f4 ("usb: cdnsp: cdns3 Add main part of Cadence USBSSP DRD Driver") cc: Signed-off-by: Pawel Laszczak Reviewed-by: Peter Chen Link: https://lore.kernel.org/r/PH7PR07MB953878279F375CCCE6C6F40FDD8E2@PH7PR07MB9538.namprd07.prod.outlook.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/cdns3/cdnsp-gadget.h | 3 +++ drivers/usb/cdns3/cdnsp-ring.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/drivers/usb/cdns3/cdnsp-gadget.h b/drivers/usb/cdns3/cdnsp-gadget.h index dbee6f08527773..84887dfea7635b 100644 --- a/drivers/usb/cdns3/cdnsp-gadget.h +++ b/drivers/usb/cdns3/cdnsp-gadget.h @@ -811,6 +811,7 @@ struct cdnsp_stream_info { * generate Missed Service Error Event. * Set skip flag when receive a Missed Service Error Event and * process the missed tds on the endpoint ring. + * @wa1_nop_trb: hold pointer to NOP trb. */ struct cdnsp_ep { struct usb_ep endpoint; @@ -838,6 +839,8 @@ struct cdnsp_ep { #define EP_UNCONFIGURED BIT(7) bool skip; + union cdnsp_trb *wa1_nop_trb; + }; /** diff --git a/drivers/usb/cdns3/cdnsp-ring.c b/drivers/usb/cdns3/cdnsp-ring.c index a60c0cb991cd1b..dbd83d321bca01 100644 --- a/drivers/usb/cdns3/cdnsp-ring.c +++ b/drivers/usb/cdns3/cdnsp-ring.c @@ -1904,6 +1904,23 @@ int cdnsp_queue_bulk_tx(struct cdnsp_device *pdev, struct cdnsp_request *preq) if (ret) return ret; + /* + * workaround 1: STOP EP command on LINK TRB with TC bit set to 1 + * causes that internal cycle bit can have incorrect state after + * command complete. In consequence empty transfer ring can be + * incorrectly detected when EP is resumed. + * NOP TRB before LINK TRB avoid such scenario. STOP EP command is + * then on NOP TRB and internal cycle bit is not changed and have + * correct value. + */ + if (pep->wa1_nop_trb) { + field = le32_to_cpu(pep->wa1_nop_trb->trans_event.flags); + field ^= TRB_CYCLE; + + pep->wa1_nop_trb->trans_event.flags = cpu_to_le32(field); + pep->wa1_nop_trb = NULL; + } + /* * Don't give the first TRB to the hardware (by toggling the cycle bit) * until we've finished creating all the other TRBs. The ring's cycle @@ -1999,6 +2016,17 @@ int cdnsp_queue_bulk_tx(struct cdnsp_device *pdev, struct cdnsp_request *preq) send_addr = addr; } + if (cdnsp_trb_is_link(ring->enqueue + 1)) { + field = TRB_TYPE(TRB_TR_NOOP) | TRB_IOC; + if (!ring->cycle_state) + field |= TRB_CYCLE; + + pep->wa1_nop_trb = ring->enqueue; + + cdnsp_queue_trb(pdev, ring, 0, 0x0, 0x0, + TRB_INTR_TARGET(0), field); + } + cdnsp_check_trb_math(preq, enqd_len); ret = cdnsp_giveback_first_trb(pdev, pep, preq->request.stream_id, start_cycle, start_trb); From 0aa2e1b2fb7a75aa4b5b4347055ccfea6f091769 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Aug 2024 21:08:09 +0100 Subject: [PATCH 690/991] mm: Fix missing folio invalidation calls during truncation When AS_RELEASE_ALWAYS is set on a mapping, the ->release_folio() and ->invalidate_folio() calls should be invoked even if PG_private and PG_private_2 aren't set. This is used by netfslib to keep track of the point above which reads can be skipped in favour of just zeroing pagecache locally. There are a couple of places in truncation in which invalidation is only called when folio_has_private() is true. Fix these to check folio_needs_release() instead. Without this, the generic/075 and generic/112 xfstests (both fsx-based tests) fail with minimum folio size patches applied[1]. Fixes: b4fa966f03b7 ("mm, netfs, fscache: stop read optimisation when folio removed from pagecache") Signed-off-by: David Howells Link: https://lore.kernel.org/r/20240815090849.972355-1-kernel@pankajraghav.com/ [1] Link: https://lore.kernel.org/r/20240823200819.532106-2-dhowells@redhat.com Reviewed-by: Matthew Wilcox (Oracle) cc: Matthew Wilcox (Oracle) cc: Pankaj Raghav cc: Jeff Layton cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: netfs@lists.linux.dev cc: linux-mm@kvack.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- mm/truncate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/truncate.c b/mm/truncate.c index 4d61fbdd4b2f2e..0668cd340a4630 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -157,7 +157,7 @@ static void truncate_cleanup_folio(struct folio *folio) if (folio_mapped(folio)) unmap_mapping_folio(folio); - if (folio_has_private(folio)) + if (folio_needs_release(folio)) folio_invalidate(folio, 0, folio_size(folio)); /* @@ -219,7 +219,7 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) if (!mapping_inaccessible(folio->mapping)) folio_zero_range(folio, offset, length); - if (folio_has_private(folio)) + if (folio_needs_release(folio)) folio_invalidate(folio, offset, length); if (!folio_test_large(folio)) return true; From a74ee0e878e262c0276966528d72d4e887174410 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Aug 2024 21:08:10 +0100 Subject: [PATCH 691/991] afs: Fix post-setattr file edit to do truncation correctly At the end of an kAFS RPC operation, there is an "edit" phase (originally intended for post-directory modification ops to edit the local image) that the setattr VFS op uses to fix up the pagecache if the RPC that requested truncation of a file was successful. afs_setattr_edit_file() calls truncate_setsize() which sets i_size, expands the pagecache if needed and truncates the pagecache. The first two of those, however, are redundant as they've already been done by afs_setattr_success() under the io_lock and the first is also done under the callback lock (cb_lock). Fix afs_setattr_edit_file() to call truncate_pagecache() instead (which is called by truncate_setsize(), thereby skipping the redundant parts. Fixes: 100ccd18bb41 ("netfs: Optimise away reads above the point at which there can be no data") Signed-off-by: David Howells Link: https://lore.kernel.org/r/20240823200819.532106-3-dhowells@redhat.com cc: Matthew Wilcox (Oracle) cc: Pankaj Raghav cc: Jeff Layton cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: netfs@lists.linux.dev cc: linux-mm@kvack.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/afs/inode.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 3acf5e0500728b..a95e77670b4945 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -695,13 +695,18 @@ static void afs_setattr_edit_file(struct afs_operation *op) { struct afs_vnode_param *vp = &op->file[0]; struct afs_vnode *vnode = vp->vnode; + struct inode *inode = &vnode->netfs.inode; if (op->setattr.attr->ia_valid & ATTR_SIZE) { loff_t size = op->setattr.attr->ia_size; - loff_t i_size = op->setattr.old_i_size; + loff_t old = op->setattr.old_i_size; + + /* Note: inode->i_size was updated by afs_apply_status() inside + * the I/O and callback locks. + */ - if (size != i_size) { - truncate_setsize(&vnode->netfs.inode, size); + if (size != old) { + truncate_pagecache(inode, size); netfs_resize_file(&vnode->netfs, size, true); fscache_resize_cookie(afs_vnode_cache(vnode), size); } From 7dfc8f0c6144c290dbeb01835a67e81b34dda8cd Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Aug 2024 21:08:11 +0100 Subject: [PATCH 692/991] netfs: Fix netfs_release_folio() to say no if folio dirty Fix netfs_release_folio() to say no (ie. return false) if the folio is dirty (analogous with iomap's behaviour). Without this, it will say yes to the release of a dirty page by split_huge_page_to_list_to_order(), which will result in the loss of untruncated data in the folio. Without this, the generic/075 and generic/112 xfstests (both fsx-based tests) fail with minimum folio size patches applied[1]. Fixes: c1ec4d7c2e13 ("netfs: Provide invalidate_folio and release_folio calls") Signed-off-by: David Howells Link: https://lore.kernel.org/r/20240815090849.972355-1-kernel@pankajraghav.com/ [1] Link: https://lore.kernel.org/r/20240823200819.532106-4-dhowells@redhat.com cc: Matthew Wilcox (Oracle) cc: Pankaj Raghav cc: Jeff Layton cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: netfs@lists.linux.dev cc: linux-mm@kvack.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/misc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 554a1a4615adee..69324761fcf7c0 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -161,6 +161,9 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp) struct netfs_inode *ctx = netfs_inode(folio_inode(folio)); unsigned long long end; + if (folio_test_dirty(folio)) + return false; + end = folio_pos(folio) + folio_size(folio); if (end > ctx->zero_point) ctx->zero_point = end; From cce6bfa6ca0e30af9927b0074c97fe6a92f28092 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Aug 2024 21:08:12 +0100 Subject: [PATCH 693/991] netfs: Fix trimming of streaming-write folios in netfs_inval_folio() When netfslib writes to a folio that it doesn't have data for, but that data exists on the server, it will make a 'streaming write' whereby it stores data in a folio that is marked dirty, but not uptodate. When it does this, it attaches a record to folio->private to track the dirty region. When truncate() or fallocate() wants to invalidate part of such a folio, it will call into ->invalidate_folio(), specifying the part of the folio that is to be invalidated. netfs_invalidate_folio(), on behalf of the filesystem, must then determine how to trim the streaming write record. In a couple of cases, however, it does this incorrectly (the reduce-length and move-start cases are switched over and don't, in any case, calculate the value correctly). Fix this by making the logic tree more obvious and fixing the cases. Fixes: 9ebff83e6481 ("netfs: Prep to use folio->private for write grouping and streaming write") Signed-off-by: David Howells Link: https://lore.kernel.org/r/20240823200819.532106-5-dhowells@redhat.com cc: Matthew Wilcox (Oracle) cc: Pankaj Raghav cc: Jeff Layton cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: netfs@lists.linux.dev cc: linux-mm@kvack.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/misc.c | 50 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 69324761fcf7c0..c1f321cf599993 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -97,10 +97,20 @@ EXPORT_SYMBOL(netfs_clear_inode_writeback); void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct netfs_folio *finfo; + struct netfs_inode *ctx = netfs_inode(folio_inode(folio)); size_t flen = folio_size(folio); _enter("{%lx},%zx,%zx", folio->index, offset, length); + if (offset == 0 && length == flen) { + unsigned long long i_size = i_size_read(&ctx->inode); + unsigned long long fpos = folio_pos(folio), end; + + end = umin(fpos + flen, i_size); + if (fpos < i_size && end > ctx->zero_point) + ctx->zero_point = end; + } + folio_wait_private_2(folio); /* [DEPRECATED] */ if (!folio_test_private(folio)) @@ -115,18 +125,34 @@ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) /* We have a partially uptodate page from a streaming write. */ unsigned int fstart = finfo->dirty_offset; unsigned int fend = fstart + finfo->dirty_len; - unsigned int end = offset + length; + unsigned int iend = offset + length; if (offset >= fend) return; - if (end <= fstart) + if (iend <= fstart) + return; + + /* The invalidation region overlaps the data. If the region + * covers the start of the data, we either move along the start + * or just erase the data entirely. + */ + if (offset <= fstart) { + if (iend >= fend) + goto erase_completely; + /* Move the start of the data. */ + finfo->dirty_len = fend - iend; + finfo->dirty_offset = offset; + return; + } + + /* Reduce the length of the data if the invalidation region + * covers the tail part. + */ + if (iend >= fend) { + finfo->dirty_len = offset - fstart; return; - if (offset <= fstart && end >= fend) - goto erase_completely; - if (offset <= fstart && end > fstart) - goto reduce_len; - if (offset > fstart && end >= fend) - goto move_start; + } + /* A partial write was split. The caller has already zeroed * it, so just absorb the hole. */ @@ -139,12 +165,6 @@ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) folio_clear_uptodate(folio); kfree(finfo); return; -reduce_len: - finfo->dirty_len = offset + length - finfo->dirty_offset; - return; -move_start: - finfo->dirty_len -= offset - finfo->dirty_offset; - finfo->dirty_offset = offset; } EXPORT_SYMBOL(netfs_invalidate_folio); @@ -164,7 +184,7 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp) if (folio_test_dirty(folio)) return false; - end = folio_pos(folio) + folio_size(folio); + end = umin(folio_pos(folio) + folio_size(folio), i_size_read(&ctx->inode)); if (end > ctx->zero_point) ctx->zero_point = end; From 950b03d0f664a54389a555d79215348ed413161f Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Aug 2024 21:08:13 +0100 Subject: [PATCH 694/991] netfs: Fix missing iterator reset on retry of short read Fix netfs_rreq_perform_resubmissions() to reset before retrying a short read, otherwise the wrong part of the output buffer will be used. Fixes: 92b6cc5d1e7c ("netfs: Add iov_iters to (sub)requests to describe various buffers") Signed-off-by: David Howells Link: https://lore.kernel.org/r/20240823200819.532106-6-dhowells@redhat.com cc: Steve French cc: Paulo Alcantara cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/io.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/netfs/io.c b/fs/netfs/io.c index 5367caf3fa2863..4da0a494e860f1 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -313,6 +313,7 @@ static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq) netfs_reset_subreq_iter(rreq, subreq); netfs_read_from_server(rreq, subreq); } else if (test_bit(NETFS_SREQ_SHORT_IO, &subreq->flags)) { + netfs_reset_subreq_iter(rreq, subreq); netfs_rreq_short_read(rreq, subreq); } } From e00e99ba6c6b8e5239e75cd6684a6827d93c39a2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 24 Aug 2024 12:56:53 +0100 Subject: [PATCH 695/991] netfs: Fix interaction of streaming writes with zero-point tracker When a folio that is marked for streaming write (dirty, but not uptodate, with partial content specified in the private data) is written back, the folio is effectively switched to the blank state upon completion of the write. This means that if we want to read it in future, we need to reread the whole folio. However, if the folio is above the zero_point position, when it is read back, it will just be cleared and the read skipped, leading to apparent local corruption. Fix this by increasing the zero_point to the end of the dirty data in the folio when clearing the folio state after writeback. This is analogous to the folio having ->release_folio() called upon it. This was causing the config.log generated by configuring a cpython tree on a cifs share to get corrupted because the scripts involved were appending text to the file in small pieces. Fixes: 288ace2f57c9 ("netfs: New writeback implementation") Signed-off-by: David Howells Link: https://lore.kernel.org/r/563286.1724500613@warthog.procyon.org.uk cc: Steve French cc: Paulo Alcantara cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/write_collect.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 426cf87aaf2ecd..ae7a2043f67036 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -33,6 +33,7 @@ int netfs_folio_written_back(struct folio *folio) { enum netfs_folio_trace why = netfs_folio_trace_clear; + struct netfs_inode *ictx = netfs_inode(folio->mapping->host); struct netfs_folio *finfo; struct netfs_group *group = NULL; int gcount = 0; @@ -41,6 +42,12 @@ int netfs_folio_written_back(struct folio *folio) /* Streaming writes cannot be redirtied whilst under writeback, * so discard the streaming record. */ + unsigned long long fend; + + fend = folio_pos(folio) + finfo->dirty_offset + finfo->dirty_len; + if (fend > ictx->zero_point) + ictx->zero_point = fend; + folio_detach_private(folio); group = finfo->netfs_group; gcount++; From d3204616a67e53fdcad14c7026869330fb382fd4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 Aug 2024 17:38:41 -0400 Subject: [PATCH 696/991] bcachefs: Fix failure to flush moves before sleeping in copygc This fixes an apparent deadlock - rebalance would get stuck trying to take nocow locks because they weren't being released by copygc. Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index deef4f024d20bd..d86565bf07c8c5 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -383,7 +383,7 @@ static int bch2_copygc_thread(void *arg) if (min_member_capacity == U64_MAX) min_member_capacity = 128 * 2048; - bch2_trans_unlock_long(ctxt.trans); + move_buckets_wait(&ctxt, buckets, true); bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6), MAX_SCHEDULE_TIMEOUT); } From 49aa7830396bce33b00fa7ee734c35de36521138 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 Aug 2024 15:35:22 -0400 Subject: [PATCH 697/991] bcachefs: Fix rebalance_work accounting rebalance_work was keying off of the presence of rebelance_opts in the extent - but that was incorrect, we keep those around after rebalance for indirect extents since the inode's options are not directly available Fixes: 20ac515a9cc7 ("bcachefs: bch_acct_rebalance_work") Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 +- fs/bcachefs/buckets.c | 74 +++++++++++++++++++++++------------ fs/bcachefs/extents.c | 39 ++++++++++++++++++ fs/bcachefs/extents.h | 1 + fs/bcachefs/sb-downgrade.c | 8 +++- 5 files changed, 98 insertions(+), 27 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index c75f2e0f32bb96..14ce726bf5a3cc 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -677,7 +677,8 @@ struct bch_sb_field_ext { x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \ x(disk_accounting_v2, BCH_VERSION(1, 9)) \ x(disk_accounting_v3, BCH_VERSION(1, 10)) \ - x(disk_accounting_inum, BCH_VERSION(1, 11)) + x(disk_accounting_inum, BCH_VERSION(1, 11)) \ + x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index be2bbd2486314f..a2274429e7f4ad 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -699,7 +699,8 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, static int __trigger_extent(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c k, - enum btree_iter_update_trigger_flags flags) + enum btree_iter_update_trigger_flags flags, + s64 *replicas_sectors) { bool gc = flags & BTREE_TRIGGER_gc; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -708,7 +709,6 @@ static int __trigger_extent(struct btree_trans *trans, enum bch_data_type data_type = bkey_is_btree_ptr(k.k) ? BCH_DATA_btree : BCH_DATA_user; - s64 replicas_sectors = 0; int ret = 0; struct disk_accounting_pos acc_replicas_key = { @@ -739,7 +739,7 @@ static int __trigger_extent(struct btree_trans *trans, if (ret) return ret; } else if (!p.has_ec) { - replicas_sectors += disk_sectors; + *replicas_sectors += disk_sectors; acc_replicas_key.replicas.devs[acc_replicas_key.replicas.nr_devs++] = p.ptr.dev; } else { ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags); @@ -777,7 +777,7 @@ static int __trigger_extent(struct btree_trans *trans, } if (acc_replicas_key.replicas.nr_devs) { - ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, &replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, replicas_sectors, 1, gc); if (ret) return ret; } @@ -787,7 +787,7 @@ static int __trigger_extent(struct btree_trans *trans, .type = BCH_DISK_ACCOUNTING_snapshot, .snapshot.id = k.k->p.snapshot, }; - ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, &replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, replicas_sectors, 1, gc); if (ret) return ret; } @@ -807,7 +807,7 @@ static int __trigger_extent(struct btree_trans *trans, .type = BCH_DISK_ACCOUNTING_btree, .btree.id = btree_id, }; - ret = bch2_disk_accounting_mod(trans, &acc_btree_key, &replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod(trans, &acc_btree_key, replicas_sectors, 1, gc); if (ret) return ret; } else { @@ -819,22 +819,13 @@ static int __trigger_extent(struct btree_trans *trans, s64 v[3] = { insert ? 1 : -1, insert ? k.k->size : -((s64) k.k->size), - replicas_sectors, + *replicas_sectors, }; ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc); if (ret) return ret; } - if (bch2_bkey_rebalance_opts(k)) { - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_rebalance_work, - }; - ret = bch2_disk_accounting_mod(trans, &acc, &replicas_sectors, 1, gc); - if (ret) - return ret; - } - return 0; } @@ -843,6 +834,7 @@ int bch2_trigger_extent(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s new, enum btree_iter_update_trigger_flags flags) { + struct bch_fs *c = trans->c; struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c); struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old); unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start; @@ -858,21 +850,53 @@ int bch2_trigger_extent(struct btree_trans *trans, new_ptrs_bytes)) return 0; - if (flags & BTREE_TRIGGER_transactional) { - struct bch_fs *c = trans->c; - int mod = (int) bch2_bkey_needs_rebalance(c, new.s_c) - - (int) bch2_bkey_needs_rebalance(c, old); + if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { + s64 old_replicas_sectors = 0, new_replicas_sectors = 0; + + if (old.k->type) { + int ret = __trigger_extent(trans, btree, level, old, + flags & ~BTREE_TRIGGER_insert, + &old_replicas_sectors); + if (ret) + return ret; + } + + if (new.k->type) { + int ret = __trigger_extent(trans, btree, level, new.s_c, + flags & ~BTREE_TRIGGER_overwrite, + &new_replicas_sectors); + if (ret) + return ret; + } + + int need_rebalance_delta = 0; + s64 need_rebalance_sectors_delta = 0; + + s64 s = bch2_bkey_sectors_need_rebalance(c, old); + need_rebalance_delta -= s != 0; + need_rebalance_sectors_delta -= s; - if (mod) { + s = bch2_bkey_sectors_need_rebalance(c, old); + need_rebalance_delta += s != 0; + need_rebalance_sectors_delta += s; + + if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) { int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, - new.k->p, mod > 0); + new.k->p, need_rebalance_delta > 0); if (ret) return ret; } - } - if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) - return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree, level, old, new, flags); + if (need_rebalance_sectors_delta) { + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_rebalance_work, + }; + int ret = bch2_disk_accounting_mod(trans, &acc, &need_rebalance_sectors_delta, 1, + flags & BTREE_TRIGGER_gc); + if (ret) + return ret; + } + } return 0; } diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 9406f82fc2550b..e317df3644a119 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1379,6 +1379,45 @@ bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k) return r != NULL; } +static u64 __bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k, + unsigned target, unsigned compression) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + u64 sectors = 0; + + if (compression) { + unsigned compression_type = bch2_compression_opt_to_type(compression); + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || + p.ptr.unwritten) { + sectors = 0; + goto incompressible; + } + + if (!p.ptr.cached && p.crc.compression_type != compression_type) + sectors += p.crc.compressed_size; + } + } +incompressible: + if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) { + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, target)) + sectors += p.crc.compressed_size; + } + + return sectors; +} + +u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) +{ + const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); + + return r ? __bch2_bkey_sectors_need_rebalance(c, k, r->target, r->compression) : 0; +} + int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, struct bch_io_opts *opts) { diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 1a6ddee48041d1..709dd83183be1f 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -692,6 +692,7 @@ const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c); unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c, unsigned, unsigned); bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c); +u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *, struct bch_io_opts *); diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 650a1f77ca4036..c7e4cdd3f6a521 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -74,6 +74,9 @@ BCH_FSCK_ERR_accounting_key_replicas_devs_unsorted, \ BCH_FSCK_ERR_accounting_key_junk_at_end) \ x(disk_accounting_inum, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_accounting_mismatch) \ + x(rebalance_work_acct_fix, \ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ BCH_FSCK_ERR_accounting_mismatch) @@ -108,7 +111,10 @@ BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \ BCH_FSCK_ERR_fs_usage_replicas_wrong, \ BCH_FSCK_ERR_accounting_replicas_not_marked, \ - BCH_FSCK_ERR_bkey_version_in_future) + BCH_FSCK_ERR_bkey_version_in_future) \ + x(rebalance_work_acct_fix, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_accounting_mismatch) struct upgrade_downgrade_entry { u64 recovery_passes; From 128f71fe014fc91efa1407ce549f94a9a9f1072c Mon Sep 17 00:00:00 2001 From: Huang-Huang Bao Date: Tue, 9 Jul 2024 18:54:28 +0800 Subject: [PATCH 698/991] pinctrl: rockchip: correct RK3328 iomux width flag for GPIO2-B pins The base iomux offsets for each GPIO pin line are accumulatively calculated based off iomux width flag in rockchip_pinctrl_get_soc_data. If the iomux width flag is one of IOMUX_WIDTH_4BIT, IOMUX_WIDTH_3BIT or IOMUX_WIDTH_2BIT, the base offset for next pin line would increase by 8 bytes, otherwise it would increase by 4 bytes. Despite most of GPIO2-B iomux have 2-bit data width, which can be fit into 4 bytes space with write mask, it actually take 8 bytes width for whole GPIO2-B line. Commit e8448a6c817c ("pinctrl: rockchip: fix pinmux bits for RK3328 GPIO2-B pins") wrongly set iomux width flag to 0, causing all base iomux offset for line after GPIO2-B to be calculated wrong. Fix the iomux width flag to IOMUX_WIDTH_2BIT so the offset after GPIO2-B is correctly increased by 8, matching the actual width of GPIO2-B iomux. Fixes: e8448a6c817c ("pinctrl: rockchip: fix pinmux bits for RK3328 GPIO2-B pins") Cc: stable@vger.kernel.org Reported-by: Richard Kojedzinszky Closes: https://lore.kernel.org/linux-rockchip/4f29b743202397d60edfb3c725537415@kojedz.in/ Tested-by: Richard Kojedzinszky Signed-off-by: Huang-Huang Bao Reviewed-by: Heiko Stuebner Tested-by: Daniel Golle Tested-by: Trevor Woerner Link: https://lore.kernel.org/20240709105428.1176375-1-i@eh5.me Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-rockchip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/pinctrl-rockchip.c b/drivers/pinctrl/pinctrl-rockchip.c index 0eacaf10c640fc..6878bc86faa2ce 100644 --- a/drivers/pinctrl/pinctrl-rockchip.c +++ b/drivers/pinctrl/pinctrl-rockchip.c @@ -3795,7 +3795,7 @@ static struct rockchip_pin_bank rk3328_pin_banks[] = { PIN_BANK_IOMUX_FLAGS(0, 32, "gpio0", 0, 0, 0, 0), PIN_BANK_IOMUX_FLAGS(1, 32, "gpio1", 0, 0, 0, 0), PIN_BANK_IOMUX_FLAGS(2, 32, "gpio2", 0, - 0, + IOMUX_WIDTH_2BIT, IOMUX_WIDTH_3BIT, 0), PIN_BANK_IOMUX_FLAGS(3, 32, "gpio3", From 5be63fc19fcaa4c236b307420483578a56986a37 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 25 Aug 2024 19:07:11 +1200 Subject: [PATCH 699/991] Linux 6.11-rc5 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2c1db7a6f793e2..7b60eb103c5d3c 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 11 SUBLEVEL = 0 -EXTRAVERSION = -rc4 +EXTRAVERSION = -rc5 NAME = Baby Opossum Posse # *DOCUMENTATION* From c724b2ab6a46435b4e7d58ad2fbbdb7a318823cf Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 21 Aug 2024 17:18:23 +0200 Subject: [PATCH 700/991] smb/client: avoid dereferencing rdata=NULL in smb2_new_read_req() This happens when called from SMB2_read() while using rdma and reaching the rdma_readwrite_threshold. Cc: stable@vger.kernel.org Fixes: a6559cc1d35d ("cifs: split out smb3_use_rdma_offload() helper") Reviewed-by: David Howells Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 83facb54276a31..8901de199a6b5c 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4441,7 +4441,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len, * If we want to do a RDMA write, fill in and append * smbd_buffer_descriptor_v1 to the end of read request */ - if (smb3_use_rdma_offload(io_parms)) { + if (rdata && smb3_use_rdma_offload(io_parms)) { struct smbd_buffer_descriptor_v1 *v1; bool need_invalidate = server->dialect == SMB30_PROT_ID; From b608e2c318789aeba49055747166e13bee57df4a Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 21 Aug 2024 15:59:12 +0200 Subject: [PATCH 701/991] smb/client: remove unused rq_iter_size from struct smb_rqst Reviewed-by: David Howells Fixes: d08089f649a0 ("cifs: Change the I/O paths to use an iterator rather than a page list") Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 1 - fs/smb/client/cifssmb.c | 1 - fs/smb/client/smb2ops.c | 2 -- fs/smb/client/smb2pdu.c | 2 -- 4 files changed, 6 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 7ebe80a25d0457..f379b9dc93bac1 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -254,7 +254,6 @@ struct cifs_open_info_data { struct smb_rqst { struct kvec *rq_iov; /* array of kvecs */ unsigned int rq_nvec; /* number of kvecs in array */ - size_t rq_iter_size; /* Amount of data in ->rq_iter */ struct iov_iter rq_iter; /* Data iterator */ struct xarray rq_buffer; /* Page buffer for encryption */ }; diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 595c4b673707e9..6dce70f1720826 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1713,7 +1713,6 @@ cifs_async_writev(struct cifs_io_subrequest *wdata) rqst.rq_iov = iov; rqst.rq_nvec = 2; rqst.rq_iter = wdata->subreq.io_iter; - rqst.rq_iter_size = iov_iter_count(&wdata->subreq.io_iter); cifs_dbg(FYI, "async write at %llu %zu bytes\n", wdata->subreq.start, wdata->subreq.len); diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 322cabc69c6f14..ea298456d841ff 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -4446,7 +4446,6 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst, } iov_iter_xarray(&new->rq_iter, ITER_SOURCE, buffer, 0, size); - new->rq_iter_size = size; } } @@ -4492,7 +4491,6 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, rqst.rq_nvec = 2; if (iter) { rqst.rq_iter = *iter; - rqst.rq_iter_size = iov_iter_count(iter); iter_size = iov_iter_count(iter); } diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 8901de199a6b5c..63a2541d4a0524 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4523,7 +4523,6 @@ smb2_readv_callback(struct mid_q_entry *mid) if (rdata->got_bytes) { rqst.rq_iter = rdata->subreq.io_iter; - rqst.rq_iter_size = iov_iter_count(&rdata->subreq.io_iter); } WARN_ONCE(rdata->server != mid->server, @@ -4975,7 +4974,6 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) rqst.rq_iov = iov; rqst.rq_nvec = 1; rqst.rq_iter = wdata->subreq.io_iter; - rqst.rq_iter_size = iov_iter_count(&rqst.rq_iter); if (test_bit(NETFS_SREQ_RETRYING, &wdata->subreq.flags)) smb2_set_replay(server, &rqst); #ifdef CONFIG_CIFS_SMB_DIRECT From 017d1701743657fbfaea74397727a9d2b81846b7 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 21 Aug 2024 16:31:39 +0200 Subject: [PATCH 702/991] smb/client: fix rdma usage in smb2_async_writev() rqst.rq_iter needs to be truncated otherwise we'll also send the bytes into the stream socket... This is the logic behind rqst.rq_npages = 0, which was removed in "cifs: Change the I/O paths to use an iterator rather than a page list" (d08089f649a0cfb2099c8551ac47eef0cc23fdf2). Cc: stable@vger.kernel.org Fixes: d08089f649a0 ("cifs: Change the I/O paths to use an iterator rather than a page list") Reviewed-by: David Howells Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smb2pdu.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 63a2541d4a0524..2d7e6c42cf182b 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4913,6 +4913,13 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) if (rc) goto out; + rqst.rq_iov = iov; + rqst.rq_iter = wdata->subreq.io_iter; + + rqst.rq_iov[0].iov_len = total_len - 1; + rqst.rq_iov[0].iov_base = (char *)req; + rqst.rq_nvec += 1; + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -4924,6 +4931,7 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) req->WriteChannelInfoOffset = 0; req->WriteChannelInfoLength = 0; req->Channel = SMB2_CHANNEL_NONE; + req->Length = cpu_to_le32(io_parms->length); req->Offset = cpu_to_le64(io_parms->offset); req->DataOffset = cpu_to_le16( offsetof(struct smb2_write_req, Buffer)); @@ -4943,7 +4951,6 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) */ if (smb3_use_rdma_offload(io_parms)) { struct smbd_buffer_descriptor_v1 *v1; - size_t data_size = iov_iter_count(&wdata->subreq.io_iter); bool need_invalidate = server->dialect == SMB30_PROT_ID; wdata->mr = smbd_register_mr(server->smbd_conn, &wdata->subreq.io_iter, @@ -4952,9 +4959,10 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) rc = -EAGAIN; goto async_writev_out; } + /* For RDMA read, I/O size is in RemainingBytes not in Length */ + req->RemainingBytes = req->Length; req->Length = 0; req->DataOffset = 0; - req->RemainingBytes = cpu_to_le32(data_size); req->Channel = SMB2_CHANNEL_RDMA_V1_INVALIDATE; if (need_invalidate) req->Channel = SMB2_CHANNEL_RDMA_V1; @@ -4966,30 +4974,22 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) v1->offset = cpu_to_le64(wdata->mr->mr->iova); v1->token = cpu_to_le32(wdata->mr->mr->rkey); v1->length = cpu_to_le32(wdata->mr->mr->length); + + rqst.rq_iov[0].iov_len += sizeof(*v1); + + /* + * We keep wdata->subreq.io_iter, + * but we have to truncate rqst.rq_iter + */ + iov_iter_truncate(&rqst.rq_iter, 0); } #endif - iov[0].iov_len = total_len - 1; - iov[0].iov_base = (char *)req; - rqst.rq_iov = iov; - rqst.rq_nvec = 1; - rqst.rq_iter = wdata->subreq.io_iter; if (test_bit(NETFS_SREQ_RETRYING, &wdata->subreq.flags)) smb2_set_replay(server, &rqst); -#ifdef CONFIG_CIFS_SMB_DIRECT - if (wdata->mr) - iov[0].iov_len += sizeof(struct smbd_buffer_descriptor_v1); -#endif - cifs_dbg(FYI, "async write at %llu %u bytes iter=%zx\n", - io_parms->offset, io_parms->length, iov_iter_count(&rqst.rq_iter)); -#ifdef CONFIG_CIFS_SMB_DIRECT - /* For RDMA read, I/O size is in RemainingBytes not in Length */ - if (!wdata->mr) - req->Length = cpu_to_le32(io_parms->length); -#else - req->Length = cpu_to_le32(io_parms->length); -#endif + cifs_dbg(FYI, "async write at %llu %u bytes iter=%zx\n", + io_parms->offset, io_parms->length, iov_iter_count(&wdata->subreq.io_iter)); if (wdata->credits.value > 0) { shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->subreq.len, From 416871f4fb84bc96822562e654941d5625a25bf8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Aug 2024 14:22:42 +0100 Subject: [PATCH 703/991] cifs: Fix FALLOC_FL_PUNCH_HOLE support The cifs filesystem doesn't quite emulate FALLOC_FL_PUNCH_HOLE correctly (note that due to lack of protocol support, it can't actually implement it directly). Whilst it will (partially) invalidate dirty folios in the pagecache, it doesn't write them back first, and so the EOF marker on the server may be lower than inode->i_size. This presents a problem, however, as if the punched hole invalidates the tail of the locally cached dirty data, writeback won't know it needs to move the EOF over to account for the hole punch (which isn't supposed to move the EOF). We could just write zeroes over the punched out region of the pagecache and write that back - but this is supposed to be a deallocatory operation. Fix this by manually moving the EOF over on the server after the operation if the hole punched would corrupt it. Note that the FSCTL_SET_ZERO_DATA RPC and the setting of the EOF should probably be compounded to stop a third party interfering (or, at least, massively reduce the chance). This was reproducible occasionally by using fsx with the following script: truncate 0x0 0x375e2 0x0 punch_hole 0x2f6d3 0x6ab5 0x375e2 truncate 0x0 0x3a71f 0x375e2 mapread 0xee05 0xcf12 0x3a71f write 0x2078e 0x5604 0x3a71f write 0x3ebdf 0x1421 0x3a71f * punch_hole 0x379d0 0x8630 0x40000 * mapread 0x2aaa2 0x85b 0x40000 fallocate 0x1b401 0x9ada 0x40000 read 0x15f2 0x7d32 0x40000 read 0x32f37 0x7a3b 0x40000 * The second "write" should extend the EOF to 0x40000, and the "punch_hole" should operate inside of that - but that depends on whether the VM gets in and writes back the data first. If it doesn't, the file ends up 0x3a71f in size, not 0x40000. Fixes: 31742c5a3317 ("enable fallocate punch hole ("fallocate -p") for SMB3") Signed-off-by: David Howells cc: Steve French cc: Paulo Alcantara cc: Shyam Prasad N cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev Signed-off-by: Steve French --- fs/smb/client/smb2ops.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index ea298456d841ff..0b9cb1a60d4afa 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -3305,6 +3305,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, struct inode *inode = file_inode(file); struct cifsFileInfo *cfile = file->private_data; struct file_zero_data_information fsctl_buf; + unsigned long long end = offset + len, i_size, remote_i_size; long rc; unsigned int xid; __u8 set_sparse = 1; @@ -3336,6 +3337,27 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, (char *)&fsctl_buf, sizeof(struct file_zero_data_information), CIFSMaxBufSize, NULL, NULL); + + if (rc) + goto unlock; + + /* If there's dirty data in the buffer that would extend the EOF if it + * were written, then we need to move the EOF marker over to the lower + * of the high end of the hole and the proposed EOF. The problem is + * that we locally hole-punch the tail of the dirty data, the proposed + * EOF update will end up in the wrong place. + */ + i_size = i_size_read(inode); + remote_i_size = netfs_inode(inode)->remote_i_size; + if (end > remote_i_size && i_size > remote_i_size) { + unsigned long long extend_to = umin(end, i_size); + rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, cfile->pid, extend_to); + if (rc >= 0) + netfs_inode(inode)->remote_i_size = extend_to; + } + +unlock: filemap_invalidate_unlock(inode->i_mapping); out: inode_unlock(inode); From 58aec91efb93338d1cc7acc0a93242613a2a4e5f Mon Sep 17 00:00:00 2001 From: Miao Wang Date: Sun, 25 Aug 2024 22:17:39 +0800 Subject: [PATCH 704/991] LoongArch: Remove the unused dma-direct.h dma-direct.h is introduced in commit d4b6f1562a3c3284 ("LoongArch: Add Non-Uniform Memory Access (NUMA) support"). In commit c78c43fe7d42524c ("LoongArch: Use acpi_arch_dma_setup() and remove ARCH_HAS_PHYS_TO_DMA"), ARCH_HAS_PHYS_TO_DMA was deselected and the coresponding phys_to_dma()/ dma_to_phys() functions were removed. However, the unused dma-direct.h was left behind, which is removed by this patch. Cc: Fixes: c78c43fe7d42 ("LoongArch: Use acpi_arch_dma_setup() and remove ARCH_HAS_PHYS_TO_DMA") Signed-off-by: Miao Wang Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/dma-direct.h | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 arch/loongarch/include/asm/dma-direct.h diff --git a/arch/loongarch/include/asm/dma-direct.h b/arch/loongarch/include/asm/dma-direct.h deleted file mode 100644 index 75ccd808a2af39..00000000000000 --- a/arch/loongarch/include/asm/dma-direct.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2020-2022 Loongson Technology Corporation Limited - */ -#ifndef _LOONGARCH_DMA_DIRECT_H -#define _LOONGARCH_DMA_DIRECT_H - -dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr); -phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr); - -#endif /* _LOONGARCH_DMA_DIRECT_H */ From 44ceabdec12f4e5938f5668c5a691aa3aac703d7 Mon Sep 17 00:00:00 2001 From: YOUNGJIN JOO Date: Sun, 25 Aug 2024 18:25:15 +0900 Subject: [PATCH 705/991] ALSA: hda/realtek: Fix the speaker output on Samsung Galaxy Book3 Ultra 144d:c1cc requires the same workaround to enable the speaker amp as other Samsung models with the ALC298 codec. Signed-off-by: YOUNGJIN JOO Cc: Link: https://patch.msgid.link/20240825092515.28728-1-neoelec@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index b5cc3417138c74..c04eac6a5064bc 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10540,6 +10540,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x144d, 0xca03, "Samsung Galaxy Book2 Pro 360 (NP930QED)", ALC298_FIXUP_SAMSUNG_AMP), SND_PCI_QUIRK(0x144d, 0xc868, "Samsung Galaxy Book2 Pro (NP930XED)", ALC298_FIXUP_SAMSUNG_AMP), SND_PCI_QUIRK(0x144d, 0xc1ca, "Samsung Galaxy Book3 Pro 360 (NP960QFG-KB1US)", ALC298_FIXUP_SAMSUNG_AMP2), + SND_PCI_QUIRK(0x144d, 0xc1cc, "Samsung Galaxy Book3 Ultra (NT960XFH-XD92G))", ALC298_FIXUP_SAMSUNG_AMP2), SND_PCI_QUIRK(0x1458, 0xfa53, "Gigabyte BXBT-2807", ALC283_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x1462, 0xb120, "MSI Cubi MS-B120", ALC283_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x1462, 0xb171, "Cubi N 8GL (MS-B171)", ALC283_FIXUP_HEADSET_MIC), From 25dfc9e357af8aed1ca79b318a73f2c59c1f0b2b Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 19 Aug 2024 11:30:04 -0700 Subject: [PATCH 706/991] perf/x86/intel: Limit the period on Haswell Running the ltp test cve-2015-3290 concurrently reports the following warnings. perfevents: irq loop stuck! WARNING: CPU: 31 PID: 32438 at arch/x86/events/intel/core.c:3174 intel_pmu_handle_irq+0x285/0x370 Call Trace: ? __warn+0xa4/0x220 ? intel_pmu_handle_irq+0x285/0x370 ? __report_bug+0x123/0x130 ? intel_pmu_handle_irq+0x285/0x370 ? __report_bug+0x123/0x130 ? intel_pmu_handle_irq+0x285/0x370 ? report_bug+0x3e/0xa0 ? handle_bug+0x3c/0x70 ? exc_invalid_op+0x18/0x50 ? asm_exc_invalid_op+0x1a/0x20 ? irq_work_claim+0x1e/0x40 ? intel_pmu_handle_irq+0x285/0x370 perf_event_nmi_handler+0x3d/0x60 nmi_handle+0x104/0x330 Thanks to Thomas Gleixner's analysis, the issue is caused by the low initial period (1) of the frequency estimation algorithm, which triggers the defects of the HW, specifically erratum HSW11 and HSW143. (For the details, please refer https://lore.kernel.org/lkml/87plq9l5d2.ffs@tglx/) The HSW11 requires a period larger than 100 for the INST_RETIRED.ALL event, but the initial period in the freq mode is 1. The erratum is the same as the BDM11, which has been supported in the kernel. A minimum period of 128 is enforced as well on HSW. HSW143 is regarding that the fixed counter 1 may overcount 32 with the Hyper-Threading is enabled. However, based on the test, the hardware has more issues than it tells. Besides the fixed counter 1, the message 'interrupt took too long' can be observed on any counter which was armed with a period < 32 and two events expired in the same NMI. A minimum period of 32 is enforced for the rest of the events. The recommended workaround code of the HSW143 is not implemented. Because it only addresses the issue for the fixed counter. It brings extra overhead through extra MSR writing. No related overcounting issue has been reported so far. Fixes: 3a632cb229bf ("perf/x86/intel: Add simple Haswell PMU support") Reported-by: Li Huafei Suggested-by: Thomas Gleixner Signed-off-by: Kan Liang Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20240819183004.3132920-1-kan.liang@linux.intel.com Closes: https://lore.kernel.org/lkml/20240729223328.327835-1-lihuafei1@huawei.com/ --- arch/x86/events/intel/core.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 0c9c2706d4ec80..9e519d8a810a68 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4589,6 +4589,25 @@ static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void) return HYBRID_INTEL_CORE; } +static inline bool erratum_hsw11(struct perf_event *event) +{ + return (event->hw.config & INTEL_ARCH_EVENT_MASK) == + X86_CONFIG(.event=0xc0, .umask=0x01); +} + +/* + * The HSW11 requires a period larger than 100 which is the same as the BDM11. + * A minimum period of 128 is enforced as well for the INST_RETIRED.ALL. + * + * The message 'interrupt took too long' can be observed on any counter which + * was armed with a period < 32 and two events expired in the same NMI. + * A minimum period of 32 is enforced for the rest of the events. + */ +static void hsw_limit_period(struct perf_event *event, s64 *left) +{ + *left = max(*left, erratum_hsw11(event) ? 128 : 32); +} + /* * Broadwell: * @@ -4606,8 +4625,7 @@ static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void) */ static void bdw_limit_period(struct perf_event *event, s64 *left) { - if ((event->hw.config & INTEL_ARCH_EVENT_MASK) == - X86_CONFIG(.event=0xc0, .umask=0x01)) { + if (erratum_hsw11(event)) { if (*left < 128) *left = 128; *left &= ~0x3fULL; @@ -6766,6 +6784,7 @@ __init int intel_pmu_init(void) x86_pmu.hw_config = hsw_hw_config; x86_pmu.get_event_constraints = hsw_get_event_constraints; + x86_pmu.limit_period = hsw_limit_period; x86_pmu.lbr_double_abort = true; extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? hsw_format_attr : nhm_format_attr; From 9efaebc0072b8e95505544bf385c20ee8a29d799 Mon Sep 17 00:00:00 2001 From: Ross Brown Date: Tue, 30 Jul 2024 08:21:42 +0200 Subject: [PATCH 707/991] hwmon: (asus-ec-sensors) remove VRM temp X570-E GAMING X570-E GAMING does not have VRM temperature sensor. Signed-off-by: Ross Brown Signed-off-by: Eugene Shalygin Link: https://lore.kernel.org/r/20240730062320.5188-2-eugene.shalygin@gmail.com Signed-off-by: Guenter Roeck --- drivers/hwmon/asus-ec-sensors.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/asus-ec-sensors.c b/drivers/hwmon/asus-ec-sensors.c index 6bb8d7b1d2194b..ee396f21fac5e9 100644 --- a/drivers/hwmon/asus-ec-sensors.c +++ b/drivers/hwmon/asus-ec-sensors.c @@ -420,7 +420,7 @@ static const struct ec_board_info board_info_strix_b550_i_gaming = { static const struct ec_board_info board_info_strix_x570_e_gaming = { .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | - SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | + SENSOR_TEMP_T_SENSOR | SENSOR_FAN_CHIPSET | SENSOR_CURR_CPU | SENSOR_IN_CPU_CORE, .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, From 0075df288dd8a7abfe03b3766176c393063591dd Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 29 Jul 2024 08:33:27 +0300 Subject: [PATCH 708/991] microblaze: don't treat zero reserved memory regions as error Before commit 721f4a6526da ("mm/memblock: remove empty dummy entry") the check for non-zero of memblock.reserved.cnt in mmu_init() would always be true either because memblock.reserved.cnt is initialized to 1 or because there were memory reservations earlier. The removal of dummy empty entry in memblock caused this check to fail because now memblock.reserved.cnt is initialized to 0. Remove the check for non-zero of memblock.reserved.cnt because it's perfectly fine to have an empty memblock.reserved array that early in boot. Reported-by: Guenter Roeck Signed-off-by: Mike Rapoport Reviewed-by: Wei Yang Tested-by: Guenter Roeck Link: https://lore.kernel.org/r/20240729053327.4091459-1-rppt@kernel.org Signed-off-by: Guenter Roeck --- arch/microblaze/mm/init.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 3827dc76edd823..4520c57415797f 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -193,11 +193,6 @@ asmlinkage void __init mmu_init(void) { unsigned int kstart, ksize; - if (!memblock.reserved.cnt) { - pr_emerg("Error memory count\n"); - machine_restart(NULL); - } - if ((u32) memblock.memory.regions[0].size < 0x400000) { pr_emerg("Memory must be greater than 4MB\n"); machine_restart(NULL); From 5a4c785905fd9361d067127b42564c08893f2a6f Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Mon, 5 Aug 2024 18:13:41 -0700 Subject: [PATCH 709/991] Revert "MIPS: csrc-r4k: Apply verification clocksource flags" This reverts commit 7190401fc56fb5f02ee3d04476778ab000bbaf32. Verifying the clock source sometimes deems the MIPS clock to be unstable, at least in qemu. clocksource: timekeeping watchdog on CPU0: Marking clocksource 'MIPS' as unstable because the skew is too large: clocksource: 'jiffies' wd_nsec: 500000000 wd_now: ffff8bde wd_last: ffff8bac mask: ffffffff clocksource: 'MIPS' cs_nsec: 940634468 cs_now: 310181c4 cs_last: 28090a09 mask: ffffffff clocksource: Clocksource 'MIPS' skewed 440634468 ns (440 ms) over watchdog 'jiffies' interval of 500000000 ns (500 ms) clocksource: 'MIPS' is current clocksource. If this happens, network interfaces fail to come online. Signed-off-by: Guenter Roeck --- arch/mips/kernel/csrc-r4k.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/mips/kernel/csrc-r4k.c b/arch/mips/kernel/csrc-r4k.c index bdb1fa8931f4a5..59eca397f2971c 100644 --- a/arch/mips/kernel/csrc-r4k.c +++ b/arch/mips/kernel/csrc-r4k.c @@ -21,9 +21,7 @@ static struct clocksource clocksource_mips = { .name = "MIPS", .read = c0_hpt_read, .mask = CLOCKSOURCE_MASK(32), - .flags = CLOCK_SOURCE_IS_CONTINUOUS | - CLOCK_SOURCE_MUST_VERIFY | - CLOCK_SOURCE_VERIFY_PERCPU, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static u64 __maybe_unused notrace r4k_read_sched_clock(void) From 98c0cc48e27e9d269a3e4db2acd72b486c88ec77 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 8 Aug 2024 08:50:03 -0700 Subject: [PATCH 710/991] apparmor: fix policy_unpack_test on big endian systems policy_unpack_test fails on big endian systems because data byte order is expected to be little endian but is generated in host byte order. This results in test failures such as: # policy_unpack_test_unpack_array_with_null_name: EXPECTATION FAILED at security/apparmor/policy_unpack_test.c:150 Expected array_size == (u16)16, but array_size == 4096 (0x1000) (u16)16 == 16 (0x10) # policy_unpack_test_unpack_array_with_null_name: pass:0 fail:1 skip:0 total:1 not ok 3 policy_unpack_test_unpack_array_with_null_name # policy_unpack_test_unpack_array_with_name: EXPECTATION FAILED at security/apparmor/policy_unpack_test.c:164 Expected array_size == (u16)16, but array_size == 4096 (0x1000) (u16)16 == 16 (0x10) # policy_unpack_test_unpack_array_with_name: pass:0 fail:1 skip:0 total:1 Add the missing endianness conversions when generating test data. Fixes: 4d944bcd4e73 ("apparmor: add AppArmor KUnit tests for policy unpack") Cc: Brendan Higgins Cc: Kees Cook Signed-off-by: Guenter Roeck --- security/apparmor/policy_unpack_test.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/security/apparmor/policy_unpack_test.c b/security/apparmor/policy_unpack_test.c index 874fcf97794eed..c64733d6c98fbb 100644 --- a/security/apparmor/policy_unpack_test.c +++ b/security/apparmor/policy_unpack_test.c @@ -80,14 +80,14 @@ static struct aa_ext *build_aa_ext_struct(struct policy_unpack_fixture *puf, *(buf + 1) = strlen(TEST_U32_NAME) + 1; strscpy(buf + 3, TEST_U32_NAME, e->end - (void *)(buf + 3)); *(buf + 3 + strlen(TEST_U32_NAME) + 1) = AA_U32; - *((u32 *)(buf + 3 + strlen(TEST_U32_NAME) + 2)) = TEST_U32_DATA; + *((__le32 *)(buf + 3 + strlen(TEST_U32_NAME) + 2)) = cpu_to_le32(TEST_U32_DATA); buf = e->start + TEST_NAMED_U64_BUF_OFFSET; *buf = AA_NAME; *(buf + 1) = strlen(TEST_U64_NAME) + 1; strscpy(buf + 3, TEST_U64_NAME, e->end - (void *)(buf + 3)); *(buf + 3 + strlen(TEST_U64_NAME) + 1) = AA_U64; - *((u64 *)(buf + 3 + strlen(TEST_U64_NAME) + 2)) = TEST_U64_DATA; + *((__le64 *)(buf + 3 + strlen(TEST_U64_NAME) + 2)) = cpu_to_le64(TEST_U64_DATA); buf = e->start + TEST_NAMED_BLOB_BUF_OFFSET; *buf = AA_NAME; @@ -103,7 +103,7 @@ static struct aa_ext *build_aa_ext_struct(struct policy_unpack_fixture *puf, *(buf + 1) = strlen(TEST_ARRAY_NAME) + 1; strscpy(buf + 3, TEST_ARRAY_NAME, e->end - (void *)(buf + 3)); *(buf + 3 + strlen(TEST_ARRAY_NAME) + 1) = AA_ARRAY; - *((u16 *)(buf + 3 + strlen(TEST_ARRAY_NAME) + 2)) = TEST_ARRAY_SIZE; + *((__le16 *)(buf + 3 + strlen(TEST_ARRAY_NAME) + 2)) = cpu_to_le16(TEST_ARRAY_SIZE); return e; } From aba07b9a0587f50e5d3346eaa19019cf3f86c0ea Mon Sep 17 00:00:00 2001 From: Zack Rusin Date: Fri, 16 Aug 2024 14:32:05 -0400 Subject: [PATCH 711/991] drm/vmwgfx: Prevent unmapping active read buffers The kms paths keep a persistent map active to read and compare the cursor buffer. These maps can race with each other in simple scenario where: a) buffer "a" mapped for update b) buffer "a" mapped for compare c) do the compare d) unmap "a" for compare e) update the cursor f) unmap "a" for update At step "e" the buffer has been unmapped and the read contents is bogus. Prevent unmapping of active read buffers by simply keeping a count of how many paths have currently active maps and unmap only when the count reaches 0. Fixes: 485d98d472d5 ("drm/vmwgfx: Add support for CursorMob and CursorBypass 4") Cc: Broadcom internal kernel review list Cc: dri-devel@lists.freedesktop.org Cc: # v5.19+ Signed-off-by: Zack Rusin Link: https://patchwork.freedesktop.org/patch/msgid/20240816183332.31961-2-zack.rusin@broadcom.com Reviewed-by: Martin Krastev Reviewed-by: Maaz Mombasawala --- drivers/gpu/drm/vmwgfx/vmwgfx_bo.c | 13 +++++++++++-- drivers/gpu/drm/vmwgfx/vmwgfx_bo.h | 3 +++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c index f42ebc4a7c2255..a0e433fbcba67c 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c @@ -360,6 +360,8 @@ void *vmw_bo_map_and_cache_size(struct vmw_bo *vbo, size_t size) void *virtual; int ret; + atomic_inc(&vbo->map_count); + virtual = ttm_kmap_obj_virtual(&vbo->map, ¬_used); if (virtual) return virtual; @@ -383,11 +385,17 @@ void *vmw_bo_map_and_cache_size(struct vmw_bo *vbo, size_t size) */ void vmw_bo_unmap(struct vmw_bo *vbo) { + int map_count; + if (vbo->map.bo == NULL) return; - ttm_bo_kunmap(&vbo->map); - vbo->map.bo = NULL; + map_count = atomic_dec_return(&vbo->map_count); + + if (!map_count) { + ttm_bo_kunmap(&vbo->map); + vbo->map.bo = NULL; + } } @@ -421,6 +429,7 @@ static int vmw_bo_init(struct vmw_private *dev_priv, vmw_bo->tbo.priority = 3; vmw_bo->res_tree = RB_ROOT; xa_init(&vmw_bo->detached_resources); + atomic_set(&vmw_bo->map_count, 0); params->size = ALIGN(params->size, PAGE_SIZE); drm_gem_private_object_init(vdev, &vmw_bo->tbo.base, params->size); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.h b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.h index 62b4342d5f7c50..43b5439ec9f760 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.h @@ -71,6 +71,8 @@ struct vmw_bo_params { * @map: Kmap object for semi-persistent mappings * @res_tree: RB tree of resources using this buffer object as a backing MOB * @res_prios: Eviction priority counts for attached resources + * @map_count: The number of currently active maps. Will differ from the + * cpu_writers because it includes kernel maps. * @cpu_writers: Number of synccpu write grabs. Protected by reservation when * increased. May be decreased without reservation. * @dx_query_ctx: DX context if this buffer object is used as a DX query MOB @@ -90,6 +92,7 @@ struct vmw_bo { u32 res_prios[TTM_MAX_BO_PRIORITY]; struct xarray detached_resources; + atomic_t map_count; atomic_t cpu_writers; /* Not ref-counted. Protected by binding_mutex */ struct vmw_resource *dx_query_ctx; From 50f1199250912568606b3778dc56646c10cb7b04 Mon Sep 17 00:00:00 2001 From: Zack Rusin Date: Fri, 16 Aug 2024 14:32:06 -0400 Subject: [PATCH 712/991] drm/vmwgfx: Fix prime with external buffers Make sure that for external buffers mapping goes through the dma_buf interface instead of trying to access pages directly. External buffers might not provide direct access to readable/writable pages so to make sure the bo's created from external dma_bufs can be read dma_buf interface has to be used. Fixes crashes in IGT's kms_prime with vgem. Regular desktop usage won't trigger this due to the fact that virtual machines will not have multiple GPUs but it enables better test coverage in IGT. Signed-off-by: Zack Rusin Fixes: b32233acceff ("drm/vmwgfx: Fix prime import/export") Cc: # v6.6+ Cc: Broadcom internal kernel review list Cc: dri-devel@lists.freedesktop.org Cc: # v6.9+ Link: https://patchwork.freedesktop.org/patch/msgid/20240816183332.31961-3-zack.rusin@broadcom.com Reviewed-by: Martin Krastev Reviewed-by: Maaz Mombasawala --- drivers/gpu/drm/vmwgfx/vmwgfx_blit.c | 114 ++++++++++++++++++++++++++- drivers/gpu/drm/vmwgfx/vmwgfx_drv.h | 4 +- drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c | 12 +-- 3 files changed, 118 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c index 717d624e9a0522..890a66a2361f4e 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c @@ -27,6 +27,8 @@ **************************************************************************/ #include "vmwgfx_drv.h" + +#include "vmwgfx_bo.h" #include /* @@ -420,13 +422,105 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d, return 0; } +static void *map_external(struct vmw_bo *bo, struct iosys_map *map) +{ + struct vmw_private *vmw = + container_of(bo->tbo.bdev, struct vmw_private, bdev); + void *ptr = NULL; + int ret; + + if (bo->tbo.base.import_attach) { + ret = dma_buf_vmap(bo->tbo.base.dma_buf, map); + if (ret) { + drm_dbg_driver(&vmw->drm, + "Wasn't able to map external bo!\n"); + goto out; + } + ptr = map->vaddr; + } else { + ptr = vmw_bo_map_and_cache(bo); + } + +out: + return ptr; +} + +static void unmap_external(struct vmw_bo *bo, struct iosys_map *map) +{ + if (bo->tbo.base.import_attach) + dma_buf_vunmap(bo->tbo.base.dma_buf, map); + else + vmw_bo_unmap(bo); +} + +static int vmw_external_bo_copy(struct vmw_bo *dst, u32 dst_offset, + u32 dst_stride, struct vmw_bo *src, + u32 src_offset, u32 src_stride, + u32 width_in_bytes, u32 height, + struct vmw_diff_cpy *diff) +{ + struct vmw_private *vmw = + container_of(dst->tbo.bdev, struct vmw_private, bdev); + size_t dst_size = dst->tbo.resource->size; + size_t src_size = src->tbo.resource->size; + struct iosys_map dst_map = {0}; + struct iosys_map src_map = {0}; + int ret, i; + int x_in_bytes; + u8 *vsrc; + u8 *vdst; + + vsrc = map_external(src, &src_map); + if (!vsrc) { + drm_dbg_driver(&vmw->drm, "Wasn't able to map src\n"); + ret = -ENOMEM; + goto out; + } + + vdst = map_external(dst, &dst_map); + if (!vdst) { + drm_dbg_driver(&vmw->drm, "Wasn't able to map dst\n"); + ret = -ENOMEM; + goto out; + } + + vsrc += src_offset; + vdst += dst_offset; + if (src_stride == dst_stride) { + dst_size -= dst_offset; + src_size -= src_offset; + memcpy(vdst, vsrc, + min(dst_stride * height, min(dst_size, src_size))); + } else { + WARN_ON(dst_stride < width_in_bytes); + for (i = 0; i < height; ++i) { + memcpy(vdst, vsrc, width_in_bytes); + vsrc += src_stride; + vdst += dst_stride; + } + } + + x_in_bytes = (dst_offset % dst_stride); + diff->rect.x1 = x_in_bytes / diff->cpp; + diff->rect.y1 = ((dst_offset - x_in_bytes) / dst_stride); + diff->rect.x2 = diff->rect.x1 + width_in_bytes / diff->cpp; + diff->rect.y2 = diff->rect.y1 + height; + + ret = 0; +out: + unmap_external(src, &src_map); + unmap_external(dst, &dst_map); + + return ret; +} + /** * vmw_bo_cpu_blit - in-kernel cpu blit. * - * @dst: Destination buffer object. + * @vmw_dst: Destination buffer object. * @dst_offset: Destination offset of blit start in bytes. * @dst_stride: Destination stride in bytes. - * @src: Source buffer object. + * @vmw_src: Source buffer object. * @src_offset: Source offset of blit start in bytes. * @src_stride: Source stride in bytes. * @w: Width of blit. @@ -444,13 +538,15 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d, * Neither of the buffer objects may be placed in PCI memory * (Fixed memory in TTM terminology) when using this function. */ -int vmw_bo_cpu_blit(struct ttm_buffer_object *dst, +int vmw_bo_cpu_blit(struct vmw_bo *vmw_dst, u32 dst_offset, u32 dst_stride, - struct ttm_buffer_object *src, + struct vmw_bo *vmw_src, u32 src_offset, u32 src_stride, u32 w, u32 h, struct vmw_diff_cpy *diff) { + struct ttm_buffer_object *src = &vmw_src->tbo; + struct ttm_buffer_object *dst = &vmw_dst->tbo; struct ttm_operation_ctx ctx = { .interruptible = false, .no_wait_gpu = false @@ -460,6 +556,11 @@ int vmw_bo_cpu_blit(struct ttm_buffer_object *dst, int ret = 0; struct page **dst_pages = NULL; struct page **src_pages = NULL; + bool src_external = (src->ttm->page_flags & TTM_TT_FLAG_EXTERNAL) != 0; + bool dst_external = (dst->ttm->page_flags & TTM_TT_FLAG_EXTERNAL) != 0; + + if (WARN_ON(dst == src)) + return -EINVAL; /* Buffer objects need to be either pinned or reserved: */ if (!(dst->pin_count)) @@ -479,6 +580,11 @@ int vmw_bo_cpu_blit(struct ttm_buffer_object *dst, return ret; } + if (src_external || dst_external) + return vmw_external_bo_copy(vmw_dst, dst_offset, dst_stride, + vmw_src, src_offset, src_stride, + w, h, diff); + if (!src->ttm->pages && src->ttm->sg) { src_pages = kvmalloc_array(src->ttm->num_pages, sizeof(struct page *), GFP_KERNEL); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h index 32f50e59580972..3f4719b3c26818 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h @@ -1353,9 +1353,9 @@ void vmw_diff_memcpy(struct vmw_diff_cpy *diff, u8 *dest, const u8 *src, void vmw_memcpy(struct vmw_diff_cpy *diff, u8 *dest, const u8 *src, size_t n); -int vmw_bo_cpu_blit(struct ttm_buffer_object *dst, +int vmw_bo_cpu_blit(struct vmw_bo *dst, u32 dst_offset, u32 dst_stride, - struct ttm_buffer_object *src, + struct vmw_bo *src, u32 src_offset, u32 src_stride, u32 w, u32 h, struct vmw_diff_cpy *diff); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c b/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c index 5453f7cf0e2d7b..fab155a68054a7 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c @@ -502,7 +502,7 @@ static void vmw_stdu_bo_cpu_commit(struct vmw_kms_dirty *dirty) container_of(dirty->unit, typeof(*stdu), base); s32 width, height; s32 src_pitch, dst_pitch; - struct ttm_buffer_object *src_bo, *dst_bo; + struct vmw_bo *src_bo, *dst_bo; u32 src_offset, dst_offset; struct vmw_diff_cpy diff = VMW_CPU_BLIT_DIFF_INITIALIZER(stdu->cpp); @@ -517,11 +517,11 @@ static void vmw_stdu_bo_cpu_commit(struct vmw_kms_dirty *dirty) /* Assume we are blitting from Guest (bo) to Host (display_srf) */ src_pitch = stdu->display_srf->metadata.base_size.width * stdu->cpp; - src_bo = &stdu->display_srf->res.guest_memory_bo->tbo; + src_bo = stdu->display_srf->res.guest_memory_bo; src_offset = ddirty->top * src_pitch + ddirty->left * stdu->cpp; dst_pitch = ddirty->pitch; - dst_bo = &ddirty->buf->tbo; + dst_bo = ddirty->buf; dst_offset = ddirty->fb_top * dst_pitch + ddirty->fb_left * stdu->cpp; (void) vmw_bo_cpu_blit(dst_bo, dst_offset, dst_pitch, @@ -1170,7 +1170,7 @@ vmw_stdu_bo_populate_update_cpu(struct vmw_du_update_plane *update, void *cmd, struct vmw_diff_cpy diff = VMW_CPU_BLIT_DIFF_INITIALIZER(0); struct vmw_stdu_update_gb_image *cmd_img = cmd; struct vmw_stdu_update *cmd_update; - struct ttm_buffer_object *src_bo, *dst_bo; + struct vmw_bo *src_bo, *dst_bo; u32 src_offset, dst_offset; s32 src_pitch, dst_pitch; s32 width, height; @@ -1184,11 +1184,11 @@ vmw_stdu_bo_populate_update_cpu(struct vmw_du_update_plane *update, void *cmd, diff.cpp = stdu->cpp; - dst_bo = &stdu->display_srf->res.guest_memory_bo->tbo; + dst_bo = stdu->display_srf->res.guest_memory_bo; dst_pitch = stdu->display_srf->metadata.base_size.width * stdu->cpp; dst_offset = bb->y1 * dst_pitch + bb->x1 * stdu->cpp; - src_bo = &vfbbo->buffer->tbo; + src_bo = vfbbo->buffer; src_pitch = update->vfb->base.pitches[0]; src_offset = bo_update->fb_top * src_pitch + bo_update->fb_left * stdu->cpp; From e9fd436bb8fb9b9d31fdf07bbcdba6d30290c5e4 Mon Sep 17 00:00:00 2001 From: Zack Rusin Date: Fri, 16 Aug 2024 14:32:07 -0400 Subject: [PATCH 713/991] drm/vmwgfx: Disable coherent dumb buffers without 3d Coherent surfaces make only sense if the host renders to them using accelerated apis. Without 3d the entire content of dumb buffers stays in the guest making all of the extra work they're doing to synchronize between guest and host useless. Configurations without 3d also tend to run with very low graphics memory limits. The pinned console fb, mob cursors and graphical login manager tend to run out of 16MB graphics memory that those guests use. Fix it by making sure the coherent dumb buffers are only used on configs with 3d enabled. Signed-off-by: Zack Rusin Fixes: d6667f0ddf46 ("drm/vmwgfx: Fix handling of dumb buffers") Reported-by: Christian Heusel Closes: https://lore.kernel.org/all/0d0330f3-2ac0-4cd5-8075-7f1cbaf72a8e@heusel.eu Cc: Broadcom internal kernel review list Cc: dri-devel@lists.freedesktop.org Cc: # v6.9+ Link: https://patchwork.freedesktop.org/patch/msgid/20240816183332.31961-4-zack.rusin@broadcom.com Reviewed-by: Martin Krastev Reviewed-by: Maaz Mombasawala Tested-by: Benjamin Coddington --- drivers/gpu/drm/vmwgfx/vmwgfx_surface.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c index 8ae6a761c90034..1625b30d997004 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c @@ -2283,9 +2283,11 @@ int vmw_dumb_create(struct drm_file *file_priv, /* * Without mob support we're just going to use raw memory buffer * because we wouldn't be able to support full surface coherency - * without mobs + * without mobs. There also no reason to support surface coherency + * without 3d (i.e. gpu usage on the host) because then all the + * contents is going to be rendered guest side. */ - if (!dev_priv->has_mob) { + if (!dev_priv->has_mob || !vmw_supports_3d(dev_priv)) { int cpp = DIV_ROUND_UP(args->bpp, 8); switch (cpp) { From e21fea4ac3cf12eba1921fbbf7764bf69c6d4b2c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Aug 2024 16:59:01 -0700 Subject: [PATCH 714/991] xfs: fix di_onlink checking for V1/V2 inodes "KjellR" complained on IRC that an old V4 filesystem suddenly stopped mounting after upgrading from 6.9.11 to 6.10.3, with the following splat when trying to read the rt bitmap inode: 00000000: 49 4e 80 00 01 02 00 01 00 00 00 00 00 00 00 00 IN.............. 00000010: 00 00 00 01 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000020: 00 00 00 00 00 00 00 00 43 d2 a9 da 21 0f d6 30 ........C...!..0 00000030: 43 d2 a9 da 21 0f d6 30 00 00 00 00 00 00 00 00 C...!..0........ 00000040: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000050: 00 00 00 02 00 00 00 00 00 00 00 04 00 00 00 00 ................ 00000060: ff ff ff ff 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000070: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ As Dave Chinner points out, this is a V1 inode with both di_onlink and di_nlink set to 1 and di_flushiter == 0. In other words, this inode was formatted this way by mkfs and hasn't been touched since then. Back in the old days of xfsprogs 3.2.3, I observed that libxfs_ialloc would set di_nlink, but if the filesystem didn't have NLINK, it would then set di_version = 1. libxfs_iflush_int later sees the V1 inode and copies the value of di_nlink to di_onlink without zeroing di_onlink. Eventually this filesystem must have been upgraded to support NLINK because 6.10 doesn't support !NLINK filesystems, which is how we tripped over this old behavior. The filesystem doesn't have a realtime section, so that's why the rtbitmap inode has never been touched. Fix this by removing the di_onlink/di_nlink checking for all V1/V2 inodes because this is a muddy mess. The V3 inode handling code has always supported NLINK and written di_onlink==0 so keep that check. The removal of the V1 inode handling code when we dropped support for !NLINK obscured this old behavior. Reported-by: kjell.m.randa@gmail.com Fixes: 40cb8613d612 ("xfs: check unused nlink fields in the ondisk inode") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_inode_buf.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 513b50da6215f7..79babeac9d7546 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -514,12 +514,18 @@ xfs_dinode_verify( return __this_address; } - if (dip->di_version > 1) { + /* + * Historical note: xfsprogs in the 3.2 era set up its incore inodes to + * have di_nlink track the link count, even if the actual filesystem + * only supported V1 inodes (i.e. di_onlink). When writing out the + * ondisk inode, it would set both the ondisk di_nlink and di_onlink to + * the the incore di_nlink value, which is why we cannot check for + * di_nlink==0 on a V1 inode. V2/3 inodes would get written out with + * di_onlink==0, so we can check that. + */ + if (dip->di_version >= 2) { if (dip->di_onlink) return __this_address; - } else { - if (dip->di_nlink) - return __this_address; } /* don't allow invalid i_size */ From 5335affcff91b53cfc45694171f911cb23257c8b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Aug 2024 16:59:17 -0700 Subject: [PATCH 715/991] xfs: fix folio dirtying for XFILE_ALLOC callers willy pointed out that folio_mark_dirty is the correct function to use to mark an xfile folio dirty because it calls out to the mapping's aops to mark it dirty. For tmpfs this likely doesn't matter much since it currently uses nop_dirty_folio, but let's use the abstractions properly. Reported-by: willy@infradead.org Fixes: 6907e3c00a40 ("xfs: add file_{get,put}_folio") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/scrub/xfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index d848222f802baa..9b5d98fe1f8ab3 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -293,7 +293,7 @@ xfile_get_folio( * (potentially last) reference in xfile_put_folio. */ if (flags & XFILE_ALLOC) - folio_set_dirty(folio); + folio_mark_dirty(folio); return folio; } From 95179935beadccaf0f0bb461adb778731e293da4 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 22 Aug 2024 16:59:33 -0700 Subject: [PATCH 716/991] xfs: xfs_finobt_count_blocks() walks the wrong btree As a result of the factoring in commit 14dd46cf31f4 ("xfs: split xfs_inobt_init_cursor"), mount started taking a long time on a user's filesystem. For Anders, this made mount times regress from under a second to over 15 minutes for a filesystem with only 30 million inodes in it. Anders bisected it down to the above commit, but even then the bug was not obvious. In this commit, over 20 calls to xfs_inobt_init_cursor() were modified, and some we modified to call a new function named xfs_finobt_init_cursor(). If that takes you a moment to reread those function names to see what the rename was, then you have realised why this bug wasn't spotted during review. And it wasn't spotted on inspection even after the bisect pointed at this commit - a single missing "f" isn't the easiest thing for a human eye to notice.... The result is that xfs_finobt_count_blocks() now incorrectly calls xfs_inobt_init_cursor() so it is now walking the inobt instead of the finobt. Hence when there are lots of allocated inodes in a filesystem, mount takes a -long- time run because it now walks a massive allocated inode btrees instead of the small, nearly empty free inode btrees. It also means all the finobt space reservations are wrong, so mount could potentially given ENOSPC on kernel upgrade. In hindsight, commit 14dd46cf31f4 should have been two commits - the first to convert the finobt callers to the new API, the second to modify the xfs_inobt_init_cursor() API for the inobt callers. That would have made the bug very obvious during review. Fixes: 14dd46cf31f4 ("xfs: split xfs_inobt_init_cursor") Reported-by: Anders Blomdell Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_ialloc_btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 496e2f72a85b98..797d5b5f7b7255 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -749,7 +749,7 @@ xfs_finobt_count_blocks( if (error) return error; - cur = xfs_inobt_init_cursor(pag, tp, agbp); + cur = xfs_finobt_init_cursor(pag, tp, agbp); error = xfs_btree_count_blocks(cur, tree_blocks); xfs_btree_del_cursor(cur, error); xfs_trans_brelse(tp, agbp); From 410e8a18f8e9311c6bf29ae47f32ad46f0219569 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Aug 2024 16:59:48 -0700 Subject: [PATCH 717/991] xfs: don't bother reporting blocks trimmed via FITRIM Don't bother reporting the number of bytes that we "trimmed" because the underlying storage isn't required to do anything(!) and failed discard IOs aren't reported to the caller anyway. It's not like userspace can use the reported value for anything useful like adjusting the offset parameter of the next call, and it's not like anyone ever wrote a manpage about FITRIM's out parameters. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Tested-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/xfs_discard.c | 36 +++++++++++------------------------- 1 file changed, 11 insertions(+), 25 deletions(-) diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 6f0fc7fe1f2ba9..25f5dffeab2aeb 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -158,8 +158,7 @@ static int xfs_trim_gather_extents( struct xfs_perag *pag, struct xfs_trim_cur *tcur, - struct xfs_busy_extents *extents, - uint64_t *blocks_trimmed) + struct xfs_busy_extents *extents) { struct xfs_mount *mp = pag->pag_mount; struct xfs_trans *tp; @@ -280,7 +279,6 @@ xfs_trim_gather_extents( xfs_extent_busy_insert_discard(pag, fbno, flen, &extents->extent_list); - *blocks_trimmed += flen; next_extent: if (tcur->by_bno) error = xfs_btree_increment(cur, 0, &i); @@ -327,8 +325,7 @@ xfs_trim_perag_extents( struct xfs_perag *pag, xfs_agblock_t start, xfs_agblock_t end, - xfs_extlen_t minlen, - uint64_t *blocks_trimmed) + xfs_extlen_t minlen) { struct xfs_trim_cur tcur = { .start = start, @@ -354,8 +351,7 @@ xfs_trim_perag_extents( extents->owner = extents; INIT_LIST_HEAD(&extents->extent_list); - error = xfs_trim_gather_extents(pag, &tcur, extents, - blocks_trimmed); + error = xfs_trim_gather_extents(pag, &tcur, extents); if (error) { kfree(extents); break; @@ -389,8 +385,7 @@ xfs_trim_datadev_extents( struct xfs_mount *mp, xfs_daddr_t start, xfs_daddr_t end, - xfs_extlen_t minlen, - uint64_t *blocks_trimmed) + xfs_extlen_t minlen) { xfs_agnumber_t start_agno, end_agno; xfs_agblock_t start_agbno, end_agbno; @@ -411,8 +406,7 @@ xfs_trim_datadev_extents( if (start_agno == end_agno) agend = end_agbno; - error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen, - blocks_trimmed); + error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen); if (error) last_error = error; @@ -431,9 +425,6 @@ struct xfs_trim_rtdev { /* list of rt extents to free */ struct list_head extent_list; - /* pointer to count of blocks trimmed */ - uint64_t *blocks_trimmed; - /* minimum length that caller allows us to trim */ xfs_rtblock_t minlen_fsb; @@ -551,7 +542,6 @@ xfs_trim_gather_rtextent( busyp->length = rlen; INIT_LIST_HEAD(&busyp->list); list_add_tail(&busyp->list, &tr->extent_list); - *tr->blocks_trimmed += rlen; tr->restart_rtx = rec->ar_startext + rec->ar_extcount; return 0; @@ -562,13 +552,11 @@ xfs_trim_rtdev_extents( struct xfs_mount *mp, xfs_daddr_t start, xfs_daddr_t end, - xfs_daddr_t minlen, - uint64_t *blocks_trimmed) + xfs_daddr_t minlen) { struct xfs_rtalloc_rec low = { }; struct xfs_rtalloc_rec high = { }; struct xfs_trim_rtdev tr = { - .blocks_trimmed = blocks_trimmed, .minlen_fsb = XFS_BB_TO_FSB(mp, minlen), }; struct xfs_trans *tp; @@ -634,7 +622,7 @@ xfs_trim_rtdev_extents( return error; } #else -# define xfs_trim_rtdev_extents(m,s,e,n,b) (-EOPNOTSUPP) +# define xfs_trim_rtdev_extents(...) (-EOPNOTSUPP) #endif /* CONFIG_XFS_RT */ /* @@ -661,7 +649,6 @@ xfs_ioc_trim( xfs_daddr_t start, end; xfs_extlen_t minlen; xfs_rfsblock_t max_blocks; - uint64_t blocks_trimmed = 0; int error, last_error = 0; if (!capable(CAP_SYS_ADMIN)) @@ -706,15 +693,13 @@ xfs_ioc_trim( end = start + BTOBBT(range.len) - 1; if (bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) { - error = xfs_trim_datadev_extents(mp, start, end, minlen, - &blocks_trimmed); + error = xfs_trim_datadev_extents(mp, start, end, minlen); if (error) last_error = error; } if (rt_bdev && !xfs_trim_should_stop()) { - error = xfs_trim_rtdev_extents(mp, start, end, minlen, - &blocks_trimmed); + error = xfs_trim_rtdev_extents(mp, start, end, minlen); if (error) last_error = error; } @@ -722,7 +707,8 @@ xfs_ioc_trim( if (last_error) return last_error; - range.len = XFS_FSB_TO_B(mp, blocks_trimmed); + range.len = min_t(unsigned long long, range.len, + XFS_FSB_TO_B(mp, max_blocks)); if (copy_to_user(urange, &range, sizeof(range))) return -EFAULT; return 0; From 68415b349f3f16904f006275757f4fcb34b8ee43 Mon Sep 17 00:00:00 2001 From: Zizhi Wo Date: Thu, 22 Aug 2024 17:00:04 -0700 Subject: [PATCH 718/991] xfs: Fix the owner setting issue for rmap query in xfs fsmap I notice a rmap query bug in xfs_io fsmap: [root@fedora ~]# xfs_io -c 'fsmap -vvvv' /mnt EXT: DEV BLOCK-RANGE OWNER FILE-OFFSET AG AG-OFFSET TOTAL 0: 253:16 [0..7]: static fs metadata 0 (0..7) 8 1: 253:16 [8..23]: per-AG metadata 0 (8..23) 16 2: 253:16 [24..39]: inode btree 0 (24..39) 16 3: 253:16 [40..47]: per-AG metadata 0 (40..47) 8 4: 253:16 [48..55]: refcount btree 0 (48..55) 8 5: 253:16 [56..103]: per-AG metadata 0 (56..103) 48 6: 253:16 [104..127]: free space 0 (104..127) 24 ...... Bug: [root@fedora ~]# xfs_io -c 'fsmap -vvvv -d 0 3' /mnt [root@fedora ~]# Normally, we should be able to get one record, but we got nothing. The root cause of this problem lies in the incorrect setting of rm_owner in the rmap query. In the case of the initial query where the owner is not set, __xfs_getfsmap_datadev() first sets info->high.rm_owner to ULLONG_MAX. This is done to prevent any omissions when comparing rmap items. However, if the current ag is detected to be the last one, the function sets info's high_irec based on the provided key. If high->rm_owner is not specified, it should continue to be set to ULLONG_MAX; otherwise, there will be issues with interval omissions. For example, consider "start" and "end" within the same block. If high->rm_owner == 0, it will be smaller than the founded record in rmapbt, resulting in a query with no records. The main call stack is as follows: xfs_ioc_getfsmap xfs_getfsmap xfs_getfsmap_datadev_rmapbt __xfs_getfsmap_datadev info->high.rm_owner = ULLONG_MAX if (pag->pag_agno == end_ag) xfs_fsmap_owner_to_rmap // set info->high.rm_owner = 0 because fmr_owner == -1ULL dest->rm_owner = 0 // get nothing xfs_getfsmap_datadev_rmapbt_query The problem can be resolved by simply modify the xfs_fsmap_owner_to_rmap function internal logic to achieve. After applying this patch, the above problem have been solved: [root@fedora ~]# xfs_io -c 'fsmap -vvvv -d 0 3' /mnt EXT: DEV BLOCK-RANGE OWNER FILE-OFFSET AG AG-OFFSET TOTAL 0: 253:16 [0..7]: static fs metadata 0 (0..7) 8 Fixes: e89c041338ed ("xfs: implement the GETFSMAP ioctl") Signed-off-by: Zizhi Wo Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/xfs_fsmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 85dbb46452ca0b..3a30b36779db56 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -71,7 +71,7 @@ xfs_fsmap_owner_to_rmap( switch (src->fmr_owner) { case 0: /* "lowest owner id possible" */ case -1ULL: /* "highest owner id possible" */ - dest->rm_owner = 0; + dest->rm_owner = src->fmr_owner; break; case XFS_FMR_OWN_FREE: dest->rm_owner = XFS_RMAP_OWN_NULL; From 7af6c720417f21f015f46baa33e182f349ddc93b Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 15 Aug 2024 20:48:57 +0800 Subject: [PATCH 719/991] iommu/vt-d: Fix incorrect domain ID in context flush helper The helper intel_context_flush_present() is designed to flush all related caches when a context entry with the present bit set is modified. It currently retrieves the domain ID from the context entry and uses it to flush the IOTLB and context caches. This is incorrect when the context entry transitions from present to non-present, as the domain ID field is cleared before calling the helper. Fix it by passing the domain ID programmed in the context entry before the change to intel_context_flush_present(). This ensures that the correct domain ID is used for cache invalidation. Fixes: f90584f4beb8 ("iommu/vt-d: Add helper to flush caches for context change") Reported-by: Alex Williamson Closes: https://lore.kernel.org/linux-iommu/20240814162726.5efe1a6e.alex.williamson@redhat.com/ Signed-off-by: Lu Baolu Tested-by: Alex Williamson Reviewed-by: Alex Williamson Reviewed-by: Jerry Snitselaar Reviewed-by: Jacob Pan Link: https://lore.kernel.org/r/20240815124857.70038-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 8 ++++++-- drivers/iommu/intel/iommu.h | 2 +- drivers/iommu/intel/pasid.c | 7 ++++--- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 9ff8b83c19a3e2..4aa070cf56e703 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1944,6 +1944,7 @@ static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 { struct intel_iommu *iommu = info->iommu; struct context_entry *context; + u16 did; spin_lock(&iommu->lock); context = iommu_context_addr(iommu, bus, devfn, 0); @@ -1952,10 +1953,11 @@ static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 return; } + did = context_domain_id(context); context_clear_entry(context); __iommu_flush_cache(iommu, context, sizeof(*context)); spin_unlock(&iommu->lock); - intel_context_flush_present(info, context, true); + intel_context_flush_present(info, context, did, true); } static int domain_setup_first_level(struct intel_iommu *iommu, @@ -4249,6 +4251,7 @@ static int context_flip_pri(struct device_domain_info *info, bool enable) struct intel_iommu *iommu = info->iommu; u8 bus = info->bus, devfn = info->devfn; struct context_entry *context; + u16 did; spin_lock(&iommu->lock); if (context_copied(iommu, bus, devfn)) { @@ -4261,6 +4264,7 @@ static int context_flip_pri(struct device_domain_info *info, bool enable) spin_unlock(&iommu->lock); return -ENODEV; } + did = context_domain_id(context); if (enable) context_set_sm_pre(context); @@ -4269,7 +4273,7 @@ static int context_flip_pri(struct device_domain_info *info, bool enable) if (!ecap_coherent(iommu->ecap)) clflush_cache_range(context, sizeof(*context)); - intel_context_flush_present(info, context, true); + intel_context_flush_present(info, context, did, true); spin_unlock(&iommu->lock); return 0; diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index b67c14da12408b..a969be2258b1c2 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -1154,7 +1154,7 @@ void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start, void intel_context_flush_present(struct device_domain_info *info, struct context_entry *context, - bool affect_domains); + u16 did, bool affect_domains); #ifdef CONFIG_INTEL_IOMMU_SVM void intel_svm_check(struct intel_iommu *iommu); diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 5792c817cefa56..b51fc268dc845a 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -683,6 +683,7 @@ static void device_pasid_table_teardown(struct device *dev, u8 bus, u8 devfn) struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; struct context_entry *context; + u16 did; spin_lock(&iommu->lock); context = iommu_context_addr(iommu, bus, devfn, false); @@ -691,10 +692,11 @@ static void device_pasid_table_teardown(struct device *dev, u8 bus, u8 devfn) return; } + did = context_domain_id(context); context_clear_entry(context); __iommu_flush_cache(iommu, context, sizeof(*context)); spin_unlock(&iommu->lock); - intel_context_flush_present(info, context, false); + intel_context_flush_present(info, context, did, false); } static int pci_pasid_table_teardown(struct pci_dev *pdev, u16 alias, void *data) @@ -885,10 +887,9 @@ static void __context_flush_dev_iotlb(struct device_domain_info *info) */ void intel_context_flush_present(struct device_domain_info *info, struct context_entry *context, - bool flush_domains) + u16 did, bool flush_domains) { struct intel_iommu *iommu = info->iommu; - u16 did = context_domain_id(context); struct pasid_entry *pte; int i; From 996dc53ac289b81957aa70d62ccadc6986d26a87 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 22 Aug 2024 11:45:54 -0300 Subject: [PATCH 720/991] iommufd: Do not allow creating areas without READ or WRITE This results in passing 0 or just IOMMU_CACHE to iommu_map(). Most of the page table formats don't like this: amdv1 - -EINVAL armv7s - returns 0, doesn't update mapped arm-lpae - returns 0 doesn't update mapped dart - returns 0, doesn't update mapped VT-D - returns -EINVAL Unfortunately the three formats that return 0 cause serious problems: - Returning ret = but not uppdating mapped from domain->map_pages() causes an infinite loop in __iommu_map() - Not writing ioptes means that VFIO/iommufd have no way to recover them and we will have memory leaks and worse during unmap Since almost nothing can support this, and it is a useless thing to do, block it early in iommufd. Cc: stable@kernel.org Fixes: aad37e71d5c4 ("iommufd: IOCTLs for the io_pagetable") Signed-off-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/1-v1-1211e1294c27+4b1-iommu_no_prot_jgg@nvidia.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommufd/ioas.c | 8 ++++++++ tools/testing/selftests/iommu/iommufd.c | 6 +++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c index 74224827654815..157a89b993e438 100644 --- a/drivers/iommu/iommufd/ioas.c +++ b/drivers/iommu/iommufd/ioas.c @@ -213,6 +213,10 @@ int iommufd_ioas_map(struct iommufd_ucmd *ucmd) if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) return -EOVERFLOW; + if (!(cmd->flags & + (IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE))) + return -EINVAL; + ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id); if (IS_ERR(ioas)) return PTR_ERR(ioas); @@ -253,6 +257,10 @@ int iommufd_ioas_copy(struct iommufd_ucmd *ucmd) cmd->dst_iova >= ULONG_MAX) return -EOVERFLOW; + if (!(cmd->flags & + (IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE))) + return -EINVAL; + src_ioas = iommufd_get_ioas(ucmd->ictx, cmd->src_ioas_id); if (IS_ERR(src_ioas)) return PTR_ERR(src_ioas); diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 6343f4053bd46e..4927b9add5add9 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -825,7 +825,7 @@ TEST_F(iommufd_ioas, copy_area) { struct iommu_ioas_copy copy_cmd = { .size = sizeof(copy_cmd), - .flags = IOMMU_IOAS_MAP_FIXED_IOVA, + .flags = IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE, .dst_ioas_id = self->ioas_id, .src_ioas_id = self->ioas_id, .length = PAGE_SIZE, @@ -1318,7 +1318,7 @@ TEST_F(iommufd_ioas, copy_sweep) { struct iommu_ioas_copy copy_cmd = { .size = sizeof(copy_cmd), - .flags = IOMMU_IOAS_MAP_FIXED_IOVA, + .flags = IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE, .src_ioas_id = self->ioas_id, .dst_iova = MOCK_APERTURE_START, .length = MOCK_PAGE_SIZE, @@ -1608,7 +1608,7 @@ TEST_F(iommufd_mock_domain, user_copy) }; struct iommu_ioas_copy copy_cmd = { .size = sizeof(copy_cmd), - .flags = IOMMU_IOAS_MAP_FIXED_IOVA, + .flags = IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE, .dst_ioas_id = self->ioas_id, .dst_iova = MOCK_APERTURE_START, .length = BUFFER_SIZE, From 6093cd582f8e027117a8d4ad5d129a1aacdc53d2 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 22 Aug 2024 11:45:55 -0300 Subject: [PATCH 721/991] iommu: Do not return 0 from map_pages if it doesn't do anything These three implementations of map_pages() all succeed if a mapping is requested with no read or write. Since they return back to __iommu_map() leaving the mapped output as 0 it triggers an infinite loop. Therefore nothing is using no-access protection bits. Further, VFIO and iommufd rely on iommu_iova_to_phys() to get back PFNs stored by map, if iommu_map() succeeds but iommu_iova_to_phys() fails that will create serious bugs. Thus remove this never used "nothing to do" concept and just fail map immediately. Fixes: e5fc9753b1a8 ("iommu/io-pgtable: Add ARMv7 short descriptor support") Fixes: e1d3c0fd701d ("iommu: add ARM LPAE page table allocator") Fixes: 745ef1092bcf ("iommu/io-pgtable: Move Apple DART support to its own file") Signed-off-by: Jason Gunthorpe Acked-by: Will Deacon Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/2-v1-1211e1294c27+4b1-iommu_no_prot_jgg@nvidia.com Signed-off-by: Joerg Roedel --- drivers/iommu/io-pgtable-arm-v7s.c | 3 +-- drivers/iommu/io-pgtable-arm.c | 3 +-- drivers/iommu/io-pgtable-dart.c | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index 75f244a3e12df6..06ffc683b28fee 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -552,9 +552,8 @@ static int arm_v7s_map_pages(struct io_pgtable_ops *ops, unsigned long iova, paddr >= (1ULL << data->iop.cfg.oas))) return -ERANGE; - /* If no access, then nothing to do */ if (!(prot & (IOMMU_READ | IOMMU_WRITE))) - return 0; + return -EINVAL; while (pgcount--) { ret = __arm_v7s_map(data, iova, paddr, pgsize, prot, 1, data->pgd, diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index f5d9fd1f45bf49..ff4149ae1751d4 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -515,9 +515,8 @@ static int arm_lpae_map_pages(struct io_pgtable_ops *ops, unsigned long iova, if (WARN_ON(iaext || paddr >> cfg->oas)) return -ERANGE; - /* If no access, then nothing to do */ if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE))) - return 0; + return -EINVAL; prot = arm_lpae_prot_to_pte(data, iommu_prot); ret = __arm_lpae_map(data, iova, paddr, pgsize, pgcount, prot, lvl, diff --git a/drivers/iommu/io-pgtable-dart.c b/drivers/iommu/io-pgtable-dart.c index ad28031e1e93d6..c004640640ee50 100644 --- a/drivers/iommu/io-pgtable-dart.c +++ b/drivers/iommu/io-pgtable-dart.c @@ -245,9 +245,8 @@ static int dart_map_pages(struct io_pgtable_ops *ops, unsigned long iova, if (WARN_ON(paddr >> cfg->oas)) return -ERANGE; - /* If no access, then nothing to do */ if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE))) - return 0; + return -EINVAL; tbl = dart_get_table(data, iova); From 51eeef9a482bcb00f6f75eda4de9bd013092b76f Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 23 Aug 2024 17:54:54 +0100 Subject: [PATCH 722/991] MAINTAINERS: Add Jean-Philippe as SMMUv3 SVA reviewer Add Jean-Philippe as a reviewer for the Arm SMMUv3 SVA support, since he's been a consistent contributor to that code over the years and understands the relevant parts of the architecture much better than me. Cc: Robin Murphy Cc: Jean-Philippe Brucker Cc: Mostafa Saleh Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20240823165454.1064-1-will@kernel.org Signed-off-by: Joerg Roedel --- MAINTAINERS | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index f328373463b0d2..445cec2ae4ed67 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1880,6 +1880,10 @@ F: Documentation/devicetree/bindings/iommu/arm,smmu* F: drivers/iommu/arm/ F: drivers/iommu/io-pgtable-arm* +ARM SMMU SVA SUPPORT +R: Jean-Philippe Brucker +F: drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c + ARM SUB-ARCHITECTURES L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained From 28f5df210d06beb5920cf80446f1c27456c14b92 Mon Sep 17 00:00:00 2001 From: Yann Droneaud Date: Sun, 25 Aug 2024 16:47:50 +0200 Subject: [PATCH 723/991] random: vDSO: reject unknown getrandom() flags Like the getrandom() syscall, vDSO getrandom() must also reject unknown flags. [1] It would be possible to return -EINVAL from vDSO itself, but in the possible case that a new flag is added to getrandom() syscall in the future, it would be easier to get the behavior from the syscall, instead of erroring until the vDSO is extended to support the new flag or explicitly falling back. [1] Designing the API: Planning for Extension https://docs.kernel.org/process/adding-syscalls.html#designing-the-api-planning-for-extension Signed-off-by: Yann Droneaud [Jason: reworded commit message] Signed-off-by: Jason A. Donenfeld --- lib/vdso/getrandom.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/vdso/getrandom.c b/lib/vdso/getrandom.c index b230f0b10832fd..e1db228bc4f0d1 100644 --- a/lib/vdso/getrandom.c +++ b/lib/vdso/getrandom.c @@ -85,6 +85,10 @@ __cvdso_getrandom_data(const struct vdso_rng_data *rng_info, void *buffer, size_ if (unlikely(((unsigned long)opaque_state & ~PAGE_MASK) + sizeof(*state) > PAGE_SIZE)) return -EFAULT; + /* Handle unexpected flags by falling back to the kernel. */ + if (unlikely(flags & ~(GRND_NONBLOCK | GRND_RANDOM | GRND_INSECURE))) + goto fallback_syscall; + /* If the caller passes the wrong size, which might happen due to CRIU, fallback. */ if (unlikely(opaque_len != sizeof(*state))) goto fallback_syscall; From 2dc43c5e212036458ed7c5586fb82ee183fee504 Mon Sep 17 00:00:00 2001 From: Hendrik Borghorst Date: Sun, 25 Aug 2024 19:43:47 +0200 Subject: [PATCH 724/991] ALSA: hda/realtek: support HP Pavilion Aero 13-bg0xxx Mute LED This patch adds the HP Pavilion Aero 13 (13-bg0xxx) (year 2024) to list of quirks for keyboard LED mute indication. The laptop has two LEDs (one for speaker and one for mic mute). The pre-existing quirk ALC245_FIXUP_HP_X360_MUTE_LEDS chains both the quirk for mic and speaker mute. Tested on 6.11.0-rc4 with the aforementioned laptop. Signed-off-by: Hendrik Borghorst Cc: Link: https://patch.msgid.link/20240825174351.5687-1-hendrikborghorst@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index c04eac6a5064bc..588738ce7380cc 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10380,6 +10380,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8ca2, "HP ZBook Power", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8ca4, "HP ZBook Fury", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8ca7, "HP ZBook Fury", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8cbd, "HP Pavilion Aero Laptop 13-bg0xxx", ALC245_FIXUP_HP_X360_MUTE_LEDS), SND_PCI_QUIRK(0x103c, 0x8cdd, "HP Spectre", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x8cde, "HP Spectre", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x8cdf, "HP SnowWhite", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), From 28b329f431cef840fddd9a9b493bc3eff1aa06c0 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Mon, 26 Aug 2024 10:49:40 +0100 Subject: [PATCH 725/991] ALSA: hda: hda_component: Fix mutex crash if nothing ever binds Move the initialization of parent->mutex into hda_component_manager_init() so that it is always valid. In hda_component_manager_bind() do not clear the parent information. Only zero-fill the per-component data ready for it to be filled in by the components as they bind. Previously parent->mutex was being initialized only in hda_component_manager_bind(). This meant that it was only initialized if all components appeared and there was a bind callback. If there wasn't a bind the mutex object was not valid when the Realtek driver called any of the other functions. Signed-off-by: Richard Fitzgerald Fixes: 047b9cbbaa8e ("ALSA: hda: hda_component: Protect shared data with a mutex") Link: https://patch.msgid.link/20240826094940.45563-1-rf@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_component.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/hda_component.c b/sound/pci/hda/hda_component.c index 7b19cb38b4e025..b7dfdb10d15672 100644 --- a/sound/pci/hda/hda_component.c +++ b/sound/pci/hda/hda_component.c @@ -141,8 +141,7 @@ int hda_component_manager_bind(struct hda_codec *cdc, int ret; /* Init shared and component specific data */ - memset(parent, 0, sizeof(*parent)); - mutex_init(&parent->mutex); + memset(parent->comps, 0, sizeof(parent->comps)); parent->codec = cdc; mutex_lock(&parent->mutex); @@ -164,6 +163,8 @@ int hda_component_manager_init(struct hda_codec *cdc, struct hda_scodec_match *sm; int ret, i; + mutex_init(&parent->mutex); + for (i = 0; i < count; i++) { sm = devm_kmalloc(dev, sizeof(*sm), GFP_KERNEL); if (!sm) From 5fd0628918977a0afdc2e6bc562d8751b5d3b8c5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 26 Aug 2024 12:45:22 +0200 Subject: [PATCH 726/991] netfilter: nf_tables: restore IP sanity checks for netdev/egress Subtract network offset to skb->len before performing IPv4 header sanity checks, then adjust transport offset from offset from mac header. Jorge Ortiz says: When small UDP packets (< 4 bytes payload) are sent from eth0, `meta l4proto udp` condition is not met because `NFT_PKTINFO_L4PROTO` is not set. This happens because there is a comparison that checks if the transport header offset exceeds the total length. This comparison does not take into account the fact that the skb network offset might be non-zero in egress mode (e.g., 14 bytes for Ethernet header). Fixes: 0ae8e4cca787 ("netfilter: nf_tables: set transport offset from mac header for netdev/egress") Reported-by: Jorge Ortiz Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_ipv4.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h index 60a7d0ce308046..fcf967286e37cb 100644 --- a/include/net/netfilter/nf_tables_ipv4.h +++ b/include/net/netfilter/nf_tables_ipv4.h @@ -19,7 +19,7 @@ static inline void nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt) static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) { struct iphdr *iph, _iph; - u32 len, thoff; + u32 len, thoff, skb_len; iph = skb_header_pointer(pkt->skb, skb_network_offset(pkt->skb), sizeof(*iph), &_iph); @@ -30,8 +30,10 @@ static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) return -1; len = iph_totlen(pkt->skb, iph); - thoff = skb_network_offset(pkt->skb) + (iph->ihl * 4); - if (pkt->skb->len < len) + thoff = iph->ihl * 4; + skb_len = pkt->skb->len - skb_network_offset(pkt->skb); + + if (skb_len < len) return -1; else if (len < thoff) return -1; @@ -40,7 +42,7 @@ static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = iph->protocol; - pkt->thoff = thoff; + pkt->thoff = skb_network_offset(pkt->skb) + thoff; pkt->fragoff = ntohs(iph->frag_off) & IP_OFFSET; return 0; From 9286dfd5735b9cceb6a14bdf15e13400ccb60fe7 Mon Sep 17 00:00:00 2001 From: Mathieu Fenniak Date: Fri, 23 Aug 2024 15:56:28 +0200 Subject: [PATCH 727/991] platform/x86: asus-wmi: Fix spurious rfkill on UX8406MA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Asus Zenbook Duo (UX8406MA) has a keyboard which can be placed on the laptop to connect it via USB, or can be removed from the laptop to reveal a hidden secondary display in which case the keyboard operates via Bluetooth. When it is placed on the secondary display to connect via USB, it emits a keypress for a wireless disable. This causes the rfkill system to be activated disconnecting the current wifi connection, which doesn't reflect the user's true intention. Detect this hardware and suppress any wireless switches from the keyboard; this keyboard does not have a wireless toggle capability so these presses are always spurious. Signed-off-by: Mathieu Fenniak Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20240823135630.128447-1-mathieu@fenniak.net Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/asus-nb-wmi.c | 20 +++++++++++++++++++- drivers/platform/x86/asus-wmi.h | 1 + 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/asus-nb-wmi.c b/drivers/platform/x86/asus-nb-wmi.c index fceffe2082ec58..ed3633c5955d9f 100644 --- a/drivers/platform/x86/asus-nb-wmi.c +++ b/drivers/platform/x86/asus-nb-wmi.c @@ -145,6 +145,10 @@ static struct quirk_entry quirk_asus_ignore_fan = { .wmi_ignore_fan = true, }; +static struct quirk_entry quirk_asus_zenbook_duo_kbd = { + .ignore_key_wlan = true, +}; + static int dmi_matched(const struct dmi_system_id *dmi) { pr_info("Identified laptop model '%s'\n", dmi->ident); @@ -516,6 +520,15 @@ static const struct dmi_system_id asus_quirks[] = { }, .driver_data = &quirk_asus_ignore_fan, }, + { + .callback = dmi_matched, + .ident = "ASUS Zenbook Duo UX8406MA", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_PRODUCT_NAME, "UX8406MA"), + }, + .driver_data = &quirk_asus_zenbook_duo_kbd, + }, {}, }; @@ -630,7 +643,12 @@ static void asus_nb_wmi_key_filter(struct asus_wmi_driver *asus_wmi, int *code, case 0x32: /* Volume Mute */ if (atkbd_reports_vol_keys) *code = ASUS_WMI_KEY_IGNORE; - + break; + case 0x5D: /* Wireless console Toggle */ + case 0x5E: /* Wireless console Enable */ + case 0x5F: /* Wireless console Disable */ + if (quirks->ignore_key_wlan) + *code = ASUS_WMI_KEY_IGNORE; break; } } diff --git a/drivers/platform/x86/asus-wmi.h b/drivers/platform/x86/asus-wmi.h index cc30f185384723..d02f15fd3482fa 100644 --- a/drivers/platform/x86/asus-wmi.h +++ b/drivers/platform/x86/asus-wmi.h @@ -40,6 +40,7 @@ struct quirk_entry { bool wmi_force_als_set; bool wmi_ignore_fan; bool filter_i8042_e1_extended_codes; + bool ignore_key_wlan; enum asus_wmi_tablet_switch_mode tablet_switch_mode; int wapf; /* From a3379eca24a7da5118a7d090da6f8eb8611acac8 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 25 Aug 2024 15:24:15 +0200 Subject: [PATCH 728/991] platform/x86: x86-android-tablets: Make Lenovo Yoga Tab 3 X90F DMI match less strict MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are 2G and 4G RAM versions of the Lenovo Yoga Tab 3 X90F and it turns out that the 2G version has a DMI product name of "CHERRYVIEW D1 PLATFORM" where as the 4G version has "CHERRYVIEW C0 PLATFORM". The sys-vendor + product-version check are unique enough that the product-name check is not necessary. Drop the product-name check so that the existing DMI match for the 4G RAM version also matches the 2G RAM version. Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240825132415.8307-1-hdegoede@redhat.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/x86-android-tablets/dmi.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/platform/x86/x86-android-tablets/dmi.c b/drivers/platform/x86/x86-android-tablets/dmi.c index 141a2d25e83be6..387dd092c4dd01 100644 --- a/drivers/platform/x86/x86-android-tablets/dmi.c +++ b/drivers/platform/x86/x86-android-tablets/dmi.c @@ -140,7 +140,6 @@ const struct dmi_system_id x86_android_tablet_ids[] __initconst = { /* Lenovo Yoga Tab 3 Pro YT3-X90F */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Intel Corporation"), - DMI_MATCH(DMI_PRODUCT_NAME, "CHERRYVIEW D1 PLATFORM"), DMI_MATCH(DMI_PRODUCT_VERSION, "Blade3-10A-001"), }, .driver_data = (void *)&lenovo_yt3_info, From 052f3951640fd96d2e777b3272a925ec6c0c8100 Mon Sep 17 00:00:00 2001 From: Ryan Sullivan Date: Thu, 22 Aug 2024 13:31:22 -0400 Subject: [PATCH 729/991] selftests/livepatch: wait for atomic replace to occur On some machines with a large number of CPUs there is a sizable delay between an atomic replace occurring and when sysfs updates accordingly. This fix uses 'loop_until' to wait for the atomic replace to unload all previous livepatches. Reported-by: CKI Project Closes: https://datawarehouse.cki-project.org/kcidb/tests/redhat:1413102084-x86_64-kernel_upt_28 Signed-off-by: Ryan Sullivan Reviewed-by: Petr Mladek Acked-by: Joe Lawrence Link: https://lore.kernel.org/r/20240822173122.14760-1-rysulliv@redhat.com Signed-off-by: Petr Mladek --- tools/testing/selftests/livepatch/test-livepatch.sh | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/livepatch/test-livepatch.sh b/tools/testing/selftests/livepatch/test-livepatch.sh index 65c9c058458de8..bd13257bfdfe51 100755 --- a/tools/testing/selftests/livepatch/test-livepatch.sh +++ b/tools/testing/selftests/livepatch/test-livepatch.sh @@ -139,11 +139,8 @@ load_lp $MOD_REPLACE replace=1 grep 'live patched' /proc/cmdline > /dev/kmsg grep 'live patched' /proc/meminfo > /dev/kmsg -mods=(/sys/kernel/livepatch/*) -nmods=${#mods[@]} -if [ "$nmods" -ne 1 ]; then - die "Expecting only one moduled listed, found $nmods" -fi +loop_until 'mods=(/sys/kernel/livepatch/*); nmods=${#mods[@]}; [[ "$nmods" -eq 1 ]]' || + die "Expecting only one moduled listed, found $nmods" # These modules were disabled by the atomic replace for mod in $MOD_LIVEPATCH3 $MOD_LIVEPATCH2 $MOD_LIVEPATCH1; do From 4186c8d9e6af57bab0687b299df10ebd47534a0a Mon Sep 17 00:00:00 2001 From: Jacky Chou Date: Thu, 22 Aug 2024 15:30:06 +0800 Subject: [PATCH 730/991] net: ftgmac100: Ensure tx descriptor updates are visible The driver must ensure TX descriptor updates are visible before updating TX pointer and TX clear pointer. This resolves TX hangs observed on AST2600 when running iperf3. Signed-off-by: Jacky Chou Signed-off-by: David S. Miller --- drivers/net/ethernet/faraday/ftgmac100.c | 26 ++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c index fddfd1dd50709f..4c546c3aef0fec 100644 --- a/drivers/net/ethernet/faraday/ftgmac100.c +++ b/drivers/net/ethernet/faraday/ftgmac100.c @@ -572,7 +572,7 @@ static bool ftgmac100_rx_packet(struct ftgmac100 *priv, int *processed) (*processed)++; return true; - drop: +drop: /* Clean rxdes0 (which resets own bit) */ rxdes->rxdes0 = cpu_to_le32(status & priv->rxdes0_edorr_mask); priv->rx_pointer = ftgmac100_next_rx_pointer(priv, pointer); @@ -656,6 +656,11 @@ static bool ftgmac100_tx_complete_packet(struct ftgmac100 *priv) ftgmac100_free_tx_packet(priv, pointer, skb, txdes, ctl_stat); txdes->txdes0 = cpu_to_le32(ctl_stat & priv->txdes0_edotr_mask); + /* Ensure the descriptor config is visible before setting the tx + * pointer. + */ + smp_wmb(); + priv->tx_clean_pointer = ftgmac100_next_tx_pointer(priv, pointer); return true; @@ -809,6 +814,11 @@ static netdev_tx_t ftgmac100_hard_start_xmit(struct sk_buff *skb, dma_wmb(); first->txdes0 = cpu_to_le32(f_ctl_stat); + /* Ensure the descriptor config is visible before setting the tx + * pointer. + */ + smp_wmb(); + /* Update next TX pointer */ priv->tx_pointer = pointer; @@ -829,7 +839,7 @@ static netdev_tx_t ftgmac100_hard_start_xmit(struct sk_buff *skb, return NETDEV_TX_OK; - dma_err: +dma_err: if (net_ratelimit()) netdev_err(netdev, "map tx fragment failed\n"); @@ -851,7 +861,7 @@ static netdev_tx_t ftgmac100_hard_start_xmit(struct sk_buff *skb, * last fragment, so we know ftgmac100_free_tx_packet() * hasn't freed the skb yet. */ - drop: +drop: /* Drop the packet */ dev_kfree_skb_any(skb); netdev->stats.tx_dropped++; @@ -1344,7 +1354,7 @@ static void ftgmac100_reset(struct ftgmac100 *priv) ftgmac100_init_all(priv, true); netdev_dbg(netdev, "Reset done !\n"); - bail: +bail: if (priv->mii_bus) mutex_unlock(&priv->mii_bus->mdio_lock); if (netdev->phydev) @@ -1543,15 +1553,15 @@ static int ftgmac100_open(struct net_device *netdev) return 0; - err_ncsi: +err_ncsi: napi_disable(&priv->napi); netif_stop_queue(netdev); - err_alloc: +err_alloc: ftgmac100_free_buffers(priv); free_irq(netdev->irq, netdev); - err_irq: +err_irq: netif_napi_del(&priv->napi); - err_hw: +err_hw: iowrite32(0, priv->base + FTGMAC100_OFFSET_IER); ftgmac100_free_rings(priv); return err; From 274ea3563e5ab9f468c15bfb9d2492803a66d9be Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Mon, 26 Aug 2024 23:11:32 +0800 Subject: [PATCH 731/991] LoongArch: Define ARCH_IRQ_INIT_FLAGS as IRQ_NOPROBE Currently we call irq_set_noprobe() in a loop for all IRQs, but indeed it only works for IRQs below NR_IRQS_LEGACY because at init_IRQ() only legacy interrupts have been allocated. Instead, we can define ARCH_IRQ_INIT_FLAGS as IRQ_NOPROBE in asm/hwirq.h and the core will automatically set the flag for all interrupts. Reviewed-by: Thomas Gleixner Signed-off-by: Huacai Chen Signed-off-by: Tianyang Zhang --- arch/loongarch/include/asm/hw_irq.h | 2 ++ arch/loongarch/kernel/irq.c | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/loongarch/include/asm/hw_irq.h b/arch/loongarch/include/asm/hw_irq.h index af4f4e8fbd858f..8156ffb6741591 100644 --- a/arch/loongarch/include/asm/hw_irq.h +++ b/arch/loongarch/include/asm/hw_irq.h @@ -9,6 +9,8 @@ extern atomic_t irq_err_count; +#define ARCH_IRQ_INIT_FLAGS IRQ_NOPROBE + /* * interrupt-retrigger: NOP for now. This may not be appropriate for all * machines, we'll see ... diff --git a/arch/loongarch/kernel/irq.c b/arch/loongarch/kernel/irq.c index f4991c03514f48..adac8fcbb2aca4 100644 --- a/arch/loongarch/kernel/irq.c +++ b/arch/loongarch/kernel/irq.c @@ -102,9 +102,6 @@ void __init init_IRQ(void) mp_ops.init_ipi(); #endif - for (i = 0; i < NR_IRQS; i++) - irq_set_noprobe(i); - for_each_possible_cpu(i) { page = alloc_pages_node(cpu_to_node(i), GFP_KERNEL, order); From 80376323e2b6a4559f86b2b4d864848ac25cb054 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Mon, 26 Aug 2024 23:11:32 +0800 Subject: [PATCH 732/991] LoongArch: Add ifdefs to fix LSX and LASX related warnings There exist some warnings when building kernel if CONFIG_CPU_HAS_LBT is set but CONFIG_CPU_HAS_LSX and CONFIG_CPU_HAS_LASX are not set. In this case, there are no definitions of _restore_lsx & _restore_lasx and there are also no definitions of kvm_restore_lsx & kvm_restore_lasx in fpu.S and switch.S respectively, just add some ifdefs to fix these warnings. AS arch/loongarch/kernel/fpu.o arch/loongarch/kernel/fpu.o: warning: objtool: unexpected relocation symbol type in .rela.discard.func_stack_frame_non_standard: 0 arch/loongarch/kernel/fpu.o: warning: objtool: unexpected relocation symbol type in .rela.discard.func_stack_frame_non_standard: 0 AS [M] arch/loongarch/kvm/switch.o arch/loongarch/kvm/switch.o: warning: objtool: unexpected relocation symbol type in .rela.discard.func_stack_frame_non_standard: 0 arch/loongarch/kvm/switch.o: warning: objtool: unexpected relocation symbol type in .rela.discard.func_stack_frame_non_standard: 0 MODPOST Module.symvers ERROR: modpost: "kvm_restore_lsx" [arch/loongarch/kvm/kvm.ko] undefined! ERROR: modpost: "kvm_restore_lasx" [arch/loongarch/kvm/kvm.ko] undefined! Cc: stable@vger.kernel.org # 6.9+ Fixes: cb8a2ef0848c ("LoongArch: Add ORC stack unwinder support") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202408120955.qls5oNQY-lkp@intel.com/ Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- arch/loongarch/kernel/fpu.S | 4 ++++ arch/loongarch/kvm/switch.S | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/arch/loongarch/kernel/fpu.S b/arch/loongarch/kernel/fpu.S index 69a85f2479fba1..6ab640101457cc 100644 --- a/arch/loongarch/kernel/fpu.S +++ b/arch/loongarch/kernel/fpu.S @@ -530,6 +530,10 @@ SYM_FUNC_END(_restore_lasx_context) #ifdef CONFIG_CPU_HAS_LBT STACK_FRAME_NON_STANDARD _restore_fp +#ifdef CONFIG_CPU_HAS_LSX STACK_FRAME_NON_STANDARD _restore_lsx +#endif +#ifdef CONFIG_CPU_HAS_LASX STACK_FRAME_NON_STANDARD _restore_lasx #endif +#endif diff --git a/arch/loongarch/kvm/switch.S b/arch/loongarch/kvm/switch.S index 80e988985a6adf..0c292f81849277 100644 --- a/arch/loongarch/kvm/switch.S +++ b/arch/loongarch/kvm/switch.S @@ -277,6 +277,10 @@ SYM_DATA(kvm_enter_guest_size, .quad kvm_enter_guest_end - kvm_enter_guest) #ifdef CONFIG_CPU_HAS_LBT STACK_FRAME_NON_STANDARD kvm_restore_fpu +#ifdef CONFIG_CPU_HAS_LSX STACK_FRAME_NON_STANDARD kvm_restore_lsx +#endif +#ifdef CONFIG_CPU_HAS_LASX STACK_FRAME_NON_STANDARD kvm_restore_lasx #endif +#endif From 4956e07f05e239b274d042618a250c9fa3e92629 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Mon, 26 Aug 2024 23:11:32 +0800 Subject: [PATCH 733/991] LoongArch: KVM: Invalidate guest steal time address on vCPU reset If ParaVirt steal time feature is enabled, there is a percpu gpa address passed from guest vCPU and host modifies guest memory space with this gpa address. When vCPU is reset normally, it will notify host and invalidate gpa address. However if VM is crashed and VMM reboots VM forcely, the vCPU reboot notification callback will not be called in VM. Host needs invalidate the gpa address, else host will modify guest memory during VM reboots. Here it is invalidated from the vCPU KVM_REG_LOONGARCH_VCPU_RESET ioctl interface. Also funciton kvm_reset_timer() is removed at vCPU reset stage, since SW emulated timer is only used in vCPU block state. When a vCPU is removed from the block waiting queue, kvm_restore_timer() is called and SW timer is cancelled. And the timer register is also cleared at VMM when a vCPU is reset. Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/kvm_vcpu.h | 1 - arch/loongarch/kvm/timer.c | 7 ------- arch/loongarch/kvm/vcpu.c | 2 +- 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/arch/loongarch/include/asm/kvm_vcpu.h b/arch/loongarch/include/asm/kvm_vcpu.h index c416cb7125c0e5..86570084e05aa8 100644 --- a/arch/loongarch/include/asm/kvm_vcpu.h +++ b/arch/loongarch/include/asm/kvm_vcpu.h @@ -76,7 +76,6 @@ static inline void kvm_restore_lasx(struct loongarch_fpu *fpu) { } #endif void kvm_init_timer(struct kvm_vcpu *vcpu, unsigned long hz); -void kvm_reset_timer(struct kvm_vcpu *vcpu); void kvm_save_timer(struct kvm_vcpu *vcpu); void kvm_restore_timer(struct kvm_vcpu *vcpu); diff --git a/arch/loongarch/kvm/timer.c b/arch/loongarch/kvm/timer.c index bcc6b6d063d914..74a4b5c272d60e 100644 --- a/arch/loongarch/kvm/timer.c +++ b/arch/loongarch/kvm/timer.c @@ -188,10 +188,3 @@ void kvm_save_timer(struct kvm_vcpu *vcpu) kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ESTAT); preempt_enable(); } - -void kvm_reset_timer(struct kvm_vcpu *vcpu) -{ - write_gcsr_timercfg(0); - kvm_write_sw_gcsr(vcpu->arch.csr, LOONGARCH_CSR_TCFG, 0); - hrtimer_cancel(&vcpu->arch.swtimer); -} diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 16756ffb55e860..6905283f535b96 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -647,7 +647,7 @@ static int kvm_set_one_reg(struct kvm_vcpu *vcpu, vcpu->kvm->arch.time_offset = (signed long)(v - drdtime()); break; case KVM_REG_LOONGARCH_VCPU_RESET: - kvm_reset_timer(vcpu); + vcpu->arch.st.guest_addr = 0; memset(&vcpu->arch.irq_pending, 0, sizeof(vcpu->arch.irq_pending)); memset(&vcpu->arch.irq_clear, 0, sizeof(vcpu->arch.irq_clear)); break; From 3e9b4021fedf92c11233ae1a8615327d0cbbecd5 Mon Sep 17 00:00:00 2001 From: Daniel Gabay Date: Fri, 23 Aug 2024 10:55:46 +0200 Subject: [PATCH 734/991] wifi: mac80211: fix beacon SSID mismatch handling Return false when memcmp with zero_ssid returns 0 to correctly handle hidden SSIDs case. Fixes: 9cc88678db5b ("wifi: mac80211: check SSID in beacon") Reviewed-by: Andrei Otcheretianski Reviewed-by: Miriam Rachel Korenblit Signed-off-by: Daniel Gabay Link: https://patch.msgid.link/20240823105546.7ab29ae287a6.I7f98e57e1ab6597614703fdd138cc88ad253d986@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 4779a18ab75d8c..f9526bbc363371 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -6664,7 +6664,7 @@ static bool ieee80211_mgd_ssid_mismatch(struct ieee80211_sub_if_data *sdata, return true; /* hidden SSID: zeroed out */ - if (memcmp(elems->ssid, zero_ssid, elems->ssid_len)) + if (!memcmp(elems->ssid, zero_ssid, elems->ssid_len)) return false; return memcmp(elems->ssid, cfg->ssid, cfg->ssid_len); From cb347bd29d0d106213a0cf4f86b72dffd08d3454 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Sun, 25 Aug 2024 19:17:02 +0300 Subject: [PATCH 735/991] wifi: iwlwifi: mvm: fix hibernation Fast resume is a feature that was recently introduced to speed up the resume time. It basically keeps the firmware alive while the system is suspended and that avoids starting again the whole device. This flow can't work for hibernation, since when the system boots, before the frozen image is loaded, the kernel may touch the device. As a result, we can't assume the device is in the exact same state as before the hibernation. Detect that we are resuming from hibernation through the PCI device and forbid the fast resume flow. We also need to shut down the device cleanly when that happens. In addition, in case the device is power gated during S3, we won't be able to keep the device alive. Detect this situation with BE200 at least with the help of the CSR_FUNC_SCRATCH register and reset the device upon resume if it was power gated during S3. Fixes: e8bb19c1d590 ("wifi: iwlwifi: support fast resume") Signed-off-by: Emmanuel Grumbach Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20240825191257.24eb3b19e74f.I3837810318dbef0a0a773cf4c4fcf89cdc6fdbd3@changeid Signed-off-by: Johannes Berg --- .../net/wireless/intel/iwlwifi/iwl-op-mode.h | 12 ++++++ drivers/net/wireless/intel/iwlwifi/mvm/d3.c | 10 +++++ drivers/net/wireless/intel/iwlwifi/mvm/ops.c | 17 +++++++- drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 41 +++++++++++++++++-- 4 files changed, 76 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-op-mode.h b/drivers/net/wireless/intel/iwlwifi/iwl-op-mode.h index 595fa6ddf0843b..8ef5ed2db05177 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-op-mode.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-op-mode.h @@ -85,6 +85,10 @@ struct iwl_cfg; * May sleep * @wimax_active: invoked when WiMax becomes active. May sleep * @time_point: called when transport layer wants to collect debug data + * @device_powered_off: called upon resume from hibernation but not only. + * Op_mode needs to reset its internal state because the device did not + * survive the system state transition. The firmware is no longer running, + * etc... */ struct iwl_op_mode_ops { struct iwl_op_mode *(*start)(struct iwl_trans *trans, @@ -107,6 +111,7 @@ struct iwl_op_mode_ops { void (*time_point)(struct iwl_op_mode *op_mode, enum iwl_fw_ini_time_point tp_id, union iwl_dbg_tlv_tp_data *tp_data); + void (*device_powered_off)(struct iwl_op_mode *op_mode); }; int iwl_opmode_register(const char *name, const struct iwl_op_mode_ops *ops); @@ -204,4 +209,11 @@ static inline void iwl_op_mode_time_point(struct iwl_op_mode *op_mode, op_mode->ops->time_point(op_mode, tp_id, tp_data); } +static inline void iwl_op_mode_device_powered_off(struct iwl_op_mode *op_mode) +{ + if (!op_mode || !op_mode->ops || !op_mode->ops->device_powered_off) + return; + op_mode->ops->device_powered_off(op_mode); +} + #endif /* __iwl_op_mode_h__ */ diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c index b4d650583ac27e..99a541d442bb18 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c @@ -3439,6 +3439,16 @@ static int __iwl_mvm_resume(struct iwl_mvm *mvm, bool test) mutex_lock(&mvm->mutex); + /* Apparently, the device went away and device_powered_off() was called, + * don't even try to read the rt_status, the device is currently + * inaccessible. + */ + if (!test_bit(IWL_MVM_STATUS_IN_D3, &mvm->status)) { + IWL_INFO(mvm, + "Can't resume, device_powered_off() was called during wowlan\n"); + goto err; + } + mvm->last_reset_or_resume_time_jiffies = jiffies; /* get the BSS vif pointer again */ diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c index b7dcae76a05dfc..75fc60a4808cb5 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c @@ -2090,6 +2090,20 @@ static void iwl_op_mode_mvm_time_point(struct iwl_op_mode *op_mode, iwl_dbg_tlv_time_point(&mvm->fwrt, tp_id, tp_data); } +static void iwl_op_mode_mvm_device_powered_off(struct iwl_op_mode *op_mode) +{ + struct iwl_mvm *mvm = IWL_OP_MODE_GET_MVM(op_mode); + + mutex_lock(&mvm->mutex); + clear_bit(IWL_MVM_STATUS_IN_D3, &mvm->status); + mvm->trans->system_pm_mode = IWL_PLAT_PM_MODE_DISABLED; + iwl_mvm_stop_device(mvm); +#ifdef CONFIG_PM + mvm->fast_resume = false; +#endif + mutex_unlock(&mvm->mutex); +} + #define IWL_MVM_COMMON_OPS \ /* these could be differentiated */ \ .queue_full = iwl_mvm_stop_sw_queue, \ @@ -2102,7 +2116,8 @@ static void iwl_op_mode_mvm_time_point(struct iwl_op_mode *op_mode, /* as we only register one, these MUST be common! */ \ .start = iwl_op_mode_mvm_start, \ .stop = iwl_op_mode_mvm_stop, \ - .time_point = iwl_op_mode_mvm_time_point + .time_point = iwl_op_mode_mvm_time_point, \ + .device_powered_off = iwl_op_mode_mvm_device_powered_off static const struct iwl_op_mode_ops iwl_mvm_ops = { IWL_MVM_COMMON_OPS, diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index 9ad43464b702be..84fd93278450b0 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -1577,11 +1577,12 @@ static int iwl_pci_suspend(struct device *device) return 0; } -static int iwl_pci_resume(struct device *device) +static int _iwl_pci_resume(struct device *device, bool restore) { struct pci_dev *pdev = to_pci_dev(device); struct iwl_trans *trans = pci_get_drvdata(pdev); struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); + bool device_was_powered_off = false; /* Before you put code here, think about WoWLAN. You cannot check here * whether WoWLAN is enabled or not, and your code will run even if @@ -1597,6 +1598,26 @@ static int iwl_pci_resume(struct device *device) if (!trans->op_mode) return 0; + /* + * Scratch value was altered, this means the device was powered off, we + * need to reset it completely. + * Note: MAC (bits 0:7) will be cleared upon suspend even with wowlan, + * so assume that any bits there mean that the device is usable. + */ + if (trans->trans_cfg->device_family >= IWL_DEVICE_FAMILY_BZ && + !iwl_read32(trans, CSR_FUNC_SCRATCH)) + device_was_powered_off = true; + + if (restore || device_was_powered_off) { + trans->state = IWL_TRANS_NO_FW; + /* Hope for the best here ... If one of those steps fails we + * won't really know how to recover. + */ + iwl_pcie_prepare_card_hw(trans); + iwl_finish_nic_init(trans); + iwl_op_mode_device_powered_off(trans->op_mode); + } + /* In WOWLAN, let iwl_trans_pcie_d3_resume do the rest of the work */ if (test_bit(STATUS_DEVICE_ENABLED, &trans->status)) return 0; @@ -1617,9 +1638,23 @@ static int iwl_pci_resume(struct device *device) return 0; } +static int iwl_pci_restore(struct device *device) +{ + return _iwl_pci_resume(device, true); +} + +static int iwl_pci_resume(struct device *device) +{ + return _iwl_pci_resume(device, false); +} + static const struct dev_pm_ops iwl_dev_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(iwl_pci_suspend, - iwl_pci_resume) + .suspend = pm_sleep_ptr(iwl_pci_suspend), + .resume = pm_sleep_ptr(iwl_pci_resume), + .freeze = pm_sleep_ptr(iwl_pci_suspend), + .thaw = pm_sleep_ptr(iwl_pci_resume), + .poweroff = pm_sleep_ptr(iwl_pci_suspend), + .restore = pm_sleep_ptr(iwl_pci_restore), }; #define IWL_PM_OPS (&iwl_dev_pm_ops) From f8a129c1e10256c785164ed5efa5d17d45fbd81b Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Sun, 25 Aug 2024 19:17:13 +0300 Subject: [PATCH 736/991] wifi: iwlwifi: lower message level for FW buffer destination An invalid buffer destination is not a problem for the driver and it does not make sense to report it with the KERN_ERR message level. As such, change the message to use IWL_DEBUG_FW. Reported-by: Len Brown Closes: https://lore.kernel.org/r/CAJvTdKkcxJss=DM2sxgv_MR5BeZ4_OC-3ad6tA40TYH2yqHCWw@mail.gmail.com Signed-off-by: Benjamin Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20240825191257.20abf78f05bc.Ifbcecc2ae9fb40b9698302507dcba8b922c8d856@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info-gen3.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info-gen3.c b/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info-gen3.c index e63efbf809f02d..ae93a72542b281 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info-gen3.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info-gen3.c @@ -89,7 +89,8 @@ iwl_pcie_ctxt_info_dbg_enable(struct iwl_trans *trans, } break; default: - IWL_ERR(trans, "WRT: Invalid buffer destination\n"); + IWL_DEBUG_FW(trans, "WRT: Invalid buffer destination (%d)\n", + le32_to_cpu(fw_mon_cfg->buf_location)); } out: if (dbg_flags) From d44162280899c3fc2c6700e21e491e71c3c96e3d Mon Sep 17 00:00:00 2001 From: Daniel Gabay Date: Sun, 25 Aug 2024 19:17:05 +0300 Subject: [PATCH 737/991] wifi: iwlwifi: mvm: fix iwl_mvm_scan_fits() calculation The calculation should consider also the 6GHz IE's len, fix that. In addition, in iwl_mvm_sched_scan_start() the scan_fits helper is called only in case non_psc_incldued is true, but it should be called regardless, fix that as well. Signed-off-by: Daniel Gabay Reviewed-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20240825191257.7db825442fd2.I99f4d6587709de02072fd57957ec7472331c6b1d@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/scan.c | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c index 8e0df31f1b3e29..ecd9d301e88b2c 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c @@ -837,8 +837,8 @@ static inline bool iwl_mvm_scan_fits(struct iwl_mvm *mvm, int n_ssids, return ((n_ssids <= PROBE_OPTION_MAX) && (n_channels <= mvm->fw->ucode_capa.n_scan_channels) & (ies->common_ie_len + - ies->len[NL80211_BAND_2GHZ] + - ies->len[NL80211_BAND_5GHZ] <= + ies->len[NL80211_BAND_2GHZ] + ies->len[NL80211_BAND_5GHZ] + + ies->len[NL80211_BAND_6GHZ] <= iwl_mvm_max_scan_ie_fw_cmd_room(mvm))); } @@ -3168,18 +3168,16 @@ int iwl_mvm_sched_scan_start(struct iwl_mvm *mvm, params.n_channels = j; } - if (non_psc_included && - !iwl_mvm_scan_fits(mvm, req->n_ssids, ies, params.n_channels)) { - kfree(params.channels); - return -ENOBUFS; + if (!iwl_mvm_scan_fits(mvm, req->n_ssids, ies, params.n_channels)) { + ret = -ENOBUFS; + goto out; } uid = iwl_mvm_build_scan_cmd(mvm, vif, &hcmd, ¶ms, type); - - if (non_psc_included) - kfree(params.channels); - if (uid < 0) - return uid; + if (uid < 0) { + ret = uid; + goto out; + } ret = iwl_mvm_send_cmd(mvm, &hcmd); if (!ret) { @@ -3197,6 +3195,9 @@ int iwl_mvm_sched_scan_start(struct iwl_mvm *mvm, mvm->sched_scan_pass_all = SCHED_SCAN_PASS_ALL_DISABLED; } +out: + if (non_psc_included) + kfree(params.channels); return ret; } From 916a5d9c5354c426220a0a6533a5e8ea1287d6ea Mon Sep 17 00:00:00 2001 From: Daniel Gabay Date: Sun, 25 Aug 2024 19:17:06 +0300 Subject: [PATCH 738/991] wifi: iwlwifi: mvm: fix iwl_mvm_max_scan_ie_fw_cmd_room() Driver creates also the WFA TPC element, consider that in the calculation. Signed-off-by: Daniel Gabay Reviewed-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20240825191257.e710ce446b7f.I2715c6742e9c3d160e2ba41bc4b35de370d2ce34@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/scan.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c index ecd9d301e88b2c..bae6aec8295c37 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c @@ -48,6 +48,8 @@ /* Number of iterations on the channel for mei filtered scan */ #define IWL_MEI_SCAN_NUM_ITER 5U +#define WFA_TPC_IE_LEN 9 + struct iwl_mvm_scan_timing_params { u32 suspend_time; u32 max_out_time; @@ -303,8 +305,8 @@ static int iwl_mvm_max_scan_ie_fw_cmd_room(struct iwl_mvm *mvm) max_probe_len = SCAN_OFFLOAD_PROBE_REQ_SIZE; - /* we create the 802.11 header and SSID element */ - max_probe_len -= 24 + 2; + /* we create the 802.11 header SSID element and WFA TPC element */ + max_probe_len -= 24 + 2 + WFA_TPC_IE_LEN; /* DS parameter set element is added on 2.4GHZ band if required */ if (iwl_mvm_rrm_scan_needed(mvm)) @@ -731,8 +733,6 @@ static u8 *iwl_mvm_copy_and_insert_ds_elem(struct iwl_mvm *mvm, const u8 *ies, return newpos; } -#define WFA_TPC_IE_LEN 9 - static void iwl_mvm_add_tpc_report_ie(u8 *pos) { pos[0] = WLAN_EID_VENDOR_SPECIFIC; From cd6f46c2fdb82e80ca248549c1f3ebe08b4a63ab Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Sun, 25 Aug 2024 19:17:07 +0300 Subject: [PATCH 739/991] wifi: iwlwifi: mvm: take the mutex before running link selection iwl_mvm_select_links is called by the link selection worker and it requires the mutex. Take it in the link selection worker. This logic used to run from iwl_mvm_rx_umac_scan_complete_notif which had the mvm->mutex held. This was changed to run in a worker holding the wiphy mutex, but we also need the mvm->mutex. Fixes: 2e194efa3809 ("wifi: iwlwifi: mvm: Fix race in scan completion") Signed-off-by: Emmanuel Grumbach Reviewed-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20240825191257.0cacecd5db1e.Iaca38a078592b69bdd06549daf63408ccf1810e4@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/ops.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c index 75fc60a4808cb5..f7ff8b02def4d4 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c @@ -1198,10 +1198,12 @@ static void iwl_mvm_trig_link_selection(struct wiphy *wiphy, struct iwl_mvm *mvm = container_of(wk, struct iwl_mvm, trig_link_selection_wk); + mutex_lock(&mvm->mutex); ieee80211_iterate_active_interfaces(mvm->hw, IEEE80211_IFACE_ITER_NORMAL, iwl_mvm_find_link_selection_vif, NULL); + mutex_unlock(&mvm->mutex); } static struct iwl_op_mode * From 3ee22f07a35b76939c5b8d17d6af292f5fafb509 Mon Sep 17 00:00:00 2001 From: Anjaneyulu Date: Sun, 25 Aug 2024 19:17:08 +0300 Subject: [PATCH 740/991] wifi: iwlwifi: fw: fix wgds rev 3 exact size Check size of WGDS revision 3 is equal to 8 entries size with some header, but doesn't depend on the number of used entries. Check that used entries are between min and max but allow more to be present than are used to fix operation with some BIOSes that have such data. Fixes: 97f8a3d1610b ("iwlwifi: ACPI: support revision 3 WGDS tables") Signed-off-by: Anjaneyulu Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20240825191257.cc71dfc67ec3.Ic27ee15ac6128b275c210b6de88f2145bd83ca7b@changeid [edit commit message] Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/fw/acpi.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/fw/acpi.c b/drivers/net/wireless/intel/iwlwifi/fw/acpi.c index 79774c8c7ff452..8c8880b4482701 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/acpi.c +++ b/drivers/net/wireless/intel/iwlwifi/fw/acpi.c @@ -725,22 +725,25 @@ int iwl_acpi_get_wgds_table(struct iwl_fw_runtime *fwrt) entry = &wifi_pkg->package.elements[entry_idx]; entry_idx++; if (entry->type != ACPI_TYPE_INTEGER || - entry->integer.value > num_profiles) { + entry->integer.value > num_profiles || + entry->integer.value < + rev_data[idx].min_profiles) { ret = -EINVAL; goto out_free; } - num_profiles = entry->integer.value; /* - * this also validates >= min_profiles since we - * otherwise wouldn't have gotten the data when - * looking up in ACPI + * Check to see if we received package count + * same as max # of profiles */ if (wifi_pkg->package.count != hdr_size + profile_size * num_profiles) { ret = -EINVAL; goto out_free; } + + /* Number of valid profiles */ + num_profiles = entry->integer.value; } goto read_table; } From 0668ebc8c2282ca1e7eb96092a347baefffb5fe7 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Sun, 25 Aug 2024 19:17:10 +0300 Subject: [PATCH 741/991] wifi: iwlwifi: mvm: pause TCM when the firmware is stopped Not doing so will make us send a host command to the transport while the firmware is not alive, which will trigger a WARNING. bad state = 0 WARNING: CPU: 2 PID: 17434 at drivers/net/wireless/intel/iwlwifi/iwl-trans.c:115 iwl_trans_send_cmd+0x1cb/0x1e0 [iwlwifi] RIP: 0010:iwl_trans_send_cmd+0x1cb/0x1e0 [iwlwifi] Call Trace: iwl_mvm_send_cmd+0x40/0xc0 [iwlmvm] iwl_mvm_config_scan+0x198/0x260 [iwlmvm] iwl_mvm_recalc_tcm+0x730/0x11d0 [iwlmvm] iwl_mvm_tcm_work+0x1d/0x30 [iwlmvm] process_one_work+0x29e/0x640 worker_thread+0x2df/0x690 ? rescuer_thread+0x540/0x540 kthread+0x192/0x1e0 ? set_kthread_struct+0x90/0x90 ret_from_fork+0x22/0x30 Signed-off-by: Emmanuel Grumbach Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20240825191257.5abe71ca1b6b.I97a968cb8be1f24f94652d9b110ecbf6af73f89e@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/ops.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c index f7ff8b02def4d4..b9daaffd9c7f53 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c @@ -1513,6 +1513,8 @@ void iwl_mvm_stop_device(struct iwl_mvm *mvm) clear_bit(IWL_MVM_STATUS_FIRMWARE_RUNNING, &mvm->status); + iwl_mvm_pause_tcm(mvm, false); + iwl_fw_dbg_stop_sync(&mvm->fwrt); iwl_trans_stop_device(mvm->trans); iwl_free_fw_paging(&mvm->fwrt); From 454f6306a31248cf972f5f16d4c145ad5b33bfdc Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Sun, 25 Aug 2024 19:17:12 +0300 Subject: [PATCH 742/991] wifi: iwlwifi: mvm: allow 6 GHz channels in MLO scan MLO internal scan may include 6 GHz channels. Since the 6 GHz scan indication is not set, the channel flags are set incorrectly, which leads to a firmware assert. Since the MLO scan may include 6 GHz and non 6 GHz channels in one request, add support for non-PSC 6 GHz channels (PSC channels are already supported) when the 6 GHz indication is not set. Fixes: 38b3998dfba3 ("wifi: iwlwifi: mvm: Introduce internal MLO passive scan") Signed-off-by: Avraham Stern Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20240825191257.04807f8213b2.Idd09d4366df92a74853649c1a520b7f0f752d1ac@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/scan.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c index bae6aec8295c37..1cc9c426bb1599 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c @@ -1659,6 +1659,17 @@ iwl_mvm_umac_scan_cfg_channels_v7(struct iwl_mvm *mvm, cfg->v2.channel_num = channels[i]->hw_value; if (cfg80211_channel_is_psc(channels[i])) cfg->flags = 0; + + if (band == NL80211_BAND_6GHZ) { + /* 6 GHz channels should only appear in a scan request + * that has scan_6ghz set. The only exception is MLO + * scan, which has to be passive. + */ + WARN_ON_ONCE(cfg->flags != 0); + cfg->flags = + cpu_to_le32(IWL_UHB_CHAN_CFG_FLAG_FORCE_PASSIVE); + } + cfg->v2.iter_count = 1; cfg->v2.iter_interval = 0; if (version < 17) From 3a84454f5204718ca5b4ad2c1f0bf2031e2403d1 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Sun, 25 Aug 2024 19:17:04 +0300 Subject: [PATCH 743/991] wifi: iwlwifi: mvm: don't wait for tx queues if firmware is dead There is a WARNING in iwl_trans_wait_tx_queues_empty() (that was recently converted from just a message), that can be hit if we wait for TX queues to become empty after firmware died. Clearly, we can't expect anything from the firmware after it's declared dead. Don't call iwl_trans_wait_tx_queues_empty() in this case. While it could be a good idea to stop the flow earlier, the flush functions do some maintenance work that is not related to the firmware, so keep that part of the code running even when the firmware is not running. Signed-off-by: Emmanuel Grumbach Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20240825191257.a7cbd794cee9.I44a739fbd4ffcc46b83844dd1c7b2eb0c7b270f6@changeid [edit commit message] Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 835a05b9183331..625ccf566e1c2a 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -5818,6 +5818,10 @@ static void iwl_mvm_flush_no_vif(struct iwl_mvm *mvm, u32 queues, bool drop) int i; if (!iwl_mvm_has_new_tx_api(mvm)) { + /* we can't ask the firmware anything if it is dead */ + if (test_bit(IWL_MVM_STATUS_HW_RESTART_REQUESTED, + &mvm->status)) + return; if (drop) { guard(mvm)(mvm); iwl_mvm_flush_tx_path(mvm, @@ -5911,8 +5915,11 @@ void iwl_mvm_mac_flush(struct ieee80211_hw *hw, struct ieee80211_vif *vif, /* this can take a while, and we may need/want other operations * to succeed while doing this, so do it without the mutex held + * If the firmware is dead, this can't work... */ - if (!drop && !iwl_mvm_has_new_tx_api(mvm)) + if (!drop && !iwl_mvm_has_new_tx_api(mvm) && + !test_bit(IWL_MVM_STATUS_HW_RESTART_REQUESTED, + &mvm->status)) iwl_trans_wait_tx_queues_empty(mvm->trans, msk); } From 786c5be9ac29a39b6f37f1fdd2ea59d0fe35d525 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Mon, 5 Aug 2024 17:20:35 +0300 Subject: [PATCH 744/991] wifi: mac80211: free skb on error path in ieee80211_beacon_get_ap() In 'ieee80211_beacon_get_ap()', free allocated skb in case of error returned by 'ieee80211_beacon_protect()'. Compile tested only. Signed-off-by: Dmitry Antipov Link: https://patch.msgid.link/20240805142035.227847-1-dmantipov@yandex.ru Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index edba4a31844fb6..bca7b341dd772d 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -5348,8 +5348,10 @@ ieee80211_beacon_get_ap(struct ieee80211_hw *hw, if (beacon->tail) skb_put_data(skb, beacon->tail, beacon->tail_len); - if (ieee80211_beacon_protect(skb, local, sdata, link) < 0) + if (ieee80211_beacon_protect(skb, local, sdata, link) < 0) { + dev_kfree_skb(skb); return NULL; + } ieee80211_beacon_get_finish(hw, vif, link, offs, beacon, skb, chanctx_conf, csa_off_base); From f25d1b5f1be13a6de341b1d26e0cf4275e5908d2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 17 Aug 2024 11:33:27 -0400 Subject: [PATCH 745/991] MAINTAINERS: Update Olga Kornievskaia's email address Signed-off-by: Chuck Lever --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 93238316b6c00f..174d65623a8b52 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11974,7 +11974,7 @@ KERNEL NFSD, SUNRPC, AND LOCKD SERVERS M: Chuck Lever M: Jeff Layton R: Neil Brown -R: Olga Kornievskaia +R: Olga Kornievskaia R: Dai Ngo R: Tom Talpey L: linux-nfs@vger.kernel.org From da05ba23d4c8d3e8a45846b952e53dd76c4b5e36 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 23 Aug 2024 18:27:38 -0400 Subject: [PATCH 746/991] nfsd: hold reference to delegation when updating it for cb_getattr Once we've dropped the flc_lock, there is nothing that ensures that the delegation that was found will still be around later. Take a reference to it while holding the lock and then drop it when we've finished with the delegation. Fixes: c5967721e106 ("NFSD: handle GETATTR conflict with write delegation") Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index dafff707e23a46..19d39872be325f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -8837,7 +8837,6 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode, struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct file_lock_context *ctx; struct file_lease *fl; - struct nfs4_delegation *dp; struct iattr attrs; struct nfs4_cb_fattr *ncf; @@ -8862,7 +8861,8 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode, goto break_lease; } if (type == F_WRLCK) { - dp = fl->c.flc_owner; + struct nfs4_delegation *dp = fl->c.flc_owner; + if (dp->dl_recall.cb_clp == *(rqstp->rq_lease_breaker)) { spin_unlock(&ctx->flc_lock); return 0; @@ -8870,6 +8870,7 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode, break_lease: nfsd_stats_wdeleg_getattr_inc(nn); dp = fl->c.flc_owner; + refcount_inc(&dp->dl_stid.sc_count); ncf = &dp->dl_cb_fattr; nfs4_cb_getattr(&dp->dl_cb_fattr); spin_unlock(&ctx->flc_lock); @@ -8879,8 +8880,10 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode, /* Recall delegation only if client didn't respond */ status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); if (status != nfserr_jukebox || - !nfsd_wait_for_delegreturn(rqstp, inode)) + !nfsd_wait_for_delegreturn(rqstp, inode)) { + nfs4_put_stid(&dp->dl_stid); return status; + } } if (!ncf->ncf_file_modified && (ncf->ncf_initial_cinfo != ncf->ncf_cb_change || @@ -8900,6 +8903,7 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode, *size = ncf->ncf_cur_fsize; *modified = true; } + nfs4_put_stid(&dp->dl_stid); return 0; } break; From 1116e0e372eb16dd907ec571ce5d4af325c55c10 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 23 Aug 2024 18:27:39 -0400 Subject: [PATCH 747/991] nfsd: fix potential UAF in nfsd4_cb_getattr_release Once we drop the delegation reference, the fields embedded in it are no longer safe to access. Do that last. Fixes: c5967721e106 ("NFSD: handle GETATTR conflict with write delegation") Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 19d39872be325f..02d43f95146eef 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3078,9 +3078,9 @@ nfsd4_cb_getattr_release(struct nfsd4_callback *cb) struct nfs4_delegation *dp = container_of(ncf, struct nfs4_delegation, dl_cb_fattr); - nfs4_put_stid(&dp->dl_stid); clear_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags); wake_up_bit(&ncf->ncf_cb_flags, CB_GETATTR_BUSY); + nfs4_put_stid(&dp->dl_stid); } static const struct nfsd4_callback_ops nfsd4_cb_recall_any_ops = { From b6fb565a2d15277896583d471b21bc14a0c99661 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 26 Aug 2024 15:53:04 +0300 Subject: [PATCH 748/991] x86/tdx: Fix data leak in mmio_read() The mmio_read() function makes a TDVMCALL to retrieve MMIO data for an address from the VMM. Sean noticed that mmio_read() unintentionally exposes the value of an initialized variable (val) on the stack to the VMM. This variable is only needed as an output value. It did not need to be passed to the VMM in the first place. Do not send the original value of *val to the VMM. [ dhansen: clarify what 'val' is used for. ] Fixes: 31d58c4e557d ("x86/tdx: Handle in-kernel MMIO") Reported-by: Sean Christopherson Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Cc:stable@vger.kernel.org Link: https://lore.kernel.org/all/20240826125304.1566719-1-kirill.shutemov%40linux.intel.com --- arch/x86/coco/tdx/tdx.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 078e2bac25531b..da8b66dce0da5f 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -389,7 +389,6 @@ static bool mmio_read(int size, unsigned long addr, unsigned long *val) .r12 = size, .r13 = EPT_READ, .r14 = addr, - .r15 = *val, }; if (__tdx_hypercall(&args)) From c6a09e342f8e6d3cac7f7c5c14085236aca284b9 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Sun, 25 Aug 2024 20:27:45 -0700 Subject: [PATCH 749/991] binfmt_elf_fdpic: fix AUXV size calculation when ELF_HWCAP2 is defined create_elf_fdpic_tables() does not correctly account the space for the AUX vector when an architecture has ELF_HWCAP2 defined. Prior to the commit 10e29251be0e ("binfmt_elf_fdpic: fix /proc//auxv") it resulted in the last entry of the AUX vector being set to zero, but with that change it results in a kernel BUG. Fix that by adding one to the number of AUXV entries (nitems) when ELF_HWCAP2 is defined. Fixes: 10e29251be0e ("binfmt_elf_fdpic: fix /proc//auxv") Cc: stable@vger.kernel.org Reported-by: Greg Ungerer Closes: https://lore.kernel.org/lkml/5b51975f-6d0b-413c-8b38-39a6a45e8821@westnet.com.au/ Signed-off-by: Max Filippov Tested-by: Greg Ungerer Link: https://lore.kernel.org/r/20240826032745.3423812-1-jcmvbkbc@gmail.com Signed-off-by: Kees Cook --- fs/binfmt_elf_fdpic.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 28a3439f163abc..4fe5bb9f1b1f5e 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -589,6 +589,9 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, if (bprm->have_execfd) nitems++; +#ifdef ELF_HWCAP2 + nitems++; +#endif csp = sp; sp -= nitems * 2 * sizeof(unsigned long); From a699781c79ecf6cfe67fb00a0331b4088c7c8466 Mon Sep 17 00:00:00 2001 From: Jamie Bainbridge Date: Fri, 23 Aug 2024 16:26:58 +1000 Subject: [PATCH 750/991] ethtool: check device is present when getting link settings A sysfs reader can race with a device reset or removal, attempting to read device state when the device is not actually present. eg: [exception RIP: qed_get_current_link+17] #8 [ffffb9e4f2907c48] qede_get_link_ksettings at ffffffffc07a994a [qede] #9 [ffffb9e4f2907cd8] __rh_call_get_link_ksettings at ffffffff992b01a3 #10 [ffffb9e4f2907d38] __ethtool_get_link_ksettings at ffffffff992b04e4 #11 [ffffb9e4f2907d90] duplex_show at ffffffff99260300 #12 [ffffb9e4f2907e38] dev_attr_show at ffffffff9905a01c #13 [ffffb9e4f2907e50] sysfs_kf_seq_show at ffffffff98e0145b #14 [ffffb9e4f2907e68] seq_read at ffffffff98d902e3 #15 [ffffb9e4f2907ec8] vfs_read at ffffffff98d657d1 #16 [ffffb9e4f2907f00] ksys_read at ffffffff98d65c3f #17 [ffffb9e4f2907f38] do_syscall_64 at ffffffff98a052fb crash> struct net_device.state ffff9a9d21336000 state = 5, state 5 is __LINK_STATE_START (0b1) and __LINK_STATE_NOCARRIER (0b100). The device is not present, note lack of __LINK_STATE_PRESENT (0b10). This is the same sort of panic as observed in commit 4224cfd7fb65 ("net-sysfs: add check for netdevice being present to speed_show"). There are many other callers of __ethtool_get_link_ksettings() which don't have a device presence check. Move this check into ethtool to protect all callers. Fixes: d519e17e2d01 ("net: export device speed and duplex via sysfs") Fixes: 4224cfd7fb65 ("net-sysfs: add check for netdevice being present to speed_show") Signed-off-by: Jamie Bainbridge Link: https://patch.msgid.link/8bae218864beaa44ed01628140475b9bf641c5b0.1724393671.git.jamie.bainbridge@gmail.com Signed-off-by: Jakub Kicinski --- net/core/net-sysfs.c | 2 +- net/ethtool/ioctl.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 0e2084ce7b7572..444f23e74f8e6b 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -235,7 +235,7 @@ static ssize_t speed_show(struct device *dev, if (!rtnl_trylock()) return restart_syscall(); - if (netif_running(netdev) && netif_device_present(netdev)) { + if (netif_running(netdev)) { struct ethtool_link_ksettings cmd; if (!__ethtool_get_link_ksettings(netdev, &cmd)) diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index e18823bf23306d..ae041f51cd2da4 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -442,6 +442,9 @@ int __ethtool_get_link_ksettings(struct net_device *dev, if (!dev->ethtool_ops->get_link_ksettings) return -EOPNOTSUPP; + if (!netif_device_present(dev)) + return -ENODEV; + memset(link_ksettings, 0, sizeof(*link_ksettings)); return dev->ethtool_ops->get_link_ksettings(dev, link_ksettings); } From 284b75a3d83c7631586d98f6dede1d90f128f0db Mon Sep 17 00:00:00 2001 From: Zheng Qixing Date: Thu, 22 Aug 2024 11:30:50 +0800 Subject: [PATCH 751/991] ata: libata: Fix memory leak for error path in ata_host_alloc() In ata_host_alloc(), if devres_alloc() fails to allocate the device host resource data pointer, the already allocated ata_host structure is not freed before returning from the function. This results in a potential memory leak. Call kfree(host) before jumping to the error handling path to ensure that the ata_host structure is properly freed if devres_alloc() fails. Fixes: 2623c7a5f279 ("libata: add refcounting to ata_host") Cc: stable@vger.kernel.org Signed-off-by: Zheng Qixing Reviewed-by: Yu Kuai Signed-off-by: Damien Le Moal --- drivers/ata/libata-core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index c7752dc800280e..30932552437a7e 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -5593,8 +5593,10 @@ struct ata_host *ata_host_alloc(struct device *dev, int n_ports) } dr = devres_alloc(ata_devres_release, 0, GFP_KERNEL); - if (!dr) + if (!dr) { + kfree(host); goto err_out; + } devres_add(dev, dr); dev_set_drvdata(dev, host); From e846be0fba85603d2ad6fc8db6810958d7b6bed1 Mon Sep 17 00:00:00 2001 From: MD Danish Anwar Date: Fri, 23 Aug 2024 17:34:12 +0530 Subject: [PATCH 752/991] net: ti: icssg-prueth: Fix 10M Link issue on AM64x Crash is seen on AM64x 10M link when connecting / disconnecting multiple times. The fix for this is to enable quirk_10m_link_issue for AM64x. Fixes: b256e13378a9 ("net: ti: icssg-prueth: Add AM64x icssg support") Signed-off-by: MD Danish Anwar Reviewed-by: Roger Quadros Link: https://patch.msgid.link/20240823120412.1262536-1-danishanwar@ti.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/icssg/icssg_prueth.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c index 3e51b3a9b0a57f..e3451beed32385 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c @@ -1452,6 +1452,7 @@ static const struct prueth_pdata am654_icssg_pdata = { static const struct prueth_pdata am64x_icssg_pdata = { .fdqring_mode = K3_RINGACC_RING_MODE_RING, + .quirk_10m_link_issue = 1, .switch_mode = 1, }; From 7e8ae8486e4471513e2111aba6ac29f2357bed2a Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 26 Aug 2024 10:32:34 -0400 Subject: [PATCH 753/991] fs/nfsd: fix update of inode attrs in CB_GETATTR Currently, we copy the mtime and ctime to the in-core inode and then mark the inode dirty. This is fine for certain types of filesystems, but not all. Some require a real setattr to properly change these values (e.g. ceph or reexported NFS). Fix this code to call notify_change() instead, which is the proper way to effect a setattr. There is one problem though: In this case, the client is holding a write delegation and has sent us attributes to update our cache. We don't want to break the delegation for this since that would defeat the purpose. Add a new ATTR_DELEG flag that makes notify_change bypass the try_break_deleg call. Fixes: c5967721e106 ("NFSD: handle GETATTR conflict with write delegation") Reviewed-by: Christian Brauner Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/attr.c | 14 +++++++++++--- fs/nfsd/nfs4state.c | 18 +++++++++++++----- fs/nfsd/nfs4xdr.c | 2 +- fs/nfsd/state.h | 2 +- include/linux/fs.h | 1 + 5 files changed, 27 insertions(+), 10 deletions(-) diff --git a/fs/attr.c b/fs/attr.c index 960a310581ebb9..0dbf43b6555c83 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -489,9 +489,17 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry, error = security_inode_setattr(idmap, dentry, attr); if (error) return error; - error = try_break_deleg(inode, delegated_inode); - if (error) - return error; + + /* + * If ATTR_DELEG is set, then these attributes are being set on + * behalf of the holder of a write delegation. We want to avoid + * breaking the delegation in this case. + */ + if (!(ia_valid & ATTR_DELEG)) { + error = try_break_deleg(inode, delegated_inode); + if (error) + return error; + } if (inode->i_op->setattr) error = inode->i_op->setattr(idmap, dentry, attr); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 02d43f95146eef..07f2496850c4c6 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -8815,7 +8815,7 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, /** * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict * @rqstp: RPC transaction context - * @inode: file to be checked for a conflict + * @dentry: dentry of inode to be checked for a conflict * @modified: return true if file was modified * @size: new size of file if modified is true * @@ -8830,7 +8830,7 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, * code is returned. */ __be32 -nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode, +nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, bool *modified, u64 *size) { __be32 status; @@ -8839,6 +8839,7 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode, struct file_lease *fl; struct iattr attrs; struct nfs4_cb_fattr *ncf; + struct inode *inode = d_inode(dentry); *modified = false; ctx = locks_inode_context(inode); @@ -8890,15 +8891,22 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode, ncf->ncf_cur_fsize != ncf->ncf_cb_fsize)) ncf->ncf_file_modified = true; if (ncf->ncf_file_modified) { + int err; + /* * Per section 10.4.3 of RFC 8881, the server would * not update the file's metadata with the client's * modified size */ attrs.ia_mtime = attrs.ia_ctime = current_time(inode); - attrs.ia_valid = ATTR_MTIME | ATTR_CTIME; - setattr_copy(&nop_mnt_idmap, inode, &attrs); - mark_inode_dirty(inode); + attrs.ia_valid = ATTR_MTIME | ATTR_CTIME | ATTR_DELEG; + inode_lock(inode); + err = notify_change(&nop_mnt_idmap, dentry, &attrs, NULL); + inode_unlock(inode); + if (err) { + nfs4_put_stid(&dp->dl_stid); + return nfserrno(err); + } ncf->ncf_cur_fsize = ncf->ncf_cb_fsize; *size = ncf->ncf_cur_fsize; *modified = true; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 43ccf6119cf128..97f58377797261 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3565,7 +3565,7 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, } args.size = 0; if (attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) { - status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry), + status = nfsd4_deleg_getattr_conflict(rqstp, dentry, &file_modified, &size); if (status) goto out; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index ffc217099d1913..ec4559ecd193b2 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -781,5 +781,5 @@ static inline bool try_to_expire_client(struct nfs4_client *clp) } extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, - struct inode *inode, bool *file_modified, u64 *size); + struct dentry *dentry, bool *file_modified, u64 *size); #endif /* NFSD4_STATE_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 0283cf366c2a93..bafc1d134b9486 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -208,6 +208,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ #define ATTR_TIMES_SET (1 << 16) #define ATTR_TOUCH (1 << 17) +#define ATTR_DELEG (1 << 18) /* Delegated attrs. Don't break write delegations */ /* * Whiteout is represented by a char device. The following constants define the From b49420d6a1aeb399e5b107fc6eb8584d0860fbd7 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 21 Aug 2024 15:11:35 -0400 Subject: [PATCH 754/991] video/aperture: optionally match the device in sysfb_disable() In aperture_remove_conflicting_pci_devices(), we currently only call sysfb_disable() on vga class devices. This leads to the following problem when the pimary device is not VGA compatible: 1. A PCI device with a non-VGA class is the boot display 2. That device is probed first and it is not a VGA device so sysfb_disable() is not called, but the device resources are freed by aperture_detach_platform_device() 3. Non-primary GPU has a VGA class and it ends up calling sysfb_disable() 4. NULL pointer dereference via sysfb_disable() since the resources have already been freed by aperture_detach_platform_device() when it was called by the other device. Fix this by passing a device pointer to sysfb_disable() and checking the device to determine if we should execute it or not. v2: Fix build when CONFIG_SCREEN_INFO is not set v3: Move device check into the mutex Drop primary variable in aperture_remove_conflicting_pci_devices() Drop __init on pci sysfb_pci_dev_is_enabled() Fixes: 5ae3716cfdcd ("video/aperture: Only remove sysfb on the default vga pci device") Cc: Javier Martinez Canillas Cc: Thomas Zimmermann Cc: Helge Deller Cc: Sam Ravnborg Cc: Daniel Vetter Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Javier Martinez Canillas Reviewed-by: Thomas Zimmermann Signed-off-by: Alex Deucher Link: https://patchwork.freedesktop.org/patch/msgid/20240821191135.829765-1-alexander.deucher@amd.com --- drivers/firmware/sysfb.c | 19 +++++++++++++------ drivers/of/platform.c | 2 +- drivers/video/aperture.c | 11 +++-------- include/linux/sysfb.h | 4 ++-- 4 files changed, 19 insertions(+), 17 deletions(-) diff --git a/drivers/firmware/sysfb.c b/drivers/firmware/sysfb.c index 921f61507ae831..02a07d3d0d40a9 100644 --- a/drivers/firmware/sysfb.c +++ b/drivers/firmware/sysfb.c @@ -39,6 +39,8 @@ static struct platform_device *pd; static DEFINE_MUTEX(disable_lock); static bool disabled; +static struct device *sysfb_parent_dev(const struct screen_info *si); + static bool sysfb_unregister(void) { if (IS_ERR_OR_NULL(pd)) @@ -52,6 +54,7 @@ static bool sysfb_unregister(void) /** * sysfb_disable() - disable the Generic System Framebuffers support + * @dev: the device to check if non-NULL * * This disables the registration of system framebuffer devices that match the * generic drivers that make use of the system framebuffer set up by firmware. @@ -61,17 +64,21 @@ static bool sysfb_unregister(void) * Context: The function can sleep. A @disable_lock mutex is acquired to serialize * against sysfb_init(), that registers a system framebuffer device. */ -void sysfb_disable(void) +void sysfb_disable(struct device *dev) { + struct screen_info *si = &screen_info; + mutex_lock(&disable_lock); - sysfb_unregister(); - disabled = true; + if (!dev || dev == sysfb_parent_dev(si)) { + sysfb_unregister(); + disabled = true; + } mutex_unlock(&disable_lock); } EXPORT_SYMBOL_GPL(sysfb_disable); #if defined(CONFIG_PCI) -static __init bool sysfb_pci_dev_is_enabled(struct pci_dev *pdev) +static bool sysfb_pci_dev_is_enabled(struct pci_dev *pdev) { /* * TODO: Try to integrate this code into the PCI subsystem @@ -87,13 +94,13 @@ static __init bool sysfb_pci_dev_is_enabled(struct pci_dev *pdev) return true; } #else -static __init bool sysfb_pci_dev_is_enabled(struct pci_dev *pdev) +static bool sysfb_pci_dev_is_enabled(struct pci_dev *pdev) { return false; } #endif -static __init struct device *sysfb_parent_dev(const struct screen_info *si) +static struct device *sysfb_parent_dev(const struct screen_info *si) { struct pci_dev *pdev; diff --git a/drivers/of/platform.c b/drivers/of/platform.c index 389d4ea6bfc159..ef622d41eb5b2e 100644 --- a/drivers/of/platform.c +++ b/drivers/of/platform.c @@ -592,7 +592,7 @@ static int __init of_platform_default_populate_init(void) * This can happen for example on DT systems that do EFI * booting and may provide a GOP handle to the EFI stub. */ - sysfb_disable(); + sysfb_disable(NULL); of_platform_device_create(node, NULL, NULL); of_node_put(node); } diff --git a/drivers/video/aperture.c b/drivers/video/aperture.c index 561be8feca96c0..2b5a1e666e9b25 100644 --- a/drivers/video/aperture.c +++ b/drivers/video/aperture.c @@ -293,7 +293,7 @@ int aperture_remove_conflicting_devices(resource_size_t base, resource_size_t si * ask for this, so let's assume that a real driver for the display * was already probed and prevent sysfb to register devices later. */ - sysfb_disable(); + sysfb_disable(NULL); aperture_detach_devices(base, size); @@ -346,15 +346,10 @@ EXPORT_SYMBOL(__aperture_remove_legacy_vga_devices); */ int aperture_remove_conflicting_pci_devices(struct pci_dev *pdev, const char *name) { - bool primary = false; resource_size_t base, size; int bar, ret = 0; - if (pdev == vga_default_device()) - primary = true; - - if (primary) - sysfb_disable(); + sysfb_disable(&pdev->dev); for (bar = 0; bar < PCI_STD_NUM_BARS; ++bar) { if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM)) @@ -370,7 +365,7 @@ int aperture_remove_conflicting_pci_devices(struct pci_dev *pdev, const char *na * that consumes the VGA framebuffer I/O range. Remove this * device as well. */ - if (primary) + if (pdev == vga_default_device()) ret = __aperture_remove_legacy_vga_devices(pdev); return ret; diff --git a/include/linux/sysfb.h b/include/linux/sysfb.h index c9cb657dad08ad..bef5f06a91de65 100644 --- a/include/linux/sysfb.h +++ b/include/linux/sysfb.h @@ -58,11 +58,11 @@ struct efifb_dmi_info { #ifdef CONFIG_SYSFB -void sysfb_disable(void); +void sysfb_disable(struct device *dev); #else /* CONFIG_SYSFB */ -static inline void sysfb_disable(void) +static inline void sysfb_disable(struct device *dev) { } From 66927b89289974dab6d3b3cdd7706d0376034114 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 26 Aug 2024 15:11:38 -0400 Subject: [PATCH 755/991] bcachefs: Fix failure to return error in data_update_index_update() This fixes an assertion pop in io_write.c - if we don't return an error we're supposed to have completed all the btree updates. Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 65176d51b502e6..004894ad414701 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -337,6 +337,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, printbuf_exit(&buf); bch2_fatal_error(c); + ret = -EIO; goto out; } From d26935690c03fe8159d42358bed1c56252700cd1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 26 Aug 2024 19:11:00 -0400 Subject: [PATCH 756/991] bcachefs: Fix bch2_extents_match() false positive This was caught as a very rare nonce inconsistency, on systems with encryption and replication (and tiering, or some form of rebalance operation running): [Wed Jul 17 13:30:03 2024] about to insert invalid key in data update path [Wed Jul 17 13:30:03 2024] old: u64s 10 type extent 671283510:6392:U32_MAX len 16 ver 106595503: durability: 2 crc: c_size 8 size 16 offset 0 nonce 0 csum chacha20_poly1305_80 compress zstd ptr: 3:355968:104 gen 7 ptr: 4:513244:48 gen 6 rebalance: target hdd compression zstd [Wed Jul 17 13:30:03 2024] k: u64s 10 type extent 671283510:6400:U32_MAX len 16 ver 106595508: durability: 2 crc: c_size 8 size 16 offset 0 nonce 0 csum chacha20_poly1305_80 compress zstd ptr: 3:355968:112 gen 7 ptr: 4:513244:56 gen 6 rebalance: target hdd compression zstd [Wed Jul 17 13:30:03 2024] new: u64s 14 type extent 671283510:6392:U32_MAX len 8 ver 106595508: durability: 2 crc: c_size 8 size 16 offset 0 nonce 0 csum chacha20_poly1305_80 compress zstd ptr: 3:355968:112 gen 7 cached ptr: 4:513244:56 gen 6 cached rebalance: target hdd compression zstd crc: c_size 8 size 16 offset 8 nonce 0 csum chacha20_poly1305_80 compress zstd ptr: 1:10860085:32 gen 0 ptr: 0:17285918:408 gen 0 [Wed Jul 17 13:30:03 2024] bcachefs (cca5bc65-fe77-409d-a9fa-465a6e7f4eae): fatal error - emergency read only bch2_extents_match() was reporting true for extents that did not actually point to the same data. bch2_extent_match() iterates over pairs of pointers, looking for pointers that point to the same location on disk (with matching generation numbers). However one or both extents may have been trimmed (or merged) and they might not have the same disk offset: it corrects for this by subtracting the key offset and the checksum entry offset. However, this failed when an extent was immediately partially overwritten, and the new overwrite was allocated the next adjacent disk space. Normally, with compression off, this would never cause a bug, since the new extent would have to be immediately after the old extent for the pointer offsets to match, and the rebalance index update path is not looking for an extent outside the range of the extent it moved. However with compression enabled, extents take up less space on disk than they do in the btree index space - and spuriously matching after partial overwrite is possible. To fix this, add a secondary check, that strictly checks that the regions pointed to on disk overlap. https://github.com/koverstreet/bcachefs/issues/717 Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index e317df3644a119..eb31bda195443b 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -929,8 +929,29 @@ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) if (p1.ptr.dev == p2.ptr.dev && p1.ptr.gen == p2.ptr.gen && + + /* + * This checks that the two pointers point + * to the same region on disk - adjusting + * for the difference in where the extents + * start, since one may have been trimmed: + */ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == - (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) + (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) && + + /* + * This additionally checks that the + * extents overlap on disk, since the + * previous check may trigger spuriously + * when one extent is immediately partially + * overwritten with another extent (so that + * on disk they are adjacent) and + * compression is in use: + */ + ((p1.ptr.offset >= p2.ptr.offset && + p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) || + (p2.ptr.offset >= p1.ptr.offset && + p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size))) return true; return false; From 7bbc079531fc38d401e1c4088d4981435a8828e3 Mon Sep 17 00:00:00 2001 From: Cosmo Chou Date: Mon, 19 Aug 2024 18:46:30 +0800 Subject: [PATCH 757/991] hwmon: (pt5161l) Fix invalid temperature reading The temperature reading function was using a signed long for the ADC code, which could lead to mishandling of invalid codes on 32-bit platforms. This allowed out-of-range ADC codes to be incorrectly interpreted as valid values and used in temperature calculations. Change adc_code to u32 to ensure that invalid ADC codes are correctly identified on all platforms. Fixes: 1b2ca93cd059 ("hwmon: Add driver for Astera Labs PT5161L retimer") Signed-off-by: Cosmo Chou Message-ID: <20240819104630.2375441-1-chou.cosmo@gmail.com> Signed-off-by: Guenter Roeck --- drivers/hwmon/pt5161l.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/hwmon/pt5161l.c b/drivers/hwmon/pt5161l.c index b0d58a26d499d0..a9f0b23f9e76e6 100644 --- a/drivers/hwmon/pt5161l.c +++ b/drivers/hwmon/pt5161l.c @@ -427,7 +427,7 @@ static int pt5161l_read(struct device *dev, enum hwmon_sensor_types type, struct pt5161l_data *data = dev_get_drvdata(dev); int ret; u8 buf[8]; - long adc_code; + u32 adc_code; switch (attr) { case hwmon_temp_input: @@ -449,7 +449,7 @@ static int pt5161l_read(struct device *dev, enum hwmon_sensor_types type, adc_code = buf[3] << 24 | buf[2] << 16 | buf[1] << 8 | buf[0]; if (adc_code == 0 || adc_code >= 0x3ff) { - dev_dbg(dev, "Invalid adc_code %lx\n", adc_code); + dev_dbg(dev, "Invalid adc_code %x\n", adc_code); return -EIO; } From 9a471de516c35219d1722c13367191ce1f120fe9 Mon Sep 17 00:00:00 2001 From: ZHANG Yuntian Date: Sat, 3 Aug 2024 15:46:07 +0800 Subject: [PATCH 758/991] USB: serial: option: add MeiG Smart SRM825L Add support for MeiG Smart SRM825L which is based on Qualcomm 315 chip. T: Bus=04 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 2 Spd=5000 MxCh= 0 D: Ver= 3.20 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 9 #Cfgs= 1 P: Vendor=2dee ProdID=4d22 Rev= 4.14 S: Manufacturer=MEIG S: Product=LTE-A Module S: SerialNumber=6f345e48 C:* #Ifs= 6 Cfg#= 1 Atr=80 MxPwr=896mA I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=81(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=83(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=82(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=84(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=60 Driver=option E: Ad=87(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=86(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=(none) E: Ad=05(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=88(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I:* If#= 5 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=89(I) Atr=03(Int.) MxPS= 8 Ivl=32ms E: Ad=8e(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=0f(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms Signed-off-by: ZHANG Yuntian Link: https://lore.kernel.org/0041DFA5200EFB1B+20240803074619.563116-1-yt@radxa.com/ Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 311040f9b93522..176f38750ad589 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -619,6 +619,8 @@ static void option_instat_callback(struct urb *urb); /* MeiG Smart Technology products */ #define MEIGSMART_VENDOR_ID 0x2dee +/* MeiG Smart SRM825L based on Qualcomm 315 */ +#define MEIGSMART_PRODUCT_SRM825L 0x4d22 /* MeiG Smart SLM320 based on UNISOC UIS8910 */ #define MEIGSMART_PRODUCT_SLM320 0x4d41 @@ -2366,6 +2368,9 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(UNISOC_VENDOR_ID, TOZED_PRODUCT_LT70C, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(UNISOC_VENDOR_ID, LUAT_PRODUCT_AIR720U, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SLM320, 0xff, 0, 0) }, + { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x30) }, + { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x60) }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, option_ids); From 6d30bb88f623526197c0e18a366e68a4254a2c83 Mon Sep 17 00:00:00 2001 From: Alexander Sverdlin Date: Fri, 23 Aug 2024 15:15:20 +0200 Subject: [PATCH 759/991] wifi: wfx: repair open network AP mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RSN IE missing in beacon is normal in open networks. Avoid returning -EINVAL in this case. Steps to reproduce: $ cat /etc/wpa_supplicant.conf network={ ssid="testNet" mode=2 key_mgmt=NONE } $ wpa_supplicant -iwlan0 -c /etc/wpa_supplicant.conf nl80211: Beacon set failed: -22 (Invalid argument) Failed to set beacon parameters Interface initialization failed wlan0: interface state UNINITIALIZED->DISABLED wlan0: AP-DISABLED wlan0: Unable to setup interface. Failed to initialize AP interface After the change: $ wpa_supplicant -iwlan0 -c /etc/wpa_supplicant.conf Successfully initialized wpa_supplicant wlan0: interface state UNINITIALIZED->ENABLED wlan0: AP-ENABLED Cc: stable@vger.kernel.org Fixes: fe0a7776d4d1 ("wifi: wfx: fix possible NULL pointer dereference in wfx_set_mfp_ap()") Signed-off-by: Alexander Sverdlin Reviewed-by: Jérôme Pouiller Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20240823131521.3309073-1-alexander.sverdlin@siemens.com --- drivers/net/wireless/silabs/wfx/sta.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/silabs/wfx/sta.c b/drivers/net/wireless/silabs/wfx/sta.c index 216d43c8bd6e92..7c04810dbf3dca 100644 --- a/drivers/net/wireless/silabs/wfx/sta.c +++ b/drivers/net/wireless/silabs/wfx/sta.c @@ -352,8 +352,11 @@ static int wfx_set_mfp_ap(struct wfx_vif *wvif) ptr = (u16 *)cfg80211_find_ie(WLAN_EID_RSN, skb->data + ieoffset, skb->len - ieoffset); - if (unlikely(!ptr)) + if (!ptr) { + /* No RSN IE is fine in open networks */ + ret = 0; goto free_skb; + } ptr += pairwise_cipher_suite_count_offset; if (WARN_ON(ptr > (u16 *)skb_tail_pointer(skb))) From 094513f8a2fbddee51b055d8035f995551f98fce Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Sun, 25 Aug 2024 19:17:01 +0300 Subject: [PATCH 760/991] wifi: iwlwifi: clear trans->state earlier upon error When the firmware crashes, we first told the op_mode and only then, changed the transport's state. This is a problem if the op_mode's nic_error() handler needs to send a host command: it'll see that the transport's state still reflects that the firmware is alive. Today, this has no consequences since we set the STATUS_FW_ERROR bit and that will prevent sending host commands. iwl_fw_dbg_stop_restart_recording looks at this bit to know not to send a host command for example. To fix the hibernation, we needed to reset the firmware without having an error and checking STATUS_FW_ERROR to see whether the firmware is alive will no longer hold, so this change is necessary as well. Change the flow a bit. Change trans->state before calling the op_mode's nic_error() method and check trans->state instead of STATUS_FW_ERROR. This will keep the current behavior of iwl_fw_dbg_stop_restart_recording upon firmware error, and it'll allow us to call iwl_fw_dbg_stop_restart_recording safely even if STATUS_FW_ERROR is clear, but yet, the firmware is not alive. Signed-off-by: Emmanuel Grumbach Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20240825191257.9d7427fbdfd7.Ia056ca57029a382c921d6f7b6a6b28fc480f2f22@changeid [I missed this was a dependency for the hibernation fix, changed the commit message a bit accordingly] Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/fw/dbg.c | 2 +- drivers/net/wireless/intel/iwlwifi/iwl-trans.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/fw/dbg.c b/drivers/net/wireless/intel/iwlwifi/fw/dbg.c index fa57df336785b6..fb2ea38e89acab 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/dbg.c +++ b/drivers/net/wireless/intel/iwlwifi/fw/dbg.c @@ -3348,7 +3348,7 @@ void iwl_fw_dbg_stop_restart_recording(struct iwl_fw_runtime *fwrt, { int ret __maybe_unused = 0; - if (test_bit(STATUS_FW_ERROR, &fwrt->trans->status)) + if (!iwl_trans_fw_running(fwrt->trans)) return; if (fw_has_capa(&fwrt->fw->ucode_capa, diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-trans.h b/drivers/net/wireless/intel/iwlwifi/iwl-trans.h index 6148acbac6af9e..0ef48effeefb4e 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-trans.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-trans.h @@ -1128,8 +1128,8 @@ static inline void iwl_trans_fw_error(struct iwl_trans *trans, bool sync) /* prevent double restarts due to the same erroneous FW */ if (!test_and_set_bit(STATUS_FW_ERROR, &trans->status)) { - iwl_op_mode_nic_error(trans->op_mode, sync); trans->state = IWL_TRANS_NO_FW; + iwl_op_mode_nic_error(trans->op_mode, sync); } } From 7d058e6bac9afab6a406e34344ebbfd3068bb2d5 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 23 Aug 2024 09:50:55 +0200 Subject: [PATCH 761/991] drm/i915/dsi: Make Lenovo Yoga Tab 3 X90F DMI match less strict There are 2G and 4G RAM versions of the Lenovo Yoga Tab 3 X90F and it turns out that the 2G version has a DMI product name of "CHERRYVIEW D1 PLATFORM" where as the 4G version has "CHERRYVIEW C0 PLATFORM". The sys-vendor + product-version check are unique enough that the product-name check is not necessary. Drop the product-name check so that the existing DMI match for the 4G RAM version also matches the 2G RAM version. Fixes: f6f4a0862bde ("drm/i915/vlv_dsi: Add DMI quirk for backlight control issues on Lenovo Yoga Tab 3 (v2)") Cc: stable@vger.kernel.org Acked-by: Jani Nikula Signed-off-by: Hans de Goede Link: https://patchwork.freedesktop.org/patch/msgid/20240823075055.17198-1-hdegoede@redhat.com (cherry picked from commit a4dbe45c4c14edc316ae94b9af86a28f8c5d8123) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/display/vlv_dsi.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/vlv_dsi.c b/drivers/gpu/drm/i915/display/vlv_dsi.c index eae5b5e09aa874..931d2cf74ed859 100644 --- a/drivers/gpu/drm/i915/display/vlv_dsi.c +++ b/drivers/gpu/drm/i915/display/vlv_dsi.c @@ -1870,7 +1870,6 @@ static const struct dmi_system_id vlv_dsi_dmi_quirk_table[] = { /* Lenovo Yoga Tab 3 Pro YT3-X90F */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Intel Corporation"), - DMI_MATCH(DMI_PRODUCT_NAME, "CHERRYVIEW D1 PLATFORM"), DMI_MATCH(DMI_PRODUCT_VERSION, "Blade3-10A-001"), }, .driver_data = (void *)vlv_dsi_lenovo_yoga_tab3_backlight_fixup, From 2955ae8186c8a6f029e429f7890e0c7e5f6e215e Mon Sep 17 00:00:00 2001 From: John Harrison Date: Thu, 1 Aug 2024 20:10:51 -0700 Subject: [PATCH 762/991] drm/i915: ARL requires a newer GSC firmware ARL and MTL share a single GSC firmware blob. However, ARL requires a newer version of it. So add differentiate of the PCI ids for ARL from MTL and create ARL as a sub-platform of MTL. That way, all the existing workarounds and such still treat ARL as MTL exactly as before. However, now the GSC code can check for ARL and do an extra version check on the firmware before committing to it. Also, the version extraction code has various ways of failing but the return code was being ignore and so the firmware load would attempt to continue anyway. Fix that by propagating the return code to the next level out. Signed-off-by: John Harrison Fixes: 213c43676beb ("drm/i915/mtl: Remove the 'force_probe' requirement for Meteor Lake") Reviewed-by: Daniele Ceraolo Spurio Acked-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20240802031051.3816392-1-John.C.Harrison@Intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit 67733d7a71503fd3e32eeada371f8aa2516c5c95) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/gt/uc/intel_gsc_fw.c | 31 +++++++++++++++++++++++ drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c | 10 ++++++-- drivers/gpu/drm/i915/i915_drv.h | 2 ++ drivers/gpu/drm/i915/intel_device_info.c | 7 +++++ drivers/gpu/drm/i915/intel_device_info.h | 3 +++ include/drm/intel/i915_pciids.h | 11 +++++--- 6 files changed, 58 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_gsc_fw.c b/drivers/gpu/drm/i915/gt/uc/intel_gsc_fw.c index 3b69bc6616bd34..551b0d7974ff13 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_gsc_fw.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_gsc_fw.c @@ -212,6 +212,37 @@ int intel_gsc_fw_get_binary_info(struct intel_uc_fw *gsc_fw, const void *data, s } } + if (IS_ARROWLAKE(gt->i915)) { + bool too_old = false; + + /* + * ARL requires a newer firmware than MTL did (102.0.10.1878) but the + * firmware is actually common. So, need to do an explicit version check + * here rather than using a separate table entry. And if the older + * MTL-only version is found, then just don't use GSC rather than aborting + * the driver load. + */ + if (gsc->release.major < 102) { + too_old = true; + } else if (gsc->release.major == 102) { + if (gsc->release.minor == 0) { + if (gsc->release.patch < 10) { + too_old = true; + } else if (gsc->release.patch == 10) { + if (gsc->release.build < 1878) + too_old = true; + } + } + } + + if (too_old) { + gt_info(gt, "GSC firmware too old for ARL, got %d.%d.%d.%d but need at least 102.0.10.1878", + gsc->release.major, gsc->release.minor, + gsc->release.patch, gsc->release.build); + return -EINVAL; + } + } + return 0; } diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c index d80278eb45d734..ec33ad942115ab 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c @@ -698,12 +698,18 @@ static int check_gsc_manifest(struct intel_gt *gt, const struct firmware *fw, struct intel_uc_fw *uc_fw) { + int ret; + switch (uc_fw->type) { case INTEL_UC_FW_TYPE_HUC: - intel_huc_fw_get_binary_info(uc_fw, fw->data, fw->size); + ret = intel_huc_fw_get_binary_info(uc_fw, fw->data, fw->size); + if (ret) + return ret; break; case INTEL_UC_FW_TYPE_GSC: - intel_gsc_fw_get_binary_info(uc_fw, fw->data, fw->size); + ret = intel_gsc_fw_get_binary_info(uc_fw, fw->data, fw->size); + if (ret) + return ret; break; default: MISSING_CASE(uc_fw->type); diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index d7723dd11c8070..110340e02a0213 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -546,6 +546,8 @@ IS_SUBPLATFORM(const struct drm_i915_private *i915, #define IS_LUNARLAKE(i915) (0 && i915) #define IS_BATTLEMAGE(i915) (0 && i915) +#define IS_ARROWLAKE(i915) \ + IS_SUBPLATFORM(i915, INTEL_METEORLAKE, INTEL_SUBPLATFORM_ARL) #define IS_DG2_G10(i915) \ IS_SUBPLATFORM(i915, INTEL_DG2, INTEL_SUBPLATFORM_G10) #define IS_DG2_G11(i915) \ diff --git a/drivers/gpu/drm/i915/intel_device_info.c b/drivers/gpu/drm/i915/intel_device_info.c index d26de37719a72b..eede5417cb3fed 100644 --- a/drivers/gpu/drm/i915/intel_device_info.c +++ b/drivers/gpu/drm/i915/intel_device_info.c @@ -203,6 +203,10 @@ static const u16 subplatform_g12_ids[] = { INTEL_DG2_G12_IDS(ID), }; +static const u16 subplatform_arl_ids[] = { + INTEL_ARL_IDS(ID), +}; + static bool find_devid(u16 id, const u16 *p, unsigned int num) { for (; num; num--, p++) { @@ -260,6 +264,9 @@ static void intel_device_info_subplatform_init(struct drm_i915_private *i915) } else if (find_devid(devid, subplatform_g12_ids, ARRAY_SIZE(subplatform_g12_ids))) { mask = BIT(INTEL_SUBPLATFORM_G12); + } else if (find_devid(devid, subplatform_arl_ids, + ARRAY_SIZE(subplatform_arl_ids))) { + mask = BIT(INTEL_SUBPLATFORM_ARL); } GEM_BUG_ON(mask & ~INTEL_SUBPLATFORM_MASK); diff --git a/drivers/gpu/drm/i915/intel_device_info.h b/drivers/gpu/drm/i915/intel_device_info.h index d1a2abc7e51393..df73ef94615dd8 100644 --- a/drivers/gpu/drm/i915/intel_device_info.h +++ b/drivers/gpu/drm/i915/intel_device_info.h @@ -127,6 +127,9 @@ enum intel_platform { #define INTEL_SUBPLATFORM_N 1 #define INTEL_SUBPLATFORM_RPLU 2 +/* MTL */ +#define INTEL_SUBPLATFORM_ARL 0 + enum intel_ppgtt_type { INTEL_PPGTT_NONE = I915_GEM_PPGTT_NONE, INTEL_PPGTT_ALIASING = I915_GEM_PPGTT_ALIASING, diff --git a/include/drm/intel/i915_pciids.h b/include/drm/intel/i915_pciids.h index b21374f76df238..2bf03ebfcf73da 100644 --- a/include/drm/intel/i915_pciids.h +++ b/include/drm/intel/i915_pciids.h @@ -772,15 +772,18 @@ INTEL_ATS_M75_IDS(MACRO__, ## __VA_ARGS__) /* MTL */ +#define INTEL_ARL_IDS(MACRO__, ...) \ + MACRO__(0x7D41, ## __VA_ARGS__), \ + MACRO__(0x7D51, ## __VA_ARGS__), \ + MACRO__(0x7D67, ## __VA_ARGS__), \ + MACRO__(0x7DD1, ## __VA_ARGS__) + #define INTEL_MTL_IDS(MACRO__, ...) \ + INTEL_ARL_IDS(MACRO__, ## __VA_ARGS__), \ MACRO__(0x7D40, ## __VA_ARGS__), \ - MACRO__(0x7D41, ## __VA_ARGS__), \ MACRO__(0x7D45, ## __VA_ARGS__), \ - MACRO__(0x7D51, ## __VA_ARGS__), \ MACRO__(0x7D55, ## __VA_ARGS__), \ MACRO__(0x7D60, ## __VA_ARGS__), \ - MACRO__(0x7D67, ## __VA_ARGS__), \ - MACRO__(0x7DD1, ## __VA_ARGS__), \ MACRO__(0x7DD5, ## __VA_ARGS__) /* LNL */ From 4786fe29f5a0dd74d9ccdce8c734bde1fb88cf37 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Thu, 22 Aug 2024 12:25:57 -0700 Subject: [PATCH 763/991] ionic: Prevent tx_timeout due to frequent doorbell ringing With recent work to the doorbell workaround code a small hole was introduced that could cause a tx_timeout. This happens if the rx dbell_deadline goes beyond the netdev watchdog timeout set by the driver (i.e. 2 seconds). Fix this by changing the netdev watchdog timeout to 5 seconds and reduce the max rx dbell_deadline to 4 seconds. The test that can reproduce the issue being fixed is a multi-queue send test via pktgen with the "burst" setting to 1. This causes the queue's doorbell to be rung on every packet sent to the driver, which may result in the device missing doorbells due to the high doorbell rate. Cc: stable@vger.kernel.org Fixes: 4ded136c78f8 ("ionic: add work item for missed-doorbell check") Signed-off-by: Brett Creeley Reviewed-by: Shannon Nelson Link: https://patch.msgid.link/20240822192557.9089-1-brett.creeley@amd.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/pensando/ionic/ionic_dev.h | 2 +- drivers/net/ethernet/pensando/ionic/ionic_lif.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_dev.h b/drivers/net/ethernet/pensando/ionic/ionic_dev.h index c647033f3ad296..f2f07bf8854560 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_dev.h +++ b/drivers/net/ethernet/pensando/ionic/ionic_dev.h @@ -32,7 +32,7 @@ #define IONIC_ADMIN_DOORBELL_DEADLINE (HZ / 2) /* 500ms */ #define IONIC_TX_DOORBELL_DEADLINE (HZ / 100) /* 10ms */ #define IONIC_RX_MIN_DOORBELL_DEADLINE (HZ / 100) /* 10ms */ -#define IONIC_RX_MAX_DOORBELL_DEADLINE (HZ * 5) /* 5s */ +#define IONIC_RX_MAX_DOORBELL_DEADLINE (HZ * 4) /* 4s */ struct ionic_dev_bar { void __iomem *vaddr; diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index aa0cc31dfe6e6f..86774d9922d84d 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -3220,7 +3220,7 @@ int ionic_lif_alloc(struct ionic *ionic) netdev->netdev_ops = &ionic_netdev_ops; ionic_ethtool_set_ops(netdev); - netdev->watchdog_timeo = 2 * HZ; + netdev->watchdog_timeo = 5 * HZ; netif_carrier_off(netdev); lif->identity = lid; From 6b35cc8d9239569700cc7cc737c8ed40b8b9cfdb Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Aug 2024 17:00:20 -0700 Subject: [PATCH 764/991] xfs: use XFS_BUF_DADDR_NULL for daddrs in getfsmap code Use XFS_BUF_DADDR_NULL (instead of a magic sentinel value) to mean "this field is null" like the rest of xfs. Cc: wozizhi@huawei.com Fixes: e89c041338ed6 ("xfs: implement the GETFSMAP ioctl") Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/xfs_fsmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 3a30b36779db56..613a0ec2041209 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -252,7 +252,7 @@ xfs_getfsmap_rec_before_start( const struct xfs_rmap_irec *rec, xfs_daddr_t rec_daddr) { - if (info->low_daddr != -1ULL) + if (info->low_daddr != XFS_BUF_DADDR_NULL) return rec_daddr < info->low_daddr; if (info->low.rm_blockcount) return xfs_rmap_compare(rec, &info->low) < 0; @@ -983,7 +983,7 @@ xfs_getfsmap( info.dev = handlers[i].dev; info.last = false; info.pag = NULL; - info.low_daddr = -1ULL; + info.low_daddr = XFS_BUF_DADDR_NULL; info.low.rm_blockcount = 0; error = handlers[i].fn(tp, dkeys, &info); if (error) From ca6448aed4f10ad88eba79055f181eb9a589a7b3 Mon Sep 17 00:00:00 2001 From: Zizhi Wo Date: Thu, 22 Aug 2024 17:00:35 -0700 Subject: [PATCH 765/991] xfs: Fix missing interval for missing_owner in xfs fsmap In the fsmap query of xfs, there is an interval missing problem: [root@fedora ~]# xfs_io -c 'fsmap -vvvv' /mnt EXT: DEV BLOCK-RANGE OWNER FILE-OFFSET AG AG-OFFSET TOTAL 0: 253:16 [0..7]: static fs metadata 0 (0..7) 8 1: 253:16 [8..23]: per-AG metadata 0 (8..23) 16 2: 253:16 [24..39]: inode btree 0 (24..39) 16 3: 253:16 [40..47]: per-AG metadata 0 (40..47) 8 4: 253:16 [48..55]: refcount btree 0 (48..55) 8 5: 253:16 [56..103]: per-AG metadata 0 (56..103) 48 6: 253:16 [104..127]: free space 0 (104..127) 24 ...... BUG: [root@fedora ~]# xfs_io -c 'fsmap -vvvv -d 104 107' /mnt [root@fedora ~]# Normally, we should be able to get [104, 107), but we got nothing. The problem is caused by shifting. The query for the problem-triggered scenario is for the missing_owner interval (e.g. freespace in rmapbt/ unknown space in bnobt), which is obtained by subtraction (gap). For this scenario, the interval is obtained by info->last. However, rec_daddr is calculated based on the start_block recorded in key[1], which is converted by calling XFS_BB_TO_FSBT. Then if rec_daddr does not exceed info->next_daddr, which means keys[1].fmr_physical >> (mp)->m_blkbb_log <= info->next_daddr, no records will be displayed. In the above example, 104 >> (mp)->m_blkbb_log = 12 and 107 >> (mp)->m_blkbb_log = 12, so the two are reduced to 0 and the gap is ignored: before calculate ----------------> after shifting 104(st) 107(ed) 12(st/ed) |---------| | sector size block size Resolve this issue by introducing the "end_daddr" field in xfs_getfsmap_info. This records |key[1].fmr_physical + key[1].length| at the granularity of sector. If the current query is the last, the rec_daddr is end_daddr to prevent missing interval problems caused by shifting. We only need to focus on the last query, because xfs disks are internally aligned with disk blocksize that are powers of two and minimum 512, so there is no problem with shifting in previous queries. After applying this patch, the above problem have been solved: [root@fedora ~]# xfs_io -c 'fsmap -vvvv -d 104 107' /mnt EXT: DEV BLOCK-RANGE OWNER FILE-OFFSET AG AG-OFFSET TOTAL 0: 253:16 [104..106]: free space 0 (104..106) 3 Fixes: e89c041338ed ("xfs: implement the GETFSMAP ioctl") Signed-off-by: Zizhi Wo Reviewed-by: Darrick J. Wong [djwong: limit the range of end_addr correctly] Signed-off-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/xfs_fsmap.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 613a0ec2041209..71f32354944e45 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -162,6 +162,7 @@ struct xfs_getfsmap_info { xfs_daddr_t next_daddr; /* next daddr we expect */ /* daddr of low fsmap key when we're using the rtbitmap */ xfs_daddr_t low_daddr; + xfs_daddr_t end_daddr; /* daddr of high fsmap key */ u64 missing_owner; /* owner of holes */ u32 dev; /* device id */ /* @@ -182,6 +183,7 @@ struct xfs_getfsmap_dev { int (*fn)(struct xfs_trans *tp, const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info); + sector_t nr_sectors; }; /* Compare two getfsmap device handlers. */ @@ -294,6 +296,18 @@ xfs_getfsmap_helper( return 0; } + /* + * For an info->last query, we're looking for a gap between the last + * mapping emitted and the high key specified by userspace. If the + * user's query spans less than 1 fsblock, then info->high and + * info->low will have the same rm_startblock, which causes rec_daddr + * and next_daddr to be the same. Therefore, use the end_daddr that + * we calculated from userspace's high key to synthesize the record. + * Note that if the btree query found a mapping, there won't be a gap. + */ + if (info->last && info->end_daddr != XFS_BUF_DADDR_NULL) + rec_daddr = info->end_daddr; + /* Are we just counting mappings? */ if (info->head->fmh_count == 0) { if (info->head->fmh_entries == UINT_MAX) @@ -904,17 +918,21 @@ xfs_getfsmap( /* Set up our device handlers. */ memset(handlers, 0, sizeof(handlers)); + handlers[0].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev); if (use_rmap) handlers[0].fn = xfs_getfsmap_datadev_rmapbt; else handlers[0].fn = xfs_getfsmap_datadev_bnobt; if (mp->m_logdev_targp != mp->m_ddev_targp) { + handlers[1].nr_sectors = XFS_FSB_TO_BB(mp, + mp->m_sb.sb_logblocks); handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev); handlers[1].fn = xfs_getfsmap_logdev; } #ifdef CONFIG_XFS_RT if (mp->m_rtdev_targp) { + handlers[2].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev); handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap; } @@ -946,6 +964,7 @@ xfs_getfsmap( info.next_daddr = head->fmh_keys[0].fmr_physical + head->fmh_keys[0].fmr_length; + info.end_daddr = XFS_BUF_DADDR_NULL; info.fsmap_recs = fsmap_recs; info.head = head; @@ -966,8 +985,11 @@ xfs_getfsmap( * low key, zero out the low key so that we get * everything from the beginning. */ - if (handlers[i].dev == head->fmh_keys[1].fmr_device) + if (handlers[i].dev == head->fmh_keys[1].fmr_device) { dkeys[1] = head->fmh_keys[1]; + info.end_daddr = min(handlers[i].nr_sectors - 1, + dkeys[1].fmr_physical); + } if (handlers[i].dev > head->fmh_keys[0].fmr_device) memset(&dkeys[0], 0, sizeof(struct xfs_fsmap)); From 16e1fbdce9c8d084863fd63cdaff8fb2a54e2f88 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Aug 2024 17:00:51 -0700 Subject: [PATCH 766/991] xfs: take m_growlock when running growfsrt Take the grow lock when we're expanding the realtime volume, like we do for the other growfs calls. Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/xfs_rtalloc.c | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 0c3e96c621a672..776d6c401f62fb 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -821,34 +821,39 @@ xfs_growfs_rt( /* Needs to have been mounted with an rt device. */ if (!XFS_IS_REALTIME_MOUNT(mp)) return -EINVAL; + + if (!mutex_trylock(&mp->m_growlock)) + return -EWOULDBLOCK; /* * Mount should fail if the rt bitmap/summary files don't load, but * we'll check anyway. */ + error = -EINVAL; if (!mp->m_rbmip || !mp->m_rsumip) - return -EINVAL; + goto out_unlock; /* Shrink not supported. */ if (in->newblocks <= sbp->sb_rblocks) - return -EINVAL; + goto out_unlock; /* Can only change rt extent size when adding rt volume. */ if (sbp->sb_rblocks > 0 && in->extsize != sbp->sb_rextsize) - return -EINVAL; + goto out_unlock; /* Range check the extent size. */ if (XFS_FSB_TO_B(mp, in->extsize) > XFS_MAX_RTEXTSIZE || XFS_FSB_TO_B(mp, in->extsize) < XFS_MIN_RTEXTSIZE) - return -EINVAL; + goto out_unlock; /* Unsupported realtime features. */ + error = -EOPNOTSUPP; if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp) || xfs_has_quota(mp)) - return -EOPNOTSUPP; + goto out_unlock; nrblocks = in->newblocks; error = xfs_sb_validate_fsb_count(sbp, nrblocks); if (error) - return error; + goto out_unlock; /* * Read in the last block of the device, make sure it exists. */ @@ -856,7 +861,7 @@ xfs_growfs_rt( XFS_FSB_TO_BB(mp, nrblocks - 1), XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); if (error) - return error; + goto out_unlock; xfs_buf_relse(bp); /* @@ -864,8 +869,10 @@ xfs_growfs_rt( */ nrextents = nrblocks; do_div(nrextents, in->extsize); - if (!xfs_validate_rtextents(nrextents)) - return -EINVAL; + if (!xfs_validate_rtextents(nrextents)) { + error = -EINVAL; + goto out_unlock; + } nrbmblocks = xfs_rtbitmap_blockcount(mp, nrextents); nrextslog = xfs_compute_rextslog(nrextents); nrsumlevels = nrextslog + 1; @@ -876,8 +883,11 @@ xfs_growfs_rt( * the log. This prevents us from getting a log overflow, * since we'll log basically the whole summary file at once. */ - if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1)) - return -EINVAL; + if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1)) { + error = -EINVAL; + goto out_unlock; + } + /* * Get the old block counts for bitmap and summary inodes. * These can't change since other growfs callers are locked out. @@ -889,10 +899,10 @@ xfs_growfs_rt( */ error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, mp->m_rbmip); if (error) - return error; + goto out_unlock; error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip); if (error) - return error; + goto out_unlock; rsum_cache = mp->m_rsum_cache; if (nrbmblocks != sbp->sb_rbmblocks) @@ -1059,6 +1069,8 @@ xfs_growfs_rt( } } +out_unlock: + mutex_unlock(&mp->m_growlock); return error; } From a24cae8fc1f13f6f6929351309f248fd2e9351ce Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Aug 2024 17:01:07 -0700 Subject: [PATCH 767/991] xfs: reset rootdir extent size hint after growfsrt If growfsrt is run on a filesystem that doesn't have a rt volume, it's possible to change the rt extent size. If the root directory was previously set up with an inherited extent size hint and rtinherit, it's possible that the hint is no longer a multiple of the rt extent size. Although the verifiers don't complain about this, xfs_repair will, so if we detect this situation, log the root directory to clean it up. This is still racy, but it's better than nothing. Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/xfs_rtalloc.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 776d6c401f62fb..ebeab8e4dab101 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -784,6 +784,39 @@ xfs_alloc_rsum_cache( xfs_warn(mp, "could not allocate realtime summary cache"); } +/* + * If we changed the rt extent size (meaning there was no rt volume previously) + * and the root directory had EXTSZINHERIT and RTINHERIT set, it's possible + * that the extent size hint on the root directory is no longer congruent with + * the new rt extent size. Log the rootdir inode to fix this. + */ +static int +xfs_growfs_rt_fixup_extsize( + struct xfs_mount *mp) +{ + struct xfs_inode *ip = mp->m_rootip; + struct xfs_trans *tp; + int error = 0; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + if (!(ip->i_diflags & XFS_DIFLAG_RTINHERIT) || + !(ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT)) + goto out_iolock; + + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_ichange, 0, 0, false, + &tp); + if (error) + goto out_iolock; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + +out_iolock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; +} + /* * Visible (exported) functions. */ @@ -812,6 +845,7 @@ xfs_growfs_rt( xfs_extlen_t rsumblocks; /* current number of rt summary blks */ xfs_sb_t *sbp; /* old superblock */ uint8_t *rsum_cache; /* old summary cache */ + xfs_agblock_t old_rextsize = mp->m_sb.sb_rextsize; sbp = &mp->m_sb; @@ -1046,6 +1080,12 @@ xfs_growfs_rt( if (error) goto out_free; + if (old_rextsize != in->extsize) { + error = xfs_growfs_rt_fixup_extsize(mp); + if (error) + goto out_free; + } + /* Update secondary superblocks now the physical grow has completed */ error = xfs_update_secondary_sbs(mp); From 1eb52589a299f8b29df0f214206da6616e33a8b6 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Tue, 20 Aug 2024 09:01:29 -0700 Subject: [PATCH 768/991] drm/xe: Invalidate media_gt TLBs Testing on LNL has shown media TLBs need to be invalidated via the GuC, update xe_vm_invalidate_vma appropriately. v2: Fix 2 tile case v3: Include missing local change Fixes: 3330361543fc ("drm/xe/lnl: Add LNL platform definition") Signed-off-by: Matthew Brost Reviewed-by: Himal Prasad Ghimiray Link: https://patchwork.freedesktop.org/patch/msgid/20240820160129.986889-1-matthew.brost@intel.com (cherry picked from commit 77cc3f6c58b1b28cee73904946c46a1415187d04) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_vm.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index c7561a56abaf20..50e8fc49ba6c15 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -3341,9 +3341,10 @@ int xe_vm_invalidate_vma(struct xe_vma *vma) { struct xe_device *xe = xe_vma_vm(vma)->xe; struct xe_tile *tile; - struct xe_gt_tlb_invalidation_fence fence[XE_MAX_TILES_PER_DEVICE]; - u32 tile_needs_invalidate = 0; + struct xe_gt_tlb_invalidation_fence + fence[XE_MAX_TILES_PER_DEVICE * XE_MAX_GT_PER_TILE]; u8 id; + u32 fence_id = 0; int ret = 0; xe_assert(xe, !xe_vma_is_null(vma)); @@ -3371,27 +3372,37 @@ int xe_vm_invalidate_vma(struct xe_vma *vma) if (xe_pt_zap_ptes(tile, vma)) { xe_device_wmb(xe); xe_gt_tlb_invalidation_fence_init(tile->primary_gt, - &fence[id], true); + &fence[fence_id], + true); - /* - * FIXME: We potentially need to invalidate multiple - * GTs within the tile - */ ret = xe_gt_tlb_invalidation_vma(tile->primary_gt, - &fence[id], vma); + &fence[fence_id], vma); if (ret < 0) { - xe_gt_tlb_invalidation_fence_fini(&fence[id]); + xe_gt_tlb_invalidation_fence_fini(&fence[fence_id]); goto wait; } + ++fence_id; - tile_needs_invalidate |= BIT(id); + if (!tile->media_gt) + continue; + + xe_gt_tlb_invalidation_fence_init(tile->media_gt, + &fence[fence_id], + true); + + ret = xe_gt_tlb_invalidation_vma(tile->media_gt, + &fence[fence_id], vma); + if (ret < 0) { + xe_gt_tlb_invalidation_fence_fini(&fence[fence_id]); + goto wait; + } + ++fence_id; } } wait: - for_each_tile(tile, xe, id) - if (tile_needs_invalidate & BIT(id)) - xe_gt_tlb_invalidation_fence_wait(&fence[id]); + for (id = 0; id < fence_id; ++id) + xe_gt_tlb_invalidation_fence_wait(&fence[id]); vma->tile_invalidated = vma->tile_mask; From bc21000e99f92a6b5540d7267c6b22806c5c33d3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 24 Aug 2024 18:19:01 +0000 Subject: [PATCH 769/991] net_sched: sch_fq: fix incorrect behavior for small weights fq_dequeue() has a complex logic to find packets in one of the 3 bands. As Neal found out, it is possible that one band has a deficit smaller than its weight. fq_dequeue() can return NULL while some packets are elligible for immediate transmit. In this case, more than one iteration is needed to refill pband->credit. With default parameters (weights 589824 196608 65536) bug can trigger if large BIG TCP packets are sent to the lowest priority band. Bisected-by: John Sperbeck Diagnosed-by: Neal Cardwell Fixes: 29f834aa326e ("net_sched: sch_fq: add 3 bands and WRR scheduling") Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20240824181901.953776-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/sch_fq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 23897472567932..19a49af5a9e527 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -663,7 +663,9 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) pband = &q->band_flows[q->band_nr]; pband->credit = min(pband->credit + pband->quantum, pband->quantum); - goto begin; + if (pband->credit > 0) + goto begin; + retry = 0; } if (q->time_next_delayed_flow != ~0ULL) qdisc_watchdog_schedule_range_ns(&q->watchdog, From 70c261d500951cf3ea0fcf32651aab9a65a91471 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 26 Aug 2024 15:03:23 +0200 Subject: [PATCH 770/991] netfilter: nf_tables_ipv6: consider network offset in netdev/egress validation From netdev/egress, skb->len can include the ethernet header, therefore, subtract network offset from skb->len when validating IPv6 packet length. Fixes: 42df6e1d221d ("netfilter: Introduce egress hook") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_ipv6.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h index 467d59b9e5334c..a0633eeaec977b 100644 --- a/include/net/netfilter/nf_tables_ipv6.h +++ b/include/net/netfilter/nf_tables_ipv6.h @@ -31,8 +31,8 @@ static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt) struct ipv6hdr *ip6h, _ip6h; unsigned int thoff = 0; unsigned short frag_off; + u32 pkt_len, skb_len; int protohdr; - u32 pkt_len; ip6h = skb_header_pointer(pkt->skb, skb_network_offset(pkt->skb), sizeof(*ip6h), &_ip6h); @@ -43,7 +43,8 @@ static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt) return -1; pkt_len = ntohs(ip6h->payload_len); - if (pkt_len + sizeof(*ip6h) > pkt->skb->len) + skb_len = pkt->skb->len - skb_network_offset(pkt->skb); + if (pkt_len + sizeof(*ip6h) > skb_len) return -1; protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); From 08d08e2e9f0ad1af0044e4747723f66677c35ee9 Mon Sep 17 00:00:00 2001 From: Stefan Berger Date: Mon, 29 Jul 2024 09:29:34 -0400 Subject: [PATCH 771/991] tpm: ibmvtpm: Call tpm2_sessions_init() to initialize session support Commit d2add27cf2b8 ("tpm: Add NULL primary creation") introduced CONFIG_TCG_TPM2_HMAC. When this option is enabled on ppc64 then the following message appears in the kernel log due to a missing call to tpm2_sessions_init(). [ 2.654549] tpm tpm0: auth session is not active Add the missing call to tpm2_session_init() to the ibmvtpm driver to resolve this issue. Cc: stable@vger.kernel.org # v6.10+ Fixes: d2add27cf2b8 ("tpm: Add NULL primary creation") Signed-off-by: Stefan Berger Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_ibmvtpm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/char/tpm/tpm_ibmvtpm.c b/drivers/char/tpm/tpm_ibmvtpm.c index d3989b257f4222..1e5b107d1f3bdb 100644 --- a/drivers/char/tpm/tpm_ibmvtpm.c +++ b/drivers/char/tpm/tpm_ibmvtpm.c @@ -698,6 +698,10 @@ static int tpm_ibmvtpm_probe(struct vio_dev *vio_dev, rc = tpm2_get_cc_attrs_tbl(chip); if (rc) goto init_irq_cleanup; + + rc = tpm2_sessions_init(chip); + if (rc) + goto init_irq_cleanup; } return tpm_chip_register(chip); From e8497d6951ee8541d73784f9aac9942a7f239980 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 23 Aug 2024 18:25:37 +0200 Subject: [PATCH 772/991] selftests: forwarding: no_forwarding: Down ports on cleanup This test neglects to put ports down on cleanup. Fix it. Fixes: 476a4f05d9b8 ("selftests: forwarding: add a no_forwarding.sh test") Signed-off-by: Petr Machata Reviewed-by: Simon Horman Link: https://patch.msgid.link/0baf91dc24b95ae0cadfdf5db05b74888e6a228a.1724430120.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/no_forwarding.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/net/forwarding/no_forwarding.sh b/tools/testing/selftests/net/forwarding/no_forwarding.sh index af3b398d13f01a..9e677aa64a06a6 100755 --- a/tools/testing/selftests/net/forwarding/no_forwarding.sh +++ b/tools/testing/selftests/net/forwarding/no_forwarding.sh @@ -233,6 +233,9 @@ cleanup() { pre_cleanup + ip link set dev $swp2 down + ip link set dev $swp1 down + h2_destroy h1_destroy From 65a3cce43d5b4c53cf16b0be1a03991f665a0806 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Mon, 26 Aug 2024 19:15:11 +0200 Subject: [PATCH 773/991] selftests: forwarding: local_termination: Down ports on cleanup This test neglects to put ports down on cleanup. Fix it. Fixes: 90b9566aa5cd ("selftests: forwarding: add a test for local_termination.sh") Signed-off-by: Petr Machata Link: https://patch.msgid.link/bf9b79f45de378f88344d44550f0a5052b386199.1724692132.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/local_termination.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index 648868f7460442..c35548767756d0 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -571,6 +571,10 @@ vlan_over_vlan_aware_bridge() cleanup() { pre_cleanup + + ip link set $h2 down + ip link set $h1 down + vrf_cleanup } From ec13009472f4a756288eb4e18e20a7845da98d10 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Fri, 23 Aug 2024 06:10:54 +0300 Subject: [PATCH 774/991] bonding: implement xdo_dev_state_free and call it after deletion Add this implementation for bonding, so hardware resources can be freed from the active slave after xfrm state is deleted. The netdev used to invoke xdo_dev_state_free callback, is saved in the xfrm state (xs->xso.real_dev), which is also the bond's active slave. To prevent it from being freed, acquire netdev reference before leaving RCU read-side critical section, and release it after callback is done. And call it when deleting all SAs from old active real interface while switching current active slave. Fixes: 9a5605505d9c ("bonding: Add struct bond_ipesc to manage SA") Signed-off-by: Jianbo Liu Signed-off-by: Tariq Toukan Reviewed-by: Hangbin Liu Acked-by: Jay Vosburgh Link: https://patch.msgid.link/20240823031056.110999-2-jianbol@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 36 +++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index f74bacf071fca0..2b4b7ad9cd2de1 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -581,12 +581,47 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) __func__); } else { slave->dev->xfrmdev_ops->xdo_dev_state_delete(ipsec->xs); + if (slave->dev->xfrmdev_ops->xdo_dev_state_free) + slave->dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs); } } spin_unlock_bh(&bond->ipsec_lock); rcu_read_unlock(); } +static void bond_ipsec_free_sa(struct xfrm_state *xs) +{ + struct net_device *bond_dev = xs->xso.dev; + struct net_device *real_dev; + netdevice_tracker tracker; + struct bonding *bond; + struct slave *slave; + + if (!bond_dev) + return; + + rcu_read_lock(); + bond = netdev_priv(bond_dev); + slave = rcu_dereference(bond->curr_active_slave); + real_dev = slave ? slave->dev : NULL; + netdev_hold(real_dev, &tracker, GFP_ATOMIC); + rcu_read_unlock(); + + if (!slave) + goto out; + + if (!xs->xso.real_dev) + goto out; + + WARN_ON(xs->xso.real_dev != real_dev); + + if (real_dev && real_dev->xfrmdev_ops && + real_dev->xfrmdev_ops->xdo_dev_state_free) + real_dev->xfrmdev_ops->xdo_dev_state_free(xs); +out: + netdev_put(real_dev, &tracker); +} + /** * bond_ipsec_offload_ok - can this packet use the xfrm hw offload * @skb: current data packet @@ -627,6 +662,7 @@ static bool bond_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *xs) static const struct xfrmdev_ops bond_xfrmdev_ops = { .xdo_dev_state_add = bond_ipsec_add_sa, .xdo_dev_state_delete = bond_ipsec_del_sa, + .xdo_dev_state_free = bond_ipsec_free_sa, .xdo_dev_offload_ok = bond_ipsec_offload_ok, }; #endif /* CONFIG_XFRM_OFFLOAD */ From 907ed83a7583e8ffede88c5ac088392701a7d458 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Fri, 23 Aug 2024 06:10:55 +0300 Subject: [PATCH 775/991] bonding: extract the use of real_device into local variable Add a local variable for slave->dev, to prepare for the lock change in the next patch. There is no functionality change. Fixes: 9a5605505d9c ("bonding: Add struct bond_ipesc to manage SA") Signed-off-by: Jianbo Liu Reviewed-by: Cosmin Ratiu Signed-off-by: Tariq Toukan Reviewed-by: Hangbin Liu Acked-by: Jay Vosburgh Link: https://patch.msgid.link/20240823031056.110999-3-jianbol@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 58 +++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 2b4b7ad9cd2de1..f9849174842091 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -427,6 +427,7 @@ static int bond_ipsec_add_sa(struct xfrm_state *xs, struct netlink_ext_ack *extack) { struct net_device *bond_dev = xs->xso.dev; + struct net_device *real_dev; struct bond_ipsec *ipsec; struct bonding *bond; struct slave *slave; @@ -443,9 +444,10 @@ static int bond_ipsec_add_sa(struct xfrm_state *xs, return -ENODEV; } - if (!slave->dev->xfrmdev_ops || - !slave->dev->xfrmdev_ops->xdo_dev_state_add || - netif_is_bond_master(slave->dev)) { + real_dev = slave->dev; + if (!real_dev->xfrmdev_ops || + !real_dev->xfrmdev_ops->xdo_dev_state_add || + netif_is_bond_master(real_dev)) { NL_SET_ERR_MSG_MOD(extack, "Slave does not support ipsec offload"); rcu_read_unlock(); return -EINVAL; @@ -456,9 +458,9 @@ static int bond_ipsec_add_sa(struct xfrm_state *xs, rcu_read_unlock(); return -ENOMEM; } - xs->xso.real_dev = slave->dev; - err = slave->dev->xfrmdev_ops->xdo_dev_state_add(xs, extack); + xs->xso.real_dev = real_dev; + err = real_dev->xfrmdev_ops->xdo_dev_state_add(xs, extack); if (!err) { ipsec->xs = xs; INIT_LIST_HEAD(&ipsec->list); @@ -475,6 +477,7 @@ static int bond_ipsec_add_sa(struct xfrm_state *xs, static void bond_ipsec_add_sa_all(struct bonding *bond) { struct net_device *bond_dev = bond->dev; + struct net_device *real_dev; struct bond_ipsec *ipsec; struct slave *slave; @@ -483,12 +486,13 @@ static void bond_ipsec_add_sa_all(struct bonding *bond) if (!slave) goto out; - if (!slave->dev->xfrmdev_ops || - !slave->dev->xfrmdev_ops->xdo_dev_state_add || - netif_is_bond_master(slave->dev)) { + real_dev = slave->dev; + if (!real_dev->xfrmdev_ops || + !real_dev->xfrmdev_ops->xdo_dev_state_add || + netif_is_bond_master(real_dev)) { spin_lock_bh(&bond->ipsec_lock); if (!list_empty(&bond->ipsec_list)) - slave_warn(bond_dev, slave->dev, + slave_warn(bond_dev, real_dev, "%s: no slave xdo_dev_state_add\n", __func__); spin_unlock_bh(&bond->ipsec_lock); @@ -497,9 +501,9 @@ static void bond_ipsec_add_sa_all(struct bonding *bond) spin_lock_bh(&bond->ipsec_lock); list_for_each_entry(ipsec, &bond->ipsec_list, list) { - ipsec->xs->xso.real_dev = slave->dev; - if (slave->dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs, NULL)) { - slave_warn(bond_dev, slave->dev, "%s: failed to add SA\n", __func__); + ipsec->xs->xso.real_dev = real_dev; + if (real_dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs, NULL)) { + slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__); ipsec->xs->xso.real_dev = NULL; } } @@ -515,6 +519,7 @@ static void bond_ipsec_add_sa_all(struct bonding *bond) static void bond_ipsec_del_sa(struct xfrm_state *xs) { struct net_device *bond_dev = xs->xso.dev; + struct net_device *real_dev; struct bond_ipsec *ipsec; struct bonding *bond; struct slave *slave; @@ -532,16 +537,17 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs) if (!xs->xso.real_dev) goto out; - WARN_ON(xs->xso.real_dev != slave->dev); + real_dev = slave->dev; + WARN_ON(xs->xso.real_dev != real_dev); - if (!slave->dev->xfrmdev_ops || - !slave->dev->xfrmdev_ops->xdo_dev_state_delete || - netif_is_bond_master(slave->dev)) { - slave_warn(bond_dev, slave->dev, "%s: no slave xdo_dev_state_delete\n", __func__); + if (!real_dev->xfrmdev_ops || + !real_dev->xfrmdev_ops->xdo_dev_state_delete || + netif_is_bond_master(real_dev)) { + slave_warn(bond_dev, real_dev, "%s: no slave xdo_dev_state_delete\n", __func__); goto out; } - slave->dev->xfrmdev_ops->xdo_dev_state_delete(xs); + real_dev->xfrmdev_ops->xdo_dev_state_delete(xs); out: spin_lock_bh(&bond->ipsec_lock); list_for_each_entry(ipsec, &bond->ipsec_list, list) { @@ -558,6 +564,7 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs) static void bond_ipsec_del_sa_all(struct bonding *bond) { struct net_device *bond_dev = bond->dev; + struct net_device *real_dev; struct bond_ipsec *ipsec; struct slave *slave; @@ -568,21 +575,22 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) return; } + real_dev = slave->dev; spin_lock_bh(&bond->ipsec_lock); list_for_each_entry(ipsec, &bond->ipsec_list, list) { if (!ipsec->xs->xso.real_dev) continue; - if (!slave->dev->xfrmdev_ops || - !slave->dev->xfrmdev_ops->xdo_dev_state_delete || - netif_is_bond_master(slave->dev)) { - slave_warn(bond_dev, slave->dev, + if (!real_dev->xfrmdev_ops || + !real_dev->xfrmdev_ops->xdo_dev_state_delete || + netif_is_bond_master(real_dev)) { + slave_warn(bond_dev, real_dev, "%s: no slave xdo_dev_state_delete\n", __func__); } else { - slave->dev->xfrmdev_ops->xdo_dev_state_delete(ipsec->xs); - if (slave->dev->xfrmdev_ops->xdo_dev_state_free) - slave->dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs); + real_dev->xfrmdev_ops->xdo_dev_state_delete(ipsec->xs); + if (real_dev->xfrmdev_ops->xdo_dev_state_free) + real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs); } } spin_unlock_bh(&bond->ipsec_lock); From 2aeeef906d5a526dc60cf4af92eda69836c39b1f Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Fri, 23 Aug 2024 06:10:56 +0300 Subject: [PATCH 776/991] bonding: change ipsec_lock from spin lock to mutex In the cited commit, bond->ipsec_lock is added to protect ipsec_list, hence xdo_dev_state_add and xdo_dev_state_delete are called inside this lock. As ipsec_lock is a spin lock and such xfrmdev ops may sleep, "scheduling while atomic" will be triggered when changing bond's active slave. [ 101.055189] BUG: scheduling while atomic: bash/902/0x00000200 [ 101.055726] Modules linked in: [ 101.058211] CPU: 3 PID: 902 Comm: bash Not tainted 6.9.0-rc4+ #1 [ 101.058760] Hardware name: [ 101.059434] Call Trace: [ 101.059436] [ 101.060873] dump_stack_lvl+0x51/0x60 [ 101.061275] __schedule_bug+0x4e/0x60 [ 101.061682] __schedule+0x612/0x7c0 [ 101.062078] ? __mod_timer+0x25c/0x370 [ 101.062486] schedule+0x25/0xd0 [ 101.062845] schedule_timeout+0x77/0xf0 [ 101.063265] ? asm_common_interrupt+0x22/0x40 [ 101.063724] ? __bpf_trace_itimer_state+0x10/0x10 [ 101.064215] __wait_for_common+0x87/0x190 [ 101.064648] ? usleep_range_state+0x90/0x90 [ 101.065091] cmd_exec+0x437/0xb20 [mlx5_core] [ 101.065569] mlx5_cmd_do+0x1e/0x40 [mlx5_core] [ 101.066051] mlx5_cmd_exec+0x18/0x30 [mlx5_core] [ 101.066552] mlx5_crypto_create_dek_key+0xea/0x120 [mlx5_core] [ 101.067163] ? bonding_sysfs_store_option+0x4d/0x80 [bonding] [ 101.067738] ? kmalloc_trace+0x4d/0x350 [ 101.068156] mlx5_ipsec_create_sa_ctx+0x33/0x100 [mlx5_core] [ 101.068747] mlx5e_xfrm_add_state+0x47b/0xaa0 [mlx5_core] [ 101.069312] bond_change_active_slave+0x392/0x900 [bonding] [ 101.069868] bond_option_active_slave_set+0x1c2/0x240 [bonding] [ 101.070454] __bond_opt_set+0xa6/0x430 [bonding] [ 101.070935] __bond_opt_set_notify+0x2f/0x90 [bonding] [ 101.071453] bond_opt_tryset_rtnl+0x72/0xb0 [bonding] [ 101.071965] bonding_sysfs_store_option+0x4d/0x80 [bonding] [ 101.072567] kernfs_fop_write_iter+0x10c/0x1a0 [ 101.073033] vfs_write+0x2d8/0x400 [ 101.073416] ? alloc_fd+0x48/0x180 [ 101.073798] ksys_write+0x5f/0xe0 [ 101.074175] do_syscall_64+0x52/0x110 [ 101.074576] entry_SYSCALL_64_after_hwframe+0x4b/0x53 As bond_ipsec_add_sa_all and bond_ipsec_del_sa_all are only called from bond_change_active_slave, which requires holding the RTNL lock. And bond_ipsec_add_sa and bond_ipsec_del_sa are xfrm state xdo_dev_state_add and xdo_dev_state_delete APIs, which are in user context. So ipsec_lock doesn't have to be spin lock, change it to mutex, and thus the above issue can be resolved. Fixes: 9a5605505d9c ("bonding: Add struct bond_ipesc to manage SA") Signed-off-by: Jianbo Liu Signed-off-by: Tariq Toukan Reviewed-by: Hangbin Liu Acked-by: Jay Vosburgh Link: https://patch.msgid.link/20240823031056.110999-4-jianbol@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 79 ++++++++++++++++++--------------- include/net/bonding.h | 2 +- 2 files changed, 44 insertions(+), 37 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index f9849174842091..bb9c3d6ef43592 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -428,6 +428,7 @@ static int bond_ipsec_add_sa(struct xfrm_state *xs, { struct net_device *bond_dev = xs->xso.dev; struct net_device *real_dev; + netdevice_tracker tracker; struct bond_ipsec *ipsec; struct bonding *bond; struct slave *slave; @@ -439,24 +440,26 @@ static int bond_ipsec_add_sa(struct xfrm_state *xs, rcu_read_lock(); bond = netdev_priv(bond_dev); slave = rcu_dereference(bond->curr_active_slave); - if (!slave) { - rcu_read_unlock(); - return -ENODEV; + real_dev = slave ? slave->dev : NULL; + netdev_hold(real_dev, &tracker, GFP_ATOMIC); + rcu_read_unlock(); + if (!real_dev) { + err = -ENODEV; + goto out; } - real_dev = slave->dev; if (!real_dev->xfrmdev_ops || !real_dev->xfrmdev_ops->xdo_dev_state_add || netif_is_bond_master(real_dev)) { NL_SET_ERR_MSG_MOD(extack, "Slave does not support ipsec offload"); - rcu_read_unlock(); - return -EINVAL; + err = -EINVAL; + goto out; } - ipsec = kmalloc(sizeof(*ipsec), GFP_ATOMIC); + ipsec = kmalloc(sizeof(*ipsec), GFP_KERNEL); if (!ipsec) { - rcu_read_unlock(); - return -ENOMEM; + err = -ENOMEM; + goto out; } xs->xso.real_dev = real_dev; @@ -464,13 +467,14 @@ static int bond_ipsec_add_sa(struct xfrm_state *xs, if (!err) { ipsec->xs = xs; INIT_LIST_HEAD(&ipsec->list); - spin_lock_bh(&bond->ipsec_lock); + mutex_lock(&bond->ipsec_lock); list_add(&ipsec->list, &bond->ipsec_list); - spin_unlock_bh(&bond->ipsec_lock); + mutex_unlock(&bond->ipsec_lock); } else { kfree(ipsec); } - rcu_read_unlock(); +out: + netdev_put(real_dev, &tracker); return err; } @@ -481,35 +485,35 @@ static void bond_ipsec_add_sa_all(struct bonding *bond) struct bond_ipsec *ipsec; struct slave *slave; - rcu_read_lock(); - slave = rcu_dereference(bond->curr_active_slave); - if (!slave) - goto out; + slave = rtnl_dereference(bond->curr_active_slave); + real_dev = slave ? slave->dev : NULL; + if (!real_dev) + return; - real_dev = slave->dev; + mutex_lock(&bond->ipsec_lock); if (!real_dev->xfrmdev_ops || !real_dev->xfrmdev_ops->xdo_dev_state_add || netif_is_bond_master(real_dev)) { - spin_lock_bh(&bond->ipsec_lock); if (!list_empty(&bond->ipsec_list)) slave_warn(bond_dev, real_dev, "%s: no slave xdo_dev_state_add\n", __func__); - spin_unlock_bh(&bond->ipsec_lock); goto out; } - spin_lock_bh(&bond->ipsec_lock); list_for_each_entry(ipsec, &bond->ipsec_list, list) { + /* If new state is added before ipsec_lock acquired */ + if (ipsec->xs->xso.real_dev == real_dev) + continue; + ipsec->xs->xso.real_dev = real_dev; if (real_dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs, NULL)) { slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__); ipsec->xs->xso.real_dev = NULL; } } - spin_unlock_bh(&bond->ipsec_lock); out: - rcu_read_unlock(); + mutex_unlock(&bond->ipsec_lock); } /** @@ -520,6 +524,7 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs) { struct net_device *bond_dev = xs->xso.dev; struct net_device *real_dev; + netdevice_tracker tracker; struct bond_ipsec *ipsec; struct bonding *bond; struct slave *slave; @@ -530,6 +535,9 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs) rcu_read_lock(); bond = netdev_priv(bond_dev); slave = rcu_dereference(bond->curr_active_slave); + real_dev = slave ? slave->dev : NULL; + netdev_hold(real_dev, &tracker, GFP_ATOMIC); + rcu_read_unlock(); if (!slave) goto out; @@ -537,7 +545,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs) if (!xs->xso.real_dev) goto out; - real_dev = slave->dev; WARN_ON(xs->xso.real_dev != real_dev); if (!real_dev->xfrmdev_ops || @@ -549,7 +556,8 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs) real_dev->xfrmdev_ops->xdo_dev_state_delete(xs); out: - spin_lock_bh(&bond->ipsec_lock); + netdev_put(real_dev, &tracker); + mutex_lock(&bond->ipsec_lock); list_for_each_entry(ipsec, &bond->ipsec_list, list) { if (ipsec->xs == xs) { list_del(&ipsec->list); @@ -557,8 +565,7 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs) break; } } - spin_unlock_bh(&bond->ipsec_lock); - rcu_read_unlock(); + mutex_unlock(&bond->ipsec_lock); } static void bond_ipsec_del_sa_all(struct bonding *bond) @@ -568,15 +575,12 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) struct bond_ipsec *ipsec; struct slave *slave; - rcu_read_lock(); - slave = rcu_dereference(bond->curr_active_slave); - if (!slave) { - rcu_read_unlock(); + slave = rtnl_dereference(bond->curr_active_slave); + real_dev = slave ? slave->dev : NULL; + if (!real_dev) return; - } - real_dev = slave->dev; - spin_lock_bh(&bond->ipsec_lock); + mutex_lock(&bond->ipsec_lock); list_for_each_entry(ipsec, &bond->ipsec_list, list) { if (!ipsec->xs->xso.real_dev) continue; @@ -593,8 +597,7 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs); } } - spin_unlock_bh(&bond->ipsec_lock); - rcu_read_unlock(); + mutex_unlock(&bond->ipsec_lock); } static void bond_ipsec_free_sa(struct xfrm_state *xs) @@ -5921,7 +5924,7 @@ void bond_setup(struct net_device *bond_dev) /* set up xfrm device ops (only supported in active-backup right now) */ bond_dev->xfrmdev_ops = &bond_xfrmdev_ops; INIT_LIST_HEAD(&bond->ipsec_list); - spin_lock_init(&bond->ipsec_lock); + mutex_init(&bond->ipsec_lock); #endif /* CONFIG_XFRM_OFFLOAD */ /* don't acquire bond device's netif_tx_lock when transmitting */ @@ -5970,6 +5973,10 @@ static void bond_uninit(struct net_device *bond_dev) __bond_release_one(bond_dev, slave->dev, true, true); netdev_info(bond_dev, "Released all slaves\n"); +#ifdef CONFIG_XFRM_OFFLOAD + mutex_destroy(&bond->ipsec_lock); +#endif /* CONFIG_XFRM_OFFLOAD */ + bond_set_slave_arr(bond, NULL, NULL); list_del_rcu(&bond->bond_list); diff --git a/include/net/bonding.h b/include/net/bonding.h index b61fb1aa3a56b5..8bb5f016969f10 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -260,7 +260,7 @@ struct bonding { #ifdef CONFIG_XFRM_OFFLOAD struct list_head ipsec_list; /* protecting ipsec_list */ - spinlock_t ipsec_lock; + struct mutex ipsec_lock; #endif /* CONFIG_XFRM_OFFLOAD */ struct bpf_prog *xdp_prog; }; From defd8b3c37b0f9cb3e0f60f47d3d78d459d57fda Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sun, 25 Aug 2024 12:16:38 -0700 Subject: [PATCH 777/991] gtp: fix a potential NULL pointer dereference When sockfd_lookup() fails, gtp_encap_enable_socket() returns a NULL pointer, but its callers only check for error pointers thus miss the NULL pointer case. Fix it by returning an error pointer with the error code carried from sockfd_lookup(). (I found this bug during code inspection.) Fixes: 1e3a3abd8b28 ("gtp: make GTP sockets in gtp_newlink optional") Cc: Andreas Schultz Cc: Harald Welte Signed-off-by: Cong Wang Reviewed-by: Simon Horman Reviewed-by: Pablo Neira Ayuso Link: https://patch.msgid.link/20240825191638.146748-1-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/gtp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 0696faf60013e0..2e94d10348cceb 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -1653,7 +1653,7 @@ static struct sock *gtp_encap_enable_socket(int fd, int type, sock = sockfd_lookup(fd, &err); if (!sock) { pr_debug("gtp socket fd=%d not found\n", fd); - return NULL; + return ERR_PTR(err); } sk = sock->sk; From bac76cf89816bff06c4ec2f3df97dc34e150a1c4 Mon Sep 17 00:00:00 2001 From: Xueming Feng Date: Mon, 26 Aug 2024 18:23:27 +0800 Subject: [PATCH 778/991] tcp: fix forever orphan socket caused by tcp_abort We have some problem closing zero-window fin-wait-1 tcp sockets in our environment. This patch come from the investigation. Previously tcp_abort only sends out reset and calls tcp_done when the socket is not SOCK_DEAD, aka orphan. For orphan socket, it will only purging the write queue, but not close the socket and left it to the timer. While purging the write queue, tp->packets_out and sk->sk_write_queue is cleared along the way. However tcp_retransmit_timer have early return based on !tp->packets_out and tcp_probe_timer have early return based on !sk->sk_write_queue. This caused ICSK_TIME_RETRANS and ICSK_TIME_PROBE0 not being resched and socket not being killed by the timers, converting a zero-windowed orphan into a forever orphan. This patch removes the SOCK_DEAD check in tcp_abort, making it send reset to peer and close the socket accordingly. Preventing the timer-less orphan from happening. According to Lorenzo's email in the v1 thread, the check was there to prevent force-closing the same socket twice. That situation is handled by testing for TCP_CLOSE inside lock, and returning -ENOENT if it is already closed. The -ENOENT code comes from the associate patch Lorenzo made for iproute2-ss; link attached below, which also conform to RFC 9293. At the end of the patch, tcp_write_queue_purge(sk) is removed because it was already called in tcp_done_with_error(). p.s. This is the same patch with v2. Resent due to mis-labeled "changes requested" on patchwork.kernel.org. Link: https://patchwork.ozlabs.org/project/netdev/patch/1450773094-7978-3-git-send-email-lorenzo@google.com/ Fixes: c1e64e298b8c ("net: diag: Support destroying TCP sockets.") Signed-off-by: Xueming Feng Tested-by: Lorenzo Colitti Reviewed-by: Jason Xing Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20240826102327.1461482-1-kuro@kuroa.me Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e03a342c9162b6..831a18dc7aa6d6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4637,6 +4637,13 @@ int tcp_abort(struct sock *sk, int err) /* Don't race with userspace socket closes such as tcp_close. */ lock_sock(sk); + /* Avoid closing the same socket twice. */ + if (sk->sk_state == TCP_CLOSE) { + if (!has_current_bpf_ctx()) + release_sock(sk); + return -ENOENT; + } + if (sk->sk_state == TCP_LISTEN) { tcp_set_state(sk, TCP_CLOSE); inet_csk_listen_stop(sk); @@ -4646,16 +4653,13 @@ int tcp_abort(struct sock *sk, int err) local_bh_disable(); bh_lock_sock(sk); - if (!sock_flag(sk, SOCK_DEAD)) { - if (tcp_need_reset(sk->sk_state)) - tcp_send_active_reset(sk, GFP_ATOMIC, - SK_RST_REASON_NOT_SPECIFIED); - tcp_done_with_error(sk, err); - } + if (tcp_need_reset(sk->sk_state)) + tcp_send_active_reset(sk, GFP_ATOMIC, + SK_RST_REASON_NOT_SPECIFIED); + tcp_done_with_error(sk, err); bh_unlock_sock(sk); local_bh_enable(); - tcp_write_queue_purge(sk); if (!has_current_bpf_ctx()) release_sock(sk); return 0; From f09b0ad55a1196f5891663f8888463c0541059cb Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 26 Aug 2024 19:11:18 +0200 Subject: [PATCH 779/991] mptcp: close subflow when receiving TCP+FIN When a peer decides to close one subflow in the middle of a connection having multiple subflows, the receiver of the first FIN should accept that, and close the subflow on its side as well. If not, the subflow will stay half closed, and would even continue to be used until the end of the MPTCP connection or a reset from the network. The issue has not been seen before, probably because the in-kernel path-manager always sends a RM_ADDR before closing the subflow. Upon the reception of this RM_ADDR, the other peer will initiate the closure on its side as well. On the other hand, if the RM_ADDR is lost, or if the path-manager of the other peer only closes the subflow without sending a RM_ADDR, the subflow would switch to TCP_CLOSE_WAIT, but that's it, leaving the subflow half-closed. So now, when the subflow switches to the TCP_CLOSE_WAIT state, and if the MPTCP connection has not been closed before with a DATA_FIN, the kernel owning the subflow schedules its worker to initiate the closure on its side as well. This issue can be easily reproduced with packetdrill, as visible in [1], by creating an additional subflow, injecting a FIN+ACK before sending the DATA_FIN, and expecting a FIN+ACK in return. Fixes: 40947e13997a ("mptcp: schedule worker when subflow is closed") Cc: stable@vger.kernel.org Link: https://github.com/multipath-tcp/packetdrill/pull/154 [1] Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240826-net-mptcp-close-extra-sf-fin-v1-1-905199fe1172@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 5 ++++- net/mptcp/subflow.c | 8 ++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 0d536b183a6c55..151e82e2ff2e86 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2533,8 +2533,11 @@ static void __mptcp_close_subflow(struct sock *sk) mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + int ssk_state = inet_sk_state_load(ssk); - if (inet_sk_state_load(ssk) != TCP_CLOSE) + if (ssk_state != TCP_CLOSE && + (ssk_state != TCP_CLOSE_WAIT || + inet_sk_state_load(sk) != TCP_ESTABLISHED)) continue; /* 'subflow_data_ready' will re-sched once rx queue is empty */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index a21c712350c36e..4834e7fc2fb66d 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1255,12 +1255,16 @@ static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, /* sched mptcp worker to remove the subflow if no more data is pending */ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk) { - if (likely(ssk->sk_state != TCP_CLOSE)) + struct sock *sk = (struct sock *)msk; + + if (likely(ssk->sk_state != TCP_CLOSE && + (ssk->sk_state != TCP_CLOSE_WAIT || + inet_sk_state_load(sk) != TCP_ESTABLISHED))) return; if (skb_queue_empty(&ssk->sk_receive_queue) && !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) - mptcp_schedule_work((struct sock *)msk); + mptcp_schedule_work(sk); } static bool subflow_can_fallback(struct mptcp_subflow_context *subflow) From e93681afcb96864ec26c3b2ce94008ce93577373 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 26 Aug 2024 19:11:19 +0200 Subject: [PATCH 780/991] selftests: mptcp: join: cannot rm sf if closed Thanks to the previous commit, the MPTCP subflows are now closed on both directions even when only the MPTCP path-manager of one peer asks for their closure. In the two tests modified here -- "userspace pm add & remove address" and "userspace pm create destroy subflow" -- one peer is controlled by the userspace PM, and the other one by the in-kernel PM. When the userspace PM sends a RM_ADDR notification, the in-kernel PM will automatically react by closing all subflows using this address. Now, thanks to the previous commit, the subflows are properly closed on both directions, the userspace PM can then no longer closes the same subflows if they are already closed. Before, it was OK to do that, because the subflows were still half-opened, still OK to send a RM_ADDR. In other words, thanks to the previous commit closing the subflows, an error will be returned to the userspace if it tries to close a subflow that has already been closed. So no need to run this command, which mean that the linked counters will then not be incremented. These tests are then no longer sending both a RM_ADDR, then closing the linked subflow just after. The test with the userspace PM on the server side is now removing one subflow linked to one address, then sending a RM_ADDR for another address. The test with the userspace PM on the client side is now only removing the subflow that was previously created. Fixes: 4369c198e599 ("selftests: mptcp: test userspace pm out of transfer") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240826-net-mptcp-close-extra-sf-fin-v1-2-905199fe1172@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 89e553e0e0c2ef..264040a760c6f6 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3429,14 +3429,12 @@ userspace_tests() "signal" userspace_pm_chk_get_addr "${ns1}" "10" "id 10 flags signal 10.0.2.1" userspace_pm_chk_get_addr "${ns1}" "20" "id 20 flags signal 10.0.3.1" - userspace_pm_rm_addr $ns1 10 userspace_pm_rm_sf $ns1 "::ffff:10.0.2.1" $MPTCP_LIB_EVENT_SUB_ESTABLISHED userspace_pm_chk_dump_addr "${ns1}" \ - "id 20 flags signal 10.0.3.1" "after rm_addr 10" + "id 20 flags signal 10.0.3.1" "after rm_sf 10" userspace_pm_rm_addr $ns1 20 - userspace_pm_rm_sf $ns1 10.0.3.1 $MPTCP_LIB_EVENT_SUB_ESTABLISHED userspace_pm_chk_dump_addr "${ns1}" "" "after rm_addr 20" - chk_rm_nr 2 2 invert + chk_rm_nr 1 1 invert chk_mptcp_info subflows 0 subflows 0 chk_subflows_total 1 1 kill_events_pids @@ -3460,12 +3458,11 @@ userspace_tests() "id 20 flags subflow 10.0.3.2" \ "subflow" userspace_pm_chk_get_addr "${ns2}" "20" "id 20 flags subflow 10.0.3.2" - userspace_pm_rm_addr $ns2 20 userspace_pm_rm_sf $ns2 10.0.3.2 $MPTCP_LIB_EVENT_SUB_ESTABLISHED userspace_pm_chk_dump_addr "${ns2}" \ "" \ - "after rm_addr 20" - chk_rm_nr 1 1 + "after rm_sf 20" + chk_rm_nr 0 1 chk_mptcp_info subflows 0 subflows 0 chk_subflows_total 1 1 kill_events_pids From 2a1f596ebb23eadc0f9b95a8012e18ef76295fc8 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 26 Aug 2024 19:11:20 +0200 Subject: [PATCH 781/991] mptcp: sched: check both backup in retrans The 'mptcp_subflow_context' structure has two items related to the backup flags: - 'backup': the subflow has been marked as backup by the other peer - 'request_bkup': the backup flag has been set by the host Looking only at the 'backup' flag can make sense in some cases, but it is not the behaviour of the default packet scheduler when selecting paths. As explained in the commit b6a66e521a20 ("mptcp: sched: check both directions for backup"), the packet scheduler should look at both flags, because that was the behaviour from the beginning: the 'backup' flag was set by accident instead of the 'request_bkup' one. Now that the latter has been fixed, get_retrans() needs to be adapted as well. Fixes: b6a66e521a20 ("mptcp: sched: check both directions for backup") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240826-net-mptcp-close-extra-sf-fin-v1-3-905199fe1172@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 151e82e2ff2e86..34fec753b9c178 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2326,7 +2326,7 @@ struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk) continue; } - if (subflow->backup) { + if (subflow->backup || subflow->request_bkup) { if (!backup) backup = ssk; continue; From cb41b195e634d3f1ecfcd845314e64fd4bb3c7aa Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 26 Aug 2024 19:11:21 +0200 Subject: [PATCH 782/991] mptcp: pr_debug: add missing \n at the end pr_debug() have been added in various places in MPTCP code to help developers to debug some situations. With the dynamic debug feature, it is easy to enable all or some of them, and asks users to reproduce issues with extra debug. Many of these pr_debug() don't end with a new line, while no 'pr_cont()' are used in MPTCP code. So the goal was not to display multiple debug messages on one line: they were then not missing the '\n' on purpose. Not having the new line at the end causes these messages to be printed with a delay, when something else needs to be printed. This issue is not visible when many messages need to be printed, but it is annoying and confusing when only specific messages are expected, e.g. # echo "func mptcp_pm_add_addr_echoed +fmp" \ > /sys/kernel/debug/dynamic_debug/control # ./mptcp_join.sh "signal address"; \ echo "$(awk '{print $1}' /proc/uptime) - end"; \ sleep 5s; \ echo "$(awk '{print $1}' /proc/uptime) - restart"; \ ./mptcp_join.sh "signal address" 013 signal address (...) 10.75 - end 15.76 - restart 013 signal address [ 10.367935] mptcp:mptcp_pm_add_addr_echoed: MPTCP: msk=(...) (...) => a delay of 5 seconds: printed with a 10.36 ts, but after 'restart' which was printed at the 15.76 ts. The 'Fixes' tag here below points to the first pr_debug() used without '\n' in net/mptcp. This patch could be split in many small ones, with different Fixes tag, but it doesn't seem worth it, because it is easy to re-generate this patch with this simple 'sed' command: git grep -l pr_debug -- net/mptcp | xargs sed -i "s/\(pr_debug(\".*[^n]\)\(\"[,)]\)/\1\\\n\2/g" So in case of conflicts, simply drop the modifications, and launch this command. Fixes: f870fa0b5768 ("mptcp: Add MPTCP socket stubs") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240826-net-mptcp-close-extra-sf-fin-v1-4-905199fe1172@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/fastopen.c | 4 ++-- net/mptcp/options.c | 50 ++++++++++++++++++++-------------------- net/mptcp/pm.c | 28 +++++++++++------------ net/mptcp/pm_netlink.c | 20 ++++++++-------- net/mptcp/protocol.c | 52 +++++++++++++++++++++--------------------- net/mptcp/protocol.h | 4 ++-- net/mptcp/sched.c | 4 ++-- net/mptcp/sockopt.c | 4 ++-- net/mptcp/subflow.c | 48 +++++++++++++++++++------------------- 9 files changed, 107 insertions(+), 107 deletions(-) diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c index ad28da655f8bcc..a29ff901df7588 100644 --- a/net/mptcp/fastopen.c +++ b/net/mptcp/fastopen.c @@ -68,12 +68,12 @@ void __mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflo skb = skb_peek_tail(&sk->sk_receive_queue); if (skb) { WARN_ON_ONCE(MPTCP_SKB_CB(skb)->end_seq); - pr_debug("msk %p moving seq %llx -> %llx end_seq %llx -> %llx", sk, + pr_debug("msk %p moving seq %llx -> %llx end_seq %llx -> %llx\n", sk, MPTCP_SKB_CB(skb)->map_seq, MPTCP_SKB_CB(skb)->map_seq + msk->ack_seq, MPTCP_SKB_CB(skb)->end_seq, MPTCP_SKB_CB(skb)->end_seq + msk->ack_seq); MPTCP_SKB_CB(skb)->map_seq += msk->ack_seq; MPTCP_SKB_CB(skb)->end_seq += msk->ack_seq; } - pr_debug("msk=%p ack_seq=%llx", msk, msk->ack_seq); + pr_debug("msk=%p ack_seq=%llx\n", msk, msk->ack_seq); } diff --git a/net/mptcp/options.c b/net/mptcp/options.c index ac2f1a54cc43ae..370c3836b7712f 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -117,7 +117,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; ptr += 2; } - pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d csum=%u", + pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d csum=%u\n", version, flags, opsize, mp_opt->sndr_key, mp_opt->rcvr_key, mp_opt->data_len, mp_opt->csum); break; @@ -131,7 +131,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, ptr += 4; mp_opt->nonce = get_unaligned_be32(ptr); ptr += 4; - pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u", + pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u\n", mp_opt->backup, mp_opt->join_id, mp_opt->token, mp_opt->nonce); } else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) { @@ -142,19 +142,19 @@ static void mptcp_parse_option(const struct sk_buff *skb, ptr += 8; mp_opt->nonce = get_unaligned_be32(ptr); ptr += 4; - pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u", + pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u\n", mp_opt->backup, mp_opt->join_id, mp_opt->thmac, mp_opt->nonce); } else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) { mp_opt->suboptions |= OPTION_MPTCP_MPJ_ACK; ptr += 2; memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN); - pr_debug("MP_JOIN hmac"); + pr_debug("MP_JOIN hmac\n"); } break; case MPTCPOPT_DSS: - pr_debug("DSS"); + pr_debug("DSS\n"); ptr++; /* we must clear 'mpc_map' be able to detect MP_CAPABLE @@ -169,7 +169,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0; mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK); - pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d", + pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d\n", mp_opt->data_fin, mp_opt->dsn64, mp_opt->use_map, mp_opt->ack64, mp_opt->use_ack); @@ -207,7 +207,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, ptr += 4; } - pr_debug("data_ack=%llu", mp_opt->data_ack); + pr_debug("data_ack=%llu\n", mp_opt->data_ack); } if (mp_opt->use_map) { @@ -231,7 +231,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, ptr += 2; } - pr_debug("data_seq=%llu subflow_seq=%u data_len=%u csum=%d:%u", + pr_debug("data_seq=%llu subflow_seq=%u data_len=%u csum=%d:%u\n", mp_opt->data_seq, mp_opt->subflow_seq, mp_opt->data_len, !!(mp_opt->suboptions & OPTION_MPTCP_CSUMREQD), mp_opt->csum); @@ -293,7 +293,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->ahmac = get_unaligned_be64(ptr); ptr += 8; } - pr_debug("ADD_ADDR%s: id=%d, ahmac=%llu, echo=%d, port=%d", + pr_debug("ADD_ADDR%s: id=%d, ahmac=%llu, echo=%d, port=%d\n", (mp_opt->addr.family == AF_INET6) ? "6" : "", mp_opt->addr.id, mp_opt->ahmac, mp_opt->echo, ntohs(mp_opt->addr.port)); break; @@ -309,7 +309,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->rm_list.nr = opsize - TCPOLEN_MPTCP_RM_ADDR_BASE; for (i = 0; i < mp_opt->rm_list.nr; i++) mp_opt->rm_list.ids[i] = *ptr++; - pr_debug("RM_ADDR: rm_list_nr=%d", mp_opt->rm_list.nr); + pr_debug("RM_ADDR: rm_list_nr=%d\n", mp_opt->rm_list.nr); break; case MPTCPOPT_MP_PRIO: @@ -318,7 +318,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->suboptions |= OPTION_MPTCP_PRIO; mp_opt->backup = *ptr++ & MPTCP_PRIO_BKUP; - pr_debug("MP_PRIO: prio=%d", mp_opt->backup); + pr_debug("MP_PRIO: prio=%d\n", mp_opt->backup); break; case MPTCPOPT_MP_FASTCLOSE: @@ -329,7 +329,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->rcvr_key = get_unaligned_be64(ptr); ptr += 8; mp_opt->suboptions |= OPTION_MPTCP_FASTCLOSE; - pr_debug("MP_FASTCLOSE: recv_key=%llu", mp_opt->rcvr_key); + pr_debug("MP_FASTCLOSE: recv_key=%llu\n", mp_opt->rcvr_key); break; case MPTCPOPT_RST: @@ -343,7 +343,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, flags = *ptr++; mp_opt->reset_transient = flags & MPTCP_RST_TRANSIENT; mp_opt->reset_reason = *ptr; - pr_debug("MP_RST: transient=%u reason=%u", + pr_debug("MP_RST: transient=%u reason=%u\n", mp_opt->reset_transient, mp_opt->reset_reason); break; @@ -354,7 +354,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, ptr += 2; mp_opt->suboptions |= OPTION_MPTCP_FAIL; mp_opt->fail_seq = get_unaligned_be64(ptr); - pr_debug("MP_FAIL: data_seq=%llu", mp_opt->fail_seq); + pr_debug("MP_FAIL: data_seq=%llu\n", mp_opt->fail_seq); break; default: @@ -417,7 +417,7 @@ bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, *size = TCPOLEN_MPTCP_MPC_SYN; return true; } else if (subflow->request_join) { - pr_debug("remote_token=%u, nonce=%u", subflow->remote_token, + pr_debug("remote_token=%u, nonce=%u\n", subflow->remote_token, subflow->local_nonce); opts->suboptions = OPTION_MPTCP_MPJ_SYN; opts->join_id = subflow->local_id; @@ -500,7 +500,7 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, *size = TCPOLEN_MPTCP_MPC_ACK; } - pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d", + pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d\n", subflow, subflow->local_key, subflow->remote_key, data_len); @@ -509,7 +509,7 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, opts->suboptions = OPTION_MPTCP_MPJ_ACK; memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN); *size = TCPOLEN_MPTCP_MPJ_ACK; - pr_debug("subflow=%p", subflow); + pr_debug("subflow=%p\n", subflow); /* we can use the full delegate action helper only from BH context * If we are in process context - sk is flushing the backlog at @@ -675,7 +675,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * *size = len; if (drop_other_suboptions) { - pr_debug("drop other suboptions"); + pr_debug("drop other suboptions\n"); opts->suboptions = 0; /* note that e.g. DSS could have written into the memory @@ -695,7 +695,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * } else { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADDTX); } - pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d", + pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d\n", opts->addr.id, opts->ahmac, echo, ntohs(opts->addr.port)); return true; @@ -726,7 +726,7 @@ static bool mptcp_established_options_rm_addr(struct sock *sk, opts->rm_list = rm_list; for (i = 0; i < opts->rm_list.nr; i++) - pr_debug("rm_list_ids[%d]=%d", i, opts->rm_list.ids[i]); + pr_debug("rm_list_ids[%d]=%d\n", i, opts->rm_list.ids[i]); MPTCP_ADD_STATS(sock_net(sk), MPTCP_MIB_RMADDRTX, opts->rm_list.nr); return true; } @@ -752,7 +752,7 @@ static bool mptcp_established_options_mp_prio(struct sock *sk, opts->suboptions |= OPTION_MPTCP_PRIO; opts->backup = subflow->request_bkup; - pr_debug("prio=%d", opts->backup); + pr_debug("prio=%d\n", opts->backup); return true; } @@ -794,7 +794,7 @@ static bool mptcp_established_options_fastclose(struct sock *sk, opts->suboptions |= OPTION_MPTCP_FASTCLOSE; opts->rcvr_key = READ_ONCE(msk->remote_key); - pr_debug("FASTCLOSE key=%llu", opts->rcvr_key); + pr_debug("FASTCLOSE key=%llu\n", opts->rcvr_key); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSETX); return true; } @@ -816,7 +816,7 @@ static bool mptcp_established_options_mp_fail(struct sock *sk, opts->suboptions |= OPTION_MPTCP_FAIL; opts->fail_seq = subflow->map_seq; - pr_debug("MP_FAIL fail_seq=%llu", opts->fail_seq); + pr_debug("MP_FAIL fail_seq=%llu\n", opts->fail_seq); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX); return true; @@ -904,7 +904,7 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, opts->csum_reqd = subflow_req->csum_reqd; opts->allow_join_id0 = subflow_req->allow_join_id0; *size = TCPOLEN_MPTCP_MPC_SYNACK; - pr_debug("subflow_req=%p, local_key=%llu", + pr_debug("subflow_req=%p, local_key=%llu\n", subflow_req, subflow_req->local_key); return true; } else if (subflow_req->mp_join) { @@ -913,7 +913,7 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, opts->join_id = subflow_req->local_id; opts->thmac = subflow_req->thmac; opts->nonce = subflow_req->local_nonce; - pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u", + pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u\n", subflow_req, opts->backup, opts->join_id, opts->thmac, opts->nonce); *size = TCPOLEN_MPTCP_MPJ_SYNACK; diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 3e6e0f5510bb13..3f8dbde243f104 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -19,7 +19,7 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk, { u8 add_addr = READ_ONCE(msk->pm.addr_signal); - pr_debug("msk=%p, local_id=%d, echo=%d", msk, addr->id, echo); + pr_debug("msk=%p, local_id=%d, echo=%d\n", msk, addr->id, echo); lockdep_assert_held(&msk->pm.lock); @@ -45,7 +45,7 @@ int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_ { u8 rm_addr = READ_ONCE(msk->pm.addr_signal); - pr_debug("msk=%p, rm_list_nr=%d", msk, rm_list->nr); + pr_debug("msk=%p, rm_list_nr=%d\n", msk, rm_list->nr); if (rm_addr) { MPTCP_ADD_STATS(sock_net((struct sock *)msk), @@ -66,7 +66,7 @@ void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int { struct mptcp_pm_data *pm = &msk->pm; - pr_debug("msk=%p, token=%u side=%d", msk, READ_ONCE(msk->token), server_side); + pr_debug("msk=%p, token=%u side=%d\n", msk, READ_ONCE(msk->token), server_side); WRITE_ONCE(pm->server_side, server_side); mptcp_event(MPTCP_EVENT_CREATED, msk, ssk, GFP_ATOMIC); @@ -90,7 +90,7 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) subflows_max = mptcp_pm_get_subflows_max(msk); - pr_debug("msk=%p subflows=%d max=%d allow=%d", msk, pm->subflows, + pr_debug("msk=%p subflows=%d max=%d allow=%d\n", msk, pm->subflows, subflows_max, READ_ONCE(pm->accept_subflow)); /* try to avoid acquiring the lock below */ @@ -114,7 +114,7 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) static bool mptcp_pm_schedule_work(struct mptcp_sock *msk, enum mptcp_pm_status new_status) { - pr_debug("msk=%p status=%x new=%lx", msk, msk->pm.status, + pr_debug("msk=%p status=%x new=%lx\n", msk, msk->pm.status, BIT(new_status)); if (msk->pm.status & BIT(new_status)) return false; @@ -129,7 +129,7 @@ void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk) struct mptcp_pm_data *pm = &msk->pm; bool announce = false; - pr_debug("msk=%p", msk); + pr_debug("msk=%p\n", msk); spin_lock_bh(&pm->lock); @@ -153,14 +153,14 @@ void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk) void mptcp_pm_connection_closed(struct mptcp_sock *msk) { - pr_debug("msk=%p", msk); + pr_debug("msk=%p\n", msk); } void mptcp_pm_subflow_established(struct mptcp_sock *msk) { struct mptcp_pm_data *pm = &msk->pm; - pr_debug("msk=%p", msk); + pr_debug("msk=%p\n", msk); if (!READ_ONCE(pm->work_pending)) return; @@ -212,7 +212,7 @@ void mptcp_pm_add_addr_received(const struct sock *ssk, struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_pm_data *pm = &msk->pm; - pr_debug("msk=%p remote_id=%d accept=%d", msk, addr->id, + pr_debug("msk=%p remote_id=%d accept=%d\n", msk, addr->id, READ_ONCE(pm->accept_addr)); mptcp_event_addr_announced(ssk, addr); @@ -243,7 +243,7 @@ void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, { struct mptcp_pm_data *pm = &msk->pm; - pr_debug("msk=%p", msk); + pr_debug("msk=%p\n", msk); spin_lock_bh(&pm->lock); @@ -267,7 +267,7 @@ void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, struct mptcp_pm_data *pm = &msk->pm; u8 i; - pr_debug("msk=%p remote_ids_nr=%d", msk, rm_list->nr); + pr_debug("msk=%p remote_ids_nr=%d\n", msk, rm_list->nr); for (i = 0; i < rm_list->nr; i++) mptcp_event_addr_removed(msk, rm_list->ids[i]); @@ -299,19 +299,19 @@ void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq) struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); - pr_debug("fail_seq=%llu", fail_seq); + pr_debug("fail_seq=%llu\n", fail_seq); if (!READ_ONCE(msk->allow_infinite_fallback)) return; if (!subflow->fail_tout) { - pr_debug("send MP_FAIL response and infinite map"); + pr_debug("send MP_FAIL response and infinite map\n"); subflow->send_mp_fail = 1; subflow->send_infinite_map = 1; tcp_send_ack(sk); } else { - pr_debug("MP_FAIL response received"); + pr_debug("MP_FAIL response received\n"); WRITE_ONCE(subflow->fail_tout, 0); } } diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 3e4ad801786f2b..8d2f97854c6429 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -287,7 +287,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer) struct mptcp_sock *msk = entry->sock; struct sock *sk = (struct sock *)msk; - pr_debug("msk=%p", msk); + pr_debug("msk=%p\n", msk); if (!msk) return; @@ -306,7 +306,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer) spin_lock_bh(&msk->pm.lock); if (!mptcp_pm_should_add_signal_addr(msk)) { - pr_debug("retransmit ADD_ADDR id=%d", entry->addr.id); + pr_debug("retransmit ADD_ADDR id=%d\n", entry->addr.id); mptcp_pm_announce_addr(msk, &entry->addr, false); mptcp_pm_add_addr_send_ack(msk); entry->retrans_times++; @@ -387,7 +387,7 @@ void mptcp_pm_free_anno_list(struct mptcp_sock *msk) struct sock *sk = (struct sock *)msk; LIST_HEAD(free_list); - pr_debug("msk=%p", msk); + pr_debug("msk=%p\n", msk); spin_lock_bh(&msk->pm.lock); list_splice_init(&msk->pm.anno_list, &free_list); @@ -473,7 +473,7 @@ static void __mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_con struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow; - pr_debug("send ack for %s", + pr_debug("send ack for %s\n", prio ? "mp_prio" : (mptcp_pm_should_add_signal(msk) ? "add_addr" : "rm_addr")); slow = lock_sock_fast(ssk); @@ -708,7 +708,7 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); subflows_max = mptcp_pm_get_subflows_max(msk); - pr_debug("accepted %d:%d remote family %d", + pr_debug("accepted %d:%d remote family %d\n", msk->pm.add_addr_accepted, add_addr_accept_max, msk->pm.remote.family); @@ -767,7 +767,7 @@ int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, { struct mptcp_subflow_context *subflow; - pr_debug("bkup=%d", bkup); + pr_debug("bkup=%d\n", bkup); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); @@ -803,7 +803,7 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, struct sock *sk = (struct sock *)msk; u8 i; - pr_debug("%s rm_list_nr %d", + pr_debug("%s rm_list_nr %d\n", rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", rm_list->nr); msk_owned_by_me(msk); @@ -832,7 +832,7 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, if (rm_type == MPTCP_MIB_RMSUBFLOW && !mptcp_local_id_match(msk, id, rm_id)) continue; - pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u", + pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u\n", rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", i, rm_id, id, remote_id, msk->mpc_endpoint_id); spin_unlock_bh(&msk->pm.lock); @@ -889,7 +889,7 @@ void mptcp_pm_nl_work(struct mptcp_sock *msk) spin_lock_bh(&msk->pm.lock); - pr_debug("msk=%p status=%x", msk, pm->status); + pr_debug("msk=%p status=%x\n", msk, pm->status); if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) { pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); mptcp_pm_nl_add_addr_received(msk); @@ -1476,7 +1476,7 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, long s_slot = 0, s_num = 0; struct mptcp_sock *msk; - pr_debug("remove_id=%d", addr->id); + pr_debug("remove_id=%d\n", addr->id); list.ids[list.nr++] = addr->id; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 34fec753b9c178..b571fba88a2f96 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -139,7 +139,7 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, !skb_try_coalesce(to, from, &fragstolen, &delta)) return false; - pr_debug("colesced seq %llx into %llx new len %d new end seq %llx", + pr_debug("colesced seq %llx into %llx new len %d new end seq %llx\n", MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq, to->len, MPTCP_SKB_CB(from)->end_seq); MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq; @@ -217,7 +217,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb) end_seq = MPTCP_SKB_CB(skb)->end_seq; max_seq = atomic64_read(&msk->rcv_wnd_sent); - pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq, + pr_debug("msk=%p seq=%llx limit=%llx empty=%d\n", msk, seq, max_seq, RB_EMPTY_ROOT(&msk->out_of_order_queue)); if (after64(end_seq, max_seq)) { /* out of window */ @@ -643,7 +643,7 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, } } - pr_debug("msk=%p ssk=%p", msk, ssk); + pr_debug("msk=%p ssk=%p\n", msk, ssk); tp = tcp_sk(ssk); do { u32 map_remaining, offset; @@ -724,7 +724,7 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk) u64 end_seq; p = rb_first(&msk->out_of_order_queue); - pr_debug("msk=%p empty=%d", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue)); + pr_debug("msk=%p empty=%d\n", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue)); while (p) { skb = rb_to_skb(p); if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) @@ -746,7 +746,7 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk) int delta = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq; /* skip overlapping data, if any */ - pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d", + pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d\n", MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq, delta); MPTCP_SKB_CB(skb)->offset += delta; @@ -1240,7 +1240,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, size_t copy; int i; - pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u", + pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u\n", msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent); if (WARN_ON_ONCE(info->sent > info->limit || @@ -1341,7 +1341,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, mpext->use_map = 1; mpext->dsn64 = 1; - pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d", + pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d\n", mpext->data_seq, mpext->subflow_seq, mpext->data_len, mpext->dsn64); @@ -1892,7 +1892,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (!msk->first_pending) WRITE_ONCE(msk->first_pending, dfrag); } - pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d", msk, + pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d\n", msk, dfrag->data_seq, dfrag->data_len, dfrag->already_sent, !dfrag_collapsed); @@ -2248,7 +2248,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } } - pr_debug("block timeout %ld", timeo); + pr_debug("block timeout %ld\n", timeo); sk_wait_data(sk, &timeo, NULL); } @@ -2264,7 +2264,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } } - pr_debug("msk=%p rx queue empty=%d:%d copied=%d", + pr_debug("msk=%p rx queue empty=%d:%d copied=%d\n", msk, skb_queue_empty_lockless(&sk->sk_receive_queue), skb_queue_empty(&msk->receive_queue), copied); if (!(flags & MSG_PEEK)) @@ -2717,7 +2717,7 @@ static void mptcp_mp_fail_no_response(struct mptcp_sock *msk) if (!ssk) return; - pr_debug("MP_FAIL doesn't respond, reset the subflow"); + pr_debug("MP_FAIL doesn't respond, reset the subflow\n"); slow = lock_sock_fast(ssk); mptcp_subflow_reset(ssk); @@ -2891,7 +2891,7 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) break; default: if (__mptcp_check_fallback(mptcp_sk(sk))) { - pr_debug("Fallback"); + pr_debug("Fallback\n"); ssk->sk_shutdown |= how; tcp_shutdown(ssk, how); @@ -2901,7 +2901,7 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) WRITE_ONCE(mptcp_sk(sk)->snd_una, mptcp_sk(sk)->snd_nxt); mptcp_schedule_work(sk); } else { - pr_debug("Sending DATA_FIN on subflow %p", ssk); + pr_debug("Sending DATA_FIN on subflow %p\n", ssk); tcp_send_ack(ssk); if (!mptcp_rtx_timer_pending(sk)) mptcp_reset_rtx_timer(sk); @@ -2967,7 +2967,7 @@ static void mptcp_check_send_data_fin(struct sock *sk) struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); - pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu", + pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu\n", msk, msk->snd_data_fin_enable, !!mptcp_send_head(sk), msk->snd_nxt, msk->write_seq); @@ -2991,7 +2991,7 @@ static void __mptcp_wr_shutdown(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d", + pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d\n", msk, msk->snd_data_fin_enable, sk->sk_shutdown, sk->sk_state, !!mptcp_send_head(sk)); @@ -3006,7 +3006,7 @@ static void __mptcp_destroy_sock(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - pr_debug("msk=%p", msk); + pr_debug("msk=%p\n", msk); might_sleep(); @@ -3114,7 +3114,7 @@ bool __mptcp_close(struct sock *sk, long timeout) mptcp_set_state(sk, TCP_CLOSE); sock_hold(sk); - pr_debug("msk=%p state=%d", sk, sk->sk_state); + pr_debug("msk=%p state=%d\n", sk, sk->sk_state); if (msk->token) mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL); @@ -3546,7 +3546,7 @@ static int mptcp_get_port(struct sock *sk, unsigned short snum) { struct mptcp_sock *msk = mptcp_sk(sk); - pr_debug("msk=%p, ssk=%p", msk, msk->first); + pr_debug("msk=%p, ssk=%p\n", msk, msk->first); if (WARN_ON_ONCE(!msk->first)) return -EINVAL; @@ -3563,7 +3563,7 @@ void mptcp_finish_connect(struct sock *ssk) sk = subflow->conn; msk = mptcp_sk(sk); - pr_debug("msk=%p, token=%u", sk, subflow->token); + pr_debug("msk=%p, token=%u\n", sk, subflow->token); subflow->map_seq = subflow->iasn; subflow->map_subflow_seq = 1; @@ -3592,7 +3592,7 @@ bool mptcp_finish_join(struct sock *ssk) struct sock *parent = (void *)msk; bool ret = true; - pr_debug("msk=%p, subflow=%p", msk, subflow); + pr_debug("msk=%p, subflow=%p\n", msk, subflow); /* mptcp socket already closing? */ if (!mptcp_is_fully_established(parent)) { @@ -3638,7 +3638,7 @@ bool mptcp_finish_join(struct sock *ssk) static void mptcp_shutdown(struct sock *sk, int how) { - pr_debug("sk=%p, how=%d", sk, how); + pr_debug("sk=%p, how=%d\n", sk, how); if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk)) __mptcp_wr_shutdown(sk); @@ -3859,7 +3859,7 @@ static int mptcp_listen(struct socket *sock, int backlog) struct sock *ssk; int err; - pr_debug("msk=%p", msk); + pr_debug("msk=%p\n", msk); lock_sock(sk); @@ -3898,7 +3898,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, struct mptcp_sock *msk = mptcp_sk(sock->sk); struct sock *ssk, *newsk; - pr_debug("msk=%p", msk); + pr_debug("msk=%p\n", msk); /* Buggy applications can call accept on socket states other then LISTEN * but no need to allocate the first subflow just to error out. @@ -3907,12 +3907,12 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (!ssk) return -EINVAL; - pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk)); + pr_debug("ssk=%p, listener=%p\n", ssk, mptcp_subflow_ctx(ssk)); newsk = inet_csk_accept(ssk, arg); if (!newsk) return arg->err; - pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk)); + pr_debug("newsk=%p, subflow is mptcp=%d\n", newsk, sk_is_mptcp(newsk)); if (sk_is_mptcp(newsk)) { struct mptcp_subflow_context *subflow; struct sock *new_mptcp_sock; @@ -4005,7 +4005,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, sock_poll_wait(file, sock, wait); state = inet_sk_state_load(sk); - pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags); + pr_debug("msk=%p state=%d flags=%lx\n", msk, state, msk->flags); if (state == TCP_LISTEN) { struct sock *ssk = READ_ONCE(msk->first); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index a1c1b0ff1ce1c8..240d7c2ea55130 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1177,7 +1177,7 @@ static inline bool mptcp_check_fallback(const struct sock *sk) static inline void __mptcp_do_fallback(struct mptcp_sock *msk) { if (__mptcp_check_fallback(msk)) { - pr_debug("TCP fallback already done (msk=%p)", msk); + pr_debug("TCP fallback already done (msk=%p)\n", msk); return; } set_bit(MPTCP_FALLBACK_DONE, &msk->flags); @@ -1213,7 +1213,7 @@ static inline void mptcp_do_fallback(struct sock *ssk) } } -#define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)", __func__, a) +#define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)\n", __func__, a) static inline bool mptcp_check_infinite_map(struct sk_buff *skb) { diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c index 4a7fd0508ad284..78ed508ebc1b8d 100644 --- a/net/mptcp/sched.c +++ b/net/mptcp/sched.c @@ -86,7 +86,7 @@ int mptcp_register_scheduler(struct mptcp_sched_ops *sched) list_add_tail_rcu(&sched->list, &mptcp_sched_list); spin_unlock(&mptcp_sched_list_lock); - pr_debug("%s registered", sched->name); + pr_debug("%s registered\n", sched->name); return 0; } @@ -118,7 +118,7 @@ int mptcp_init_sched(struct mptcp_sock *msk, if (msk->sched->init) msk->sched->init(msk); - pr_debug("sched=%s", msk->sched->name); + pr_debug("sched=%s\n", msk->sched->name); return 0; } diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 2026a9a36f804d..505445a9598faf 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -873,7 +873,7 @@ int mptcp_setsockopt(struct sock *sk, int level, int optname, struct mptcp_sock *msk = mptcp_sk(sk); struct sock *ssk; - pr_debug("msk=%p", msk); + pr_debug("msk=%p\n", msk); if (level == SOL_SOCKET) return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen); @@ -1453,7 +1453,7 @@ int mptcp_getsockopt(struct sock *sk, int level, int optname, struct mptcp_sock *msk = mptcp_sk(sk); struct sock *ssk; - pr_debug("msk=%p", msk); + pr_debug("msk=%p\n", msk); /* @@ the meaning of setsockopt() when the socket is connected and * there are multiple subflows is not yet defined. It is up to the diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 4834e7fc2fb66d..064ab32358934d 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -39,7 +39,7 @@ static void subflow_req_destructor(struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); - pr_debug("subflow_req=%p", subflow_req); + pr_debug("subflow_req=%p\n", subflow_req); if (subflow_req->msk) sock_put((struct sock *)subflow_req->msk); @@ -146,7 +146,7 @@ static int subflow_check_req(struct request_sock *req, struct mptcp_options_received mp_opt; bool opt_mp_capable, opt_mp_join; - pr_debug("subflow_req=%p, listener=%p", subflow_req, listener); + pr_debug("subflow_req=%p, listener=%p\n", subflow_req, listener); #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of @@ -221,7 +221,7 @@ static int subflow_check_req(struct request_sock *req, } if (subflow_use_different_sport(subflow_req->msk, sk_listener)) { - pr_debug("syn inet_sport=%d %d", + pr_debug("syn inet_sport=%d %d\n", ntohs(inet_sk(sk_listener)->inet_sport), ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport)); if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) { @@ -243,7 +243,7 @@ static int subflow_check_req(struct request_sock *req, subflow_init_req_cookie_join_save(subflow_req, skb); } - pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token, + pr_debug("token=%u, remote_nonce=%u msk=%p\n", subflow_req->token, subflow_req->remote_nonce, subflow_req->msk); } @@ -527,7 +527,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->rel_write_seq = 1; subflow->conn_finished = 1; subflow->ssn_offset = TCP_SKB_CB(skb)->seq; - pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset); + pr_debug("subflow=%p synack seq=%x\n", subflow, subflow->ssn_offset); mptcp_get_options(skb, &mp_opt); if (subflow->request_mptcp) { @@ -559,7 +559,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->thmac = mp_opt.thmac; subflow->remote_nonce = mp_opt.nonce; WRITE_ONCE(subflow->remote_id, mp_opt.join_id); - pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d", + pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d\n", subflow, subflow->thmac, subflow->remote_nonce, subflow->backup); @@ -585,7 +585,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKBACKUPRX); if (subflow_use_different_dport(msk, sk)) { - pr_debug("synack inet_dport=%d %d", + pr_debug("synack inet_dport=%d %d\n", ntohs(inet_sk(sk)->inet_dport), ntohs(inet_sk(parent)->inet_dport)); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINPORTSYNACKRX); @@ -655,7 +655,7 @@ static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - pr_debug("subflow=%p", subflow); + pr_debug("subflow=%p\n", subflow); /* Never answer to SYNs sent to broadcast or multicast */ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) @@ -686,7 +686,7 @@ static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - pr_debug("subflow=%p", subflow); + pr_debug("subflow=%p\n", subflow); if (skb->protocol == htons(ETH_P_IP)) return subflow_v4_conn_request(sk, skb); @@ -807,7 +807,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct mptcp_sock *owner; struct sock *child; - pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn); + pr_debug("listener=%p, req=%p, conn=%p\n", listener, req, listener->conn); /* After child creation we must look for MPC even when options * are not parsed @@ -898,7 +898,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, ctx->conn = (struct sock *)owner; if (subflow_use_different_sport(owner, sk)) { - pr_debug("ack inet_sport=%d %d", + pr_debug("ack inet_sport=%d %d\n", ntohs(inet_sk(sk)->inet_sport), ntohs(inet_sk((struct sock *)owner)->inet_sport)); if (!mptcp_pm_sport_in_anno_list(owner, sk)) { @@ -961,7 +961,7 @@ enum mapping_status { static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) { - pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d", + pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d\n", ssn, subflow->map_subflow_seq, subflow->map_data_len); } @@ -1121,7 +1121,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk, data_len = mpext->data_len; if (data_len == 0) { - pr_debug("infinite mapping received"); + pr_debug("infinite mapping received\n"); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX); subflow->map_data_len = 0; return MAPPING_INVALID; @@ -1133,7 +1133,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk, if (data_len == 1) { bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq, mpext->dsn64); - pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq); + pr_debug("DATA_FIN with no payload seq=%llu\n", mpext->data_seq); if (subflow->map_valid) { /* A DATA_FIN might arrive in a DSS * option before the previous mapping @@ -1159,7 +1159,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk, data_fin_seq &= GENMASK_ULL(31, 0); mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64); - pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d", + pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d\n", data_fin_seq, mpext->dsn64); /* Adjust for DATA_FIN using 1 byte of sequence space */ @@ -1205,7 +1205,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk, if (unlikely(subflow->map_csum_reqd != csum_reqd)) return MAPPING_INVALID; - pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u", + pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u\n", subflow->map_seq, subflow->map_subflow_seq, subflow->map_data_len, subflow->map_csum_reqd, subflow->map_data_csum); @@ -1240,7 +1240,7 @@ static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, avail_len = skb->len - offset; incr = limit >= avail_len ? avail_len + fin : limit; - pr_debug("discarding=%d len=%d offset=%d seq=%d", incr, skb->len, + pr_debug("discarding=%d len=%d offset=%d seq=%d\n", incr, skb->len, offset, subflow->map_subflow_seq); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA); tcp_sk(ssk)->copied_seq += incr; @@ -1341,7 +1341,7 @@ static bool subflow_check_data_avail(struct sock *ssk) old_ack = READ_ONCE(msk->ack_seq); ack_seq = mptcp_subflow_get_mapped_dsn(subflow); - pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack, + pr_debug("msk ack_seq=%llx subflow ack_seq=%llx\n", old_ack, ack_seq); if (unlikely(before64(ack_seq, old_ack))) { mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq); @@ -1413,7 +1413,7 @@ bool mptcp_subflow_data_available(struct sock *sk) subflow->map_valid = 0; WRITE_ONCE(subflow->data_avail, false); - pr_debug("Done with mapping: seq=%u data_len=%u", + pr_debug("Done with mapping: seq=%u data_len=%u\n", subflow->map_subflow_seq, subflow->map_data_len); } @@ -1523,7 +1523,7 @@ void mptcpv6_handle_mapped(struct sock *sk, bool mapped) target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk); - pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d", + pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d\n", subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped); if (likely(icsk->icsk_af_ops == target)) @@ -1616,7 +1616,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, goto failed; mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL); - pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk, + pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d\n", msk, remote_token, local_id, remote_id); subflow->remote_token = remote_token; WRITE_ONCE(subflow->remote_id, remote_id); @@ -1751,7 +1751,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid; subflow = mptcp_subflow_ctx(sf->sk); - pr_debug("subflow=%p", subflow); + pr_debug("subflow=%p\n", subflow); *new_sock = sf; sock_hold(sk); @@ -1780,7 +1780,7 @@ static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk, INIT_LIST_HEAD(&ctx->node); INIT_LIST_HEAD(&ctx->delegated_node); - pr_debug("subflow=%p", ctx); + pr_debug("subflow=%p\n", ctx); ctx->tcp_sock = sk; WRITE_ONCE(ctx->local_id, -1); @@ -1931,7 +1931,7 @@ static int subflow_ulp_init(struct sock *sk) goto out; } - pr_debug("subflow=%p, family=%d", ctx, sk->sk_family); + pr_debug("subflow=%p, family=%d\n", ctx, sk->sk_family); tp->is_mptcp = 1; ctx->icsk_af_ops = icsk->icsk_af_ops; From 3a0504d54b3b57f0d7bf3d9184a00c9f8887f6d7 Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Mon, 26 Aug 2024 15:07:11 +0200 Subject: [PATCH 783/991] sctp: fix association labeling in the duplicate COOKIE-ECHO case sctp_sf_do_5_2_4_dupcook() currently calls security_sctp_assoc_request() on new_asoc, but as it turns out, this association is always discarded and the LSM labels never get into the final association (asoc). This can be reproduced by having two SCTP endpoints try to initiate an association with each other at approximately the same time and then peel off the association into a new socket, which exposes the unitialized labels and triggers SELinux denials. Fix it by calling security_sctp_assoc_request() on asoc instead of new_asoc. Xin Long also suggested limit calling the hook only to cases A, B, and D, since in cases C and E the COOKIE ECHO chunk is discarded and the association doesn't enter the ESTABLISHED state, so rectify that as well. One related caveat with SELinux and peer labeling: When an SCTP connection is set up simultaneously in this way, we will end up with an association that is initialized with security_sctp_assoc_request() on both sides, so the MLS component of the security context of the association will get swapped between the peers, instead of just one side setting it to the other's MLS component. However, at that point security_sctp_assoc_request() had already been called on both sides in sctp_sf_do_unexpected_init() (on a temporary association) and thus if the exchange didn't fail before due to MLS, it won't fail now either (most likely both endpoints have the same MLS range). Tested by: - reproducer from https://src.fedoraproject.org/tests/selinux/pull-request/530 - selinux-testsuite (https://github.com/SELinuxProject/selinux-testsuite/) - sctp-tests (https://github.com/sctp/sctp-tests) - no tests failed that wouldn't fail also without the patch applied Fixes: c081d53f97a1 ("security: pass asoc to sctp_assoc_request and sctp_sk_clone") Suggested-by: Xin Long Signed-off-by: Ondrej Mosnacek Acked-by: Xin Long Acked-by: Paul Moore (LSM/SELinux) Link: https://patch.msgid.link/20240826130711.141271-1-omosnace@redhat.com Signed-off-by: Jakub Kicinski --- net/sctp/sm_statefuns.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 5adf0c0a6c1acd..7d315a18612ba5 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -2260,12 +2260,6 @@ enum sctp_disposition sctp_sf_do_5_2_4_dupcook( } } - /* Update socket peer label if first association. */ - if (security_sctp_assoc_request(new_asoc, chunk->head_skb ?: chunk->skb)) { - sctp_association_free(new_asoc); - return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); - } - /* Set temp so that it won't be added into hashtable */ new_asoc->temp = 1; @@ -2274,6 +2268,22 @@ enum sctp_disposition sctp_sf_do_5_2_4_dupcook( */ action = sctp_tietags_compare(new_asoc, asoc); + /* In cases C and E the association doesn't enter the ESTABLISHED + * state, so there is no need to call security_sctp_assoc_request(). + */ + switch (action) { + case 'A': /* Association restart. */ + case 'B': /* Collision case B. */ + case 'D': /* Collision case D. */ + /* Update socket peer label if first association. */ + if (security_sctp_assoc_request((struct sctp_association *)asoc, + chunk->head_skb ?: chunk->skb)) { + sctp_association_free(new_asoc); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + } + break; + } + switch (action) { case 'A': /* Association restart. */ retval = sctp_sf_do_dupcook_a(net, ep, asoc, chunk, commands, From a2ccc33b88e2953a6bf0b309e7e8849cc5320018 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Fri, 23 Aug 2024 19:29:18 +0300 Subject: [PATCH 784/991] drm/i915/dp_mst: Fix MST state after a sink reset In some cases the sink can reset itself after it was configured into MST mode, without the driver noticing the disconnected state. For instance the reset may happen in the middle of a modeset, or the (long) HPD pulse generated may be not long enough for the encoder detect handler to observe the HPD's deasserted state. In this case the sink's DPCD register programmed to enable MST will be reset, while the driver still assumes MST is still enabled. Detect this condition, which will tear down and recreate/re-enable the MST topology. v2: - Add a code comment about adjusting the expected DP_MSTM_CTRL register value for SST + SideBand. (Suraj, Jani) - Print a debug message about detecting the link reset. (Jani) - Verify the DPCD MST state only if it wasn't already determined that the sink is disconnected. Cc: stable@vger.kernel.org Cc: Jani Nikula Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/11195 Reviewed-by: Suraj Kandpal (v1) Signed-off-by: Imre Deak Link: https://patchwork.freedesktop.org/patch/msgid/20240823162918.1211875-1-imre.deak@intel.com (cherry picked from commit 594cf78dc36f31c0c7e0de4567e644f406d46bae) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/display/intel_dp.c | 12 +++++++ drivers/gpu/drm/i915/display/intel_dp_mst.c | 40 +++++++++++++++++++++ drivers/gpu/drm/i915/display/intel_dp_mst.h | 1 + 3 files changed, 53 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index 59f11af3b0a1dc..dc75a929d3ed68 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -5935,6 +5935,18 @@ intel_dp_detect(struct drm_connector *connector, else status = connector_status_disconnected; + if (status != connector_status_disconnected && + !intel_dp_mst_verify_dpcd_state(intel_dp)) + /* + * This requires retrying detection for instance to re-enable + * the MST mode that got reset via a long HPD pulse. The retry + * will happen either via the hotplug handler's retry logic, + * ensured by setting the connector here to SST/disconnected, + * or via a userspace connector probing in response to the + * hotplug uevent sent when removing the MST connectors. + */ + status = connector_status_disconnected; + if (status == connector_status_disconnected) { memset(&intel_dp->compliance, 0, sizeof(intel_dp->compliance)); memset(intel_connector->dp.dsc_dpcd, 0, sizeof(intel_connector->dp.dsc_dpcd)); diff --git a/drivers/gpu/drm/i915/display/intel_dp_mst.c b/drivers/gpu/drm/i915/display/intel_dp_mst.c index 27ce5c3f5951e5..17978a1f9ab0a0 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_mst.c +++ b/drivers/gpu/drm/i915/display/intel_dp_mst.c @@ -1998,3 +1998,43 @@ bool intel_dp_mst_crtc_needs_modeset(struct intel_atomic_state *state, return false; } + +/* + * intel_dp_mst_verify_dpcd_state - verify the MST SW enabled state wrt. the DPCD + * @intel_dp: DP port object + * + * Verify if @intel_dp's MST enabled SW state matches the corresponding DPCD + * state. A long HPD pulse - not long enough to be detected as a disconnected + * state - could've reset the DPCD state, which requires tearing + * down/recreating the MST topology. + * + * Returns %true if the SW MST enabled and DPCD states match, %false + * otherwise. + */ +bool intel_dp_mst_verify_dpcd_state(struct intel_dp *intel_dp) +{ + struct intel_display *display = to_intel_display(intel_dp); + struct intel_connector *connector = intel_dp->attached_connector; + struct intel_digital_port *dig_port = dp_to_dig_port(intel_dp); + struct intel_encoder *encoder = &dig_port->base; + int ret; + u8 val; + + if (!intel_dp->is_mst) + return true; + + ret = drm_dp_dpcd_readb(intel_dp->mst_mgr.aux, DP_MSTM_CTRL, &val); + + /* Adjust the expected register value for SST + SideBand. */ + if (ret < 0 || val != (DP_MST_EN | DP_UP_REQ_EN | DP_UPSTREAM_IS_SRC)) { + drm_dbg_kms(display->drm, + "[CONNECTOR:%d:%s][ENCODER:%d:%s] MST mode got reset, removing topology (ret=%d, ctrl=0x%02x)\n", + connector->base.base.id, connector->base.name, + encoder->base.base.id, encoder->base.name, + ret, val); + + return false; + } + + return true; +} diff --git a/drivers/gpu/drm/i915/display/intel_dp_mst.h b/drivers/gpu/drm/i915/display/intel_dp_mst.h index 8ca1d599091c69..9e4c7679f1c3a3 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_mst.h +++ b/drivers/gpu/drm/i915/display/intel_dp_mst.h @@ -27,5 +27,6 @@ int intel_dp_mst_atomic_check_link(struct intel_atomic_state *state, struct intel_link_bw_limits *limits); bool intel_dp_mst_crtc_needs_modeset(struct intel_atomic_state *state, struct intel_crtc *crtc); +bool intel_dp_mst_verify_dpcd_state(struct intel_dp *intel_dp); #endif /* __INTEL_DP_MST_H__ */ From a547a5880cba6f287179135381f1b484b251be31 Mon Sep 17 00:00:00 2001 From: Peter Newman Date: Thu, 22 Aug 2024 12:02:11 -0700 Subject: [PATCH 785/991] x86/resctrl: Fix arch_mbm_* array overrun on SNC When using resctrl on systems with Sub-NUMA Clustering enabled, monitoring groups may be allocated RMID values which would overrun the arch_mbm_{local,total} arrays. This is due to inconsistencies in whether the SNC-adjusted num_rmid value or the unadjusted value in resctrl_arch_system_num_rmid_idx() is used. The num_rmid value for the L3 resource is currently: resctrl_arch_system_num_rmid_idx() / snc_nodes_per_l3_cache As a simple fix, make resctrl_arch_system_num_rmid_idx() return the SNC-adjusted, L3 num_rmid value on x86. Fixes: e13db55b5a0d ("x86/resctrl: Introduce snc_nodes_per_l3_cache") Signed-off-by: Peter Newman Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/r/20240822190212.1848788-1-peternewman@google.com --- arch/x86/include/asm/resctrl.h | 6 ------ arch/x86/kernel/cpu/resctrl/core.c | 8 ++++++++ include/linux/resctrl.h | 1 + 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 12dbd2588ca7cc..8b1b6ce1e51b21 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -156,12 +156,6 @@ static inline void resctrl_sched_in(struct task_struct *tsk) __resctrl_sched_in(tsk); } -static inline u32 resctrl_arch_system_num_rmid_idx(void) -{ - /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */ - return boot_cpu_data.x86_cache_max_rmid + 1; -} - static inline void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid) { *rmid = idx; diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 1930fce9dfe96d..8591d53c144bb1 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -119,6 +119,14 @@ struct rdt_hw_resource rdt_resources_all[] = { }, }; +u32 resctrl_arch_system_num_rmid_idx(void) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + + /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */ + return r->num_rmid; +} + /* * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs * as they do not have CPUID enumeration support for Cache allocation. diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index b0875b99e8111e..d94abba1c716e9 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -248,6 +248,7 @@ struct resctrl_schema { /* The number of closid supported by this resource regardless of CDP */ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); +u32 resctrl_arch_system_num_rmid_idx(void); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); /* From 6a5dcd487791e0c2d86622064602a5c7459941ed Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Aug 2024 23:06:48 +0100 Subject: [PATCH 786/991] cifs: Fix lack of credit renegotiation on read retry When netfslib asks cifs to issue a read operation, it prefaces this with a call to ->clamp_length() which cifs uses to negotiate credits, providing receive capacity on the server; however, in the event that a read op needs reissuing, netfslib doesn't call ->clamp_length() again as that could shorten the subrequest, leaving a gap. This causes the retried read to be done with zero credits which causes the server to reject it with STATUS_INVALID_PARAMETER. This is a problem for a DIO read that is requested that would go over the EOF. The short read will be retried, causing EINVAL to be returned to the user when it fails. Fix this by making cifs_req_issue_read() negotiate new credits if retrying (NETFS_SREQ_RETRYING now gets set in the read side as well as the write side in this instance). This isn't sufficient, however: the new credits might not be sufficient to complete the remainder of the read, so also add an additional field, rreq->actual_len, that holds the actual size of the op we want to perform without having to alter subreq->len. We then rely on repeated short reads being retried until we finish the read or reach the end of file and make a zero-length read. Also fix a couple of places where the subrequest start and length need to be altered by the amount so far transferred when being used. Fixes: 69c3c023af25 ("cifs: Implement netfslib hooks") Signed-off-by: David Howells cc: Steve French cc: Paulo Alcantara cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Steve French --- fs/netfs/io.c | 2 ++ fs/smb/client/cifsglob.h | 1 + fs/smb/client/file.c | 37 +++++++++++++++++++++++++++++++++---- fs/smb/client/smb2ops.c | 2 +- fs/smb/client/smb2pdu.c | 28 +++++++++++++++++----------- fs/smb/client/trace.h | 1 + 6 files changed, 55 insertions(+), 16 deletions(-) diff --git a/fs/netfs/io.c b/fs/netfs/io.c index 4da0a494e860f1..3303b515b53635 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -306,6 +306,7 @@ static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq) break; subreq->source = NETFS_DOWNLOAD_FROM_SERVER; subreq->error = 0; + __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); netfs_stat(&netfs_n_rh_download_instead); trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead); netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); @@ -313,6 +314,7 @@ static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq) netfs_reset_subreq_iter(rreq, subreq); netfs_read_from_server(rreq, subreq); } else if (test_bit(NETFS_SREQ_SHORT_IO, &subreq->flags)) { + __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); netfs_reset_subreq_iter(rreq, subreq); netfs_rreq_short_read(rreq, subreq); } diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index f379b9dc93bac1..9eae8649f90c38 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1485,6 +1485,7 @@ struct cifs_io_subrequest { struct cifs_io_request *req; }; ssize_t got_bytes; + size_t actual_len; unsigned int xid; int result; bool have_xid; diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index f9b302cb8233cf..2d387485f05ba0 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -111,6 +111,7 @@ static void cifs_issue_write(struct netfs_io_subrequest *subreq) goto fail; } + wdata->actual_len = wdata->subreq.len; rc = adjust_credits(wdata->server, wdata, cifs_trace_rw_credits_issue_write_adjust); if (rc) goto fail; @@ -153,7 +154,7 @@ static bool cifs_clamp_length(struct netfs_io_subrequest *subreq) struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); struct TCP_Server_Info *server = req->server; struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb); - size_t rsize = 0; + size_t rsize; int rc; rdata->xid = get_xid(); @@ -166,8 +167,8 @@ static bool cifs_clamp_length(struct netfs_io_subrequest *subreq) cifs_sb->ctx); - rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize, &rsize, - &rdata->credits); + rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize, + &rsize, &rdata->credits); if (rc) { subreq->error = rc; return false; @@ -183,7 +184,8 @@ static bool cifs_clamp_length(struct netfs_io_subrequest *subreq) server->credits, server->in_flight, 0, cifs_trace_rw_credits_read_submit); - subreq->len = min_t(size_t, subreq->len, rsize); + subreq->len = umin(subreq->len, rsize); + rdata->actual_len = subreq->len; #ifdef CONFIG_CIFS_SMB_DIRECT if (server->smbd_conn) @@ -203,12 +205,39 @@ static void cifs_req_issue_read(struct netfs_io_subrequest *subreq) struct netfs_io_request *rreq = subreq->rreq; struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq); struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); + struct TCP_Server_Info *server = req->server; + struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb); int rc = 0; cifs_dbg(FYI, "%s: op=%08x[%x] mapping=%p len=%zu/%zu\n", __func__, rreq->debug_id, subreq->debug_index, rreq->mapping, subreq->transferred, subreq->len); + if (test_bit(NETFS_SREQ_RETRYING, &subreq->flags)) { + /* + * As we're issuing a retry, we need to negotiate some new + * credits otherwise the server may reject the op with + * INVALID_PARAMETER. Note, however, we may get back less + * credit than we need to complete the op, in which case, we + * shorten the op and rely on additional rounds of retry. + */ + size_t rsize = umin(subreq->len - subreq->transferred, + cifs_sb->ctx->rsize); + + rc = server->ops->wait_mtu_credits(server, rsize, &rdata->actual_len, + &rdata->credits); + if (rc) + goto out; + + rdata->credits.in_flight_check = 1; + + trace_smb3_rw_credits(rdata->rreq->debug_id, + rdata->subreq.debug_index, + rdata->credits.value, + server->credits, server->in_flight, 0, + cifs_trace_rw_credits_read_resubmit); + } + if (req->cfile->invalidHandle) { do { rc = cifs_reopen_file(req->cfile, true); diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 0b9cb1a60d4afa..a6f00b1572755f 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -301,7 +301,7 @@ smb2_adjust_credits(struct TCP_Server_Info *server, unsigned int /*enum smb3_rw_credits_trace*/ trace) { struct cifs_credits *credits = &subreq->credits; - int new_val = DIV_ROUND_UP(subreq->subreq.len, SMB2_MAX_BUFFER_SIZE); + int new_val = DIV_ROUND_UP(subreq->actual_len, SMB2_MAX_BUFFER_SIZE); int scredits, in_flight; if (!credits->value || credits->value == new_val) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 2d7e6c42cf182b..be7a1a9c691da6 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4529,9 +4529,9 @@ smb2_readv_callback(struct mid_q_entry *mid) "rdata server %p != mid server %p", rdata->server, mid->server); - cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%zu\n", + cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%zu/%zu\n", __func__, mid->mid, mid->mid_state, rdata->result, - rdata->subreq.len); + rdata->actual_len, rdata->subreq.len - rdata->subreq.transferred); switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: @@ -4585,15 +4585,18 @@ smb2_readv_callback(struct mid_q_entry *mid) rdata->subreq.debug_index, rdata->xid, rdata->req->cfile->fid.persistent_fid, - tcon->tid, tcon->ses->Suid, rdata->subreq.start, - rdata->subreq.len, rdata->result); + tcon->tid, tcon->ses->Suid, + rdata->subreq.start + rdata->subreq.transferred, + rdata->actual_len, + rdata->result); } else trace_smb3_read_done(rdata->rreq->debug_id, rdata->subreq.debug_index, rdata->xid, rdata->req->cfile->fid.persistent_fid, tcon->tid, tcon->ses->Suid, - rdata->subreq.start, rdata->got_bytes); + rdata->subreq.start + rdata->subreq.transferred, + rdata->got_bytes); if (rdata->result == -ENODATA) { /* We may have got an EOF error because fallocate @@ -4621,6 +4624,7 @@ smb2_async_readv(struct cifs_io_subrequest *rdata) { int rc, flags = 0; char *buf; + struct netfs_io_subrequest *subreq = &rdata->subreq; struct smb2_hdr *shdr; struct cifs_io_parms io_parms; struct smb_rqst rqst = { .rq_iov = rdata->iov, @@ -4631,15 +4635,15 @@ smb2_async_readv(struct cifs_io_subrequest *rdata) int credit_request; cifs_dbg(FYI, "%s: offset=%llu bytes=%zu\n", - __func__, rdata->subreq.start, rdata->subreq.len); + __func__, subreq->start, subreq->len); if (!rdata->server) rdata->server = cifs_pick_channel(tcon->ses); io_parms.tcon = tlink_tcon(rdata->req->cfile->tlink); io_parms.server = server = rdata->server; - io_parms.offset = rdata->subreq.start; - io_parms.length = rdata->subreq.len; + io_parms.offset = subreq->start + subreq->transferred; + io_parms.length = rdata->actual_len; io_parms.persistent_fid = rdata->req->cfile->fid.persistent_fid; io_parms.volatile_fid = rdata->req->cfile->fid.volatile_fid; io_parms.pid = rdata->req->pid; @@ -4654,11 +4658,13 @@ smb2_async_readv(struct cifs_io_subrequest *rdata) rdata->iov[0].iov_base = buf; rdata->iov[0].iov_len = total_len; + rdata->got_bytes = 0; + rdata->result = 0; shdr = (struct smb2_hdr *)buf; if (rdata->credits.value > 0) { - shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->subreq.len, + shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->actual_len, SMB2_MAX_BUFFER_SIZE)); credit_request = le16_to_cpu(shdr->CreditCharge) + 8; if (server->credits >= server->max_credits) @@ -4682,11 +4688,11 @@ smb2_async_readv(struct cifs_io_subrequest *rdata) if (rc) { cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE); trace_smb3_read_err(rdata->rreq->debug_id, - rdata->subreq.debug_index, + subreq->debug_index, rdata->xid, io_parms.persistent_fid, io_parms.tcon->tid, io_parms.tcon->ses->Suid, - io_parms.offset, io_parms.length, rc); + io_parms.offset, rdata->actual_len, rc); } async_readv_out: diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index 0f0c10c7ada73f..8e9964001e2aed 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -30,6 +30,7 @@ EM(cifs_trace_rw_credits_old_session, "old-session") \ EM(cifs_trace_rw_credits_read_response_add, "rd-resp-add") \ EM(cifs_trace_rw_credits_read_response_clear, "rd-resp-clr") \ + EM(cifs_trace_rw_credits_read_resubmit, "rd-resubmit") \ EM(cifs_trace_rw_credits_read_submit, "rd-submit ") \ EM(cifs_trace_rw_credits_write_prepare, "wr-prepare ") \ EM(cifs_trace_rw_credits_write_response_add, "wr-resp-add") \ From 1da29f2c39b67b846b74205c81bf0ccd96d34727 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Aug 2024 23:06:49 +0100 Subject: [PATCH 787/991] netfs, cifs: Fix handling of short DIO read Short DIO reads, particularly in relation to cifs, are not being handled correctly by cifs and netfslib. This can be tested by doing a DIO read of a file where the size of read is larger than the size of the file. When it crosses the EOF, it gets a short read and this gets retried, and in the case of cifs, the retry read fails, with the failure being translated to ENODATA. Fix this by the following means: (1) Add a flag, NETFS_SREQ_HIT_EOF, for the filesystem to set when it detects that the read did hit the EOF. (2) Make the netfslib read assessment stop processing subrequests when it encounters one with that flag set. (3) Return rreq->transferred, the accumulated contiguous amount read to that point, to userspace for a DIO read. (4) Make cifs set the flag and clear the error if the read RPC returned ENODATA. (5) Make cifs set the flag and clear the error if a short read occurred without error and the read-to file position is now at the remote inode size. Fixes: 69c3c023af25 ("cifs: Implement netfslib hooks") Signed-off-by: David Howells cc: Steve French cc: Paulo Alcantara cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Steve French --- fs/netfs/io.c | 17 +++++++++++------ fs/smb/client/smb2pdu.c | 13 +++++++++---- include/linux/netfs.h | 1 + 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/fs/netfs/io.c b/fs/netfs/io.c index 3303b515b53635..943128507af550 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -368,7 +368,8 @@ static void netfs_rreq_assess_dio(struct netfs_io_request *rreq) if (subreq->error || subreq->transferred == 0) break; transferred += subreq->transferred; - if (subreq->transferred < subreq->len) + if (subreq->transferred < subreq->len || + test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) break; } @@ -503,7 +504,8 @@ void netfs_subreq_terminated(struct netfs_io_subrequest *subreq, subreq->error = 0; subreq->transferred += transferred_or_error; - if (subreq->transferred < subreq->len) + if (subreq->transferred < subreq->len && + !test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) goto incomplete; complete: @@ -782,10 +784,13 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) TASK_UNINTERRUPTIBLE); ret = rreq->error; - if (ret == 0 && rreq->submitted < rreq->len && - rreq->origin != NETFS_DIO_READ) { - trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); - ret = -EIO; + if (ret == 0) { + if (rreq->origin == NETFS_DIO_READ) { + ret = rreq->transferred; + } else if (rreq->submitted < rreq->len) { + trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); + ret = -EIO; + } } } else { /* If we decrement nr_outstanding to 0, the ref belongs to us. */ diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index be7a1a9c691da6..88dc49d670371b 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4507,6 +4507,7 @@ static void smb2_readv_callback(struct mid_q_entry *mid) { struct cifs_io_subrequest *rdata = mid->callback_data; + struct netfs_inode *ictx = netfs_inode(rdata->rreq->inode); struct cifs_tcon *tcon = tlink_tcon(rdata->req->cfile->tlink); struct TCP_Server_Info *server = rdata->server; struct smb2_hdr *shdr = @@ -4599,11 +4600,15 @@ smb2_readv_callback(struct mid_q_entry *mid) rdata->got_bytes); if (rdata->result == -ENODATA) { - /* We may have got an EOF error because fallocate - * failed to enlarge the file. - */ - if (rdata->subreq.start < rdata->subreq.rreq->i_size) + __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); + rdata->result = 0; + } else { + if (rdata->got_bytes < rdata->actual_len && + rdata->subreq.start + rdata->subreq.transferred + rdata->got_bytes == + ictx->remote_i_size) { + __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; + } } trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, rdata->credits.value, server->credits, server->in_flight, diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 983816608f15d7..c47443e7a97efd 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -198,6 +198,7 @@ struct netfs_io_subrequest { #define NETFS_SREQ_NEED_RETRY 9 /* Set if the filesystem requests a retry */ #define NETFS_SREQ_RETRYING 10 /* Set if we're retrying */ #define NETFS_SREQ_FAILED 11 /* Set if the subreq failed unretryably */ +#define NETFS_SREQ_HIT_EOF 12 /* Set if we hit the EOF */ }; enum netfs_io_origin { From 8101d6e112e2524e967368f920c404ae445a9757 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 27 Aug 2024 15:47:27 +0100 Subject: [PATCH 788/991] cifs: Fix copy offload to flush destination region Fix cifs_file_copychunk_range() to flush the destination region before invalidating it to avoid potential loss of data should the copy fail, in whole or in part, in some way. Fixes: 7b2404a886f8 ("cifs: Fix flushing, invalidation and file size with copy_file_range()") Signed-off-by: David Howells cc: Steve French cc: Paulo Alcantara cc: Shyam Prasad N cc: Rohith Surabattula cc: Matthew Wilcox cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: linux-mm@kvack.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/cifsfs.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index d89485235425ae..2a2523c93944de 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -1341,7 +1341,6 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, struct cifsFileInfo *smb_file_target; struct cifs_tcon *src_tcon; struct cifs_tcon *target_tcon; - unsigned long long destend, fstart, fend; ssize_t rc; cifs_dbg(FYI, "copychunk range\n"); @@ -1391,25 +1390,13 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, goto unlock; } - destend = destoff + len - 1; - - /* Flush the folios at either end of the destination range to prevent - * accidental loss of dirty data outside of the range. + /* Flush and invalidate all the folios in the destination region. If + * the copy was successful, then some of the flush is extra overhead, + * but we need to allow for the copy failing in some way (eg. ENOSPC). */ - fstart = destoff; - fend = destend; - - rc = cifs_flush_folio(target_inode, destoff, &fstart, &fend, true); + rc = filemap_invalidate_inode(target_inode, true, destoff, destoff + len - 1); if (rc) goto unlock; - rc = cifs_flush_folio(target_inode, destend, &fstart, &fend, false); - if (rc) - goto unlock; - if (fend > target_cifsi->netfs.zero_point) - target_cifsi->netfs.zero_point = fend + 1; - - /* Discard all the folios that overlap the destination region. */ - truncate_inode_pages_range(&target_inode->i_data, fstart, fend); fscache_invalidate(cifs_inode_cookie(target_inode), NULL, i_size_read(target_inode), 0); From 383baf5c8f062091af34c63f28d37642a8f188ae Mon Sep 17 00:00:00 2001 From: Mrinmay Sarkar Date: Mon, 26 Aug 2024 17:41:00 +0530 Subject: [PATCH 789/991] dmaengine: dw-edma: Fix unmasking STOP and ABORT interrupts for HDMA The current logic is enabling both STOP_INT_MASK and ABORT_INT_MASK bit. This is apparently masking those particular interrupts rather than unmasking the same. If the interrupts are masked, they would never get triggered. So fix the issue by unmasking the STOP and ABORT interrupts properly. Fixes: e74c39573d35 ("dmaengine: dw-edma: Add support for native HDMA") cc: stable@vger.kernel.org Signed-off-by: Mrinmay Sarkar Reviewed-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/1724674261-3144-2-git-send-email-quic_msarkar@quicinc.com Signed-off-by: Vinod Koul --- drivers/dma/dw-edma/dw-hdma-v0-core.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/dma/dw-edma/dw-hdma-v0-core.c b/drivers/dma/dw-edma/dw-hdma-v0-core.c index 10e8f0715114fb..2addaca3b694f1 100644 --- a/drivers/dma/dw-edma/dw-hdma-v0-core.c +++ b/drivers/dma/dw-edma/dw-hdma-v0-core.c @@ -247,10 +247,11 @@ static void dw_hdma_v0_core_start(struct dw_edma_chunk *chunk, bool first) if (first) { /* Enable engine */ SET_CH_32(dw, chan->dir, chan->id, ch_en, BIT(0)); - /* Interrupt enable&unmask - done, abort */ - tmp = GET_CH_32(dw, chan->dir, chan->id, int_setup) | - HDMA_V0_STOP_INT_MASK | HDMA_V0_ABORT_INT_MASK | - HDMA_V0_LOCAL_STOP_INT_EN | HDMA_V0_LOCAL_ABORT_INT_EN; + /* Interrupt unmask - stop, abort */ + tmp = GET_CH_32(dw, chan->dir, chan->id, int_setup); + tmp &= ~(HDMA_V0_STOP_INT_MASK | HDMA_V0_ABORT_INT_MASK); + /* Interrupt enable - stop, abort */ + tmp |= HDMA_V0_LOCAL_STOP_INT_EN | HDMA_V0_LOCAL_ABORT_INT_EN; if (!(dw->chip->flags & DW_EDMA_CHIP_LOCAL)) tmp |= HDMA_V0_REMOTE_STOP_INT_EN | HDMA_V0_REMOTE_ABORT_INT_EN; SET_CH_32(dw, chan->dir, chan->id, int_setup, tmp); From 9f646ff25c09c52cebe726601db27a60f876f15e Mon Sep 17 00:00:00 2001 From: Mrinmay Sarkar Date: Mon, 26 Aug 2024 17:41:01 +0530 Subject: [PATCH 790/991] dmaengine: dw-edma: Do not enable watermark interrupts for HDMA DW_HDMA_V0_LIE and DW_HDMA_V0_RIE are initialized as BIT(3) and BIT(4) respectively in dw_hdma_control enum. But as per HDMA register these bits are corresponds to LWIE and RWIE bit i.e local watermark interrupt enable and remote watermarek interrupt enable. In linked list mode LWIE and RWIE bits only enable the local and remote watermark interrupt. Since the watermark interrupts are not used but enabled, this leads to spurious interrupts getting generated. So remove the code that enables them to avoid generating spurious watermark interrupts. And also rename DW_HDMA_V0_LIE to DW_HDMA_V0_LWIE and DW_HDMA_V0_RIE to DW_HDMA_V0_RWIE as there is no LIE and RIE bits in HDMA and those bits are corresponds to LWIE and RWIE bits. Fixes: e74c39573d35 ("dmaengine: dw-edma: Add support for native HDMA") cc: stable@vger.kernel.org Signed-off-by: Mrinmay Sarkar Reviewed-by: Manivannan Sadhasivam Reviewed-by: Serge Semin Link: https://lore.kernel.org/r/1724674261-3144-3-git-send-email-quic_msarkar@quicinc.com Signed-off-by: Vinod Koul --- drivers/dma/dw-edma/dw-hdma-v0-core.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/drivers/dma/dw-edma/dw-hdma-v0-core.c b/drivers/dma/dw-edma/dw-hdma-v0-core.c index 2addaca3b694f1..e3f8db4fe909a1 100644 --- a/drivers/dma/dw-edma/dw-hdma-v0-core.c +++ b/drivers/dma/dw-edma/dw-hdma-v0-core.c @@ -17,8 +17,8 @@ enum dw_hdma_control { DW_HDMA_V0_CB = BIT(0), DW_HDMA_V0_TCB = BIT(1), DW_HDMA_V0_LLP = BIT(2), - DW_HDMA_V0_LIE = BIT(3), - DW_HDMA_V0_RIE = BIT(4), + DW_HDMA_V0_LWIE = BIT(3), + DW_HDMA_V0_RWIE = BIT(4), DW_HDMA_V0_CCS = BIT(8), DW_HDMA_V0_LLE = BIT(9), }; @@ -195,25 +195,14 @@ static void dw_hdma_v0_write_ll_link(struct dw_edma_chunk *chunk, static void dw_hdma_v0_core_write_chunk(struct dw_edma_chunk *chunk) { struct dw_edma_burst *child; - struct dw_edma_chan *chan = chunk->chan; u32 control = 0, i = 0; - int j; if (chunk->cb) control = DW_HDMA_V0_CB; - j = chunk->bursts_alloc; - list_for_each_entry(child, &chunk->burst->list, list) { - j--; - if (!j) { - control |= DW_HDMA_V0_LIE; - if (!(chan->dw->chip->flags & DW_EDMA_CHIP_LOCAL)) - control |= DW_HDMA_V0_RIE; - } - + list_for_each_entry(child, &chunk->burst->list, list) dw_hdma_v0_write_ll_data(chunk, i++, control, child->sz, child->sar, child->dar); - } control = DW_HDMA_V0_LLP | DW_HDMA_V0_TCB; if (!chunk->cb) From 8f614469de248a4bc55fb07e55d5f4c340c75b11 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 21 Aug 2024 14:32:02 -0400 Subject: [PATCH 791/991] drm/amdgpu: align pp_power_profile_mode with kernel docs The kernel doc says you need to select manual mode to adjust this, but the code only allows you to adjust it when manual mode is not selected. Remove the manual mode check. Reviewed-by: Kenneth Feng Signed-off-by: Alex Deucher (cherry picked from commit bbb05f8a9cd87f5046d05a0c596fddfb714ee457) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index 9d7454b3c3143d..bc83cd89f8a00b 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -2265,8 +2265,7 @@ static int smu_adjust_power_state_dynamic(struct smu_context *smu, smu_dpm_ctx->dpm_level = level; } - if (smu_dpm_ctx->dpm_level != AMD_DPM_FORCED_LEVEL_MANUAL && - smu_dpm_ctx->dpm_level != AMD_DPM_FORCED_LEVEL_PERF_DETERMINISM) { + if (smu_dpm_ctx->dpm_level != AMD_DPM_FORCED_LEVEL_PERF_DETERMINISM) { index = fls(smu->workload_mask); index = index > 0 && index <= WORKLOAD_POLICY_MAX ? index - 1 : 0; workload[0] = smu->workload_setting[index]; @@ -2343,8 +2342,7 @@ static int smu_switch_power_profile(void *handle, workload[0] = smu->workload_setting[index]; } - if (smu_dpm_ctx->dpm_level != AMD_DPM_FORCED_LEVEL_MANUAL && - smu_dpm_ctx->dpm_level != AMD_DPM_FORCED_LEVEL_PERF_DETERMINISM) + if (smu_dpm_ctx->dpm_level != AMD_DPM_FORCED_LEVEL_PERF_DETERMINISM) smu_bump_power_profile_mode(smu, workload, 0); return 0; From 948f279dc48a6db17204f9b23f76b67abcd5d702 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 22 Aug 2024 16:20:10 -0400 Subject: [PATCH 792/991] drm/amdgpu/smu13.0.7: print index for profiles Print the index for the profiles. Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3543 Reviewed-by: Kenneth Feng Signed-off-by: Alex Deucher (cherry picked from commit b86a6a57b8ad1699ba8b1c270a79678383baf632) --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c index a7d0231727e8f4..7bc95c4043778d 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c @@ -2378,7 +2378,7 @@ static int smu_v13_0_7_get_power_profile_mode(struct smu_context *smu, char *buf size += sysfs_emit_at(buf, size, " "); for (i = 0; i <= PP_SMC_POWER_PROFILE_WINDOW3D; i++) - size += sysfs_emit_at(buf, size, "%-14s%s", amdgpu_pp_profile_name[i], + size += sysfs_emit_at(buf, size, "%d %-14s%s", i, amdgpu_pp_profile_name[i], (i == smu->power_profile_mode) ? "* " : " "); size += sysfs_emit_at(buf, size, "\n"); @@ -2408,7 +2408,7 @@ static int smu_v13_0_7_get_power_profile_mode(struct smu_context *smu, char *buf do { \ size += sysfs_emit_at(buf, size, "%-30s", #field); \ for (j = 0; j <= PP_SMC_POWER_PROFILE_WINDOW3D; j++) \ - size += sysfs_emit_at(buf, size, "%-16d", activity_monitor_external[j].DpmActivityMonitorCoeffInt.field); \ + size += sysfs_emit_at(buf, size, "%-18d", activity_monitor_external[j].DpmActivityMonitorCoeffInt.field); \ size += sysfs_emit_at(buf, size, "\n"); \ } while (0) From d420c857d85777663e8d16adfc24463f5d5c2dbc Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 22 Aug 2024 21:54:24 -0400 Subject: [PATCH 793/991] drm/amdgpu/swsmu: always force a state reprogram on init Always reprogram the hardware state on init. This ensures the PMFW state is explicitly programmed and we are not relying on the default PMFW state. Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3131 Reviewed-by: Kenneth Feng Signed-off-by: Alex Deucher (cherry picked from commit c50fe289ed7207f71df3b5f1720512a9620e84fb) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index bc83cd89f8a00b..74e35f8ddefcf9 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -2224,8 +2224,9 @@ static int smu_bump_power_profile_mode(struct smu_context *smu, } static int smu_adjust_power_state_dynamic(struct smu_context *smu, - enum amd_dpm_forced_level level, - bool skip_display_settings) + enum amd_dpm_forced_level level, + bool skip_display_settings, + bool force_update) { int ret = 0; int index = 0; @@ -2254,7 +2255,7 @@ static int smu_adjust_power_state_dynamic(struct smu_context *smu, } } - if (smu_dpm_ctx->dpm_level != level) { + if (force_update || smu_dpm_ctx->dpm_level != level) { ret = smu_asic_set_performance_level(smu, level); if (ret) { dev_err(smu->adev->dev, "Failed to set performance level!"); @@ -2270,7 +2271,7 @@ static int smu_adjust_power_state_dynamic(struct smu_context *smu, index = index > 0 && index <= WORKLOAD_POLICY_MAX ? index - 1 : 0; workload[0] = smu->workload_setting[index]; - if (smu->power_profile_mode != workload[0]) + if (force_update || smu->power_profile_mode != workload[0]) smu_bump_power_profile_mode(smu, workload, 0); } @@ -2291,11 +2292,13 @@ static int smu_handle_task(struct smu_context *smu, ret = smu_pre_display_config_changed(smu); if (ret) return ret; - ret = smu_adjust_power_state_dynamic(smu, level, false); + ret = smu_adjust_power_state_dynamic(smu, level, false, false); break; case AMD_PP_TASK_COMPLETE_INIT: + ret = smu_adjust_power_state_dynamic(smu, level, true, true); + break; case AMD_PP_TASK_READJUST_POWER_STATE: - ret = smu_adjust_power_state_dynamic(smu, level, true); + ret = smu_adjust_power_state_dynamic(smu, level, true, false); break; default: break; From 37a45fb8db2619e03d26de59dbdb4ae2b0b02d7d Mon Sep 17 00:00:00 2001 From: Kenneth Feng Date: Tue, 20 Aug 2024 08:57:15 +0800 Subject: [PATCH 794/991] drm/amd/pm: update message interface for smu v14.0.2/3 update message interface for smu v14.0.2/3 Signed-off-by: Kenneth Feng Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit 01bfabc2d1d8aaffe5268f8df0843a6d916dcbaa) --- .../pm/swsmu/inc/pmfw_if/smu_v14_0_2_ppsmc.h | 18 ++++++++++++++---- .../drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c | 1 - 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_2_ppsmc.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_2_ppsmc.h index de2e442281ffee..87ca5ceb1ece10 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_2_ppsmc.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_2_ppsmc.h @@ -92,7 +92,6 @@ //Resets #define PPSMC_MSG_PrepareMp1ForUnload 0x2E -#define PPSMC_MSG_Mode1Reset 0x2F //Set SystemVirtual DramAddrHigh #define PPSMC_MSG_SetSystemVirtualDramAddrHigh 0x30 @@ -119,11 +118,12 @@ //STB to dram log #define PPSMC_MSG_DumpSTBtoDram 0x3D -#define PPSMC_MSG_STBtoDramLogSetDramAddrHigh 0x3E -#define PPSMC_MSG_STBtoDramLogSetDramAddrLow 0x3F +#define PPSMC_MSG_STBtoDramLogSetDramAddress 0x3E +#define PPSMC_MSG_DummyUndefined 0x3F #define PPSMC_MSG_STBtoDramLogSetDramSize 0x40 #define PPSMC_MSG_SetOBMTraceBufferLogging 0x41 +#define PPSMC_MSG_UseProfilingMode 0x42 #define PPSMC_MSG_AllowGfxDcs 0x43 #define PPSMC_MSG_DisallowGfxDcs 0x44 #define PPSMC_MSG_EnableAudioStutterWA 0x45 @@ -135,6 +135,16 @@ #define PPSMC_MSG_SetBadMemoryPagesRetiredFlagsPerChannel 0x4B #define PPSMC_MSG_SetPriorityDeltaGain 0x4C #define PPSMC_MSG_AllowIHHostInterrupt 0x4D +#define PPSMC_MSG_EnableShadowDpm 0x4E #define PPSMC_MSG_Mode3Reset 0x4F -#define PPSMC_Message_Count 0x50 +#define PPSMC_MSG_SetDriverDramAddr 0x50 +#define PPSMC_MSG_SetToolsDramAddr 0x51 +#define PPSMC_MSG_TransferTableSmu2DramWithAddr 0x52 +#define PPSMC_MSG_TransferTableDram2SmuWithAddr 0x53 +#define PPSMC_MSG_GetAllRunningSmuFeatures 0x54 +#define PPSMC_MSG_GetSvi3Voltage 0x55 +#define PPSMC_MSG_UpdatePolicy 0x56 +#define PPSMC_MSG_ExtPwrConnSupport 0x57 +#define PPSMC_MSG_PreloadSwPstateForUclkOverDrive 0x58 +#define PPSMC_Message_Count 0x59 #endif diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c index e1a27903c80a14..e291137176bf81 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c @@ -115,7 +115,6 @@ static struct cmn2asic_msg_mapping smu_v14_0_2_message_map[SMU_MSG_MAX_COUNT] = MSG_MAP(SetMGpuFanBoostLimitRpm, PPSMC_MSG_SetMGpuFanBoostLimitRpm, 0), MSG_MAP(GetPptLimit, PPSMC_MSG_GetPptLimit, 0), MSG_MAP(NotifyPowerSource, PPSMC_MSG_NotifyPowerSource, 0), - MSG_MAP(Mode1Reset, PPSMC_MSG_Mode1Reset, 0), MSG_MAP(PrepareMp1ForUnload, PPSMC_MSG_PrepareMp1ForUnload, 0), MSG_MAP(DFCstateControl, PPSMC_MSG_SetExternalClientDfCstateAllow, 0), MSG_MAP(ArmD3, PPSMC_MSG_ArmD3, 0), From 959fc102ff4c39f5ab021da311c2cfd1d5602a0c Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 20 Aug 2024 13:11:22 -0400 Subject: [PATCH 795/991] drm/amdgpu/gfx12: set UNORD_DISPATCH in compute MQDs This needs to be set to 1 to avoid a potential deadlock in the GC 10.x and newer. On GC 9.x and older, this needs to be set to 0. This can lead to hangs in some mixed graphics and compute workloads. Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3575 Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit 40318a2406bd426c6f4591269669c04e8eda571d) --- drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index 2c611b8577a7ec..e45d23e8287888 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -3005,7 +3005,7 @@ static int gfx_v12_0_compute_mqd_init(struct amdgpu_device *adev, void *m, (order_base_2(prop->queue_size / 4) - 1)); tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, RPTR_BLOCK_SIZE, (order_base_2(AMDGPU_GPU_PAGE_SIZE / 4) - 1)); - tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, UNORD_DISPATCH, 0); + tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, UNORD_DISPATCH, 1); tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, TUNNEL_DISPATCH, 0); tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, PRIV_STATE, 1); tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, KMD_QUEUE, 1); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c index b7a08e7a44234b..d163d92a692f67 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c @@ -187,6 +187,7 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; m->cp_hqd_pq_control |= ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK; pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); From 3b9a33235c773c7a3768060cf1d2cf8a9153bc37 Mon Sep 17 00:00:00 2001 From: Ma Ke Date: Wed, 21 Aug 2024 12:27:24 +0800 Subject: [PATCH 796/991] drm/amd/display: avoid using null object of framebuffer Instead of using state->fb->obj[0] directly, get object from framebuffer by calling drm_gem_fb_get_obj() and return error code when object is null to avoid using null object of framebuffer. Fixes: 5d945cbcd4b1 ("drm/amd/display: Create a file dedicated to planes") Signed-off-by: Ma Ke Signed-off-by: Alex Deucher (cherry picked from commit 73dd0ad9e5dad53766ea3e631303430116f834b3) --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c index a83bd0331c3b74..5cb11cc2d06368 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include "amdgpu.h" @@ -935,10 +936,14 @@ static int amdgpu_dm_plane_helper_prepare_fb(struct drm_plane *plane, } afb = to_amdgpu_framebuffer(new_state->fb); - obj = new_state->fb->obj[0]; + obj = drm_gem_fb_get_obj(new_state->fb, 0); + if (!obj) { + DRM_ERROR("Failed to get obj from framebuffer\n"); + return -EINVAL; + } + rbo = gem_to_amdgpu_bo(obj); adev = amdgpu_ttm_adev(rbo->tbo.bdev); - r = amdgpu_bo_reserve(rbo, true); if (r) { dev_err(adev->dev, "fail to reserve bo (%d)\n", r); From 6d5064c379557d92832b51d247b385bb8bd6aa5b Mon Sep 17 00:00:00 2001 From: Likun Gao Date: Thu, 22 Aug 2024 11:44:12 +0800 Subject: [PATCH 797/991] drm/amdgpu: support for gc_info table v1.3 Add gc_info table v1.3 for IP discovery. Signed-off-by: Likun Gao Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit 875ff9a7ee8824200885384effa7743892a34ed6) --- drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 11 +++++ drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 6 +++ drivers/gpu/drm/amd/include/discovery.h | 42 +++++++++++++++++++ 3 files changed, 59 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c index 7b561e8e3cafc0..4bd61c169ca8d4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c @@ -1500,6 +1500,7 @@ union gc_info { struct gc_info_v1_0 v1; struct gc_info_v1_1 v1_1; struct gc_info_v1_2 v1_2; + struct gc_info_v1_3 v1_3; struct gc_info_v2_0 v2; struct gc_info_v2_1 v2_1; }; @@ -1558,6 +1559,16 @@ static int amdgpu_discovery_get_gfx_info(struct amdgpu_device *adev) adev->gfx.config.gc_gl1c_size_per_instance = le32_to_cpu(gc_info->v1_2.gc_gl1c_size_per_instance); adev->gfx.config.gc_gl2c_per_gpu = le32_to_cpu(gc_info->v1_2.gc_gl2c_per_gpu); } + if (le16_to_cpu(gc_info->v1.header.version_minor) >= 3) { + adev->gfx.config.gc_tcp_size_per_cu = le32_to_cpu(gc_info->v1_3.gc_tcp_size_per_cu); + adev->gfx.config.gc_tcp_cache_line_size = le32_to_cpu(gc_info->v1_3.gc_tcp_cache_line_size); + adev->gfx.config.gc_instruction_cache_size_per_sqc = le32_to_cpu(gc_info->v1_3.gc_instruction_cache_size_per_sqc); + adev->gfx.config.gc_instruction_cache_line_size = le32_to_cpu(gc_info->v1_3.gc_instruction_cache_line_size); + adev->gfx.config.gc_scalar_data_cache_size_per_sqc = le32_to_cpu(gc_info->v1_3.gc_scalar_data_cache_size_per_sqc); + adev->gfx.config.gc_scalar_data_cache_line_size = le32_to_cpu(gc_info->v1_3.gc_scalar_data_cache_line_size); + adev->gfx.config.gc_tcc_size = le32_to_cpu(gc_info->v1_3.gc_tcc_size); + adev->gfx.config.gc_tcc_cache_line_size = le32_to_cpu(gc_info->v1_3.gc_tcc_cache_line_size); + } break; case 2: adev->gfx.config.max_shader_engines = le32_to_cpu(gc_info->v2.gc_num_se); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index ddda94e49db44a..56cc58edbb4e9a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -240,6 +240,12 @@ struct amdgpu_gfx_config { uint32_t gc_tcp_size_per_cu; uint32_t gc_num_cu_per_sqc; uint32_t gc_tcc_size; + uint32_t gc_tcp_cache_line_size; + uint32_t gc_instruction_cache_size_per_sqc; + uint32_t gc_instruction_cache_line_size; + uint32_t gc_scalar_data_cache_size_per_sqc; + uint32_t gc_scalar_data_cache_line_size; + uint32_t gc_tcc_cache_line_size; }; struct amdgpu_cu_info { diff --git a/drivers/gpu/drm/amd/include/discovery.h b/drivers/gpu/drm/amd/include/discovery.h index 46bf19c9c5c40a..710e328fad48f3 100644 --- a/drivers/gpu/drm/amd/include/discovery.h +++ b/drivers/gpu/drm/amd/include/discovery.h @@ -258,6 +258,48 @@ struct gc_info_v1_2 { uint32_t gc_gl2c_per_gpu; }; +struct gc_info_v1_3 { + struct gpu_info_header header; + uint32_t gc_num_se; + uint32_t gc_num_wgp0_per_sa; + uint32_t gc_num_wgp1_per_sa; + uint32_t gc_num_rb_per_se; + uint32_t gc_num_gl2c; + uint32_t gc_num_gprs; + uint32_t gc_num_max_gs_thds; + uint32_t gc_gs_table_depth; + uint32_t gc_gsprim_buff_depth; + uint32_t gc_parameter_cache_depth; + uint32_t gc_double_offchip_lds_buffer; + uint32_t gc_wave_size; + uint32_t gc_max_waves_per_simd; + uint32_t gc_max_scratch_slots_per_cu; + uint32_t gc_lds_size; + uint32_t gc_num_sc_per_se; + uint32_t gc_num_sa_per_se; + uint32_t gc_num_packer_per_sc; + uint32_t gc_num_gl2a; + uint32_t gc_num_tcp_per_sa; + uint32_t gc_num_sdp_interface; + uint32_t gc_num_tcps; + uint32_t gc_num_tcp_per_wpg; + uint32_t gc_tcp_l1_size; + uint32_t gc_num_sqc_per_wgp; + uint32_t gc_l1_instruction_cache_size_per_sqc; + uint32_t gc_l1_data_cache_size_per_sqc; + uint32_t gc_gl1c_per_sa; + uint32_t gc_gl1c_size_per_instance; + uint32_t gc_gl2c_per_gpu; + uint32_t gc_tcp_size_per_cu; + uint32_t gc_tcp_cache_line_size; + uint32_t gc_instruction_cache_size_per_sqc; + uint32_t gc_instruction_cache_line_size; + uint32_t gc_scalar_data_cache_size_per_sqc; + uint32_t gc_scalar_data_cache_line_size; + uint32_t gc_tcc_size; + uint32_t gc_tcc_cache_line_size; +}; + struct gc_info_v2_0 { struct gpu_info_header header; From badfdc6211f27803bc805fb56629f7d418670870 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Fri, 16 Aug 2024 14:34:17 +0530 Subject: [PATCH 798/991] drm/amd/pm: Add support for new P2S table revision Add p2s table support for a new revision of SMUv13.0.6. Signed-off-by: Lijo Lazar Reviewed-by: Hawking Zhang Reviewed-by: Asad Kamal Acked-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 010cc730ace807c6d267481b5fb6ff99acc35c46) --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 78c3f94bb3ff60..9974c9f8135e99 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -121,6 +121,7 @@ struct mca_ras_info { #define P2S_TABLE_ID_A 0x50325341 #define P2S_TABLE_ID_X 0x50325358 +#define P2S_TABLE_ID_3 0x50325303 // clang-format off static const struct cmn2asic_msg_mapping smu_v13_0_6_message_map[SMU_MSG_MAX_COUNT] = { @@ -271,14 +272,18 @@ static int smu_v13_0_6_init_microcode(struct smu_context *smu) struct amdgpu_device *adev = smu->adev; uint32_t p2s_table_id = P2S_TABLE_ID_A; int ret = 0, i, p2stable_count; + int var = (adev->pdev->device & 0xF); char ucode_prefix[15]; /* No need to load P2S tables in IOV mode */ if (amdgpu_sriov_vf(adev)) return 0; - if (!(adev->flags & AMD_IS_APU)) + if (!(adev->flags & AMD_IS_APU)) { p2s_table_id = P2S_TABLE_ID_X; + if (var == 0x5) + p2s_table_id = P2S_TABLE_ID_3; + } amdgpu_ucode_ip_version_decode(adev, MP1_HWIP, ucode_prefix, sizeof(ucode_prefix)); From 849f0d5880b7494030c4ee1e4fbaf2ca5422bca9 Mon Sep 17 00:00:00 2001 From: Candice Li Date: Wed, 21 Aug 2024 13:10:58 +0800 Subject: [PATCH 799/991] drm/amd/pm: Drop unsupported features on smu v14_0_2 Drop unsupported features on smu v14_0_2. Signed-off-by: Candice Li Reviewed-by: Yang Wang Acked-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 3376f922bfe070eff762164b3fc66981e3079417) --- .../drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c | 47 ------------------- 1 file changed, 47 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c index e291137176bf81..0c09b8c4ff4931 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c @@ -1823,50 +1823,6 @@ static void smu_v14_0_2_set_smu_mailbox_registers(struct smu_context *smu) smu->debug_resp_reg = SOC15_REG_OFFSET(MP1, 0, regMP1_SMN_C2PMSG_54); } -static int smu_v14_0_2_smu_send_bad_mem_page_num(struct smu_context *smu, - uint32_t size) -{ - int ret = 0; - - /* message SMU to update the bad page number on SMUBUS */ - ret = smu_cmn_send_smc_msg_with_param(smu, - SMU_MSG_SetNumBadMemoryPagesRetired, - size, NULL); - if (ret) - dev_err(smu->adev->dev, - "[%s] failed to message SMU to update bad memory pages number\n", - __func__); - - return ret; -} - -static int smu_v14_0_2_send_bad_mem_channel_flag(struct smu_context *smu, - uint32_t size) -{ - int ret = 0; - - /* message SMU to update the bad channel info on SMUBUS */ - ret = smu_cmn_send_smc_msg_with_param(smu, - SMU_MSG_SetBadMemoryPagesRetiredFlagsPerChannel, - size, NULL); - if (ret) - dev_err(smu->adev->dev, - "[%s] failed to message SMU to update bad memory pages channel info\n", - __func__); - - return ret; -} - -static ssize_t smu_v14_0_2_get_ecc_info(struct smu_context *smu, - void *table) -{ - int ret = 0; - - // TODO - - return ret; -} - static ssize_t smu_v14_0_2_get_gpu_metrics(struct smu_context *smu, void **table) { @@ -2014,12 +1970,9 @@ static const struct pptable_funcs smu_v14_0_2_ppt_funcs = { .enable_gfx_features = smu_v14_0_2_enable_gfx_features, .set_mp1_state = smu_v14_0_2_set_mp1_state, .set_df_cstate = smu_v14_0_2_set_df_cstate, - .send_hbm_bad_pages_num = smu_v14_0_2_smu_send_bad_mem_page_num, - .send_hbm_bad_channel_flag = smu_v14_0_2_send_bad_mem_channel_flag, #if 0 .gpo_control = smu_v14_0_gpo_control, #endif - .get_ecc_info = smu_v14_0_2_get_ecc_info, }; void smu_v14_0_2_set_ppt_funcs(struct smu_context *smu) From 9d824c7fce58f59982228aa85b0376b113cdfa35 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Tue, 13 Aug 2024 11:25:04 +0100 Subject: [PATCH 800/991] drm/v3d: Disable preemption while updating GPU stats MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We forgot to disable preemption around the write_seqcount_begin/end() pair while updating GPU stats: [ ] WARNING: CPU: 2 PID: 12 at include/linux/seqlock.h:221 __seqprop_assert.isra.0+0x128/0x150 [v3d] [ ] Workqueue: v3d_bin drm_sched_run_job_work [gpu_sched] <...snip...> [ ] Call trace: [ ] __seqprop_assert.isra.0+0x128/0x150 [v3d] [ ] v3d_job_start_stats.isra.0+0x90/0x218 [v3d] [ ] v3d_bin_job_run+0x23c/0x388 [v3d] [ ] drm_sched_run_job_work+0x520/0x6d0 [gpu_sched] [ ] process_one_work+0x62c/0xb48 [ ] worker_thread+0x468/0x5b0 [ ] kthread+0x1c4/0x1e0 [ ] ret_from_fork+0x10/0x20 Fix it. Cc: Maíra Canal Cc: stable@vger.kernel.org # v6.10+ Fixes: 6abe93b621ab ("drm/v3d: Fix race-condition between sysfs/fdinfo and interrupt handler") Signed-off-by: Tvrtko Ursulin Acked-by: Maíra Canal Signed-off-by: Maíra Canal Link: https://patchwork.freedesktop.org/patch/msgid/20240813102505.80512-1-tursulin@igalia.com --- drivers/gpu/drm/v3d/v3d_sched.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c index b8682818bafa6c..ad1e6236ff6ff1 100644 --- a/drivers/gpu/drm/v3d/v3d_sched.c +++ b/drivers/gpu/drm/v3d/v3d_sched.c @@ -134,6 +134,8 @@ v3d_job_start_stats(struct v3d_job *job, enum v3d_queue queue) struct v3d_stats *local_stats = &file->stats[queue]; u64 now = local_clock(); + preempt_disable(); + write_seqcount_begin(&local_stats->lock); local_stats->start_ns = now; write_seqcount_end(&local_stats->lock); @@ -141,6 +143,8 @@ v3d_job_start_stats(struct v3d_job *job, enum v3d_queue queue) write_seqcount_begin(&global_stats->lock); global_stats->start_ns = now; write_seqcount_end(&global_stats->lock); + + preempt_enable(); } static void @@ -162,8 +166,10 @@ v3d_job_update_stats(struct v3d_job *job, enum v3d_queue queue) struct v3d_stats *local_stats = &file->stats[queue]; u64 now = local_clock(); + preempt_disable(); v3d_stats_update(local_stats, now); v3d_stats_update(global_stats, now); + preempt_enable(); } static struct dma_fence *v3d_bin_job_run(struct drm_sched_job *sched_job) From e33a97a830b230b79a98dbbb4121d4741a2be619 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 27 Aug 2024 10:53:40 -0700 Subject: [PATCH 801/991] block: fix detection of unsupported WRITE SAME in blkdev_issue_write_zeroes On error, blkdev_issue_write_zeroes used to recheck the block device's WRITE SAME queue limits after submitting WRITE SAME bios. As stated in the comment, the purpose of this was to collapse all IO errors to EOPNOTSUPP if the effect of issuing bios was that WRITE SAME got turned off in the queue limits. Therefore, it does not make sense to reuse the zeroes limit that was read earlier in the function because we only care about the queue limit *now*, not what it was at the start of the function. Found by running generic/351 from fstests. Fixes: 64b582ca88ca1 ("block: Read max write zeroes once for __blkdev_issue_write_zeroes()") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: John Garry Link: https://lore.kernel.org/r/20240827175340.GB1977952@frogsfrogsfrogs Signed-off-by: Jens Axboe --- block/blk-lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-lib.c b/block/blk-lib.c index 83eb7761c2bfbd..4c9f20a689f7bb 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -174,7 +174,7 @@ static int blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector, * on an I/O error, in which case we'll turn any error into * "not supported" here. */ - if (ret && !limit) + if (ret && !bdev_write_zeroes_sectors(bdev)) return -EOPNOTSUPP; return ret; } From b18915248a15eae7d901262f108d6ff0ffb4ffc1 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 19 Aug 2024 19:52:30 +0200 Subject: [PATCH 802/991] fuse: use unsigned type for getxattr/listxattr size truncation The existing code uses min_t(ssize_t, outarg.size, XATTR_LIST_MAX) when parsing the FUSE daemon's response to a zero-length getxattr/listxattr request. On 32-bit kernels, where ssize_t and outarg.size are the same size, this is wrong: The min_t() will pass through any size values that are negative when interpreted as signed. fuse_listxattr() will then return this userspace-supplied negative value, which callers will treat as an error value. This kind of bug pattern can lead to fairly bad security bugs because of how error codes are used in the Linux kernel. If a caller were to convert the numeric error into an error pointer, like so: struct foo *func(...) { int len = fuse_getxattr(..., NULL, 0); if (len < 0) return ERR_PTR(len); ... } then it would end up returning this userspace-supplied negative value cast to a pointer - but the caller of this function wouldn't recognize it as an error pointer (IS_ERR_VALUE() only detects values in the narrow range in which legitimate errno values are), and so it would just be treated as a kernel pointer. I think there is at least one theoretical codepath where this could happen, but that path would involve virtio-fs with submounts plus some weird SELinux configuration, so I think it's probably not a concern in practice. Cc: stable@vger.kernel.org # v4.9 Fixes: 63401ccdb2ca ("fuse: limit xattr returned size") Signed-off-by: Jann Horn Signed-off-by: Miklos Szeredi --- fs/fuse/xattr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 5b423fdbb13f8f..9f568d345c5123 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -81,7 +81,7 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, } ret = fuse_simple_request(fm, &args); if (!ret && !size) - ret = min_t(ssize_t, outarg.size, XATTR_SIZE_MAX); + ret = min_t(size_t, outarg.size, XATTR_SIZE_MAX); if (ret == -ENOSYS) { fm->fc->no_getxattr = 1; ret = -EOPNOTSUPP; @@ -143,7 +143,7 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) } ret = fuse_simple_request(fm, &args); if (!ret && !size) - ret = min_t(ssize_t, outarg.size, XATTR_LIST_MAX); + ret = min_t(size_t, outarg.size, XATTR_LIST_MAX); if (ret > 0 && size) ret = fuse_verify_xattr_list(list, ret); if (ret == -ENOSYS) { From 97f30876c94382d1b01d45c2c76be8911b196527 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 25 Jul 2024 10:53:34 -0700 Subject: [PATCH 803/991] fuse: check aborted connection before adding requests to pending list for resending There is a race condition where inflight requests will not be aborted if they are in the middle of being re-sent when the connection is aborted. If fuse_resend has already moved all the requests in the fpq->processing lists to its private queue ("to_queue") and then the connection starts and finishes aborting, these requests will be added to the pending queue and remain on it indefinitely. Fixes: 760eac73f9f6 ("fuse: Introduce a new notification type for resend pending requests") Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Reviewed-by: Jingbo Xu Cc: # v6.9 Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 9eb191b5c4de12..a11461ef6022c5 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -31,6 +31,8 @@ MODULE_ALIAS("devname:fuse"); static struct kmem_cache *fuse_req_cachep; +static void end_requests(struct list_head *head); + static struct fuse_dev *fuse_get_dev(struct file *file) { /* @@ -1820,6 +1822,13 @@ static void fuse_resend(struct fuse_conn *fc) } spin_lock(&fiq->lock); + if (!fiq->connected) { + spin_unlock(&fiq->lock); + list_for_each_entry(req, &to_queue, list) + clear_bit(FR_PENDING, &req->flags); + end_requests(&to_queue); + return; + } /* iq and pq requests are both oldest to newest */ list_splice(&to_queue, &fiq->pending); fiq->ops->wake_pending_and_unlock(fiq); From 3002240d16494d798add0575e8ba1f284258ab34 Mon Sep 17 00:00:00 2001 From: yangyun Date: Fri, 23 Aug 2024 16:51:46 +0800 Subject: [PATCH 804/991] fuse: fix memory leak in fuse_create_open The memory of struct fuse_file is allocated but not freed when get_create_ext return error. Fixes: 3e2b6fdbdc9a ("fuse: send security context of inode on file") Cc: stable@vger.kernel.org # v5.17 Signed-off-by: yangyun Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 2b0d4781f39484..8e96df9fd76c99 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -670,7 +670,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, err = get_create_ext(&args, dir, entry, mode); if (err) - goto out_put_forget_req; + goto out_free_ff; err = fuse_simple_request(fm, &args); free_ext_value(&args); From 76a51ac00ca2a72fe3e168b7fb0e70f75ba6f512 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 28 Aug 2024 15:55:17 +0200 Subject: [PATCH 805/991] fuse: clear PG_uptodate when using a stolen page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Originally when a stolen page was inserted into fuse's page cache by fuse_try_move_page(), it would be marked uptodate. Then fuse_readpages_end() would call SetPageUptodate() again on the already uptodate page. Commit 413e8f014c8b ("fuse: Convert fuse_readpages_end() to use folio_end_read()") changed that by replacing the SetPageUptodate() + unlock_page() combination with folio_end_read(), which does mostly the same, except it sets the uptodate flag with an xor operation, which in the above scenario resulted in the uptodate flag being cleared, which in turn resulted in EIO being returned on the read. Fix by clearing PG_uptodate instead of setting it in fuse_try_move_page(), conforming to the expectation of folio_end_read(). Reported-by: Jürg Billeter Debugged-by: Matthew Wilcox Fixes: 413e8f014c8b ("fuse: Convert fuse_readpages_end() to use folio_end_read()") Cc: # v6.10 Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a11461ef6022c5..67443ef07285ef 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -775,7 +775,6 @@ static int fuse_check_folio(struct folio *folio) (folio->flags & PAGE_FLAGS_CHECK_AT_PREP & ~(1 << PG_locked | 1 << PG_referenced | - 1 << PG_uptodate | 1 << PG_lru | 1 << PG_active | 1 << PG_workingset | @@ -820,9 +819,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) newfolio = page_folio(buf->page); - if (!folio_test_uptodate(newfolio)) - folio_mark_uptodate(newfolio); - + folio_clear_uptodate(newfolio); folio_clear_mappedtodisk(newfolio); if (fuse_check_folio(newfolio) != 0) From f7790d67785302b3116bbbfda62a5a44524601a3 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 26 Aug 2024 14:19:04 -0700 Subject: [PATCH 806/991] fuse: update stats for pages in dropped aux writeback list In the case where the aux writeback list is dropped (e.g. the pages have been truncated or the connection is broken), the stats for its pages and backing device info need to be updated as well. Fixes: e2653bd53a98 ("fuse: fix leaked aux requests") Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Cc: # v5.1 Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f39456c65ed7f7..ed76121f73f2e0 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1832,10 +1832,16 @@ __acquires(fi->lock) fuse_writepage_finish(fm, wpa); spin_unlock(&fi->lock); - /* After fuse_writepage_finish() aux request list is private */ + /* After rb_erase() aux request list is private */ for (aux = wpa->next; aux; aux = next) { + struct backing_dev_info *bdi = inode_to_bdi(aux->inode); + next = aux->next; aux->next = NULL; + + dec_wb_stat(&bdi->wb, WB_WRITEBACK); + dec_node_page_state(aux->ia.ap.pages[0], NR_WRITEBACK_TEMP); + wb_writeout_inc(&bdi->wb); fuse_writepage_free(aux); } From 91d1dfae464987aaf6c79ff51d8674880fb3be77 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 28 Aug 2024 21:08:25 +0100 Subject: [PATCH 807/991] cifs: Fix FALLOC_FL_ZERO_RANGE to preflush buffered part of target region Under certain conditions, the range to be cleared by FALLOC_FL_ZERO_RANGE may only be buffered locally and not yet have been flushed to the server. For example: xfs_io -f -t -c "pwrite -S 0x41 0 4k" \ -c "pwrite -S 0x42 4k 4k" \ -c "fzero 0 4k" \ -c "pread -v 0 8k" /xfstest.test/foo will write two 4KiB blocks of data, which get buffered in the pagecache, and then fallocate() is used to clear the first 4KiB block on the server - but we don't flush the data first, which means the EOF position on the server is wrong, and so the FSCTL_SET_ZERO_DATA RPC fails (and xfs_io ignores the error), but then when we try to read it, we see the old data. Fix this by preflushing any part of the target region that above the server's idea of the EOF position to force the server to update its EOF position. Note, however, that we don't want to simply expand the file by moving the EOF before doing the FSCTL_SET_ZERO_DATA[*] because someone else might see the zeroed region or if the RPC fails we then have to try to clean it up or risk getting corruption. [*] And we have to move the EOF first otherwise FSCTL_SET_ZERO_DATA won't do what we want. This fixes the generic/008 xfstest. [!] Note: A better way to do this might be to split the operation into two parts: we only do FSCTL_SET_ZERO_DATA for the part of the range below the server's EOF and then, if that worked, invalidate the buffered pages for the part above the range. Fixes: 6b69040247e1 ("cifs/smb3: Fix data inconsistent when zero file range") Signed-off-by: David Howells cc: Steve French cc: Zhang Xiaoxu cc: Pavel Shilovsky cc: Paulo Alcantara cc: Shyam Prasad N cc: Rohith Surabattula cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: linux-mm@kvack.org Signed-off-by: Steve French --- fs/smb/client/smb2ops.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index a6f00b1572755f..4df84ebe8dbe53 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -3237,13 +3237,15 @@ static long smb3_zero_data(struct file *file, struct cifs_tcon *tcon, } static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, - loff_t offset, loff_t len, bool keep_size) + unsigned long long offset, unsigned long long len, + bool keep_size) { struct cifs_ses *ses = tcon->ses; struct inode *inode = file_inode(file); struct cifsInodeInfo *cifsi = CIFS_I(inode); struct cifsFileInfo *cfile = file->private_data; - unsigned long long new_size; + struct netfs_inode *ictx = netfs_inode(inode); + unsigned long long i_size, new_size, remote_size; long rc; unsigned int xid; @@ -3255,6 +3257,16 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, inode_lock(inode); filemap_invalidate_lock(inode->i_mapping); + i_size = i_size_read(inode); + remote_size = ictx->remote_i_size; + if (offset + len >= remote_size && offset < i_size) { + unsigned long long top = umin(offset + len, i_size); + + rc = filemap_write_and_wait_range(inode->i_mapping, offset, top - 1); + if (rc < 0) + goto zero_range_exit; + } + /* * We zero the range through ioctl, so we need remove the page caches * first, otherwise the data may be inconsistent with the server. From 76a0e79bc84f466999fa501fce5bf7a07641b8a7 Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Wed, 28 Aug 2024 15:51:29 -0400 Subject: [PATCH 808/991] selinux,smack: don't bypass permissions check in inode_setsecctx hook Marek Gresko reports that the root user on an NFS client is able to change the security labels on files on an NFS filesystem that is exported with root squashing enabled. The end of the kerneldoc comment for __vfs_setxattr_noperm() states: * This function requires the caller to lock the inode's i_mutex before it * is executed. It also assumes that the caller will make the appropriate * permission checks. nfsd_setattr() does do permissions checking via fh_verify() and nfsd_permission(), but those don't do all the same permissions checks that are done by security_inode_setxattr() and its related LSM hooks do. Since nfsd_setattr() is the only consumer of security_inode_setsecctx(), simplest solution appears to be to replace the call to __vfs_setxattr_noperm() with a call to __vfs_setxattr_locked(). This fixes the above issue and has the added benefit of causing nfsd to recall conflicting delegations on a file when a client tries to change its security label. Cc: stable@kernel.org Reported-by: Marek Gresko Link: https://bugzilla.kernel.org/show_bug.cgi?id=218809 Signed-off-by: Scott Mayhew Tested-by: Stephen Smalley Reviewed-by: Stephen Smalley Reviewed-by: Chuck Lever Reviewed-by: Jeff Layton Acked-by: Casey Schaufler Signed-off-by: Paul Moore --- security/selinux/hooks.c | 4 ++-- security/smack/smack_lsm.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 55c78c318ccd78..90afdfc48c0fe0 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -6650,8 +6650,8 @@ static int selinux_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen */ static int selinux_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen) { - return __vfs_setxattr_noperm(&nop_mnt_idmap, dentry, XATTR_NAME_SELINUX, - ctx, ctxlen, 0); + return __vfs_setxattr_locked(&nop_mnt_idmap, dentry, XATTR_NAME_SELINUX, + ctx, ctxlen, 0, NULL); } static int selinux_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 4164699cd4f62e..002a1b9ed83a56 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -4880,8 +4880,8 @@ static int smack_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen) static int smack_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen) { - return __vfs_setxattr_noperm(&nop_mnt_idmap, dentry, XATTR_NAME_SMACK, - ctx, ctxlen, 0); + return __vfs_setxattr_locked(&nop_mnt_idmap, dentry, XATTR_NAME_SMACK, + ctx, ctxlen, 0, NULL); } static int smack_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen) From 0870b0d8b393dde53106678a1e2cec9dfa52f9b7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Aug 2024 11:49:16 +0000 Subject: [PATCH 809/991] net: busy-poll: use ktime_get_ns() instead of local_clock() Typically, busy-polling durations are below 100 usec. When/if the busy-poller thread migrates to another cpu, local_clock() can be off by +/-2msec or more for small values of HZ, depending on the platform. Use ktimer_get_ns() to ensure deterministic behavior, which is the whole point of busy-polling. Fixes: 060212928670 ("net: add low latency socket poll") Fixes: 9a3c71aa8024 ("net: convert low latency sockets to sched_clock()") Fixes: 37089834528b ("sched, net: Fixup busy_loop_us_clock()") Signed-off-by: Eric Dumazet Cc: Mina Almasry Cc: Willem de Bruijn Reviewed-by: Joe Damato Link: https://patch.msgid.link/20240827114916.223377-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/busy_poll.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 9b09acac538eed..522f1da8b747ac 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -68,7 +68,7 @@ static inline bool sk_can_busy_loop(struct sock *sk) static inline unsigned long busy_loop_current_time(void) { #ifdef CONFIG_NET_RX_BUSY_POLL - return (unsigned long)(local_clock() >> 10); + return (unsigned long)(ktime_get_ns() >> 10); #else return 0; #endif From 8b8ed1b429f8fa7ebd5632555e7b047bc0620075 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 28 Aug 2024 08:14:24 +0200 Subject: [PATCH 810/991] mptcp: pm: reuse ID 0 after delete and re-add When the endpoint used by the initial subflow is removed and re-added later, the PM has to force the ID 0, it is a special case imposed by the MPTCP specs. Note that the endpoint should then need to be re-added reusing the same ID. Fixes: 3ad14f54bd74 ("mptcp: more accurate MPC endpoint tracking") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- net/mptcp/pm_netlink.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 8d2f97854c6429..ec45ab4c66abb1 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -585,6 +585,11 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) __clear_bit(local.addr.id, msk->pm.id_avail_bitmap); msk->pm.add_addr_signaled++; + + /* Special case for ID0: set the correct ID */ + if (local.addr.id == msk->mpc_endpoint_id) + local.addr.id = 0; + mptcp_pm_announce_addr(msk, &local.addr, false); mptcp_pm_nl_addr_send_ack(msk); @@ -609,6 +614,11 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) msk->pm.local_addr_used++; __clear_bit(local.addr.id, msk->pm.id_avail_bitmap); + + /* Special case for ID0: set the correct ID */ + if (local.addr.id == msk->mpc_endpoint_id) + local.addr.id = 0; + nr = fill_remote_addresses_vec(msk, &local.addr, fullmesh, addrs); if (nr == 0) continue; From 87b5896f3f7848130095656739b05881904e2697 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 28 Aug 2024 08:14:25 +0200 Subject: [PATCH 811/991] mptcp: pm: fix RM_ADDR ID for the initial subflow The initial subflow has a special local ID: 0. When an endpoint is being deleted, it is then important to check if its address is not linked to the initial subflow to send the right ID. If there was an endpoint linked to the initial subflow, msk's mpc_endpoint_id field will be set. We can then use this info when an endpoint is being removed to see if it is linked to the initial subflow. So now, the correct IDs are passed to mptcp_pm_nl_rm_addr_or_subflow(), it is no longer needed to use mptcp_local_id_match(). Fixes: 3ad14f54bd74 ("mptcp: more accurate MPC endpoint tracking") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- net/mptcp/pm_netlink.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index ec45ab4c66abb1..42d4e7b5f65db7 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -800,11 +800,6 @@ int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, return -EINVAL; } -static bool mptcp_local_id_match(const struct mptcp_sock *msk, u8 local_id, u8 id) -{ - return local_id == id || (!local_id && msk->mpc_endpoint_id == id); -} - static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list, enum linux_mptcp_mib_field rm_type) @@ -839,7 +834,7 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, if (rm_type == MPTCP_MIB_RMADDR && remote_id != rm_id) continue; - if (rm_type == MPTCP_MIB_RMSUBFLOW && !mptcp_local_id_match(msk, id, rm_id)) + if (rm_type == MPTCP_MIB_RMSUBFLOW && id != rm_id) continue; pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u\n", @@ -1448,6 +1443,12 @@ static bool remove_anno_list_by_saddr(struct mptcp_sock *msk, return false; } +static u8 mptcp_endp_get_local_id(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) +{ + return msk->mpc_endpoint_id == addr->id ? 0 : addr->id; +} + static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool force) @@ -1455,7 +1456,7 @@ static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, struct mptcp_rm_list list = { .nr = 0 }; bool ret; - list.ids[list.nr++] = addr->id; + list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr); ret = remove_anno_list_by_saddr(msk, addr); if (ret || force) { @@ -1482,14 +1483,12 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, const struct mptcp_pm_addr_entry *entry) { const struct mptcp_addr_info *addr = &entry->addr; - struct mptcp_rm_list list = { .nr = 0 }; + struct mptcp_rm_list list = { .nr = 1 }; long s_slot = 0, s_num = 0; struct mptcp_sock *msk; pr_debug("remove_id=%d\n", addr->id); - list.ids[list.nr++] = addr->id; - while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { struct sock *sk = (struct sock *)msk; bool remove_subflow; @@ -1507,6 +1506,7 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, mptcp_pm_remove_anno_addr(msk, addr, remove_subflow && !(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)); + list.ids[0] = mptcp_endp_get_local_id(msk, addr); if (remove_subflow) { spin_lock_bh(&msk->pm.lock); mptcp_pm_nl_rm_subflow_received(msk, &list); @@ -1613,6 +1613,7 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) return ret; } +/* Called from the userspace PM only */ void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list) { struct mptcp_rm_list alist = { .nr = 0 }; @@ -1641,6 +1642,7 @@ void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list) } } +/* Called from the in-kernel PM only */ static void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, struct list_head *rm_list) { @@ -1650,11 +1652,11 @@ static void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, list_for_each_entry(entry, rm_list, list) { if (slist.nr < MPTCP_RM_IDS_MAX && lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) - slist.ids[slist.nr++] = entry->addr.id; + slist.ids[slist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr); if (alist.nr < MPTCP_RM_IDS_MAX && remove_anno_list_by_saddr(msk, &entry->addr)) - alist.ids[alist.nr++] = entry->addr.id; + alist.ids[alist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr); } spin_lock_bh(&msk->pm.lock); @@ -1951,7 +1953,7 @@ static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk, { struct mptcp_rm_list list = { .nr = 0 }; - list.ids[list.nr++] = addr->id; + list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr); spin_lock_bh(&msk->pm.lock); mptcp_pm_nl_rm_subflow_received(msk, &list); From 5f94b08c001290acda94d9d8868075590931c198 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 28 Aug 2024 08:14:26 +0200 Subject: [PATCH 812/991] selftests: mptcp: join: check removing ID 0 endpoint Removing the endpoint linked to the initial subflow should trigger a RM_ADDR for the right ID, and the removal of the subflow. That's what is now being verified in the "delete and re-add" test. Note that removing the initial subflow will not decrement the 'subflows' counters, which corresponds to the *additional* subflows. On the other hand, when the same endpoint is re-added, it will increment this counter, as it will be seen as an additional subflow this time. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. Fixes: 3ad14f54bd74 ("mptcp: more accurate MPC endpoint tracking") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- .../testing/selftests/net/mptcp/mptcp_join.sh | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 264040a760c6f6..8b4529ff15e54d 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3572,8 +3572,9 @@ endpoint_tests() if reset_with_tcp_filter "delete and re-add" ns2 10.0.3.2 REJECT OUTPUT && mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then - pm_nl_set_limits $ns1 0 2 - pm_nl_set_limits $ns2 0 2 + pm_nl_set_limits $ns1 0 3 + pm_nl_set_limits $ns2 0 3 + pm_nl_add_endpoint $ns2 10.0.1.2 id 1 dev ns2eth1 flags subflow pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow test_linkfail=4 speed=20 \ run_tests $ns1 $ns2 10.0.1.1 & @@ -3582,17 +3583,17 @@ endpoint_tests() wait_mpj $ns2 pm_nl_check_endpoint "creation" \ $ns2 10.0.2.2 id 2 flags subflow dev ns2eth2 - chk_subflow_nr "before delete" 2 + chk_subflow_nr "before delete id 2" 2 chk_mptcp_info subflows 1 subflows 1 pm_nl_del_endpoint $ns2 2 10.0.2.2 sleep 0.5 - chk_subflow_nr "after delete" 1 + chk_subflow_nr "after delete id 2" 1 chk_mptcp_info subflows 0 subflows 0 pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow wait_mpj $ns2 - chk_subflow_nr "after re-add" 2 + chk_subflow_nr "after re-add id 2" 2 chk_mptcp_info subflows 1 subflows 1 pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow @@ -3607,10 +3608,20 @@ endpoint_tests() chk_subflow_nr "after no reject" 3 chk_mptcp_info subflows 2 subflows 2 + pm_nl_del_endpoint $ns2 1 10.0.1.2 + sleep 0.5 + chk_subflow_nr "after delete id 0" 2 + chk_mptcp_info subflows 2 subflows 2 # only decr for additional sf + + pm_nl_add_endpoint $ns2 10.0.1.2 id 1 dev ns2eth1 flags subflow + wait_mpj $ns2 + chk_subflow_nr "after re-add id 0" 3 + chk_mptcp_info subflows 3 subflows 3 + mptcp_lib_kill_wait $tests_pid - chk_join_nr 3 3 3 - chk_rm_nr 1 1 + chk_join_nr 4 4 4 + chk_rm_nr 2 2 fi # remove and re-add From c07cc3ed895f9bfe0c53b5ed6be710c133b4271c Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 28 Aug 2024 08:14:27 +0200 Subject: [PATCH 813/991] mptcp: pm: send ACK on an active subflow Taking the first one on the list doesn't work in some cases, e.g. if the initial subflow is being removed. Pick another one instead of not sending anything. Fixes: 84dfe3677a6f ("mptcp: send out dedicated ADD_ADDR packet") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- net/mptcp/pm_netlink.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 42d4e7b5f65db7..ed2205ef72089a 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -765,9 +765,12 @@ void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) !mptcp_pm_should_rm_signal(msk)) return; - subflow = list_first_entry_or_null(&msk->conn_list, typeof(*subflow), node); - if (subflow) - mptcp_pm_send_ack(msk, subflow, false, false); + mptcp_for_each_subflow(msk, subflow) { + if (__mptcp_subflow_active(subflow)) { + mptcp_pm_send_ack(msk, subflow, false, false); + break; + } + } } int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, From bc19ff57637ff563d2bdf2b385b48c41e6509e0d Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 28 Aug 2024 08:14:28 +0200 Subject: [PATCH 814/991] mptcp: pm: skip connecting to already established sf The lookup_subflow_by_daddr() helper checks if there is already a subflow connected to this address. But there could be a subflow that is closing, but taking time due to some reasons: latency, losses, data to process, etc. If an ADD_ADDR is received while the endpoint is being closed, it is better to try connecting to it, instead of rejecting it: the peer which has sent the ADD_ADDR will not be notified that the ADD_ADDR has been rejected for this reason, and the expected subflow will not be created at the end. This helper should then only look for subflows that are established, or going to be, but not the ones being closed. Fixes: d84ad04941c3 ("mptcp: skip connecting the connected address") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- net/mptcp/pm_netlink.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index ed2205ef72089a..0134b6273c5454 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -130,12 +130,15 @@ static bool lookup_subflow_by_daddr(const struct list_head *list, { struct mptcp_subflow_context *subflow; struct mptcp_addr_info cur; - struct sock_common *skc; list_for_each_entry(subflow, list, node) { - skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow); + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (!((1 << inet_sk_state_load(ssk)) & + (TCPF_ESTABLISHED | TCPF_SYN_SENT | TCPF_SYN_RECV))) + continue; - remote_address(skc, &cur); + remote_address((struct sock_common *)ssk, &cur); if (mptcp_addresses_equal(&cur, daddr, daddr->port)) return true; } From dce1c6d1e92535f165219695a826caedcca4e9b9 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 28 Aug 2024 08:14:29 +0200 Subject: [PATCH 815/991] mptcp: pm: reset MPC endp ID when re-added The initial subflow has a special local ID: 0. It is specific per connection. When a global endpoint is deleted and re-added later, it can have a different ID -- most services managing the endpoints automatically don't force the ID to be the same as before. It is then important to track these modifications to be consistent with the ID being used for the address used by the initial subflow, not to confuse the other peer or to send the ID 0 for the wrong address. Now when removing an endpoint, msk->mpc_endpoint_id is reset if it corresponds to this endpoint. When adding a new endpoint, the same variable is updated if the address match the one of the initial subflow. Fixes: 3ad14f54bd74 ("mptcp: more accurate MPC endpoint tracking") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- net/mptcp/pm_netlink.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 0134b6273c5454..5a84a55e37cc56 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1318,20 +1318,27 @@ static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info) return pm_nl_get_pernet(genl_info_net(info)); } -static int mptcp_nl_add_subflow_or_signal_addr(struct net *net) +static int mptcp_nl_add_subflow_or_signal_addr(struct net *net, + struct mptcp_addr_info *addr) { struct mptcp_sock *msk; long s_slot = 0, s_num = 0; while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { struct sock *sk = (struct sock *)msk; + struct mptcp_addr_info mpc_addr; if (!READ_ONCE(msk->fully_established) || mptcp_pm_is_userspace(msk)) goto next; + /* if the endp linked to the init sf is re-added with a != ID */ + mptcp_local_address((struct sock_common *)msk, &mpc_addr); + lock_sock(sk); spin_lock_bh(&msk->pm.lock); + if (mptcp_addresses_equal(addr, &mpc_addr, addr->port)) + msk->mpc_endpoint_id = addr->id; mptcp_pm_create_subflow_or_signal_addr(msk); spin_unlock_bh(&msk->pm.lock); release_sock(sk); @@ -1404,7 +1411,7 @@ int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info) goto out_free; } - mptcp_nl_add_subflow_or_signal_addr(sock_net(skb->sk)); + mptcp_nl_add_subflow_or_signal_addr(sock_net(skb->sk), &entry->addr); return 0; out_free: @@ -1525,6 +1532,8 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, spin_unlock_bh(&msk->pm.lock); } + if (msk->mpc_endpoint_id == entry->addr.id) + msk->mpc_endpoint_id = 0; release_sock(sk); next: From 1c2326fcae4f0c5de8ad0d734ced43a8e5f17dac Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 28 Aug 2024 08:14:30 +0200 Subject: [PATCH 816/991] selftests: mptcp: join: check re-adding init endp with != id The initial subflow has a special local ID: 0. It is specific per connection. When a global endpoint is deleted and re-added later, it can have a different ID, but the kernel should still use the ID 0 if it corresponds to the initial address. This test validates this behaviour: the endpoint linked to the initial subflow is removed, and re-added with a different ID. Note that removing the initial subflow will not decrement the 'subflows' counters, which corresponds to the *additional* subflows. On the other hand, when the same endpoint is re-added, it will increment this counter, as it will be seen as an additional subflow this time. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. Fixes: 3ad14f54bd74 ("mptcp: more accurate MPC endpoint tracking") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- .../testing/selftests/net/mptcp/mptcp_join.sh | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 8b4529ff15e54d..75458ade32c78f 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3627,11 +3627,12 @@ endpoint_tests() # remove and re-add if reset "delete re-add signal" && mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then - pm_nl_set_limits $ns1 0 2 - pm_nl_set_limits $ns2 2 2 + pm_nl_set_limits $ns1 0 3 + pm_nl_set_limits $ns2 3 3 pm_nl_add_endpoint $ns1 10.0.2.1 id 1 flags signal # broadcast IP: no packet for this address will be received on ns1 pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal + pm_nl_add_endpoint $ns1 10.0.1.1 id 42 flags signal test_linkfail=4 speed=20 \ run_tests $ns1 $ns2 10.0.1.1 & local tests_pid=$! @@ -3653,11 +3654,21 @@ endpoint_tests() wait_mpj $ns2 chk_subflow_nr "after re-add" 3 chk_mptcp_info subflows 2 subflows 2 + + pm_nl_del_endpoint $ns1 42 10.0.1.1 + sleep 0.5 + chk_subflow_nr "after delete ID 0" 2 + chk_mptcp_info subflows 2 subflows 2 + + pm_nl_add_endpoint $ns1 10.0.1.1 id 99 flags signal + wait_mpj $ns2 + chk_subflow_nr "after re-add" 3 + chk_mptcp_info subflows 3 subflows 3 mptcp_lib_kill_wait $tests_pid - chk_join_nr 3 3 3 - chk_add_nr 4 4 - chk_rm_nr 2 1 invert + chk_join_nr 4 4 4 + chk_add_nr 5 5 + chk_rm_nr 3 2 invert fi # flush and re-add From 76a2d8394cc183df872adf04bf636eaf42746449 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 28 Aug 2024 08:14:31 +0200 Subject: [PATCH 817/991] selftests: mptcp: join: no extra msg if no counter The checksum and fail counters might not be available. Then no need to display an extra message with missing info. While at it, fix the indentation around, which is wrong since the same commit. Fixes: 47867f0a7e83 ("selftests: mptcp: join: skip check if MIB counter not supported") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 75458ade32c78f..a10714b6952fdf 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -1112,26 +1112,26 @@ chk_csum_nr() print_check "sum" count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtDataCsumErr") - if [ "$count" != "$csum_ns1" ]; then + if [ -n "$count" ] && [ "$count" != "$csum_ns1" ]; then extra_msg+=" ns1=$count" fi if [ -z "$count" ]; then print_skip elif { [ "$count" != $csum_ns1 ] && [ $allow_multi_errors_ns1 -eq 0 ]; } || - { [ "$count" -lt $csum_ns1 ] && [ $allow_multi_errors_ns1 -eq 1 ]; }; then + { [ "$count" -lt $csum_ns1 ] && [ $allow_multi_errors_ns1 -eq 1 ]; }; then fail_test "got $count data checksum error[s] expected $csum_ns1" else print_ok fi print_check "csum" count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtDataCsumErr") - if [ "$count" != "$csum_ns2" ]; then + if [ -n "$count" ] && [ "$count" != "$csum_ns2" ]; then extra_msg+=" ns2=$count" fi if [ -z "$count" ]; then print_skip elif { [ "$count" != $csum_ns2 ] && [ $allow_multi_errors_ns2 -eq 0 ]; } || - { [ "$count" -lt $csum_ns2 ] && [ $allow_multi_errors_ns2 -eq 1 ]; }; then + { [ "$count" -lt $csum_ns2 ] && [ $allow_multi_errors_ns2 -eq 1 ]; }; then fail_test "got $count data checksum error[s] expected $csum_ns2" else print_ok @@ -1169,13 +1169,13 @@ chk_fail_nr() print_check "ftx" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPFailTx") - if [ "$count" != "$fail_tx" ]; then + if [ -n "$count" ] && [ "$count" != "$fail_tx" ]; then extra_msg+=",tx=$count" fi if [ -z "$count" ]; then print_skip elif { [ "$count" != "$fail_tx" ] && [ $allow_tx_lost -eq 0 ]; } || - { [ "$count" -gt "$fail_tx" ] && [ $allow_tx_lost -eq 1 ]; }; then + { [ "$count" -gt "$fail_tx" ] && [ $allow_tx_lost -eq 1 ]; }; then fail_test "got $count MP_FAIL[s] TX expected $fail_tx" else print_ok @@ -1183,13 +1183,13 @@ chk_fail_nr() print_check "failrx" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtMPFailRx") - if [ "$count" != "$fail_rx" ]; then + if [ -n "$count" ] && [ "$count" != "$fail_rx" ]; then extra_msg+=",rx=$count" fi if [ -z "$count" ]; then print_skip elif { [ "$count" != "$fail_rx" ] && [ $allow_rx_lost -eq 0 ]; } || - { [ "$count" -gt "$fail_rx" ] && [ $allow_rx_lost -eq 1 ]; }; then + { [ "$count" -gt "$fail_rx" ] && [ $allow_rx_lost -eq 1 ]; }; then fail_test "got $count MP_FAIL[s] RX expected $fail_rx" else print_ok From 58e1b66b4e4b8a602d3f2843e8eba00a969ecce2 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 28 Aug 2024 08:14:32 +0200 Subject: [PATCH 818/991] mptcp: pm: do not remove already closed subflows It is possible to have in the list already closed subflows, e.g. the initial subflow has been already closed, but still in the list. No need to try to close it again, and increments the related counters again. Fixes: 0ee4261a3681 ("mptcp: implement mptcp_pm_remove_subflow") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- net/mptcp/pm_netlink.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 5a84a55e37cc56..3ff273e219f2ef 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -838,6 +838,8 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, int how = RCV_SHUTDOWN | SEND_SHUTDOWN; u8 id = subflow_get_local_id(subflow); + if (inet_sk_state_load(ssk) == TCP_CLOSE) + continue; if (rm_type == MPTCP_MIB_RMADDR && remote_id != rm_id) continue; if (rm_type == MPTCP_MIB_RMSUBFLOW && id != rm_id) From 9366922adc6a71378ca01f898c41be295309f044 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 28 Aug 2024 08:14:33 +0200 Subject: [PATCH 819/991] mptcp: pm: fix ID 0 endp usage after multiple re-creations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'local_addr_used' and 'add_addr_accepted' are decremented for addresses not related to the initial subflow (ID0), because the source and destination addresses of the initial subflows are known from the beginning: they don't count as "additional local address being used" or "ADD_ADDR being accepted". It is then required not to increment them when the entrypoint used by the initial subflow is removed and re-added during a connection. Without this modification, this entrypoint cannot be removed and re-added more than once. Reported-by: Arınç ÜNAL Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/512 Fixes: 3ad14f54bd74 ("mptcp: more accurate MPC endpoint tracking") Reported-by: syzbot+455d38ecd5f655fc45cf@syzkaller.appspotmail.com Closes: https://lore.kernel.org/00000000000049861306209237f4@google.com Cc: stable@vger.kernel.org Tested-by: Arınç ÜNAL Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- net/mptcp/pm_netlink.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 3ff273e219f2ef..a93450ded50aee 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -615,12 +615,13 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) fullmesh = !!(local.flags & MPTCP_PM_ADDR_FLAG_FULLMESH); - msk->pm.local_addr_used++; __clear_bit(local.addr.id, msk->pm.id_avail_bitmap); /* Special case for ID0: set the correct ID */ if (local.addr.id == msk->mpc_endpoint_id) local.addr.id = 0; + else /* local_addr_used is not decr for ID 0 */ + msk->pm.local_addr_used++; nr = fill_remote_addresses_vec(msk, &local.addr, fullmesh, addrs); if (nr == 0) @@ -750,7 +751,9 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) spin_lock_bh(&msk->pm.lock); if (sf_created) { - msk->pm.add_addr_accepted++; + /* add_addr_accepted is not decr for ID 0 */ + if (remote.id) + msk->pm.add_addr_accepted++; if (msk->pm.add_addr_accepted >= add_addr_accept_max || msk->pm.subflows >= subflows_max) WRITE_ONCE(msk->pm.accept_addr, false); From d397d7246c11ca36c33c932bc36d38e3a79e9aa0 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 28 Aug 2024 08:14:34 +0200 Subject: [PATCH 820/991] selftests: mptcp: join: check re-re-adding ID 0 endp This test extends "delete and re-add" to validate the previous commit: when the endpoint linked to the initial subflow (ID 0) is re-added multiple times, it was no longer being used, because the internal linked counters are not decremented for this special endpoint: it is not an additional endpoint. Here, the "del/add id 0" steps are done 3 times to unsure this case is validated. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. Fixes: 3ad14f54bd74 ("mptcp: more accurate MPC endpoint tracking") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- .../testing/selftests/net/mptcp/mptcp_join.sh | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index a10714b6952fdf..965b614e4b161c 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3576,7 +3576,7 @@ endpoint_tests() pm_nl_set_limits $ns2 0 3 pm_nl_add_endpoint $ns2 10.0.1.2 id 1 dev ns2eth1 flags subflow pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow - test_linkfail=4 speed=20 \ + test_linkfail=4 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & local tests_pid=$! @@ -3608,20 +3608,23 @@ endpoint_tests() chk_subflow_nr "after no reject" 3 chk_mptcp_info subflows 2 subflows 2 - pm_nl_del_endpoint $ns2 1 10.0.1.2 - sleep 0.5 - chk_subflow_nr "after delete id 0" 2 - chk_mptcp_info subflows 2 subflows 2 # only decr for additional sf - - pm_nl_add_endpoint $ns2 10.0.1.2 id 1 dev ns2eth1 flags subflow - wait_mpj $ns2 - chk_subflow_nr "after re-add id 0" 3 - chk_mptcp_info subflows 3 subflows 3 + local i + for i in $(seq 3); do + pm_nl_del_endpoint $ns2 1 10.0.1.2 + sleep 0.5 + chk_subflow_nr "after delete id 0 ($i)" 2 + chk_mptcp_info subflows 2 subflows 2 # only decr for additional sf + + pm_nl_add_endpoint $ns2 10.0.1.2 id 1 dev ns2eth1 flags subflow + wait_mpj $ns2 + chk_subflow_nr "after re-add id 0 ($i)" 3 + chk_mptcp_info subflows 3 subflows 3 + done mptcp_lib_kill_wait $tests_pid - chk_join_nr 4 4 4 - chk_rm_nr 2 2 + chk_join_nr 6 6 6 + chk_rm_nr 4 4 fi # remove and re-add From d82809b6c5f2676b382f77a5cbeb1a5d91ed2235 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 28 Aug 2024 08:14:35 +0200 Subject: [PATCH 821/991] mptcp: avoid duplicated SUB_CLOSED events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The initial subflow might have already been closed, but still in the connection list. When the worker is instructed to close the subflows that have been marked as closed, it might then try to close the initial subflow again. A consequence of that is that the SUB_CLOSED event can be seen twice: # ip mptcp endpoint 1.1.1.1 id 1 subflow dev eth0 2.2.2.2 id 2 subflow dev eth1 # ip mptcp monitor & [ CREATED] remid=0 locid=0 saddr4=1.1.1.1 daddr4=9.9.9.9 [ ESTABLISHED] remid=0 locid=0 saddr4=1.1.1.1 daddr4=9.9.9.9 [ SF_ESTABLISHED] remid=0 locid=2 saddr4=2.2.2.2 daddr4=9.9.9.9 # ip mptcp endpoint delete id 1 [ SF_CLOSED] remid=0 locid=0 saddr4=1.1.1.1 daddr4=9.9.9.9 [ SF_CLOSED] remid=0 locid=0 saddr4=1.1.1.1 daddr4=9.9.9.9 The first one is coming from mptcp_pm_nl_rm_subflow_received(), and the second one from __mptcp_close_subflow(). To avoid doing the post-closed processing twice, the subflow is now marked as closed the first time. Note that it is not enough to check if we are dealing with the first subflow and check its sk_state: the subflow might have been reset or closed before calling mptcp_close_ssk(). Fixes: b911c97c7dc7 ("mptcp: add netlink event support") Cc: stable@vger.kernel.org Tested-by: Arınç ÜNAL Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- net/mptcp/protocol.c | 6 ++++++ net/mptcp/protocol.h | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index b571fba88a2f96..37ebcb7640ebb3 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2508,6 +2508,12 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow) { + /* The first subflow can already be closed and still in the list */ + if (subflow->close_event_done) + return; + + subflow->close_event_done = true; + if (sk->sk_state == TCP_ESTABLISHED) mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 240d7c2ea55130..26eb898a202b34 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -524,7 +524,8 @@ struct mptcp_subflow_context { stale : 1, /* unable to snd/rcv data, do not use for xmit */ valid_csum_seen : 1, /* at least one csum validated */ is_mptfo : 1, /* subflow is doing TFO */ - __unused : 10; + close_event_done : 1, /* has done the post-closed part */ + __unused : 9; bool data_avail; bool scheduled; u32 remote_nonce; From 20ccc7c5f7a3aa48092441a4b182f9f40418392e Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 28 Aug 2024 08:14:36 +0200 Subject: [PATCH 822/991] selftests: mptcp: join: validate event numbers This test extends "delete and re-add" and "delete re-add signal" to validate the previous commit: the number of MPTCP events are checked to make sure there are no duplicated or unexpected ones. A new helper has been introduced to easily check these events. The missing events have been added to the lib. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. Fixes: b911c97c7dc7 ("mptcp: add netlink event support") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- .../testing/selftests/net/mptcp/mptcp_join.sh | 74 ++++++++++++++++++- .../testing/selftests/net/mptcp/mptcp_lib.sh | 4 + 2 files changed, 75 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 965b614e4b161c..a8ea0fe200fb15 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -420,12 +420,17 @@ reset_with_fail() fi } +start_events() +{ + mptcp_lib_events "${ns1}" "${evts_ns1}" evts_ns1_pid + mptcp_lib_events "${ns2}" "${evts_ns2}" evts_ns2_pid +} + reset_with_events() { reset "${1}" || return 1 - mptcp_lib_events "${ns1}" "${evts_ns1}" evts_ns1_pid - mptcp_lib_events "${ns2}" "${evts_ns2}" evts_ns2_pid + start_events } reset_with_tcp_filter() @@ -3333,6 +3338,36 @@ userspace_pm_chk_get_addr() fi } +# $1: ns ; $2: event type ; $3: count +chk_evt_nr() +{ + local ns=${1} + local evt_name="${2}" + local exp="${3}" + + local evts="${evts_ns1}" + local evt="${!evt_name}" + local count + + evt_name="${evt_name:16}" # without MPTCP_LIB_EVENT_ + [ "${ns}" == "ns2" ] && evts="${evts_ns2}" + + print_check "event ${ns} ${evt_name} (${exp})" + + if [[ "${evt_name}" = "LISTENER_"* ]] && + ! mptcp_lib_kallsyms_has "mptcp_event_pm_listener$"; then + print_skip "event not supported" + return + fi + + count=$(grep -cw "type:${evt}" "${evts}") + if [ "${count}" != "${exp}" ]; then + fail_test "got ${count} events, expected ${exp}" + else + print_ok + fi +} + userspace_tests() { # userspace pm type prevents add_addr @@ -3572,6 +3607,7 @@ endpoint_tests() if reset_with_tcp_filter "delete and re-add" ns2 10.0.3.2 REJECT OUTPUT && mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then + start_events pm_nl_set_limits $ns1 0 3 pm_nl_set_limits $ns2 0 3 pm_nl_add_endpoint $ns2 10.0.1.2 id 1 dev ns2eth1 flags subflow @@ -3623,12 +3659,28 @@ endpoint_tests() mptcp_lib_kill_wait $tests_pid + kill_events_pids + chk_evt_nr ns1 MPTCP_LIB_EVENT_LISTENER_CREATED 1 + chk_evt_nr ns1 MPTCP_LIB_EVENT_CREATED 1 + chk_evt_nr ns1 MPTCP_LIB_EVENT_ESTABLISHED 1 + chk_evt_nr ns1 MPTCP_LIB_EVENT_ANNOUNCED 0 + chk_evt_nr ns1 MPTCP_LIB_EVENT_REMOVED 4 + chk_evt_nr ns1 MPTCP_LIB_EVENT_SUB_ESTABLISHED 6 + chk_evt_nr ns1 MPTCP_LIB_EVENT_SUB_CLOSED 4 + + chk_evt_nr ns2 MPTCP_LIB_EVENT_CREATED 1 + chk_evt_nr ns2 MPTCP_LIB_EVENT_ESTABLISHED 1 + chk_evt_nr ns2 MPTCP_LIB_EVENT_ANNOUNCED 0 + chk_evt_nr ns2 MPTCP_LIB_EVENT_REMOVED 0 + chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_ESTABLISHED 6 + chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_CLOSED 5 # one has been closed before estab + chk_join_nr 6 6 6 chk_rm_nr 4 4 fi # remove and re-add - if reset "delete re-add signal" && + if reset_with_events "delete re-add signal" && mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then pm_nl_set_limits $ns1 0 3 pm_nl_set_limits $ns2 3 3 @@ -3669,6 +3721,22 @@ endpoint_tests() chk_mptcp_info subflows 3 subflows 3 mptcp_lib_kill_wait $tests_pid + kill_events_pids + chk_evt_nr ns1 MPTCP_LIB_EVENT_LISTENER_CREATED 1 + chk_evt_nr ns1 MPTCP_LIB_EVENT_CREATED 1 + chk_evt_nr ns1 MPTCP_LIB_EVENT_ESTABLISHED 1 + chk_evt_nr ns1 MPTCP_LIB_EVENT_ANNOUNCED 0 + chk_evt_nr ns1 MPTCP_LIB_EVENT_REMOVED 0 + chk_evt_nr ns1 MPTCP_LIB_EVENT_SUB_ESTABLISHED 4 + chk_evt_nr ns1 MPTCP_LIB_EVENT_SUB_CLOSED 2 + + chk_evt_nr ns2 MPTCP_LIB_EVENT_CREATED 1 + chk_evt_nr ns2 MPTCP_LIB_EVENT_ESTABLISHED 1 + chk_evt_nr ns2 MPTCP_LIB_EVENT_ANNOUNCED 5 + chk_evt_nr ns2 MPTCP_LIB_EVENT_REMOVED 3 + chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_ESTABLISHED 4 + chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_CLOSED 2 + chk_join_nr 4 4 4 chk_add_nr 5 5 chk_rm_nr 3 2 invert diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index 438280e684346c..4578a331041ed4 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -12,10 +12,14 @@ readonly KSFT_SKIP=4 readonly KSFT_TEST="${MPTCP_LIB_KSFT_TEST:-$(basename "${0}" .sh)}" # These variables are used in some selftests, read-only +declare -rx MPTCP_LIB_EVENT_CREATED=1 # MPTCP_EVENT_CREATED +declare -rx MPTCP_LIB_EVENT_ESTABLISHED=2 # MPTCP_EVENT_ESTABLISHED +declare -rx MPTCP_LIB_EVENT_CLOSED=3 # MPTCP_EVENT_CLOSED declare -rx MPTCP_LIB_EVENT_ANNOUNCED=6 # MPTCP_EVENT_ANNOUNCED declare -rx MPTCP_LIB_EVENT_REMOVED=7 # MPTCP_EVENT_REMOVED declare -rx MPTCP_LIB_EVENT_SUB_ESTABLISHED=10 # MPTCP_EVENT_SUB_ESTABLISHED declare -rx MPTCP_LIB_EVENT_SUB_CLOSED=11 # MPTCP_EVENT_SUB_CLOSED +declare -rx MPTCP_LIB_EVENT_SUB_PRIORITY=13 # MPTCP_EVENT_SUB_PRIORITY declare -rx MPTCP_LIB_EVENT_LISTENER_CREATED=15 # MPTCP_EVENT_LISTENER_CREATED declare -rx MPTCP_LIB_EVENT_LISTENER_CLOSED=16 # MPTCP_EVENT_LISTENER_CLOSED From 57f86203b41c98b322119dfdbb1ec54ce5e3369b Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 28 Aug 2024 08:14:37 +0200 Subject: [PATCH 823/991] mptcp: pm: ADD_ADDR 0 is not a new address The ADD_ADDR 0 with the address from the initial subflow should not be considered as a new address: this is not something new. If the host receives it, it simply means that the address is available again. When receiving an ADD_ADDR for the ID 0, the PM already doesn't consider it as new by not incrementing the 'add_addr_accepted' counter. But the 'accept_addr' might not be set if the limit has already been reached: this can be bypassed in this case. But before, it is important to check that this ADD_ADDR for the ID 0 is for the same address as the initial subflow. If not, it is not something that should happen, and the ADD_ADDR can be ignored. Note that if an ADD_ADDR is received while there is already a subflow opened using the same address, this ADD_ADDR is ignored as well. It means that if multiple ADD_ADDR for ID 0 are received, there will not be any duplicated subflows created by the client. Fixes: d0876b2284cf ("mptcp: add the incoming RM_ADDR support") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- net/mptcp/pm.c | 4 +++- net/mptcp/pm_netlink.c | 9 +++++++++ net/mptcp/protocol.h | 2 ++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 3f8dbde243f104..37f6dbcd8434d1 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -226,7 +226,9 @@ void mptcp_pm_add_addr_received(const struct sock *ssk, } else { __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP); } - } else if (!READ_ONCE(pm->accept_addr)) { + /* id0 should not have a different address */ + } else if ((addr->id == 0 && !mptcp_pm_nl_is_init_remote_addr(msk, addr)) || + (addr->id > 0 && !READ_ONCE(pm->accept_addr))) { mptcp_pm_announce_addr(msk, addr, true); mptcp_pm_add_addr_send_ack(msk); } else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) { diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index a93450ded50aee..f891bc714668c5 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -760,6 +760,15 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) } } +bool mptcp_pm_nl_is_init_remote_addr(struct mptcp_sock *msk, + const struct mptcp_addr_info *remote) +{ + struct mptcp_addr_info mpc_remote; + + remote_address((struct sock_common *)msk, &mpc_remote); + return mptcp_addresses_equal(&mpc_remote, remote, remote->port); +} + void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 26eb898a202b34..3b22313d1b86f6 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -993,6 +993,8 @@ void mptcp_pm_add_addr_received(const struct sock *ssk, void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk); +bool mptcp_pm_nl_is_init_remote_addr(struct mptcp_sock *msk, + const struct mptcp_addr_info *remote); void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); From f18fa2abf81099d822d842a107f8c9889c86043c Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 28 Aug 2024 08:14:38 +0200 Subject: [PATCH 824/991] selftests: mptcp: join: check re-re-adding ID 0 signal This test extends "delete re-add signal" to validate the previous commit: when the 'signal' endpoint linked to the initial subflow (ID 0) is re-added multiple times, it will re-send the ADD_ADDR with id 0. The client should still be able to re-create this subflow, even if the add_addr_accepted limit has been reached as this special address is not considered as a new address. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. Fixes: d0876b2284cf ("mptcp: add the incoming RM_ADDR support") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- .../testing/selftests/net/mptcp/mptcp_join.sh | 32 ++++++++++++------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index a8ea0fe200fb15..a4762c49a87861 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3688,7 +3688,7 @@ endpoint_tests() # broadcast IP: no packet for this address will be received on ns1 pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal pm_nl_add_endpoint $ns1 10.0.1.1 id 42 flags signal - test_linkfail=4 speed=20 \ + test_linkfail=4 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & local tests_pid=$! @@ -3717,7 +3717,17 @@ endpoint_tests() pm_nl_add_endpoint $ns1 10.0.1.1 id 99 flags signal wait_mpj $ns2 - chk_subflow_nr "after re-add" 3 + chk_subflow_nr "after re-add ID 0" 3 + chk_mptcp_info subflows 3 subflows 3 + + pm_nl_del_endpoint $ns1 99 10.0.1.1 + sleep 0.5 + chk_subflow_nr "after re-delete ID 0" 2 + chk_mptcp_info subflows 2 subflows 2 + + pm_nl_add_endpoint $ns1 10.0.1.1 id 88 flags signal + wait_mpj $ns2 + chk_subflow_nr "after re-re-add ID 0" 3 chk_mptcp_info subflows 3 subflows 3 mptcp_lib_kill_wait $tests_pid @@ -3727,19 +3737,19 @@ endpoint_tests() chk_evt_nr ns1 MPTCP_LIB_EVENT_ESTABLISHED 1 chk_evt_nr ns1 MPTCP_LIB_EVENT_ANNOUNCED 0 chk_evt_nr ns1 MPTCP_LIB_EVENT_REMOVED 0 - chk_evt_nr ns1 MPTCP_LIB_EVENT_SUB_ESTABLISHED 4 - chk_evt_nr ns1 MPTCP_LIB_EVENT_SUB_CLOSED 2 + chk_evt_nr ns1 MPTCP_LIB_EVENT_SUB_ESTABLISHED 5 + chk_evt_nr ns1 MPTCP_LIB_EVENT_SUB_CLOSED 3 chk_evt_nr ns2 MPTCP_LIB_EVENT_CREATED 1 chk_evt_nr ns2 MPTCP_LIB_EVENT_ESTABLISHED 1 - chk_evt_nr ns2 MPTCP_LIB_EVENT_ANNOUNCED 5 - chk_evt_nr ns2 MPTCP_LIB_EVENT_REMOVED 3 - chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_ESTABLISHED 4 - chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_CLOSED 2 + chk_evt_nr ns2 MPTCP_LIB_EVENT_ANNOUNCED 6 + chk_evt_nr ns2 MPTCP_LIB_EVENT_REMOVED 4 + chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_ESTABLISHED 5 + chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_CLOSED 3 - chk_join_nr 4 4 4 - chk_add_nr 5 5 - chk_rm_nr 3 2 invert + chk_join_nr 5 5 5 + chk_add_nr 6 6 + chk_rm_nr 4 3 invert fi # flush and re-add From 6213dcc752f5d605cc50e08597f47fcbe658a40e Mon Sep 17 00:00:00 2001 From: Sriram Yagnaraman Date: Wed, 28 Aug 2024 09:24:17 +0200 Subject: [PATCH 825/991] mailmap: update entry for Sriram Yagnaraman Link my old est.tech address to my active mail address Signed-off-by: Sriram Yagnaraman Reviewed-by: Kurt Kanzenbach Link: https://patch.msgid.link/20240828072417.4111996-1-sriram.yagnaraman@ericsson.com Signed-off-by: Paolo Abeni --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index 8ee01d9d70464b..53ebff0e268a4a 100644 --- a/.mailmap +++ b/.mailmap @@ -614,6 +614,7 @@ Simon Kelley Sricharan Ramabadhran Srinivas Ramana Sriram R +Sriram Yagnaraman Stanislav Fomichev Stefan Wahren Stéphane Witzmann From 3ab394b363c5fd14b231e335fb6746ddfb93aaaa Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 3 Jul 2024 19:30:20 +0200 Subject: [PATCH 826/991] fuse: disable the combination of passthrough and writeback cache Current design and handling of passthrough is without fuse caching and with that FUSE_WRITEBACK_CACHE is conflicting. Fixes: 7dc4e97a4f9a ("fuse: introduce FUSE_PASSTHROUGH capability") Cc: stable@kernel.org # v6.9 Signed-off-by: Bernd Schubert Acked-by: Amir Goldstein Reviewed-by: Josef Bacik Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index d8ab4e93916fa3..bebd89002328e4 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1332,11 +1332,16 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, * on a stacked fs (e.g. overlayfs) themselves and with * max_stack_depth == 1, FUSE fs can be stacked as the * underlying fs of a stacked fs (e.g. overlayfs). + * + * Also don't allow the combination of FUSE_PASSTHROUGH + * and FUSE_WRITEBACK_CACHE, current design doesn't handle + * them together. */ if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH) && (flags & FUSE_PASSTHROUGH) && arg->max_stack_depth > 0 && - arg->max_stack_depth <= FILESYSTEM_MAX_STACK_DEPTH) { + arg->max_stack_depth <= FILESYSTEM_MAX_STACK_DEPTH && + !(flags & FUSE_WRITEBACK_CACHE)) { fc->passthrough = 1; fc->max_stack_depth = arg->max_stack_depth; fm->sb->s_stack_depth = arg->max_stack_depth; From febccb39255f9df35527b88c953b2e0deae50e53 Mon Sep 17 00:00:00 2001 From: Aleksandr Mishin Date: Tue, 27 Aug 2024 11:48:22 +0300 Subject: [PATCH 827/991] nfc: pn533: Add poll mod list filling check In case of im_protocols value is 1 and tm_protocols value is 0 this combination successfully passes the check 'if (!im_protocols && !tm_protocols)' in the nfc_start_poll(). But then after pn533_poll_create_mod_list() call in pn533_start_poll() poll mod list will remain empty and dev->poll_mod_count will remain 0 which lead to division by zero. Normally no im protocol has value 1 in the mask, so this combination is not expected by driver. But these protocol values actually come from userspace via Netlink interface (NFC_CMD_START_POLL operation). So a broken or malicious program may pass a message containing a "bad" combination of protocol parameter values so that dev->poll_mod_count is not incremented inside pn533_poll_create_mod_list(), thus leading to division by zero. Call trace looks like: nfc_genl_start_poll() nfc_start_poll() ->start_poll() pn533_start_poll() Add poll mod list filling check. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: dfccd0f58044 ("NFC: pn533: Add some polling entropy") Signed-off-by: Aleksandr Mishin Acked-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20240827084822.18785-1-amishin@t-argos.ru Signed-off-by: Paolo Abeni --- drivers/nfc/pn533/pn533.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/nfc/pn533/pn533.c b/drivers/nfc/pn533/pn533.c index b19c39dcfbd932..e2bc67300a915a 100644 --- a/drivers/nfc/pn533/pn533.c +++ b/drivers/nfc/pn533/pn533.c @@ -1723,6 +1723,11 @@ static int pn533_start_poll(struct nfc_dev *nfc_dev, } pn533_poll_create_mod_list(dev, im_protocols, tm_protocols); + if (!dev->poll_mod_count) { + nfc_err(dev->dev, + "Poll mod list is empty\n"); + return -EINVAL; + } /* Do not always start polling from the same modulation */ get_random_bytes(&rand_mod, sizeof(rand_mod)); From 59d237c8a241168c7ae34c48244059b7bafaff38 Mon Sep 17 00:00:00 2001 From: Karthik Poosa Date: Tue, 27 Aug 2024 21:23:01 +0530 Subject: [PATCH 828/991] drm/xe/hwmon: Fix WRITE_I1 param from u32 to u16 WRITE_I1 sub-command of the POWER_SETUP pcode command accepts a u16 parameter instead of u32. This change prevents potential illegal sub-command errors. v2: Mask uval instead of changing the prototype. (Badal) v3: Rephrase commit message. (Badal) Signed-off-by: Karthik Poosa Fixes: 92d44a422d0d ("drm/xe/hwmon: Expose card reactive critical power") Reviewed-by: Badal Nilawar Link: https://patchwork.freedesktop.org/patch/msgid/20240827155301.183383-1-karthik.poosa@intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit a7f657097e96d8fa745c74bb1a239ebd5a8c971c) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_hwmon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_hwmon.c b/drivers/gpu/drm/xe/xe_hwmon.c index 832ea81faeee50..1faeca70900ed4 100644 --- a/drivers/gpu/drm/xe/xe_hwmon.c +++ b/drivers/gpu/drm/xe/xe_hwmon.c @@ -450,7 +450,7 @@ static int xe_hwmon_pcode_write_i1(struct xe_gt *gt, u32 uval) { return xe_pcode_write(gt, PCODE_MBOX(PCODE_POWER_SETUP, POWER_SETUP_SUBCOMMAND_WRITE_I1, 0), - uval); + (uval & POWER_SETUP_I1_DATA_MASK)); } static int xe_hwmon_power_curr_crit_read(struct xe_hwmon *hwmon, int channel, From c472d33bcbf7a1ed3710efe93822b5e94eabe18c Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Thu, 29 Aug 2024 08:38:54 -0700 Subject: [PATCH 829/991] Input: cypress_ps2 - fix waiting for command response Commit 8bccf667f62a ("Input: cypress_ps2 - report timeouts when reading command status") uncovered an existing problem with cypress_ps2 driver: it tries waiting on a PS/2 device waitqueue without using the rest of libps2. Unfortunately without it nobody signals wakeup for the waiting process, and each "extended" command was timing out. But the rest of the code simply did not notice it. Fix this by switching from homegrown way of sending request to get command response and reading it to standard ps2_command() which does the right thing. Reported-by: Woody Suwalski Tested-by: Woody Suwalski Fixes: 8bccf667f62a ("Input: cypress_ps2 - report timeouts when reading command status") Link: https://lore.kernel.org/r/a8252e0f-dab4-ef5e-2aa1-407a6f4c7204@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/cypress_ps2.c | 58 +++++++------------------------ 1 file changed, 13 insertions(+), 45 deletions(-) diff --git a/drivers/input/mouse/cypress_ps2.c b/drivers/input/mouse/cypress_ps2.c index b3c34ebcc4efcb..9446657a5f355d 100644 --- a/drivers/input/mouse/cypress_ps2.c +++ b/drivers/input/mouse/cypress_ps2.c @@ -91,48 +91,6 @@ static int cypress_ps2_ext_cmd(struct psmouse *psmouse, u8 prefix, u8 nibble) return rc; } -static int cypress_ps2_read_cmd_status(struct psmouse *psmouse, - u8 cmd, u8 *param) -{ - struct ps2dev *ps2dev = &psmouse->ps2dev; - enum psmouse_state old_state; - int pktsize; - int rc; - - ps2_begin_command(ps2dev); - - old_state = psmouse->state; - psmouse->state = PSMOUSE_CMD_MODE; - psmouse->pktcnt = 0; - - pktsize = (cmd == CYTP_CMD_READ_TP_METRICS) ? 8 : 3; - memset(param, 0, pktsize); - - rc = cypress_ps2_sendbyte(psmouse, PSMOUSE_CMD_GETINFO & 0xff); - if (rc) - goto out; - - if (!wait_event_timeout(ps2dev->wait, - psmouse->pktcnt >= pktsize, - msecs_to_jiffies(CYTP_CMD_TIMEOUT))) { - rc = -ETIMEDOUT; - goto out; - } - - memcpy(param, psmouse->packet, pktsize); - - psmouse_dbg(psmouse, "Command 0x%02x response data (0x): %*ph\n", - cmd, pktsize, param); - -out: - psmouse->state = old_state; - psmouse->pktcnt = 0; - - ps2_end_command(ps2dev); - - return rc; -} - static bool cypress_verify_cmd_state(struct psmouse *psmouse, u8 cmd, u8* param) { bool rate_match = false; @@ -166,6 +124,8 @@ static bool cypress_verify_cmd_state(struct psmouse *psmouse, u8 cmd, u8* param) static int cypress_send_ext_cmd(struct psmouse *psmouse, u8 cmd, u8 *param) { u8 cmd_prefix = PSMOUSE_CMD_SETRES & 0xff; + unsigned int resp_size = cmd == CYTP_CMD_READ_TP_METRICS ? 8 : 3; + unsigned int ps2_cmd = (PSMOUSE_CMD_GETINFO & 0xff) | (resp_size << 8); int tries = CYTP_PS2_CMD_TRIES; int error; @@ -179,10 +139,18 @@ static int cypress_send_ext_cmd(struct psmouse *psmouse, u8 cmd, u8 *param) cypress_ps2_ext_cmd(psmouse, cmd_prefix, DECODE_CMD_BB(cmd)); cypress_ps2_ext_cmd(psmouse, cmd_prefix, DECODE_CMD_AA(cmd)); - error = cypress_ps2_read_cmd_status(psmouse, cmd, param); - if (!error && cypress_verify_cmd_state(psmouse, cmd, param)) - return 0; + error = ps2_command(&psmouse->ps2dev, param, ps2_cmd); + if (error) { + psmouse_dbg(psmouse, "Command 0x%02x failed: %d\n", + cmd, error); + } else { + psmouse_dbg(psmouse, + "Command 0x%02x response data (0x): %*ph\n", + cmd, resp_size, param); + if (cypress_verify_cmd_state(psmouse, cmd, param)) + return 0; + } } while (--tries > 0); return -EIO; From 04c8abae1b7b2abeb638a3d5d5950fa2a031c244 Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Thu, 29 Aug 2024 11:20:49 -0700 Subject: [PATCH 830/991] dcache: keep dentry_hashtable or d_hash_shift even when not used The runtime constant feature removes all the users of these variables, allowing the compiler to optimize them away. It's quite difficult to extract their values from the kernel text, and the memory saved by removing them is tiny, and it was never the point of this optimization. Since the dentry_hashtable is a core data structure, it's valuable for debugging tools to be able to read it easily. For instance, scripts built on drgn, like the dentrycache script[1], rely on it to be able to perform diagnostics on the contents of the dcache. Annotate it as used, so the compiler doesn't discard it. Link: https://github.com/oracle-samples/drgn-tools/blob/3afc56146f54d09dfd1f6d3c1b7436eda7e638be/drgn_tools/dentry.py#L325-L355 [1] Fixes: e3c92e81711d ("runtime constants: add x86 architecture support") Signed-off-by: Stephen Brennan Signed-off-by: Linus Torvalds --- fs/dcache.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 3d8daaecb6d1a2..6386b9b625ddbc 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -96,11 +96,16 @@ EXPORT_SYMBOL(dotdot_name); * * This hash-function tries to avoid losing too many bits of hash * information, yet avoid using a prime hash-size or similar. + * + * Marking the variables "used" ensures that the compiler doesn't + * optimize them away completely on architectures with runtime + * constant infrastructure, this allows debuggers to see their + * values. But updating these values has no effect on those arches. */ -static unsigned int d_hash_shift __ro_after_init; +static unsigned int d_hash_shift __ro_after_init __used; -static struct hlist_bl_head *dentry_hashtable __ro_after_init; +static struct hlist_bl_head *dentry_hashtable __ro_after_init __used; static inline struct hlist_bl_head *d_hash(unsigned long hashlen) { From 8d8d244726c8436c50f84092616c92bf551ea89a Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Fri, 23 Aug 2024 13:47:05 +0200 Subject: [PATCH 831/991] smb: Annotate struct xattr_smb_acl with __counted_by() Add the __counted_by compiler attribute to the flexible array member entries to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE. Signed-off-by: Thorsten Blum Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/xattr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/server/xattr.h b/fs/smb/server/xattr.h index 16499ca5c82d36..fa3e27d6971b82 100644 --- a/fs/smb/server/xattr.h +++ b/fs/smb/server/xattr.h @@ -76,7 +76,7 @@ struct xattr_acl_entry { struct xattr_smb_acl { int count; int next; - struct xattr_acl_entry entries[]; + struct xattr_acl_entry entries[] __counted_by(count); }; /* 64bytes hash in xattr_ntacl is computed with sha256 */ From 78c5a6f1f630172b19af4912e755e1da93ef0ab5 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Tue, 27 Aug 2024 21:44:41 +0900 Subject: [PATCH 832/991] ksmbd: unset the binding mark of a reused connection Steve French reported null pointer dereference error from sha256 lib. cifs.ko can send session setup requests on reused connection. If reused connection is used for binding session, conn->binding can still remain true and generate_preauth_hash() will not set sess->Preauth_HashValue and it will be NULL. It is used as a material to create an encryption key in ksmbd_gen_smb311_encryptionkey. ->Preauth_HashValue cause null pointer dereference error from crypto_shash_update(). BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 8 PID: 429254 Comm: kworker/8:39 Hardware name: LENOVO 20MAS08500/20MAS08500, BIOS N2CET69W (1.52 ) Workqueue: ksmbd-io handle_ksmbd_work [ksmbd] RIP: 0010:lib_sha256_base_do_update.isra.0+0x11e/0x1d0 [sha256_ssse3] ? show_regs+0x6d/0x80 ? __die+0x24/0x80 ? page_fault_oops+0x99/0x1b0 ? do_user_addr_fault+0x2ee/0x6b0 ? exc_page_fault+0x83/0x1b0 ? asm_exc_page_fault+0x27/0x30 ? __pfx_sha256_transform_rorx+0x10/0x10 [sha256_ssse3] ? lib_sha256_base_do_update.isra.0+0x11e/0x1d0 [sha256_ssse3] ? __pfx_sha256_transform_rorx+0x10/0x10 [sha256_ssse3] ? __pfx_sha256_transform_rorx+0x10/0x10 [sha256_ssse3] _sha256_update+0x77/0xa0 [sha256_ssse3] sha256_avx2_update+0x15/0x30 [sha256_ssse3] crypto_shash_update+0x1e/0x40 hmac_update+0x12/0x20 crypto_shash_update+0x1e/0x40 generate_key+0x234/0x380 [ksmbd] generate_smb3encryptionkey+0x40/0x1c0 [ksmbd] ksmbd_gen_smb311_encryptionkey+0x72/0xa0 [ksmbd] ntlm_authenticate.isra.0+0x423/0x5d0 [ksmbd] smb2_sess_setup+0x952/0xaa0 [ksmbd] __process_request+0xa3/0x1d0 [ksmbd] __handle_ksmbd_work+0x1c4/0x2f0 [ksmbd] handle_ksmbd_work+0x2d/0xa0 [ksmbd] process_one_work+0x16c/0x350 worker_thread+0x306/0x440 ? __pfx_worker_thread+0x10/0x10 kthread+0xef/0x120 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x44/0x70 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1b/0x30 Fixes: f5a544e3bab7 ("ksmbd: add support for SMB3 multichannel") Cc: stable@vger.kernel.org # v5.15+ Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 20846a4d3031fb..8bdc592514188a 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1690,6 +1690,8 @@ int smb2_sess_setup(struct ksmbd_work *work) rc = ksmbd_session_register(conn, sess); if (rc) goto out_err; + + conn->binding = false; } else if (conn->dialect >= SMB30_PROT_ID && (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) && req->Flags & SMB2_SESSION_REQ_FLAG_BINDING) { @@ -1768,6 +1770,8 @@ int smb2_sess_setup(struct ksmbd_work *work) sess = NULL; goto out_err; } + + conn->binding = false; } work->sess = sess; From 844436e045ac2ab7895d8b281cb784a24de1d14d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 29 Aug 2024 22:22:35 +0300 Subject: [PATCH 833/991] ksmbd: Unlock on in ksmbd_tcp_set_interfaces() Unlock before returning an error code if this allocation fails. Fixes: 0626e6641f6b ("cifsd: add server handler for central processing and tranport layers") Cc: stable@vger.kernel.org # v5.15+ Signed-off-by: Dan Carpenter Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_tcp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c index a84788396daaa4..aaed9e293b2e02 100644 --- a/fs/smb/server/transport_tcp.c +++ b/fs/smb/server/transport_tcp.c @@ -624,8 +624,10 @@ int ksmbd_tcp_set_interfaces(char *ifc_list, int ifc_list_sz) for_each_netdev(&init_net, netdev) { if (netif_is_bridge_port(netdev)) continue; - if (!alloc_iface(kstrdup(netdev->name, GFP_KERNEL))) + if (!alloc_iface(kstrdup(netdev->name, GFP_KERNEL))) { + rtnl_unlock(); return -ENOMEM; + } } rtnl_unlock(); bind_additional_ifaces = 1; From c26096ee0278c5e765009c5eee427bbafe6dc090 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 28 Aug 2024 22:02:45 +0100 Subject: [PATCH 834/991] mm: Fix filemap_invalidate_inode() to use invalidate_inode_pages2_range() Fix filemap_invalidate_inode() to use invalidate_inode_pages2_range() rather than truncate_inode_pages_range(). The latter clears the invalidated bit of a partial pages rather than discarding it entirely. This causes copy_file_range() to fail on cifs because the partial pages at either end of the destination range aren't evicted and reread, but rather just partly cleared. This causes generic/075 and generic/112 xfstests to fail. Fixes: 74e797d79cf1 ("mm: Provide a means of invalidation without using launder_folio") Signed-off-by: David Howells Link: https://lore.kernel.org/r/20240828210249.1078637-5-dhowells@redhat.com cc: Matthew Wilcox cc: Miklos Szeredi cc: Trond Myklebust cc: Christoph Hellwig cc: Andrew Morton cc: Alexander Viro cc: Christian Brauner cc: Jeff Layton cc: linux-mm@kvack.org cc: linux-fsdevel@vger.kernel.org cc: netfs@lists.linux.dev cc: v9fs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: ceph-devel@vger.kernel.org cc: linux-cifs@vger.kernel.org cc: linux-nfs@vger.kernel.org cc: devel@lists.orangefs.org Signed-off-by: Christian Brauner --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index d62150418b9104..0ca9c1377b686f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -4231,7 +4231,7 @@ int filemap_invalidate_inode(struct inode *inode, bool flush, } /* Wait for writeback to complete on all folios and discard. */ - truncate_inode_pages_range(mapping, start, end); + invalidate_inode_pages2_range(mapping, start / PAGE_SIZE, end / PAGE_SIZE); unlock: filemap_invalidate_unlock(mapping); From 1c47c0d6014c832ad8e2ba04fc2c5b7070d999f7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 28 Aug 2024 09:42:33 -0600 Subject: [PATCH 835/991] io_uring/rsrc: ensure compat iovecs are copied correctly For buffer registration (or updates), a userspace iovec is copied in and updated. If the application is within a compat syscall, then the iovec type is compat_iovec rather than iovec. However, the type used in __io_sqe_buffers_update() and io_sqe_buffers_register() is always struct iovec, and hence the source is incremented by the size of a non-compat iovec in the loop. This misses every other iovec in the source, and will run into garbage half way through the copies and return -EFAULT to the application. Maintain the source address separately and assign to our user vec pointer, so that copies always happen from the right source address. While in there, correct a bad placement of __user which triggered the following sparse warning prior to this fix: io_uring/rsrc.c:981:33: warning: cast removes address space '__user' of expression io_uring/rsrc.c:981:30: warning: incorrect type in assignment (different address spaces) io_uring/rsrc.c:981:30: expected struct iovec const [noderef] __user *uvec io_uring/rsrc.c:981:30: got struct iovec *[noderef] __user Fixes: f4eaf8eda89e ("io_uring/rsrc: Drop io_copy_iov in favor of iovec API") Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index a860516bf44840..453867add7caa9 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -394,10 +394,11 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update2 *up, unsigned int nr_args) { - struct iovec __user *uvec = u64_to_user_ptr(up->data); u64 __user *tags = u64_to_user_ptr(up->tags); struct iovec fast_iov, *iov; struct page *last_hpage = NULL; + struct iovec __user *uvec; + u64 user_data = up->data; __u32 done; int i, err; @@ -410,7 +411,8 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu; u64 tag = 0; - iov = iovec_from_user(&uvec[done], 1, 1, &fast_iov, ctx->compat); + uvec = u64_to_user_ptr(user_data); + iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); if (IS_ERR(iov)) { err = PTR_ERR(iov); break; @@ -443,6 +445,10 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, ctx->user_bufs[i] = imu; *io_get_tag_slot(ctx->buf_data, i) = tag; + if (ctx->compat) + user_data += sizeof(struct compat_iovec); + else + user_data += sizeof(struct iovec); } return done ? done : err; } @@ -949,7 +955,7 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, struct page *last_hpage = NULL; struct io_rsrc_data *data; struct iovec fast_iov, *iov = &fast_iov; - const struct iovec __user *uvec = (struct iovec * __user) arg; + const struct iovec __user *uvec; int i, ret; BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); @@ -972,7 +978,8 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { if (arg) { - iov = iovec_from_user(&uvec[i], 1, 1, &fast_iov, ctx->compat); + uvec = (struct iovec __user *) arg; + iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); if (IS_ERR(iov)) { ret = PTR_ERR(iov); break; @@ -980,6 +987,10 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, ret = io_buffer_validate(iov); if (ret) break; + if (ctx->compat) + arg += sizeof(struct compat_iovec); + else + arg += sizeof(struct iovec); } if (!iov->iov_base && *io_get_tag_slot(data, i)) { From 40927f3d0972bf86357a32a5749be71a551241b6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 29 Aug 2024 09:06:28 +1000 Subject: [PATCH 836/991] nfsd: fix nfsd4_deleg_getattr_conflict in presence of third party lease It is not safe to dereference fl->c.flc_owner without first confirming fl->fl_lmops is the expected manager. nfsd4_deleg_getattr_conflict() tests fl_lmops but largely ignores the result and assumes that flc_owner is an nfs4_delegation anyway. This is wrong. With this patch we restore the "!= &nfsd_lease_mng_ops" case to behave as it did before the change mentioned below. This is the same as the current code, but without any reference to a possible delegation. Fixes: c5967721e106 ("NFSD: handle GETATTR conflict with write delegation") Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 07f2496850c4c6..a366fb1c1b9b4f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -8859,7 +8859,15 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, */ if (type == F_RDLCK) break; - goto break_lease; + + nfsd_stats_wdeleg_getattr_inc(nn); + spin_unlock(&ctx->flc_lock); + + status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); + if (status != nfserr_jukebox || + !nfsd_wait_for_delegreturn(rqstp, inode)) + return status; + return 0; } if (type == F_WRLCK) { struct nfs4_delegation *dp = fl->c.flc_owner; @@ -8868,7 +8876,6 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, spin_unlock(&ctx->flc_lock); return 0; } -break_lease: nfsd_stats_wdeleg_getattr_inc(nn); dp = fl->c.flc_owner; refcount_inc(&dp->dl_stid.sc_count); From f274495aea7b15225b3d83837121b22ef96e560c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 30 Aug 2024 10:45:54 -0600 Subject: [PATCH 837/991] io_uring/kbuf: return correct iovec count from classic buffer peek io_provided_buffers_select() returns 0 to indicate success, but it should be returning 1 to indicate that 1 vec was mapped. This causes peeking to fail with classic provided buffers, and while that's not a use case that anyone should use, it should still work correctly. The end result is that no buffer will be selected, and hence a completion with '0' as the result will be posted, without a buffer attached. Fixes: 35c8711c8fc4 ("io_uring/kbuf: add helpers for getting/peeking multiple buffers") Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 1af2bd56af44ac..bdfa30b38321b7 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -129,7 +129,7 @@ static int io_provided_buffers_select(struct io_kiocb *req, size_t *len, iov[0].iov_base = buf; iov[0].iov_len = *len; - return 0; + return 1; } static struct io_uring_buf *io_ring_head_to_buf(struct io_uring_buf_ring *br, From 150b572a7c1df30f5d32d87ad96675200cca7b80 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 26 Aug 2024 16:27:39 -0400 Subject: [PATCH 838/991] MAINTAINERS: PCI: Add NXP PCI controller mailing list imx@lists.linux.dev Add imx mailing list imx@lists.linux.dev for PCI controller of NXP chips (Layerscape and iMX). Link: https://lore.kernel.org/r/20240826202740.970015-1-Frank.Li@nxp.com Signed-off-by: Frank Li Signed-off-by: Bjorn Helgaas Acked-by: Richard Zhu --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 3fb27f41515d51..1b7a6a8073bb2c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17412,6 +17412,7 @@ M: Roy Zang L: linuxppc-dev@lists.ozlabs.org L: linux-pci@vger.kernel.org L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) +L: imx@lists.linux.dev S: Maintained F: drivers/pci/controller/dwc/*layerscape* @@ -17438,6 +17439,7 @@ M: Richard Zhu M: Lucas Stach L: linux-pci@vger.kernel.org L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) +L: imx@lists.linux.dev S: Maintained F: Documentation/devicetree/bindings/pci/fsl,imx6q-pcie-common.yaml F: Documentation/devicetree/bindings/pci/fsl,imx6q-pcie-ep.yaml From d8b762070c3fde224f8b9ea3cf59bc41a5a3eb57 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 23 Aug 2024 13:55:00 +0200 Subject: [PATCH 839/991] power: sequencing: qcom-wcn: set the wlan-enable GPIO to output Commit a9aaf1ff88a8 ("power: sequencing: request the WLAN enable GPIO as-is") broke WLAN on boards on which the wlan-enable GPIO enabling the wifi module isn't in output mode by default. We need to set direction to output while retaining the value that was already set to keep the ath module on if it's already started. Fixes: a9aaf1ff88a8 ("power: sequencing: request the WLAN enable GPIO as-is") Link: https://lore.kernel.org/r/20240823115500.37280-1-brgl@bgdev.pl Signed-off-by: Bartosz Golaszewski --- drivers/power/sequencing/pwrseq-qcom-wcn.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/power/sequencing/pwrseq-qcom-wcn.c b/drivers/power/sequencing/pwrseq-qcom-wcn.c index d786cbf1b2cd64..700879474abf25 100644 --- a/drivers/power/sequencing/pwrseq-qcom-wcn.c +++ b/drivers/power/sequencing/pwrseq-qcom-wcn.c @@ -288,6 +288,13 @@ static int pwrseq_qcom_wcn_probe(struct platform_device *pdev) return dev_err_probe(dev, PTR_ERR(ctx->wlan_gpio), "Failed to get the WLAN enable GPIO\n"); + /* + * Set direction to output but keep the current value in order to not + * disable the WLAN module accidentally if it's already powered on. + */ + gpiod_direction_output(ctx->wlan_gpio, + gpiod_get_value_cansleep(ctx->wlan_gpio)); + ctx->clk = devm_clk_get_optional(dev, NULL); if (IS_ERR(ctx->clk)) return dev_err_probe(dev, PTR_ERR(ctx->clk), From e3e6940940910c2287fe962bdf72015efd4fee81 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 31 Aug 2024 17:44:51 -0400 Subject: [PATCH 840/991] bcachefs: Revert lockless buffered IO path We had a report of data corruption on nixos when building installer images. https://github.com/NixOS/nixpkgs/pull/321055#issuecomment-2184131334 It seems that writes are being dropped, but only when issued by QEMU, and possibly only in snapshot mode. It's undetermined if it's write calls are being dropped or dirty folios. Further testing, via minimizing the original patch to just the change that skips the inode lock on non appends/truncates, reveals that it really is just not taking the inode lock that causes the corruption: it has nothing to do with the other logic changes for preserving write atomicity in corner cases. It's also kernel config dependent: it doesn't reproduce with the minimal kernel config that ktest uses, but it does reproduce with nixos's distro config. Bisection the kernel config initially pointer the finger at page migration or compaction, but it appears that was erroneous; we haven't yet determined what kernel config option actually triggers it. Sadly it appears this will have to be reverted since we're getting too close to release and my plate is full, but we'd _really_ like to fully debug it. My suspicion is that this patch is exposing a preexisting bug - the inode lock actually covers very little in IO paths, and we have a different lock (the pagecache add lock) that guards against races with truncate here. Fixes: 7e64c86cdc6c ("bcachefs: Buffered write path now can avoid the inode lock") Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 1 - fs/bcachefs/fs-io-buffered.c | 149 ++++++++++------------------------- 2 files changed, 40 insertions(+), 110 deletions(-) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index ab5a7adece1042..742dcdd3e5d7d4 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -257,7 +257,6 @@ x(BCH_ERR_nopromote, nopromote_in_flight) \ x(BCH_ERR_nopromote, nopromote_no_writes) \ x(BCH_ERR_nopromote, nopromote_enomem) \ - x(0, need_inode_lock) \ x(0, invalid_snapshot_node) \ x(0, option_needs_open_fs) diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 184d0385167689..ec8c427bf58893 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -802,8 +802,7 @@ static noinline void folios_trunc(folios *fs, struct folio **fi) static int __bch2_buffered_write(struct bch_inode_info *inode, struct address_space *mapping, struct iov_iter *iter, - loff_t pos, unsigned len, - bool inode_locked) + loff_t pos, unsigned len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_folio_reservation res; @@ -827,15 +826,6 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, BUG_ON(!fs.nr); - /* - * If we're not using the inode lock, we need to lock all the folios for - * atomiticity of writes vs. other writes: - */ - if (!inode_locked && folio_end_pos(darray_last(fs)) < end) { - ret = -BCH_ERR_need_inode_lock; - goto out; - } - f = darray_first(fs); if (pos != folio_pos(f) && !folio_test_uptodate(f)) { ret = bch2_read_single_folio(f, mapping); @@ -932,10 +922,8 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, end = pos + copied; spin_lock(&inode->v.i_lock); - if (end > inode->v.i_size) { - BUG_ON(!inode_locked); + if (end > inode->v.i_size) i_size_write(&inode->v, end); - } spin_unlock(&inode->v.i_lock); f_pos = pos; @@ -979,68 +967,12 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct bch_inode_info *inode = file_bch_inode(file); - loff_t pos; - bool inode_locked = false; - ssize_t written = 0, written2 = 0, ret = 0; - - /* - * We don't take the inode lock unless i_size will be changing. Folio - * locks provide exclusion with other writes, and the pagecache add lock - * provides exclusion with truncate and hole punching. - * - * There is one nasty corner case where atomicity would be broken - * without great care: when copying data from userspace to the page - * cache, we do that with faults disable - a page fault would recurse - * back into the filesystem, taking filesystem locks again, and - * deadlock; so it's done with faults disabled, and we fault in the user - * buffer when we aren't holding locks. - * - * If we do part of the write, but we then race and in the userspace - * buffer have been evicted and are no longer resident, then we have to - * drop our folio locks to re-fault them in, breaking write atomicity. - * - * To fix this, we restart the write from the start, if we weren't - * holding the inode lock. - * - * There is another wrinkle after that; if we restart the write from the - * start, and then get an unrecoverable error, we _cannot_ claim to - * userspace that we did not write data we actually did - so we must - * track (written2) the most we ever wrote. - */ - - if ((iocb->ki_flags & IOCB_APPEND) || - (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) { - inode_lock(&inode->v); - inode_locked = true; - } - - ret = generic_write_checks(iocb, iter); - if (ret <= 0) - goto unlock; - - ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0); - if (ret) { - if (!inode_locked) { - inode_lock(&inode->v); - inode_locked = true; - ret = file_remove_privs_flags(file, 0); - } - if (ret) - goto unlock; - } - - ret = file_update_time(file); - if (ret) - goto unlock; - - pos = iocb->ki_pos; + loff_t pos = iocb->ki_pos; + ssize_t written = 0; + int ret = 0; bch2_pagecache_add_get(inode); - if (!inode_locked && - (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) - goto get_inode_lock; - do { unsigned offset = pos & (PAGE_SIZE - 1); unsigned bytes = iov_iter_count(iter); @@ -1065,17 +997,12 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) } } - if (unlikely(bytes != iov_iter_count(iter) && !inode_locked)) - goto get_inode_lock; - if (unlikely(fatal_signal_pending(current))) { ret = -EINTR; break; } - ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked); - if (ret == -BCH_ERR_need_inode_lock) - goto get_inode_lock; + ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); if (unlikely(ret < 0)) break; @@ -1096,46 +1023,50 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) } pos += ret; written += ret; - written2 = max(written, written2); - - if (ret != bytes && !inode_locked) - goto get_inode_lock; ret = 0; balance_dirty_pages_ratelimited(mapping); - - if (0) { -get_inode_lock: - bch2_pagecache_add_put(inode); - inode_lock(&inode->v); - inode_locked = true; - bch2_pagecache_add_get(inode); - - iov_iter_revert(iter, written); - pos -= written; - written = 0; - ret = 0; - } } while (iov_iter_count(iter)); - bch2_pagecache_add_put(inode); -unlock: - if (inode_locked) - inode_unlock(&inode->v); - iocb->ki_pos += written; + bch2_pagecache_add_put(inode); - ret = max(written, written2) ?: ret; - if (ret > 0) - ret = generic_write_sync(iocb, ret); - return ret; + return written ? written : ret; } -ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter) +ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) { - ssize_t ret = iocb->ki_flags & IOCB_DIRECT - ? bch2_direct_write(iocb, iter) - : bch2_buffered_write(iocb, iter); + struct file *file = iocb->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + ssize_t ret; + + if (iocb->ki_flags & IOCB_DIRECT) { + ret = bch2_direct_write(iocb, from); + goto out; + } + + inode_lock(&inode->v); + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto unlock; + + ret = file_remove_privs(file); + if (ret) + goto unlock; + + ret = file_update_time(file); + if (ret) + goto unlock; + + ret = bch2_buffered_write(iocb, from); + if (likely(ret > 0)) + iocb->ki_pos += ret; +unlock: + inode_unlock(&inode->v); + if (ret > 0) + ret = generic_write_sync(iocb, ret); +out: return bch2_err_class(ret); } From 3d3020c461936009dc58702e267ff67b0076cbf2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 Aug 2024 11:47:32 -0400 Subject: [PATCH 841/991] bcachefs: Mark more errors as autofix errors that are known to always be safe to fix should be autofix: this should be most errors even at this point, but that will need some thorough review. note that errors are still logged in the superblock, so we'll still know that they happened. Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-errors_format.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index d3a498617303e6..f0c14702f9e62c 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -23,7 +23,7 @@ enum bch_fsck_flags { x(jset_past_bucket_end, 9, 0) \ x(jset_seq_blacklisted, 10, 0) \ x(journal_entries_missing, 11, 0) \ - x(journal_entry_replicas_not_marked, 12, 0) \ + x(journal_entry_replicas_not_marked, 12, FSCK_AUTOFIX) \ x(journal_entry_past_jset_end, 13, 0) \ x(journal_entry_replicas_data_mismatch, 14, 0) \ x(journal_entry_bkey_u64s_0, 15, 0) \ @@ -288,10 +288,10 @@ enum bch_fsck_flags { x(invalid_btree_id, 274, 0) \ x(alloc_key_io_time_bad, 275, 0) \ x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \ - x(accounting_key_junk_at_end, 277, 0) \ - x(accounting_key_replicas_nr_devs_0, 278, 0) \ - x(accounting_key_replicas_nr_required_bad, 279, 0) \ - x(accounting_key_replicas_devs_unsorted, 280, 0) \ + x(accounting_key_junk_at_end, 277, FSCK_AUTOFIX) \ + x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \ + x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \ + x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, From 431c1646e1f86b949fa3685efc50b660a364c2b6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 1 Sep 2024 19:46:02 +1200 Subject: [PATCH 842/991] Linux 6.11-rc6 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 7b60eb103c5d3c..d57cfc6896b880 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 11 SUBLEVEL = 0 -EXTRAVERSION = -rc5 +EXTRAVERSION = -rc6 NAME = Baby Opossum Posse # *DOCUMENTATION* From 72a6e22c604c95ddb3b10b5d3bb85b6ff4dbc34f Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 26 Aug 2024 19:20:56 +0800 Subject: [PATCH 843/991] fscache: delete fscache_cookie_lru_timer when fscache exits to avoid UAF The fscache_cookie_lru_timer is initialized when the fscache module is inserted, but is not deleted when the fscache module is removed. If timer_reduce() is called before removing the fscache module, the fscache_cookie_lru_timer will be added to the timer list of the current cpu. Afterwards, a use-after-free will be triggered in the softIRQ after removing the fscache module, as follows: ================================================================== BUG: unable to handle page fault for address: fffffbfff803c9e9 PF: supervisor read access in kernel mode PF: error_code(0x0000) - not-present page PGD 21ffea067 P4D 21ffea067 PUD 21ffe6067 PMD 110a7c067 PTE 0 Oops: Oops: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 1 UID: 0 PID: 0 Comm: swapper/1 Tainted: G W 6.11.0-rc3 #855 Tainted: [W]=WARN RIP: 0010:__run_timer_base.part.0+0x254/0x8a0 Call Trace: tmigr_handle_remote_up+0x627/0x810 __walk_groups.isra.0+0x47/0x140 tmigr_handle_remote+0x1fa/0x2f0 handle_softirqs+0x180/0x590 irq_exit_rcu+0x84/0xb0 sysvec_apic_timer_interrupt+0x6e/0x90 asm_sysvec_apic_timer_interrupt+0x1a/0x20 RIP: 0010:default_idle+0xf/0x20 default_idle_call+0x38/0x60 do_idle+0x2b5/0x300 cpu_startup_entry+0x54/0x60 start_secondary+0x20d/0x280 common_startup_64+0x13e/0x148 Modules linked in: [last unloaded: netfs] ================================================================== Therefore delete fscache_cookie_lru_timer when removing the fscahe module. Fixes: 12bb21a29c19 ("fscache: Implement cookie user counting and resource pinning") Cc: stable@kernel.org Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240826112056.2458299-1-libaokun@huaweicloud.com Acked-by: David Howells Signed-off-by: Christian Brauner --- fs/netfs/fscache_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/netfs/fscache_main.c b/fs/netfs/fscache_main.c index 42e98bb523e369..49849005eb7cbc 100644 --- a/fs/netfs/fscache_main.c +++ b/fs/netfs/fscache_main.c @@ -103,6 +103,7 @@ void __exit fscache_exit(void) kmem_cache_destroy(fscache_cookie_jar); fscache_proc_cleanup(); + timer_shutdown_sync(&fscache_cookie_lru_timer); destroy_workqueue(fscache_wq); pr_notice("FS-Cache unloaded\n"); } From 6a422a96bc84cf9b9f0ff741f293a1f9059e0883 Mon Sep 17 00:00:00 2001 From: Pawel Dembicki Date: Fri, 30 Aug 2024 13:13:50 +0200 Subject: [PATCH 844/991] hwmon: ltc2991: fix register bits defines In the LTC2991, V5 and V6 channels use the low nibble of the "V5, V6, V7, and V8 Control Register" for configuration, but currently, the high nibble is defined. This patch changes the defines to use the low nibble. Fixes: 2b9ea4262ae9 ("hwmon: Add driver for ltc2991") Signed-off-by: Pawel Dembicki Message-ID: <20240830111349.30531-1-paweldembicki@gmail.com> Signed-off-by: Guenter Roeck --- drivers/hwmon/ltc2991.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/hwmon/ltc2991.c b/drivers/hwmon/ltc2991.c index 573cd8f5721bd6..7ca139e4b6aff0 100644 --- a/drivers/hwmon/ltc2991.c +++ b/drivers/hwmon/ltc2991.c @@ -42,9 +42,9 @@ #define LTC2991_V7_V8_FILT_EN BIT(7) #define LTC2991_V7_V8_TEMP_EN BIT(5) #define LTC2991_V7_V8_DIFF_EN BIT(4) -#define LTC2991_V5_V6_FILT_EN BIT(7) -#define LTC2991_V5_V6_TEMP_EN BIT(5) -#define LTC2991_V5_V6_DIFF_EN BIT(4) +#define LTC2991_V5_V6_FILT_EN BIT(3) +#define LTC2991_V5_V6_TEMP_EN BIT(1) +#define LTC2991_V5_V6_DIFF_EN BIT(0) #define LTC2991_REPEAT_ACQ_EN BIT(4) #define LTC2991_T_INT_FILT_EN BIT(3) From 7f12a963b65872fda1219f065c1cc1b1b9a806e8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 1 Sep 2024 15:53:03 -0400 Subject: [PATCH 845/991] bcachefs: fix rebalance accounting Fixes: 49aa7830396b ("bcachefs: Fix rebalance_work accounting") Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index a2274429e7f4ad..20219c1e6ddf9a 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -876,7 +876,7 @@ int bch2_trigger_extent(struct btree_trans *trans, need_rebalance_delta -= s != 0; need_rebalance_sectors_delta -= s; - s = bch2_bkey_sectors_need_rebalance(c, old); + s = bch2_bkey_sectors_need_rebalance(c, new.s_c); need_rebalance_delta += s != 0; need_rebalance_sectors_delta += s; From b808f629215685c1941b1cd567c7b7ccb3c90278 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 9 Aug 2024 13:25:11 +0500 Subject: [PATCH 846/991] selftests: mm: fix build errors on armhf The __NR_mmap isn't found on armhf. The mmap() is commonly available system call and its wrapper is present on all architectures. So it should be used directly. It solves problem for armhf and doesn't create problem for other architectures. Remove sys_mmap() functions as they aren't doing anything else other than calling mmap(). There is no need to set errno = 0 manually as glibc always resets it. For reference errors are as following: CC seal_elf seal_elf.c: In function 'sys_mmap': seal_elf.c:39:33: error: '__NR_mmap' undeclared (first use in this function) 39 | sret = (void *) syscall(__NR_mmap, addr, len, prot, | ^~~~~~~~~ mseal_test.c: In function 'sys_mmap': mseal_test.c:90:33: error: '__NR_mmap' undeclared (first use in this function) 90 | sret = (void *) syscall(__NR_mmap, addr, len, prot, | ^~~~~~~~~ Link: https://lkml.kernel.org/r/20240809082511.497266-1-usama.anjum@collabora.com Fixes: 4926c7a52de7 ("selftest mm/mseal memory sealing") Signed-off-by: Muhammad Usama Anjum Cc: Jeff Xu Cc: Kees Cook Cc: Liam R. Howlett Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mseal_test.c | 37 +++++++++---------------- tools/testing/selftests/mm/seal_elf.c | 13 +-------- 2 files changed, 14 insertions(+), 36 deletions(-) diff --git a/tools/testing/selftests/mm/mseal_test.c b/tools/testing/selftests/mm/mseal_test.c index a818f010de479a..bfcea5cf9a4842 100644 --- a/tools/testing/selftests/mm/mseal_test.c +++ b/tools/testing/selftests/mm/mseal_test.c @@ -81,17 +81,6 @@ static int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, return sret; } -static void *sys_mmap(void *addr, unsigned long len, unsigned long prot, - unsigned long flags, unsigned long fd, unsigned long offset) -{ - void *sret; - - errno = 0; - sret = (void *) syscall(__NR_mmap, addr, len, prot, - flags, fd, offset); - return sret; -} - static int sys_munmap(void *ptr, size_t size) { int sret; @@ -172,7 +161,7 @@ static void setup_single_address(int size, void **ptrOut) { void *ptr; - ptr = sys_mmap(NULL, size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + ptr = mmap(NULL, size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); *ptrOut = ptr; } @@ -181,7 +170,7 @@ static void setup_single_address_rw(int size, void **ptrOut) void *ptr; unsigned long mapflags = MAP_ANONYMOUS | MAP_PRIVATE; - ptr = sys_mmap(NULL, size, PROT_READ | PROT_WRITE, mapflags, -1, 0); + ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, mapflags, -1, 0); *ptrOut = ptr; } @@ -205,7 +194,7 @@ bool seal_support(void) void *ptr; unsigned long page_size = getpagesize(); - ptr = sys_mmap(NULL, page_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + ptr = mmap(NULL, page_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (ptr == (void *) -1) return false; @@ -481,8 +470,8 @@ static void test_seal_zero_address(void) int prot; /* use mmap to change protection. */ - ptr = sys_mmap(0, size, PROT_NONE, - MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + ptr = mmap(0, size, PROT_NONE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); FAIL_TEST_IF_FALSE(ptr == 0); size = get_vma_size(ptr, &prot); @@ -1209,8 +1198,8 @@ static void test_seal_mmap_overwrite_prot(bool seal) } /* use mmap to change protection. */ - ret2 = sys_mmap(ptr, size, PROT_NONE, - MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + ret2 = mmap(ptr, size, PROT_NONE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); @@ -1240,8 +1229,8 @@ static void test_seal_mmap_expand(bool seal) } /* use mmap to expand. */ - ret2 = sys_mmap(ptr, size, PROT_READ, - MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + ret2 = mmap(ptr, size, PROT_READ, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); @@ -1268,8 +1257,8 @@ static void test_seal_mmap_shrink(bool seal) } /* use mmap to shrink. */ - ret2 = sys_mmap(ptr, 8 * page_size, PROT_READ, - MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + ret2 = mmap(ptr, 8 * page_size, PROT_READ, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); @@ -1650,7 +1639,7 @@ static void test_seal_discard_ro_anon_on_filebacked(bool seal) ret = fallocate(fd, 0, 0, size); FAIL_TEST_IF_FALSE(!ret); - ptr = sys_mmap(NULL, size, PROT_READ, mapflags, fd, 0); + ptr = mmap(NULL, size, PROT_READ, mapflags, fd, 0); FAIL_TEST_IF_FALSE(ptr != MAP_FAILED); if (seal) { @@ -1680,7 +1669,7 @@ static void test_seal_discard_ro_anon_on_shared(bool seal) int ret; unsigned long mapflags = MAP_ANONYMOUS | MAP_SHARED; - ptr = sys_mmap(NULL, size, PROT_READ, mapflags, -1, 0); + ptr = mmap(NULL, size, PROT_READ, mapflags, -1, 0); FAIL_TEST_IF_FALSE(ptr != (void *)-1); if (seal) { diff --git a/tools/testing/selftests/mm/seal_elf.c b/tools/testing/selftests/mm/seal_elf.c index 7aa1366063e4e4..d9f8ba8d5050b7 100644 --- a/tools/testing/selftests/mm/seal_elf.c +++ b/tools/testing/selftests/mm/seal_elf.c @@ -30,17 +30,6 @@ static int sys_mseal(void *start, size_t len) return sret; } -static void *sys_mmap(void *addr, unsigned long len, unsigned long prot, - unsigned long flags, unsigned long fd, unsigned long offset) -{ - void *sret; - - errno = 0; - sret = (void *) syscall(__NR_mmap, addr, len, prot, - flags, fd, offset); - return sret; -} - static inline int sys_mprotect(void *ptr, size_t size, unsigned long prot) { int sret; @@ -56,7 +45,7 @@ static bool seal_support(void) void *ptr; unsigned long page_size = getpagesize(); - ptr = sys_mmap(NULL, page_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + ptr = mmap(NULL, page_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (ptr == (void *) -1) return false; From 3e3de7947c751509027d26b679ecd243bc9db255 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 12 Aug 2024 18:16:06 +0100 Subject: [PATCH 847/991] mm: vmalloc: ensure vmap_block is initialised before adding to queue Commit 8c61291fd850 ("mm: fix incorrect vbq reference in purge_fragmented_block") extended the 'vmap_block' structure to contain a 'cpu' field which is set at allocation time to the id of the initialising CPU. When a new 'vmap_block' is being instantiated by new_vmap_block(), the partially initialised structure is added to the local 'vmap_block_queue' xarray before the 'cpu' field has been initialised. If another CPU is concurrently walking the xarray (e.g. via vm_unmap_aliases()), then it may perform an out-of-bounds access to the remote queue thanks to an uninitialised index. This has been observed as UBSAN errors in Android: | Internal error: UBSAN: array index out of bounds: 00000000f2005512 [#1] PREEMPT SMP | | Call trace: | purge_fragmented_block+0x204/0x21c | _vm_unmap_aliases+0x170/0x378 | vm_unmap_aliases+0x1c/0x28 | change_memory_common+0x1dc/0x26c | set_memory_ro+0x18/0x24 | module_enable_ro+0x98/0x238 | do_init_module+0x1b0/0x310 Move the initialisation of 'vb->cpu' in new_vmap_block() ahead of the addition to the xarray. Link: https://lkml.kernel.org/r/20240812171606.17486-1-will@kernel.org Fixes: 8c61291fd850 ("mm: fix incorrect vbq reference in purge_fragmented_block") Signed-off-by: Will Deacon Reviewed-by: Baoquan He Reviewed-by: Uladzislau Rezki (Sony) Cc: Zhaoyang Huang Cc: Hailong.Liu Cc: Christoph Hellwig Cc: Lorenzo Stoakes Cc: Thomas Gleixner Cc: Signed-off-by: Andrew Morton --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index af2de36549d608..ac53d46ac8a5c3 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2626,6 +2626,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) vb->dirty_max = 0; bitmap_set(vb->used_map, 0, (1UL << order)); INIT_LIST_HEAD(&vb->free_list); + vb->cpu = raw_smp_processor_id(); xa = addr_to_vb_xa(va->va_start); vb_idx = addr_to_vb_idx(va->va_start); @@ -2642,7 +2643,6 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) * integrity together with list_for_each_rcu from read * side. */ - vb->cpu = raw_smp_processor_id(); vbq = per_cpu_ptr(&vmap_block_queue, vb->cpu); spin_lock(&vbq->lock); list_add_tail_rcu(&vb->free_list, &vbq->free); From 71c186efc1b2cf1aeabfeff3b9bd5ac4c5ac14d8 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 13 Aug 2024 22:25:21 +0200 Subject: [PATCH 848/991] userfaultfd: fix checks for huge PMDs Patch series "userfaultfd: fix races around pmd_trans_huge() check", v2. The pmd_trans_huge() code in mfill_atomic() is wrong in three different ways depending on kernel version: 1. The pmd_trans_huge() check is racy and can lead to a BUG_ON() (if you hit the right two race windows) - I've tested this in a kernel build with some extra mdelay() calls. See the commit message for a description of the race scenario. On older kernels (before 6.5), I think the same bug can even theoretically lead to accessing transhuge page contents as a page table if you hit the right 5 narrow race windows (I haven't tested this case). 2. As pointed out by Qi Zheng, pmd_trans_huge() is not sufficient for detecting PMDs that don't point to page tables. On older kernels (before 6.5), you'd just have to win a single fairly wide race to hit this. I've tested this on 6.1 stable by racing migration (with a mdelay() patched into try_to_migrate()) against UFFDIO_ZEROPAGE - on my x86 VM, that causes a kernel oops in ptlock_ptr(). 3. On newer kernels (>=6.5), for shmem mappings, khugepaged is allowed to yank page tables out from under us (though I haven't tested that), so I think the BUG_ON() checks in mfill_atomic() are just wrong. I decided to write two separate fixes for these (one fix for bugs 1+2, one fix for bug 3), so that the first fix can be backported to kernels affected by bugs 1+2. This patch (of 2): This fixes two issues. I discovered that the following race can occur: mfill_atomic other thread ============ ============ pmdp_get_lockless() [reads none pmd] __pte_alloc [no-op] BUG_ON(pmd_none(*dst_pmd)) I have experimentally verified this in a kernel with extra mdelay() calls; the BUG_ON(pmd_none(*dst_pmd)) triggers. On kernels newer than commit 0d940a9b270b ("mm/pgtable: allow pte_offset_map[_lock]() to fail"), this can't lead to anything worse than a BUG_ON(), since the page table access helpers are actually designed to deal with page tables concurrently disappearing; but on older kernels (<=6.4), I think we could probably theoretically race past the two BUG_ON() checks and end up treating a hugepage as a page table. The second issue is that, as Qi Zheng pointed out, there are other types of huge PMDs that pmd_trans_huge() can't catch: devmap PMDs and swap PMDs (in particular, migration PMDs). On <=6.4, this is worse than the first issue: If mfill_atomic() runs on a PMD that contains a migration entry (which just requires winning a single, fairly wide race), it will pass the PMD to pte_offset_map_lock(), which assumes that the PMD points to a page table. Breakage follows: First, the kernel tries to take the PTE lock (which will crash or maybe worse if there is no "struct page" for the address bits in the migration entry PMD - I think at least on X86 there usually is no corresponding "struct page" thanks to the PTE inversion mitigation, amd64 looks different). If that didn't crash, the kernel would next try to write a PTE into what it wrongly thinks is a page table. As part of fixing these issues, get rid of the check for pmd_trans_huge() before __pte_alloc() - that's redundant, we're going to have to check for that after the __pte_alloc() anyway. Backport note: pmdp_get_lockless() is pmd_read_atomic() in older kernels. Link: https://lkml.kernel.org/r/20240813-uffd-thp-flip-fix-v2-0-5efa61078a41@google.com Link: https://lkml.kernel.org/r/20240813-uffd-thp-flip-fix-v2-1-5efa61078a41@google.com Fixes: c1a4de99fada ("userfaultfd: mcopy_atomic|mfill_zeropage: UFFDIO_COPY|UFFDIO_ZEROPAGE preparation") Signed-off-by: Jann Horn Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jann Horn Cc: Pavel Emelyanov Cc: Qi Zheng Cc: Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index e54e5c8907fa22..290b2a0d84ac57 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -787,21 +787,23 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, } dst_pmdval = pmdp_get_lockless(dst_pmd); - /* - * If the dst_pmd is mapped as THP don't - * override it and just be strict. - */ - if (unlikely(pmd_trans_huge(dst_pmdval))) { - err = -EEXIST; - break; - } if (unlikely(pmd_none(dst_pmdval)) && unlikely(__pte_alloc(dst_mm, dst_pmd))) { err = -ENOMEM; break; } - /* If an huge pmd materialized from under us fail */ - if (unlikely(pmd_trans_huge(*dst_pmd))) { + dst_pmdval = pmdp_get_lockless(dst_pmd); + /* + * If the dst_pmd is THP don't override it and just be strict. + * (This includes the case where the PMD used to be THP and + * changed back to none after __pte_alloc().) + */ + if (unlikely(!pmd_present(dst_pmdval) || pmd_trans_huge(dst_pmdval) || + pmd_devmap(dst_pmdval))) { + err = -EEXIST; + break; + } + if (unlikely(pmd_bad(dst_pmdval))) { err = -EFAULT; break; } From 4828d207dc5161dc7ddf9a4f6dcfd80c7dd7d20a Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 13 Aug 2024 22:25:22 +0200 Subject: [PATCH 849/991] userfaultfd: don't BUG_ON() if khugepaged yanks our page table Since khugepaged was changed to allow retracting page tables in file mappings without holding the mmap lock, these BUG_ON()s are wrong - get rid of them. We could also remove the preceding "if (unlikely(...))" block, but then we could reach pte_offset_map_lock() with transhuge pages not just for file mappings but also for anonymous mappings - which would probably be fine but I think is not necessarily expected. Link: https://lkml.kernel.org/r/20240813-uffd-thp-flip-fix-v2-2-5efa61078a41@google.com Fixes: 1d65b771bc08 ("mm/khugepaged: retract_page_tables() without mmap or vma lock") Signed-off-by: Jann Horn Reviewed-by: Qi Zheng Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Pavel Emelyanov Cc: Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 290b2a0d84ac57..acc56c75ba9947 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -807,9 +807,10 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, err = -EFAULT; break; } - - BUG_ON(pmd_none(*dst_pmd)); - BUG_ON(pmd_trans_huge(*dst_pmd)); + /* + * For shmem mappings, khugepaged is allowed to remove page + * tables under us; pte_offset_map_lock() will deal with that. + */ err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, src_addr, flags, &folio); From 683408258917541bdb294cd717c210a04381931e Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sun, 11 Aug 2024 19:03:20 +0900 Subject: [PATCH 850/991] nilfs2: protect references to superblock parameters exposed in sysfs The superblock buffers of nilfs2 can not only be overwritten at runtime for modifications/repairs, but they are also regularly swapped, replaced during resizing, and even abandoned when degrading to one side due to backing device issues. So, accessing them requires mutual exclusion using the reader/writer semaphore "nilfs->ns_sem". Some sysfs attribute show methods read this superblock buffer without the necessary mutual exclusion, which can cause problems with pointer dereferencing and memory access, so fix it. Link: https://lkml.kernel.org/r/20240811100320.9913-1-konishi.ryusuke@gmail.com Fixes: da7141fb78db ("nilfs2: add /sys/fs/nilfs2/ group") Signed-off-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/sysfs.c | 43 +++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index a5569b7f47a39d..14868a3dd592ca 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -836,9 +836,15 @@ ssize_t nilfs_dev_revision_show(struct nilfs_dev_attr *attr, struct the_nilfs *nilfs, char *buf) { - struct nilfs_super_block **sbp = nilfs->ns_sbp; - u32 major = le32_to_cpu(sbp[0]->s_rev_level); - u16 minor = le16_to_cpu(sbp[0]->s_minor_rev_level); + struct nilfs_super_block *raw_sb; + u32 major; + u16 minor; + + down_read(&nilfs->ns_sem); + raw_sb = nilfs->ns_sbp[0]; + major = le32_to_cpu(raw_sb->s_rev_level); + minor = le16_to_cpu(raw_sb->s_minor_rev_level); + up_read(&nilfs->ns_sem); return sysfs_emit(buf, "%d.%d\n", major, minor); } @@ -856,8 +862,13 @@ ssize_t nilfs_dev_device_size_show(struct nilfs_dev_attr *attr, struct the_nilfs *nilfs, char *buf) { - struct nilfs_super_block **sbp = nilfs->ns_sbp; - u64 dev_size = le64_to_cpu(sbp[0]->s_dev_size); + struct nilfs_super_block *raw_sb; + u64 dev_size; + + down_read(&nilfs->ns_sem); + raw_sb = nilfs->ns_sbp[0]; + dev_size = le64_to_cpu(raw_sb->s_dev_size); + up_read(&nilfs->ns_sem); return sysfs_emit(buf, "%llu\n", dev_size); } @@ -879,9 +890,15 @@ ssize_t nilfs_dev_uuid_show(struct nilfs_dev_attr *attr, struct the_nilfs *nilfs, char *buf) { - struct nilfs_super_block **sbp = nilfs->ns_sbp; + struct nilfs_super_block *raw_sb; + ssize_t len; - return sysfs_emit(buf, "%pUb\n", sbp[0]->s_uuid); + down_read(&nilfs->ns_sem); + raw_sb = nilfs->ns_sbp[0]; + len = sysfs_emit(buf, "%pUb\n", raw_sb->s_uuid); + up_read(&nilfs->ns_sem); + + return len; } static @@ -889,10 +906,16 @@ ssize_t nilfs_dev_volume_name_show(struct nilfs_dev_attr *attr, struct the_nilfs *nilfs, char *buf) { - struct nilfs_super_block **sbp = nilfs->ns_sbp; + struct nilfs_super_block *raw_sb; + ssize_t len; + + down_read(&nilfs->ns_sem); + raw_sb = nilfs->ns_sbp[0]; + len = scnprintf(buf, sizeof(raw_sb->s_volume_name), "%s\n", + raw_sb->s_volume_name); + up_read(&nilfs->ns_sem); - return scnprintf(buf, sizeof(sbp[0]->s_volume_name), "%s\n", - sbp[0]->s_volume_name); + return len; } static const char dev_readme_str[] = From 5787fcaab9eb5930f5378d6a1dd03d916d146622 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sat, 10 Aug 2024 15:52:42 +0900 Subject: [PATCH 851/991] nilfs2: fix missing cleanup on rollforward recovery error In an error injection test of a routine for mount-time recovery, KASAN found a use-after-free bug. It turned out that if data recovery was performed using partial logs created by dsync writes, but an error occurred before starting the log writer to create a recovered checkpoint, the inodes whose data had been recovered were left in the ns_dirty_files list of the nilfs object and were not freed. Fix this issue by cleaning up inodes that have read the recovery data if the recovery routine fails midway before the log writer starts. Link: https://lkml.kernel.org/r/20240810065242.3701-1-konishi.ryusuke@gmail.com Fixes: 0f3e1c7f23f8 ("nilfs2: recovery functions") Signed-off-by: Ryusuke Konishi Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/recovery.c | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index b638dc06df2f72..61e25a980f73ef 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -715,6 +715,33 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, brelse(bh); } +/** + * nilfs_abort_roll_forward - cleaning up after a failed rollforward recovery + * @nilfs: nilfs object + */ +static void nilfs_abort_roll_forward(struct the_nilfs *nilfs) +{ + struct nilfs_inode_info *ii, *n; + LIST_HEAD(head); + + /* Abandon inodes that have read recovery data */ + spin_lock(&nilfs->ns_inode_lock); + list_splice_init(&nilfs->ns_dirty_files, &head); + spin_unlock(&nilfs->ns_inode_lock); + if (list_empty(&head)) + return; + + set_nilfs_purging(nilfs); + list_for_each_entry_safe(ii, n, &head, i_dirty) { + spin_lock(&nilfs->ns_inode_lock); + list_del_init(&ii->i_dirty); + spin_unlock(&nilfs->ns_inode_lock); + + iput(&ii->vfs_inode); + } + clear_nilfs_purging(nilfs); +} + /** * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint * @nilfs: nilfs object @@ -773,15 +800,19 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, if (unlikely(err)) { nilfs_err(sb, "error %d writing segment for recovery", err); - goto failed; + goto put_root; } nilfs_finish_roll_forward(nilfs, ri); } - failed: +put_root: nilfs_put_root(root); return err; + +failed: + nilfs_abort_roll_forward(nilfs); + goto put_root; } /** From 6576dd6695f2afca3f4954029ac4a64f82ba60ab Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Wed, 14 Aug 2024 19:11:19 +0900 Subject: [PATCH 852/991] nilfs2: fix state management in error path of log writing function After commit a694291a6211 ("nilfs2: separate wait function from nilfs_segctor_write") was applied, the log writing function nilfs_segctor_do_construct() was able to issue I/O requests continuously even if user data blocks were split into multiple logs across segments, but two potential flaws were introduced in its error handling. First, if nilfs_segctor_begin_construction() fails while creating the second or subsequent logs, the log writing function returns without calling nilfs_segctor_abort_construction(), so the writeback flag set on pages/folios will remain uncleared. This causes page cache operations to hang waiting for the writeback flag. For example, truncate_inode_pages_final(), which is called via nilfs_evict_inode() when an inode is evicted from memory, will hang. Second, the NILFS_I_COLLECTED flag set on normal inodes remain uncleared. As a result, if the next log write involves checkpoint creation, that's fine, but if a partial log write is performed that does not, inodes with NILFS_I_COLLECTED set are erroneously removed from the "sc_dirty_files" list, and their data and b-tree blocks may not be written to the device, corrupting the block mapping. Fix these issues by uniformly calling nilfs_segctor_abort_construction() on failure of each step in the loop in nilfs_segctor_do_construct(), having it clean up logs and segment usages according to progress, and correcting the conditions for calling nilfs_redirty_inodes() to ensure that the NILFS_I_COLLECTED flag is cleared. Link: https://lkml.kernel.org/r/20240814101119.4070-1-konishi.ryusuke@gmail.com Fixes: a694291a6211 ("nilfs2: separate wait function from nilfs_segctor_write") Signed-off-by: Ryusuke Konishi Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/segment.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 0ca3110d638689..871ec35ea8e8a1 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1812,6 +1812,9 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci, nilfs_abort_logs(&logs, ret ? : err); list_splice_tail_init(&sci->sc_segbufs, &logs); + if (list_empty(&logs)) + return; /* if the first segment buffer preparation failed */ + nilfs_cancel_segusage(&logs, nilfs->ns_sufile); nilfs_free_incomplete_logs(&logs, nilfs); @@ -2056,7 +2059,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) err = nilfs_segctor_begin_construction(sci, nilfs); if (unlikely(err)) - goto out; + goto failed; /* Update time stamp */ sci->sc_seg_ctime = ktime_get_real_seconds(); @@ -2120,10 +2123,9 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) return err; failed_to_write: - if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) - nilfs_redirty_inodes(&sci->sc_dirty_files); - failed: + if (mode == SC_LSEG_SR && nilfs_sc_cstage_get(sci) >= NILFS_ST_IFILE) + nilfs_redirty_inodes(&sci->sc_dirty_files); if (nilfs_doing_gc()) nilfs_redirty_inodes(&sci->sc_gc_inodes); nilfs_segctor_abort_construction(sci, nilfs, err); From ab7ca09520e9c41c219a4427fe0dae24024bfe7f Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Fri, 16 Aug 2024 09:33:36 +0800 Subject: [PATCH 853/991] mm/slub: add check for s->flags in the alloc_tagging_slab_free_hook When enable CONFIG_MEMCG & CONFIG_KFENCE & CONFIG_KMEMLEAK, the following warning always occurs,This is because the following call stack occurred: mem_pool_alloc kmem_cache_alloc_noprof slab_alloc_node kfence_alloc Once the kfence allocation is successful,slab->obj_exts will not be empty, because it has already been assigned a value in kfence_init_pool. Since in the prepare_slab_obj_exts_hook function,we perform a check for s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE),the alloc_tag_add function will not be called as a result.Therefore,ref->ct remains NULL. However,when we call mem_pool_free,since obj_ext is not empty, it eventually leads to the alloc_tag_sub scenario being invoked. This is where the warning occurs. So we should add corresponding checks in the alloc_tagging_slab_free_hook. For __GFP_NO_OBJ_EXT case,I didn't see the specific case where it's using kfence,so I won't add the corresponding check in alloc_tagging_slab_free_hook for now. [ 3.734349] ------------[ cut here ]------------ [ 3.734807] alloc_tag was not set [ 3.735129] WARNING: CPU: 4 PID: 40 at ./include/linux/alloc_tag.h:130 kmem_cache_free+0x444/0x574 [ 3.735866] Modules linked in: autofs4 [ 3.736211] CPU: 4 UID: 0 PID: 40 Comm: ksoftirqd/4 Tainted: G W 6.11.0-rc3-dirty #1 [ 3.736969] Tainted: [W]=WARN [ 3.737258] Hardware name: QEMU KVM Virtual Machine, BIOS unknown 2/2/2022 [ 3.737875] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 3.738501] pc : kmem_cache_free+0x444/0x574 [ 3.738951] lr : kmem_cache_free+0x444/0x574 [ 3.739361] sp : ffff80008357bb60 [ 3.739693] x29: ffff80008357bb70 x28: 0000000000000000 x27: 0000000000000000 [ 3.740338] x26: ffff80008207f000 x25: ffff000b2eb2fd60 x24: ffff0000c0005700 [ 3.740982] x23: ffff8000804229e4 x22: ffff800082080000 x21: ffff800081756000 [ 3.741630] x20: fffffd7ff8253360 x19: 00000000000000a8 x18: ffffffffffffffff [ 3.742274] x17: ffff800ab327f000 x16: ffff800083398000 x15: ffff800081756df0 [ 3.742919] x14: 0000000000000000 x13: 205d344320202020 x12: 5b5d373038343337 [ 3.743560] x11: ffff80008357b650 x10: 000000000000005d x9 : 00000000ffffffd0 [ 3.744231] x8 : 7f7f7f7f7f7f7f7f x7 : ffff80008237bad0 x6 : c0000000ffff7fff [ 3.744907] x5 : ffff80008237ba78 x4 : ffff8000820bbad0 x3 : 0000000000000001 [ 3.745580] x2 : 68d66547c09f7800 x1 : 68d66547c09f7800 x0 : 0000000000000000 [ 3.746255] Call trace: [ 3.746530] kmem_cache_free+0x444/0x574 [ 3.746931] mem_pool_free+0x44/0xf4 [ 3.747306] free_object_rcu+0xc8/0xdc [ 3.747693] rcu_do_batch+0x234/0x8a4 [ 3.748075] rcu_core+0x230/0x3e4 [ 3.748424] rcu_core_si+0x14/0x1c [ 3.748780] handle_softirqs+0x134/0x378 [ 3.749189] run_ksoftirqd+0x70/0x9c [ 3.749560] smpboot_thread_fn+0x148/0x22c [ 3.749978] kthread+0x10c/0x118 [ 3.750323] ret_from_fork+0x10/0x20 [ 3.750696] ---[ end trace 0000000000000000 ]--- Link: https://lkml.kernel.org/r/20240816013336.17505-1-hao.ge@linux.dev Fixes: 4b8736964640 ("mm/slab: add allocation accounting into slab allocation and free paths") Signed-off-by: Hao Ge Cc: Christoph Lameter Cc: David Rientjes Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Kees Cook Cc: Kent Overstreet Cc: Pekka Enberg Cc: Roman Gushchin Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/slub.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/slub.c b/mm/slub.c index c9d8a2497fd65e..a77f354f83251e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2116,6 +2116,10 @@ alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, if (!mem_alloc_profiling_enabled()) return; + /* slab->obj_exts might not be NULL if it was created for MEMCG accounting. */ + if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE)) + return; + obj_exts = slab_obj_exts(slab); if (!obj_exts) return; From 6dacd79d28842ff01f18b4900d897741aac5999e Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Mon, 5 Aug 2024 17:07:50 +0200 Subject: [PATCH 854/991] kexec_file: fix elfcorehdr digest exclusion when CONFIG_CRASH_HOTPLUG=y Fix the condition to exclude the elfcorehdr segment from the SHA digest calculation. The j iterator is an index into the output sha_regions[] array, not into the input image->segment[] array. Once it reaches image->elfcorehdr_index, all subsequent segments are excluded. Besides, if the purgatory segment precedes the elfcorehdr segment, the elfcorehdr may be wrongly included in the calculation. Link: https://lkml.kernel.org/r/20240805150750.170739-1-petr.tesarik@suse.com Fixes: f7cc804a9fd4 ("kexec: exclude elfcorehdr from the segment digest") Signed-off-by: Petr Tesarik Acked-by: Baoquan He Cc: Eric Biederman Cc: Hari Bathini Cc: Sourabh Jain Cc: Eric DeVolder Cc: Signed-off-by: Andrew Morton --- kernel/kexec_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 3d64290d24c9a2..3eedb8c226ad8f 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -752,7 +752,7 @@ static int kexec_calculate_store_digests(struct kimage *image) #ifdef CONFIG_CRASH_HOTPLUG /* Exclude elfcorehdr segment to allow future changes via hotplug */ - if (j == image->elfcorehdr_index) + if (i == image->elfcorehdr_index) continue; #endif From f806de88d8f7f8191afd0fd9b94db4cd058e7d4f Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 20 Aug 2024 13:54:17 -0400 Subject: [PATCH 855/991] maple_tree: remove rcu_read_lock() from mt_validate() The write lock should be held when validating the tree to avoid updates racing with checks. Holding the rcu read lock during a large tree validation may also cause a prolonged rcu read window and "rcu_preempt detected stalls" warnings. Link: https://lore.kernel.org/all/0000000000001d12d4062005aea1@google.com/ Link: https://lkml.kernel.org/r/20240820175417.2782532-1-Liam.Howlett@oracle.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam R. Howlett Reported-by: syzbot+036af2f0c7338a33b0cd@syzkaller.appspotmail.com Cc: Hillf Danton Cc: Matthew Wilcox Cc: "Paul E. McKenney" Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index aa3a5df15b8ef9..6df3a8b95808ab 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -7566,14 +7566,14 @@ static void mt_validate_nulls(struct maple_tree *mt) * 2. The gap is correctly set in the parents */ void mt_validate(struct maple_tree *mt) + __must_hold(mas->tree->ma_lock) { unsigned char end; MA_STATE(mas, mt, 0, 0); - rcu_read_lock(); mas_start(&mas); if (!mas_is_active(&mas)) - goto done; + return; while (!mte_is_leaf(mas.node)) mas_descend(&mas); @@ -7594,9 +7594,6 @@ void mt_validate(struct maple_tree *mt) mas_dfs_postorder(&mas, ULONG_MAX); } mt_validate_nulls(mt); -done: - rcu_read_unlock(); - } EXPORT_SYMBOL_GPL(mt_validate); From bfe0857c20c663fcc1592fa4e3a61ca12b07dac9 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Wed, 21 Aug 2024 20:26:07 +0100 Subject: [PATCH 856/991] Revert "mm: skip CMA pages when they are not available" This reverts commit 5da226dbfce3 ("mm: skip CMA pages when they are not available") and b7108d66318a ("Multi-gen LRU: skip CMA pages when they are not eligible"). lruvec->lru_lock is highly contended and is held when calling isolate_lru_folios. If the lru has a large number of CMA folios consecutively, while the allocation type requested is not MIGRATE_MOVABLE, isolate_lru_folios can hold the lock for a very long time while it skips those. For FIO workload, ~150million order=0 folios were skipped to isolate a few ZONE_DMA folios [1]. This can cause lockups [1] and high memory pressure for extended periods of time [2]. Remove skipping CMA for MGLRU as well, as it was introduced in sort_folio for the same resaon as 5da226dbfce3a2f44978c2c7cf88166e69a6788b. [1] https://lore.kernel.org/all/CAOUHufbkhMZYz20aM_3rHZ3OcK4m2puji2FGpUpn_-DevGk3Kg@mail.gmail.com/ [2] https://lore.kernel.org/all/ZrssOrcJIDy8hacI@gmail.com/ [usamaarif642@gmail.com: also revert b7108d66318a, per Johannes] Link: https://lkml.kernel.org/r/9060a32d-b2d7-48c0-8626-1db535653c54@gmail.com Link: https://lkml.kernel.org/r/357ac325-4c61-497a-92a3-bdbd230d5ec9@gmail.com Link: https://lkml.kernel.org/r/9060a32d-b2d7-48c0-8626-1db535653c54@gmail.com Fixes: 5da226dbfce3 ("mm: skip CMA pages when they are not available") Signed-off-by: Usama Arif Acked-by: Johannes Weiner Cc: Bharata B Rao Cc: Breno Leitao Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Rik van Riel Cc: Vlastimil Babka Cc: Yu Zhao Cc: Zhaoyang Huang Cc: Zhaoyang Huang Cc: Signed-off-by: Andrew Morton --- mm/vmscan.c | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index cfa839284b923e..bd489c1af22893 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1604,25 +1604,6 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, } -#ifdef CONFIG_CMA -/* - * It is waste of effort to scan and reclaim CMA pages if it is not available - * for current allocation context. Kswapd can not be enrolled as it can not - * distinguish this scenario by using sc->gfp_mask = GFP_KERNEL - */ -static bool skip_cma(struct folio *folio, struct scan_control *sc) -{ - return !current_is_kswapd() && - gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE && - folio_migratetype(folio) == MIGRATE_CMA; -} -#else -static bool skip_cma(struct folio *folio, struct scan_control *sc) -{ - return false; -} -#endif - /* * Isolating page from the lruvec to fill in @dst list by nr_to_scan times. * @@ -1669,8 +1650,7 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan, nr_pages = folio_nr_pages(folio); total_scan += nr_pages; - if (folio_zonenum(folio) > sc->reclaim_idx || - skip_cma(folio, sc)) { + if (folio_zonenum(folio) > sc->reclaim_idx) { nr_skipped[folio_zonenum(folio)] += nr_pages; move_to = &folios_skipped; goto move; @@ -4320,7 +4300,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c } /* ineligible */ - if (zone > sc->reclaim_idx || skip_cma(folio, sc)) { + if (zone > sc->reclaim_idx) { gen = folio_inc_gen(lruvec, folio, false); list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); return true; From a3f6a89c834a4cba0f881da21307b26de3796133 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 23 Aug 2024 17:38:50 +0100 Subject: [PATCH 857/991] scripts: fix gfp-translate after ___GFP_*_BITS conversion to an enum MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Richard reports that since 772dd0342727c ("mm: enumerate all gfp flags"), gfp-translate is broken, as the bit numbers are implicit, leaving the shell script unable to extract them. Even more, some bits are now at a variable location, making it double extra hard to parse using a simple shell script. Use a brute-force approach to the problem by generating a small C stub that will use the enum to dump the interesting bits. As an added bonus, we are now able to identify invalid bits for a given configuration. As an added drawback, we cannot parse include files that predate this change anymore. Tough luck. Link: https://lkml.kernel.org/r/20240823163850.3791201-1-maz@kernel.org Fixes: 772dd0342727 ("mm: enumerate all gfp flags") Signed-off-by: Marc Zyngier Reported-by: Richard Weinberger Cc: Petr Tesařík Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- scripts/gfp-translate | 66 ++++++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 17 deletions(-) diff --git a/scripts/gfp-translate b/scripts/gfp-translate index 6c9aed17cf563d..8385ae0d5af937 100755 --- a/scripts/gfp-translate +++ b/scripts/gfp-translate @@ -62,25 +62,57 @@ if [ "$GFPMASK" = "none" ]; then fi # Extract GFP flags from the kernel source -TMPFILE=`mktemp -t gfptranslate-XXXXXX` || exit 1 -grep -q ___GFP $SOURCE/include/linux/gfp_types.h -if [ $? -eq 0 ]; then - grep "^#define ___GFP" $SOURCE/include/linux/gfp_types.h | sed -e 's/u$//' | grep -v GFP_BITS > $TMPFILE -else - grep "^#define __GFP" $SOURCE/include/linux/gfp_types.h | sed -e 's/(__force gfp_t)//' | sed -e 's/u)/)/' | grep -v GFP_BITS | sed -e 's/)\//) \//' > $TMPFILE -fi +TMPFILE=`mktemp -t gfptranslate-XXXXXX.c` || exit 1 -# Parse the flags -IFS=" -" echo Source: $SOURCE echo Parsing: $GFPMASK -for LINE in `cat $TMPFILE`; do - MASK=`echo $LINE | awk '{print $3}'` - if [ $(($GFPMASK&$MASK)) -ne 0 ]; then - echo $LINE - fi -done -rm -f $TMPFILE +( + cat < +#include + +// Try to fool compiler.h into not including extra stuff +#define __ASSEMBLY__ 1 + +#include +#include + +static const char *masks[] = { +EOF + + sed -nEe 's/^[[:space:]]+(___GFP_.*)_BIT,.*$/\1/p' $SOURCE/include/linux/gfp_types.h | + while read b; do + cat < 0) + [${b}_BIT] = "$b", +#endif +EOF + done + + cat < $TMPFILE + +${CC:-gcc} -Wall -o ${TMPFILE}.bin -I $SOURCE/include $TMPFILE && ${TMPFILE}.bin + +rm -f $TMPFILE ${TMPFILE}.bin + exit 0 From e399257349098bf7c84343f99efb2bc9c22eb9fd Mon Sep 17 00:00:00 2001 From: Mike Yuan Date: Fri, 23 Aug 2024 16:27:06 +0000 Subject: [PATCH 858/991] mm/memcontrol: respect zswap.writeback setting from parent cg too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the behavior of zswap.writeback wrt. the cgroup hierarchy seems a bit odd. Unlike zswap.max, it doesn't honor the value from parent cgroups. This surfaced when people tried to globally disable zswap writeback, i.e. reserve physical swap space only for hibernation [1] - disabling zswap.writeback only for the root cgroup results in subcgroups with zswap.writeback=1 still performing writeback. The inconsistency became more noticeable after I introduced the MemoryZSwapWriteback= systemd unit setting [2] for controlling the knob. The patch assumed that the kernel would enforce the value of parent cgroups. It could probably be workarounded from systemd's side, by going up the slice unit tree and inheriting the value. Yet I think it's more sensible to make it behave consistently with zswap.max and friends. [1] https://wiki.archlinux.org/title/Power_management/Suspend_and_hibernate#Disable_zswap_writeback_to_use_the_swap_space_only_for_hibernation [2] https://github.com/systemd/systemd/pull/31734 Link: https://lkml.kernel.org/r/20240823162506.12117-1-me@yhndnzj.com Fixes: 501a06fe8e4c ("zswap: memcontrol: implement zswap writeback disabling") Signed-off-by: Mike Yuan Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 7 ++++--- mm/memcontrol.c | 12 +++++++++--- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 86311c2907cd3a..95c18bc1708345 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1717,9 +1717,10 @@ The following nested keys are defined. entries fault back in or are written out to disk. memory.zswap.writeback - A read-write single value file. The default value is "1". The - initial value of the root cgroup is 1, and when a new cgroup is - created, it inherits the current value of its parent. + A read-write single value file. The default value is "1". + Note that this setting is hierarchical, i.e. the writeback would be + implicitly disabled for child cgroups if the upper hierarchy + does so. When this is set to 0, all swapping attempts to swapping devices are disabled. This included both zswap writebacks, and swapping due diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f29157288b7dd6..d563fb515766bc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3613,8 +3613,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) memcg1_soft_limit_reset(memcg); #ifdef CONFIG_ZSWAP memcg->zswap_max = PAGE_COUNTER_MAX; - WRITE_ONCE(memcg->zswap_writeback, - !parent || READ_ONCE(parent->zswap_writeback)); + WRITE_ONCE(memcg->zswap_writeback, true); #endif page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); if (parent) { @@ -5320,7 +5319,14 @@ void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg) { /* if zswap is disabled, do not block pages going to the swapping device */ - return !zswap_is_enabled() || !memcg || READ_ONCE(memcg->zswap_writeback); + if (!zswap_is_enabled()) + return true; + + for (; memcg; memcg = parent_mem_cgroup(memcg)) + if (!READ_ONCE(memcg->zswap_writeback)) + return false; + + return true; } static u64 zswap_current_read(struct cgroup_subsys_state *css, From 5e9784e997620af7c1399029282f5d6964b41942 Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Mon, 26 Aug 2024 00:36:49 +0800 Subject: [PATCH 859/991] codetag: debug: mark codetags for poisoned page as empty When PG_hwpoison pages are freed they are treated differently in free_pages_prepare() and instead of being released they are isolated. Page allocation tag counters are decremented at this point since the page is considered not in use. Later on when such pages are released by unpoison_memory(), the allocation tag counters will be decremented again and the following warning gets reported: [ 113.930443][ T3282] ------------[ cut here ]------------ [ 113.931105][ T3282] alloc_tag was not set [ 113.931576][ T3282] WARNING: CPU: 2 PID: 3282 at ./include/linux/alloc_tag.h:130 pgalloc_tag_sub.part.66+0x154/0x164 [ 113.932866][ T3282] Modules linked in: hwpoison_inject fuse ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 ipt_REJECT nf_reject_ipv4 xt_conntrack ebtable_nat ebtable_broute ip6table_nat ip6table_man4 [ 113.941638][ T3282] CPU: 2 UID: 0 PID: 3282 Comm: madvise11 Kdump: loaded Tainted: G W 6.11.0-rc4-dirty #18 [ 113.943003][ T3282] Tainted: [W]=WARN [ 113.943453][ T3282] Hardware name: QEMU KVM Virtual Machine, BIOS unknown 2/2/2022 [ 113.944378][ T3282] pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 113.945319][ T3282] pc : pgalloc_tag_sub.part.66+0x154/0x164 [ 113.946016][ T3282] lr : pgalloc_tag_sub.part.66+0x154/0x164 [ 113.946706][ T3282] sp : ffff800087093a10 [ 113.947197][ T3282] x29: ffff800087093a10 x28: ffff0000d7a9d400 x27: ffff80008249f0a0 [ 113.948165][ T3282] x26: 0000000000000000 x25: ffff80008249f2b0 x24: 0000000000000000 [ 113.949134][ T3282] x23: 0000000000000001 x22: 0000000000000001 x21: 0000000000000000 [ 113.950597][ T3282] x20: ffff0000c08fcad8 x19: ffff80008251e000 x18: ffffffffffffffff [ 113.952207][ T3282] x17: 0000000000000000 x16: 0000000000000000 x15: ffff800081746210 [ 113.953161][ T3282] x14: 0000000000000000 x13: 205d323832335420 x12: 5b5d353031313339 [ 113.954120][ T3282] x11: ffff800087093500 x10: 000000000000005d x9 : 00000000ffffffd0 [ 113.955078][ T3282] x8 : 7f7f7f7f7f7f7f7f x7 : ffff80008236ba90 x6 : c0000000ffff7fff [ 113.956036][ T3282] x5 : ffff000b34bf4dc8 x4 : ffff8000820aba90 x3 : 0000000000000001 [ 113.956994][ T3282] x2 : ffff800ab320f000 x1 : 841d1e35ac932e00 x0 : 0000000000000000 [ 113.957962][ T3282] Call trace: [ 113.958350][ T3282] pgalloc_tag_sub.part.66+0x154/0x164 [ 113.959000][ T3282] pgalloc_tag_sub+0x14/0x1c [ 113.959539][ T3282] free_unref_page+0xf4/0x4b8 [ 113.960096][ T3282] __folio_put+0xd4/0x120 [ 113.960614][ T3282] folio_put+0x24/0x50 [ 113.961103][ T3282] unpoison_memory+0x4f0/0x5b0 [ 113.961678][ T3282] hwpoison_unpoison+0x30/0x48 [hwpoison_inject] [ 113.962436][ T3282] simple_attr_write_xsigned.isra.34+0xec/0x1cc [ 113.963183][ T3282] simple_attr_write+0x38/0x48 [ 113.963750][ T3282] debugfs_attr_write+0x54/0x80 [ 113.964330][ T3282] full_proxy_write+0x68/0x98 [ 113.964880][ T3282] vfs_write+0xdc/0x4d0 [ 113.965372][ T3282] ksys_write+0x78/0x100 [ 113.965875][ T3282] __arm64_sys_write+0x24/0x30 [ 113.966440][ T3282] invoke_syscall+0x7c/0x104 [ 113.966984][ T3282] el0_svc_common.constprop.1+0x88/0x104 [ 113.967652][ T3282] do_el0_svc+0x2c/0x38 [ 113.968893][ T3282] el0_svc+0x3c/0x1b8 [ 113.969379][ T3282] el0t_64_sync_handler+0x98/0xbc [ 113.969980][ T3282] el0t_64_sync+0x19c/0x1a0 [ 113.970511][ T3282] ---[ end trace 0000000000000000 ]--- To fix this, clear the page tag reference after the page got isolated and accounted for. Link: https://lkml.kernel.org/r/20240825163649.33294-1-hao.ge@linux.dev Fixes: d224eb0287fb ("codetag: debug: mark codetags for reserved pages as empty") Signed-off-by: Hao Ge Reviewed-by: Miaohe Lin Acked-by: Suren Baghdasaryan Cc: David Hildenbrand Cc: Hao Ge Cc: Kent Overstreet Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: [6.10+] Signed-off-by: Andrew Morton --- mm/page_alloc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c565de8f48e9d8..91ace8ca97e21f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1054,6 +1054,13 @@ __always_inline bool free_pages_prepare(struct page *page, reset_page_owner(page, order); page_table_check_free(page, order); pgalloc_tag_sub(page, 1 << order); + + /* + * The page is isolated and accounted for. + * Mark the codetag as empty to avoid accounting error + * when the page is freed by unpoison_memory(). + */ + clear_page_tag_ref(page); return false; } From 4f295229b279145bdc667c58f62e89f5968e12fb Mon Sep 17 00:00:00 2001 From: Jan Kuliga Date: Fri, 30 Aug 2024 11:56:58 +0200 Subject: [PATCH 860/991] mailmap: update entry for Jan Kuliga Soon I won't be able to use my current email address. Link: https://lkml.kernel.org/r/20240830095658.1203198-1-jankul@alatek.krakow.pl Signed-off-by: Jan Kuliga Cc: David S. Miller Cc: Matthieu Baerts (NGI0) Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index caf46a652f154c..a7969e2d590a53 100644 --- a/.mailmap +++ b/.mailmap @@ -269,6 +269,7 @@ James Ketrenos Jan Glauber Jan Glauber Jan Glauber +Jan Kuliga Jarkko Sakkinen Jarkko Sakkinen Jarkko Sakkinen From 409faf8c97d5abb0597ea43e99c8b3dd8dbe99e3 Mon Sep 17 00:00:00 2001 From: Adrian Huang Date: Thu, 29 Aug 2024 21:06:33 +0800 Subject: [PATCH 861/991] mm: vmalloc: optimize vmap_lazy_nr arithmetic when purging each vmap_area When running the vmalloc stress on a 448-core system, observe the average latency of purge_vmap_node() is about 2 seconds by using the eBPF/bcc 'funclatency.py' tool [1]. # /your-git-repo/bcc/tools/funclatency.py -u purge_vmap_node & pid1=$! && sleep 8 && modprobe test_vmalloc nr_threads=$(nproc) run_test_mask=0x7; kill -SIGINT $pid1 usecs : count distribution 0 -> 1 : 0 | | 2 -> 3 : 29 | | 4 -> 7 : 19 | | 8 -> 15 : 56 | | 16 -> 31 : 483 |**** | 32 -> 63 : 1548 |************ | 64 -> 127 : 2634 |********************* | 128 -> 255 : 2535 |********************* | 256 -> 511 : 1776 |************** | 512 -> 1023 : 1015 |******** | 1024 -> 2047 : 573 |**** | 2048 -> 4095 : 488 |**** | 4096 -> 8191 : 1091 |********* | 8192 -> 16383 : 3078 |************************* | 16384 -> 32767 : 4821 |****************************************| 32768 -> 65535 : 3318 |*************************** | 65536 -> 131071 : 1718 |************** | 131072 -> 262143 : 2220 |****************** | 262144 -> 524287 : 1147 |********* | 524288 -> 1048575 : 1179 |********* | 1048576 -> 2097151 : 822 |****** | 2097152 -> 4194303 : 906 |******* | 4194304 -> 8388607 : 2148 |***************** | 8388608 -> 16777215 : 4497 |************************************* | 16777216 -> 33554431 : 289 |** | avg = 2041714 usecs, total: 78381401772 usecs, count: 38390 The worst case is over 16-33 seconds, so soft lockup is triggered [2]. [Root Cause] 1) Each purge_list has the long list. The following shows the number of vmap_area is purged. crash> p vmap_nodes vmap_nodes = $27 = (struct vmap_node *) 0xff2de5a900100000 crash> vmap_node 0xff2de5a900100000 128 | grep nr_purged nr_purged = 663070 ... nr_purged = 821670 nr_purged = 692214 nr_purged = 726808 ... 2) atomic_long_sub() employs the 'lock' prefix to ensure the atomic operation when purging each vmap_area. However, the iteration is over 600000 vmap_area (See 'nr_purged' above). Here is objdump output: $ objdump -D vmlinux ffffffff813e8c80 : ... ffffffff813e8d70: f0 48 29 2d 68 0c bb lock sub %rbp,0x2bb0c68(%rip) ... Quote from "Instruction tables" pdf file [3]: Instructions with a LOCK prefix have a long latency that depends on cache organization and possibly RAM speed. If there are multiple processors or cores or direct memory access (DMA) devices, then all locked instructions will lock a cache line for exclusive access, which may involve RAM access. A LOCK prefix typically costs more than a hundred clock cycles, even on single-processor systems. That's why the latency of purge_vmap_node() dramatically increases on a many-core system: One core is busy on purging each vmap_area of the *long* purge_list and executing atomic_long_sub() for each vmap_area, while other cores free vmalloc allocations and execute atomic_long_add_return() in free_vmap_area_noflush(). [Solution] Employ a local variable to record the total purged pages, and execute atomic_long_sub() after the traversal of the purge_list is done. The experiment result shows the latency improvement is 99%. [Experiment Result] 1) System Configuration: Three servers (with HT-enabled) are tested. * 72-core server: 3rd Gen Intel Xeon Scalable Processor*1 * 192-core server: 5th Gen Intel Xeon Scalable Processor*2 * 448-core server: AMD Zen 4 Processor*2 2) Kernel Config * CONFIG_KASAN is disabled 3) The data in column "w/o patch" and "w/ patch" * Unit: micro seconds (us) * Each data is the average of 3-time measurements System w/o patch (us) w/ patch (us) Improvement (%) --------------- -------------- ------------- ------------- 72-core server 2194 14 99.36% 192-core server 143799 1139 99.21% 448-core server 1992122 6883 99.65% [1] https://github.com/iovisor/bcc/blob/master/tools/funclatency.py [2] https://gist.github.com/AdrianHuang/37c15f67b45407b83c2d32f918656c12 [3] https://www.agner.org/optimize/instruction_tables.pdf Link: https://lkml.kernel.org/r/20240829130633.2184-1-ahuang12@lenovo.com Signed-off-by: Adrian Huang Reviewed-by: Uladzislau Rezki (Sony) Cc: Christoph Hellwig Cc: Signed-off-by: Andrew Morton --- mm/vmalloc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ac53d46ac8a5c3..a0df1e2e155a87 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2191,6 +2191,7 @@ static void purge_vmap_node(struct work_struct *work) { struct vmap_node *vn = container_of(work, struct vmap_node, purge_work); + unsigned long nr_purged_pages = 0; struct vmap_area *va, *n_va; LIST_HEAD(local_list); @@ -2208,7 +2209,7 @@ static void purge_vmap_node(struct work_struct *work) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end); - atomic_long_sub(nr, &vmap_lazy_nr); + nr_purged_pages += nr; vn->nr_purged++; if (is_vn_id_valid(vn_id) && !vn->skip_populate) @@ -2219,6 +2220,8 @@ static void purge_vmap_node(struct work_struct *work) list_add(&va->list, &local_list); } + atomic_long_sub(nr_purged_pages, &vmap_lazy_nr); + reclaim_list_global(&local_list); } From 052a45c1cb1b32f05dd63a295d65496d8b403283 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 28 Aug 2024 16:15:36 -0700 Subject: [PATCH 862/991] alloc_tag: fix allocation tag reporting when CONFIG_MODULES=n codetag_module_init() is used to initialize sections containing allocation tags. This function is used to initialize module sections as well as core kernel sections, in which case the module parameter is set to NULL. This function has to be called even when CONFIG_MODULES=n to initialize core kernel allocation tag sections. When CONFIG_MODULES=n, this function is a NOP, which is wrong. This leads to /proc/allocinfo reported as empty. Fix this by making it independent of CONFIG_MODULES. Link: https://lkml.kernel.org/r/20240828231536.1770519-1-surenb@google.com Fixes: 916cc5167cc6 ("lib: code tagging framework") Signed-off-by: Suren Baghdasaryan Cc: David Hildenbrand Cc: Kees Cook Cc: Kent Overstreet Cc: Pasha Tatashin Cc: Sourav Panda Cc: Vlastimil Babka Cc: [6.10+] Signed-off-by: Andrew Morton --- lib/codetag.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/lib/codetag.c b/lib/codetag.c index 5ace625f2328f2..afa8a2d4f31733 100644 --- a/lib/codetag.c +++ b/lib/codetag.c @@ -125,7 +125,6 @@ static inline size_t range_size(const struct codetag_type *cttype, cttype->desc.tag_size; } -#ifdef CONFIG_MODULES static void *get_symbol(struct module *mod, const char *prefix, const char *name) { DECLARE_SEQ_BUF(sb, KSYM_NAME_LEN); @@ -155,6 +154,15 @@ static struct codetag_range get_section_range(struct module *mod, }; } +static const char *get_mod_name(__maybe_unused struct module *mod) +{ +#ifdef CONFIG_MODULES + if (mod) + return mod->name; +#endif + return "(built-in)"; +} + static int codetag_module_init(struct codetag_type *cttype, struct module *mod) { struct codetag_range range; @@ -164,8 +172,7 @@ static int codetag_module_init(struct codetag_type *cttype, struct module *mod) range = get_section_range(mod, cttype->desc.section); if (!range.start || !range.stop) { pr_warn("Failed to load code tags of type %s from the module %s\n", - cttype->desc.section, - mod ? mod->name : "(built-in)"); + cttype->desc.section, get_mod_name(mod)); return -EINVAL; } @@ -199,6 +206,7 @@ static int codetag_module_init(struct codetag_type *cttype, struct module *mod) return 0; } +#ifdef CONFIG_MODULES void codetag_load_module(struct module *mod) { struct codetag_type *cttype; @@ -248,9 +256,6 @@ bool codetag_unload_module(struct module *mod) return unload_ok; } - -#else /* CONFIG_MODULES */ -static int codetag_module_init(struct codetag_type *cttype, struct module *mod) { return 0; } #endif /* CONFIG_MODULES */ struct codetag_type * From a54da9df75cd1b4b5028f6c60f9a211532680585 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Sun, 1 Sep 2024 05:10:51 +0200 Subject: [PATCH 863/991] hwmon: (hp-wmi-sensors) Check if WMI event data exists MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The BIOS can choose to return no event data in response to a WMI event, so the ACPI object passed to the WMI notify handler can be NULL. Check for such a situation and ignore the event in such a case. Fixes: 23902f98f8d4 ("hwmon: add HP WMI Sensors driver") Signed-off-by: Armin Wolf Reviewed-by: Ilpo Järvinen Message-ID: <20240901031055.3030-2-W_Armin@gmx.de> Signed-off-by: Guenter Roeck --- drivers/hwmon/hp-wmi-sensors.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/hwmon/hp-wmi-sensors.c b/drivers/hwmon/hp-wmi-sensors.c index b5325d0e72b9cd..dfa1d6926deacc 100644 --- a/drivers/hwmon/hp-wmi-sensors.c +++ b/drivers/hwmon/hp-wmi-sensors.c @@ -1637,6 +1637,8 @@ static void hp_wmi_notify(u32 value, void *context) goto out_unlock; wobj = out.pointer; + if (!wobj) + goto out_unlock; err = populate_event_from_wobj(dev, &event, wobj); if (err) { From 1c7fb536e899a2f66f9b1719a0234570dda2e634 Mon Sep 17 00:00:00 2001 From: Veronika Molnarova Date: Thu, 8 Aug 2024 12:37:49 +0200 Subject: [PATCH 864/991] perf test pmu: Set uninitialized PMU alias to null Commit 3e0bf9fde2984469 ("perf pmu: Restore full PMU name wildcard support") adds a test case "PMU cmdline match" that covers PMU name wildcard support provided by function perf_pmu__match(). The test works with a wide range of supported combinations of PMU name matching but omits the case that if the perf_pmu__match() cannot match the PMU name to the wildcard, it tries to match its alias. However, this variable is not set up, causing the test case to fail when run with subprocesses or to segfault if run as a single process. ./perf test -vv 9 9: Sysfs PMU tests : 9.1: Parsing with PMU format directory : Ok 9.2: Parsing with PMU event : Ok 9.3: PMU event names : Ok 9.4: PMU name combining : Ok 9.5: PMU name comparison : Ok 9.6: PMU cmdline match : FAILED! ./perf test -F 9 9.1: Parsing with PMU format directory : Ok 9.2: Parsing with PMU event : Ok 9.3: PMU event names : Ok 9.4: PMU name combining : Ok 9.5: PMU name comparison : Ok Segmentation fault (core dumped) Initialize the PMU alias to null for all tests of perf_pmu__match() as this functionality is not being tested and the alias matching works exactly the same as the matching of the PMU name. ./perf test -F 9 9.1: Parsing with PMU format directory : Ok 9.2: Parsing with PMU event : Ok 9.3: PMU event names : Ok 9.4: PMU name combining : Ok 9.5: PMU name comparison : Ok 9.6: PMU cmdline match : Ok Fixes: 3e0bf9fde2984469 ("perf pmu: Restore full PMU name wildcard support") Signed-off-by: Veronika Molnarova Cc: james.clark@arm.com Cc: mpetlan@redhat.com Cc: rstoyano@redhat.com Link: https://lore.kernel.org/r/20240808103749.9356-1-vmolnaro@redhat.com Signed-off-by: Namhyung Kim --- tools/perf/tests/pmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/perf/tests/pmu.c b/tools/perf/tests/pmu.c index 40132655ccd171..c76f53a90a7b72 100644 --- a/tools/perf/tests/pmu.c +++ b/tools/perf/tests/pmu.c @@ -456,11 +456,13 @@ static int test__name_cmp(struct test_suite *test __maybe_unused, int subtest __ /** * Test perf_pmu__match() that's used to search for a PMU given a name passed * on the command line. The name that's passed may also be a filename type glob - * match. + * match. If the name does not match, perf_pmu__match() attempts to match the + * alias of the PMU, if provided. */ static int test__pmu_match(struct test_suite *test __maybe_unused, int subtest __maybe_unused) { struct perf_pmu test_pmu; + test_pmu.alias_name = NULL; test_pmu.name = "pmuname"; TEST_ASSERT_EQUAL("Exact match", perf_pmu__match(&test_pmu, "pmuname"), true); From 287bd5cf06e0f2c02293ce942777ad1f18059ed3 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 27 Aug 2024 22:29:53 -0700 Subject: [PATCH 865/991] perf lock contention: Fix spinlock and rwlock accounting The spinlock and rwlock use a single-element per-cpu array to track current locks due to performance reason. But this means the key is always available and it cannot simply account lock stats in the array because some of them are invalid. In fact, the contention_end() program in the BPF invalidates the entry by setting the 'lock' value to 0 instead of deleting the entry for the hashmap. So it should skip entries with the lock value of 0 in the account_end_timestamp(). Otherwise, it'd have spurious high contention on an idle machine: $ sudo perf lock con -ab -Y spinlock sleep 3 contended total wait max wait avg wait type caller 8 4.72 s 1.84 s 590.46 ms spinlock rcu_core+0xc7 8 1.87 s 1.87 s 233.48 ms spinlock process_one_work+0x1b5 2 1.87 s 1.87 s 933.92 ms spinlock worker_thread+0x1a2 3 1.81 s 1.81 s 603.93 ms spinlock tmigr_update_events+0x13c 2 1.72 s 1.72 s 861.98 ms spinlock tick_do_update_jiffies64+0x25 6 42.48 us 13.02 us 7.08 us spinlock futex_q_lock+0x2a 1 13.03 us 13.03 us 13.03 us spinlock futex_wake+0xce 1 11.61 us 11.61 us 11.61 us spinlock rcu_core+0xc7 I don't believe it has contention on a spinlock longer than 1 second. After this change, it only reports some small contentions. $ sudo perf lock con -ab -Y spinlock sleep 3 contended total wait max wait avg wait type caller 4 133.51 us 43.29 us 33.38 us spinlock tick_do_update_jiffies64+0x25 4 69.06 us 31.82 us 17.27 us spinlock process_one_work+0x1b5 2 50.66 us 25.77 us 25.33 us spinlock rcu_core+0xc7 1 28.45 us 28.45 us 28.45 us spinlock rcu_core+0xc7 1 24.77 us 24.77 us 24.77 us spinlock tmigr_update_events+0x13c 1 23.34 us 23.34 us 23.34 us spinlock raw_spin_rq_lock_nested+0x15 Fixes: b5711042a1c8 ("perf lock contention: Use per-cpu array map for spinlocks") Reported-by: Xi Wang Cc: Song Liu Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20240828052953.1445862-1-namhyung@kernel.org Signed-off-by: Namhyung Kim --- tools/perf/util/bpf_lock_contention.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c index b4cb3fe5cc2548..bc4e92c0c08b8b 100644 --- a/tools/perf/util/bpf_lock_contention.c +++ b/tools/perf/util/bpf_lock_contention.c @@ -286,6 +286,9 @@ static void account_end_timestamp(struct lock_contention *con) goto next; for (int i = 0; i < total_cpus; i++) { + if (cpu_data[i].lock == 0) + continue; + update_lock_stat(stat_fd, -1, end_ts, aggr_mode, &cpu_data[i]); } From aee1d55922977bf9282398283a72d38fc5514540 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Mon, 19 Aug 2024 10:34:03 +0800 Subject: [PATCH 866/991] perf python: include "util/sample.h" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 32-bit arm build system will complain: tools/perf/util/python.c:75:28: error: field ‘sample’ has incomplete type 75 | struct perf_sample sample; However, arm64 build system doesn't complain this. The root cause is arm64 define "HAVE_KVM_STAT_SUPPORT := 1" in tools/perf/arch/arm64/Makefile, but arm arch doesn't define this. This will lead to kvm-stat.h include other header files on arm64 build system, especially "util/sample.h" for util/python.c. This will try to directly include "util/sample.h" for "util/python.c" to avoid such build issue on arm platform. Signed-off-by: Xu Yang Cc: imx@lists.linux.dev Link: https://lore.kernel.org/r/20240819023403.201324-1-xu.yang_2@nxp.com Signed-off-by: Namhyung Kim --- tools/perf/util/python.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 3be882b2e84542..31a223eaf8e65f 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -20,6 +20,7 @@ #include "util/env.h" #include "util/kvm-stat.h" #include "util/kwork.h" +#include "util/sample.h" #include "util/lock-contention.h" #include #include "../builtin.h" From e162cb25c410afc42051a582c46a47dde597f51c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 19 Aug 2024 21:43:01 -0300 Subject: [PATCH 867/991] perf daemon: Fix the build on more 32-bit architectures FYI: I'm carrying this on perf-tools-next. The previous attempt fixed the build on debian:experimental-x-mipsel, but when building on a larger set of containers I noticed it broke the build on some other 32-bit architectures such as: 42 7.87 ubuntu:18.04-x-arm : FAIL gcc version 7.5.0 (Ubuntu/Linaro 7.5.0-3ubuntu1~18.04) builtin-daemon.c: In function 'cmd_session_list': builtin-daemon.c:692:16: error: format '%llu' expects argument of type 'long long unsigned int', but argument 4 has type 'long int' [-Werror=format=] fprintf(out, "%c%" PRIu64, ^~~~~ builtin-daemon.c:694:13: csv_sep, (curr - daemon->start) / 60); ~~~~~~~~~~~~~~~~~~~~~~~~~~~ In file included from builtin-daemon.c:3:0: /usr/arm-linux-gnueabihf/include/inttypes.h:105:34: note: format string is defined here # define PRIu64 __PRI64_PREFIX "u" So lets cast that time_t (32-bit/64-bit) to uint64_t to make sure it builds everywhere. Fixes: 4bbe6002931954bb ("perf daemon: Fix the build on 32-bit architectures") Signed-off-by: Arnaldo Carvalho de Melo Link: https://lore.kernel.org/r/ZsPmldtJ0D9Cua9_@x1 Signed-off-by: Namhyung Kim --- tools/perf/builtin-daemon.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-daemon.c b/tools/perf/builtin-daemon.c index 5c9335fff2d396..9a95871afc9556 100644 --- a/tools/perf/builtin-daemon.c +++ b/tools/perf/builtin-daemon.c @@ -691,7 +691,7 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out) fprintf(out, "%c%" PRIu64, /* session up time */ - csv_sep, (curr - daemon->start) / 60); + csv_sep, (uint64_t)((curr - daemon->start) / 60)); fprintf(out, "\n"); } else { @@ -702,7 +702,7 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out) fprintf(out, " lock: %s/lock\n", daemon->base); fprintf(out, " up: %" PRIu64 " minutes\n", - (curr - daemon->start) / 60); + (uint64_t)((curr - daemon->start) / 60)); } } @@ -730,7 +730,7 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out) fprintf(out, "%c%" PRIu64, /* session up time */ - csv_sep, (curr - session->start) / 60); + csv_sep, (uint64_t)((curr - session->start) / 60)); fprintf(out, "\n"); } else { @@ -747,7 +747,7 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out) fprintf(out, " ack: %s/%s\n", session->base, SESSION_ACK); fprintf(out, " up: %" PRIu64 " minutes\n", - (curr - session->start) / 60); + (uint64_t)((curr - session->start) / 60)); } } From 213aa670153ed675a007c1f35c5db544b0fefc94 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sat, 31 Aug 2024 14:02:06 +0200 Subject: [PATCH 868/991] parisc: Delay write-protection until mark_rodata_ro() call Do not write-protect the kernel read-only and __ro_after_init sections earlier than before mark_rodata_ro() is called. This fixes a boot issue on parisc which is triggered by commit 91a1d97ef482 ("jump_label,module: Don't alloc static_key_mod for __ro_after_init keys"). That commit may modify static key contents in the __ro_after_init section at bootup, so this section needs to be writable at least until mark_rodata_ro() is called. Signed-off-by: Helge Deller Reported-by: matoro Reported-by: Christoph Biedl Tested-by: Christoph Biedl Link: https://lore.kernel.org/linux-parisc/096cad5aada514255cd7b0b9dbafc768@matoro.tk/#r Fixes: 91a1d97ef482 ("jump_label,module: Don't alloc static_key_mod for __ro_after_init keys") Cc: stable@vger.kernel.org # v6.10+ --- arch/parisc/mm/init.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 34d91cb8b25905..96970fa75e4ac9 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -459,7 +459,6 @@ void free_initmem(void) unsigned long kernel_end = (unsigned long)&_end; /* Remap kernel text and data, but do not touch init section yet. */ - kernel_set_to_readonly = true; map_pages(init_end, __pa(init_end), kernel_end - init_end, PAGE_KERNEL, 0); @@ -493,11 +492,18 @@ void free_initmem(void) #ifdef CONFIG_STRICT_KERNEL_RWX void mark_rodata_ro(void) { - /* rodata memory was already mapped with KERNEL_RO access rights by - pagetable_init() and map_pages(). No need to do additional stuff here */ - unsigned long roai_size = __end_ro_after_init - __start_ro_after_init; + unsigned long start = (unsigned long) &__start_rodata; + unsigned long end = (unsigned long) &__end_rodata; + + pr_info("Write protecting the kernel read-only data: %luk\n", + (end - start) >> 10); + + kernel_set_to_readonly = true; + map_pages(start, __pa(start), end - start, PAGE_KERNEL, 0); - pr_info("Write protected read-only-after-init data: %luk\n", roai_size >> 10); + /* force the kernel to see the new page table entries */ + flush_cache_all(); + flush_tlb_all(); } #endif From 53f6619554fb1edf8d7599b560d44dbea085c730 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 1 Sep 2024 18:09:18 -0400 Subject: [PATCH 869/991] bcachefs: BCH_SB_MEMBER_INVALID Create a sentinal value for "invalid device". This is needed for removing devices that have stripes on them (force removing, without evacuating); we need a sentinal value for the stripe pointers to the device being removed. Signed-off-by: Kent Overstreet --- fs/bcachefs/replicas.c | 3 ++- fs/bcachefs/sb-members.c | 3 ++- fs/bcachefs/sb-members_format.h | 5 +++++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 12b1d28b7eb499..12d4de65ae17c6 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -82,7 +82,8 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, } for (unsigned i = 0; i < r->nr_devs; i++) - if (!bch2_member_exists(sb, r->devs[i])) { + if (r->devs[i] != BCH_SB_MEMBER_INVALID && + !bch2_member_exists(sb, r->devs[i])) { prt_printf(err, "invalid device %u in entry ", r->devs[i]); goto bad; } diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 39196f2a41974f..4b765422dd771a 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -11,7 +11,8 @@ void bch2_dev_missing(struct bch_fs *c, unsigned dev) { - bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev); + if (dev != BCH_SB_MEMBER_INVALID) + bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev); } void bch2_dev_bucket_missing(struct bch_fs *c, struct bpos bucket) diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h index e2630548c0f681..d727d2dfda08f2 100644 --- a/fs/bcachefs/sb-members_format.h +++ b/fs/bcachefs/sb-members_format.h @@ -8,6 +8,11 @@ */ #define BCH_SB_MEMBERS_MAX 64 +/* + * Sentinal value - indicates a device that does not exist + */ +#define BCH_SB_MEMBER_INVALID 255 + #define BCH_MIN_NR_NBUCKETS (1 << 6) #define BCH_IOPS_MEASUREMENTS() \ From 0d437918fb6473d25fb83188c2d6040f47acfbcd Mon Sep 17 00:00:00 2001 From: Yuntao Liu Date: Wed, 21 Aug 2024 07:34:41 +0100 Subject: [PATCH 870/991] ARM: 9414/1: Fix build issue with LD_DEAD_CODE_DATA_ELIMINATION There is a build issue with LD segmentation fault, while CONFIG_LD_DEAD_CODE_DATA_ELIMINATION is not enabled, as bellow. scripts/link-vmlinux.sh: line 49: 3796 Segmentation fault (core dumped) ${ld} ${ldflags} -o ${output} ${wl}--whole-archive ${objs} ${wl}--no-whole-archive ${wl}--start-group ${libs} ${wl}--end-group ${kallsymso} ${btf_vmlinux_bin_o} ${ldlibs} The error occurs in older versions of the GNU ld with version earlier than 2.36. It makes most sense to have a minimum LD version as a dependency for HAVE_LD_DEAD_CODE_DATA_ELIMINATION and eliminate the impact of ".reloc .text, R_ARM_NONE, ." when CONFIG_LD_DEAD_CODE_DATA_ELIMINATION is not enabled. Fixes: ed0f94102251 ("ARM: 9404/1: arm32: enable HAVE_LD_DEAD_CODE_DATA_ELIMINATION") Reported-by: Harith George Tested-by: Harith George Suggested-by: Arnd Bergmann Acked-by: Arnd Bergmann Signed-off-by: Yuntao Liu Link: https://lore.kernel.org/all/14e9aefb-88d1-4eee-8288-ef15d4a9b059@gmail.com/ Signed-off-by: Russell King (Oracle) --- arch/arm/Kconfig | 2 +- arch/arm/kernel/entry-armv.S | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 54b2bb817a7fc0..173159e93c99c4 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -117,7 +117,7 @@ config ARM select HAVE_KERNEL_XZ select HAVE_KPROBES if !XIP_KERNEL && !CPU_ENDIAN_BE32 && !CPU_V7M select HAVE_KRETPROBES if HAVE_KPROBES - select HAVE_LD_DEAD_CODE_DATA_ELIMINATION + select HAVE_LD_DEAD_CODE_DATA_ELIMINATION if (LD_VERSION >= 23600 || LD_IS_LLD) select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI select HAVE_OPTPROBES if !THUMB2_KERNEL diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index f01d23a220e65d..1dfae1af8e31b0 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -29,6 +29,12 @@ #include "entry-header.S" #include +#ifdef CONFIG_HAVE_LD_DEAD_CODE_DATA_ELIMINATION +#define RELOC_TEXT_NONE .reloc .text, R_ARM_NONE, . +#else +#define RELOC_TEXT_NONE +#endif + /* * Interrupt handling. */ @@ -1065,7 +1071,7 @@ vector_addrexcptn: .globl vector_fiq .section .vectors, "ax", %progbits - .reloc .text, R_ARM_NONE, . + RELOC_TEXT_NONE W(b) vector_rst W(b) vector_und ARM( .reloc ., R_ARM_LDR_PC_G0, .L__vector_swi ) @@ -1079,7 +1085,7 @@ THUMB( .reloc ., R_ARM_THM_PC12, .L__vector_swi ) #ifdef CONFIG_HARDEN_BRANCH_HISTORY .section .vectors.bhb.loop8, "ax", %progbits - .reloc .text, R_ARM_NONE, . + RELOC_TEXT_NONE W(b) vector_rst W(b) vector_bhb_loop8_und ARM( .reloc ., R_ARM_LDR_PC_G0, .L__vector_bhb_loop8_swi ) @@ -1092,7 +1098,7 @@ THUMB( .reloc ., R_ARM_THM_PC12, .L__vector_bhb_loop8_swi ) W(b) vector_bhb_loop8_fiq .section .vectors.bhb.bpiall, "ax", %progbits - .reloc .text, R_ARM_NONE, . + RELOC_TEXT_NONE W(b) vector_rst W(b) vector_bhb_bpiall_und ARM( .reloc ., R_ARM_LDR_PC_G0, .L__vector_bhb_bpiall_swi ) From 7aefdf88388514cb9f0eb3e5711dbf801975e6b9 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 8 Jul 2024 13:24:08 +0200 Subject: [PATCH 871/991] btrfs: update stripe extents for existing logical addresses Update a stripe extent in case of an already existing logical address, but with different physical addresses and/or device id instead of bailing out with EEXIST. This can happen i.e. in case of a device replace operation, where data extents get rewritten to a new disk. Reviewed-by: Qu Wenruo Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/raid-stripe-tree.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index e6f7a234b8f680..0c7b928805e5e0 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -73,6 +73,36 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le return ret; } +static int update_raid_extent_item(struct btrfs_trans_handle *trans, + struct btrfs_key *key, + struct btrfs_stripe_extent *stripe_extent, + const size_t item_size) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret; + int slot; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, trans->fs_info->stripe_root, key, path, + 0, 1); + if (ret) + return (ret == 1 ? ret : -EINVAL); + + leaf = path->nodes[0]; + slot = path->slots[0]; + + write_extent_buffer(leaf, stripe_extent, btrfs_item_ptr_offset(leaf, slot), + item_size); + btrfs_mark_buffer_dirty(trans, leaf); + btrfs_free_path(path); + + return ret; +} + static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, struct btrfs_io_context *bioc) { @@ -112,6 +142,9 @@ static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent, item_size); + if (ret == -EEXIST) + ret = update_raid_extent_item(trans, &stripe_key, stripe_extent, + item_size); if (ret) btrfs_abort_transaction(trans, ret); From 923bf2696aaab3abefb60959788fedba06cb83e8 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Tue, 9 Jul 2024 09:40:34 +0200 Subject: [PATCH 872/991] btrfs: update stripe_extent delete loop assumptions btrfs_delete_raid_extent() was written under the assumption, that it's call-chain always passes a start, length tuple that matches a single extent. But btrfs_delete_raid_extent() is called by do_free_extent_accounting() which in turn is called by __btrfs_free_extent(). But this call-chain passes in a start address and a length that can possibly match multiple on-disk extents. To make this possible, we have to adjust the start and length of each btree node lookup, to not delete beyond the requested range. Reviewed-by: Qu Wenruo Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/raid-stripe-tree.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 0c7b928805e5e0..bd06ff7956910f 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -66,6 +66,11 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le if (ret) break; + start += key.offset; + length -= key.offset; + if (length == 0) + break; + btrfs_release_path(path); } From da4fcd8a33c29251174ed79395919dfa2e223912 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 23 Jul 2024 15:21:22 +0100 Subject: [PATCH 873/991] btrfs: reduce size and overhead of extent_map_block_end() At extent_map_block_end() we are calling the inline functions extent_map_block_start() and extent_map_block_len() multiple times, which results in expanding their code multiple times, increasing the compiled code size and repeating the computations those functions do. Improve this by caching their results in local variables. The size of the module before this change: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1755770 163800 16920 1936490 1d8c6a fs/btrfs/btrfs.ko And after this change: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1755656 163800 16920 1936376 1d8bf8 fs/btrfs/btrfs.ko Reviewed-by: Josef Bacik Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 10ac5f657e3889..25d191f1ac10f4 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -192,10 +192,13 @@ static inline u64 extent_map_block_len(const struct extent_map *em) static inline u64 extent_map_block_end(const struct extent_map *em) { - if (extent_map_block_start(em) + extent_map_block_len(em) < - extent_map_block_start(em)) + const u64 block_start = extent_map_block_start(em); + const u64 block_end = block_start + extent_map_block_len(em); + + if (block_end < block_start) return (u64)-1; - return extent_map_block_start(em) + extent_map_block_len(em); + + return block_end; } static bool can_merge_extent_map(const struct extent_map *em) From d192b9d24896a717e47f0027f49899066e43159a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 24 Jul 2024 14:29:02 +0930 Subject: [PATCH 874/991] btrfs: move uuid tree related code to uuid-tree.[ch] Functions btrfs_uuid_scan_kthread() and btrfs_create_uuid_tree() are for UUID tree rescan and creation, it's not suitable for volumes.[ch]. Move them to uuid-tree.[ch] instead. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/uuid-tree.c | 179 +++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/uuid-tree.h | 2 + fs/btrfs/volumes.c | 177 ------------------------------------------ fs/btrfs/volumes.h | 2 - 4 files changed, 181 insertions(+), 179 deletions(-) diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index eae75bb572b99e..c6399513c66fff 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -3,6 +3,7 @@ * Copyright (C) STRATO AG 2013. All rights reserved. */ +#include #include #include #include "messages.h" @@ -12,6 +13,7 @@ #include "fs.h" #include "accessors.h" #include "uuid-tree.h" +#include "ioctl.h" static void btrfs_uuid_to_key(const u8 *uuid, u8 type, struct btrfs_key *key) { @@ -390,3 +392,180 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info) btrfs_free_path(path); return ret; } + +int btrfs_uuid_scan_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = data; + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_key key; + struct btrfs_path *path = NULL; + int ret = 0; + struct extent_buffer *eb; + int slot; + struct btrfs_root_item root_item; + u32 item_size; + struct btrfs_trans_handle *trans = NULL; + bool closing = false; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = 0; + + while (1) { + if (btrfs_fs_closing(fs_info)) { + closing = true; + break; + } + ret = btrfs_search_forward(root, &key, path, + BTRFS_OLDEST_GENERATION); + if (ret) { + if (ret > 0) + ret = 0; + break; + } + + if (key.type != BTRFS_ROOT_ITEM_KEY || + (key.objectid < BTRFS_FIRST_FREE_OBJECTID && + key.objectid != BTRFS_FS_TREE_OBJECTID) || + key.objectid > BTRFS_LAST_FREE_OBJECTID) + goto skip; + + eb = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size(eb, slot); + if (item_size < sizeof(root_item)) + goto skip; + + read_extent_buffer(eb, &root_item, + btrfs_item_ptr_offset(eb, slot), + (int)sizeof(root_item)); + if (btrfs_root_refs(&root_item) == 0) + goto skip; + + if (!btrfs_is_empty_uuid(root_item.uuid) || + !btrfs_is_empty_uuid(root_item.received_uuid)) { + if (trans) + goto update_tree; + + btrfs_release_path(path); + /* + * 1 - subvol uuid item + * 1 - received_subvol uuid item + */ + trans = btrfs_start_transaction(fs_info->uuid_root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + continue; + } else { + goto skip; + } +update_tree: + btrfs_release_path(path); + if (!btrfs_is_empty_uuid(root_item.uuid)) { + ret = btrfs_uuid_tree_add(trans, root_item.uuid, + BTRFS_UUID_KEY_SUBVOL, + key.objectid); + if (ret < 0) { + btrfs_warn(fs_info, "uuid_tree_add failed %d", + ret); + break; + } + } + + if (!btrfs_is_empty_uuid(root_item.received_uuid)) { + ret = btrfs_uuid_tree_add(trans, + root_item.received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + key.objectid); + if (ret < 0) { + btrfs_warn(fs_info, "uuid_tree_add failed %d", + ret); + break; + } + } + +skip: + btrfs_release_path(path); + if (trans) { + ret = btrfs_end_transaction(trans); + trans = NULL; + if (ret) + break; + } + + if (key.offset < (u64)-1) { + key.offset++; + } else if (key.type < BTRFS_ROOT_ITEM_KEY) { + key.offset = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + } else if (key.objectid < (u64)-1) { + key.offset = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + key.objectid++; + } else { + break; + } + cond_resched(); + } + +out: + btrfs_free_path(path); + if (trans && !IS_ERR(trans)) + btrfs_end_transaction(trans); + if (ret) + btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); + else if (!closing) + set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); + up(&fs_info->uuid_tree_rescan_sem); + return 0; +} + +int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *uuid_root; + struct task_struct *task; + int ret; + + /* + * 1 - root node + * 1 - root item + */ + trans = btrfs_start_transaction(tree_root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); + if (IS_ERR(uuid_root)) { + ret = PTR_ERR(uuid_root); + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } + + fs_info->uuid_root = uuid_root; + + ret = btrfs_commit_transaction(trans); + if (ret) + return ret; + + down(&fs_info->uuid_tree_rescan_sem); + task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); + if (IS_ERR(task)) { + /* fs_info->update_uuid_tree_gen remains 0 in all error case */ + btrfs_warn(fs_info, "failed to start uuid_scan task"); + up(&fs_info->uuid_tree_rescan_sem); + return PTR_ERR(task); + } + + return 0; +} diff --git a/fs/btrfs/uuid-tree.h b/fs/btrfs/uuid-tree.h index a3f5757cc7cffc..c60ad20325cce0 100644 --- a/fs/btrfs/uuid-tree.h +++ b/fs/btrfs/uuid-tree.h @@ -13,5 +13,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 typ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type, u64 subid); int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); +int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); +int btrfs_uuid_scan_kthread(void *data); #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index fcedc43ef291a2..ff7c7194f5f732 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4784,183 +4784,6 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_uuid_scan_kthread(void *data) -{ - struct btrfs_fs_info *fs_info = data; - struct btrfs_root *root = fs_info->tree_root; - struct btrfs_key key; - struct btrfs_path *path = NULL; - int ret = 0; - struct extent_buffer *eb; - int slot; - struct btrfs_root_item root_item; - u32 item_size; - struct btrfs_trans_handle *trans = NULL; - bool closing = false; - - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - - key.objectid = 0; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = 0; - - while (1) { - if (btrfs_fs_closing(fs_info)) { - closing = true; - break; - } - ret = btrfs_search_forward(root, &key, path, - BTRFS_OLDEST_GENERATION); - if (ret) { - if (ret > 0) - ret = 0; - break; - } - - if (key.type != BTRFS_ROOT_ITEM_KEY || - (key.objectid < BTRFS_FIRST_FREE_OBJECTID && - key.objectid != BTRFS_FS_TREE_OBJECTID) || - key.objectid > BTRFS_LAST_FREE_OBJECTID) - goto skip; - - eb = path->nodes[0]; - slot = path->slots[0]; - item_size = btrfs_item_size(eb, slot); - if (item_size < sizeof(root_item)) - goto skip; - - read_extent_buffer(eb, &root_item, - btrfs_item_ptr_offset(eb, slot), - (int)sizeof(root_item)); - if (btrfs_root_refs(&root_item) == 0) - goto skip; - - if (!btrfs_is_empty_uuid(root_item.uuid) || - !btrfs_is_empty_uuid(root_item.received_uuid)) { - if (trans) - goto update_tree; - - btrfs_release_path(path); - /* - * 1 - subvol uuid item - * 1 - received_subvol uuid item - */ - trans = btrfs_start_transaction(fs_info->uuid_root, 2); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - continue; - } else { - goto skip; - } -update_tree: - btrfs_release_path(path); - if (!btrfs_is_empty_uuid(root_item.uuid)) { - ret = btrfs_uuid_tree_add(trans, root_item.uuid, - BTRFS_UUID_KEY_SUBVOL, - key.objectid); - if (ret < 0) { - btrfs_warn(fs_info, "uuid_tree_add failed %d", - ret); - break; - } - } - - if (!btrfs_is_empty_uuid(root_item.received_uuid)) { - ret = btrfs_uuid_tree_add(trans, - root_item.received_uuid, - BTRFS_UUID_KEY_RECEIVED_SUBVOL, - key.objectid); - if (ret < 0) { - btrfs_warn(fs_info, "uuid_tree_add failed %d", - ret); - break; - } - } - -skip: - btrfs_release_path(path); - if (trans) { - ret = btrfs_end_transaction(trans); - trans = NULL; - if (ret) - break; - } - - if (key.offset < (u64)-1) { - key.offset++; - } else if (key.type < BTRFS_ROOT_ITEM_KEY) { - key.offset = 0; - key.type = BTRFS_ROOT_ITEM_KEY; - } else if (key.objectid < (u64)-1) { - key.offset = 0; - key.type = BTRFS_ROOT_ITEM_KEY; - key.objectid++; - } else { - break; - } - cond_resched(); - } - -out: - btrfs_free_path(path); - if (trans && !IS_ERR(trans)) - btrfs_end_transaction(trans); - if (ret) - btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); - else if (!closing) - set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); - up(&fs_info->uuid_tree_rescan_sem); - return 0; -} - -int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) -{ - struct btrfs_trans_handle *trans; - struct btrfs_root *tree_root = fs_info->tree_root; - struct btrfs_root *uuid_root; - struct task_struct *task; - int ret; - - /* - * 1 - root node - * 1 - root item - */ - trans = btrfs_start_transaction(tree_root, 2); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); - if (IS_ERR(uuid_root)) { - ret = PTR_ERR(uuid_root); - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - return ret; - } - - fs_info->uuid_root = uuid_root; - - ret = btrfs_commit_transaction(trans); - if (ret) - return ret; - - down(&fs_info->uuid_tree_rescan_sem); - task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); - if (IS_ERR(task)) { - /* fs_info->update_uuid_tree_gen remains 0 in all error case */ - btrfs_warn(fs_info, "failed to start uuid_scan task"); - up(&fs_info->uuid_tree_rescan_sem); - return PTR_ERR(task); - } - - return 0; -} - /* * shrinking a device means finding all of the device extents past * the new size, and then following the back refs to the chunks. diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 37a09ebb34dd88..c947187539ddbd 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -725,8 +725,6 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info); int btrfs_pause_balance(struct btrfs_fs_info *fs_info); int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset); int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); -int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); -int btrfs_uuid_scan_kthread(void *data); bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, From 49738bab6d69cdd89b1a74f05366aa706ad34b0f Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Thu, 18 Jul 2024 00:58:54 +0800 Subject: [PATCH 875/991] btrfs: print message on device opening error during mount [ENHANCEMENT] When mounting a btrfs filesystem, the filesystem opens the block device, and if this fails, there is no message about it. Print a message about it to help debugging. [TEST] I have a btrfs filesystem on three block devices, one of which is write-protected, so regular mounts fail, but there is no message in dmesg. /dev/vdb normal /dev/vdc write protected /dev/vdd normal Before patch: $ sudo mount /dev/vdb /mnt/ mount: mount(2) failed: no such file or directory $ sudo dmesg # Show only messages about missing block devices .... [ 352.947196] BTRFS error (device vdb): devid 2 uuid 4ee2c625-a3b2-4fe0-b411-756b23e08533 missing .... After patch: $ sudo mount /dev/vdb /mnt/ mount: mount(2) failed: no such file or directory $ sudo dmesg # Show bdev_file_open_by_path failed. .... [ 352.944328] BTRFS error: failed to open device for path /dev/vdc with flags 0x3: -13 [ 352.947196] BTRFS error (device vdb): missing devid 2 uuid 4ee2c625-a3b2-4fe0-b411-756b23e08533 .... Signed-off-by: Li Zhang Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ff7c7194f5f732..e07452207426aa 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -476,6 +476,8 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, if (IS_ERR(*bdev_file)) { ret = PTR_ERR(*bdev_file); + btrfs_err(NULL, "failed to open device for path %s with flags 0x%x: %d", + device_path, flags, ret); goto error; } bdev = file_bdev(*bdev_file); From 41d9ce410168bd37b7e12e210c42b52a1d0a9613 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 23 Jul 2024 15:55:33 -0400 Subject: [PATCH 876/991] btrfs: convert btrfs_readahead() to only use folio We're the only user of readahead_page_batch(). Convert btrfs_readahead() to use the folio based helpers to do readahead. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 36 ++++++++---------------------------- 1 file changed, 8 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c73cd4f89015f6..2798a3ca1db49e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1176,26 +1176,6 @@ int btrfs_read_folio(struct file *file, struct folio *folio) return ret; } -static inline void contiguous_readpages(struct page *pages[], int nr_pages, - u64 start, u64 end, - struct extent_map **em_cached, - struct btrfs_bio_ctrl *bio_ctrl, - u64 *prev_em_start) -{ - struct btrfs_inode *inode = page_to_inode(pages[0]); - int index; - - ASSERT(em_cached); - - btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - - for (index = 0; index < nr_pages; index++) { - btrfs_do_readpage(pages[index], em_cached, bio_ctrl, - prev_em_start); - put_page(pages[index]); - } -} - /* * helper for __extent_writepage, doing all of the delayed allocation setup. * @@ -2379,18 +2359,18 @@ int btrfs_writepages(struct address_space *mapping, struct writeback_control *wb void btrfs_readahead(struct readahead_control *rac) { struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD }; - struct page *pagepool[16]; + struct btrfs_inode *inode = BTRFS_I(rac->mapping->host); + struct folio *folio; + u64 start = readahead_pos(rac); + u64 end = start + readahead_length(rac) - 1; struct extent_map *em_cached = NULL; u64 prev_em_start = (u64)-1; - int nr; - while ((nr = readahead_page_batch(rac, pagepool))) { - u64 contig_start = readahead_pos(rac); - u64 contig_end = contig_start + readahead_batch_length(rac) - 1; + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - contiguous_readpages(pagepool, nr, contig_start, contig_end, - &em_cached, &bio_ctrl, &prev_em_start); - } + while ((folio = readahead_folio(rac)) != NULL) + btrfs_do_readpage(&folio->page, &em_cached, &bio_ctrl, + &prev_em_start); if (em_cached) free_extent_map(em_cached); From 3c369ae4a88ef9a498a92277501ce8c375fd27ad Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 23 Jul 2024 15:59:34 -0400 Subject: [PATCH 877/991] btrfs: convert btrfs_read_folio() to only use a folio Currently we're using the page for everything here. Convert this to use the folio helpers instead. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2798a3ca1db49e..5e9e4671edef90 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1155,17 +1155,16 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, int btrfs_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct btrfs_inode *inode = page_to_inode(page); - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; + struct btrfs_inode *inode = folio_to_inode(folio); + u64 start = folio_pos(folio); + u64 end = start + folio_size(folio) - 1; struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; struct extent_map *em_cached = NULL; int ret; btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - ret = btrfs_do_readpage(page, &em_cached, &bio_ctrl, NULL); + ret = btrfs_do_readpage(&folio->page, &em_cached, &bio_ctrl, NULL); free_extent_map(em_cached); /* From e7180057d89d55061ab88e63c868915abab9aa3d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 23 Jul 2024 16:16:20 -0400 Subject: [PATCH 878/991] btrfs: convert end_page_read() to take a folio We have this helper function to set the page range uptodate once we're done reading it, as well as run fsverity against it. Half of these functions already take a folio, just rename this to end_folio_read and then rework it to take a folio instead, and update everything accordingly. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 5e9e4671edef90..feec56a77d9bac 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -406,30 +406,31 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, start, end, page_ops); } -static bool btrfs_verify_page(struct page *page, u64 start) +static bool btrfs_verify_folio(struct folio *folio, u64 start, u32 len) { - if (!fsverity_active(page->mapping->host) || - PageUptodate(page) || - start >= i_size_read(page->mapping->host)) + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); + + if (!fsverity_active(folio->mapping->host) || + btrfs_folio_test_uptodate(fs_info, folio, start, len) || + start >= i_size_read(folio->mapping->host)) return true; - return fsverity_verify_page(page); + return fsverity_verify_folio(folio); } -static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) +static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 len) { - struct btrfs_fs_info *fs_info = page_to_fs_info(page); - struct folio *folio = page_folio(page); + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); - ASSERT(page_offset(page) <= start && - start + len <= page_offset(page) + PAGE_SIZE); + ASSERT(folio_pos(folio) <= start && + start + len <= folio_pos(folio) + PAGE_SIZE); - if (uptodate && btrfs_verify_page(page, start)) + if (uptodate && btrfs_verify_folio(folio, start, len)) btrfs_folio_set_uptodate(fs_info, folio, start, len); else btrfs_folio_clear_uptodate(fs_info, folio, start, len); - if (!btrfs_is_subpage(fs_info, page->mapping)) - unlock_page(page); + if (!btrfs_is_subpage(fs_info, folio->mapping)) + folio_unlock(folio); else btrfs_subpage_end_reader(fs_info, folio, start, len); } @@ -642,7 +643,7 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) } /* Update page status and unlock. */ - end_page_read(folio_page(folio, 0), uptodate, start, len); + end_folio_read(folio, uptodate, start, len); endio_readpage_release_extent(&processed, BTRFS_I(inode), start, end, uptodate); } @@ -1048,13 +1049,13 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, iosize = PAGE_SIZE - pg_offset; memzero_page(page, pg_offset, iosize); unlock_extent(tree, cur, cur + iosize - 1, NULL); - end_page_read(page, true, cur, iosize); + end_folio_read(page_folio(page), true, cur, iosize); break; } em = __get_extent_map(inode, page, cur, end - cur + 1, em_cached); if (IS_ERR(em)) { unlock_extent(tree, cur, end, NULL); - end_page_read(page, false, cur, end + 1 - cur); + end_folio_read(page_folio(page), false, cur, end + 1 - cur); return PTR_ERR(em); } extent_offset = cur - em->start; @@ -1123,7 +1124,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, memzero_page(page, pg_offset, iosize); unlock_extent(tree, cur, cur + iosize - 1, NULL); - end_page_read(page, true, cur, iosize); + end_folio_read(page_folio(page), true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; @@ -1131,7 +1132,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, /* the get_extent function already copied into the page */ if (block_start == EXTENT_MAP_INLINE) { unlock_extent(tree, cur, cur + iosize - 1, NULL); - end_page_read(page, true, cur, iosize); + end_folio_read(page_folio(page), true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; @@ -2551,7 +2552,7 @@ static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *foli return true; /* * Even there is no eb refs here, we may still have - * end_page_read() call relying on page::private. + * end_folio_read() call relying on page::private. */ if (atomic_read(&subpage->readers)) return true; From adfcee812ba265fa764b726596baf67dbe98f011 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 23 Jul 2024 16:19:12 -0400 Subject: [PATCH 879/991] btrfs: convert begin_page_folio() to take a folio instead This already uses a folio internally, change it to take a folio as an argument instead. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index feec56a77d9bac..5a69fb566fa0bd 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -551,16 +551,14 @@ static void endio_readpage_release_extent(struct processed_extent *processed, processed->uptodate = uptodate; } -static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) +static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) { - struct folio *folio = page_folio(page); - ASSERT(folio_test_locked(folio)); if (!btrfs_is_subpage(fs_info, folio->mapping)) return; ASSERT(folio_test_private(folio)); - btrfs_subpage_start_reader(fs_info, folio, page_offset(page), PAGE_SIZE); + btrfs_subpage_start_reader(fs_info, folio, folio_pos(folio), PAGE_SIZE); } /* @@ -1038,7 +1036,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, } } bio_ctrl->end_io_func = end_bbio_data_read; - begin_page_read(fs_info, page); + begin_folio_read(fs_info, page_folio(page)); while (cur <= end) { enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; bool force_bio_submit = false; From 17f9d0315378a61d81d90ccfa9425f622c0132b0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 23 Jul 2024 16:32:29 -0400 Subject: [PATCH 880/991] btrfs: convert submit_extent_page() to use a folio The callers of this helper are going to be converted to using a folio, so adjust submit_extent_page to become submit_extent_folio and update it to use all the relevant folio helpers. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 5a69fb566fa0bd..6829f44ea083c5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -736,12 +736,13 @@ static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail) } static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, - struct page *page, u64 disk_bytenr, + struct folio *folio, u64 disk_bytenr, unsigned int pg_offset) { struct bio *bio = &bio_ctrl->bbio->bio; struct bio_vec *bvec = bio_last_bvec_all(bio); const sector_t sector = disk_bytenr >> SECTOR_SHIFT; + struct folio *bv_folio = page_folio(bvec->bv_page); if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { /* @@ -754,7 +755,7 @@ static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, /* * The contig check requires the following conditions to be met: * - * 1) The pages are belonging to the same inode + * 1) The folios are belonging to the same inode * This is implied by the call chain. * * 2) The range has adjacent logical bytenr @@ -763,8 +764,8 @@ static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, * This is required for the usage of btrfs_bio->file_offset. */ return bio_end_sector(bio) == sector && - page_offset(bvec->bv_page) + bvec->bv_offset + bvec->bv_len == - page_offset(page) + pg_offset; + folio_pos(bv_folio) + bvec->bv_offset + bvec->bv_len == + folio_pos(folio) + pg_offset; } static void alloc_new_bio(struct btrfs_inode *inode, @@ -817,17 +818,17 @@ static void alloc_new_bio(struct btrfs_inode *inode, * The mirror number for this IO should already be initizlied in * @bio_ctrl->mirror_num. */ -static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, - u64 disk_bytenr, struct page *page, +static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, + u64 disk_bytenr, struct folio *folio, size_t size, unsigned long pg_offset) { - struct btrfs_inode *inode = page_to_inode(page); + struct btrfs_inode *inode = folio_to_inode(folio); ASSERT(pg_offset + size <= PAGE_SIZE); ASSERT(bio_ctrl->end_io_func); if (bio_ctrl->bbio && - !btrfs_bio_is_contig(bio_ctrl, page, disk_bytenr, pg_offset)) + !btrfs_bio_is_contig(bio_ctrl, folio, disk_bytenr, pg_offset)) submit_one_bio(bio_ctrl); do { @@ -836,7 +837,7 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, /* Allocate new bio if needed */ if (!bio_ctrl->bbio) { alloc_new_bio(inode, bio_ctrl, disk_bytenr, - page_offset(page) + pg_offset); + folio_pos(folio) + pg_offset); } /* Cap to the current ordered extent boundary if there is one. */ @@ -846,21 +847,22 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, len = bio_ctrl->len_to_oe_boundary; } - if (bio_add_page(&bio_ctrl->bbio->bio, page, len, pg_offset) != len) { + if (!bio_add_folio(&bio_ctrl->bbio->bio, folio, len, pg_offset)) { /* bio full: move on to a new one */ submit_one_bio(bio_ctrl); continue; } if (bio_ctrl->wbc) - wbc_account_cgroup_owner(bio_ctrl->wbc, page, len); + wbc_account_cgroup_owner(bio_ctrl->wbc, &folio->page, + len); size -= len; pg_offset += len; disk_bytenr += len; /* - * len_to_oe_boundary defaults to U32_MAX, which isn't page or + * len_to_oe_boundary defaults to U32_MAX, which isn't folio or * sector aligned. alloc_new_bio() then sets it to the end of * our ordered extent for writes into zoned devices. * @@ -870,15 +872,15 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, * boundary is correct. * * When len_to_oe_boundary is U32_MAX, the cap above would - * result in a 4095 byte IO for the last page right before - * we hit the bio limit of UINT_MAX. bio_add_page() has all + * result in a 4095 byte IO for the last folio right before + * we hit the bio limit of UINT_MAX. bio_add_folio() has all * the checks required to make sure we don't overflow the bio, * and we should just ignore len_to_oe_boundary completely * unless we're using it to track an ordered extent. * * It's pretty hard to make a bio sized U32_MAX, but it can * happen when the page cache is able to feed us contiguous - * pages for large extents. + * folios for large extents. */ if (bio_ctrl->len_to_oe_boundary != U32_MAX) bio_ctrl->len_to_oe_boundary -= len; @@ -1143,8 +1145,8 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, if (force_bio_submit) submit_one_bio(bio_ctrl); - submit_extent_page(bio_ctrl, disk_bytenr, page, iosize, - pg_offset); + submit_extent_folio(bio_ctrl, disk_bytenr, page_folio(page), + iosize, pg_offset); cur = cur + iosize; pg_offset += iosize; } @@ -1489,8 +1491,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, } - submit_extent_page(bio_ctrl, disk_bytenr, page, iosize, - cur - page_offset(page)); + submit_extent_folio(bio_ctrl, disk_bytenr, page_folio(page), + iosize, cur - page_offset(page)); cur += iosize; nr++; } @@ -2087,7 +2089,7 @@ int btree_write_cache_pages(struct address_space *mapping, * extent io tree. Thus we don't want to submit such wild eb * if the fs already has error. * - * We can get ret > 0 from submit_extent_page() indicating how many ebs + * We can get ret > 0 from submit_extent_folio() indicating how many ebs * were submitted. Reset it to 0 to avoid false alerts for the caller. */ if (ret > 0) From 06f4602d47bd00f1d190c252eacefd0320374b1b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 23 Jul 2024 17:06:03 -0400 Subject: [PATCH 881/991] btrfs: convert btrfs_do_readpage() to only use a folio Now that the callers and helpers mostly use folio, convert btrfs_do_readpage to take a folio, and rename it to btrfs_do_read_folio. Update all of the page stuff to use the folio based helpers instead. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 58 ++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6829f44ea083c5..6cabeab5d21c48 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1004,12 +1004,12 @@ static struct extent_map *__get_extent_map(struct inode *inode, struct page *pag * XXX JDM: This needs looking at to ensure proper page locking * return 0 on success, otherwise return error */ -static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, +static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - u64 start = page_offset(page); + u64 start = folio_pos(folio); const u64 end = start + PAGE_SIZE - 1; u64 cur = start; u64 extent_offset; @@ -1022,23 +1022,23 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, size_t blocksize = fs_info->sectorsize; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) { unlock_extent(tree, start, end, NULL); - unlock_page(page); + folio_unlock(folio); return ret; } - if (page->index == last_byte >> PAGE_SHIFT) { - size_t zero_offset = offset_in_page(last_byte); + if (folio->index == last_byte >> folio_shift(folio)) { + size_t zero_offset = offset_in_folio(folio, last_byte); if (zero_offset) { - iosize = PAGE_SIZE - zero_offset; - memzero_page(page, zero_offset, iosize); + iosize = folio_size(folio) - zero_offset; + folio_zero_range(folio, zero_offset, iosize); } } bio_ctrl->end_io_func = end_bbio_data_read; - begin_folio_read(fs_info, page_folio(page)); + begin_folio_read(fs_info, folio); while (cur <= end) { enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; bool force_bio_submit = false; @@ -1046,16 +1046,17 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) { - iosize = PAGE_SIZE - pg_offset; - memzero_page(page, pg_offset, iosize); + iosize = folio_size(folio) - pg_offset; + folio_zero_range(folio, pg_offset, iosize); unlock_extent(tree, cur, cur + iosize - 1, NULL); - end_folio_read(page_folio(page), true, cur, iosize); + end_folio_read(folio, true, cur, iosize); break; } - em = __get_extent_map(inode, page, cur, end - cur + 1, em_cached); + em = __get_extent_map(inode, folio_page(folio, 0), cur, + end - cur + 1, em_cached); if (IS_ERR(em)) { unlock_extent(tree, cur, end, NULL); - end_folio_read(page_folio(page), false, cur, end + 1 - cur); + end_folio_read(folio, false, cur, end + 1 - cur); return PTR_ERR(em); } extent_offset = cur - em->start; @@ -1080,8 +1081,8 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, * to the same compressed extent (possibly with a different * offset and/or length, so it either points to the whole extent * or only part of it), we must make sure we do not submit a - * single bio to populate the pages for the 2 ranges because - * this makes the compressed extent read zero out the pages + * single bio to populate the folios for the 2 ranges because + * this makes the compressed extent read zero out the folios * belonging to the 2nd range. Imagine the following scenario: * * File layout @@ -1094,13 +1095,13 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, * [extent X, compressed length = 4K uncompressed length = 16K] * * If the bio to read the compressed extent covers both ranges, - * it will decompress extent X into the pages belonging to the + * it will decompress extent X into the folios belonging to the * first range and then it will stop, zeroing out the remaining - * pages that belong to the other range that points to extent X. + * folios that belong to the other range that points to extent X. * So here we make sure we submit 2 bios, one for the first * range and another one for the third range. Both will target * the same physical extent from disk, but we can't currently - * make the compressed bio endio callback populate the pages + * make the compressed bio endio callback populate the folios * for both ranges because each compressed bio is tightly * coupled with a single extent map, and each range can have * an extent map with a different offset value relative to the @@ -1121,18 +1122,18 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, /* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) { - memzero_page(page, pg_offset, iosize); + folio_zero_range(folio, pg_offset, iosize); unlock_extent(tree, cur, cur + iosize - 1, NULL); - end_folio_read(page_folio(page), true, cur, iosize); + end_folio_read(folio, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; } - /* the get_extent function already copied into the page */ + /* the get_extent function already copied into the folio */ if (block_start == EXTENT_MAP_INLINE) { unlock_extent(tree, cur, cur + iosize - 1, NULL); - end_folio_read(page_folio(page), true, cur, iosize); + end_folio_read(folio, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; @@ -1145,8 +1146,8 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, if (force_bio_submit) submit_one_bio(bio_ctrl); - submit_extent_folio(bio_ctrl, disk_bytenr, page_folio(page), - iosize, pg_offset); + submit_extent_folio(bio_ctrl, disk_bytenr, folio, iosize, + pg_offset); cur = cur + iosize; pg_offset += iosize; } @@ -1165,7 +1166,7 @@ int btrfs_read_folio(struct file *file, struct folio *folio) btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - ret = btrfs_do_readpage(&folio->page, &em_cached, &bio_ctrl, NULL); + ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); free_extent_map(em_cached); /* @@ -2369,8 +2370,7 @@ void btrfs_readahead(struct readahead_control *rac) btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); while ((folio = readahead_folio(rac)) != NULL) - btrfs_do_readpage(&folio->page, &em_cached, &bio_ctrl, - &prev_em_start); + btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); if (em_cached) free_extent_map(em_cached); From 071f057b7ee716d7b0fffa08763643e2cb98d33c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 23 Jul 2024 17:12:15 -0400 Subject: [PATCH 882/991] btrfs: update the writepage tracepoint to take a folio Willy is wanting to get rid of page->index, convert the writepage tracepoint to take a folio so we can do folio->index instead of page->index. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- include/trace/events/btrfs.h | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6cabeab5d21c48..51cadf9e61a17f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1531,7 +1531,7 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl loff_t i_size = i_size_read(inode); unsigned long end_index = i_size >> PAGE_SHIFT; - trace___extent_writepage(page, inode, bio_ctrl->wbc); + trace___extent_writepage(folio, inode, bio_ctrl->wbc); WARN_ON(!PageLocked(page)); diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 0a523023bdcc38..0eddbb8b67283c 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -674,10 +674,10 @@ TRACE_EVENT(btrfs_finish_ordered_extent, DECLARE_EVENT_CLASS(btrfs__writepage, - TP_PROTO(const struct page *page, const struct inode *inode, + TP_PROTO(const struct folio *folio, const struct inode *inode, const struct writeback_control *wbc), - TP_ARGS(page, inode, wbc), + TP_ARGS(folio, inode, wbc), TP_STRUCT__entry_btrfs( __field( u64, ino ) @@ -695,7 +695,7 @@ DECLARE_EVENT_CLASS(btrfs__writepage, TP_fast_assign_btrfs(btrfs_sb(inode->i_sb), __entry->ino = btrfs_ino(BTRFS_I(inode)); - __entry->index = page->index; + __entry->index = folio->index; __entry->nr_to_write = wbc->nr_to_write; __entry->pages_skipped = wbc->pages_skipped; __entry->range_start = wbc->range_start; @@ -723,10 +723,10 @@ DECLARE_EVENT_CLASS(btrfs__writepage, DEFINE_EVENT(btrfs__writepage, __extent_writepage, - TP_PROTO(const struct page *page, const struct inode *inode, + TP_PROTO(const struct folio *folio, const struct inode *inode, const struct writeback_control *wbc), - TP_ARGS(page, inode, wbc) + TP_ARGS(folio, inode, wbc) ); TRACE_EVENT(btrfs_writepage_end_io_hook, From b658b08bf6168c65a2528a38dedd35c2c7a3d40b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 14:38:01 -0400 Subject: [PATCH 883/991] btrfs: convert __extent_writepage_io() to take a folio __extent_writepage_io uses page everywhere, but a lot of these functions take a folio. Convert it to use the folio based helpers, and then change it to take a folio as an argument and update its callers. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 55 ++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 51cadf9e61a17f..ebd4867f3dd78a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1393,10 +1393,10 @@ static void find_next_dirty_byte(const struct btrfs_fs_info *fs_info, * < 0 if there were errors (page still locked) */ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, - struct page *page, u64 start, u32 len, - struct btrfs_bio_ctrl *bio_ctrl, - loff_t i_size, - int *nr_ret) + struct folio *folio, + u64 start, u32 len, + struct btrfs_bio_ctrl *bio_ctrl, + loff_t i_size, int *nr_ret) { struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 cur = start; @@ -1407,14 +1407,14 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, int ret = 0; int nr = 0; - ASSERT(start >= page_offset(page) && - start + len <= page_offset(page) + PAGE_SIZE); + ASSERT(start >= folio_pos(folio) && + start + len <= folio_pos(folio) + folio_size(folio)); - ret = btrfs_writepage_cow_fixup(page); + ret = btrfs_writepage_cow_fixup(&folio->page); if (ret) { /* Fixup worker will requeue */ - redirty_page_for_writepage(bio_ctrl->wbc, page); - unlock_page(page); + folio_redirty_for_writepage(bio_ctrl->wbc, folio); + folio_unlock(folio); return 1; } @@ -1428,21 +1428,21 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, u32 iosize; if (cur >= i_size) { - btrfs_mark_ordered_io_finished(inode, page, cur, len, - true); + btrfs_mark_ordered_io_finished(inode, &folio->page, cur, + len, true); /* * This range is beyond i_size, thus we don't need to * bother writing back. * But we still need to clear the dirty subpage bit, or - * the next time the page gets dirtied, we will try to + * the next time the folio gets dirtied, we will try to * writeback the sectors with subpage dirty bits, * causing writeback without ordered extent. */ - btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, len); + btrfs_folio_clear_dirty(fs_info, folio, cur, len); break; } - find_next_dirty_byte(fs_info, page, &dirty_range_start, + find_next_dirty_byte(fs_info, &folio->page, &dirty_range_start, &dirty_range_end); if (cur < dirty_range_start) { cur = dirty_range_start; @@ -1478,33 +1478,32 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, em = NULL; /* - * Although the PageDirty bit might be cleared before entering - * this function, subpage dirty bit is not cleared. + * Although the PageDirty bit is cleared before entering this + * function, subpage dirty bit is not cleared. * So clear subpage dirty bit here so next time we won't submit - * page for range already written to disk. + * folio for range already written to disk. */ - btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, iosize); + btrfs_folio_clear_dirty(fs_info, folio, cur, iosize); btrfs_set_range_writeback(inode, cur, cur + iosize - 1); - if (!PageWriteback(page)) { + if (!folio_test_writeback(folio)) { btrfs_err(inode->root->fs_info, - "page %lu not writeback, cur %llu end %llu", - page->index, cur, end); + "folio %lu not writeback, cur %llu end %llu", + folio->index, cur, end); } - - submit_extent_folio(bio_ctrl, disk_bytenr, page_folio(page), - iosize, cur - page_offset(page)); + submit_extent_folio(bio_ctrl, disk_bytenr, folio, + iosize, cur - folio_pos(folio)); cur += iosize; nr++; } - btrfs_folio_assert_not_dirty(fs_info, page_folio(page), start, len); + btrfs_folio_assert_not_dirty(fs_info, folio, start, len); *nr_ret = nr; return 0; out_error: /* - * If we finish without problem, we should not only clear page dirty, + * If we finish without problem, we should not only clear folio dirty, * but also empty subpage dirty bits */ *nr_ret = nr; @@ -1556,7 +1555,7 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl if (ret) goto done; - ret = __extent_writepage_io(BTRFS_I(inode), page, page_offset(page), + ret = __extent_writepage_io(BTRFS_I(inode), folio, folio_pos(folio), PAGE_SIZE, bio_ctrl, i_size, &nr); if (ret == 1) return 0; @@ -2308,7 +2307,7 @@ void extent_write_locked_range(struct inode *inode, const struct page *locked_pa if (pages_dirty && page != locked_page) ASSERT(PageDirty(page)); - ret = __extent_writepage_io(BTRFS_I(inode), page, cur, cur_len, + ret = __extent_writepage_io(BTRFS_I(inode), page_folio(page), cur, cur_len, &bio_ctrl, i_size, &nr); if (ret == 1) goto next_page; From 748cec1ef623e41c614f2d5e10082e2f37ad1853 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 14:50:51 -0400 Subject: [PATCH 884/991] btrfs: convert extent_write_locked_range() to use folios Instead of using pages for everything, find a folio and use that. This makes things a bit cleaner as a lot of the functions calls here all take folios. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ebd4867f3dd78a..06f75d80e37a4f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2299,37 +2299,47 @@ void extent_write_locked_range(struct inode *inode, const struct page *locked_pa while (cur <= end) { u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); u32 cur_len = cur_end + 1 - cur; - struct page *page; + struct folio *folio; int nr = 0; - page = find_get_page(mapping, cur >> PAGE_SHIFT); - ASSERT(PageLocked(page)); - if (pages_dirty && page != locked_page) - ASSERT(PageDirty(page)); + folio = __filemap_get_folio(mapping, cur >> PAGE_SHIFT, 0, 0); - ret = __extent_writepage_io(BTRFS_I(inode), page_folio(page), cur, cur_len, + /* + * This shouldn't happen, the pages are pinned and locked, this + * code is just in case, but shouldn't actually be run. + */ + if (IS_ERR(folio)) { + btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, + cur, cur_len, false); + mapping_set_error(mapping, PTR_ERR(folio)); + cur = cur_end + 1; + continue; + } + + ASSERT(folio_test_locked(folio)); + if (pages_dirty && &folio->page != locked_page) + ASSERT(folio_test_dirty(folio)); + + ret = __extent_writepage_io(BTRFS_I(inode), folio, cur, cur_len, &bio_ctrl, i_size, &nr); if (ret == 1) goto next_page; /* Make sure the mapping tag for page dirty gets cleared. */ if (nr == 0) { - struct folio *folio; - - folio = page_folio(page); btrfs_folio_set_writeback(fs_info, folio, cur, cur_len); btrfs_folio_clear_writeback(fs_info, folio, cur, cur_len); } if (ret) { - btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, + btrfs_mark_ordered_io_finished(BTRFS_I(inode), &folio->page, cur, cur_len, !ret); - mapping_set_error(page->mapping, ret); + mapping_set_error(mapping, ret); } - btrfs_folio_unlock_writer(fs_info, page_folio(page), cur, cur_len); + btrfs_folio_unlock_writer(fs_info, folio, cur, cur_len); if (ret < 0) found_error = true; next_page: - put_page(page); + folio_put(folio); cur = cur_end + 1; } From 7805b6108469cf32a57215921f0c5398b2e066f1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 14:58:22 -0400 Subject: [PATCH 885/991] btrfs: convert __extent_writepage() to be completely folio based Now that we've gotten most of the helpers updated to only take a folio, update __extent_writepage to only deal in folios. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 06f75d80e37a4f..85a28becd3f9eb 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1519,11 +1519,10 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * Return 0 if everything goes well. * Return <0 for error. */ -static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl) +static int __extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl) { - struct folio *folio = page_folio(page); - struct inode *inode = page->mapping->host; - const u64 page_start = page_offset(page); + struct inode *inode = folio->mapping->host; + const u64 page_start = folio_pos(folio); int ret; int nr = 0; size_t pg_offset; @@ -1532,24 +1531,24 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl trace___extent_writepage(folio, inode, bio_ctrl->wbc); - WARN_ON(!PageLocked(page)); + WARN_ON(!folio_test_locked(folio)); - pg_offset = offset_in_page(i_size); - if (page->index > end_index || - (page->index == end_index && !pg_offset)) { + pg_offset = offset_in_folio(folio, i_size); + if (folio->index > end_index || + (folio->index == end_index && !pg_offset)) { folio_invalidate(folio, 0, folio_size(folio)); folio_unlock(folio); return 0; } - if (page->index == end_index) - memzero_page(page, pg_offset, PAGE_SIZE - pg_offset); + if (folio->index == end_index) + folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset); - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) goto done; - ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc); + ret = writepage_delalloc(BTRFS_I(inode), &folio->page, bio_ctrl->wbc); if (ret == 1) return 0; if (ret) @@ -1565,13 +1564,13 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl done: if (nr == 0) { /* make sure the mapping tag for page dirty gets cleared */ - set_page_writeback(page); - end_page_writeback(page); + folio_start_writeback(folio); + folio_end_writeback(folio); } if (ret) { - btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, page_start, - PAGE_SIZE, !ret); - mapping_set_error(page->mapping, ret); + btrfs_mark_ordered_io_finished(BTRFS_I(inode), &folio->page, + page_start, PAGE_SIZE, !ret); + mapping_set_error(folio->mapping, ret); } btrfs_folio_end_all_writers(inode_to_fs_info(inode), folio); @@ -2228,7 +2227,7 @@ static int extent_write_cache_pages(struct address_space *mapping, continue; } - ret = __extent_writepage(&folio->page, bio_ctrl); + ret = __extent_writepage(folio, bio_ctrl); if (ret < 0) { done = 1; break; From dedd47db9a9758e33c5e3edc93696a424f0db88a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 15:16:40 -0400 Subject: [PATCH 886/991] btrfs: convert add_ra_bio_pages() to use only folios Willy is going to get rid of page->index, and add_ra_bio_pages uses page->index. Make his life easier by converting add_ra_bio_pages to use folios so that we are no longer using page->index. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 62 ++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index a8e2c461aff714..832ab8984c41dd 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -420,7 +420,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size; u64 isize = i_size_read(inode); int ret; - struct page *page; + struct folio *folio; struct extent_map *em; struct address_space *mapping = inode->i_mapping; struct extent_map_tree *em_tree; @@ -453,9 +453,13 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (pg_index > end_index) break; - page = xa_load(&mapping->i_pages, pg_index); - if (page && !xa_is_value(page)) { - sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >> + folio = __filemap_get_folio(mapping, pg_index, 0, 0); + if (!IS_ERR(folio)) { + u64 folio_sz = folio_size(folio); + u64 offset = offset_in_folio(folio, cur); + + folio_put(folio); + sectors_missed += (folio_sz - offset) >> fs_info->sectorsize_bits; /* Beyond threshold, no need to continue */ @@ -466,35 +470,35 @@ static noinline int add_ra_bio_pages(struct inode *inode, * Jump to next page start as we already have page for * current offset. */ - cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE; + cur += (folio_sz - offset); continue; } - page = __page_cache_alloc(mapping_gfp_constraint(mapping, - ~__GFP_FS)); - if (!page) + folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, + ~__GFP_FS), 0); + if (!folio) break; - if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) { - put_page(page); + if (filemap_add_folio(mapping, folio, pg_index, GFP_NOFS)) { /* There is already a page, skip to page end */ - cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE; + cur += folio_size(folio); + folio_put(folio); continue; } - if (!*memstall && PageWorkingset(page)) { + if (!*memstall && folio_test_workingset(folio)) { psi_memstall_enter(pflags); *memstall = 1; } - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); break; } - page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1; + page_end = (pg_index << PAGE_SHIFT) + folio_size(folio) - 1; lock_extent(tree, cur, page_end, NULL); read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur); @@ -511,28 +515,28 @@ static noinline int add_ra_bio_pages(struct inode *inode, orig_bio->bi_iter.bi_sector) { free_extent_map(em); unlock_extent(tree, cur, page_end, NULL); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); break; } add_size = min(em->start + em->len, page_end + 1) - cur; free_extent_map(em); - if (page->index == end_index) { - size_t zero_offset = offset_in_page(isize); + if (folio->index == end_index) { + size_t zero_offset = offset_in_folio(folio, isize); if (zero_offset) { int zeros; - zeros = PAGE_SIZE - zero_offset; - memzero_page(page, zero_offset, zeros); + zeros = folio_size(folio) - zero_offset; + folio_zero_range(folio, zero_offset, zeros); } } - ret = bio_add_page(orig_bio, page, add_size, offset_in_page(cur)); - if (ret != add_size) { + if (!bio_add_folio(orig_bio, folio, add_size, + offset_in_folio(folio, cur))) { unlock_extent(tree, cur, page_end, NULL); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); break; } /* @@ -541,9 +545,9 @@ static noinline int add_ra_bio_pages(struct inode *inode, * subpage::readers and to unlock the page. */ if (fs_info->sectorsize < PAGE_SIZE) - btrfs_subpage_start_reader(fs_info, page_folio(page), - cur, add_size); - put_page(page); + btrfs_subpage_start_reader(fs_info, folio, cur, + add_size); + folio_put(folio); cur += add_size; } return 0; From 439855e40005ca23bda667589b0c2d3f74a5808d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 15:24:35 -0400 Subject: [PATCH 887/991] btrfs: utilize folio more in btrfs_page_mkwrite() We already have a folio that we're using in btrfs_page_mkwrite, update the rest of the function to use folio everywhere else. This will make it easier on Willy when he drops page->index. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 2aeb8116549ca9..c7a7234998aa4c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1920,8 +1920,8 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) reserved_space = PAGE_SIZE; sb_start_pagefault(inode->i_sb); - page_start = page_offset(page); - page_end = page_start + PAGE_SIZE - 1; + page_start = folio_pos(folio); + page_end = page_start + folio_size(folio) - 1; end = page_end; /* @@ -1949,18 +1949,18 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) ret = VM_FAULT_NOPAGE; again: down_read(&BTRFS_I(inode)->i_mmap_lock); - lock_page(page); + folio_lock(folio); size = i_size_read(inode); - if ((page->mapping != inode->i_mapping) || + if ((folio->mapping != inode->i_mapping) || (page_start >= size)) { /* Page got truncated out from underneath us. */ goto out_unlock; } - wait_on_page_writeback(page); + folio_wait_writeback(folio); lock_extent(io_tree, page_start, page_end, &cached_state); - ret2 = set_page_extent_mapped(page); + ret2 = set_folio_extent_mapped(folio); if (ret2 < 0) { ret = vmf_error(ret2); unlock_extent(io_tree, page_start, page_end, &cached_state); @@ -1974,14 +1974,14 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE); if (ordered) { unlock_extent(io_tree, page_start, page_end, &cached_state); - unlock_page(page); + folio_unlock(folio); up_read(&BTRFS_I(inode)->i_mmap_lock); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } - if (page->index == ((size - 1) >> PAGE_SHIFT)) { + if (folio->index == ((size - 1) >> PAGE_SHIFT)) { reserved_space = round_up(size - page_start, fs_info->sectorsize); if (reserved_space < PAGE_SIZE) { end = page_start + reserved_space - 1; @@ -2011,13 +2011,13 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) } /* Page is wholly or partially inside EOF. */ - if (page_start + PAGE_SIZE > size) - zero_start = offset_in_page(size); + if (page_start + folio_size(folio) > size) + zero_start = offset_in_folio(folio, size); else zero_start = PAGE_SIZE; if (zero_start != PAGE_SIZE) - memzero_page(page, zero_start, PAGE_SIZE - zero_start); + folio_zero_range(folio, zero_start, folio_size(folio) - zero_start); btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); @@ -2034,7 +2034,7 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) return VM_FAULT_LOCKED; out_unlock: - unlock_page(page); + folio_unlock(folio); up_read(&BTRFS_I(inode)->i_mmap_lock); out: btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); From 0a55851802455670ae0dba09a95c7fd494cded18 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 15:49:47 -0400 Subject: [PATCH 888/991] btrfs: convert can_finish_ordered_extent() to use a folio Pass in a folio instead, and use a folio instead of a page. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ordered-data.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 82a68394a89c06..760a37512c7e08 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -332,7 +332,7 @@ static void finish_ordered_fn(struct btrfs_work *work) } static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, - struct page *page, u64 file_offset, + struct folio *folio, u64 file_offset, u64 len, bool uptodate) { struct btrfs_inode *inode = ordered->inode; @@ -340,10 +340,10 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, lockdep_assert_held(&inode->ordered_tree_lock); - if (page) { - ASSERT(page->mapping); - ASSERT(page_offset(page) <= file_offset); - ASSERT(file_offset + len <= page_offset(page) + PAGE_SIZE); + if (folio) { + ASSERT(folio->mapping); + ASSERT(folio_pos(folio) <= file_offset); + ASSERT(file_offset + len <= folio_pos(folio) + folio_size(folio)); /* * Ordered (Private2) bit indicates whether we still have @@ -351,10 +351,9 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, * * If there's no such bit, we need to skip to next range. */ - if (!btrfs_folio_test_ordered(fs_info, page_folio(page), - file_offset, len)) + if (!btrfs_folio_test_ordered(fs_info, folio, file_offset, len)) return false; - btrfs_folio_clear_ordered(fs_info, page_folio(page), file_offset, len); + btrfs_folio_clear_ordered(fs_info, folio, file_offset, len); } /* Now we're fine to update the accounting. */ @@ -408,7 +407,8 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate); spin_lock_irqsave(&inode->ordered_tree_lock, flags); - ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate); + ret = can_finish_ordered_extent(ordered, page_folio(page), file_offset, + len, uptodate); spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); /* @@ -524,7 +524,8 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, ASSERT(end + 1 - cur < U32_MAX); len = end + 1 - cur; - if (can_finish_ordered_extent(entry, page, cur, len, uptodate)) { + if (can_finish_ordered_extent(entry, page_folio(page), cur, len, + uptodate)) { spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); btrfs_queue_ordered_fn(entry); spin_lock_irqsave(&inode->ordered_tree_lock, flags); From 0df2fc7762ca1b6cf79c2a32ab347cfd9895311f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 15:53:18 -0400 Subject: [PATCH 889/991] btrfs: convert btrfs_finish_ordered_extent() to take a folio The callers and callee's of this now all use folios, update it to take a folio as well. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 4 ++-- fs/btrfs/ordered-data.c | 6 +++--- fs/btrfs/ordered-data.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 85a28becd3f9eb..cf2022ea2d82e9 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -472,8 +472,8 @@ static void end_bbio_data_write(struct btrfs_bio *bbio) "incomplete page write with offset %zu and length %zu", fi.offset, fi.length); - btrfs_finish_ordered_extent(bbio->ordered, - folio_page(folio, 0), start, len, !error); + btrfs_finish_ordered_extent(bbio->ordered, folio, start, len, + !error); if (error) mapping_set_error(folio->mapping, error); btrfs_folio_clear_writeback(fs_info, folio, start, len); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 760a37512c7e08..e9774795604096 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -397,7 +397,7 @@ static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered) } void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, - struct page *page, u64 file_offset, u64 len, + struct folio *folio, u64 file_offset, u64 len, bool uptodate) { struct btrfs_inode *inode = ordered->inode; @@ -407,8 +407,8 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate); spin_lock_irqsave(&inode->ordered_tree_lock, flags); - ret = can_finish_ordered_extent(ordered, page_folio(page), file_offset, - len, uptodate); + ret = can_finish_ordered_extent(ordered, folio, file_offset, len, + uptodate); spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); /* diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 51b9e81726e220..90c1c3c51ae572 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -163,7 +163,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry); void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, - struct page *page, u64 file_offset, u64 len, + struct folio *folio, u64 file_offset, u64 len, bool uptodate); void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, struct page *page, u64 file_offset, From 16b52bb9b16a30eb117074c414fd3bc2c8d2161d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 15:57:10 -0400 Subject: [PATCH 890/991] btrfs: convert btrfs_mark_ordered_io_finished() to take a folio We only need a folio now, make it take a folio as an argument and update all of the callers. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 8 ++++---- fs/btrfs/inode.c | 7 ++++--- fs/btrfs/ordered-data.c | 9 ++++----- fs/btrfs/ordered-data.h | 4 ++-- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cf2022ea2d82e9..007b690b914c0a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1428,8 +1428,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, u32 iosize; if (cur >= i_size) { - btrfs_mark_ordered_io_finished(inode, &folio->page, cur, - len, true); + btrfs_mark_ordered_io_finished(inode, folio, cur, len, + true); /* * This range is beyond i_size, thus we don't need to * bother writing back. @@ -1568,7 +1568,7 @@ static int __extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ct folio_end_writeback(folio); } if (ret) { - btrfs_mark_ordered_io_finished(BTRFS_I(inode), &folio->page, + btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio, page_start, PAGE_SIZE, !ret); mapping_set_error(folio->mapping, ret); } @@ -2330,7 +2330,7 @@ void extent_write_locked_range(struct inode *inode, const struct page *locked_pa btrfs_folio_clear_writeback(fs_info, folio, cur, cur_len); } if (ret) { - btrfs_mark_ordered_io_finished(BTRFS_I(inode), &folio->page, + btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio, cur, cur_len, !ret); mapping_set_error(mapping, ret); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b1b6564ab68f0c..26a3d429d9a55d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1144,7 +1144,8 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, set_page_writeback(locked_page); end_page_writeback(locked_page); - btrfs_mark_ordered_io_finished(inode, locked_page, + btrfs_mark_ordered_io_finished(inode, + page_folio(locked_page), page_start, PAGE_SIZE, !ret); mapping_set_error(locked_page->mapping, ret); @@ -2802,8 +2803,8 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) * to reflect the errors and clean the page. */ mapping_set_error(page->mapping, ret); - btrfs_mark_ordered_io_finished(inode, page, page_start, - PAGE_SIZE, !ret); + btrfs_mark_ordered_io_finished(inode, page_folio(page), + page_start, PAGE_SIZE, !ret); clear_page_dirty_for_io(page); } btrfs_folio_clear_checked(fs_info, page_folio(page), page_start, PAGE_SIZE); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index e9774795604096..eb9b32ffbc0c04 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -449,8 +449,8 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, /* * Mark all ordered extents io inside the specified range finished. * - * @page: The involved page for the operation. - * For uncompressed buffered IO, the page status also needs to be + * @folio: The involved folio for the operation. + * For uncompressed buffered IO, the folio status also needs to be * updated to indicate whether the pending ordered io is finished. * Can be NULL for direct IO and compressed write. * For these cases, callers are ensured they won't execute the @@ -460,7 +460,7 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, * extent(s) covering it. */ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, - struct page *page, u64 file_offset, + struct folio *folio, u64 file_offset, u64 num_bytes, bool uptodate) { struct rb_node *node; @@ -524,8 +524,7 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, ASSERT(end + 1 - cur < U32_MAX); len = end + 1 - cur; - if (can_finish_ordered_extent(entry, page_folio(page), cur, len, - uptodate)) { + if (can_finish_ordered_extent(entry, folio, cur, len, uptodate)) { spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); btrfs_queue_ordered_fn(entry); spin_lock_irqsave(&inode->ordered_tree_lock, flags); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 90c1c3c51ae572..4e152736d06cbd 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -166,8 +166,8 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, struct folio *folio, u64 file_offset, u64 len, bool uptodate); void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, - struct page *page, u64 file_offset, - u64 num_bytes, bool uptodate); + struct folio *folio, u64 file_offset, + u64 num_bytes, bool uptodate); bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size); From 1150652862455a59b561316722fd5ae671462794 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:03:04 -0400 Subject: [PATCH 891/991] btrfs: convert writepage_delalloc() to take a folio We already use a folio heavily in this function, pass the folio in directly and use it everywhere, only passing the page down to functions that do not take a folio yet. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 007b690b914c0a..1c784f7e6858d0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1188,13 +1188,13 @@ int btrfs_read_folio(struct file *file, struct folio *folio) * This returns < 0 if there were errors (page still locked) */ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, - struct page *page, struct writeback_control *wbc) + struct folio *folio, + struct writeback_control *wbc) { struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode); - struct folio *folio = page_folio(page); - const bool is_subpage = btrfs_is_subpage(fs_info, page->mapping); - const u64 page_start = page_offset(page); - const u64 page_end = page_start + PAGE_SIZE - 1; + const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping); + const u64 page_start = folio_pos(folio); + const u64 page_end = page_start + folio_size(folio) - 1; /* * Save the last found delalloc end. As the delalloc end can go beyond * page boundary, thus we cannot rely on subpage bitmap to locate the @@ -1206,10 +1206,10 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, u64 delalloc_to_write = 0; int ret = 0; - /* Lock all (subpage) delalloc ranges inside the page first. */ + /* Lock all (subpage) delalloc ranges inside the folio first. */ while (delalloc_start < page_end) { delalloc_end = page_end; - if (!find_lock_delalloc_range(&inode->vfs_inode, page, + if (!find_lock_delalloc_range(&inode->vfs_inode, &folio->page, &delalloc_start, &delalloc_end)) { delalloc_start = delalloc_end + 1; continue; @@ -1234,7 +1234,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, if (!is_subpage) { /* * For non-subpage case, the found delalloc range must - * cover this page and there must be only one locked + * cover this folio and there must be only one locked * delalloc range. */ found_start = page_start; @@ -1248,7 +1248,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, break; /* * The subpage range covers the last sector, the delalloc range may - * end beyond the page boundary, use the saved delalloc_end + * end beyond the folio boundary, use the saved delalloc_end * instead. */ if (found_start + found_len >= page_end) @@ -1256,7 +1256,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, if (ret >= 0) { /* No errors hit so far, run the current delalloc range. */ - ret = btrfs_run_delalloc_range(inode, page, found_start, + ret = btrfs_run_delalloc_range(inode, &folio->page, + found_start, found_start + found_len - 1, wbc); } else { @@ -1266,15 +1267,16 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, */ unlock_extent(&inode->io_tree, found_start, found_start + found_len - 1, NULL); - __unlock_for_delalloc(&inode->vfs_inode, page, found_start, + __unlock_for_delalloc(&inode->vfs_inode, &folio->page, + found_start, found_start + found_len - 1); } /* * We can hit btrfs_run_delalloc_range() with >0 return value. * - * This happens when either the IO is already done and page - * unlocked (inline) or the IO submission and page unlock would + * This happens when either the IO is already done and folio + * unlocked (inline) or the IO submission and folio unlock would * be handled as async (compression). * * Inline is only possible for regular sectorsize for now. @@ -1282,14 +1284,14 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, * Compression is possible for both subpage and regular cases, * but even for subpage compression only happens for page aligned * range, thus the found delalloc range must go beyond current - * page. + * folio. */ if (ret > 0) ASSERT(!is_subpage || found_start + found_len >= page_end); /* - * Above btrfs_run_delalloc_range() may have unlocked the page, - * thus for the last range, we cannot touch the page anymore. + * Above btrfs_run_delalloc_range() may have unlocked the folio, + * thus for the last range, we cannot touch the folio anymore. */ if (found_start + found_len >= last_delalloc_end + 1) break; @@ -1312,7 +1314,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, /* * If btrfs_run_dealloc_range() already started I/O and unlocked - * the pages, we just need to account for them here. + * the folios, we just need to account for them here. */ if (ret == 1) { wbc->nr_to_write -= delalloc_to_write; @@ -1548,7 +1550,7 @@ static int __extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ct if (ret < 0) goto done; - ret = writepage_delalloc(BTRFS_I(inode), &folio->page, bio_ctrl->wbc); + ret = writepage_delalloc(BTRFS_I(inode), folio, bio_ctrl->wbc); if (ret == 1) return 0; if (ret) From 8e9d7b70848cfcf939d1a94dcb7d18f1a0b5bc38 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:08:13 -0400 Subject: [PATCH 892/991] btrfs: convert find_lock_delalloc_range() to use a folio Instead of passing in a page for locked_page, pass in the folio instead. We only use the folio itself to validate some range assumptions, and then pass it into other functions. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 28 ++++++++++++++-------------- fs/btrfs/extent_io.h | 2 +- fs/btrfs/tests/extent-io-tests.c | 10 +++++----- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1c784f7e6858d0..b9921b8235e248 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -304,8 +304,8 @@ static noinline int lock_delalloc_pages(struct inode *inode, */ EXPORT_FOR_TESTS noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, - struct page *locked_page, u64 *start, - u64 *end) + struct folio *locked_folio, + u64 *start, u64 *end) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; @@ -323,9 +323,9 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, /* Caller should pass a valid @end to indicate the search range end */ ASSERT(orig_end > orig_start); - /* The range should at least cover part of the page */ - ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE || - orig_end <= page_offset(locked_page))); + /* The range should at least cover part of the folio */ + ASSERT(!(orig_start >= folio_pos(locked_folio) + folio_size(locked_folio) || + orig_end <= folio_pos(locked_folio))); again: /* step one, find a bunch of delalloc bytes starting at start */ delalloc_start = *start; @@ -342,25 +342,25 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, } /* - * start comes from the offset of locked_page. We have to lock - * pages in order, so we can't process delalloc bytes before - * locked_page + * start comes from the offset of locked_folio. We have to lock + * folios in order, so we can't process delalloc bytes before + * locked_folio */ if (delalloc_start < *start) delalloc_start = *start; /* - * make sure to limit the number of pages we try to lock down + * make sure to limit the number of folios we try to lock down */ if (delalloc_end + 1 - delalloc_start > max_bytes) delalloc_end = delalloc_start + max_bytes - 1; - /* step two, lock all the pages after the page that has start */ - ret = lock_delalloc_pages(inode, locked_page, + /* step two, lock all the folioss after the folios that has start */ + ret = lock_delalloc_pages(inode, &locked_folio->page, delalloc_start, delalloc_end); ASSERT(!ret || ret == -EAGAIN); if (ret == -EAGAIN) { - /* some of the pages are gone, lets avoid looping by + /* some of the folios are gone, lets avoid looping by * shortening the size of the delalloc range we're searching */ free_extent_state(cached_state); @@ -384,7 +384,7 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, unlock_extent(tree, delalloc_start, delalloc_end, &cached_state); if (!ret) { - __unlock_for_delalloc(inode, locked_page, + __unlock_for_delalloc(inode, &locked_folio->page, delalloc_start, delalloc_end); cond_resched(); goto again; @@ -1209,7 +1209,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, /* Lock all (subpage) delalloc ranges inside the folio first. */ while (delalloc_start < page_end) { delalloc_end = page_end; - if (!find_lock_delalloc_range(&inode->vfs_inode, &folio->page, + if (!find_lock_delalloc_range(&inode->vfs_inode, folio, &delalloc_start, &delalloc_end)) { delalloc_start = delalloc_end + 1; continue; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index dceebd76c7d17d..1dd295e1b5a5d7 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -368,7 +368,7 @@ int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, - struct page *locked_page, u64 *start, + struct folio *locked_folio, u64 *start, u64 *end); #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 865d4af4b30356..0a2dbfaaf49e29 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -180,7 +180,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL); start = 0; end = start + PAGE_SIZE - 1; - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (!found) { test_err("should have found at least one delalloc"); @@ -211,7 +211,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL); start = test_start; end = start + PAGE_SIZE - 1; - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (!found) { test_err("couldn't find delalloc in our range"); @@ -245,7 +245,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) } start = test_start; end = start + PAGE_SIZE - 1; - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (found) { test_err("found range when we shouldn't have"); @@ -266,7 +266,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) set_extent_bit(tmp, max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL); start = test_start; end = start + PAGE_SIZE - 1; - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (!found) { test_err("didn't find our range"); @@ -307,7 +307,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) * this changes at any point in the future we will need to fix this * tests expected behavior. */ - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (!found) { test_err("didn't find our range"); From 8812f7151abc772b0231ba84053d4ec8e7d7e962 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:15:13 -0400 Subject: [PATCH 893/991] btrfs: convert lock_delalloc_pages() to take a folio Also rename lock_delalloc_pages => lock_delalloc_folios in the process, now that it exclusively works on folios. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b9921b8235e248..845c1c3efe8e06 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -230,10 +230,9 @@ static noinline void __unlock_for_delalloc(const struct inode *inode, PAGE_UNLOCK); } -static noinline int lock_delalloc_pages(struct inode *inode, - const struct page *locked_page, - u64 start, - u64 end) +static noinline int lock_delalloc_folios(struct inode *inode, + const struct folio *locked_folio, + u64 start, u64 end) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct address_space *mapping = inode->i_mapping; @@ -243,7 +242,7 @@ static noinline int lock_delalloc_pages(struct inode *inode, u64 processed_end = start; struct folio_batch fbatch; - if (index == locked_page->index && index == end_index) + if (index == locked_folio->index && index == end_index) return 0; folio_batch_init(&fbatch); @@ -257,23 +256,22 @@ static noinline int lock_delalloc_pages(struct inode *inode, for (i = 0; i < found_folios; i++) { struct folio *folio = fbatch.folios[i]; - struct page *page = folio_page(folio, 0); u32 len = end + 1 - start; - if (page == locked_page) + if (folio == locked_folio) continue; if (btrfs_folio_start_writer_lock(fs_info, folio, start, len)) goto out; - if (!PageDirty(page) || page->mapping != mapping) { + if (!folio_test_dirty(folio) || folio->mapping != mapping) { btrfs_folio_end_writer_lock(fs_info, folio, start, len); goto out; } - processed_end = page_offset(page) + PAGE_SIZE - 1; + processed_end = folio_pos(folio) + folio_size(folio) - 1; } folio_batch_release(&fbatch); cond_resched(); @@ -283,7 +281,8 @@ static noinline int lock_delalloc_pages(struct inode *inode, out: folio_batch_release(&fbatch); if (processed_end > start) - __unlock_for_delalloc(inode, locked_page, start, processed_end); + __unlock_for_delalloc(inode, &locked_folio->page, start, + processed_end); return -EAGAIN; } @@ -356,8 +355,8 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, delalloc_end = delalloc_start + max_bytes - 1; /* step two, lock all the folioss after the folios that has start */ - ret = lock_delalloc_pages(inode, &locked_folio->page, - delalloc_start, delalloc_end); + ret = lock_delalloc_folios(inode, locked_folio, delalloc_start, + delalloc_end); ASSERT(!ret || ret == -EAGAIN); if (ret == -EAGAIN) { /* some of the folios are gone, lets avoid looping by From b9a0dd6cfc938d2540d9729eb8def1fbfa0ec2f4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:17:43 -0400 Subject: [PATCH 894/991] btrfs: convert __unlock_for_delalloc() to take a folio All of the callers have a folio at this point, update __unlock_for_delalloc to take a folio so that it's consistent with its callers. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 845c1c3efe8e06..79b6212775777c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -216,18 +216,18 @@ static void __process_pages_contig(struct address_space *mapping, } static noinline void __unlock_for_delalloc(const struct inode *inode, - const struct page *locked_page, + const struct folio *locked_folio, u64 start, u64 end) { unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; - ASSERT(locked_page); - if (index == locked_page->index && end_index == index) + ASSERT(locked_folio); + if (index == locked_folio->index && end_index == index) return; - __process_pages_contig(inode->i_mapping, locked_page, start, end, - PAGE_UNLOCK); + __process_pages_contig(inode->i_mapping, &locked_folio->page, start, + end, PAGE_UNLOCK); } static noinline int lock_delalloc_folios(struct inode *inode, @@ -281,7 +281,7 @@ static noinline int lock_delalloc_folios(struct inode *inode, out: folio_batch_release(&fbatch); if (processed_end > start) - __unlock_for_delalloc(inode, &locked_folio->page, start, + __unlock_for_delalloc(inode, locked_folio, start, processed_end); return -EAGAIN; } @@ -383,8 +383,8 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, unlock_extent(tree, delalloc_start, delalloc_end, &cached_state); if (!ret) { - __unlock_for_delalloc(inode, &locked_folio->page, - delalloc_start, delalloc_end); + __unlock_for_delalloc(inode, locked_folio, delalloc_start, + delalloc_end); cond_resched(); goto again; } @@ -1266,7 +1266,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, */ unlock_extent(&inode->io_tree, found_start, found_start + found_len - 1, NULL); - __unlock_for_delalloc(&inode->vfs_inode, &folio->page, + __unlock_for_delalloc(&inode->vfs_inode, folio, found_start, found_start + found_len - 1); } From 5a43c5d8c0c274575811c6642ea076963d2983ac Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:20:02 -0400 Subject: [PATCH 895/991] btrfs: convert __process_pages_contig() to take a folio This operates mostly on folios, update it to take a folio for the locked folio instead of the page, rename from __process_pages_contig => __process_folios_contig. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 79b6212775777c..c91ec5661ee3d3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -187,9 +187,9 @@ static void process_one_page(struct btrfs_fs_info *fs_info, btrfs_folio_end_writer_lock(fs_info, folio, start, len); } -static void __process_pages_contig(struct address_space *mapping, - const struct page *locked_page, u64 start, u64 end, - unsigned long page_ops) +static void __process_folios_contig(struct address_space *mapping, + const struct folio *locked_folio, u64 start, + u64 end, unsigned long page_ops) { struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); pgoff_t start_index = start >> PAGE_SHIFT; @@ -207,8 +207,9 @@ static void __process_pages_contig(struct address_space *mapping, for (i = 0; i < found_folios; i++) { struct folio *folio = fbatch.folios[i]; - process_one_page(fs_info, &folio->page, locked_page, - page_ops, start, end); + process_one_page(fs_info, &folio->page, + &locked_folio->page, page_ops, start, + end); } folio_batch_release(&fbatch); cond_resched(); @@ -226,8 +227,8 @@ static noinline void __unlock_for_delalloc(const struct inode *inode, if (index == locked_folio->index && end_index == index) return; - __process_pages_contig(inode->i_mapping, &locked_folio->page, start, - end, PAGE_UNLOCK); + __process_folios_contig(inode->i_mapping, locked_folio, start, end, + PAGE_UNLOCK); } static noinline int lock_delalloc_folios(struct inode *inode, @@ -401,8 +402,8 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, { clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached); - __process_pages_contig(inode->vfs_inode.i_mapping, locked_page, - start, end, page_ops); + __process_folios_contig(inode->vfs_inode.i_mapping, + page_folio(locked_page), start, end, page_ops); } static bool btrfs_verify_folio(struct folio *folio, u64 start, u32 len) From 5e1e7c96217bb44f6e0712bfa1cceb62f1ed717d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:22:15 -0400 Subject: [PATCH 896/991] btrfs: convert process_one_page() to operate only on folios Now that this mostly uses folios, update it to take folios, use the folios that are passed in, and rename from process_one_page => process_one_folio. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c91ec5661ee3d3..d73496ce7f52c5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -164,11 +164,10 @@ void __cold extent_buffer_free_cachep(void) kmem_cache_destroy(extent_buffer_cache); } -static void process_one_page(struct btrfs_fs_info *fs_info, - struct page *page, const struct page *locked_page, - unsigned long page_ops, u64 start, u64 end) +static void process_one_folio(struct btrfs_fs_info *fs_info, + struct folio *folio, const struct folio *locked_folio, + unsigned long page_ops, u64 start, u64 end) { - struct folio *folio = page_folio(page); u32 len; ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX); @@ -183,7 +182,7 @@ static void process_one_page(struct btrfs_fs_info *fs_info, if (page_ops & PAGE_END_WRITEBACK) btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len); - if (page != locked_page && (page_ops & PAGE_UNLOCK)) + if (folio != locked_folio && (page_ops & PAGE_UNLOCK)) btrfs_folio_end_writer_lock(fs_info, folio, start, len); } @@ -207,9 +206,8 @@ static void __process_folios_contig(struct address_space *mapping, for (i = 0; i < found_folios; i++) { struct folio *folio = fbatch.folios[i]; - process_one_page(fs_info, &folio->page, - &locked_folio->page, page_ops, start, - end); + process_one_folio(fs_info, folio, locked_folio, + page_ops, start, end); } folio_batch_release(&fbatch); cond_resched(); From f07b9e2ce66768035f9a1dabd277980e913957e6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:29:18 -0400 Subject: [PATCH 897/991] btrfs: convert extent_clear_unlock_delalloc() to take a folio Instead of taking the locked page, take the locked folio so we can pass that into __process_folios_contig. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 6 +++--- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 25 ++++++++++++++----------- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d73496ce7f52c5..a1ba45690635a6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -394,14 +394,14 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, } void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, - const struct page *locked_page, + const struct folio *locked_folio, struct extent_state **cached, u32 clear_bits, unsigned long page_ops) { clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached); - __process_folios_contig(inode->vfs_inode.i_mapping, - page_folio(locked_page), start, end, page_ops); + __process_folios_contig(inode->vfs_inode.i_mapping, locked_folio, start, + end, page_ops); } static bool btrfs_verify_folio(struct folio *folio, u64 start, u32 len) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 1dd295e1b5a5d7..5d36031578ff9c 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -354,7 +354,7 @@ void set_extent_buffer_dirty(struct extent_buffer *eb); void set_extent_buffer_uptodate(struct extent_buffer *eb); void clear_extent_buffer_uptodate(struct extent_buffer *eb); void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, - const struct page *locked_page, + const struct folio *locked_folio, struct extent_state **cached, u32 bits_to_clear, unsigned long page_ops); int extent_invalidate_folio(struct extent_io_tree *tree, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 26a3d429d9a55d..3dff49ba445374 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -743,10 +743,10 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, if (ret == 0) locked_page = NULL; - extent_clear_unlock_delalloc(inode, offset, end, locked_page, &cached, - clear_flags, - PAGE_UNLOCK | PAGE_START_WRITEBACK | - PAGE_END_WRITEBACK); + extent_clear_unlock_delalloc(inode, offset, end, + page_folio(locked_page), &cached, + clear_flags, PAGE_UNLOCK | + PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); return ret; } @@ -1501,7 +1501,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, page_ops |= PAGE_SET_ORDERED; extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, - locked_page, &cached, + page_folio(locked_page), &cached, EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); if (num_bytes < cur_alloc_size) @@ -1560,7 +1560,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (!locked_page) mapping_set_error(inode->vfs_inode.i_mapping, ret); extent_clear_unlock_delalloc(inode, orig_start, start - 1, - locked_page, NULL, 0, page_ops); + page_folio(locked_page), NULL, 0, + page_ops); } /* @@ -1583,7 +1584,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (extent_reserved) { extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, - locked_page, &cached, + page_folio(locked_page), &cached, clear_bits, page_ops); btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); @@ -1598,8 +1599,9 @@ static noinline int cow_file_range(struct btrfs_inode *inode, */ if (start < end) { clear_bits |= EXTENT_CLEAR_DATA_RESV; - extent_clear_unlock_delalloc(inode, start, end, locked_page, - &cached, clear_bits, page_ops); + extent_clear_unlock_delalloc(inode, start, end, + page_folio(locked_page), &cached, + clear_bits, page_ops); btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); } return ret; @@ -2207,7 +2209,8 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, btrfs_put_ordered_extent(ordered); extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, - locked_page, &cached_state, + page_folio(locked_page), + &cached_state, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, PAGE_UNLOCK | PAGE_SET_ORDERED); @@ -2256,7 +2259,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, lock_extent(&inode->io_tree, cur_offset, end, &cached); extent_clear_unlock_delalloc(inode, cur_offset, end, - locked_page, &cached, + page_folio(locked_page), &cached, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | From 23dd7ebf85886a8cc619d6d1663654d4bc829159 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:31:44 -0400 Subject: [PATCH 898/991] btrfs: convert extent_write_locked_range() to take a folio This mostly uses folios, convert it to take a folio instead and update the callers to pass in the folio. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 4 ++-- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a1ba45690635a6..9ae17c9fd89b20 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2274,7 +2274,7 @@ static int extent_write_cache_pages(struct address_space *mapping, * already been ran (aka, ordered extent inserted) and all pages are still * locked. */ -void extent_write_locked_range(struct inode *inode, const struct page *locked_page, +void extent_write_locked_range(struct inode *inode, const struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty) { @@ -2316,7 +2316,7 @@ void extent_write_locked_range(struct inode *inode, const struct page *locked_pa } ASSERT(folio_test_locked(folio)); - if (pages_dirty && &folio->page != locked_page) + if (pages_dirty && folio != locked_folio) ASSERT(folio_test_dirty(folio)); ret = __extent_writepage_io(BTRFS_I(inode), folio, cur, cur_len, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 5d36031578ff9c..b38460279b9932 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -240,7 +240,7 @@ bool try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); int btrfs_read_folio(struct file *file, struct folio *folio); -void extent_write_locked_range(struct inode *inode, const struct page *locked_page, +void extent_write_locked_range(struct inode *inode, const struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty); int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3dff49ba445374..5761ccc92a448f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1758,7 +1758,8 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode, true, false); if (ret) return ret; - extent_write_locked_range(&inode->vfs_inode, locked_page, start, + extent_write_locked_range(&inode->vfs_inode, + page_folio(locked_page), start, done_offset, wbc, pages_dirty); start = done_offset + 1; } From ff5cca310fbb9ac6b44b4738ed6a573d07240009 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:34:15 -0400 Subject: [PATCH 899/991] btrfs: convert run_delalloc_cow() to take a folio We pass the folio into extent_write_locked_range, go ahead and take a folio to pass along, and update the callers to pass in a folio. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5761ccc92a448f..bcbaf2e65e5a99 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -116,7 +116,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback); static noinline int run_delalloc_cow(struct btrfs_inode *inode, - struct page *locked_page, u64 start, + struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty); @@ -1135,7 +1135,8 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, }; wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); - ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false); + ret = run_delalloc_cow(inode, page_folio(locked_page), start, end, + &wbc, false); wbc_detach_inode(&wbc); if (ret < 0) { btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); @@ -1746,7 +1747,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, * covered by the range. */ static noinline int run_delalloc_cow(struct btrfs_inode *inode, - struct page *locked_page, u64 start, + struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty) { @@ -1754,13 +1755,12 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode, int ret; while (start <= end) { - ret = cow_file_range(inode, locked_page, start, end, &done_offset, - true, false); + ret = cow_file_range(inode, &locked_folio->page, start, end, + &done_offset, true, false); if (ret) return ret; - extent_write_locked_range(&inode->vfs_inode, - page_folio(locked_page), start, - done_offset, wbc, pages_dirty); + extent_write_locked_range(&inode->vfs_inode, locked_folio, + start, done_offset, wbc, pages_dirty); start = done_offset + 1; } @@ -2311,8 +2311,8 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page return 1; if (zoned) - ret = run_delalloc_cow(inode, locked_page, start, end, wbc, - true); + ret = run_delalloc_cow(inode, page_folio(locked_page), start, + end, wbc, true); else ret = cow_file_range(inode, locked_page, start, end, NULL, false, false); From 518aeeb4a2ad23abd4e5d5ae1c3c39a83fab2a79 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 26 Jul 2024 15:25:45 -0400 Subject: [PATCH 900/991] btrfs: convert cow_file_range_inline() to take a folio Now that we want the folio in this function, convert it to take a folio directly and use that. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bcbaf2e65e5a99..fd3232472d32fd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -715,7 +715,7 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 offse } static noinline int cow_file_range_inline(struct btrfs_inode *inode, - struct page *locked_page, + struct folio *locked_folio, u64 offset, u64 end, size_t compressed_size, int compress_type, @@ -741,10 +741,9 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, } if (ret == 0) - locked_page = NULL; + locked_folio = NULL; - extent_clear_unlock_delalloc(inode, offset, end, - page_folio(locked_page), &cached, + extent_clear_unlock_delalloc(inode, offset, end, locked_folio, &cached, clear_flags, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); return ret; @@ -1365,8 +1364,9 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (!no_inline) { /* lets try to make an inline extent */ - ret = cow_file_range_inline(inode, locked_page, start, end, 0, - BTRFS_COMPRESS_NONE, NULL, false); + ret = cow_file_range_inline(inode, page_folio(locked_page), + start, end, 0, BTRFS_COMPRESS_NONE, + NULL, false); if (ret <= 0) { /* * We succeeded, return 1 so the caller knows we're done From a1b03de694ab499f5b3d038421e7f280c8487ab7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:37:29 -0400 Subject: [PATCH 901/991] btrfs: convert cow_file_range() to take a folio Convert this to take a folio and pass it into all of the various cleanup functions. Update the callers to pass in a folio instead. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 45 +++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fd3232472d32fd..6047fd35fe43f1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1307,21 +1307,21 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * allocate extents on disk for the range, and create ordered data structs * in ram to track those extents. * - * locked_page is the page that writepage had locked already. We use + * locked_folio is the folio that writepage had locked already. We use * it to make sure we don't do extra locks or unlocks. * - * When this function fails, it unlocks all pages except @locked_page. + * When this function fails, it unlocks all pages except @locked_folio. * * When this function successfully creates an inline extent, it returns 1 and - * unlocks all pages including locked_page and starts I/O on them. - * (In reality inline extents are limited to a single page, so locked_page is + * unlocks all pages including locked_folio and starts I/O on them. + * (In reality inline extents are limited to a single page, so locked_folio is * the only page handled anyway). * * When this function succeed and creates a normal extent, the page locking * status depends on the passed in flags: * * - If @keep_locked is set, all pages are kept locked. - * - Else all pages except for @locked_page are unlocked. + * - Else all pages except for @locked_folio are unlocked. * * When a failure happens in the second or later iteration of the * while-loop, the ordered extents created in previous iterations are kept @@ -1330,8 +1330,8 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * example. */ static noinline int cow_file_range(struct btrfs_inode *inode, - struct page *locked_page, u64 start, u64 end, - u64 *done_offset, + struct folio *locked_folio, u64 start, + u64 end, u64 *done_offset, bool keep_locked, bool no_inline) { struct btrfs_root *root = inode->root; @@ -1364,9 +1364,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (!no_inline) { /* lets try to make an inline extent */ - ret = cow_file_range_inline(inode, page_folio(locked_page), - start, end, 0, BTRFS_COMPRESS_NONE, - NULL, false); + ret = cow_file_range_inline(inode, locked_folio, start, end, 0, + BTRFS_COMPRESS_NONE, NULL, false); if (ret <= 0) { /* * We succeeded, return 1 so the caller knows we're done @@ -1502,7 +1501,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, page_ops |= PAGE_SET_ORDERED; extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, - page_folio(locked_page), &cached, + locked_folio, &cached, EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); if (num_bytes < cur_alloc_size) @@ -1555,14 +1554,13 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * function. * * However, in case of @keep_locked, we still need to unlock the pages - * (except @locked_page) to ensure all the pages are unlocked. + * (except @locked_folio) to ensure all the pages are unlocked. */ if (keep_locked && orig_start < start) { - if (!locked_page) + if (!locked_folio) mapping_set_error(inode->vfs_inode.i_mapping, ret); extent_clear_unlock_delalloc(inode, orig_start, start - 1, - page_folio(locked_page), NULL, 0, - page_ops); + locked_folio, NULL, 0, page_ops); } /* @@ -1585,8 +1583,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (extent_reserved) { extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, - page_folio(locked_page), &cached, - clear_bits, + locked_folio, &cached, clear_bits, page_ops); btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); start += cur_alloc_size; @@ -1600,9 +1597,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode, */ if (start < end) { clear_bits |= EXTENT_CLEAR_DATA_RESV; - extent_clear_unlock_delalloc(inode, start, end, - page_folio(locked_page), &cached, - clear_bits, page_ops); + extent_clear_unlock_delalloc(inode, start, end, locked_folio, + &cached, clear_bits, page_ops); btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); } return ret; @@ -1755,7 +1751,7 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode, int ret; while (start <= end) { - ret = cow_file_range(inode, &locked_folio->page, start, end, + ret = cow_file_range(inode, locked_folio, start, end, &done_offset, true, false); if (ret) return ret; @@ -1837,7 +1833,8 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, * is written out and unlocked directly and a normal NOCOW extent * doesn't work. */ - ret = cow_file_range(inode, locked_page, start, end, NULL, false, true); + ret = cow_file_range(inode, page_folio(locked_page), start, end, NULL, + false, true); ASSERT(ret != 1); return ret; } @@ -2314,8 +2311,8 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page ret = run_delalloc_cow(inode, page_folio(locked_page), start, end, wbc, true); else - ret = cow_file_range(inode, locked_page, start, end, NULL, - false, false); + ret = cow_file_range(inode, page_folio(locked_page), start, end, + NULL, false, false); out: if (ret < 0) From 5198c89c6fde6d88821f59416217bfa510898035 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:40:34 -0400 Subject: [PATCH 902/991] btrfs: convert fallback_to_cow() to take a folio With this we can pass the folio directly into cow_file_range(). Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6047fd35fe43f1..534b1dec375234 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1763,8 +1763,9 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode, return 1; } -static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, - const u64 start, const u64 end) +static int fallback_to_cow(struct btrfs_inode *inode, + struct folio *locked_folio, const u64 start, + const u64 end) { const bool is_space_ino = btrfs_is_free_space_inode(inode); const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root); @@ -1833,8 +1834,8 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, * is written out and unlocked directly and a normal NOCOW extent * doesn't work. */ - ret = cow_file_range(inode, page_folio(locked_page), start, end, NULL, - false, true); + ret = cow_file_range(inode, locked_folio, start, end, NULL, false, + true); ASSERT(ret != 1); return ret; } @@ -2151,7 +2152,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, * NOCOW, following one which needs to be COW'ed */ if (cow_start != (u64)-1) { - ret = fallback_to_cow(inode, locked_page, + ret = fallback_to_cow(inode, page_folio(locked_page), cow_start, found_key.offset - 1); cow_start = (u64)-1; if (ret) { @@ -2230,7 +2231,8 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, if (cow_start != (u64)-1) { cur_offset = end; - ret = fallback_to_cow(inode, locked_page, cow_start, end); + ret = fallback_to_cow(inode, page_folio(locked_page), cow_start, + end); cow_start = (u64)-1; if (ret) goto error; From 5bb565b684cc58ebc398e334acf33d89ce047afc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:42:23 -0400 Subject: [PATCH 903/991] btrfs: convert run_delalloc_nocow() to take a folio Now all of the functions that use locked_page in run_delalloc_nocow take a folio, update it to take a folio and update the caller. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 534b1dec375234..9fae22af047ee7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1989,7 +1989,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, * blocks on disk */ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, - struct page *locked_page, + struct folio *locked_folio, const u64 start, const u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -2152,8 +2152,8 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, * NOCOW, following one which needs to be COW'ed */ if (cow_start != (u64)-1) { - ret = fallback_to_cow(inode, page_folio(locked_page), - cow_start, found_key.offset - 1); + ret = fallback_to_cow(inode, locked_folio, cow_start, + found_key.offset - 1); cow_start = (u64)-1; if (ret) { btrfs_dec_nocow_writers(nocow_bg); @@ -2208,8 +2208,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, btrfs_put_ordered_extent(ordered); extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, - page_folio(locked_page), - &cached_state, + locked_folio, &cached_state, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, PAGE_UNLOCK | PAGE_SET_ORDERED); @@ -2231,8 +2230,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, if (cow_start != (u64)-1) { cur_offset = end; - ret = fallback_to_cow(inode, page_folio(locked_page), cow_start, - end); + ret = fallback_to_cow(inode, locked_folio, cow_start, end); cow_start = (u64)-1; if (ret) goto error; @@ -2259,7 +2257,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, lock_extent(&inode->io_tree, cur_offset, end, &cached); extent_clear_unlock_delalloc(inode, cur_offset, end, - page_folio(locked_page), &cached, + locked_folio, &cached, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | @@ -2300,7 +2298,8 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page start >= page_offset(locked_page) + PAGE_SIZE)); if (should_nocow(inode, start, end)) { - ret = run_delalloc_nocow(inode, locked_page, start, end); + ret = run_delalloc_nocow(inode, page_folio(locked_page), start, + end); goto out; } From 0ddf953bb170c2aef45f9ec92d695bc196a44a41 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:46:01 -0400 Subject: [PATCH 904/991] btrfs: convert btrfs_cleanup_ordered_extents() to use folios We walk through pages in this function and clear ordered, and the function for this uses folios. Update the function to use a folio for this whole operation. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9fae22af047ee7..d3345c323ba59d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -399,7 +399,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; u64 page_start = 0, page_end = 0; - struct page *page; + struct folio *folio; if (locked_page) { page_start = page_offset(locked_page); @@ -421,9 +421,9 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, index++; continue; } - page = find_get_page(inode->vfs_inode.i_mapping, index); + folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0); index++; - if (!page) + if (IS_ERR(folio)) continue; /* @@ -431,9 +431,9 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, * range, then btrfs_mark_ordered_io_finished() will handle * the ordered extent accounting for the range. */ - btrfs_folio_clamp_clear_ordered(inode->root->fs_info, - page_folio(page), offset, bytes); - put_page(page); + btrfs_folio_clamp_clear_ordered(inode->root->fs_info, folio, + offset, bytes); + folio_put(folio); } if (locked_page) { From 18a48c20ee3e2a11545d3993483edbf867853db7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:49:54 -0400 Subject: [PATCH 905/991] btrfs: convert btrfs_cleanup_ordered_extents() to take a folio Now that btrfs_cleanup_ordered_extents is operating mostly with folios, update it to use a folio instead of a page, and the update the function and the callers as appropriate. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d3345c323ba59d..15b3e368ce7f5d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -393,7 +393,7 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags) * extent (btrfs_finish_ordered_io()). */ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, - struct page *locked_page, + struct folio *locked_folio, u64 offset, u64 bytes) { unsigned long index = offset >> PAGE_SHIFT; @@ -401,9 +401,9 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, u64 page_start = 0, page_end = 0; struct folio *folio; - if (locked_page) { - page_start = page_offset(locked_page); - page_end = page_start + PAGE_SIZE - 1; + if (locked_folio) { + page_start = folio_pos(locked_folio); + page_end = page_start + folio_size(locked_folio) - 1; } while (index <= end_index) { @@ -417,7 +417,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, * btrfs_mark_ordered_io_finished() would skip the accounting * for the page range, and the ordered extent will never finish. */ - if (locked_page && index == (page_start >> PAGE_SHIFT)) { + if (locked_folio && index == (page_start >> PAGE_SHIFT)) { index++; continue; } @@ -436,9 +436,9 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, folio_put(folio); } - if (locked_page) { + if (locked_folio) { /* The locked page covers the full range, nothing needs to be done */ - if (bytes + offset <= page_start + PAGE_SIZE) + if (bytes + offset <= page_start + folio_size(locked_folio)) return; /* * In case this page belongs to the delalloc range being @@ -447,8 +447,9 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, * run_delalloc_range */ if (page_start >= offset && page_end <= (offset + bytes - 1)) { - bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; - offset = page_offset(locked_page) + PAGE_SIZE; + bytes = offset + bytes - folio_pos(locked_folio) - + folio_size(locked_folio); + offset = folio_pos(locked_folio) + folio_size(locked_folio); } } @@ -1138,7 +1139,8 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, &wbc, false); wbc_detach_inode(&wbc); if (ret < 0) { - btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); + btrfs_cleanup_ordered_extents(inode, page_folio(locked_page), + start, end - start + 1); if (locked_page) { const u64 page_start = page_offset(locked_page); @@ -2317,8 +2319,8 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page out: if (ret < 0) - btrfs_cleanup_ordered_extents(inode, locked_page, start, - end - start + 1); + btrfs_cleanup_ordered_extents(inode, page_folio(locked_page), + start, end - start + 1); return ret; } From 1adae8b7d161fd41f2f8ca622a3c976dbe9af232 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:52:57 -0400 Subject: [PATCH 906/991] btrfs: convert run_delalloc_compressed() to take a folio This just passes the page into the compressed machinery to keep track of the locked page. Update this to take a folio and convert it to a page where appropriate. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 15b3e368ce7f5d..3ee2c863adeed2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1653,7 +1653,7 @@ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_ } static bool run_delalloc_compressed(struct btrfs_inode *inode, - struct page *locked_page, u64 start, + struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1693,15 +1693,16 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, INIT_LIST_HEAD(&async_chunk[i].extents); /* - * The locked_page comes all the way from writepage and its - * the original page we were actually given. As we spread + * The locked_folio comes all the way from writepage and its + * the original folio we were actually given. As we spread * this large delalloc region across multiple async_chunk - * structs, only the first struct needs a pointer to locked_page + * structs, only the first struct needs a pointer to + * locked_folio. * * This way we don't need racey decisions about who is supposed * to unlock it. */ - if (locked_page) { + if (locked_folio) { /* * Depending on the compressibility, the pages might or * might not go through async. We want all of them to @@ -1711,10 +1712,10 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, * need full accuracy. Just account the whole thing * against the first page. */ - wbc_account_cgroup_owner(wbc, locked_page, + wbc_account_cgroup_owner(wbc, &locked_folio->page, cur_end - start); - async_chunk[i].locked_page = locked_page; - locked_page = NULL; + async_chunk[i].locked_page = &locked_folio->page; + locked_folio = NULL; } else { async_chunk[i].locked_page = NULL; } @@ -2307,7 +2308,8 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page if (btrfs_inode_can_compress(inode) && inode_need_compress(inode, start, end) && - run_delalloc_compressed(inode, locked_page, start, end, wbc)) + run_delalloc_compressed(inode, page_folio(locked_page), start, end, + wbc)) return 1; if (zoned) From db1099e5f361f31cf491c2301fe2063420034556 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:56:32 -0400 Subject: [PATCH 907/991] btrfs: convert btrfs_run_delalloc_range() to take a folio Now that every function that btrfs_run_delalloc_range calls takes a folio, update it to take a folio and update the callers. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/inode.c | 26 ++++++++++++-------------- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 3056c8aed8ef4f..5599b458a9a98e 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -596,7 +596,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, struct btrfs_trans_handle *trans, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); -int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, +int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc); int btrfs_writepage_cow_fixup(struct page *page); int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9ae17c9fd89b20..5ff38e3f28e6bf 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1254,7 +1254,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, if (ret >= 0) { /* No errors hit so far, run the current delalloc range. */ - ret = btrfs_run_delalloc_range(inode, &folio->page, + ret = btrfs_run_delalloc_range(inode, folio, found_start, found_start + found_len - 1, wbc); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3ee2c863adeed2..e189dc9b6a3b06 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2287,42 +2287,40 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) * Function to process delayed allocation (create CoW) for ranges which are * being touched for the first time. */ -int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, +int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc) { const bool zoned = btrfs_is_zoned(inode->root->fs_info); int ret; /* - * The range must cover part of the @locked_page, or a return of 1 + * The range must cover part of the @locked_folio, or a return of 1 * can confuse the caller. */ - ASSERT(!(end <= page_offset(locked_page) || - start >= page_offset(locked_page) + PAGE_SIZE)); + ASSERT(!(end <= folio_pos(locked_folio) || + start >= folio_pos(locked_folio) + folio_size(locked_folio))); if (should_nocow(inode, start, end)) { - ret = run_delalloc_nocow(inode, page_folio(locked_page), start, - end); + ret = run_delalloc_nocow(inode, locked_folio, start, end); goto out; } if (btrfs_inode_can_compress(inode) && inode_need_compress(inode, start, end) && - run_delalloc_compressed(inode, page_folio(locked_page), start, end, - wbc)) + run_delalloc_compressed(inode, locked_folio, start, end, wbc)) return 1; if (zoned) - ret = run_delalloc_cow(inode, page_folio(locked_page), start, - end, wbc, true); + ret = run_delalloc_cow(inode, locked_folio, start, end, wbc, + true); else - ret = cow_file_range(inode, page_folio(locked_page), start, end, - NULL, false, false); + ret = cow_file_range(inode, locked_folio, start, end, NULL, + false, false); out: if (ret < 0) - btrfs_cleanup_ordered_extents(inode, page_folio(locked_page), - start, end - start + 1); + btrfs_cleanup_ordered_extents(inode, locked_folio, start, + end - start + 1); return ret; } From fdfbc362e95d1f3b4f681f07965b19bade09b720 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 17:09:33 -0400 Subject: [PATCH 908/991] btrfs: convert struct async_chunk to hold a folio Instead of passing in the page for ->locked_page, make it hold a locked_folio and then update the users of async_chunk to act accordingly. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e189dc9b6a3b06..ceb7144ed0de6b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -762,7 +762,7 @@ struct async_extent { struct async_chunk { struct btrfs_inode *inode; - struct page *locked_page; + struct folio *locked_folio; u64 start; u64 end; blk_opf_t write_flags; @@ -1167,7 +1167,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, struct btrfs_ordered_extent *ordered; struct btrfs_file_extent file_extent; struct btrfs_key ins; - struct page *locked_page = NULL; + struct folio *locked_folio = NULL; struct extent_state *cached = NULL; struct extent_map *em; int ret = 0; @@ -1178,19 +1178,20 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, kthread_associate_blkcg(async_chunk->blkcg_css); /* - * If async_chunk->locked_page is in the async_extent range, we need to + * If async_chunk->locked_folio is in the async_extent range, we need to * handle it. */ - if (async_chunk->locked_page) { - u64 locked_page_start = page_offset(async_chunk->locked_page); - u64 locked_page_end = locked_page_start + PAGE_SIZE - 1; + if (async_chunk->locked_folio) { + u64 locked_folio_start = folio_pos(async_chunk->locked_folio); + u64 locked_folio_end = locked_folio_start + + folio_size(async_chunk->locked_folio) - 1; - if (!(start >= locked_page_end || end <= locked_page_start)) - locked_page = async_chunk->locked_page; + if (!(start >= locked_folio_end || end <= locked_folio_start)) + locked_folio = async_chunk->locked_folio; } if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { - submit_uncompressed_range(inode, async_extent, locked_page); + submit_uncompressed_range(inode, async_extent, &locked_folio->page); goto done; } @@ -1205,7 +1206,8 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, * non-contiguous space for the uncompressed size instead. So * fall back to uncompressed. */ - submit_uncompressed_range(inode, async_extent, locked_page); + submit_uncompressed_range(inode, async_extent, + &locked_folio->page); goto done; } @@ -1714,10 +1716,10 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, */ wbc_account_cgroup_owner(wbc, &locked_folio->page, cur_end - start); - async_chunk[i].locked_page = &locked_folio->page; + async_chunk[i].locked_folio = locked_folio; locked_folio = NULL; } else { - async_chunk[i].locked_page = NULL; + async_chunk[i].locked_folio = NULL; } if (blkcg_css != blkcg_root_css) { From b00db1cbd4b981983311513d75dca1c55a71ecc1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 17:13:17 -0400 Subject: [PATCH 909/991] btrfs: convert submit_uncompressed_range() to take a folio This mostly uses folios already, update it to take a folio and update the rest of the function to use the folio instead of the page. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ceb7144ed0de6b..b6baa78e65732b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1122,7 +1122,7 @@ static void free_async_extent_pages(struct async_extent *async_extent) static void submit_uncompressed_range(struct btrfs_inode *inode, struct async_extent *async_extent, - struct page *locked_page) + struct folio *locked_folio) { u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; @@ -1135,23 +1135,22 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, }; wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); - ret = run_delalloc_cow(inode, page_folio(locked_page), start, end, + ret = run_delalloc_cow(inode, locked_folio, start, end, &wbc, false); wbc_detach_inode(&wbc); if (ret < 0) { - btrfs_cleanup_ordered_extents(inode, page_folio(locked_page), + btrfs_cleanup_ordered_extents(inode, locked_folio, start, end - start + 1); - if (locked_page) { - const u64 page_start = page_offset(locked_page); + if (locked_folio) { + const u64 page_start = folio_pos(locked_folio); - set_page_writeback(locked_page); - end_page_writeback(locked_page); - btrfs_mark_ordered_io_finished(inode, - page_folio(locked_page), + folio_start_writeback(locked_folio); + folio_end_writeback(locked_folio); + btrfs_mark_ordered_io_finished(inode, locked_folio, page_start, PAGE_SIZE, !ret); - mapping_set_error(locked_page->mapping, ret); - unlock_page(locked_page); + mapping_set_error(locked_folio->mapping, ret); + folio_unlock(locked_folio); } } } @@ -1191,7 +1190,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, } if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { - submit_uncompressed_range(inode, async_extent, &locked_folio->page); + submit_uncompressed_range(inode, async_extent, locked_folio); goto done; } @@ -1206,8 +1205,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, * non-contiguous space for the uncompressed size instead. So * fall back to uncompressed. */ - submit_uncompressed_range(inode, async_extent, - &locked_folio->page); + submit_uncompressed_range(inode, async_extent, locked_folio); goto done; } From 7929ea39c2ffb9176ef1bdbfb925b42fcc22ac63 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 17:21:25 -0400 Subject: [PATCH 910/991] btrfs: convert btrfs_writepage_fixup_worker() to use a folio This function heavily messes with pages, instead update it to use a folio. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 54 +++++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b6baa78e65732b..da9b31a6cc1451 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2708,49 +2708,51 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; struct page *page = fixup->page; + struct folio *folio = page_folio(page); struct btrfs_inode *inode = fixup->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - u64 page_start = page_offset(page); - u64 page_end = page_offset(page) + PAGE_SIZE - 1; + u64 page_start = folio_pos(folio); + u64 page_end = folio_pos(folio) + folio_size(folio) - 1; int ret = 0; bool free_delalloc_space = true; /* * This is similar to page_mkwrite, we need to reserve the space before - * we take the page lock. + * we take the folio lock. */ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, - PAGE_SIZE); + folio_size(folio)); again: - lock_page(page); + folio_lock(folio); /* - * Before we queued this fixup, we took a reference on the page. - * page->mapping may go NULL, but it shouldn't be moved to a different + * Before we queued this fixup, we took a reference on the folio. + * folio->mapping may go NULL, but it shouldn't be moved to a different * address space. */ - if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { + if (!folio->mapping || !folio_test_dirty(folio) || + !folio_test_checked(folio)) { /* * Unfortunately this is a little tricky, either * - * 1) We got here and our page had already been dealt with and + * 1) We got here and our folio had already been dealt with and * we reserved our space, thus ret == 0, so we need to just * drop our space reservation and bail. This can happen the * first time we come into the fixup worker, or could happen * while waiting for the ordered extent. - * 2) Our page was already dealt with, but we happened to get an + * 2) Our folio was already dealt with, but we happened to get an * ENOSPC above from the btrfs_delalloc_reserve_space. In * this case we obviously don't have anything to release, but - * because the page was already dealt with we don't want to - * mark the page with an error, so make sure we're resetting + * because the folio was already dealt with we don't want to + * mark the folio with an error, so make sure we're resetting * ret to 0. This is why we have this check _before_ the ret * check, because we do not want to have a surprise ENOSPC - * when the page was already properly dealt with. + * when the folio was already properly dealt with. */ if (!ret) { - btrfs_delalloc_release_extents(inode, PAGE_SIZE); + btrfs_delalloc_release_extents(inode, folio_size(folio)); btrfs_delalloc_release_space(inode, data_reserved, - page_start, PAGE_SIZE, + page_start, folio_size(folio), true); } ret = 0; @@ -2758,7 +2760,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) } /* - * We can't mess with the page state unless it is locked, so now that + * We can't mess with the folio state unless it is locked, so now that * it is locked bail if we failed to make our space reservation. */ if (ret) @@ -2767,14 +2769,14 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) lock_extent(&inode->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ - if (PageOrdered(page)) + if (folio_test_ordered(folio)) goto out_reserved; ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); if (ordered) { unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); - unlock_page(page); + folio_unlock(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; @@ -2792,7 +2794,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) * * The page was dirty when we started, nothing should have cleaned it. */ - BUG_ON(!PageDirty(page)); + BUG_ON(!folio_test_dirty(folio)); free_delalloc_space = false; out_reserved: btrfs_delalloc_release_extents(inode, PAGE_SIZE); @@ -2806,14 +2808,14 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) * We hit ENOSPC or other errors. Update the mapping and page * to reflect the errors and clean the page. */ - mapping_set_error(page->mapping, ret); - btrfs_mark_ordered_io_finished(inode, page_folio(page), - page_start, PAGE_SIZE, !ret); - clear_page_dirty_for_io(page); + mapping_set_error(folio->mapping, ret); + btrfs_mark_ordered_io_finished(inode, folio, page_start, + folio_size(folio), !ret); + folio_clear_dirty_for_io(folio); } - btrfs_folio_clear_checked(fs_info, page_folio(page), page_start, PAGE_SIZE); - unlock_page(page); - put_page(page); + btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); + folio_unlock(folio); + folio_put(folio); kfree(fixup); extent_changeset_free(data_reserved); /* From d9d0a7bd47ffdff09cb6504a68e0d95486429f3c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 17:26:31 -0400 Subject: [PATCH 911/991] btrfs: convert btrfs_writepage_cow_fixup() to use folio Instead of a page, use a folio for btrfs_writepage_cow_fixup. We already have a folio at the only caller, and the fixup worker uses folios. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/inode.c | 31 ++++++++++++++++--------------- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 5599b458a9a98e..fc60c0cde47964 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -598,7 +598,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, loff_t actual_len, u64 *alloc_hint); int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc); -int btrfs_writepage_cow_fixup(struct page *page); +int btrfs_writepage_cow_fixup(struct folio *folio); int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type); int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 5ff38e3f28e6bf..6ba8867d2a1fe4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1410,7 +1410,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, ASSERT(start >= folio_pos(folio) && start + len <= folio_pos(folio) + folio_size(folio)); - ret = btrfs_writepage_cow_fixup(&folio->page); + ret = btrfs_writepage_cow_fixup(folio); if (ret) { /* Fixup worker will requeue */ folio_redirty_for_writepage(bio_ctrl->wbc, folio); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index da9b31a6cc1451..e81b221d33a752 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2828,33 +2828,34 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) /* * There are a few paths in the higher layers of the kernel that directly - * set the page dirty bit without asking the filesystem if it is a + * set the folio dirty bit without asking the filesystem if it is a * good idea. This causes problems because we want to make sure COW * properly happens and the data=ordered rules are followed. * * In our case any range that doesn't have the ORDERED bit set * hasn't been properly setup for IO. We kick off an async process * to fix it up. The async helper will wait for ordered extents, set - * the delalloc bit and make it safe to write the page. + * the delalloc bit and make it safe to write the folio. */ -int btrfs_writepage_cow_fixup(struct page *page) +int btrfs_writepage_cow_fixup(struct folio *folio) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_writepage_fixup *fixup; - /* This page has ordered extent covering it already */ - if (PageOrdered(page)) + /* This folio has ordered extent covering it already */ + if (folio_test_ordered(folio)) return 0; /* - * PageChecked is set below when we create a fixup worker for this page, - * don't try to create another one if we're already PageChecked() + * folio_checked is set below when we create a fixup worker for this + * folio, don't try to create another one if we're already + * folio_test_checked. * - * The extent_io writepage code will redirty the page if we send back + * The extent_io writepage code will redirty the foio if we send back * EAGAIN. */ - if (PageChecked(page)) + if (folio_test_checked(folio)) return -EAGAIN; fixup = kzalloc(sizeof(*fixup), GFP_NOFS); @@ -2864,14 +2865,14 @@ int btrfs_writepage_cow_fixup(struct page *page) /* * We are already holding a reference to this inode from * write_cache_pages. We need to hold it because the space reservation - * takes place outside of the page lock, and we can't trust - * page->mapping outside of the page lock. + * takes place outside of the folio lock, and we can't trust + * page->mapping outside of the folio lock. */ ihold(inode); - btrfs_folio_set_checked(fs_info, page_folio(page), page_offset(page), PAGE_SIZE); - get_page(page); + btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); + folio_get(folio); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL); - fixup->page = page; + fixup->page = &folio->page; fixup->inode = BTRFS_I(inode); btrfs_queue_work(fs_info->fixup_workers, &fixup->work); From 76e723084800b1ac7dfd75766c9e186e6f0c1f98 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 17:28:05 -0400 Subject: [PATCH 912/991] btrfs: convert struct btrfs_writepage_fixup to use a folio Now the fixup creator and consumer use folios, change this to use a folio as well. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e81b221d33a752..8ca07ca7dcd0ab 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2695,7 +2695,7 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, /* see btrfs_writepage_start_hook for details on why this is required */ struct btrfs_writepage_fixup { - struct page *page; + struct folio *folio; struct btrfs_inode *inode; struct btrfs_work work; }; @@ -2707,8 +2707,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; - struct page *page = fixup->page; - struct folio *folio = page_folio(page); + struct folio *folio = fixup->folio; struct btrfs_inode *inode = fixup->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 page_start = folio_pos(folio); @@ -2872,7 +2871,7 @@ int btrfs_writepage_cow_fixup(struct folio *folio) btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); folio_get(folio); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL); - fixup->page = &folio->page; + fixup->folio = folio; fixup->inode = BTRFS_I(inode); btrfs_queue_work(fs_info->fixup_workers, &fixup->work); From 7a851305ea20376efcb9b44826ed2ff1828e3a39 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 17:38:46 -0400 Subject: [PATCH 913/991] btrfs: convert uncompress_inline() to take a folio Update uncompress_inline to take a folio and update it's usage accordingly. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8ca07ca7dcd0ab..29b5c257f95a0a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6707,7 +6707,7 @@ static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, } static noinline int uncompress_inline(struct btrfs_path *path, - struct page *page, + struct folio *folio, struct btrfs_file_extent_item *item) { int ret; @@ -6729,7 +6729,8 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); max_size = min_t(unsigned long, PAGE_SIZE, max_size); - ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size); + ret = btrfs_decompress(compress_type, tmp, &folio->page, 0, inline_size, + max_size); /* * decompression code contains a memset to fill in any space between the end @@ -6740,7 +6741,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, */ if (max_size < PAGE_SIZE) - memzero_page(page, max_size, PAGE_SIZE - max_size); + folio_zero_range(folio, max_size, PAGE_SIZE - max_size); kfree(tmp); return ret; } @@ -6760,7 +6761,7 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path fi = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) - return uncompress_inline(path, page, fi); + return uncompress_inline(path, page_folio(page), fi); copy_size = min_t(u64, PAGE_SIZE, btrfs_file_extent_ram_bytes(path->nodes[0], fi)); From a08ab53ab895c209b8b1595cf17dd0184054a369 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 19:25:10 -0400 Subject: [PATCH 914/991] btrfs: convert read_inline_extent() to use a folio Instead of using a page, use a folio instead, take a folio as an argument, and update the callers appropriately. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 29b5c257f95a0a..eb3c8ccf733721 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6747,30 +6747,30 @@ static noinline int uncompress_inline(struct btrfs_path *path, } static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, - struct page *page) + struct folio *folio) { struct btrfs_file_extent_item *fi; void *kaddr; size_t copy_size; - if (!page || PageUptodate(page)) + if (!folio || folio_test_uptodate(folio)) return 0; - ASSERT(page_offset(page) == 0); + ASSERT(folio_pos(folio) == 0); fi = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) - return uncompress_inline(path, page_folio(page), fi); + return uncompress_inline(path, folio, fi); copy_size = min_t(u64, PAGE_SIZE, btrfs_file_extent_ram_bytes(path->nodes[0], fi)); - kaddr = kmap_local_page(page); + kaddr = kmap_local_folio(folio, 0); read_extent_buffer(path->nodes[0], kaddr, btrfs_file_extent_inline_start(fi), copy_size); kunmap_local(kaddr); if (copy_size < PAGE_SIZE) - memzero_page(page, copy_size, PAGE_SIZE - copy_size); + folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size); return 0; } @@ -6945,7 +6945,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE); ASSERT(em->len == fs_info->sectorsize); - ret = read_inline_extent(inode, path, page); + ret = read_inline_extent(inode, path, page_folio(page)); if (ret < 0) goto out; goto insert; From 22103adcd392b5ae4adb3c4870fd1dbb3c2683bc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 19:26:58 -0400 Subject: [PATCH 915/991] btrfs: convert btrfs_get_extent() to take a folio We only pass this into read_inline_extent, change it to take a folio and update the callers. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/inode.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index fc60c0cde47964..2d7f8da54d8a66 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -578,7 +578,7 @@ struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, struct btrfs_path *path); struct inode *btrfs_iget(u64 ino, struct btrfs_root *root); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, u64 start, u64 len); + struct folio *folio, u64 start, u64 len); int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6ba8867d2a1fe4..f4eb44298bd86b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -987,7 +987,7 @@ static struct extent_map *__get_extent_map(struct inode *inode, struct page *pag *em_cached = NULL; } - em = btrfs_get_extent(BTRFS_I(inode), page, start, len); + em = btrfs_get_extent(BTRFS_I(inode), page_folio(page), start, len); if (!IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index eb3c8ccf733721..05e74ef068968e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6792,7 +6792,7 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path * Return: ERR_PTR on error, non-NULL extent_map on success. */ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, u64 start, u64 len) + struct folio *folio, u64 start, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; @@ -6815,7 +6815,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, if (em) { if (em->start > start || em->start + em->len <= start) free_extent_map(em); - else if (em->disk_bytenr == EXTENT_MAP_INLINE && page) + else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio) free_extent_map(em); else goto out; @@ -6945,7 +6945,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE); ASSERT(em->len == fs_info->sectorsize); - ret = read_inline_extent(inode, path, page_folio(page)); + ret = read_inline_extent(inode, path, folio); if (ret < 0) goto out; goto insert; From 9d9bf933cf14c209ab88a34c3884c727c3619798 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 19:28:26 -0400 Subject: [PATCH 916/991] btrfs: convert __get_extent_map() to take a folio Now that btrfs_get_extent takes a folio, update __get_extent_map to take a folio as well. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f4eb44298bd86b..782a38370d0383 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -968,8 +968,9 @@ void clear_page_extent_mapped(struct page *page) folio_detach_private(folio); } -static struct extent_map *__get_extent_map(struct inode *inode, struct page *page, - u64 start, u64 len, struct extent_map **em_cached) +static struct extent_map *__get_extent_map(struct inode *inode, + struct folio *folio, u64 start, + u64 len, struct extent_map **em_cached) { struct extent_map *em; @@ -987,7 +988,7 @@ static struct extent_map *__get_extent_map(struct inode *inode, struct page *pag *em_cached = NULL; } - em = btrfs_get_extent(BTRFS_I(inode), page_folio(page), start, len); + em = btrfs_get_extent(BTRFS_I(inode), folio, start, len); if (!IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); @@ -1050,8 +1051,8 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, end_folio_read(folio, true, cur, iosize); break; } - em = __get_extent_map(inode, folio_page(folio, 0), cur, - end - cur + 1, em_cached); + em = __get_extent_map(inode, folio, cur, end - cur + 1, + em_cached); if (IS_ERR(em)) { unlock_extent(tree, cur, end, NULL); end_folio_read(folio, false, cur, end + 1 - cur); From 7d389c01c60876b6c69915919cbbde4405fdfa1a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 20:17:26 -0400 Subject: [PATCH 917/991] btrfs: convert find_next_dirty_byte() to take a folio We already use a folio some in this function, replace all page usage with the folio and update the function to take the folio as an argument. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 782a38370d0383..e56b62746a15f1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1348,9 +1348,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE. */ static void find_next_dirty_byte(const struct btrfs_fs_info *fs_info, - struct page *page, u64 *start, u64 *end) + struct folio *folio, u64 *start, u64 *end) { - struct folio *folio = page_folio(page); struct btrfs_subpage *subpage = folio_get_private(folio); struct btrfs_subpage_info *spi = fs_info->subpage_info; u64 orig_start = *start; @@ -1363,14 +1362,15 @@ static void find_next_dirty_byte(const struct btrfs_fs_info *fs_info, * For regular sector size == page size case, since one page only * contains one sector, we return the page offset directly. */ - if (!btrfs_is_subpage(fs_info, page->mapping)) { - *start = page_offset(page); - *end = page_offset(page) + PAGE_SIZE; + if (!btrfs_is_subpage(fs_info, folio->mapping)) { + *start = folio_pos(folio); + *end = folio_pos(folio) + folio_size(folio); return; } range_start_bit = spi->dirty_offset + - (offset_in_page(orig_start) >> fs_info->sectorsize_bits); + (offset_in_folio(folio, orig_start) >> + fs_info->sectorsize_bits); /* We should have the page locked, but just in case */ spin_lock_irqsave(&subpage->lock, flags); @@ -1381,8 +1381,8 @@ static void find_next_dirty_byte(const struct btrfs_fs_info *fs_info, range_start_bit -= spi->dirty_offset; range_end_bit -= spi->dirty_offset; - *start = page_offset(page) + range_start_bit * fs_info->sectorsize; - *end = page_offset(page) + range_end_bit * fs_info->sectorsize; + *start = folio_pos(folio) + range_start_bit * fs_info->sectorsize; + *end = folio_pos(folio) + range_end_bit * fs_info->sectorsize; } /* @@ -1443,7 +1443,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, break; } - find_next_dirty_byte(fs_info, &folio->page, &dirty_range_start, + find_next_dirty_byte(fs_info, folio, &dirty_range_start, &dirty_range_end); if (cur < dirty_range_start) { cur = dirty_range_start; From b10d400924e30be80511b0fb93e15cc69aec1995 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 20:20:24 -0400 Subject: [PATCH 918/991] btrfs: convert wait_subpage_spinlock() to only use a folio Currently this already uses a folio for most things, update it to take a folio and update all the page usage with the corresponding folio usage. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 05e74ef068968e..0e5db913d6bb47 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7187,13 +7187,12 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, * for subpage spinlock. So this function is to spin and wait for subpage * spinlock. */ -static void wait_subpage_spinlock(struct page *page) +static void wait_subpage_spinlock(struct folio *folio) { - struct btrfs_fs_info *fs_info = page_to_fs_info(page); - struct folio *folio = page_folio(page); + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, page->mapping)) + if (!btrfs_is_subpage(fs_info, folio->mapping)) return; ASSERT(folio_test_private(folio) && folio_get_private(folio)); @@ -7223,7 +7222,7 @@ static int btrfs_launder_folio(struct folio *folio) static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) { if (try_release_extent_mapping(&folio->page, gfp_flags)) { - wait_subpage_spinlock(&folio->page); + wait_subpage_spinlock(folio); clear_page_extent_mapped(&folio->page); return true; } @@ -7284,7 +7283,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * do double ordered extent accounting on the same folio. */ folio_wait_writeback(folio); - wait_subpage_spinlock(&folio->page); + wait_subpage_spinlock(folio); /* * For subpage case, we have call sites like From d7d92bb8a26c7e3d5db0033003de851ce1b25802 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 20:22:32 -0400 Subject: [PATCH 919/991] btrfs: convert btrfs_set_range_writeback() to use a folio We already use a lot of functions here that use folios, update the function to use __filemap_get_folio instead of find_get_page and then use the folio directly. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0e5db913d6bb47..aece69dc41ec1b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8958,19 +8958,19 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; - struct page *page; + struct folio *folio; u32 len; ASSERT(end + 1 - start <= U32_MAX); len = end + 1 - start; while (index <= end_index) { - page = find_get_page(inode->vfs_inode.i_mapping, index); - ASSERT(page); /* Pages should be in the extent_io_tree */ + folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0); + ASSERT(!IS_ERR(folio)); /* folios should be in the extent_io_tree */ /* This is for data, which doesn't yet support larger folio. */ - ASSERT(folio_order(page_folio(page)) == 0); - btrfs_folio_set_writeback(fs_info, page_folio(page), start, len); - put_page(page); + ASSERT(folio_order(folio) == 0); + btrfs_folio_set_writeback(fs_info, folio, start, len); + folio_put(folio); index++; } } From a4f88f3744b03f758f7bb1d607104c0f14ef7513 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 20:37:20 -0400 Subject: [PATCH 920/991] btrfs: convert insert_inline_extent() to use a folio We only use a page to copy in the data for the inline extent. Use a folio for this instead. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index aece69dc41ec1b..cd9290f86a4c8f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -495,7 +495,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; struct extent_buffer *leaf; - struct page *page = NULL; const u32 sectorsize = trans->fs_info->sectorsize; char *kaddr; unsigned long ptr; @@ -555,12 +554,16 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_compression(leaf, ei, compress_type); } else { - page = find_get_page(inode->vfs_inode.i_mapping, 0); + struct folio *folio; + + folio = __filemap_get_folio(inode->vfs_inode.i_mapping, + 0, 0, 0); + ASSERT(!IS_ERR(folio)); btrfs_set_file_extent_compression(leaf, ei, 0); - kaddr = kmap_local_page(page); + kaddr = kmap_local_folio(folio, 0); write_extent_buffer(leaf, kaddr, ptr, size); kunmap_local(kaddr); - put_page(page); + folio_put(folio); } btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); From f05689b401f6e4d6b9732635337f3be77311c347 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 20:39:35 -0400 Subject: [PATCH 921/991] btrfs: convert extent_range_clear_dirty_for_io() to use a folio Instead of getting a page and using that to clear dirty for io, use the folio helper and use the appropriate folio functions. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cd9290f86a4c8f..a9656e5529fb83 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -877,19 +877,19 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) { unsigned long end_index = end >> PAGE_SHIFT; - struct page *page; + struct folio *folio; int ret = 0; for (unsigned long index = start >> PAGE_SHIFT; index <= end_index; index++) { - page = find_get_page(inode->i_mapping, index); - if (unlikely(!page)) { + folio = __filemap_get_folio(inode->i_mapping, index, 0, 0); + if (IS_ERR(folio)) { if (!ret) - ret = -ENOENT; + ret = PTR_ERR(folio); continue; } - clear_page_dirty_for_io(page); - put_page(page); + folio_clear_dirty_for_io(folio); + folio_put(folio); } return ret; } From b7e008beba17e39fadeed943bb68862d887dff59 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 25 Jul 2024 10:46:01 +0100 Subject: [PATCH 922/991] btrfs: reschedule when updating chunk maps at the end of a device replace At the end of a device replace we must go over all the chunk maps and update their stripes to point to the target device instead of the source device. We iterate over the chunk maps while holding a write lock and we never reschedule, which can result in monopolizing a CPU for too long and blocking readers for too long (it's a rw lock, non-blocking). So improve on this by rescheduling if necessary. This is safe because at this point we are holding the chunk mutex, which means no new chunks can be allocated and therefore we don't risk missing a new chunk map that covers a range behind the last one we processed before rescheduling. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index f638c458d28537..20cf5e95f2bcd2 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -827,6 +827,14 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( u64 start = 0; int i; + /* + * The chunk mutex must be held so that no new chunks can be created + * while we are updating existing chunks. This guarantees we don't miss + * any new chunk that gets created for a range that falls before the + * range of the last chunk we processed. + */ + lockdep_assert_held(&fs_info->chunk_mutex); + write_lock(&fs_info->mapping_tree_lock); do { struct btrfs_chunk_map *map; @@ -839,6 +847,7 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( map->stripes[i].dev = tgtdev; start = map->start + map->chunk_len; btrfs_free_chunk_map(map); + cond_resched_rwlock_write(&fs_info->mapping_tree_lock); } while (start); write_unlock(&fs_info->mapping_tree_lock); } From 5ab146f486dd1d40a3c523d14be17ce8de85d314 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 25 Jul 2024 11:48:10 +0100 Subject: [PATCH 923/991] btrfs: more efficient chunk map iteration when device replace finishes When iterating the chunk maps when a device replace finishes we are doing a full rbtree search for each chunk map, which is not the most efficient thing to do, wasting CPU time. As we are holding a write lock on the tree during the whole iteration, we can simply start from the first node in the tree and then move to the next chunk map by doing a rb_next() call - the only exception is when we need to reschedule, in which case we have to do a full rbtree search since we dropped the write lock and the tree may have changed (chunk maps may have been removed and the tree got rebalanced). So just do that. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 20cf5e95f2bcd2..83d5cdd77f293e 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -824,8 +824,7 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( struct btrfs_device *srcdev, struct btrfs_device *tgtdev) { - u64 start = 0; - int i; + struct rb_node *node; /* * The chunk mutex must be held so that no new chunks can be created @@ -836,19 +835,34 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( lockdep_assert_held(&fs_info->chunk_mutex); write_lock(&fs_info->mapping_tree_lock); - do { + node = rb_first_cached(&fs_info->mapping_tree); + while (node) { + struct rb_node *next = rb_next(node); struct btrfs_chunk_map *map; + u64 next_start; - map = btrfs_find_chunk_map_nolock(fs_info, start, U64_MAX); - if (!map) - break; - for (i = 0; i < map->num_stripes; i++) + map = rb_entry(node, struct btrfs_chunk_map, rb_node); + next_start = map->start + map->chunk_len; + + for (int i = 0; i < map->num_stripes; i++) if (srcdev == map->stripes[i].dev) map->stripes[i].dev = tgtdev; - start = map->start + map->chunk_len; - btrfs_free_chunk_map(map); - cond_resched_rwlock_write(&fs_info->mapping_tree_lock); - } while (start); + + if (cond_resched_rwlock_write(&fs_info->mapping_tree_lock)) { + map = btrfs_find_chunk_map_nolock(fs_info, next_start, U64_MAX); + if (!map) + break; + node = &map->rb_node; + /* + * Drop the lookup reference since we are holding the + * lock in write mode and no one can remove the chunk + * map from the tree and drop its tree reference. + */ + btrfs_free_chunk_map(map); + } else { + node = next; + } + } write_unlock(&fs_info->mapping_tree_lock); } From 18eda6708fb478c6727201b1481fe80fcb6d14bb Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Wed, 31 Jul 2024 12:41:06 -0700 Subject: [PATCH 924/991] btrfs: add comment about locking in cow_file_range_inline() Add a comment to document the complicated locked_page unlock logic in cow_file_range_inline. The specifically tricky part is that a caller just up the stack converts ret == 0 to ret == 1 and then another caller far up the callstack handles ret == 1 as a success, AND returns without cleanup in that case, both of which "feel" unnatural and led to the original bug. Try to document that somewhat specific callstack logic here to explain the weird un-setting of locked_folio on success. Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/inode.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a9656e5529fb83..d25aeb844a642b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -744,6 +744,20 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, return ret; } + /* + * In the successful case (ret == 0 here), cow_file_range will return 1. + * + * Quite a bit further up the callstack in __extent_writepage, ret == 1 + * is treated as a short circuited success and does not unlock the folio, + * so we must do it here. + * + * In the failure case, the locked_folio does get unlocked by + * btrfs_folio_end_all_writers, which asserts that it is still locked + * at that point, so we must *not* unlock it here. + * + * The other two callsites in compress_file_range do not have a + * locked_folio, so they are not relevant to this logic. + */ if (ret == 0) locked_folio = NULL; From 38e3558003a522335f0de5ed8556cc9ee8baf18f Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 Jul 2024 22:43:03 +0200 Subject: [PATCH 925/991] btrfs: don't dump stripe-tree on lookup error This just creates unnecessary noise and doesn't provide any insights into debugging RAID stripe-tree related issues. Reviewed-by: Josef Bacik Reviewed-by: Qu Wenruo Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid-stripe-tree.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index bd06ff7956910f..c1c74f310e8b34 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -284,8 +284,6 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, if (ret > 0) ret = -ENOENT; if (ret && ret != -EIO && !stripe->is_scrub) { - if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) - btrfs_print_tree(leaf, 1); btrfs_err(fs_info, "cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s", logical, logical + *length, stripe->dev->devid, From 18d5fad376fbffa6779e91afd6b0be34743fe891 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 Jul 2024 22:43:04 +0200 Subject: [PATCH 926/991] btrfs: rename btrfs_io_stripe::is_scrub to rst_search_commit_root Rename 'btrfs_io_stripe::is_scrub' to 'rst_search_commit_root'. While 'is_scrub' describes the state of the io_stripe (it is a stripe submitted by scrub) it does not describe the purpose, namely looking at the commit root when searching RAID stripe-tree entries. Renaming the stripe to rst_search_commit_root describes this purpose. Reviewed-by: Josef Bacik Reviewed-by: Qu Wenruo Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 2 +- fs/btrfs/raid-stripe-tree.c | 4 ++-- fs/btrfs/scrub.c | 2 +- fs/btrfs/volumes.h | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index b4e31ae17cd95a..36d0e52faeecda 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -678,7 +678,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) blk_status_t ret; int error; - smap.is_scrub = !bbio->inode; + smap.rst_search_commit_root = !bbio->inode; btrfs_bio_counter_inc_blocked(fs_info); error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index c1c74f310e8b34..28a545367be76a 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -210,7 +210,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, if (!path) return -ENOMEM; - if (stripe->is_scrub) { + if (stripe->rst_search_commit_root) { path->skip_locking = 1; path->search_commit_root = 1; } @@ -283,7 +283,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, out: if (ret > 0) ret = -ENOENT; - if (ret && ret != -EIO && !stripe->is_scrub) { + if (ret && ret != -EIO && !stripe->rst_search_commit_root) { btrfs_err(fs_info, "cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s", logical, logical + *length, stripe->dev->devid, diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 0de9162ff48138..b3afa63658231a 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1694,7 +1694,7 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, (i << fs_info->sectorsize_bits); int err; - io_stripe.is_scrub = true; + io_stripe.rst_search_commit_root = true; stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits; /* * For RST cases, we need to manually split the bbio to diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index c947187539ddbd..03d2d60afe0cfa 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -444,7 +444,7 @@ struct btrfs_io_stripe { /* Block mapping. */ u64 physical; u64 length; - bool is_scrub; + bool rst_search_commit_root; /* For the endio handler. */ struct btrfs_io_context *bioc; }; From 3b73265d42e2b821ed8ea1b1e53affb62b39b128 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 Jul 2024 22:43:05 +0200 Subject: [PATCH 927/991] btrfs: set search_commit_root on stripe io in case of relocation Set rst_search_commit_root in the btrfs_io_stripe we're passing to btrfs_map_block() in case we're doing data relocation. Reviewed-by: Josef Bacik Reviewed-by: Qu Wenruo Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/bio.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 36d0e52faeecda..f6cb58d7f16a46 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -678,7 +678,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) blk_status_t ret; int error; - smap.rst_search_commit_root = !bbio->inode; + if (!bbio->inode || btrfs_is_data_reloc_root(inode->root)) + smap.rst_search_commit_root = true; + else + smap.rst_search_commit_root = false; btrfs_bio_counter_inc_blocked(fs_info); error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, From 21febadc2e044780f782b89b787cc3c556916a29 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 Jul 2024 22:43:06 +0200 Subject: [PATCH 928/991] btrfs: don't readahead the relocation inode on RST On relocation we're doing readahead on the relocation inode, but if the filesystem is backed by a RAID stripe tree we can get ENOENT (e.g. due to preallocated extents not being mapped in the RST) from the lookup. But readahead doesn't handle the error and submits invalid reads to the device, causing an assertion in the scatter-gather list code: BTRFS info (device nvme1n1): balance: start -d -m -s BTRFS info (device nvme1n1): relocating block group 6480920576 flags data|raid0 BTRFS error (device nvme1n1): cannot find raid-stripe for logical [6481928192, 6481969152] devid 2, profile raid0 ------------[ cut here ]------------ kernel BUG at include/linux/scatterlist.h:115! Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI CPU: 0 PID: 1012 Comm: btrfs Not tainted 6.10.0-rc7+ #567 RIP: 0010:__blk_rq_map_sg+0x339/0x4a0 RSP: 0018:ffffc90001a43820 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffea00045d4802 RDX: 0000000117520000 RSI: 0000000000000000 RDI: ffff8881027d1000 RBP: 0000000000003000 R08: ffffea00045d4902 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000001000 R12: ffff8881003d10b8 R13: ffffc90001a438f0 R14: 0000000000000000 R15: 0000000000003000 FS: 00007fcc048a6900(0000) GS:ffff88813bc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000002cd11000 CR3: 00000001109ea001 CR4: 0000000000370eb0 Call Trace: ? __die_body.cold+0x14/0x25 ? die+0x2e/0x50 ? do_trap+0xca/0x110 ? do_error_trap+0x65/0x80 ? __blk_rq_map_sg+0x339/0x4a0 ? exc_invalid_op+0x50/0x70 ? __blk_rq_map_sg+0x339/0x4a0 ? asm_exc_invalid_op+0x1a/0x20 ? __blk_rq_map_sg+0x339/0x4a0 nvme_prep_rq.part.0+0x9d/0x770 nvme_queue_rq+0x7d/0x1e0 __blk_mq_issue_directly+0x2a/0x90 ? blk_mq_get_budget_and_tag+0x61/0x90 blk_mq_try_issue_list_directly+0x56/0xf0 blk_mq_flush_plug_list.part.0+0x52b/0x5d0 __blk_flush_plug+0xc6/0x110 blk_finish_plug+0x28/0x40 read_pages+0x160/0x1c0 page_cache_ra_unbounded+0x109/0x180 relocate_file_extent_cluster+0x611/0x6a0 ? btrfs_search_slot+0xba4/0xd20 ? balance_dirty_pages_ratelimited_flags+0x26/0xb00 relocate_data_extent.constprop.0+0x134/0x160 relocate_block_group+0x3f2/0x500 btrfs_relocate_block_group+0x250/0x430 btrfs_relocate_chunk+0x3f/0x130 btrfs_balance+0x71b/0xef0 ? kmalloc_trace_noprof+0x13b/0x280 btrfs_ioctl+0x2c2e/0x3030 ? kvfree_call_rcu+0x1e6/0x340 ? list_lru_add_obj+0x66/0x80 ? mntput_no_expire+0x3a/0x220 __x64_sys_ioctl+0x96/0xc0 do_syscall_64+0x54/0x110 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7fcc04514f9b Code: Unable to access opcode bytes at 0x7fcc04514f71. RSP: 002b:00007ffeba923370 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007fcc04514f9b RDX: 00007ffeba923460 RSI: 00000000c4009420 RDI: 0000000000000003 RBP: 0000000000000000 R08: 0000000000000013 R09: 0000000000000001 R10: 00007fcc043fbba8 R11: 0000000000000246 R12: 00007ffeba924fc5 R13: 00007ffeba923460 R14: 0000000000000002 R15: 00000000004d4bb0 Modules linked in: ---[ end trace 0000000000000000 ]--- RIP: 0010:__blk_rq_map_sg+0x339/0x4a0 RSP: 0018:ffffc90001a43820 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffea00045d4802 RDX: 0000000117520000 RSI: 0000000000000000 RDI: ffff8881027d1000 RBP: 0000000000003000 R08: ffffea00045d4902 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000001000 R12: ffff8881003d10b8 R13: ffffc90001a438f0 R14: 0000000000000000 R15: 0000000000003000 FS: 00007fcc048a6900(0000) GS:ffff88813bc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fcc04514f71 CR3: 00000001109ea001 CR4: 0000000000370eb0 Kernel panic - not syncing: Fatal exception Kernel Offset: disabled ---[ end Kernel panic - not syncing: Fatal exception ]--- So in case of a relocation on a RAID stripe-tree based file system, skip the readahead. Reviewed-by: Josef Bacik Reviewed-by: Qu Wenruo Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 0533d0f82dc993..ea4ed85919ec8f 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -36,6 +36,7 @@ #include "relocation.h" #include "super.h" #include "tree-checker.h" +#include "raid-stripe-tree.h" /* * Relocation overview @@ -2965,21 +2966,34 @@ static int relocate_one_folio(struct reloc_control *rc, u64 folio_end; u64 cur; int ret; + const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags); ASSERT(index <= last_index); folio = filemap_lock_folio(inode->i_mapping, index); if (IS_ERR(folio)) { - page_cache_sync_readahead(inode->i_mapping, ra, NULL, - index, last_index + 1 - index); + + /* + * On relocation we're doing readahead on the relocation inode, + * but if the filesystem is backed by a RAID stripe tree we can + * get ENOENT (e.g. due to preallocated extents not being + * mapped in the RST) from the lookup. + * + * But readahead doesn't handle the error and submits invalid + * reads to the device, causing a assertion failures. + */ + if (!use_rst) + page_cache_sync_readahead(inode->i_mapping, ra, NULL, + index, last_index + 1 - index); folio = __filemap_get_folio(inode->i_mapping, index, - FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mask); if (IS_ERR(folio)) return PTR_ERR(folio); } WARN_ON(folio_order(folio)); - if (folio_test_readahead(folio)) + if (folio_test_readahead(folio) && !use_rst) page_cache_async_readahead(inode->i_mapping, ra, NULL, folio, last_index + 1 - index); From f9a4315922f8c41355c62736959fd90dd8858348 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 Jul 2024 22:43:07 +0200 Subject: [PATCH 929/991] btrfs: change RST lookup error message level to debug Now that RAID stripe-tree lookup failures are not treated as a fatal issue any more, change the RAID stripe-tree lookup error message to debug level. Reviewed-by: Josef Bacik Reviewed-by: Qu Wenruo Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid-stripe-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 28a545367be76a..4c859b550f6c33 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -284,7 +284,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, if (ret > 0) ret = -ENOENT; if (ret && ret != -EIO && !stripe->rst_search_commit_root) { - btrfs_err(fs_info, + btrfs_debug(fs_info, "cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s", logical, logical + *length, stripe->dev->devid, btrfs_bg_type_to_raid_name(map_type)); From 2248bb2f5f1664ecd66cd413c11affcabed83c9f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 5 Aug 2024 15:02:54 +0930 Subject: [PATCH 930/991] btrfs: make btrfs_is_subpage() to return false directly for 4K page size Btrfs only supports sectorsize 4K, 8K, 16K, 32K, 64K for now, thus for systems with 4K page size, there is no way the fs is subpage (sectorsize < PAGE_SIZE). So here we define btrfs_is_subpage() different according to the PAGE_SIZE: - PAGE_SIZE > 4K We may hit real subpage cases, define btrfs_is_subpage() as a regular function and do the usual checks. - PAGE_SIZE == 4K (no smaller PAGE_SIZE support AFAIK) There is no way the fs is subpage, so just define btrfs_is_subpage() as an inline function which always return false. This saves about 7K bytes for x86_64 debug builds: text data bss dec hex filename Before: 1484452 168693 25776 1678921 199e49 fs/btrfs/btrfs.ko After: 1476605 168445 25776 1670826 197eaa fs/btrfs/btrfs.ko Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/subpage.c | 2 ++ fs/btrfs/subpage.h | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 8ddd5fcbeb9391..631d96f1e90576 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -64,6 +64,7 @@ * This means a slightly higher tree locking latency. */ +#if PAGE_SIZE > SZ_4K bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping) { if (fs_info->sectorsize >= PAGE_SIZE) @@ -85,6 +86,7 @@ bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space return true; return false; } +#endif void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize) { diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 249396e118d0a8..5532cc4fac504d 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -5,6 +5,7 @@ #include #include +#include struct address_space; struct folio; @@ -88,7 +89,15 @@ enum btrfs_subpage_type { BTRFS_SUBPAGE_DATA, }; +#if PAGE_SIZE > SZ_4K bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping); +#else +static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, + struct address_space *mapping) +{ + return false; +} +#endif void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize); int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, From 66496a013b08416a0d02d537bf23b146f0e765ed Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 7 Aug 2024 17:13:58 +0100 Subject: [PATCH 931/991] btrfs: directly wake up cleaner kthread in the BTRFS_IOC_SYNC ioctl The BTRFS_IOC_SYNC ioctl wants to wake up the cleaner kthread so that it does any pending work (subvolume deletion, delayed iputs, etc), however it is waking up the transaction kthread, which in turn wakes up the cleaner. Since we don't have any transaction to commit, as any ongoing transaction was already committed when it called btrfs_sync_fs() and the goal is just to wake up the cleaner thread, directly wake up the cleaner instead of the transaction kthread. Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e0a664b8a46a86..ee01cc8288837e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4765,11 +4765,10 @@ long btrfs_ioctl(struct file *file, unsigned int return ret; ret = btrfs_sync_fs(inode->i_sb, 1); /* - * The transaction thread may want to do more work, - * namely it pokes the cleaner kthread that will start - * processing uncleaned subvols. + * There may be work for the cleaner kthread to do (subvolume + * deletion, delayed iputs, defrag inodes, etc), so wake it up. */ - wake_up_process(fs_info->transaction_kthread); + wake_up_process(fs_info->cleaner_kthread); return ret; } case BTRFS_IOC_START_SYNC: From 7ea2813e787964b5da19cd79db3a84b673d0a65c Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Tue, 13 Aug 2024 13:36:40 +0200 Subject: [PATCH 932/991] btrfs: reduce chunk_map lookups in btrfs_map_block() Currently we're calling btrfs_num_copies() before btrfs_get_chunk_map() in btrfs_map_block(). But btrfs_num_copies() itself does a chunk map lookup to be able to calculate the number of copies. So split out the code getting the number of copies from btrfs_num_copies() into a helper called btrfs_chunk_map_num_copies() and directly call it from btrfs_map_block() and btrfs_num_copies(). This saves us one rbtree lookup per btrfs_map_block() invocation. Reviewed-by: Qu Wenruo Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 49 +++++++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e07452207426aa..8f340ad1d93845 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5781,11 +5781,31 @@ void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info) write_unlock(&fs_info->mapping_tree_lock); } +static int btrfs_chunk_map_num_copies(const struct btrfs_chunk_map *map) +{ + enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(map->type); + + if (map->type & BTRFS_BLOCK_GROUP_RAID5) + return 2; + + /* + * There could be two corrupted data stripes, we need to loop retry in + * order to rebuild the correct data. + * + * Fail a stripe at a time on every retry except the stripe under + * reconstruction. + */ + if (map->type & BTRFS_BLOCK_GROUP_RAID6) + return map->num_stripes; + + /* Non-RAID56, use their ncopies from btrfs_raid_array. */ + return btrfs_raid_array[index].ncopies; +} + int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { struct btrfs_chunk_map *map; - enum btrfs_raid_types index; - int ret = 1; + int ret; map = btrfs_get_chunk_map(fs_info, logical, len); if (IS_ERR(map)) @@ -5797,22 +5817,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) */ return 1; - index = btrfs_bg_flags_to_raid_index(map->type); - - /* Non-RAID56, use their ncopies from btrfs_raid_array. */ - if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)) - ret = btrfs_raid_array[index].ncopies; - else if (map->type & BTRFS_BLOCK_GROUP_RAID5) - ret = 2; - else if (map->type & BTRFS_BLOCK_GROUP_RAID6) - /* - * There could be two corrupted data stripes, we need - * to loop retry in order to rebuild the correct data. - * - * Fail a stripe at a time on every retry except the - * stripe under reconstruction. - */ - ret = map->num_stripes; + ret = btrfs_chunk_map_num_copies(map); btrfs_free_chunk_map(map); return ret; } @@ -6462,14 +6467,14 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, io_geom.stripe_index = 0; io_geom.op = op; - num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize); - if (io_geom.mirror_num > num_copies) - return -EINVAL; - map = btrfs_get_chunk_map(fs_info, logical, *length); if (IS_ERR(map)) return PTR_ERR(map); + num_copies = btrfs_chunk_map_num_copies(map); + if (io_geom.mirror_num > num_copies) + return -EINVAL; + map_offset = logical - map->start; io_geom.raid56_full_stripe_start = (u64)-1; max_len = btrfs_max_io_len(map, map_offset, &io_geom); From 1f383d8a93c60d5094ed1819f9fcfec810c4a591 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 30 Aug 2024 16:35:48 +0930 Subject: [PATCH 933/991] btrfs: subpage: fix the bitmap dump which can cause bitmap corruption In commit 75258f20fb70 ("btrfs: subpage: dump extra subpage bitmaps for debug") an internal macro GET_SUBPAGE_BITMAP() is introduced to grab the bitmap of each attribute. But that commit is using bitmap_cut() which will do the left shift of the larger bitmap, causing incorrect values. Thankfully this bitmap_cut() is only called for debug usage, and so far it's not yet causing problem. Fix it to use bitmap_read() to only grab the desired sub-bitmap. Fixes: 75258f20fb70 ("btrfs: subpage: dump extra subpage bitmaps for debug") CC: stable@vger.kernel.org # 6.6+ Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/subpage.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 631d96f1e90576..f8795c3d227089 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -902,8 +902,14 @@ void btrfs_folio_end_all_writers(const struct btrfs_fs_info *fs_info, struct fol } #define GET_SUBPAGE_BITMAP(subpage, subpage_info, name, dst) \ - bitmap_cut(dst, subpage->bitmaps, 0, \ - subpage_info->name##_offset, subpage_info->bitmap_nr_bits) +{ \ + const int bitmap_nr_bits = subpage_info->bitmap_nr_bits; \ + \ + ASSERT(bitmap_nr_bits < BITS_PER_LONG); \ + *dst = bitmap_read(subpage->bitmaps, \ + subpage_info->name##_offset, \ + bitmap_nr_bits); \ +} void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) From cd0011794dea78f72e06f61116bab890c794f8c9 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 7 Aug 2024 14:31:54 +0930 Subject: [PATCH 934/991] btrfs: refactor __extent_writepage_io() to do sector-by-sector submission Unlike the bitmap usage inside raid56, for __extent_writepage_io() we handle the subpage submission not sector-by-sector, but for each dirty range we found. This is not a big deal normally, as the subpage complex code is already mostly optimized out by the compiler for x86_64. However for the sake of consistency and for the future of subpage sector-perfect compression support, this patch does: - Extract the sector submission code into submit_one_sector() - Add the needed code to extract the dirty bitmap for subpage case There is a small pitfall for non-subpage case, as we cleared page dirty before starting writeback, so we have to manually set the default dirty_bitmap to 1 for such case. - Use bitmap_and() to calculate the target sectors we need to submit This is done for both subpage and non-subpage cases, and will later be expanded to skip inline/compression ranges. For x86_64, the dirty bitmap will be fixed to 1, with the length of 1, so we're still doing the same workload per sector. For larger page sizes, the overhead will be a little larger, as previous we only need to do one extent_map lookup per-dirty-range, but now it will be one extent_map lookup per-sector. But that is the same frequency as x86_64, so we're just aligning the behavior to x86_64. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 198 +++++++++++++++++++------------------------ fs/btrfs/subpage.c | 17 ++++ fs/btrfs/subpage.h | 3 + 3 files changed, 107 insertions(+), 111 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e56b62746a15f1..822e2bf8bc990c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1333,56 +1333,68 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, } /* - * Find the first byte we need to write. + * Return 0 if we have submitted or queued the sector for submission. + * Return <0 for critical errors. * - * For subpage, one page can contain several sectors, and - * __extent_writepage_io() will just grab all extent maps in the page - * range and try to submit all non-inline/non-compressed extents. - * - * This is a big problem for subpage, we shouldn't re-submit already written - * data at all. - * This function will lookup subpage dirty bit to find which range we really - * need to submit. - * - * Return the next dirty range in [@start, @end). - * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE. + * Caller should make sure filepos < i_size and handle filepos >= i_size case. */ -static void find_next_dirty_byte(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 *start, u64 *end) +static int submit_one_sector(struct btrfs_inode *inode, + struct folio *folio, + u64 filepos, struct btrfs_bio_ctrl *bio_ctrl, + loff_t i_size) { - struct btrfs_subpage *subpage = folio_get_private(folio); - struct btrfs_subpage_info *spi = fs_info->subpage_info; - u64 orig_start = *start; - /* Declare as unsigned long so we can use bitmap ops */ - unsigned long flags; - int range_start_bit; - int range_end_bit; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_map *em; + u64 block_start; + u64 disk_bytenr; + u64 extent_offset; + u64 em_end; + const u32 sectorsize = fs_info->sectorsize; - /* - * For regular sector size == page size case, since one page only - * contains one sector, we return the page offset directly. - */ - if (!btrfs_is_subpage(fs_info, folio->mapping)) { - *start = folio_pos(folio); - *end = folio_pos(folio) + folio_size(folio); - return; - } + ASSERT(IS_ALIGNED(filepos, sectorsize)); + + /* @filepos >= i_size case should be handled by the caller. */ + ASSERT(filepos < i_size); + + em = btrfs_get_extent(inode, NULL, filepos, sectorsize); + if (IS_ERR(em)) + return PTR_ERR_OR_ZERO(em); - range_start_bit = spi->dirty_offset + - (offset_in_folio(folio, orig_start) >> - fs_info->sectorsize_bits); + extent_offset = filepos - em->start; + em_end = extent_map_end(em); + ASSERT(filepos <= em_end); + ASSERT(IS_ALIGNED(em->start, sectorsize)); + ASSERT(IS_ALIGNED(em->len, sectorsize)); - /* We should have the page locked, but just in case */ - spin_lock_irqsave(&subpage->lock, flags); - bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit, - spi->dirty_offset + spi->bitmap_nr_bits); - spin_unlock_irqrestore(&subpage->lock, flags); + block_start = extent_map_block_start(em); + disk_bytenr = extent_map_block_start(em) + extent_offset; - range_start_bit -= spi->dirty_offset; - range_end_bit -= spi->dirty_offset; + ASSERT(!extent_map_is_compressed(em)); + ASSERT(block_start != EXTENT_MAP_HOLE); + ASSERT(block_start != EXTENT_MAP_INLINE); - *start = folio_pos(folio) + range_start_bit * fs_info->sectorsize; - *end = folio_pos(folio) + range_end_bit * fs_info->sectorsize; + free_extent_map(em); + em = NULL; + + btrfs_set_range_writeback(inode, filepos, filepos + sectorsize - 1); + /* + * Above call should set the whole folio with writeback flag, even + * just for a single subpage sector. + * As long as the folio is properly locked and the range is correct, + * we should always get the folio with writeback flag. + */ + ASSERT(folio_test_writeback(folio)); + + /* + * Although the PageDirty bit is cleared before entering this + * function, subpage dirty bit is not cleared. + * So clear subpage dirty bit here so next time we won't submit + * folio for range already written to disk. + */ + btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); + submit_extent_folio(bio_ctrl, disk_bytenr, folio, + sectorsize, filepos - folio_pos(folio)); + return 0; } /* @@ -1400,16 +1412,24 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, loff_t i_size, int *nr_ret) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - u64 cur = start; - u64 end = start + len - 1; - u64 extent_offset; - u64 block_start; - struct extent_map *em; + unsigned long range_bitmap = 0; + /* + * This is the default value for sectorsize == PAGE_SIZE case. + * We known we need to write the dirty sector (aka the page), + * even if the page is not dirty (we cleared it before entering). + * + * For subpage cases we will get the correct bitmap later. + */ + unsigned long dirty_bitmap = 1; + unsigned int bitmap_size = 1; + const u64 folio_start = folio_pos(folio); + u64 cur; + int bit; int ret = 0; int nr = 0; - ASSERT(start >= folio_pos(folio) && - start + len <= folio_pos(folio) + folio_size(folio)); + ASSERT(start >= folio_start && + start + len <= folio_start + folio_size(folio)); ret = btrfs_writepage_cow_fixup(folio); if (ret) { @@ -1419,18 +1439,23 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, return 1; } + if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) { + ASSERT(fs_info->subpage_info); + btrfs_get_subpage_dirty_bitmap(fs_info, folio, &dirty_bitmap); + bitmap_size = fs_info->subpage_info->bitmap_nr_bits; + } + for (cur = start; cur < start + len; cur += fs_info->sectorsize) + set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap); + bitmap_and(&dirty_bitmap, &dirty_bitmap, &range_bitmap, bitmap_size); + bio_ctrl->end_io_func = end_bbio_data_write; - while (cur <= end) { - u32 len = end - cur + 1; - u64 disk_bytenr; - u64 em_end; - u64 dirty_range_start = cur; - u64 dirty_range_end; - u32 iosize; + + for_each_set_bit(bit, &dirty_bitmap, bitmap_size) { + cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits); if (cur >= i_size) { - btrfs_mark_ordered_io_finished(inode, folio, cur, len, - true); + btrfs_mark_ordered_io_finished(inode, folio, cur, + start + len - cur, true); /* * This range is beyond i_size, thus we don't need to * bother writing back. @@ -1439,62 +1464,13 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * writeback the sectors with subpage dirty bits, * causing writeback without ordered extent. */ - btrfs_folio_clear_dirty(fs_info, folio, cur, len); + btrfs_folio_clear_dirty(fs_info, folio, cur, + start + len - cur); break; } - - find_next_dirty_byte(fs_info, folio, &dirty_range_start, - &dirty_range_end); - if (cur < dirty_range_start) { - cur = dirty_range_start; - continue; - } - - em = btrfs_get_extent(inode, NULL, cur, len); - if (IS_ERR(em)) { - ret = PTR_ERR_OR_ZERO(em); + ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size); + if (ret < 0) goto out_error; - } - - extent_offset = cur - em->start; - em_end = extent_map_end(em); - ASSERT(cur <= em_end); - ASSERT(cur < end); - ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize)); - ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize)); - - block_start = extent_map_block_start(em); - disk_bytenr = extent_map_block_start(em) + extent_offset; - - ASSERT(!extent_map_is_compressed(em)); - ASSERT(block_start != EXTENT_MAP_HOLE); - ASSERT(block_start != EXTENT_MAP_INLINE); - - /* - * Note that em_end from extent_map_end() and dirty_range_end from - * find_next_dirty_byte() are all exclusive - */ - iosize = min(min(em_end, end + 1), dirty_range_end) - cur; - free_extent_map(em); - em = NULL; - - /* - * Although the PageDirty bit is cleared before entering this - * function, subpage dirty bit is not cleared. - * So clear subpage dirty bit here so next time we won't submit - * folio for range already written to disk. - */ - btrfs_folio_clear_dirty(fs_info, folio, cur, iosize); - btrfs_set_range_writeback(inode, cur, cur + iosize - 1); - if (!folio_test_writeback(folio)) { - btrfs_err(inode->root->fs_info, - "folio %lu not writeback, cur %llu end %llu", - folio->index, cur, end); - } - - submit_extent_folio(bio_ctrl, disk_bytenr, folio, - iosize, cur - folio_pos(folio)); - cur += iosize; nr++; } diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index f8795c3d227089..84a9953e32f374 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -946,3 +946,20 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, subpage_info->bitmap_nr_bits, &ordered_bitmap, subpage_info->bitmap_nr_bits, &checked_bitmap); } + +void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, + struct folio *folio, + unsigned long *ret_bitmap) +{ + struct btrfs_subpage_info *subpage_info = fs_info->subpage_info; + struct btrfs_subpage *subpage; + unsigned long flags; + + ASSERT(folio_test_private(folio) && folio_get_private(folio)); + ASSERT(subpage_info); + subpage = folio_get_private(folio); + + spin_lock_irqsave(&subpage->lock, flags); + GET_SUBPAGE_BITMAP(subpage, subpage_info, dirty, ret_bitmap); + spin_unlock_irqrestore(&subpage->lock, flags); +} diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 5532cc4fac504d..eee55e5a39526c 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -175,6 +175,9 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); +void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, + struct folio *folio, + unsigned long *ret_bitmap); void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); From 3de2540504af4ff1f5596261ffe8373c9e279c2f Mon Sep 17 00:00:00 2001 From: Junchao Sun Date: Fri, 7 Jun 2024 22:30:20 +0800 Subject: [PATCH 935/991] btrfs: qgroup: use goto style to handle errors in add_delayed_ref() Clean up resources using goto to get rid of repeated code. Reviewed-by: Qu Wenruo Signed-off-by: Junchao Sun Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 06a9e0542d708b..0bfa014b796dad 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -1005,18 +1005,13 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, return -ENOMEM; head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); - if (!head_ref) { - kmem_cache_free(btrfs_delayed_ref_node_cachep, node); - return -ENOMEM; - } + if (!head_ref) + goto free_node; if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); - if (!record) { - kmem_cache_free(btrfs_delayed_ref_node_cachep, node); - kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); - return -ENOMEM; - } + if (!record) + goto free_head_ref; } init_delayed_ref_common(fs_info, node, generic_ref); @@ -1052,6 +1047,12 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, if (qrecord_inserted) return btrfs_qgroup_trace_extent_post(trans, record); return 0; + +free_head_ref: + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); +free_node: + kmem_cache_free(btrfs_delayed_ref_node_cachep, node); + return -ENOMEM; } /* From 54a1a53e919073051db917c1c42924d46d17e188 Mon Sep 17 00:00:00 2001 From: Junchao Sun Date: Fri, 7 Jun 2024 22:30:21 +0800 Subject: [PATCH 936/991] btrfs: qgroup: use xarray to track dirty extents in transaction Use xarray to track dirty extents to reduce the size of the struct btrfs_qgroup_extent_record from 64 bytes to 40 bytes. The xarray is more cache line friendly, it also reduces the complexity of insertion and search code compared to rb tree. Another change introduced is about error handling. Before this patch, the result of btrfs_qgroup_trace_extent_nolock() is always a success. In this patch, because of this function calls the function xa_store() which has the possibility to fail, so mark qgroup as inconsistent if error happened and then free preallocated memory. Also we preallocate memory before spin_lock(), if memory preallcation failed, error handling is the same the existing code. Suggested-by: Qu Wenruo Signed-off-by: Junchao Sun Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 17 +++++++++-- fs/btrfs/delayed-ref.h | 4 +-- fs/btrfs/qgroup.c | 66 ++++++++++++++++++++---------------------- fs/btrfs/qgroup.h | 1 - fs/btrfs/transaction.c | 5 ++-- 5 files changed, 50 insertions(+), 43 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 0bfa014b796dad..ad9ef8312e4161 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -855,11 +855,17 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, /* Record qgroup extent info if provided */ if (qrecord) { - if (btrfs_qgroup_trace_extent_nolock(trans->fs_info, - delayed_refs, qrecord)) + int ret; + + ret = btrfs_qgroup_trace_extent_nolock(trans->fs_info, + delayed_refs, qrecord); + if (ret) { + /* Clean up if insertion fails or item exists. */ + xa_release(&delayed_refs->dirty_extents, qrecord->bytenr); kfree(qrecord); - else + } else { qrecord_inserted = true; + } } trace_add_delayed_ref_head(trans->fs_info, head_ref, action); @@ -1012,6 +1018,9 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) goto free_head_ref; + if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, + generic_ref->bytenr, GFP_NOFS)) + goto free_record; } init_delayed_ref_common(fs_info, node, generic_ref); @@ -1048,6 +1057,8 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, return btrfs_qgroup_trace_extent_post(trans, record); return 0; +free_record: + kfree(record); free_head_ref: kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); free_node: diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 05f634eb472d84..085f30968abaea 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -202,8 +202,8 @@ struct btrfs_delayed_ref_root { /* head ref rbtree */ struct rb_root_cached href_root; - /* dirty extent records */ - struct rb_root dirty_extent_root; + /* Track dirty extent records. */ + struct xarray dirty_extents; /* this spin lock protects the rbtree and the entries inside */ spinlock_t lock; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index feb8f9f2f3582d..c297909f150618 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1998,16 +1998,14 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, * * Return 0 for success insert * Return >0 for existing record, caller can free @record safely. - * Error is not possible + * Return <0 for insertion failure, caller can free @record safely. */ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_qgroup_extent_record *record) { - struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; - struct rb_node *parent_node = NULL; - struct btrfs_qgroup_extent_record *entry; - u64 bytenr = record->bytenr; + struct btrfs_qgroup_extent_record *existing, *ret; + unsigned long bytenr = record->bytenr; if (!btrfs_qgroup_full_accounting(fs_info)) return 1; @@ -2015,26 +2013,24 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, lockdep_assert_held(&delayed_refs->lock); trace_btrfs_qgroup_trace_extent(fs_info, record); - while (*p) { - parent_node = *p; - entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record, - node); - if (bytenr < entry->bytenr) { - p = &(*p)->rb_left; - } else if (bytenr > entry->bytenr) { - p = &(*p)->rb_right; - } else { - if (record->data_rsv && !entry->data_rsv) { - entry->data_rsv = record->data_rsv; - entry->data_rsv_refroot = - record->data_rsv_refroot; - } - return 1; + xa_lock(&delayed_refs->dirty_extents); + existing = xa_load(&delayed_refs->dirty_extents, bytenr); + if (existing) { + if (record->data_rsv && !existing->data_rsv) { + existing->data_rsv = record->data_rsv; + existing->data_rsv_refroot = record->data_rsv_refroot; } + xa_unlock(&delayed_refs->dirty_extents); + return 1; + } + + ret = __xa_store(&delayed_refs->dirty_extents, record->bytenr, record, GFP_ATOMIC); + xa_unlock(&delayed_refs->dirty_extents); + if (xa_is_err(ret)) { + qgroup_mark_inconsistent(fs_info); + return xa_err(ret); } - rb_link_node(&record->node, parent_node, p); - rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); return 0; } @@ -2141,6 +2137,11 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, if (!record) return -ENOMEM; + if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, bytenr, GFP_NOFS)) { + kfree(record); + return -ENOMEM; + } + delayed_refs = &trans->transaction->delayed_refs; record->bytenr = bytenr; record->num_bytes = num_bytes; @@ -2149,7 +2150,9 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, spin_lock(&delayed_refs->lock); ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record); spin_unlock(&delayed_refs->lock); - if (ret > 0) { + if (ret) { + /* Clean up if insertion fails or item exists. */ + xa_release(&delayed_refs->dirty_extents, record->bytenr); kfree(record); return 0; } @@ -3018,7 +3021,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) struct btrfs_qgroup_extent_record *record; struct btrfs_delayed_ref_root *delayed_refs; struct ulist *new_roots = NULL; - struct rb_node *node; + unsigned long index; u64 num_dirty_extents = 0; u64 qgroup_to_skip; int ret = 0; @@ -3028,10 +3031,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) delayed_refs = &trans->transaction->delayed_refs; qgroup_to_skip = delayed_refs->qgroup_to_skip; - while ((node = rb_first(&delayed_refs->dirty_extent_root))) { - record = rb_entry(node, struct btrfs_qgroup_extent_record, - node); - + xa_for_each(&delayed_refs->dirty_extents, index, record) { num_dirty_extents++; trace_btrfs_qgroup_account_extents(fs_info, record); @@ -3097,7 +3097,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) ulist_free(record->old_roots); ulist_free(new_roots); new_roots = NULL; - rb_erase(node, &delayed_refs->dirty_extent_root); + xa_erase(&delayed_refs->dirty_extents, index); kfree(record); } @@ -4874,15 +4874,13 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) { struct btrfs_qgroup_extent_record *entry; - struct btrfs_qgroup_extent_record *next; - struct rb_root *root; + unsigned long index; - root = &trans->delayed_refs.dirty_extent_root; - rbtree_postorder_for_each_entry_safe(entry, next, root, node) { + xa_for_each(&trans->delayed_refs.dirty_extents, index, entry) { ulist_free(entry->old_roots); kfree(entry); } - *root = RB_ROOT; + xa_destroy(&trans->delayed_refs.dirty_extents); } void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes) diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index deb479d176a96c..98adf4ec7b0172 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -125,7 +125,6 @@ struct btrfs_inode; * Record a dirty extent, and info qgroup to update quota on it */ struct btrfs_qgroup_extent_record { - struct rb_node node; u64 bytenr; u64 num_bytes; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5e6fff8e1003a7..0fc873af891f65 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -143,8 +143,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) BUG_ON(!list_empty(&transaction->list)); WARN_ON(!RB_EMPTY_ROOT( &transaction->delayed_refs.href_root.rb_root)); - WARN_ON(!RB_EMPTY_ROOT( - &transaction->delayed_refs.dirty_extent_root)); + WARN_ON(!xa_empty(&transaction->delayed_refs.dirty_extents)); if (transaction->delayed_refs.pending_csums) btrfs_err(transaction->fs_info, "pending csums is %llu", @@ -351,7 +350,7 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info, memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs)); cur_trans->delayed_refs.href_root = RB_ROOT_CACHED; - cur_trans->delayed_refs.dirty_extent_root = RB_ROOT; + xa_init(&cur_trans->delayed_refs.dirty_extents); atomic_set(&cur_trans->delayed_refs.num_entries, 0); /* From a3be6e89112bb0c2471e3b0d9a720be4cccd2ba8 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 14 Aug 2024 10:13:29 +0200 Subject: [PATCH 937/991] btrfs: send: fix grammar in comments Fix a few obvious grammar mistakes: a -> an, then -> than. Signed-off-by: Thorsten Blum Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 619fa0b8b3f6f9..7f48ba6c1c77a0 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -62,7 +62,7 @@ struct fs_path { /* * Average path length does not exceed 200 bytes, we'll have * better packing in the slab and higher chance to satisfy - * a allocation later during send. + * an allocation later during send. */ char pad[256]; }; @@ -1136,7 +1136,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, /* * Start with a small buffer (1 page). If later we end up needing more * space, which can happen for xattrs on a fs with a leaf size greater - * then the page size, attempt to increase the buffer. Typically xattr + * than the page size, attempt to increase the buffer. Typically xattr * values are small. */ buf_len = PATH_MAX; From 521551692488548e22d680a05119c57493f0e01b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 14 Aug 2024 10:50:21 +0930 Subject: [PATCH 938/991] btrfs: remove the nr_ret parameter from __extent_writepage_io() The parameter @nr_ret is used to tell the caller how many sectors have been submitted for IO. Then callers check @nr_ret value to determine if we need to manually clear the PAGECACHE_TAG_DIRTY, as if we submitted no sector (e.g. all sectors are beyond i_size) there is no folio_start_writeback() called thus PAGECACHE_TAG_DIRTY tag will not be cleared. Remove this parameter by: - Moving the btrfs_folio_clear_writeback() call into __extent_writepage_io() So that if we didn't submit any IO, then manually call btrfs_folio_set_writeback() to clear PAGECACHE_TAG_DIRTY when the page is no longer dirty. - Use a bool to record if we have submitted any sector Instead of an int. - Use subpage compatible helpers to end folio writeback. This brings no change to the behavior, just for the sake of consistency. As for the call site inside __extent_writepage(), we're always called for the whole page, so the existing full page helper folio_(start|end)_writeback() is totally fine. For the call site inside extent_write_locked_range(), although we can have subpage range, folio_start_writeback() will only clear PAGECACHE_TAG_DIRTY if the page is no longer dirty, and the full folio will still be dirty if there is any subpage dirty range. Only when the last dirty subpage sector is cleared, the folio_start_writeback() will clear PAGECACHE_TAG_DIRTY. So no matter if we call the full page or subpage helper, the result is still the same, then just use the subpage helpers for consistency. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 42 +++++++++++++++++------------------------- 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 822e2bf8bc990c..6083bed89df222 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1409,7 +1409,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, struct folio *folio, u64 start, u32 len, struct btrfs_bio_ctrl *bio_ctrl, - loff_t i_size, int *nr_ret) + loff_t i_size) { struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long range_bitmap = 0; @@ -1422,11 +1422,11 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, */ unsigned long dirty_bitmap = 1; unsigned int bitmap_size = 1; + bool submitted_io = false; const u64 folio_start = folio_pos(folio); u64 cur; int bit; int ret = 0; - int nr = 0; ASSERT(start >= folio_start && start + len <= folio_start + folio_size(folio)); @@ -1470,20 +1470,24 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, } ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size); if (ret < 0) - goto out_error; - nr++; + goto out; + submitted_io = true; } btrfs_folio_assert_not_dirty(fs_info, folio, start, len); - *nr_ret = nr; - return 0; - -out_error: +out: /* - * If we finish without problem, we should not only clear folio dirty, - * but also empty subpage dirty bits + * If we didn't submitted any sector (>= i_size), folio dirty get + * cleared but PAGECACHE_TAG_DIRTY is not cleared (only cleared + * by folio_start_writeback() if the folio is not dirty). + * + * Here we set writeback and clear for the range. If the full folio + * is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag. */ - *nr_ret = nr; + if (!submitted_io) { + btrfs_folio_set_writeback(fs_info, folio, start, len); + btrfs_folio_clear_writeback(fs_info, folio, start, len); + } return ret; } @@ -1501,7 +1505,6 @@ static int __extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ct struct inode *inode = folio->mapping->host; const u64 page_start = folio_pos(folio); int ret; - int nr = 0; size_t pg_offset; loff_t i_size = i_size_read(inode); unsigned long end_index = i_size >> PAGE_SHIFT; @@ -1532,18 +1535,13 @@ static int __extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ct goto done; ret = __extent_writepage_io(BTRFS_I(inode), folio, folio_pos(folio), - PAGE_SIZE, bio_ctrl, i_size, &nr); + PAGE_SIZE, bio_ctrl, i_size); if (ret == 1) return 0; bio_ctrl->wbc->nr_to_write--; done: - if (nr == 0) { - /* make sure the mapping tag for page dirty gets cleared */ - folio_start_writeback(folio); - folio_end_writeback(folio); - } if (ret) { btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio, page_start, PAGE_SIZE, !ret); @@ -2276,7 +2274,6 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); u32 cur_len = cur_end + 1 - cur; struct folio *folio; - int nr = 0; folio = __filemap_get_folio(mapping, cur >> PAGE_SHIFT, 0, 0); @@ -2297,15 +2294,10 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f ASSERT(folio_test_dirty(folio)); ret = __extent_writepage_io(BTRFS_I(inode), folio, cur, cur_len, - &bio_ctrl, i_size, &nr); + &bio_ctrl, i_size); if (ret == 1) goto next_page; - /* Make sure the mapping tag for page dirty gets cleared. */ - if (nr == 0) { - btrfs_folio_set_writeback(fs_info, folio, cur, cur_len); - btrfs_folio_clear_writeback(fs_info, folio, cur, cur_len); - } if (ret) { btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio, cur, cur_len, !ret); From 03610e3f1b3948e0823bcecbdc91648b5cab44d1 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 26 Aug 2024 15:44:50 +0930 Subject: [PATCH 939/991] btrfs: subpage: remove btrfs_fs_info::subpage_info member The member btrfs_fs_info::subpage_info stores the cached bitmap start position inside the merged bitmap. However in reality there is only one thing depending on the sectorsize, bitmap_nr_bits, which records the number of sectors that fit inside a page. The sequence of sub-bitmaps have fixed order, thus it's just a quick multiplication to calculate the start position of each sub-bitmaps. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 14 +------ fs/btrfs/extent_io.c | 8 ++-- fs/btrfs/fs.h | 2 +- fs/btrfs/subpage.c | 93 ++++++++++++++------------------------------ fs/btrfs/subpage.h | 43 +++++++------------- 5 files changed, 50 insertions(+), 110 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a6f5441e62d103..612460e07b2e74 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1285,7 +1285,6 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); - kfree(fs_info->subpage_info); kvfree(fs_info); } @@ -3322,6 +3321,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); + fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits); fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; @@ -3346,20 +3346,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize); - if (sectorsize < PAGE_SIZE) { - struct btrfs_subpage_info *subpage_info; - + if (sectorsize < PAGE_SIZE) btrfs_warn(fs_info, "read-write for sector size %u with page size %lu is experimental", sectorsize, PAGE_SIZE); - subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL); - if (!subpage_info) { - ret = -ENOMEM; - goto fail_alloc; - } - btrfs_init_subpage_info(subpage_info, sectorsize); - fs_info->subpage_info = subpage_info; - } ret = btrfs_init_workqueues(fs_info); if (ret) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6083bed89df222..643dd948054f5e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1440,9 +1440,9 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, } if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) { - ASSERT(fs_info->subpage_info); + ASSERT(fs_info->sectors_per_page > 1); btrfs_get_subpage_dirty_bitmap(fs_info, folio, &dirty_bitmap); - bitmap_size = fs_info->subpage_info->bitmap_nr_bits; + bitmap_size = fs_info->sectors_per_page; } for (cur = start; cur < start + len; cur += fs_info->sectorsize) set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap); @@ -1827,7 +1827,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; /* Lock and write each dirty extent buffers in the range */ - while (bit_start < fs_info->subpage_info->bitmap_nr_bits) { + while (bit_start < fs_info->sectors_per_page) { struct btrfs_subpage *subpage = folio_get_private(folio); struct extent_buffer *eb; unsigned long flags; @@ -1843,7 +1843,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) break; } spin_lock_irqsave(&subpage->lock, flags); - if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset, + if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * fs_info->sectors_per_page, subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); spin_unlock(&page->mapping->i_private_lock); diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 3d6d4b50322084..79f64e383eddf8 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -703,8 +703,8 @@ struct btrfs_fs_info { * running. */ refcount_t scrub_workers_refcnt; + u32 sectors_per_page; struct workqueue_struct *scrub_workers; - struct btrfs_subpage_info *subpage_info; struct btrfs_discard_ctl discard_ctl; diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 84a9953e32f374..1dda17b5ab12c0 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -88,37 +88,6 @@ bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space } #endif -void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize) -{ - unsigned int cur = 0; - unsigned int nr_bits; - - ASSERT(IS_ALIGNED(PAGE_SIZE, sectorsize)); - - nr_bits = PAGE_SIZE / sectorsize; - subpage_info->bitmap_nr_bits = nr_bits; - - subpage_info->uptodate_offset = cur; - cur += nr_bits; - - subpage_info->dirty_offset = cur; - cur += nr_bits; - - subpage_info->writeback_offset = cur; - cur += nr_bits; - - subpage_info->ordered_offset = cur; - cur += nr_bits; - - subpage_info->checked_offset = cur; - cur += nr_bits; - - subpage_info->locked_offset = cur; - cur += nr_bits; - - subpage_info->total_nr_bits = cur; -} - int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, enum btrfs_subpage_type type) { @@ -165,7 +134,7 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, ASSERT(fs_info->sectorsize < PAGE_SIZE); real_size = struct_size(ret, bitmaps, - BITS_TO_LONGS(fs_info->subpage_info->total_nr_bits)); + BITS_TO_LONGS(btrfs_bitmap_nr_max * fs_info->sectors_per_page)); ret = kzalloc(real_size, GFP_NOFS); if (!ret) return ERR_PTR(-ENOMEM); @@ -248,7 +217,7 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, \ btrfs_subpage_assert(fs_info, folio, start, len); \ __start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ - __start_bit += fs_info->subpage_info->name##_offset; \ + __start_bit += fs_info->sectors_per_page * btrfs_bitmap_nr_##name; \ __start_bit; \ }) @@ -420,13 +389,13 @@ void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, #define subpage_test_bitmap_all_set(fs_info, subpage, name) \ bitmap_test_range_all_set(subpage->bitmaps, \ - fs_info->subpage_info->name##_offset, \ - fs_info->subpage_info->bitmap_nr_bits) + fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \ + fs_info->sectors_per_page) #define subpage_test_bitmap_all_zero(fs_info, subpage, name) \ bitmap_test_range_all_zero(subpage->bitmaps, \ - fs_info->subpage_info->name##_offset, \ - fs_info->subpage_info->bitmap_nr_bits) + fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \ + fs_info->sectors_per_page) void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) @@ -805,7 +774,7 @@ void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); bitmap_set(subpage->bitmaps, start_bit, nbits); ret = atomic_add_return(nbits, &subpage->writers); - ASSERT(ret <= fs_info->subpage_info->bitmap_nr_bits); + ASSERT(ret <= fs_info->sectors_per_page); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -821,14 +790,13 @@ bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 search_start, u64 *found_start_ret, u32 *found_len_ret) { - struct btrfs_subpage_info *subpage_info = fs_info->subpage_info; struct btrfs_subpage *subpage = folio_get_private(folio); + const u32 sectors_per_page = fs_info->sectors_per_page; const unsigned int len = PAGE_SIZE - offset_in_page(search_start); const unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, locked, search_start, len); - const unsigned int locked_bitmap_start = subpage_info->locked_offset; - const unsigned int locked_bitmap_end = locked_bitmap_start + - subpage_info->bitmap_nr_bits; + const unsigned int locked_bitmap_start = sectors_per_page * btrfs_bitmap_nr_locked; + const unsigned int locked_bitmap_end = locked_bitmap_start + sectors_per_page; unsigned long flags; int first_zero; int first_set; @@ -901,21 +869,21 @@ void btrfs_folio_end_all_writers(const struct btrfs_fs_info *fs_info, struct fol } } -#define GET_SUBPAGE_BITMAP(subpage, subpage_info, name, dst) \ +#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \ { \ - const int bitmap_nr_bits = subpage_info->bitmap_nr_bits; \ + const int sectors_per_page = fs_info->sectors_per_page; \ \ - ASSERT(bitmap_nr_bits < BITS_PER_LONG); \ + ASSERT(sectors_per_page < BITS_PER_LONG); \ *dst = bitmap_read(subpage->bitmaps, \ - subpage_info->name##_offset, \ - bitmap_nr_bits); \ + sectors_per_page * btrfs_bitmap_nr_##name, \ + sectors_per_page); \ } void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage_info *subpage_info = fs_info->subpage_info; struct btrfs_subpage *subpage; + const u32 sectors_per_page = fs_info->sectors_per_page; unsigned long uptodate_bitmap; unsigned long dirty_bitmap; unsigned long writeback_bitmap; @@ -924,42 +892,41 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, unsigned long flags; ASSERT(folio_test_private(folio) && folio_get_private(folio)); - ASSERT(subpage_info); + ASSERT(sectors_per_page > 1); subpage = folio_get_private(folio); spin_lock_irqsave(&subpage->lock, flags); - GET_SUBPAGE_BITMAP(subpage, subpage_info, uptodate, &uptodate_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, dirty, &dirty_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, writeback, &writeback_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, ordered, &ordered_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, checked, &checked_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, locked, &checked_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, uptodate, &uptodate_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, &dirty_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, writeback, &writeback_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, ordered, &ordered_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, checked, &checked_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &checked_bitmap); spin_unlock_irqrestore(&subpage->lock, flags); dump_page(folio_page(folio, 0), "btrfs subpage dump"); btrfs_warn(fs_info, "start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", start, len, folio_pos(folio), - subpage_info->bitmap_nr_bits, &uptodate_bitmap, - subpage_info->bitmap_nr_bits, &dirty_bitmap, - subpage_info->bitmap_nr_bits, &writeback_bitmap, - subpage_info->bitmap_nr_bits, &ordered_bitmap, - subpage_info->bitmap_nr_bits, &checked_bitmap); + sectors_per_page, &uptodate_bitmap, + sectors_per_page, &dirty_bitmap, + sectors_per_page, &writeback_bitmap, + sectors_per_page, &ordered_bitmap, + sectors_per_page, &checked_bitmap); } void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, struct folio *folio, unsigned long *ret_bitmap) { - struct btrfs_subpage_info *subpage_info = fs_info->subpage_info; struct btrfs_subpage *subpage; unsigned long flags; ASSERT(folio_test_private(folio) && folio_get_private(folio)); - ASSERT(subpage_info); + ASSERT(fs_info->sectors_per_page > 1); subpage = folio_get_private(folio); spin_lock_irqsave(&subpage->lock, flags); - GET_SUBPAGE_BITMAP(subpage, subpage_info, dirty, ret_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, ret_bitmap); spin_unlock_irqrestore(&subpage->lock, flags); } diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index eee55e5a39526c..b67cd5f6539d80 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -19,39 +19,23 @@ struct btrfs_fs_info; * * This structure records how they are organized in the bitmap: * - * /- uptodate_offset /- dirty_offset /- ordered_offset + * /- uptodate /- dirty /- ordered * | | | * v v v * |u|u|u|u|........|u|u|d|d|.......|d|d|o|o|.......|o|o| - * |<- bitmap_nr_bits ->| - * |<----------------- total_nr_bits ------------------>| + * |< sectors_per_page >| + * + * Unlike regular macro-like enums, here we do not go upper-case names, as + * these names will be utilized in various macros to define function names. */ -struct btrfs_subpage_info { - /* Number of bits for each bitmap */ - unsigned int bitmap_nr_bits; - - /* Total number of bits for the whole bitmap */ - unsigned int total_nr_bits; - - /* - * *_offset indicates where the bitmap starts, the length is always - * @bitmap_size, which is calculated from PAGE_SIZE / sectorsize. - */ - unsigned int uptodate_offset; - unsigned int dirty_offset; - unsigned int writeback_offset; - unsigned int ordered_offset; - unsigned int checked_offset; - - /* - * For locked bitmaps, normally it's subpage representation for folio - * Locked flag, but metadata is different: - * - * - Metadata doesn't really lock the folio - * It's just to prevent page::private get cleared before the last - * end_page_read(). - */ - unsigned int locked_offset; +enum { + btrfs_bitmap_nr_uptodate = 0, + btrfs_bitmap_nr_dirty, + btrfs_bitmap_nr_writeback, + btrfs_bitmap_nr_ordered, + btrfs_bitmap_nr_checked, + btrfs_bitmap_nr_locked, + btrfs_bitmap_nr_max }; /* @@ -99,7 +83,6 @@ static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, } #endif -void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize); int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, enum btrfs_subpage_type type); void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio); From cdc9b3a7ab220574fc353ccc241a76c99ef601a1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 27 Aug 2024 03:40:11 +0200 Subject: [PATCH 940/991] btrfs: rename btrfs_submit_bio() to btrfs_submit_bbio() The function name is a bit misleading as it submits the btrfs_bio (bbio), rename it so we can use btrfs_submit_bio() when an actual bio is submitted. Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/bio.c | 10 +++++----- fs/btrfs/bio.h | 6 +++--- fs/btrfs/compression.c | 4 ++-- fs/btrfs/direct-io.c | 2 +- fs/btrfs/extent_io.c | 6 +++--- fs/btrfs/inode.c | 4 ++-- fs/btrfs/scrub.c | 10 +++++----- 7 files changed, 21 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index f6cb58d7f16a46..4f3e265880bfbb 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -53,7 +53,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, /* * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for - * btrfs, and is used for all I/O submitted through btrfs_submit_bio. + * btrfs, and is used for all I/O submitted through btrfs_submit_bbio(). * * Just like the underlying bio_alloc_bioset it will not fail as it is backed by * a mempool. @@ -211,7 +211,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, goto done; } - btrfs_submit_bio(repair_bbio, mirror); + btrfs_submit_bbio(repair_bbio, mirror); return; } @@ -280,7 +280,7 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, mirror = next_repair_mirror(fbio, failed_bbio->mirror_num); btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror); - btrfs_submit_bio(repair_bbio, mirror); + btrfs_submit_bbio(repair_bbio, mirror); return fbio; } @@ -777,7 +777,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) return true; } -void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num) +void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num) { /* If bbio->inode is not populated, its file_offset must be 0. */ ASSERT(bbio->inode || bbio->file_offset == 0); @@ -789,7 +789,7 @@ void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num) /* * Submit a repair write. * - * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a + * This bypasses btrfs_submit_bbio() deliberately, as that writes all copies in a * RAID setup. Here we only want to write the one bad copy, so we do the * mapping ourselves and submit the bio directly. * diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index d9dd5276093df0..e4861234074588 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -29,7 +29,7 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); /* * Highlevel btrfs I/O structure. It is allocated by btrfs_bio_alloc and - * passed to btrfs_submit_bio for mapping to the physical devices. + * passed to btrfs_submit_bbio() for mapping to the physical devices. */ struct btrfs_bio { /* @@ -42,7 +42,7 @@ struct btrfs_bio { union { /* * For data reads: checksumming and original I/O information. - * (for internal use in the btrfs_submit_bio machinery only) + * (for internal use in the btrfs_submit_bbio() machinery only) */ struct { u8 *csum; @@ -104,7 +104,7 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status); /* Submit using blkcg_punt_bio_submit. */ #define REQ_BTRFS_CGROUP_PUNT REQ_FS_PRIVATE -void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num); +void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num); void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace); int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, u64 length, u64 logical, struct folio *folio, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 832ab8984c41dd..39cd2ed1974bd3 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -395,7 +395,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, cb->bbio.ordered = ordered; btrfs_add_compressed_bio_folios(cb); - btrfs_submit_bio(&cb->bbio, 0); + btrfs_submit_bbio(&cb->bbio, 0); } /* @@ -630,7 +630,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) if (memstall) psi_memstall_leave(&pflags); - btrfs_submit_bio(&cb->bbio, 0); + btrfs_submit_bbio(&cb->bbio, 0); return; out_free_compressed_pages: diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index 364bce34f0346b..ea7f918b1c459c 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -726,7 +726,7 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, } } - btrfs_submit_bio(bbio, 0); + btrfs_submit_bbio(bbio, 0); } static const struct iomap_ops btrfs_dio_iomap_ops = { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 643dd948054f5e..8de6d226475d6e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -117,7 +117,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) btrfs_submit_compressed_read(bbio); else - btrfs_submit_bio(bbio, 0); + btrfs_submit_bbio(bbio, 0); /* The bbio is owned by the end_io handler now */ bio_ctrl->bbio = NULL; @@ -1800,7 +1800,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, folio_unlock(folio); } } - btrfs_submit_bio(bbio, 0); + btrfs_submit_bbio(bbio, 0); } /* @@ -3572,7 +3572,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, ASSERT(ret); } } - btrfs_submit_bio(bbio, mirror_num); + btrfs_submit_bbio(bbio, mirror_num); done: if (wait == WAIT_COMPLETE) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d25aeb844a642b..27aa67f135ff62 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9152,7 +9152,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { atomic_inc(&priv.pending); - btrfs_submit_bio(bbio, 0); + btrfs_submit_bbio(bbio, 0); bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, btrfs_encoded_read_endio, &priv); @@ -9167,7 +9167,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, } while (disk_io_size); atomic_inc(&priv.pending); - btrfs_submit_bio(bbio, 0); + btrfs_submit_bbio(bbio, 0); if (atomic_dec_return(&priv.pending)) io_wait_event(priv.wait, !atomic_read(&priv.pending)); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index b3afa63658231a..3a34274280746c 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -838,7 +838,7 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, bbio->bio.bi_iter.bi_size >= blocksize)) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); - btrfs_submit_bio(bbio, mirror); + btrfs_submit_bbio(bbio, mirror); if (wait) wait_scrub_stripe_io(stripe); bbio = NULL; @@ -857,7 +857,7 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, if (bbio) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); - btrfs_submit_bio(bbio, mirror); + btrfs_submit_bbio(bbio, mirror); if (wait) wait_scrub_stripe_io(stripe); } @@ -1683,7 +1683,7 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, bbio->bio.bi_iter.bi_size >= stripe_len)) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); - btrfs_submit_bio(bbio, mirror); + btrfs_submit_bbio(bbio, mirror); bbio = NULL; } @@ -1720,7 +1720,7 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, if (bbio) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); - btrfs_submit_bio(bbio, mirror); + btrfs_submit_bbio(bbio, mirror); } if (atomic_dec_and_test(&stripe->pending_io)) { @@ -1776,7 +1776,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, mirror = calc_next_mirror(mirror, num_copies); } - btrfs_submit_bio(bbio, mirror); + btrfs_submit_bbio(bbio, mirror); } static bool stripe_has_metadata_error(struct scrub_stripe *stripe) From 0898b6489405328c7719c3b071201ef769f3e348 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 27 Aug 2024 03:41:45 +0200 Subject: [PATCH 941/991] btrfs: rename __btrfs_submit_bio() and drop double underscores Previous patch freed the function name btrfs_submit_bio() so we can use it for a helper that submits struct bio. Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/bio.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 4f3e265880bfbb..d5dcc356df3345 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -502,8 +502,8 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); } -static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, - struct btrfs_io_stripe *smap, int mirror_num) +static void btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, + struct btrfs_io_stripe *smap, int mirror_num) { if (!bioc) { /* Single mirror read/write fast path. */ @@ -603,7 +603,7 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free) * context. This changes nothing when cgroups aren't in use. */ bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT; - __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); + btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); } static bool should_async_write(struct btrfs_bio *bbio) @@ -752,7 +752,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) } } - __btrfs_submit_bio(bio, bioc, &smap, mirror_num); + btrfs_submit_bio(bio, bioc, &smap, mirror_num); done: return map_length == length; @@ -878,7 +878,7 @@ void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_ ASSERT(smap.dev == fs_info->dev_replace.srcdev); smap.dev = fs_info->dev_replace.tgtdev; } - __btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num); + btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num); return; fail: From ffd7ff09ab1275fe6d3bd8b746feb7b5e3eda04f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 27 Aug 2024 03:30:16 +0200 Subject: [PATCH 942/991] btrfs: rename __extent_writepage() and drop double underscores The function does not follow the pattern where the underscores would be justified, so rename it. Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 28 ++++++++++++++-------------- fs/btrfs/inode.c | 2 +- fs/btrfs/subpage.c | 4 ++-- include/trace/events/btrfs.h | 2 +- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8de6d226475d6e..f7a388529c171b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1177,7 +1177,7 @@ int btrfs_read_folio(struct file *file, struct folio *folio) } /* - * helper for __extent_writepage, doing all of the delayed allocation setup. + * helper for extent_writepage(), doing all of the delayed allocation setup. * * This returns 1 if btrfs_run_delalloc_range function did all the work required * to write the page (copy into inline extent). In this case the IO has @@ -1398,18 +1398,18 @@ static int submit_one_sector(struct btrfs_inode *inode, } /* - * helper for __extent_writepage. This calls the writepage start hooks, + * Helper for extent_writepage(). This calls the writepage start hooks, * and does the loop to map the page into extents and bios. * * We return 1 if the IO is started and the page is unlocked, * 0 if all went well (page still locked) * < 0 if there were errors (page still locked) */ -static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, - struct folio *folio, - u64 start, u32 len, - struct btrfs_bio_ctrl *bio_ctrl, - loff_t i_size) +static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, + struct folio *folio, + u64 start, u32 len, + struct btrfs_bio_ctrl *bio_ctrl, + loff_t i_size) { struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long range_bitmap = 0; @@ -1500,7 +1500,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * Return 0 if everything goes well. * Return <0 for error. */ -static int __extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl) +static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl) { struct inode *inode = folio->mapping->host; const u64 page_start = folio_pos(folio); @@ -1509,7 +1509,7 @@ static int __extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ct loff_t i_size = i_size_read(inode); unsigned long end_index = i_size >> PAGE_SHIFT; - trace___extent_writepage(folio, inode, bio_ctrl->wbc); + trace_extent_writepage(folio, inode, bio_ctrl->wbc); WARN_ON(!folio_test_locked(folio)); @@ -1534,8 +1534,8 @@ static int __extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ct if (ret) goto done; - ret = __extent_writepage_io(BTRFS_I(inode), folio, folio_pos(folio), - PAGE_SIZE, bio_ctrl, i_size); + ret = extent_writepage_io(BTRFS_I(inode), folio, folio_pos(folio), + PAGE_SIZE, bio_ctrl, i_size); if (ret == 1) return 0; @@ -2202,7 +2202,7 @@ static int extent_write_cache_pages(struct address_space *mapping, continue; } - ret = __extent_writepage(folio, bio_ctrl); + ret = extent_writepage(folio, bio_ctrl); if (ret < 0) { done = 1; break; @@ -2293,8 +2293,8 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f if (pages_dirty && folio != locked_folio) ASSERT(folio_test_dirty(folio)); - ret = __extent_writepage_io(BTRFS_I(inode), folio, cur, cur_len, - &bio_ctrl, i_size); + ret = extent_writepage_io(BTRFS_I(inode), folio, cur, cur_len, + &bio_ctrl, i_size); if (ret == 1) goto next_page; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 27aa67f135ff62..efe75b03d5f113 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -747,7 +747,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, /* * In the successful case (ret == 0 here), cow_file_range will return 1. * - * Quite a bit further up the callstack in __extent_writepage, ret == 1 + * Quite a bit further up the callstack in extent_writepage(), ret == 1 * is treated as a short circuited success and does not unlock the folio, * so we must do it here. * diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 1dda17b5ab12c0..ca7d2aedfa8d9d 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -705,7 +705,7 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, * - Page locked by plain lock_page() * It should not have any subpage::writers count. * Can be unlocked by unlock_page(). - * This is the most common locked page for __extent_writepage() called + * This is the most common locked page for extent_writepage() called * inside extent_write_cache_pages(). * Rarer cases include the @locked_page from extent_write_locked_range(). * @@ -829,7 +829,7 @@ bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info, * Unlike btrfs_folio_end_writer_lock() which unlocks a specified subpage range, * this ends all writer locked ranges of a page. * - * This is for the locked page of __extent_writepage(), as the locked page + * This is for the locked page of extent_writepage(), as the locked page * can contain several locked subpage ranges. */ void btrfs_folio_end_all_writers(const struct btrfs_fs_info *fs_info, struct folio *folio) diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 0eddbb8b67283c..e4add61e00f14d 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -721,7 +721,7 @@ DECLARE_EVENT_CLASS(btrfs__writepage, __entry->writeback_index) ); -DEFINE_EVENT(btrfs__writepage, __extent_writepage, +DEFINE_EVENT(btrfs__writepage, extent_writepage, TP_PROTO(const struct folio *folio, const struct inode *inode, const struct writeback_control *wbc), From a55a176f7a42ec543be714ce0cf206e534b7693a Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 27 Aug 2024 03:44:15 +0200 Subject: [PATCH 943/991] btrfs: rename __compare_inode_defrag() and drop double underscores The function does not follow the pattern where the underscores would be justified, so rename it. Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index f6dbda37a3615a..e2949f6305845e 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -45,7 +45,7 @@ struct inode_defrag { u32 extent_thresh; }; -static int __compare_inode_defrag(struct inode_defrag *defrag1, +static int compare_inode_defrag(struct inode_defrag *defrag1, struct inode_defrag *defrag2) { if (defrag1->root > defrag2->root) @@ -83,7 +83,7 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, parent = *p; entry = rb_entry(parent, struct inode_defrag, rb_node); - ret = __compare_inode_defrag(defrag, entry); + ret = compare_inode_defrag(defrag, entry); if (ret < 0) p = &parent->rb_left; else if (ret > 0) @@ -189,7 +189,7 @@ static struct inode_defrag *btrfs_pick_defrag_inode( parent = p; entry = rb_entry(parent, struct inode_defrag, rb_node); - ret = __compare_inode_defrag(&tmp, entry); + ret = compare_inode_defrag(&tmp, entry); if (ret < 0) p = parent->rb_left; else if (ret > 0) @@ -198,7 +198,7 @@ static struct inode_defrag *btrfs_pick_defrag_inode( goto out; } - if (parent && __compare_inode_defrag(&tmp, entry) > 0) { + if (parent && compare_inode_defrag(&tmp, entry) > 0) { parent = rb_next(parent); if (parent) entry = rb_entry(parent, struct inode_defrag, rb_node); From 2747c2fee133a164cd55e30efa16eec1085b223e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 27 Aug 2024 03:44:30 +0200 Subject: [PATCH 944/991] btrfs: constify arguments of compare_inode_defrag() A comparator function does not change its parameters, make them const. Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index e2949f6305845e..e4bb5a8651f32e 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -45,8 +45,8 @@ struct inode_defrag { u32 extent_thresh; }; -static int compare_inode_defrag(struct inode_defrag *defrag1, - struct inode_defrag *defrag2) +static int compare_inode_defrag(const struct inode_defrag *defrag1, + const struct inode_defrag *defrag2) { if (defrag1->root > defrag2->root) return 1; From c60704a49f8f842fa05edcb57632ebe0a776deb0 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 27 Aug 2024 03:45:38 +0200 Subject: [PATCH 945/991] btrfs: rename __need_auto_defrag() and drop double underscores The function does not follow the pattern where the underscores would be justified, so rename it. Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index e4bb5a8651f32e..b735fce21f07e6 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -107,7 +107,7 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, return 0; } -static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) +static inline int need_auto_defrag(struct btrfs_fs_info *fs_info) { if (!btrfs_test_opt(fs_info, AUTO_DEFRAG)) return 0; @@ -130,7 +130,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, u64 transid; int ret; - if (!__need_auto_defrag(fs_info)) + if (!need_auto_defrag(fs_info)) return 0; if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) @@ -245,7 +245,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, again: if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) goto cleanup; - if (!__need_auto_defrag(fs_info)) + if (!need_auto_defrag(fs_info)) goto cleanup; /* Get the inode */ @@ -306,7 +306,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) break; - if (!__need_auto_defrag(fs_info)) + if (!need_auto_defrag(fs_info)) break; /* find an inode to defrag */ From 51a1f93c4f902f018655f3e6de8c93f33f26ce62 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 27 Aug 2024 03:50:43 +0200 Subject: [PATCH 946/991] btrfs: rename __btrfs_add_inode_defrag() and drop double underscores The function does not follow the pattern where the underscores would be justified, so rename it. Also update the misleading comment, the passed item is not freed, that's what the caller does. Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index b735fce21f07e6..5258dd86dbd8f7 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -61,16 +61,14 @@ static int compare_inode_defrag(const struct inode_defrag *defrag1, } /* - * Pop a record for an inode into the defrag tree. The lock must be held + * Insert a record for an inode into the defrag tree. The lock must be held * already. * * If you're inserting a record for an older transid than an existing record, * the transid already in the tree is lowered. - * - * If an existing record is found the defrag item you pass in is freed. */ -static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, - struct inode_defrag *defrag) +static int btrfs_insert_inode_defrag(struct btrfs_inode *inode, + struct inode_defrag *defrag) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct inode_defrag *entry; @@ -157,7 +155,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, * and then re-read this inode, this new inode doesn't have * IN_DEFRAG flag. At the case, we may find the existed defrag. */ - ret = __btrfs_add_inode_defrag(inode, defrag); + ret = btrfs_insert_inode_defrag(inode, defrag); if (ret) kmem_cache_free(btrfs_inode_defrag_cachep, defrag); } else { From 6d7fc03dd6450d23e7d0608ad57833e74c00b100 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 27 Aug 2024 03:51:38 +0200 Subject: [PATCH 947/991] btrfs: rename __btrfs_run_defrag_inode() and drop double underscores The function does not follow the pattern where the underscores would be justified, so rename it. Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 5258dd86dbd8f7..41d67065d02bd0 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -231,8 +231,8 @@ void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) #define BTRFS_DEFRAG_BATCH 1024 -static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, - struct inode_defrag *defrag) +static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, + struct inode_defrag *defrag) { struct btrfs_root *inode_root; struct inode *inode; @@ -322,7 +322,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) first_ino = defrag->ino + 1; root_objectid = defrag->root; - __btrfs_run_defrag_inode(fs_info, defrag); + btrfs_run_defrag_inode(fs_info, defrag); } atomic_dec(&fs_info->defrag_running); From 215d7a6cd722ed08e83ac5ef01dcece8a9e2276b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 27 Aug 2024 04:05:48 +0200 Subject: [PATCH 948/991] btrfs: clear defragmented inodes using postorder in btrfs_cleanup_defrag_inodes() btrfs_cleanup_defrag_inodes() is not called frequently, only in remount or unmount, but the way it frees the inodes in fs_info->defrag_inodes is inefficient. Each time it needs to locate first node, remove it, potentially rebalance tree until it's done. This allows to do a conditional reschedule. For cleanups the rbtree_postorder_for_each_entry_safe() iterator is convenient but we can't reschedule and restart iteration because some of the tree nodes would be already freed. The cleanup operation is kmem_cache_free() which will likely take the fast path for most objects so rescheduling should not be necessary. Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 41d67065d02bd0..89f51252d25cf6 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -212,20 +212,14 @@ static struct inode_defrag *btrfs_pick_defrag_inode( void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) { - struct inode_defrag *defrag; - struct rb_node *node; + struct inode_defrag *defrag, *next; spin_lock(&fs_info->defrag_inodes_lock); - node = rb_first(&fs_info->defrag_inodes); - while (node) { - rb_erase(node, &fs_info->defrag_inodes); - defrag = rb_entry(node, struct inode_defrag, rb_node); - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - cond_resched_lock(&fs_info->defrag_inodes_lock); + rbtree_postorder_for_each_entry_safe(defrag, next, + &fs_info->defrag_inodes, rb_node) + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - node = rb_first(&fs_info->defrag_inodes); - } spin_unlock(&fs_info->defrag_inodes_lock); } From 8717805e225a8964ae27f19c78b66ca11f75b94c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 27 Aug 2024 04:10:11 +0200 Subject: [PATCH 949/991] btrfs: return void from btrfs_add_inode_defrag() The potential memory allocation failure is not a fatal error, skipping autodefrag is fine and the caller inode_should_defrag() does not care about the errors. Further writes can attempt to add the inode back to the defragmentation list again. Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 14 +++++++------- fs/btrfs/defrag.h | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 89f51252d25cf6..6af593a0313d4b 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -117,10 +117,11 @@ static inline int need_auto_defrag(struct btrfs_fs_info *fs_info) } /* - * Insert a defrag record for this inode if auto defrag is enabled. + * Insert a defrag record for this inode if auto defrag is enabled. No errors + * returned as they're not considered fatal. */ -int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u32 extent_thresh) +void btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, u32 extent_thresh) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -129,10 +130,10 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, int ret; if (!need_auto_defrag(fs_info)) - return 0; + return; if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) - return 0; + return; if (trans) transid = trans->transid; @@ -141,7 +142,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); if (!defrag) - return -ENOMEM; + return; defrag->ino = btrfs_ino(inode); defrag->transid = transid; @@ -162,7 +163,6 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_inode_defrag_cachep, defrag); } spin_unlock(&fs_info->defrag_inodes_lock); - return 0; } /* diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h index 878528e086fbe8..97f36ab3f24dfb 100644 --- a/fs/btrfs/defrag.h +++ b/fs/btrfs/defrag.h @@ -18,8 +18,8 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, u64 newer_than, unsigned long max_to_defrag); int __init btrfs_auto_defrag_init(void); void __cold btrfs_auto_defrag_exit(void); -int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u32 extent_thresh); +void btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, u32 extent_thresh); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_defrag_root(struct btrfs_root *root); From 5937c0fe6bbe9378b22fa0da88e421beff8a1989 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 27 Aug 2024 04:13:44 +0200 Subject: [PATCH 950/991] btrfs: drop transaction parameter from btrfs_add_inode_defrag() There's only one caller inode_should_defrag() that passes NULL to btrfs_add_inode_defrag() so we can drop it an simplify the code. Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 11 ++--------- fs/btrfs/defrag.h | 3 +-- fs/btrfs/inode.c | 2 +- 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 6af593a0313d4b..5b6bf0a59b2347 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -120,13 +120,11 @@ static inline int need_auto_defrag(struct btrfs_fs_info *fs_info) * Insert a defrag record for this inode if auto defrag is enabled. No errors * returned as they're not considered fatal. */ -void btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u32 extent_thresh) +void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct inode_defrag *defrag; - u64 transid; int ret; if (!need_auto_defrag(fs_info)) @@ -135,17 +133,12 @@ void btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) return; - if (trans) - transid = trans->transid; - else - transid = btrfs_get_root_last_trans(root); - defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); if (!defrag) return; defrag->ino = btrfs_ino(inode); - defrag->transid = transid; + defrag->transid = btrfs_get_root_last_trans(root); defrag->root = btrfs_root_id(root); defrag->extent_thresh = extent_thresh; diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h index 97f36ab3f24dfb..6b7596c4f0dc12 100644 --- a/fs/btrfs/defrag.h +++ b/fs/btrfs/defrag.h @@ -18,8 +18,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, u64 newer_than, unsigned long max_to_defrag); int __init btrfs_auto_defrag_init(void); void __cold btrfs_auto_defrag_exit(void); -void btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u32 extent_thresh); +void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_defrag_root(struct btrfs_root *root); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index efe75b03d5f113..e96b63d7e8fd38 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -885,7 +885,7 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, /* If this is a small write inside eof, kick off a defrag */ if (num_bytes < small_write && (start > 0 || end + 1 < inode->disk_i_size)) - btrfs_add_inode_defrag(NULL, inode, small_write); + btrfs_add_inode_defrag(inode, small_write); } static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) From 98a7d7ccb13897a17fd4aeb4cd9a867e2ee43747 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 27 Aug 2024 04:26:51 +0200 Subject: [PATCH 951/991] btrfs: always pass readahead state to defrag Defrag ioctl passes readahead from the file, but autodefrag does not have a file so the readahead state is allocated when needed. The autodefrag loop in cleaner thread iterates over inodes so we can simply provide an on-stack readahead state and will not need to allocate it in btrfs_defrag_file(). The size is 32 bytes which is acceptable. Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 32 +++++++++++--------------------- 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 5b6bf0a59b2347..acf1f39e45d0bd 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -219,7 +219,8 @@ void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) #define BTRFS_DEFRAG_BATCH 1024 static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, - struct inode_defrag *defrag) + struct inode_defrag *defrag, + struct file_ra_state *ra) { struct btrfs_root *inode_root; struct inode *inode; @@ -258,9 +259,10 @@ static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, range.len = (u64)-1; range.start = cur; range.extent_thresh = defrag->extent_thresh; + file_ra_state_init(ra, inode->i_mapping); sb_start_write(fs_info->sb); - ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid, + ret = btrfs_defrag_file(inode, ra, &range, defrag->transid, BTRFS_DEFRAG_BATCH); sb_end_write(fs_info->sb); iput(inode); @@ -287,6 +289,8 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) atomic_inc(&fs_info->defrag_running); while (1) { + struct file_ra_state ra = { 0 }; + /* Pause the auto defragger. */ if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) break; @@ -309,7 +313,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) first_ino = defrag->ino + 1; root_objectid = defrag->root; - btrfs_run_defrag_inode(fs_info, defrag); + btrfs_run_defrag_inode(fs_info, defrag, &ra); } atomic_dec(&fs_info->defrag_running); @@ -1302,8 +1306,7 @@ static int defrag_one_cluster(struct btrfs_inode *inode, if (entry->start + range_len <= *last_scanned_ret) continue; - if (ra) - page_cache_sync_readahead(inode->vfs_inode.i_mapping, + page_cache_sync_readahead(inode->vfs_inode.i_mapping, ra, NULL, entry->start >> PAGE_SHIFT, ((entry->start + range_len - 1) >> PAGE_SHIFT) - (entry->start >> PAGE_SHIFT) + 1); @@ -1335,7 +1338,7 @@ static int defrag_one_cluster(struct btrfs_inode *inode, * Entry point to file defragmentation. * * @inode: inode to be defragged - * @ra: readahead state (can be NUL) + * @ra: readahead state * @range: defrag options including range and flags * @newer_than: minimum transid to defrag * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode @@ -1357,12 +1360,13 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, u64 cur; u64 last_byte; bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS); - bool ra_allocated = false; int compress_type = BTRFS_COMPRESS_ZLIB; int ret = 0; u32 extent_thresh = range->extent_thresh; pgoff_t start_index; + ASSERT(ra); + if (isize == 0) return 0; @@ -1391,18 +1395,6 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, cur = round_down(range->start, fs_info->sectorsize); last_byte = round_up(last_byte, fs_info->sectorsize) - 1; - /* - * If we were not given a ra, allocate a readahead context. As - * readahead is just an optimization, defrag will work without it so - * we don't error out. - */ - if (!ra) { - ra_allocated = true; - ra = kzalloc(sizeof(*ra), GFP_KERNEL); - if (ra) - file_ra_state_init(ra, inode->i_mapping); - } - /* * Make writeback start from the beginning of the range, so that the * defrag range can be written sequentially. @@ -1457,8 +1449,6 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, cond_resched(); } - if (ra_allocated) - kfree(ra); /* * Update range.start for autodefrag, this will indicate where to start * in next run. From d2811f4d7d128dd47ad3a87770a89c981da1f372 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 19 Aug 2024 14:15:52 -0400 Subject: [PATCH 952/991] btrfs: introduce EXTENT_DIO_LOCKED In order to support dropping the extent lock during a read we need a way to make sure that direct reads and direct writes for overlapping ranges are protected from each other. To accomplish this introduce another lock bit specifically for direct io. Subsequent patches will utilize this to protect direct IO operations. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 55 +++++++++++++++++---------------------- fs/btrfs/extent-io-tree.h | 38 ++++++++++++++++++++++++--- 2 files changed, 58 insertions(+), 35 deletions(-) diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index c54c5d7a5cd538..6d08c100b01de4 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -126,7 +126,7 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info, * Empty an io tree, removing and freeing every extent state record from the * tree. This should be called once we are sure no other task can access the * tree anymore, so no tree updates happen after we empty the tree and there - * aren't any waiters on any extent state record (EXTENT_LOCKED bit is never + * aren't any waiters on any extent state record (EXTENT_LOCK_BITS are never * set on any extent state when calling this function). */ void extent_io_tree_release(struct extent_io_tree *tree) @@ -141,7 +141,7 @@ void extent_io_tree_release(struct extent_io_tree *tree) rbtree_postorder_for_each_entry_safe(state, tmp, &root, rb_node) { /* Clear node to keep free_extent_state() happy. */ RB_CLEAR_NODE(&state->rb_node); - ASSERT(!(state->state & EXTENT_LOCKED)); + ASSERT(!(state->state & EXTENT_LOCK_BITS)); /* * No need for a memory barrier here, as we are holding the tree * lock and we only change the waitqueue while holding that lock @@ -399,7 +399,7 @@ static void merge_next_state(struct extent_io_tree *tree, struct extent_state *s */ static void merge_state(struct extent_io_tree *tree, struct extent_state *state) { - if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY)) + if (state->state & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY)) return; merge_prev_state(tree, state); @@ -445,7 +445,7 @@ static struct extent_state *insert_state(struct extent_io_tree *tree, struct rb_node *parent = NULL; const u64 start = state->start - 1; const u64 end = state->end + 1; - const bool try_merge = !(bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)); + const bool try_merge = !(bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY)); set_state_bits(tree, state, bits, changeset); @@ -616,9 +616,6 @@ static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask) * inserting elements in the tree, so the gfp mask is used to indicate which * allocations or sleeping are allowed. * - * Pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove the given - * range from the tree regardless of state (ie for truncate). - * * The range [start, end] is inclusive. * * This takes the tree lock, and returns 0 on success and < 0 on error. @@ -647,8 +644,8 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (bits & EXTENT_DELALLOC) bits |= EXTENT_NORESERVE; - wake = (bits & EXTENT_LOCKED) ? 1 : 0; - if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)) + wake = ((bits & EXTENT_LOCK_BITS) ? 1 : 0); + if (bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY)) clear = 1; again: if (!prealloc) { @@ -861,8 +858,7 @@ static void cache_state_if_flags(struct extent_state *state, static void cache_state(struct extent_state *state, struct extent_state **cached_ptr) { - return cache_state_if_flags(state, cached_ptr, - EXTENT_LOCKED | EXTENT_BOUNDARY); + return cache_state_if_flags(state, cached_ptr, EXTENT_LOCK_BITS | EXTENT_BOUNDARY); } /* @@ -1063,7 +1059,7 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int ret = 0; u64 last_start; u64 last_end; - u32 exclusive_bits = (bits & EXTENT_LOCKED); + u32 exclusive_bits = (bits & EXTENT_LOCK_BITS); gfp_t mask; set_gfp_mask_from_bits(&bits, &mask); @@ -1812,12 +1808,11 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset) { /* - * We don't support EXTENT_LOCKED yet, as current changeset will - * record any bits changed, so for EXTENT_LOCKED case, it will - * either fail with -EEXIST or changeset will record the whole - * range. + * We don't support EXTENT_LOCK_BITS yet, as current changeset will + * record any bits changed, so for EXTENT_LOCK_BITS case, it will either + * fail with -EEXIST or changeset will record the whole range. */ - ASSERT(!(bits & EXTENT_LOCKED)); + ASSERT(!(bits & EXTENT_LOCK_BITS)); return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset); } @@ -1826,26 +1821,25 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset) { /* - * Don't support EXTENT_LOCKED case, same reason as + * Don't support EXTENT_LOCK_BITS case, same reason as * set_record_extent_bits(). */ - ASSERT(!(bits & EXTENT_LOCKED)); + ASSERT(!(bits & EXTENT_LOCK_BITS)); return __clear_extent_bit(tree, start, end, bits, NULL, changeset); } -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached) +bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached) { int err; u64 failed_start; - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, + err = __set_extent_bit(tree, start, end, bits, &failed_start, NULL, cached, NULL); if (err == -EEXIST) { if (failed_start > start) - clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, cached); + clear_extent_bit(tree, start, failed_start - 1, bits, cached); return 0; } return 1; @@ -1855,23 +1849,22 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, * Either insert or lock state struct between start and end use mask to tell * us if waiting is desired. */ -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state) +int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached_state) { struct extent_state *failed_state = NULL; int err; u64 failed_start; - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, + err = __set_extent_bit(tree, start, end, bits, &failed_start, &failed_state, cached_state, NULL); while (err == -EEXIST) { if (failed_start != start) clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, cached_state); + bits, cached_state); - wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED, - &failed_state); - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, + wait_extent_bit(tree, failed_start, end, bits, &failed_state); + err = __set_extent_bit(tree, start, end, bits, &failed_start, &failed_state, cached_state, NULL); } diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 9d3a52d8f59a80..6ffef1cd37c1bf 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -19,6 +19,7 @@ enum { ENUM_BIT(EXTENT_DIRTY), ENUM_BIT(EXTENT_UPTODATE), ENUM_BIT(EXTENT_LOCKED), + ENUM_BIT(EXTENT_DIO_LOCKED), ENUM_BIT(EXTENT_NEW), ENUM_BIT(EXTENT_DELALLOC), ENUM_BIT(EXTENT_DEFRAG), @@ -67,6 +68,8 @@ enum { EXTENT_ADD_INODE_BYTES | \ EXTENT_CLEAR_ALL_BITS) +#define EXTENT_LOCK_BITS (EXTENT_LOCKED | EXTENT_DIO_LOCKED) + /* * Redefined bits above which are used only in the device allocation tree, * shouldn't be using EXTENT_LOCKED / EXTENT_BOUNDARY / EXTENT_CLEAR_META_RESV @@ -134,12 +137,22 @@ const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tre void extent_io_tree_init(struct btrfs_fs_info *fs_info, struct extent_io_tree *tree, unsigned int owner); void extent_io_tree_release(struct extent_io_tree *tree); +int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached); +bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached); -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached); +static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached) +{ + return __lock_extent(tree, start, end, EXTENT_LOCKED, cached); +} -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached); +static inline bool try_lock_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) +{ + return __try_lock_extent(tree, start, end, EXTENT_LOCKED, cached); +} int __init extent_state_init_cachep(void); void __cold extent_state_free_cachep(void); @@ -212,5 +225,22 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state); +static inline int lock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) +{ + return __lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached); +} + +static inline bool try_lock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) +{ + return __try_lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached); +} + +static inline int unlock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) +{ + return __clear_extent_bit(tree, start, end, EXTENT_DIO_LOCKED, cached, NULL); +} #endif /* BTRFS_EXTENT_IO_TREE_H */ From ef176288ac18f7754b08bbe0fa63c3b548342a43 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 19 Aug 2024 16:50:01 -0400 Subject: [PATCH 953/991] btrfs: take the dio extent lock during O_DIRECT operations Currently we hold the extent lock for the entire duration of a read. This isn't really necessary in the buffered case, we're protected by the page lock, however it's necessary for O_DIRECT. For O_DIRECT reads, if we only locked the extent for the part where we get the extent, we could potentially race with an O_DIRECT write in the same region. This isn't really a problem, unless the read is delayed so much that the write does the COW, unpins the old extent, and some other application re-allocates the extent before the read is actually able to be submitted. At that point at best we'd have a checksum mismatch, but at worse we could read data that doesn't belong to us. To address this potential race we need to make sure we don't have overlapping, concurrent direct io reads and writes. To accomplish this use the new EXTENT_DIO_LOCKED bit in the direct IO case in the same spot as the current extent lock. The writes will take this while they're creating the ordered extent, which is also used to make sure concurrent buffered reads or concurrent direct reads are not allowed to occur, and drop it after the ordered extent is taken. For reads it will act as the current read behavior for the EXTENT_LOCKED bit, we set it when we're starting the read, we clear it in the end_io to allow other direct writes to continue. This still has the drawback of disallowing concurrent overlapping direct reads from occurring, but that exists with the current extent locking. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/direct-io.c | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index ea7f918b1c459c..4a5f9b2632f216 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -40,11 +40,21 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, struct btrfs_ordered_extent *ordered; int ret = 0; + /* Direct lock must be taken before the extent lock. */ + if (nowait) { + if (!try_lock_dio_extent(io_tree, lockstart, lockend, cached_state)) + return -EAGAIN; + } else { + lock_dio_extent(io_tree, lockstart, lockend, cached_state); + } + while (1) { if (nowait) { if (!try_lock_extent(io_tree, lockstart, lockend, - cached_state)) - return -EAGAIN; + cached_state)) { + ret = -EAGAIN; + break; + } } else { lock_extent(io_tree, lockstart, lockend, cached_state); } @@ -120,6 +130,8 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, cond_resched(); } + if (ret) + unlock_dio_extent(io_tree, lockstart, lockend, cached_state); return ret; } @@ -546,8 +558,9 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, } if (unlock_extents) - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + EXTENT_LOCKED | EXTENT_DIO_LOCKED, + &cached_state); else free_extent_state(cached_state); @@ -572,8 +585,13 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, return 0; unlock_err: - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); + /* + * Don't use EXTENT_LOCK_BITS here in case we extend it later and forget + * to update this, be explicit that we expect EXTENT_LOCKED and + * EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing. + */ + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state); err: if (dio_data->data_space_reserved) { btrfs_free_reserved_data_space(BTRFS_I(inode), @@ -596,8 +614,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, if (!write && (iomap->type == IOMAP_HOLE)) { /* If reading from a hole, unlock and return */ - unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1, - NULL); + clear_extent_bit(&BTRFS_I(inode)->io_tree, pos, pos + length - 1, + EXTENT_LOCKED | EXTENT_DIO_LOCKED, NULL); return 0; } @@ -608,8 +626,9 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, btrfs_finish_ordered_extent(dio_data->ordered, NULL, pos, length, false); else - unlock_extent(&BTRFS_I(inode)->io_tree, pos, - pos + length - 1, NULL); + clear_extent_bit(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1, + EXTENT_LOCKED | EXTENT_DIO_LOCKED, NULL); ret = -ENOTBLK; } if (write) { @@ -641,8 +660,9 @@ static void btrfs_dio_end_io(struct btrfs_bio *bbio) dip->file_offset, dip->bytes, !bio->bi_status); } else { - unlock_extent(&inode->io_tree, dip->file_offset, - dip->file_offset + dip->bytes - 1, NULL); + clear_extent_bit(&inode->io_tree, dip->file_offset, + dip->file_offset + dip->bytes - 1, + EXTENT_LOCKED | EXTENT_DIO_LOCKED, NULL); } bbio->bio.bi_private = bbio->private; From cd23d5e3ee9bb2ef5459b067b00cbf5aa0b52880 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Aug 2024 15:16:24 -0400 Subject: [PATCH 954/991] btrfs: do not hold the extent lock for entire read Historically we've held the extent lock throughout the entire read. There's been a few reasons for this, but it's mostly just caused us problems. For example, this prevents us from allowing page faults during direct io reads, because we could deadlock. This has forced us to only allow 4k reads at a time for io_uring NOWAIT requests because we have no idea if we'll be forced to page fault and thus have to do a whole lot of work. On the buffered side we are protected by the page lock, as long as we're reading things like buffered writes, punch hole, and even direct IO to a certain degree will get hung up on the page lock while the page is in flight. On the direct side we have the dio extent lock, which acts much like the way the extent lock worked previously to this patch, however just for direct reads. This protects direct reads from concurrent direct writes, while we're protected from buffered writes via the inode lock. Now that we're protected in all cases, narrow the extent lock to the part where we're getting the extent map to submit the reads, no longer holding the extent lock for the entire read operation. Push the extent lock down into do_readpage() so that we're only grabbing it when looking up the extent map. This portion was contributed by Goldwyn. Co-developed-by: Goldwyn Rodrigues Reviewed-by: Goldwyn Rodrigues Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/compression.c | 2 +- fs/btrfs/direct-io.c | 49 +++++++++++----------- fs/btrfs/extent_io.c | 94 ++---------------------------------------- 3 files changed, 29 insertions(+), 116 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 39cd2ed1974bd3..52952745d44a43 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -521,6 +521,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, } add_size = min(em->start + em->len, page_end + 1) - cur; free_extent_map(em); + unlock_extent(tree, cur, page_end, NULL); if (folio->index == end_index) { size_t zero_offset = offset_in_folio(folio, isize); @@ -534,7 +535,6 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (!bio_add_folio(orig_bio, folio, add_size, offset_in_folio(folio, cur))) { - unlock_extent(tree, cur, page_end, NULL); folio_unlock(folio); folio_put(folio); break; diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index 4a5f9b2632f216..bd38df5647e35b 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -365,7 +365,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, int ret = 0; u64 len = length; const u64 data_alloc_len = length; - bool unlock_extents = false; + u32 unlock_bits = EXTENT_LOCKED; /* * We could potentially fault if we have a buffer > PAGE_SIZE, and if @@ -526,7 +526,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, start, &len, flags); if (ret < 0) goto unlock_err; - unlock_extents = true; /* Recalc len in case the new em is smaller than requested */ len = min(len, em->len - (start - em->start)); if (dio_data->data_space_reserved) { @@ -547,23 +546,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, release_offset, release_len); } - } else { - /* - * We need to unlock only the end area that we aren't using. - * The rest is going to be unlocked by the endio routine. - */ - lockstart = start + len; - if (lockstart < lockend) - unlock_extents = true; } - if (unlock_extents) - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - EXTENT_LOCKED | EXTENT_DIO_LOCKED, - &cached_state); - else - free_extent_state(cached_state); - /* * Translate extent map information to iomap. * We trim the extents (and move the addr) even though iomap code does @@ -582,6 +566,23 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, iomap->length = len; free_extent_map(em); + /* + * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed, + * writes only hold it for this part. We hold the extent lock until + * we're completely done with the extent map to make sure it remains + * valid. + */ + if (write) + unlock_bits |= EXTENT_DIO_LOCKED; + + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + unlock_bits, &cached_state); + + /* We didn't use everything, unlock the dio extent for the remainder. */ + if (!write && (start + len) < lockend) + unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len, + lockend, NULL); + return 0; unlock_err: @@ -614,8 +615,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, if (!write && (iomap->type == IOMAP_HOLE)) { /* If reading from a hole, unlock and return */ - clear_extent_bit(&BTRFS_I(inode)->io_tree, pos, pos + length - 1, - EXTENT_LOCKED | EXTENT_DIO_LOCKED, NULL); + unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1, NULL); return 0; } @@ -626,9 +627,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, btrfs_finish_ordered_extent(dio_data->ordered, NULL, pos, length, false); else - clear_extent_bit(&BTRFS_I(inode)->io_tree, pos, - pos + length - 1, - EXTENT_LOCKED | EXTENT_DIO_LOCKED, NULL); + unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1, NULL); ret = -ENOTBLK; } if (write) { @@ -660,9 +660,8 @@ static void btrfs_dio_end_io(struct btrfs_bio *bbio) dip->file_offset, dip->bytes, !bio->bi_status); } else { - clear_extent_bit(&inode->io_tree, dip->file_offset, - dip->file_offset + dip->bytes - 1, - EXTENT_LOCKED | EXTENT_DIO_LOCKED, NULL); + unlock_dio_extent(&inode->io_tree, dip->file_offset, + dip->file_offset + dip->bytes - 1, NULL); } bbio->bio.bi_private = bbio->private; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f7a388529c171b..bd1a7b2fc71ad6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -480,75 +480,6 @@ static void end_bbio_data_write(struct btrfs_bio *bbio) bio_put(bio); } -/* - * Record previously processed extent range - * - * For endio_readpage_release_extent() to handle a full extent range, reducing - * the extent io operations. - */ -struct processed_extent { - struct btrfs_inode *inode; - /* Start of the range in @inode */ - u64 start; - /* End of the range in @inode */ - u64 end; - bool uptodate; -}; - -/* - * Try to release processed extent range - * - * May not release the extent range right now if the current range is - * contiguous to processed extent. - * - * Will release processed extent when any of @inode, @uptodate, the range is - * no longer contiguous to the processed range. - * - * Passing @inode == NULL will force processed extent to be released. - */ -static void endio_readpage_release_extent(struct processed_extent *processed, - struct btrfs_inode *inode, u64 start, u64 end, - bool uptodate) -{ - struct extent_state *cached = NULL; - struct extent_io_tree *tree; - - /* The first extent, initialize @processed */ - if (!processed->inode) - goto update; - - /* - * Contiguous to processed extent, just uptodate the end. - * - * Several things to notice: - * - * - bio can be merged as long as on-disk bytenr is contiguous - * This means we can have page belonging to other inodes, thus need to - * check if the inode still matches. - * - bvec can contain range beyond current page for multi-page bvec - * Thus we need to do processed->end + 1 >= start check - */ - if (processed->inode == inode && processed->uptodate == uptodate && - processed->end + 1 >= start && end >= processed->end) { - processed->end = end; - return; - } - - tree = &processed->inode->io_tree; - /* - * Now we don't have range contiguous to the processed range, release - * the processed range now. - */ - unlock_extent(tree, processed->start, processed->end, &cached); - -update: - /* Update processed to current range */ - processed->inode = inode; - processed->start = start; - processed->end = end; - processed->uptodate = uptodate; -} - static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) { ASSERT(folio_test_locked(folio)); @@ -575,7 +506,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) { struct btrfs_fs_info *fs_info = bbio->fs_info; struct bio *bio = &bbio->bio; - struct processed_extent processed = { 0 }; struct folio_iter fi; const u32 sectorsize = fs_info->sectorsize; @@ -640,11 +570,7 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) /* Update page status and unlock. */ end_folio_read(folio, uptodate, start, len); - endio_readpage_release_extent(&processed, BTRFS_I(inode), - start, end, uptodate); } - /* Release the last extent */ - endio_readpage_release_extent(&processed, NULL, 0, 0, false); bio_put(bio); } @@ -973,6 +899,7 @@ static struct extent_map *__get_extent_map(struct inode *inode, u64 len, struct extent_map **em_cached) { struct extent_map *em; + struct extent_state *cached_state = NULL; ASSERT(em_cached); @@ -988,12 +915,15 @@ static struct extent_map *__get_extent_map(struct inode *inode, *em_cached = NULL; } + btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), start, start + len - 1, &cached_state); em = btrfs_get_extent(BTRFS_I(inode), folio, start, len); if (!IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); *em_cached = em; } + unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state); + return em; } /* @@ -1019,11 +949,9 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, size_t pg_offset = 0; size_t iosize; size_t blocksize = fs_info->sectorsize; - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; ret = set_folio_extent_mapped(folio); if (ret < 0) { - unlock_extent(tree, start, end, NULL); folio_unlock(folio); return ret; } @@ -1047,14 +975,12 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, if (cur >= last_byte) { iosize = folio_size(folio) - pg_offset; folio_zero_range(folio, pg_offset, iosize); - unlock_extent(tree, cur, cur + iosize - 1, NULL); end_folio_read(folio, true, cur, iosize); break; } em = __get_extent_map(inode, folio, cur, end - cur + 1, em_cached); if (IS_ERR(em)) { - unlock_extent(tree, cur, end, NULL); end_folio_read(folio, false, cur, end + 1 - cur); return PTR_ERR(em); } @@ -1123,7 +1049,6 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, if (block_start == EXTENT_MAP_HOLE) { folio_zero_range(folio, pg_offset, iosize); - unlock_extent(tree, cur, cur + iosize - 1, NULL); end_folio_read(folio, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; @@ -1131,7 +1056,6 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, } /* the get_extent function already copied into the folio */ if (block_start == EXTENT_MAP_INLINE) { - unlock_extent(tree, cur, cur + iosize - 1, NULL); end_folio_read(folio, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; @@ -1156,15 +1080,10 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, int btrfs_read_folio(struct file *file, struct folio *folio) { - struct btrfs_inode *inode = folio_to_inode(folio); - u64 start = folio_pos(folio); - u64 end = start + folio_size(folio) - 1; struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; struct extent_map *em_cached = NULL; int ret; - btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); free_extent_map(em_cached); @@ -2337,15 +2256,10 @@ int btrfs_writepages(struct address_space *mapping, struct writeback_control *wb void btrfs_readahead(struct readahead_control *rac) { struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD }; - struct btrfs_inode *inode = BTRFS_I(rac->mapping->host); struct folio *folio; - u64 start = readahead_pos(rac); - u64 end = start + readahead_length(rac) - 1; struct extent_map *em_cached = NULL; u64 prev_em_start = (u64)-1; - btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - while ((folio = readahead_folio(rac)) != NULL) btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); From 317c2108c61fb986f30eb64311c220cdec2ecd08 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sat, 24 Aug 2024 19:36:43 +0930 Subject: [PATCH 955/991] btrfs: merge btrfs_orig_bbio_end_io() into btrfs_bio_end_io() There are only two differences between the two functions: - btrfs_orig_bbio_end_io() does extra error propagation This is mostly to allow tolerance for write errors. - btrfs_orig_bbio_end_io() does extra pending_ios check This check can handle both the original bio, or the cloned one. (All accounting happens in the original one). This makes btrfs_orig_bbio_end_io() a much safer call. In fact we already had a double freeing error due to usage of btrfs_bio_end_io() in the error path of btrfs_submit_chunk(). So just move the whole content of btrfs_orig_bbio_end_io() into btrfs_bio_end_io(). For normal paths this brings no change, because they are already calling btrfs_orig_bbio_end_io() in the first place. For error paths (not only inside bio.c but also external callers), this change will introduce extra checks, especially for external callers, as they will error out without submitting the btrfs bio. But considering it's already in the error path, such slower but much safer checks are still an overall win. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index d5dcc356df3345..ce13416bc10f03 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -120,12 +120,6 @@ static void __btrfs_bio_end_io(struct btrfs_bio *bbio) } } -void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) -{ - bbio->bio.bi_status = status; - __btrfs_bio_end_io(bbio); -} - static void btrfs_orig_write_end_io(struct bio *bio); static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, @@ -147,8 +141,9 @@ static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, } } -static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio) +void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { + bbio->bio.bi_status = status; if (bbio->bio.bi_pool == &btrfs_clone_bioset) { struct btrfs_bio *orig_bbio = bbio->private; @@ -179,7 +174,7 @@ static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) static void btrfs_repair_done(struct btrfs_failed_bio *fbio) { if (atomic_dec_and_test(&fbio->repair_count)) { - btrfs_orig_bbio_end_io(fbio->bbio); + btrfs_bio_end_io(fbio->bbio, fbio->bbio->bio.bi_status); mempool_free(fbio, &btrfs_failed_bio_pool); } } @@ -326,7 +321,7 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de if (fbio) btrfs_repair_done(fbio); else - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); } static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) @@ -360,7 +355,7 @@ static void btrfs_end_bio_work(struct work_struct *work) if (is_data_bbio(bbio)) btrfs_check_read_bio(bbio, bbio->bio.bi_private); else - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); } static void btrfs_simple_end_io(struct bio *bio) @@ -380,7 +375,7 @@ static void btrfs_simple_end_io(struct bio *bio) } else { if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) btrfs_record_physical_zoned(bbio); - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); } } @@ -394,7 +389,7 @@ static void btrfs_raid56_end_io(struct bio *bio) if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) btrfs_check_read_bio(bbio, NULL); else - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); btrfs_put_bioc(bioc); } @@ -424,7 +419,7 @@ static void btrfs_orig_write_end_io(struct bio *bio) if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); btrfs_put_bioc(bioc); } @@ -593,7 +588,7 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free) /* If an error occurred we just want to clean up the bio and move on. */ if (bio->bi_status) { - btrfs_orig_bbio_end_io(async->bbio); + btrfs_bio_end_io(async->bbio, async->bbio->bio.bi_status); return; } @@ -768,11 +763,9 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset); ASSERT(remaining); - remaining->bio.bi_status = ret; - btrfs_orig_bbio_end_io(remaining); + btrfs_bio_end_io(remaining, ret); } - bbio->bio.bi_status = ret; - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, ret); /* Do not submit another chunk */ return true; } From 39bd93d111db3d706128b3633c7f760256ca5ce0 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 29 May 2024 17:03:47 +0930 Subject: [PATCH 956/991] btrfs: make compression path to be subpage compatible Currently btrfs compression path is not really subpage compatible, every thing is still done in page unit. That's fine for regular sector size and subpage routine. As even for subpage routine compression is only enabled if the whole range is page aligned, so reading the page cache in page unit is totally fine. However in preparation for the future subpage perfect compression support, we need to change the compression routine to properly handle a subpage range. This patch would prepare both zlib and zstd to only read the subpage range for compression. Lzo is already doing subpage aware read, as lzo's on-disk format is already sectorsize dependent. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.h | 8 ++++++++ fs/btrfs/zlib.c | 19 ++++++++++++++++--- fs/btrfs/zstd.c | 19 +++++++++++++------ 3 files changed, 37 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index cfdc643191862a..5d01f092ae13f2 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -82,6 +82,14 @@ static inline unsigned int btrfs_compress_level(unsigned int type_level) return ((type_level & 0xF0) >> 4); } +/* @range_end must be exclusive. */ +static inline u32 btrfs_calc_input_length(u64 range_end, u64 cur) +{ + u64 page_end = round_down(cur, PAGE_SIZE) + PAGE_SIZE; + + return min(range_end, page_end) - cur; +} + int __init btrfs_init_compress(void); void __cold btrfs_exit_compress(void); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 30971dd741e2df..ee80d064b9549c 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -20,6 +20,8 @@ #include #include "btrfs_inode.h" #include "compression.h" +#include "fs.h" +#include "subpage.h" /* workspace buffer size for s390 zlib hardware support */ #define ZLIB_DFLTCC_BUF_SIZE (4 * PAGE_SIZE) @@ -108,6 +110,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, unsigned long len = *total_out; unsigned long nr_dest_folios = *out_folios; const unsigned long max_out = nr_dest_folios * PAGE_SIZE; + const u64 orig_end = start + len; *out_folios = 0; *total_out = 0; @@ -153,6 +156,10 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, if (in_buf_folios > 1) { int i; + /* S390 hardware acceleration path, not subpage. */ + ASSERT(!btrfs_is_subpage( + inode_to_fs_info(mapping->host), + mapping)); for (i = 0; i < in_buf_folios; i++) { if (data_in) { kunmap_local(data_in); @@ -167,9 +174,14 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, copy_page(workspace->buf + i * PAGE_SIZE, data_in); start += PAGE_SIZE; + workspace->strm.avail_in = + (in_buf_folios << PAGE_SHIFT); } workspace->strm.next_in = workspace->buf; } else { + unsigned int pg_off; + unsigned int cur_len; + if (data_in) { kunmap_local(data_in); folio_put(in_folio); @@ -179,12 +191,13 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, start, &in_folio); if (ret < 0) goto out; - data_in = kmap_local_folio(in_folio, 0); + pg_off = offset_in_page(start); + cur_len = btrfs_calc_input_length(orig_end, start); + data_in = kmap_local_folio(in_folio, pg_off); start += PAGE_SIZE; workspace->strm.next_in = data_in; + workspace->strm.avail_in = cur_len; } - workspace->strm.avail_in = min(bytes_left, - (unsigned long) workspace->buf_size); } ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH); diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 2a079561b2b10b..05cf7cebc17c85 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -389,7 +389,10 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, unsigned long tot_out = 0; unsigned long len = *total_out; const unsigned long nr_dest_folios = *out_folios; + const u64 orig_end = start + len; unsigned long max_out = nr_dest_folios * PAGE_SIZE; + unsigned int pg_off; + unsigned int cur_len; zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level, len); @@ -415,9 +418,11 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); if (ret < 0) goto out; - workspace->in_buf.src = kmap_local_folio(in_folio, 0); + pg_off = offset_in_page(start); + cur_len = btrfs_calc_input_length(orig_end, start); + workspace->in_buf.src = kmap_local_folio(in_folio, pg_off); workspace->in_buf.pos = 0; - workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); + workspace->in_buf.size = cur_len; /* Allocate and map in the output buffer */ out_folio = btrfs_alloc_compr_folio(); @@ -494,14 +499,16 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, kunmap_local(workspace->in_buf.src); workspace->in_buf.src = NULL; folio_put(in_folio); - start += PAGE_SIZE; - len -= PAGE_SIZE; + start += cur_len; + len -= cur_len; ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); if (ret < 0) goto out; - workspace->in_buf.src = kmap_local_folio(in_folio, 0); + pg_off = offset_in_page(start); + cur_len = btrfs_calc_input_length(orig_end, start); + workspace->in_buf.src = kmap_local_folio(in_folio, pg_off); workspace->in_buf.pos = 0; - workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); + workspace->in_buf.size = cur_len; } } while (1) { From 9f2ea6d5fb74baa0b32ba8bd7541009acae9cad9 Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 29 Aug 2024 02:28:55 +0800 Subject: [PATCH 957/991] btrfs: convert clear_page_extent_mapped() to take a folio The old page API is being gradually replaced and converted to use folio to improve code readability and avoid repeated conversion between page and folio. Now clear_page_extent_mapped() can deal with a folio directly, so change its name to clear_folio_extent_mapped(). Signed-off-by: Li Zetao Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 9 ++++----- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 4 ++-- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index bd1a7b2fc71ad6..8b5f36474ae157 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -877,18 +877,17 @@ int set_folio_extent_mapped(struct folio *folio) return 0; } -void clear_page_extent_mapped(struct page *page) +void clear_folio_extent_mapped(struct folio *folio) { - struct folio *folio = page_folio(page); struct btrfs_fs_info *fs_info; - ASSERT(page->mapping); + ASSERT(folio->mapping); if (!folio_test_private(folio)) return; - fs_info = page_to_fs_info(page); - if (btrfs_is_subpage(fs_info, page->mapping)) + fs_info = folio_to_fs_info(folio); + if (btrfs_is_subpage(fs_info, folio->mapping)) return btrfs_detach_subpage(fs_info, folio); folio_detach_private(folio); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index b38460279b9932..1d9b30021109b9 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -249,7 +249,7 @@ int btree_write_cache_pages(struct address_space *mapping, void btrfs_readahead(struct readahead_control *rac); int set_folio_extent_mapped(struct folio *folio); int set_page_extent_mapped(struct page *page); -void clear_page_extent_mapped(struct page *page); +void clear_folio_extent_mapped(struct folio *folio); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e96b63d7e8fd38..5e3c772eed2b43 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7240,7 +7240,7 @@ static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) { if (try_release_extent_mapping(&folio->page, gfp_flags)) { wait_subpage_spinlock(folio); - clear_page_extent_mapped(&folio->page); + clear_folio_extent_mapped(folio); return true; } return false; @@ -7438,7 +7438,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); if (!inode_evicting) __btrfs_release_folio(folio, GFP_NOFS); - clear_page_extent_mapped(&folio->page); + clear_folio_extent_mapped(folio); } static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) From 236edd4dc464840a0f67dcea2d66605c0902600e Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 29 Aug 2024 02:28:56 +0800 Subject: [PATCH 958/991] btrfs: convert get_next_extent_buffer() to take a folio The old page API is being gradually replaced and converted to use folio to improve code readability and avoid repeated conversion between page and folio. Use folio_pos instead of page_offset, which is more consistent with folio usage. Signed-off-by: Li Zetao Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8b5f36474ae157..65b6b391f283a4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4041,17 +4041,17 @@ void memmove_extent_buffer(const struct extent_buffer *dst, #define GANG_LOOKUP_SIZE 16 static struct extent_buffer *get_next_extent_buffer( - const struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) + const struct btrfs_fs_info *fs_info, struct folio *folio, u64 bytenr) { struct extent_buffer *gang[GANG_LOOKUP_SIZE]; struct extent_buffer *found = NULL; - u64 page_start = page_offset(page); - u64 cur = page_start; + u64 folio_start = folio_pos(folio); + u64 cur = folio_start; - ASSERT(in_range(bytenr, page_start, PAGE_SIZE)); + ASSERT(in_range(bytenr, folio_start, PAGE_SIZE)); lockdep_assert_held(&fs_info->buffer_lock); - while (cur < page_start + PAGE_SIZE) { + while (cur < folio_start + PAGE_SIZE) { int ret; int i; @@ -4063,7 +4063,7 @@ static struct extent_buffer *get_next_extent_buffer( goto out; for (i = 0; i < ret; i++) { /* Already beyond page end */ - if (gang[i]->start >= page_start + PAGE_SIZE) + if (gang[i]->start >= folio_start + PAGE_SIZE) goto out; /* Found one */ if (gang[i]->start >= bytenr) { @@ -4096,7 +4096,7 @@ static int try_release_subpage_extent_buffer(struct page *page) * with spinlock rather than RCU. */ spin_lock(&fs_info->buffer_lock); - eb = get_next_extent_buffer(fs_info, page, cur); + eb = get_next_extent_buffer(fs_info, page_folio(page), cur); if (!eb) { /* No more eb in the page range after or at cur */ spin_unlock(&fs_info->buffer_lock); From 25f0fc94603a6130422191b1f893716f3e63f56f Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 29 Aug 2024 02:28:57 +0800 Subject: [PATCH 959/991] btrfs: convert try_release_subpage_extent_buffer() to take a folio The old page API is being gradually replaced and converted to use folio to improve code readability and avoid repeated conversion between page and folio. And use folio_pos instead of page_offset, which is more consistent with folio usage. At the same time, folio_test_private() can handle folio directly without converting from page to folio first. Signed-off-by: Li Zetao Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 65b6b391f283a4..f5508cfb36d9cc 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4077,11 +4077,11 @@ static struct extent_buffer *get_next_extent_buffer( return found; } -static int try_release_subpage_extent_buffer(struct page *page) +static int try_release_subpage_extent_buffer(struct folio *folio) { - struct btrfs_fs_info *fs_info = page_to_fs_info(page); - u64 cur = page_offset(page); - const u64 end = page_offset(page) + PAGE_SIZE; + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); + u64 cur = folio_pos(folio); + const u64 end = cur + PAGE_SIZE; int ret; while (cur < end) { @@ -4096,7 +4096,7 @@ static int try_release_subpage_extent_buffer(struct page *page) * with spinlock rather than RCU. */ spin_lock(&fs_info->buffer_lock); - eb = get_next_extent_buffer(fs_info, page_folio(page), cur); + eb = get_next_extent_buffer(fs_info, folio, cur); if (!eb) { /* No more eb in the page range after or at cur */ spin_unlock(&fs_info->buffer_lock); @@ -4137,12 +4137,12 @@ static int try_release_subpage_extent_buffer(struct page *page) * Finally to check if we have cleared folio private, as if we have * released all ebs in the page, the folio private should be cleared now. */ - spin_lock(&page->mapping->i_private_lock); - if (!folio_test_private(page_folio(page))) + spin_lock(&folio->mapping->i_private_lock); + if (!folio_test_private(folio)) ret = 1; else ret = 0; - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); return ret; } @@ -4153,7 +4153,7 @@ int try_release_extent_buffer(struct page *page) struct extent_buffer *eb; if (page_to_fs_info(page)->nodesize < PAGE_SIZE) - return try_release_subpage_extent_buffer(page); + return try_release_subpage_extent_buffer(page_folio(page)); /* * We need to make sure nobody is changing folio private, as we rely on From 61953a966d28f442c71cefdf46734021fe392c03 Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 29 Aug 2024 02:28:58 +0800 Subject: [PATCH 960/991] btrfs: convert try_release_extent_buffer() to take a folio The old page API is being gradually replaced and converted to use folio to improve code readability and avoid repeated conversion between page and folio. Signed-off-by: Li Zetao Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent_io.c | 15 +++++++-------- fs/btrfs/extent_io.h | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 612460e07b2e74..25d768e67e3724 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -525,7 +525,7 @@ static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags) if (folio_test_writeback(folio) || folio_test_dirty(folio)) return false; - return try_release_extent_buffer(&folio->page); + return try_release_extent_buffer(folio); } static void btree_invalidate_folio(struct folio *folio, size_t offset, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f5508cfb36d9cc..f8b001053d0591 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4147,21 +4147,20 @@ static int try_release_subpage_extent_buffer(struct folio *folio) } -int try_release_extent_buffer(struct page *page) +int try_release_extent_buffer(struct folio *folio) { - struct folio *folio = page_folio(page); struct extent_buffer *eb; - if (page_to_fs_info(page)->nodesize < PAGE_SIZE) - return try_release_subpage_extent_buffer(page_folio(page)); + if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE) + return try_release_subpage_extent_buffer(folio); /* * We need to make sure nobody is changing folio private, as we rely on * folio private as the pointer to extent buffer. */ - spin_lock(&page->mapping->i_private_lock); + spin_lock(&folio->mapping->i_private_lock); if (!folio_test_private(folio)) { - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); return 1; } @@ -4176,10 +4175,10 @@ int try_release_extent_buffer(struct page *page) spin_lock(&eb->refs_lock); if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { spin_unlock(&eb->refs_lock); - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); return 0; } - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); /* * If tree ref isn't set then we know the ref on this eb is a real ref, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 1d9b30021109b9..345774c84c4bca 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -237,7 +237,7 @@ static inline void extent_changeset_free(struct extent_changeset *changeset) } bool try_release_extent_mapping(struct page *page, gfp_t mask); -int try_release_extent_buffer(struct page *page); +int try_release_extent_buffer(struct folio *folio); int btrfs_read_folio(struct file *file, struct folio *folio); void extent_write_locked_range(struct inode *inode, const struct folio *locked_folio, From f7b8f67caa901d8398878d5a771696cb67689189 Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 29 Aug 2024 02:28:59 +0800 Subject: [PATCH 961/991] btrfs: convert read_key_bytes() to take a folio The old page API is being gradually replaced and converted to use folio to improve code readability and avoid repeated conversion between page and folio. Moreover, use kmap_local_folio() instead of kmap_local_page(), which is more consistent with folio usage. Signed-off-by: Li Zetao Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/verity.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 4042dd6437aefa..e36dc99021a0bf 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -284,7 +284,7 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, * page and ignore dest, but it must still be non-NULL to avoid the * counting-only behavior. * @len: length in bytes to read - * @dest_page: copy into this page instead of the dest buffer + * @dest_folio: copy into this folio instead of the dest buffer * * Helper function to read items from the btree. This returns the number of * bytes read or < 0 for errors. We can return short reads if the items don't @@ -294,7 +294,7 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, * Returns number of bytes read or a negative error code on failure. */ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, - char *dest, u64 len, struct page *dest_page) + char *dest, u64 len, struct folio *dest_folio) { struct btrfs_path *path; struct btrfs_root *root = inode->root; @@ -314,7 +314,7 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, if (!path) return -ENOMEM; - if (dest_page) + if (dest_folio) path->reada = READA_FORWARD; key.objectid = btrfs_ino(inode); @@ -371,15 +371,15 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, copy_offset = offset - key.offset; if (dest) { - if (dest_page) - kaddr = kmap_local_page(dest_page); + if (dest_folio) + kaddr = kmap_local_folio(dest_folio, 0); data = btrfs_item_ptr(leaf, path->slots[0], void); read_extent_buffer(leaf, kaddr + dest_offset, (unsigned long)data + copy_offset, copy_bytes); - if (dest_page) + if (dest_folio) kunmap_local(kaddr); } @@ -762,7 +762,7 @@ static struct page *btrfs_read_merkle_tree_page(struct inode *inode, * [ inode objectid, BTRFS_MERKLE_ITEM_KEY, offset in bytes ] */ ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, off, - folio_address(folio), PAGE_SIZE, &folio->page); + folio_address(folio), PAGE_SIZE, folio); if (ret < 0) { folio_put(folio); return ERR_PTR(ret); From 8cc7158f71dad9c454df37f74c62c51755bfe076 Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 29 Aug 2024 02:29:00 +0800 Subject: [PATCH 962/991] btrfs: convert submit_eb_subpage() to take a folio The old page API is being gradually replaced and converted to use folio to improve code readability and avoid repeated conversion between page and folio. Moreover, use folio_pos() instead of page_offset(), which is more consistent with folio usage. Signed-off-by: Li Zetao Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f8b001053d0591..f16c5be22849d8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1735,12 +1735,11 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, * Return >=0 for the number of submitted extent buffers. * Return <0 for fatal error. */ -static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) +static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc) { - struct btrfs_fs_info *fs_info = page_to_fs_info(page); - struct folio *folio = page_folio(page); + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); int submitted = 0; - u64 page_start = page_offset(page); + u64 folio_start = folio_pos(folio); int bit_start = 0; int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; @@ -1755,21 +1754,21 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) * Take private lock to ensure the subpage won't be detached * in the meantime. */ - spin_lock(&page->mapping->i_private_lock); + spin_lock(&folio->mapping->i_private_lock); if (!folio_test_private(folio)) { - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); break; } spin_lock_irqsave(&subpage->lock, flags); if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * fs_info->sectors_per_page, subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); bit_start++; continue; } - start = page_start + bit_start * fs_info->sectorsize; + start = folio_start + bit_start * fs_info->sectorsize; bit_start += sectors_per_node; /* @@ -1778,7 +1777,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) */ eb = find_extent_buffer_nolock(fs_info, start); spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); /* * The eb has already reached 0 refs thus find_extent_buffer() @@ -1829,7 +1828,7 @@ static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) return 0; if (page_to_fs_info(page)->nodesize < PAGE_SIZE) - return submit_eb_subpage(page, wbc); + return submit_eb_subpage(folio, wbc); spin_lock(&mapping->i_private_lock); if (!folio_test_private(folio)) { From 914354de88cf2672c48856b6958ad58a2a6e62c6 Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 29 Aug 2024 02:29:01 +0800 Subject: [PATCH 963/991] btrfs: convert submit_eb_page() to take a folio The old page API is being gradually replaced and converted to use folio to improve code readability and avoid repeated conversion between page and folio. Signed-off-by: Li Zetao Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f16c5be22849d8..14d2cc71fffdff 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1816,18 +1816,17 @@ static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc) * previous call. * Return <0 for fatal error. */ -static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) +static int submit_eb_page(struct folio *folio, struct btrfs_eb_write_context *ctx) { struct writeback_control *wbc = ctx->wbc; - struct address_space *mapping = page->mapping; - struct folio *folio = page_folio(page); + struct address_space *mapping = folio->mapping; struct extent_buffer *eb; int ret; if (!folio_test_private(folio)) return 0; - if (page_to_fs_info(page)->nodesize < PAGE_SIZE) + if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE) return submit_eb_subpage(folio, wbc); spin_lock(&mapping->i_private_lock); @@ -1926,7 +1925,7 @@ int btree_write_cache_pages(struct address_space *mapping, for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; - ret = submit_eb_page(&folio->page, &ctx); + ret = submit_eb_page(folio, &ctx); if (ret == 0) continue; if (ret < 0) { From 016a94d4269429181853c1c59af52cb9ba495204 Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 29 Aug 2024 02:29:02 +0800 Subject: [PATCH 964/991] btrfs: convert try_release_extent_state() to take a folio The old page API is being gradually replaced and converted to use folio to improve code readability and avoid repeated conversion between page and folio. Moreover, use folio_pos() instead of page_offset(), which is more consistent with folio usage. Signed-off-by: Li Zetao Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 14d2cc71fffdff..430dc1b9de320e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2303,9 +2303,9 @@ int extent_invalidate_folio(struct extent_io_tree *tree, * to drop the page. */ static bool try_release_extent_state(struct extent_io_tree *tree, - struct page *page, gfp_t mask) + struct folio *folio, gfp_t mask) { - u64 start = page_offset(page); + u64 start = folio_pos(folio); u64 end = start + PAGE_SIZE - 1; bool ret; @@ -2414,7 +2414,7 @@ bool try_release_extent_mapping(struct page *page, gfp_t mask) cond_resched(); } } - return try_release_extent_state(io_tree, page, mask); + return try_release_extent_state(io_tree, folio, mask); } static void __free_extent_buffer(struct extent_buffer *eb) From 8d62985cc4c54a32051524ba128e1f8248b848a6 Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 29 Aug 2024 02:29:03 +0800 Subject: [PATCH 965/991] btrfs: convert try_release_extent_mapping() to take a folio The old page API is being gradually replaced and converted to use folio to improve code readability and avoid repeated conversion between page and folio. And page_to_inode() can be replaced with folio_to_inode() now. Signed-off-by: Li Zetao Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 6 +++--- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 430dc1b9de320e..c07930986fe586 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2341,11 +2341,11 @@ static bool try_release_extent_state(struct extent_io_tree *tree, * in the range corresponding to the page, both state records and extent * map records are removed */ -bool try_release_extent_mapping(struct page *page, gfp_t mask) +bool try_release_extent_mapping(struct folio *folio, gfp_t mask) { - u64 start = page_offset(page); + u64 start = folio_pos(folio); u64 end = start + PAGE_SIZE - 1; - struct btrfs_inode *inode = page_to_inode(page); + struct btrfs_inode *inode = folio_to_inode(folio); struct extent_io_tree *io_tree = &inode->io_tree; while (start <= end) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 345774c84c4bca..8a36117ed4532b 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -236,7 +236,7 @@ static inline void extent_changeset_free(struct extent_changeset *changeset) kfree(changeset); } -bool try_release_extent_mapping(struct page *page, gfp_t mask); +bool try_release_extent_mapping(struct folio *folio, gfp_t mask); int try_release_extent_buffer(struct folio *folio); int btrfs_read_folio(struct file *file, struct folio *folio); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5e3c772eed2b43..1483e33127b772 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7238,7 +7238,7 @@ static int btrfs_launder_folio(struct folio *folio) static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) { - if (try_release_extent_mapping(&folio->page, gfp_flags)) { + if (try_release_extent_mapping(folio, gfp_flags)) { wait_subpage_spinlock(folio); clear_folio_extent_mapped(folio); return true; From 97c3db14ed270b31e2167a62ca2fcae53c6aefec Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 29 Aug 2024 02:29:04 +0800 Subject: [PATCH 966/991] btrfs: convert zlib_decompress() to take a folio The old page API is being gradually replaced and converted to use folio to improve code readability and avoid repeated conversion between page and folio. And memcpy_to_page() can be replaced with memcpy_to_folio(). But there is no memzero_folio(), but it can be replaced equivalently by folio_zero_range(). Signed-off-by: Li Zetao Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 2 +- fs/btrfs/compression.h | 2 +- fs/btrfs/zlib.c | 14 +++++++------- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 52952745d44a43..e97692fdfab006 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -142,7 +142,7 @@ static int compression_decompress(int type, struct list_head *ws, unsigned long dest_pgoff, size_t srclen, size_t destlen) { switch (type) { - case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page, + case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, page_folio(dest_page), dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page, dest_pgoff, srclen, destlen); diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 5d01f092ae13f2..f4f7a981cb90cc 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -162,7 +162,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zlib_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); struct list_head *zlib_alloc_workspace(unsigned int level); void zlib_free_workspace(struct list_head *ws); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index ee80d064b9549c..100abc00b794ca 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -393,7 +393,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) } int zlib_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); @@ -421,12 +421,12 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in, ret = zlib_inflateInit2(&workspace->strm, wbits); if (unlikely(ret != Z_OK)) { - struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host); + struct btrfs_inode *inode = folio_to_inode(dest_folio); btrfs_err(inode->root->fs_info, "zlib decompression init failed, error %d root %llu inode %llu offset %llu", ret, btrfs_root_id(inode->root), btrfs_ino(inode), - page_offset(dest_page)); + folio_pos(dest_folio)); return -EIO; } @@ -439,16 +439,16 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in, if (ret != Z_STREAM_END) goto out; - memcpy_to_page(dest_page, dest_pgoff, workspace->buf, to_copy); + memcpy_to_folio(dest_folio, dest_pgoff, workspace->buf, to_copy); out: if (unlikely(to_copy != destlen)) { - struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host); + struct btrfs_inode *inode = folio_to_inode(dest_folio); btrfs_err(inode->root->fs_info, "zlib decompression failed, error %d root %llu inode %llu offset %llu decompressed %lu expected %zu", ret, btrfs_root_id(inode->root), btrfs_ino(inode), - page_offset(dest_page), to_copy, destlen); + folio_pos(dest_folio), to_copy, destlen); ret = -EIO; } else { ret = 0; @@ -457,7 +457,7 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in, zlib_inflateEnd(&workspace->strm); if (unlikely(to_copy < destlen)) - memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy); + folio_zero_range(dest_folio, dest_pgoff + to_copy, destlen - to_copy); return ret; } From aa87c16612750af328082fa56702c8f53c7fe717 Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 29 Aug 2024 02:29:05 +0800 Subject: [PATCH 967/991] btrfs: convert lzo_decompress() to take a folio The old page API is being gradually replaced and converted to use folio to improve code readability and avoid repeated conversion between page and folio. And memcpy_to_page() can be replaced with memcpy_to_folio(). But there is no memzero_folio(), but it can be replaced equivalently by folio_zero_range(). Signed-off-by: Li Zetao Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 2 +- fs/btrfs/compression.h | 2 +- fs/btrfs/lzo.c | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index e97692fdfab006..a70ca9c94ea791 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -144,7 +144,7 @@ static int compression_decompress(int type, struct list_head *ws, switch (type) { case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, page_folio(dest_page), dest_pgoff, srclen, destlen); - case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page, + case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, page_folio(dest_page), dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page, dest_pgoff, srclen, destlen); diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index f4f7a981cb90cc..4b5a7ba54815cc 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -173,7 +173,7 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); struct list_head *lzo_alloc_workspace(unsigned int level); void lzo_free_workspace(struct list_head *ws); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 1e2a68b8f62d8a..72856f6775f7f1 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -438,11 +438,11 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) } int lzo_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); - struct btrfs_fs_info *fs_info = page_to_fs_info(dest_page); + struct btrfs_fs_info *fs_info = folio_to_fs_info(dest_folio); const u32 sectorsize = fs_info->sectorsize; size_t in_len; size_t out_len; @@ -467,22 +467,22 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, out_len = sectorsize; ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); if (unlikely(ret != LZO_E_OK)) { - struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host); + struct btrfs_inode *inode = folio_to_inode(dest_folio); btrfs_err(fs_info, "lzo decompression failed, error %d root %llu inode %llu offset %llu", ret, btrfs_root_id(inode->root), btrfs_ino(inode), - page_offset(dest_page)); + folio_pos(dest_folio)); ret = -EIO; goto out; } ASSERT(out_len <= sectorsize); - memcpy_to_page(dest_page, dest_pgoff, workspace->buf, out_len); + memcpy_to_folio(dest_folio, dest_pgoff, workspace->buf, out_len); /* Early end, considered as an error. */ if (unlikely(out_len < destlen)) { ret = -EIO; - memzero_page(dest_page, dest_pgoff + out_len, destlen - out_len); + folio_zero_range(dest_folio, dest_pgoff + out_len, destlen - out_len); } out: return ret; From 09b3e769c77e9662820d9a6314b9311a9df482fa Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 29 Aug 2024 02:29:06 +0800 Subject: [PATCH 968/991] btrfs: convert zstd_decompress() to take a folio The old page API is being gradually replaced and converted to use folio to improve code readability and avoid repeated conversion between page and folio. And memcpy_to_page() can be replaced with memcpy_to_folio(). But there is no memzero_folio(), but it can be replaced equivalently by folio_zero_range(). Signed-off-by: Li Zetao Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 2 +- fs/btrfs/compression.h | 2 +- fs/btrfs/zstd.c | 16 ++++++++-------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index a70ca9c94ea791..a5f7cbe1c06ed2 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -146,7 +146,7 @@ static int compression_decompress(int type, struct list_head *ws, dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, page_folio(dest_page), dest_pgoff, srclen, destlen); - case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page, + case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, page_folio(dest_page), dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_NONE: default: diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 4b5a7ba54815cc..d2453cf28eef8e 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -183,7 +183,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zstd_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); void zstd_init_workspace_manager(void); void zstd_cleanup_workspace_manager(void); diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 05cf7cebc17c85..866607fd3e5884 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -656,11 +656,11 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) } int zstd_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); - struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); + struct btrfs_fs_info *fs_info = btrfs_sb(folio_inode(dest_folio)->i_sb); const u32 sectorsize = fs_info->sectorsize; zstd_dstream *stream; int ret = 0; @@ -669,12 +669,12 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in, stream = zstd_init_dstream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); if (unlikely(!stream)) { - struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host); + struct btrfs_inode *inode = folio_to_inode(dest_folio); btrfs_err(inode->root->fs_info, "zstd decompression init failed, root %llu inode %llu offset %llu", btrfs_root_id(inode->root), btrfs_ino(inode), - page_offset(dest_page)); + folio_pos(dest_folio)); ret = -EIO; goto finish; } @@ -693,21 +693,21 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in, */ ret = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf); if (unlikely(zstd_is_error(ret))) { - struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host); + struct btrfs_inode *inode = folio_to_inode(dest_folio); btrfs_err(inode->root->fs_info, "zstd decompression failed, error %d root %llu inode %llu offset %llu", zstd_get_error_code(ret), btrfs_root_id(inode->root), - btrfs_ino(inode), page_offset(dest_page)); + btrfs_ino(inode), folio_pos(dest_folio)); goto finish; } to_copy = workspace->out_buf.pos; - memcpy_to_page(dest_page, dest_pgoff, workspace->out_buf.dst, to_copy); + memcpy_to_folio(dest_folio, dest_pgoff, workspace->out_buf.dst, to_copy); finish: /* Error or early end. */ if (unlikely(to_copy < destlen)) { ret = -EIO; - memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy); + folio_zero_range(dest_folio, dest_pgoff + to_copy, destlen - to_copy); } return ret; } From 8813a2f8955bf2c6a271cd5448a8b87a5cddf8f3 Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 29 Aug 2024 02:29:07 +0800 Subject: [PATCH 969/991] btrfs: convert btrfs_decompress() to take a folio The old page API is being gradually replaced and converted to use folio to improve code readability and avoid repeated conversion between page and folio. Based on the previous patch, the compression path can be directly used in folio without converting to page. Signed-off-by: Li Zetao Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 14 +++++++------- fs/btrfs/compression.h | 2 +- fs/btrfs/inode.c | 2 +- fs/btrfs/reflink.c | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index a5f7cbe1c06ed2..90aef2627ca27b 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -138,15 +138,15 @@ static int compression_decompress_bio(struct list_head *ws, } static int compression_decompress(int type, struct list_head *ws, - const u8 *data_in, struct page *dest_page, + const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { switch (type) { - case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, page_folio(dest_page), + case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_folio, dest_pgoff, srclen, destlen); - case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, page_folio(dest_page), + case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_folio, dest_pgoff, srclen, destlen); - case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, page_folio(dest_page), + case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_folio, dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_NONE: default: @@ -1061,10 +1061,10 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) * single page, and we want to read a single page out of it. * start_byte tells us the offset into the compressed data we're interested in */ -int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, +int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { - struct btrfs_fs_info *fs_info = page_to_fs_info(dest_page); + struct btrfs_fs_info *fs_info = folio_to_fs_info(dest_folio); struct list_head *workspace; const u32 sectorsize = fs_info->sectorsize; int ret; @@ -1077,7 +1077,7 @@ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize); workspace = get_workspace(type, 0); - ret = compression_decompress(type, workspace, data_in, dest_page, + ret = compression_decompress(type, workspace, data_in, dest_folio, dest_pgoff, srclen, destlen); put_workspace(type, workspace); diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index d2453cf28eef8e..b6563b6a333e5d 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -96,7 +96,7 @@ void __cold btrfs_exit_compress(void); int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); -int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, +int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, unsigned long start_byte, size_t srclen, size_t destlen); int btrfs_decompress_buf2page(const char *buf, u32 buf_len, struct compressed_bio *cb, u32 decompressed); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1483e33127b772..edac499fd83d29 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6746,7 +6746,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); max_size = min_t(unsigned long, PAGE_SIZE, max_size); - ret = btrfs_decompress(compress_type, tmp, &folio->page, 0, inline_size, + ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size, max_size); /* diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index df6b93b927cd85..b768e590a44c22 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -118,7 +118,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode, memcpy_to_page(page, offset_in_page(file_offset), data_start, datal); } else { - ret = btrfs_decompress(comp_type, data_start, page, + ret = btrfs_decompress(comp_type, data_start, page_folio(page), offset_in_page(file_offset), inline_size, datal); if (ret) From eef61487b32fba1c6d24cd5a0c3262c338af58cc Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 29 Aug 2024 02:29:08 +0800 Subject: [PATCH 970/991] btrfs: convert copy_inline_to_page() to use folio The old page API is being gradually replaced and converted to use folio to improve code readability and avoid repeated conversion between page and folio. Moreover find_or_create_page() is compatible API, and it can replaced with __filemap_get_folio(). Some interfaces have been converted to use folio before, so the conversion operation from page can be eliminated here. Signed-off-by: Li Zetao Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/reflink.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index b768e590a44c22..f0824c948cb700 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -66,7 +66,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode, const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0); char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0); struct extent_changeset *data_reserved = NULL; - struct page *page = NULL; + struct folio *folio = NULL; struct address_space *mapping = inode->vfs_inode.i_mapping; int ret; @@ -83,14 +83,15 @@ static int copy_inline_to_page(struct btrfs_inode *inode, if (ret) goto out; - page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT, - btrfs_alloc_write_mask(mapping)); - if (!page) { + folio = __filemap_get_folio(mapping, file_offset >> PAGE_SHIFT, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + btrfs_alloc_write_mask(mapping)); + if (IS_ERR(folio)) { ret = -ENOMEM; goto out_unlock; } - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) goto out_unlock; @@ -115,15 +116,15 @@ static int copy_inline_to_page(struct btrfs_inode *inode, set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags); if (comp_type == BTRFS_COMPRESS_NONE) { - memcpy_to_page(page, offset_in_page(file_offset), data_start, - datal); + memcpy_to_folio(folio, offset_in_folio(folio, file_offset), data_start, + datal); } else { - ret = btrfs_decompress(comp_type, data_start, page_folio(page), - offset_in_page(file_offset), + ret = btrfs_decompress(comp_type, data_start, folio, + offset_in_folio(folio, file_offset), inline_size, datal); if (ret) goto out_unlock; - flush_dcache_page(page); + flush_dcache_folio(folio); } /* @@ -139,15 +140,15 @@ static int copy_inline_to_page(struct btrfs_inode *inode, * So what's in the range [500, 4095] corresponds to zeroes. */ if (datal < block_size) - memzero_page(page, datal, block_size - datal); + folio_zero_range(folio, datal, block_size - datal); - btrfs_folio_set_uptodate(fs_info, page_folio(page), file_offset, block_size); - btrfs_folio_clear_checked(fs_info, page_folio(page), file_offset, block_size); - btrfs_folio_set_dirty(fs_info, page_folio(page), file_offset, block_size); + btrfs_folio_set_uptodate(fs_info, folio, file_offset, block_size); + btrfs_folio_clear_checked(fs_info, folio, file_offset, block_size); + btrfs_folio_set_dirty(fs_info, folio, file_offset, block_size); out_unlock: - if (page) { - unlock_page(page); - put_page(page); + if (!IS_ERR(folio)) { + folio_unlock(folio); + folio_put(folio); } if (ret) btrfs_delalloc_release_space(inode, data_reserved, file_offset, From 9f6ff37c3f7060b980a4029a9d1e284a75621684 Mon Sep 17 00:00:00 2001 From: Luca Stefani Date: Mon, 2 Sep 2024 13:10:53 +0200 Subject: [PATCH 971/991] btrfs: always update fstrim_range on failure in FITRIM ioctl Even in case of failure we could've discarded some data and userspace should be made aware of it, so copy fstrim_range to userspace regardless. Also make sure to update the trimmed bytes amount even if btrfs_trim_free_extents fails. CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Qu Wenruo Signed-off-by: Luca Stefani Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 4 ++-- fs/btrfs/ioctl.c | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index feec49e6f9c801..a5966324607d49 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6551,13 +6551,13 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) continue; ret = btrfs_trim_free_extents(device, &group_trimmed); + + trimmed += group_trimmed; if (ret) { dev_failed++; dev_ret = ret; break; } - - trimmed += group_trimmed; } mutex_unlock(&fs_devices->device_list_mutex); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ee01cc8288837e..8537eb9b553121 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -543,13 +543,11 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, range.minlen = max(range.minlen, minlen); ret = btrfs_trim_fs(fs_info, &range); - if (ret < 0) - return ret; if (copy_to_user(arg, &range, sizeof(range))) return -EFAULT; - return 0; + return ret; } int __pure btrfs_is_empty_uuid(const u8 *uuid) From 8887d8684347d87b67a09217d301822f45041a79 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 29 Aug 2024 19:03:52 +0100 Subject: [PATCH 972/991] btrfs: add and use helper to verify the calling task has locked the inode We have a few places that check if we have the inode locked by doing: ASSERT(inode_is_locked(vfs_inode)); This actually proved to be useful several times as if assertions are enabled (and by default they are in many distros) it immediately triggers a crash which is impossible for users to miss. However that doesn't check if the lock is held by the calling task, so the check passes if some other task locked the inode. Using one of the lockdep functions to check the lock is held, like lockdep_assert_held() for example, does check that the calling task holds the lock, and if that's not the case it produces a warning and stack trace in dmesg. However, despite the misleading "assert" in the name of the lockdep helpers, it does not trigger a crash/BUG_ON(), just a warning and splat in dmesg, which is easy to get unnoticed by users who may have lockdep enabled. So add a helper that does the ASSERT() and calls lockdep_assert_held() immediately after and use it every where we check the inode is locked. Like this if the lock is held by some other task we get the warning in dmesg which is caught by fstests, very helpful during development, and may also be occassionaly noticed by users with lockdep enabled. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 8 ++++++++ fs/btrfs/file.c | 2 +- fs/btrfs/ordered-data.c | 2 +- fs/btrfs/tree-log.c | 2 +- fs/btrfs/verity.c | 6 +++--- fs/btrfs/xattr.c | 2 +- 6 files changed, 15 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 2d7f8da54d8a66..90e72031c72457 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -505,6 +505,14 @@ static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode) return true; } +static inline void btrfs_assert_inode_locked(struct btrfs_inode *inode) +{ + /* Immediately trigger a crash if the inode is not locked. */ + ASSERT(inode_is_locked(&inode->vfs_inode)); + /* Trigger a splat in dmesg if this task is not holding the lock. */ + lockdep_assert_held(&inode->vfs_inode.i_rwsem); +} + /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index c7a7234998aa4c..c5e36f58eb079e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1617,7 +1617,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) { skip_ilock = true; current->journal_info = NULL; - lockdep_assert_held(&inode->vfs_inode.i_rwsem); + btrfs_assert_inode_locked(inode); } trace_btrfs_sync_file(file, datasync); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index eb9b32ffbc0c04..2104d60c216166 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -1015,7 +1015,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, { struct rb_node *n; - ASSERT(inode_is_locked(&inode->vfs_inode)); + btrfs_assert_inode_locked(inode); spin_lock_irq(&inode->ordered_tree_lock); for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) { diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f0cf8ce26f010f..e2ed2a791f8f03 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2877,7 +2877,7 @@ void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) struct btrfs_ordered_extent *ordered; struct btrfs_ordered_extent *tmp; - ASSERT(inode_is_locked(&ctx->inode->vfs_inode)); + btrfs_assert_inode_locked(ctx->inode); list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { list_del_init(&ordered->log_list); diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index e36dc99021a0bf..e97ad824ae16d9 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -460,7 +460,7 @@ static int rollback_verity(struct btrfs_inode *inode) struct btrfs_root *root = inode->root; int ret; - ASSERT(inode_is_locked(&inode->vfs_inode)); + btrfs_assert_inode_locked(inode); truncate_inode_pages(inode->vfs_inode.i_mapping, inode->vfs_inode.i_size); clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags); ret = btrfs_drop_verity_items(inode); @@ -585,7 +585,7 @@ static int btrfs_begin_enable_verity(struct file *filp) struct btrfs_trans_handle *trans; int ret; - ASSERT(inode_is_locked(file_inode(filp))); + btrfs_assert_inode_locked(inode); if (test_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags)) return -EBUSY; @@ -633,7 +633,7 @@ static int btrfs_end_enable_verity(struct file *filp, const void *desc, int ret = 0; int rollback_ret; - ASSERT(inode_is_locked(file_inode(filp))); + btrfs_assert_inode_locked(inode); if (desc == NULL) goto rollback; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 738c7bb8ea7c89..ce464cd8e0ac79 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -120,7 +120,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, * locks the inode's i_mutex before calling setxattr or removexattr. */ if (flags & XATTR_REPLACE) { - ASSERT(inode_is_locked(inode)); + btrfs_assert_inode_locked(BTRFS_I(inode)); di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(BTRFS_I(inode)), name, name_len, 0); if (!di) From 92de3a34bd58181dea2dd5c9a9a96bce840bc6f8 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 5 Jun 2024 02:28:30 +0200 Subject: [PATCH 973/991] btrfs: rework BTRFS_I as macro to preserve parameter const Currently BTRFS_I is a static inline function that takes a const inode and returns btrfs inode, dropping the 'const' qualifier. This can break assumptions of compiler though it seems there's no real case. To make the parameter and return type consistent regardint const we can use the container_of_const() that preserves it. However this would not check the parameter type. To fix that use the same _Generic construct but implement only the two expected types. Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 90e72031c72457..9a4b7c11931877 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -350,10 +350,12 @@ static inline void btrfs_set_first_dir_index_to_log(struct btrfs_inode *inode, WRITE_ONCE(inode->first_dir_index_to_log, index); } -static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) -{ - return container_of(inode, struct btrfs_inode, vfs_inode); -} +/* Type checked and const-preserving VFS inode -> btrfs inode. */ +#define BTRFS_I(_inode) \ + _Generic(_inode, \ + struct inode *: container_of(_inode, struct btrfs_inode, vfs_inode), \ + const struct inode *: (const struct btrfs_inode *)container_of( \ + _inode, const struct btrfs_inode, vfs_inode)) static inline unsigned long btrfs_inode_hash(u64 objectid, const struct btrfs_root *root) From 7318b56f66bf7652075e33f3a3760250de5c0cd9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 26 Jun 2024 23:39:11 +0200 Subject: [PATCH 974/991] btrfs: constify more pointer parameters Continue adding const to parameters. This is for clarity and minor addition to safety. There are some minor effects, in the assembly code and .ko measured on release config. Signed-off-by: David Sterba --- fs/btrfs/backref.c | 6 +++--- fs/btrfs/block-group.c | 34 +++++++++++++++++----------------- fs/btrfs/block-group.h | 11 +++++------ fs/btrfs/block-rsv.c | 2 +- fs/btrfs/block-rsv.h | 2 +- fs/btrfs/ctree.c | 18 +++++++++--------- fs/btrfs/ctree.h | 6 +++--- fs/btrfs/discard.c | 4 ++-- fs/btrfs/file-item.c | 4 ++-- fs/btrfs/file-item.h | 2 +- fs/btrfs/inode-item.c | 10 +++++----- fs/btrfs/inode-item.h | 4 ++-- fs/btrfs/space-info.c | 25 ++++++++++++------------- fs/btrfs/space-info.h | 10 +++++----- fs/btrfs/tree-mod-log.c | 14 +++++++------- fs/btrfs/tree-mod-log.h | 6 +++--- fs/btrfs/zoned.c | 2 +- fs/btrfs/zoned.h | 4 ++-- include/trace/events/btrfs.h | 6 +++--- 19 files changed, 84 insertions(+), 86 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index a2de5c05f97c89..e2f478ecd7fd8f 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -219,8 +219,8 @@ static void free_pref(struct prelim_ref *ref) * A -1 return indicates ref1 is a 'lower' block than ref2, while 1 * indicates a 'higher' block. */ -static int prelim_ref_compare(struct prelim_ref *ref1, - struct prelim_ref *ref2) +static int prelim_ref_compare(const struct prelim_ref *ref1, + const struct prelim_ref *ref2) { if (ref1->level < ref2->level) return -1; @@ -251,7 +251,7 @@ static int prelim_ref_compare(struct prelim_ref *ref1, } static void update_share_count(struct share_check *sc, int oldcount, - int newcount, struct prelim_ref *newref) + int newcount, const struct prelim_ref *newref) { if ((!sc) || (oldcount == 0 && newcount < 1)) return; diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 2e49d978f504ed..7980b2e33a9216 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -23,7 +23,7 @@ #include "extent-tree.h" #ifdef CONFIG_BTRFS_DEBUG -int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) +int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; @@ -40,9 +40,9 @@ int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) * * Should be called with balance_lock held */ -static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) +static u64 get_restripe_target(const struct btrfs_fs_info *fs_info, u64 flags) { - struct btrfs_balance_control *bctl = fs_info->balance_ctl; + const struct btrfs_balance_control *bctl = fs_info->balance_ctl; u64 target = 0; if (!bctl) @@ -1415,9 +1415,9 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) } static bool clean_pinned_extents(struct btrfs_trans_handle *trans, - struct btrfs_block_group *bg) + const struct btrfs_block_group *bg) { - struct btrfs_fs_info *fs_info = bg->fs_info; + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_transaction *prev_trans = NULL; const u64 start = bg->start; const u64 end = start + bg->length - 1; @@ -1756,14 +1756,14 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a, return bg1->used > bg2->used; } -static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info) +static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info) { if (btrfs_is_zoned(fs_info)) return btrfs_zoned_should_reclaim(fs_info); return true; } -static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed) +static bool should_reclaim_block_group(const struct btrfs_block_group *bg, u64 bytes_freed) { const int thresh_pct = btrfs_calc_reclaim_threshold(bg->space_info); u64 thresh_bytes = mult_perc(bg->length, thresh_pct); @@ -2006,8 +2006,8 @@ void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg) spin_unlock(&fs_info->unused_bgs_lock); } -static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key, - struct btrfs_path *path) +static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key *key, + const struct btrfs_path *path) { struct btrfs_chunk_map *map; struct btrfs_block_group_item bg; @@ -2055,7 +2055,7 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key, static int find_first_block_group(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct btrfs_key *key) + const struct btrfs_key *key) { struct btrfs_root *root = btrfs_block_group_root(fs_info); int ret; @@ -2640,8 +2640,8 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, } static int insert_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 chunk_offset, - u64 start, u64 num_bytes) + const struct btrfs_device *device, u64 chunk_offset, + u64 start, u64 num_bytes) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; @@ -2817,7 +2817,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) * For extent tree v2 we use the block_group_item->chunk_offset to point at our * global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID. */ -static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset) +static u64 calculate_global_root_id(const struct btrfs_fs_info *fs_info, u64 offset) { u64 div = SZ_1G; u64 index; @@ -3842,8 +3842,8 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) } } -static int should_alloc_chunk(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *sinfo, int force) +static int should_alloc_chunk(const struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *sinfo, int force) { u64 bytes_used = btrfs_space_info_used(sinfo, false); u64 thresh; @@ -4218,7 +4218,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, return ret; } -static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) +static u64 get_profile_num_devs(const struct btrfs_fs_info *fs_info, u64 type) { u64 num_dev; @@ -4622,7 +4622,7 @@ int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, return 0; } -bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg) +bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg) { if (btrfs_is_zoned(bg->fs_info)) return false; diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 915111338fc027..36937eeab9b8a1 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -266,7 +266,7 @@ struct btrfs_block_group { u64 reclaim_mark; }; -static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) +static inline u64 btrfs_block_group_end(const struct btrfs_block_group *block_group) { return (block_group->start + block_group->length); } @@ -278,8 +278,7 @@ static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg) return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0); } -static inline bool btrfs_is_block_group_data_only( - struct btrfs_block_group *block_group) +static inline bool btrfs_is_block_group_data_only(const struct btrfs_block_group *block_group) { /* * In mixed mode the fragmentation is expected to be high, lowering the @@ -290,7 +289,7 @@ static inline bool btrfs_is_block_group_data_only( } #ifdef CONFIG_BTRFS_DEBUG -int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group); +int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group); #endif struct btrfs_block_group *btrfs_lookup_first_block_group( @@ -370,7 +369,7 @@ static inline u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); } -static inline int btrfs_block_group_done(struct btrfs_block_group *cache) +static inline int btrfs_block_group_done(const struct btrfs_block_group *cache) { smp_mb(); return cache->cached == BTRFS_CACHE_FINISHED || @@ -387,6 +386,6 @@ enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size); int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, enum btrfs_block_group_size_class size_class, bool force_wrong_size_class); -bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg); +bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg); #endif /* BTRFS_BLOCK_GROUP_H */ diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index b299b82d676e41..a07b9594dc7067 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -553,7 +553,7 @@ struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans, return ERR_PTR(ret); } -int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, +int btrfs_check_trunc_cache_free_space(const struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv) { u64 needed_bytes; diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 1f53b967d06919..d12b1fac5c74dd 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -89,7 +89,7 @@ void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info); struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize); -int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, +int btrfs_check_trunc_cache_free_space(const struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 451203055bbfb3..0cc919d15b1441 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2564,8 +2564,8 @@ int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, * */ static void fixup_low_keys(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_disk_key *key, int level) + const struct btrfs_path *path, + const struct btrfs_disk_key *key, int level) { int i; struct extent_buffer *t; @@ -2594,7 +2594,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans, * that the new key won't break the order */ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, - struct btrfs_path *path, + const struct btrfs_path *path, const struct btrfs_key *new_key) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -2660,8 +2660,8 @@ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, * is correct, we only need to bother the last key of @left and the first * key of @right. */ -static bool check_sibling_keys(struct extent_buffer *left, - struct extent_buffer *right) +static bool check_sibling_keys(const struct extent_buffer *left, + const struct extent_buffer *right) { struct btrfs_key left_last; struct btrfs_key right_first; @@ -2928,8 +2928,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, * blocknr is the block the key points to. */ static int insert_ptr(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_disk_key *key, u64 bytenr, + const struct btrfs_path *path, + const struct btrfs_disk_key *key, u64 bytenr, int slot, int level) { struct extent_buffer *lower; @@ -4019,7 +4019,7 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, * the front. */ void btrfs_truncate_item(struct btrfs_trans_handle *trans, - struct btrfs_path *path, u32 new_size, int from_end) + const struct btrfs_path *path, u32 new_size, int from_end) { int slot; struct extent_buffer *leaf; @@ -4111,7 +4111,7 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans, * make the item pointed to by the path bigger, data_size is the added size. */ void btrfs_extend_item(struct btrfs_trans_handle *trans, - struct btrfs_path *path, u32 data_size) + const struct btrfs_path *path, u32 data_size) { int slot; struct extent_buffer *leaf; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c8568b1a61c432..fd73c284822a88 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -538,7 +538,7 @@ int btrfs_previous_item(struct btrfs_root *root, int btrfs_previous_extent_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid); void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, - struct btrfs_path *path, + const struct btrfs_path *path, const struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, @@ -572,9 +572,9 @@ bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); void btrfs_extend_item(struct btrfs_trans_handle *trans, - struct btrfs_path *path, u32 data_size); + const struct btrfs_path *path, u32 data_size); void btrfs_truncate_item(struct btrfs_trans_handle *trans, - struct btrfs_path *path, u32 new_size, int from_end); + const struct btrfs_path *path, u32 new_size, int from_end); int btrfs_split_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 944a7340f6a449..e815d165cccc22 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -68,7 +68,7 @@ static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = { }; static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, - struct btrfs_block_group *block_group) + const struct btrfs_block_group *block_group) { return &discard_ctl->discard_list[block_group->discard_index]; } @@ -80,7 +80,7 @@ static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, * * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. */ -static bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) +static bool btrfs_run_discard_work(const struct btrfs_discard_ctl *discard_ctl) { struct btrfs_fs_info *fs_info = container_of(discard_ctl, struct btrfs_fs_info, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 5c342fe1af6192..886749b3967284 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -151,7 +151,7 @@ static inline u32 max_ordered_sum_bytes(const struct btrfs_fs_info *fs_info) * Calculate the total size needed to allocate for an ordered sum structure * spanning @bytes in the file. */ -static int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes) +static int btrfs_ordered_sum_size(const struct btrfs_fs_info *fs_info, unsigned long bytes) { return sizeof(struct btrfs_ordered_sum) + bytes_to_csum_size(fs_info, bytes); } @@ -1272,7 +1272,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, - struct btrfs_file_extent_item *fi, + const struct btrfs_file_extent_item *fi, struct extent_map *em) { struct btrfs_fs_info *fs_info = inode->root->fs_info; diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 557dc43d71422b..0e13661a71f3bd 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -74,7 +74,7 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, struct btrfs_path *path, unsigned long *csum_bitmap); void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, - struct btrfs_file_extent_item *fi, + const struct btrfs_file_extent_item *fi, struct extent_map *em); int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, u64 len); diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 316756ff08acb2..29572dfaf8785c 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -14,7 +14,7 @@ #include "extent-tree.h" #include "file-item.h" -struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, +struct btrfs_inode_ref *btrfs_find_name_in_backref(const struct extent_buffer *leaf, int slot, const struct fscrypt_str *name) { @@ -42,7 +42,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, } struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( - struct extent_buffer *leaf, int slot, u64 ref_objectid, + const struct extent_buffer *leaf, int slot, u64 ref_objectid, const struct fscrypt_str *name) { struct btrfs_inode_extref *extref; @@ -423,9 +423,9 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root return ret; } -static inline void btrfs_trace_truncate(struct btrfs_inode *inode, - struct extent_buffer *leaf, - struct btrfs_file_extent_item *fi, +static inline void btrfs_trace_truncate(const struct btrfs_inode *inode, + const struct extent_buffer *leaf, + const struct btrfs_file_extent_item *fi, u64 offset, int extent_type, int slot) { if (!inode) diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h index c4aded82709b1a..c11b97fdccc472 100644 --- a/fs/btrfs/inode-item.h +++ b/fs/btrfs/inode-item.h @@ -109,11 +109,11 @@ struct btrfs_inode_extref *btrfs_lookup_inode_extref( u64 inode_objectid, u64 ref_objectid, int ins_len, int cow); -struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, +struct btrfs_inode_ref *btrfs_find_name_in_backref(const struct extent_buffer *leaf, int slot, const struct fscrypt_str *name); struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( - struct extent_buffer *leaf, int slot, u64 ref_objectid, + const struct extent_buffer *leaf, int slot, u64 ref_objectid, const struct fscrypt_str *name); #endif diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index c691784b4660cd..d5a9cd8a4fd8d7 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -163,7 +163,7 @@ * thing with or without extra unallocated space. */ -u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, +u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info, bool may_use_included) { ASSERT(s_info); @@ -368,7 +368,7 @@ static u64 calc_effective_data_chunk_size(struct btrfs_fs_info *fs_info) } static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, + const struct btrfs_space_info *space_info, enum btrfs_reserve_flush_enum flush) { u64 profile; @@ -437,7 +437,7 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, } int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 bytes, + const struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { u64 avail; @@ -542,8 +542,8 @@ static void dump_global_block_rsv(struct btrfs_fs_info *fs_info) DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); } -static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *info) +static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *info) { const char *flag_str = space_info_flag_to_str(info); lockdep_assert_held(&info->lock); @@ -844,9 +844,8 @@ static void flush_space(struct btrfs_fs_info *fs_info, return; } -static inline u64 -btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) +static u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *space_info) { u64 used; u64 avail; @@ -871,7 +870,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, } static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) + const struct btrfs_space_info *space_info) { const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv); u64 ordered, delalloc; @@ -1943,7 +1942,7 @@ static u64 calc_unalloc_target(struct btrfs_fs_info *fs_info) * Typically with 10 block groups as the target, the discrete values this comes * out to are 0, 10, 20, ... , 80, 90, and 99. */ -static int calc_dynamic_reclaim_threshold(struct btrfs_space_info *space_info) +static int calc_dynamic_reclaim_threshold(const struct btrfs_space_info *space_info) { struct btrfs_fs_info *fs_info = space_info->fs_info; u64 unalloc = atomic64_read(&fs_info->free_chunk_space); @@ -1962,7 +1961,7 @@ static int calc_dynamic_reclaim_threshold(struct btrfs_space_info *space_info) return calc_pct_ratio(want, target); } -int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info) +int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info) { lockdep_assert_held(&space_info->lock); @@ -1985,7 +1984,7 @@ static bool is_reclaim_urgent(struct btrfs_space_info *space_info) return unalloc < data_chunk_size; } -static void do_reclaim_sweep(struct btrfs_fs_info *fs_info, +static void do_reclaim_sweep(const struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, int raid) { struct btrfs_block_group *bg; @@ -2073,7 +2072,7 @@ bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info) return ret; } -void btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info) +void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info) { int raid; struct btrfs_space_info *space_info; diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 5602026c5e1485..efbecc0c5258d2 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -217,7 +217,7 @@ struct reserve_ticket { wait_queue_head_t wait; }; -static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) +static inline bool btrfs_mixed_space_info(const struct btrfs_space_info *space_info) { return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); @@ -258,7 +258,7 @@ void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, u64 chunk_size); struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, u64 flags); -u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, +u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info, bool may_use_included); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, @@ -271,7 +271,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info); int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 bytes, + const struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush); static inline void btrfs_space_info_free_bytes_may_use( @@ -293,7 +293,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes); void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready); bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info); -int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info); -void btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info); +int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info); +void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info); #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index fa45b5fb968390..b382a4c443d427 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -170,7 +170,7 @@ static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info, * this until all tree mod log insertions are recorded in the rb tree and then * write unlock fs_info::tree_mod_log_lock. */ -static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) +static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, const struct extent_buffer *eb) { if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) return true; @@ -188,7 +188,7 @@ static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, struct extent_buffe /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */ static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, - struct extent_buffer *eb) + const struct extent_buffer *eb) { if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) return false; @@ -198,7 +198,7 @@ static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, return true; } -static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb, +static struct tree_mod_elem *alloc_tree_mod_elem(const struct extent_buffer *eb, int slot, enum btrfs_mod_log_op op) { @@ -221,7 +221,7 @@ static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb, return tm; } -int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, +int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot, enum btrfs_mod_log_op op) { struct tree_mod_elem *tm; @@ -258,7 +258,7 @@ int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, return ret; } -static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb, +static struct tree_mod_elem *tree_mod_log_alloc_move(const struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items) { @@ -278,7 +278,7 @@ static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb, return tm; } -int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, +int btrfs_tree_mod_log_insert_move(const struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items) { @@ -535,7 +535,7 @@ static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info, } int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, - struct extent_buffer *src, + const struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, int nr_items) diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h index ff00c8e8a393cb..6308c577a4a4ab 100644 --- a/fs/btrfs/tree-mod-log.h +++ b/fs/btrfs/tree-mod-log.h @@ -37,7 +37,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, struct extent_buffer *new_root, bool log_removal); -int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, +int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot, enum btrfs_mod_log_op op); int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb); struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, @@ -47,11 +47,11 @@ struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq); int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, - struct extent_buffer *src, + const struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, int nr_items); -int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, +int btrfs_tree_mod_log_insert_move(const struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items); u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 047e3337852e1a..71e184120a9b09 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2459,7 +2459,7 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) mutex_unlock(&fs_devices->device_list_mutex); } -bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) +bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 30b2e48a1cec3a..7612e657260530 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -89,7 +89,7 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, struct extent_buffer *eb); void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); -bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info); +bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info); void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, u64 length); int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info); @@ -242,7 +242,7 @@ static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { } static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { } -static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) +static inline bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info) { return false; } diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index e4add61e00f14d..bf60ad50011e77 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1825,7 +1825,7 @@ TRACE_EVENT(qgroup_update_counters, TRACE_EVENT(qgroup_update_reserve, - TP_PROTO(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup, + TP_PROTO(const struct btrfs_fs_info *fs_info, const struct btrfs_qgroup *qgroup, s64 diff, int type), TP_ARGS(fs_info, qgroup, diff, type), @@ -1851,7 +1851,7 @@ TRACE_EVENT(qgroup_update_reserve, TRACE_EVENT(qgroup_meta_reserve, - TP_PROTO(struct btrfs_root *root, s64 diff, int type), + TP_PROTO(const struct btrfs_root *root, s64 diff, int type), TP_ARGS(root, diff, type), @@ -1874,7 +1874,7 @@ TRACE_EVENT(qgroup_meta_reserve, TRACE_EVENT(qgroup_meta_convert, - TP_PROTO(struct btrfs_root *root, s64 diff), + TP_PROTO(const struct btrfs_root *root, s64 diff), TP_ARGS(root, diff), From d8f7e7de82b656364b90439d5f311d430fff9e10 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 30 Aug 2024 16:48:20 +0930 Subject: [PATCH 975/991] btrfs: remove btrfs_folio_end_all_writers() The function btrfs_folio_end_all_writers() is only utilized in extent_writepage() as a way to unlock all subpage range (for both successful submission and error handling). Meanwhile we have a similar function, btrfs_folio_end_writer_lock(). The difference is, btrfs_folio_end_writer_lock() expects a range that is a subset of the already locked range. This limit on btrfs_folio_end_writer_lock() is a little overkilled, preventing it from being utilized for error paths. So here we enhance btrfs_folio_end_writer_lock() to accept a superset of the locked range, and only end the locked subset. This means we can replace btrfs_folio_end_all_writers() with btrfs_folio_end_writer_lock() instead. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 3 ++- fs/btrfs/subpage.c | 57 +++++++------------------------------------- fs/btrfs/subpage.h | 1 - 3 files changed, 10 insertions(+), 51 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c07930986fe586..485d88f9947bd9 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1466,7 +1466,8 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl mapping_set_error(folio->mapping, ret); } - btrfs_folio_end_all_writers(inode_to_fs_info(inode), folio); + btrfs_folio_end_writer_lock(inode_to_fs_info(inode), folio, + page_start, PAGE_SIZE); ASSERT(ret <= 0); return ret; } diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index ca7d2aedfa8d9d..7fe58c4d992324 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -322,6 +322,8 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); const int nbits = (len >> fs_info->sectorsize_bits); unsigned long flags; + unsigned int cleared = 0; + int bit = start_bit; bool last; btrfs_subpage_assert(fs_info, folio, start, len); @@ -339,11 +341,12 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf return true; } - ASSERT(atomic_read(&subpage->writers) >= nbits); - /* The target range should have been locked. */ - ASSERT(bitmap_test_range_all_set(subpage->bitmaps, start_bit, nbits)); - bitmap_clear(subpage->bitmaps, start_bit, nbits); - last = atomic_sub_and_test(nbits, &subpage->writers); + for_each_set_bit_from(bit, subpage->bitmaps, start_bit + nbits) { + clear_bit(bit, subpage->bitmaps); + cleared++; + } + ASSERT(atomic_read(&subpage->writers) >= cleared); + last = atomic_sub_and_test(cleared, &subpage->writers); spin_unlock_irqrestore(&subpage->lock, flags); return last; } @@ -825,50 +828,6 @@ bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info, return found; } -/* - * Unlike btrfs_folio_end_writer_lock() which unlocks a specified subpage range, - * this ends all writer locked ranges of a page. - * - * This is for the locked page of extent_writepage(), as the locked page - * can contain several locked subpage ranges. - */ -void btrfs_folio_end_all_writers(const struct btrfs_fs_info *fs_info, struct folio *folio) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - u64 folio_start = folio_pos(folio); - u64 cur = folio_start; - - ASSERT(folio_test_locked(folio)); - if (!btrfs_is_subpage(fs_info, folio->mapping)) { - folio_unlock(folio); - return; - } - - /* The page has no new delalloc range locked on it. Just plain unlock. */ - if (atomic_read(&subpage->writers) == 0) { - folio_unlock(folio); - return; - } - while (cur < folio_start + PAGE_SIZE) { - u64 found_start; - u32 found_len; - bool found; - bool last; - - found = btrfs_subpage_find_writer_locked(fs_info, folio, cur, - &found_start, &found_len); - if (!found) - break; - last = btrfs_subpage_end_and_test_writer(fs_info, folio, - found_start, found_len); - if (last) { - folio_unlock(folio); - break; - } - cur = found_start + found_len; - } -} - #define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \ { \ const int sectors_per_page = fs_info->sectors_per_page; \ diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index b67cd5f6539d80..f90e0c4f4cabd2 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -109,7 +109,6 @@ void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 search_start, u64 *found_start_ret, u32 *found_len_ret); -void btrfs_folio_end_all_writers(const struct btrfs_fs_info *fs_info, struct folio *folio); /* * Template for subpage related operations. From 9b72abb43b9c2c4fadee1be2932f4979ac5fbb7d Mon Sep 17 00:00:00 2001 From: Leo Martins Date: Tue, 3 Sep 2024 11:19:05 -0700 Subject: [PATCH 976/991] btrfs: DEFINE_FREE for struct btrfs_path Add a DEFINE_FREE for struct btrfs_path. This defines a function that can be called using the __free attribute. Define a macro BTRFS_PATH_AUTO_FREE to make the declaration of an auto freeing path very clear. The intended use is to define the auto free of path in cases where the path is allocated somewhere at the beginning and freed either on all error paths or at the end of the function. int func() { BTRFS_PATH_AUTO_FREE(path); if (...) return -ERROR; path = alloc_path(); ... if (...) return -ERROR; ... return 0; } Signed-off-by: Leo Martins [ update changelog ] Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index fd73c284822a88..1a44fb9845e3f0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -6,6 +6,7 @@ #ifndef BTRFS_CTREE_H #define BTRFS_CTREE_H +#include "linux/cleanup.h" #include #include #include @@ -84,6 +85,9 @@ struct btrfs_path { unsigned int nowait:1; }; +#define BTRFS_PATH_AUTO_FREE(path_name) \ + struct btrfs_path *path_name __free(btrfs_free_path) = NULL + /* * The state of btrfs root */ @@ -598,6 +602,7 @@ int btrfs_search_slot_for_read(struct btrfs_root *root, void btrfs_release_path(struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); +DEFINE_FREE(btrfs_free_path, struct btrfs_path *, btrfs_free_path(_T)) int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int slot, int nr); From a1d170103c8ede41e600ca240bbd33f2586f63b4 Mon Sep 17 00:00:00 2001 From: Leo Martins Date: Tue, 3 Sep 2024 11:19:06 -0700 Subject: [PATCH 977/991] btrfs: use btrfs_path auto free in zoned.c All cleanup paths lead to btrfs_path_free so path can be defined with the automatic freeing callback in the following functions: - calculate_emulated_zone_size() - calculate_alloc_pointer() Signed-off-by: Leo Martins Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 34 +++++++++++----------------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 71e184120a9b09..7fa2920632ba68 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -287,7 +287,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, /* The emulated zone size is determined from the size of device extent */ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = fs_info->dev_root; struct btrfs_key key; struct extent_buffer *leaf; @@ -304,28 +304,21 @@ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info) ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); if (ret < 0) - goto out; + return ret; /* No dev extents at all? Not good */ - if (ret > 0) { - ret = -EUCLEAN; - goto out; - } + if (ret > 0) + return -EUCLEAN; } leaf = path->nodes[0]; dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); fs_info->zone_size = btrfs_dev_extent_length(leaf, dext); - ret = 0; - -out: - btrfs_free_path(path); - - return ret; + return 0; } int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) @@ -1211,7 +1204,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_root *root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; int ret; @@ -1246,7 +1239,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, if (!ret) ret = -EUCLEAN; if (ret < 0) - goto out; + return ret; ret = btrfs_previous_extent_item(root, path, cache->start); if (ret) { @@ -1254,7 +1247,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, ret = 0; *offset_ret = 0; } - goto out; + return ret; } btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); @@ -1266,15 +1259,10 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, if (!(found_key.objectid >= cache->start && found_key.objectid + length <= cache->start + cache->length)) { - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } *offset_ret = found_key.objectid + length - cache->start; - ret = 0; - -out: - btrfs_free_path(path); - return ret; + return 0; } struct zone_info { From b7b74b3e74aeabdbbdc52ed046779c176c84b9b8 Mon Sep 17 00:00:00 2001 From: Leo Martins Date: Tue, 3 Sep 2024 11:19:07 -0700 Subject: [PATCH 978/991] btrfs: BTRFS_PATH_AUTO_FREE in orphan.c All cleanup paths lead to btrfs_path_free so path can be defined with the automatic freeing callback in the following functions: - btrfs_insert_orphan_item() - btrfs_del_orphan_item() Signed-off-by: Leo Martins Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/orphan.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index 6195a2215b8fee..9f3ad124104f1f 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -9,9 +9,8 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; - int ret = 0; key.objectid = BTRFS_ORPHAN_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; @@ -21,16 +20,13 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - ret = btrfs_insert_empty_item(trans, root, path, &key, 0); - - btrfs_free_path(path); - return ret; + return btrfs_insert_empty_item(trans, root, path, &key, 0); } int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int ret = 0; @@ -44,15 +40,9 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) - goto out; - if (ret) { /* JDM: Really? */ - ret = -ENOENT; - goto out; - } - - ret = btrfs_del_item(trans, root, path); + return ret; + if (ret) + return -ENOENT; -out: - btrfs_free_path(path); - return ret; + return btrfs_del_item(trans, root, path); } From ee81808197bf38e36110a13a95b016c56d4073a5 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 2 Sep 2024 13:57:08 +0930 Subject: [PATCH 979/991] btrfs: merge btrfs_folio_unlock_writer() into btrfs_folio_end_writer_lock() The function btrfs_folio_unlock_writer() is already calling btrfs_folio_end_writer_lock() to do the heavy lifting work, the only missing 0 writer check. Thus there is no need to keep two different functions, move the 0 writer check into btrfs_folio_end_writer_lock(), and remove btrfs_folio_unlock_writer(). Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- fs/btrfs/subpage.c | 81 +++++++++++++++++++------------------------- fs/btrfs/subpage.h | 2 -- 3 files changed, 35 insertions(+), 50 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 485d88f9947bd9..70be1150c34e31 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2220,7 +2220,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f cur, cur_len, !ret); mapping_set_error(mapping, ret); } - btrfs_folio_unlock_writer(fs_info, folio, cur, cur_len); + btrfs_folio_end_writer_lock(fs_info, folio, cur, cur_len); if (ret < 0) found_error = true; next_page: diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 7fe58c4d992324..83660fa82c3236 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -378,13 +378,47 @@ int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, return 0; } +/* + * Handle different locked folios: + * + * - Non-subpage folio + * Just unlock it. + * + * - folio locked but without any subpage locked + * This happens either before writepage_delalloc() or the delalloc range is + * already handled by previous folio. + * We can simple unlock it. + * + * - folio locked with subpage range locked. + * We go through the locked sectors inside the range and clear their locked + * bitmap, reduce the writer lock number, and unlock the page if that's + * the last locked range. + */ void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { + struct btrfs_subpage *subpage = folio_get_private(folio); + + ASSERT(folio_test_locked(folio)); + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { folio_unlock(folio); return; } + + /* + * For subpage case, there are two types of locked page. With or + * without writers number. + * + * Since we own the page lock, no one else could touch subpage::writers + * and we are safe to do several atomic operations without spinlock. + */ + if (atomic_read(&subpage->writers) == 0) { + /* No writers, locked by plain lock_page(). */ + folio_unlock(folio); + return; + } + btrfs_subpage_clamp_range(folio, &start, &len); if (btrfs_subpage_end_and_test_writer(fs_info, folio, start, len)) folio_unlock(folio); @@ -702,53 +736,6 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, spin_unlock_irqrestore(&subpage->lock, flags); } -/* - * Handle different locked pages with different page sizes: - * - * - Page locked by plain lock_page() - * It should not have any subpage::writers count. - * Can be unlocked by unlock_page(). - * This is the most common locked page for extent_writepage() called - * inside extent_write_cache_pages(). - * Rarer cases include the @locked_page from extent_write_locked_range(). - * - * - Page locked by lock_delalloc_pages() - * There is only one caller, all pages except @locked_page for - * extent_write_locked_range(). - * In this case, we have to call subpage helper to handle the case. - */ -void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - struct btrfs_subpage *subpage; - - ASSERT(folio_test_locked(folio)); - /* For non-subpage case, we just unlock the page */ - if (!btrfs_is_subpage(fs_info, folio->mapping)) { - folio_unlock(folio); - return; - } - - ASSERT(folio_test_private(folio) && folio_get_private(folio)); - subpage = folio_get_private(folio); - - /* - * For subpage case, there are two types of locked page. With or - * without writers number. - * - * Since we own the page lock, no one else could touch subpage::writers - * and we are safe to do several atomic operations without spinlock. - */ - if (atomic_read(&subpage->writers) == 0) { - /* No writers, locked by plain lock_page() */ - folio_unlock(folio); - return; - } - - /* Have writers, use proper subpage helper to end it */ - btrfs_folio_end_writer_lock(fs_info, folio, start, len); -} - /* * This is for folio already locked by plain lock_page()/folio_lock(), which * doesn't have any subpage awareness. diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index f90e0c4f4cabd2..f805261e0999c3 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -155,8 +155,6 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); -void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, struct folio *folio, unsigned long *ret_bitmap); From 966416b380926c94751476747ea68571004eac24 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 2 Sep 2024 14:29:06 +0930 Subject: [PATCH 980/991] btrfs: only unlock the to-be-submitted ranges inside a folio [SUBPAGE COMPRESSION LIMITS] Currently inside writepage_delalloc(), if a delalloc range is going to be submitted asynchronously (inline or compression, the page dirty/writeback/unlock are all handled in at different time, not at the submission time), then we return 1 and extent_writepage() will skip the submission. This is fine if every sector matches page size, but if a sector is smaller than page size (aka, subpage case), then it can be very problematic, for example for the following 64K page: 0 16K 32K 48K 64K |/| |///////| |/| | | 4K 52K Where |/| is the dirty range we need to submit. In the above case, we need the following different handling for the 3 ranges: - [0, 4K) needs to be submitted for regular write A single sector cannot be compressed. - [16K, 32K) needs to be submitted for compressed write - [48K, 52K) needs to be submitted for regular write. Above, if we try to submit [16K, 32K) for compressed write, we will return 1 and immediately, and without submitting the remaining [48K, 52K) range. Furthermore, since extent_writepage() will exit without unlocking any sectors, the submitted range [0, 4K) will not have sector unlocked. That's the reason why for now subpage is only allowed for full page range. [ENHANCEMENT] - Introduce a submission bitmap at btrfs_bio_ctrl::submit_bitmap This records which sectors will be submitted by extent_writepage_io(). This allows us to track which sectors needs to be submitted thus later to be properly unlocked. For asynchronously submitted range (inline/compression), the corresponding bits will be cleared from that bitmap. - Only return 1 if no sector needs to be submitted in writepage_delalloc() - Only submit sectors marked by submission bitmap inside extent_writepage_io() So we won't touch the asynchronously submitted part. - Introduce btrfs_folio_end_writer_lock_bitmap() helper This will only unlock the involved sectors specified by @bitmap parameter, to avoid touching the range asynchronously submitted. Please note that, since subpage compression is still limited to page aligned range, this change is only a preparation for future sector perfect compression support for subpage. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 89 +++++++++++++++++++++++++------------------- fs/btrfs/subpage.c | 33 ++++++++++++++++ fs/btrfs/subpage.h | 2 + 3 files changed, 86 insertions(+), 38 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 70be1150c34e31..39c9677c47d5a8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -101,6 +101,13 @@ struct btrfs_bio_ctrl { blk_opf_t opf; btrfs_bio_end_io_t end_io_func; struct writeback_control *wbc; + + /* + * The sectors of the page which are going to be submitted by + * extent_writepage_io(). + * This is to avoid touching ranges covered by compression/inline. + */ + unsigned long submit_bitmap; }; static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) @@ -1106,9 +1113,10 @@ int btrfs_read_folio(struct file *file, struct folio *folio) */ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, struct folio *folio, - struct writeback_control *wbc) + struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode); + struct writeback_control *wbc = bio_ctrl->wbc; const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping); const u64 page_start = folio_pos(folio); const u64 page_end = page_start + folio_size(folio) - 1; @@ -1123,6 +1131,14 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, u64 delalloc_to_write = 0; int ret = 0; + /* Save the dirty bitmap as our submission bitmap will be a subset of it. */ + if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) { + ASSERT(fs_info->sectors_per_page > 1); + btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap); + } else { + bio_ctrl->submit_bitmap = 1; + } + /* Lock all (subpage) delalloc ranges inside the folio first. */ while (delalloc_start < page_end) { delalloc_end = page_end; @@ -1190,22 +1206,18 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, } /* - * We can hit btrfs_run_delalloc_range() with >0 return value. - * - * This happens when either the IO is already done and folio - * unlocked (inline) or the IO submission and folio unlock would - * be handled as async (compression). - * - * Inline is only possible for regular sectorsize for now. - * - * Compression is possible for both subpage and regular cases, - * but even for subpage compression only happens for page aligned - * range, thus the found delalloc range must go beyond current - * folio. + * We have some ranges that's going to be submitted asynchronously + * (compression or inline). These range have their own control + * on when to unlock the pages. We should not touch them + * anymore, so clear the range from the submission bitmap. */ - if (ret > 0) - ASSERT(!is_subpage || found_start + found_len >= page_end); - + if (ret > 0) { + unsigned int start_bit = (found_start - page_start) >> + fs_info->sectorsize_bits; + unsigned int end_bit = (min(page_end + 1, found_start + found_len) - + page_start) >> fs_info->sectorsize_bits; + bitmap_clear(&bio_ctrl->submit_bitmap, start_bit, end_bit - start_bit); + } /* * Above btrfs_run_delalloc_range() may have unlocked the folio, * thus for the last range, we cannot touch the folio anymore. @@ -1230,10 +1242,10 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE); /* - * If btrfs_run_dealloc_range() already started I/O and unlocked - * the folios, we just need to account for them here. + * If all ranges are submitted asynchronously, we just need to account + * for them here. */ - if (ret == 1) { + if (bitmap_empty(&bio_ctrl->submit_bitmap, fs_info->sectors_per_page)) { wbc->nr_to_write -= delalloc_to_write; return 1; } @@ -1331,15 +1343,6 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, { struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long range_bitmap = 0; - /* - * This is the default value for sectorsize == PAGE_SIZE case. - * We known we need to write the dirty sector (aka the page), - * even if the page is not dirty (we cleared it before entering). - * - * For subpage cases we will get the correct bitmap later. - */ - unsigned long dirty_bitmap = 1; - unsigned int bitmap_size = 1; bool submitted_io = false; const u64 folio_start = folio_pos(folio); u64 cur; @@ -1357,18 +1360,14 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, return 1; } - if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) { - ASSERT(fs_info->sectors_per_page > 1); - btrfs_get_subpage_dirty_bitmap(fs_info, folio, &dirty_bitmap); - bitmap_size = fs_info->sectors_per_page; - } for (cur = start; cur < start + len; cur += fs_info->sectorsize) set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap); - bitmap_and(&dirty_bitmap, &dirty_bitmap, &range_bitmap, bitmap_size); + bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap, + fs_info->sectors_per_page); bio_ctrl->end_io_func = end_bbio_data_write; - for_each_set_bit(bit, &dirty_bitmap, bitmap_size) { + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits); if (cur >= i_size) { @@ -1421,6 +1420,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl) { struct inode *inode = folio->mapping->host; + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); const u64 page_start = folio_pos(folio); int ret; size_t pg_offset; @@ -1442,11 +1442,16 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl if (folio->index == end_index) folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset); + /* + * Default to unlock the whole folio. + * The proper bitmap can only be initialized until writepage_delalloc(). + */ + bio_ctrl->submit_bitmap = (unsigned long)-1; ret = set_folio_extent_mapped(folio); if (ret < 0) goto done; - ret = writepage_delalloc(BTRFS_I(inode), folio, bio_ctrl->wbc); + ret = writepage_delalloc(BTRFS_I(inode), folio, bio_ctrl); if (ret == 1) return 0; if (ret) @@ -1466,8 +1471,11 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl mapping_set_error(folio->mapping, ret); } - btrfs_folio_end_writer_lock(inode_to_fs_info(inode), folio, - page_start, PAGE_SIZE); + /* + * Only unlock ranges that are submitted. As there can be some async + * submitted ranges inside the folio. + */ + btrfs_folio_end_writer_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap); ASSERT(ret <= 0); return ret; } @@ -2210,6 +2218,11 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f if (pages_dirty && folio != locked_folio) ASSERT(folio_test_dirty(folio)); + /* + * Set the submission bitmap to submit all sectors. + * extent_writepage_io() will do the truncation correctly. + */ + bio_ctrl.submit_bitmap = (unsigned long)-1; ret = extent_writepage_io(BTRFS_I(inode), folio, cur, cur_len, &bio_ctrl, i_size); if (ret == 1) diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 83660fa82c3236..fe4d719d506bf5 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -424,6 +424,39 @@ void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, folio_unlock(folio); } +void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, + struct folio *folio, unsigned long bitmap) +{ + struct btrfs_subpage *subpage = folio_get_private(folio); + const int start_bit = fs_info->sectors_per_page * btrfs_bitmap_nr_locked; + unsigned long flags; + bool last = false; + int cleared = 0; + int bit; + + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { + folio_unlock(folio); + return; + } + + if (atomic_read(&subpage->writers) == 0) { + /* No writers, locked by plain lock_page(). */ + folio_unlock(folio); + return; + } + + spin_lock_irqsave(&subpage->lock, flags); + for_each_set_bit(bit, &bitmap, fs_info->sectors_per_page) { + if (test_and_clear_bit(bit + start_bit, subpage->bitmaps)) + cleared++; + } + ASSERT(atomic_read(&subpage->writers) >= cleared); + last = atomic_sub_and_test(cleared, &subpage->writers); + spin_unlock_irqrestore(&subpage->lock, flags); + if (last) + folio_unlock(folio); +} + #define subpage_test_bitmap_all_set(fs_info, subpage, name) \ bitmap_test_range_all_set(subpage->bitmaps, \ fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \ diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index f805261e0999c3..4b85d91d0e18b0 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -106,6 +106,8 @@ void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); +void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, + struct folio *folio, unsigned long bitmap); bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 search_start, u64 *found_start_ret, u32 *found_len_ret); From d2594ad7ad7b76eef415eeb62b1fa92ad2d14f4b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 21 Feb 2024 15:50:10 +0100 Subject: [PATCH 981/991] btrfs: === misc-next on b-for-next === Any commits after this one are for testing and evaluation only. Signed-off-by: David Sterba --- fs/btrfs/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 4fb925e8c981d8..7fe030d91e4f0b 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 +# misc-next marker config BTRFS_FS tristate "Btrfs filesystem support" From 6c43e74320100b18562487b9b4a0b29e3f5c41a5 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 7 Feb 2024 02:34:23 +0100 Subject: [PATCH 982/991] btrfs: handle unexpected parent block offset in btrfs_alloc_tree_block() Change a BUG_ON to a proper error handling, here it checks that a root other than reloc tree does not see a non-zero offset. This is set by btrfs_force_cow_block() and is a special case so the check makes sure it's not accidentally set by other callers. Reviewed-by: Boris Burkov Reviewed-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 12 ++++++++++-- fs/btrfs/zoned.c | 9 +++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a5966324607d49..3ec9939f20d33c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5149,8 +5149,16 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, parent = ins.objectid; flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; owning_root = reloc_src_root; - } else - BUG_ON(parent > 0); + } else { + if (unlikely(parent > 0)) { + /* + * Other roots than reloc tree don't expect start + * offset of a parent block. + */ + ret = -EUCLEAN; + goto out_free_reserved; + } + } if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { struct btrfs_delayed_extent_op *extent_op; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 7fa2920632ba68..41ce252bb8fe52 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1668,6 +1668,15 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) return -EINVAL; } + /* Reject non SINGLE data profiles without RST. */ + if ((map->type & BTRFS_BLOCK_GROUP_DATA) && + (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) && + !fs_info->stripe_root) { + btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", + btrfs_bg_type_to_raid_name(map->type)); + return -EINVAL; + } + if (cache->alloc_offset > cache->zone_capacity) { btrfs_err(fs_info, "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu", From 36c9c8067baf0de8e2be0c5f4e79a9e53495571c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 20 Mar 2024 14:24:51 +1030 Subject: [PATCH 983/991] btrfs: scrub: fix incorrectly reported logical/physical address [BUG] Scrub is not reporting the correct logical/physical address, it can be verified by the following script: # mkfs.btrfs -f $dev1 # mount $dev1 $mnt # xfs_io -f -c "pwrite -S 0xaa 0 128k" $mnt/file1 # umount $mnt # xfs_io -f -c "pwrite -S 0xff 13647872 4k" $dev1 # mount $dev1 $mnt # btrfs scrub start -fB $mnt # umount $mnt Note above 13647872 is the physical address for logical 13631488 + 4K. Scrub would report the following error: BTRFS error (device dm-2): unable to fixup (regular) error at logical 13631488 on dev /dev/mapper/test-scratch1 physical 13631488 BTRFS warning (device dm-2): checksum error at logical 13631488 on dev /dev/mapper/test-scratch1, physical 13631488, root 5, inode 257, offset 0, length 4096, links 1 (path: file1) On the other hand, "btrfs check --check-data-csum" is reporting the correct logical/physical address: Checking filesystem on /dev/test/scratch1 UUID: db2eb621-b09d-4f24-8199-da17dc7b3201 [5/7] checking csums against data mirror 1 bytenr 13647872 csum 0x13fec125 expected csum 0x656bd64e ERROR: errors found in csum tree [CAUSE] In the function scrub_stripe_report_errors(), we always use the stripe->logical and its physical address to print the error message, not taking the sector number into consideration at all. [FIX] Fix the error reporting function by calculating logical/physical with the sector number. Now the scrub report is correct: BTRFS error (device dm-2): unable to fixup (regular) error at logical 13647872 on dev /dev/mapper/test-scratch1 physical 13647872 BTRFS warning (device dm-2): checksum error at logical 13647872 on dev /dev/mapper/test-scratch1, physical 13647872, root 5, inode 257, offset 16384, length 4096, links 1 (path: file1) Fixes: 0096580713ff ("btrfs: scrub: introduce error reporting functionality for scrub_stripe") CC: stable@vger.kernel.org #6.4+ Reviewed-by: Anand Jain Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 3a34274280746c..f1e0dc0e04c61d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -870,7 +870,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, DEFAULT_RATELIMIT_BURST); struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_device *dev = NULL; - u64 physical = 0; + u64 stripe_physical = stripe->physical; int nr_data_sectors = 0; int nr_meta_sectors = 0; int nr_nodatacsum_sectors = 0; @@ -903,13 +903,17 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, */ if (ret < 0) goto skip; - physical = bioc->stripes[stripe_index].physical; + stripe_physical = bioc->stripes[stripe_index].physical; dev = bioc->stripes[stripe_index].dev; btrfs_put_bioc(bioc); } skip: for_each_set_bit(sector_nr, &stripe->extent_sector_bitmap, stripe->nr_sectors) { + const u64 logical = stripe->logical + + (sector_nr << fs_info->sectorsize_bits); + const u64 physical = stripe_physical + + (sector_nr << fs_info->sectorsize_bits); bool repaired = false; if (stripe->sectors[sector_nr].is_metadata) { @@ -938,12 +942,12 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, if (dev) { btrfs_err_rl_in_rcu(fs_info, "fixed up error at logical %llu on dev %s physical %llu", - stripe->logical, btrfs_dev_name(dev), + logical, btrfs_dev_name(dev), physical); } else { btrfs_err_rl_in_rcu(fs_info, "fixed up error at logical %llu on mirror %u", - stripe->logical, stripe->mirror_num); + logical, stripe->mirror_num); } continue; } @@ -952,26 +956,26 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, if (dev) { btrfs_err_rl_in_rcu(fs_info, "unable to fixup (regular) error at logical %llu on dev %s physical %llu", - stripe->logical, btrfs_dev_name(dev), + logical, btrfs_dev_name(dev), physical); } else { btrfs_err_rl_in_rcu(fs_info, "unable to fixup (regular) error at logical %llu on mirror %u", - stripe->logical, stripe->mirror_num); + logical, stripe->mirror_num); } if (test_bit(sector_nr, &stripe->io_error_bitmap)) if (__ratelimit(&rs) && dev) scrub_print_common_warning("i/o error", dev, false, - stripe->logical, physical); + logical, physical); if (test_bit(sector_nr, &stripe->csum_error_bitmap)) if (__ratelimit(&rs) && dev) scrub_print_common_warning("checksum error", dev, false, - stripe->logical, physical); + logical, physical); if (test_bit(sector_nr, &stripe->meta_error_bitmap)) if (__ratelimit(&rs) && dev) scrub_print_common_warning("header error", dev, false, - stripe->logical, physical); + logical, physical); } spin_lock(&sctx->stat_lock); From 5f8fae64054b6d22dea45ee3ca081b6e9d095bdb Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 20 Mar 2024 14:24:52 +1030 Subject: [PATCH 984/991] btrfs: reduce the log level for btrfs_dev_stat_inc_and_print() Currently when we increase the device statistics, it would always lead to an error message in the kernel log. However this output is mostly duplicated with the existing ones: - For scrub operations We always have the following messages: * "fixed up error at logical %llu" * "unable to fixup (regular) error at logical %llu" So no matter if the corruption is repaired or not, it scrub would output an error message to indicate the problem. - For non-scrub read operations We also have the following messages: * "csum failed root %lld inode %llu off %llu" for data csum mismatch * "bad (tree block start|fsid|tree block level)" for metadata * "read error corrected: ino %llu off %llu" for repaired data/metadata So the error message from btrfs_dev_stat_inc_and_print() is duplicated. The real usage for the btrfs device statistics is for some user space daemon to check if there is any new errors, acting like some checks on SMART, thus we don't really need/want those messages in dmesg. This patch would reduce the log level to debug (disabled by default) for btrfs_dev_stat_inc_and_print(). For users really want to utilize btrfs devices statistics, they should go check "btrfs device stats" periodically, and we should focus the kernel error messages to more important things. CC: stable@vger.kernel.org Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8f340ad1d93845..4a259bdaa21c92 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7696,7 +7696,7 @@ void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) if (!dev->dev_stats_valid) return; - btrfs_err_rl_in_rcu(dev->fs_info, + btrfs_debug_rl_in_rcu(dev->fs_info, "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", btrfs_dev_name(dev), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), From f7248d7939220354227d99de7bdb99dddfc4eb0b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 20 Mar 2024 14:24:53 +1030 Subject: [PATCH 985/991] btrfs: scrub: remove unused is_super parameter from scrub_print_common_warning() Since commit 2a2dc22f7e9d ("btrfs: scrub: use dedicated super block verification function to scrub one super block"), the super block scrubbing is handled in a dedicated helper, thus scrub_print_common_warning() no longer needs to print warning for super block errors. Just remove the parameter. Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index f1e0dc0e04c61d..ea3b93916d4527 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -476,7 +476,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, } static void scrub_print_common_warning(const char *errstr, struct btrfs_device *dev, - bool is_super, u64 logical, u64 physical) + u64 logical, u64 physical) { struct btrfs_fs_info *fs_info = dev->fs_info; struct btrfs_path *path; @@ -488,12 +488,6 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * u32 item_size; int ret; - /* Super block error, no need to search extent tree. */ - if (is_super) { - btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu", - errstr, btrfs_dev_name(dev), physical); - return; - } path = btrfs_alloc_path(); if (!path) return; @@ -966,15 +960,15 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, if (test_bit(sector_nr, &stripe->io_error_bitmap)) if (__ratelimit(&rs) && dev) - scrub_print_common_warning("i/o error", dev, false, + scrub_print_common_warning("i/o error", dev, logical, physical); if (test_bit(sector_nr, &stripe->csum_error_bitmap)) if (__ratelimit(&rs) && dev) - scrub_print_common_warning("checksum error", dev, false, + scrub_print_common_warning("checksum error", dev, logical, physical); if (test_bit(sector_nr, &stripe->meta_error_bitmap)) if (__ratelimit(&rs) && dev) - scrub_print_common_warning("header error", dev, false, + scrub_print_common_warning("header error", dev, logical, physical); } From 2b00b442e39e4dbb81c56eb58f63f162217f6e0a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 20 Mar 2024 14:24:54 +1030 Subject: [PATCH 986/991] btrfs: scrub: remove unnecessary dev/physical lookup for scrub_stripe_report_errors() The @stripe passed into scrub_stripe_report_errors() either has stripe->dev and stripe->physical properly populated (regular data stripes) or stripe->flags would have SCRUB_STRIPE_FLAG_NO_REPORT (RAID56 P/Q stripes). Thus there is no need to go with btrfs_map_block() to get the dev/physical. Just add an extra ASSERT() to make sure we get stripe->dev populated correctly. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 54 +++++++----------------------------------------- 1 file changed, 7 insertions(+), 47 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ea3b93916d4527..16c13d2434005d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -863,7 +863,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); struct btrfs_fs_info *fs_info = sctx->fs_info; - struct btrfs_device *dev = NULL; + struct btrfs_device *dev = stripe->dev; u64 stripe_physical = stripe->physical; int nr_data_sectors = 0; int nr_meta_sectors = 0; @@ -874,35 +874,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, if (test_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state)) return; - /* - * Init needed infos for error reporting. - * - * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio() - * thus no need for dev/physical, error reporting still needs dev and physical. - */ - if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) { - u64 mapped_len = fs_info->sectorsize; - struct btrfs_io_context *bioc = NULL; - int stripe_index = stripe->mirror_num - 1; - int ret; - - /* For scrub, our mirror_num should always start at 1. */ - ASSERT(stripe->mirror_num >= 1); - ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - stripe->logical, &mapped_len, &bioc, - NULL, NULL); - /* - * If we failed, dev will be NULL, and later detailed reports - * will just be skipped. - */ - if (ret < 0) - goto skip; - stripe_physical = bioc->stripes[stripe_index].physical; - dev = bioc->stripes[stripe_index].dev; - btrfs_put_bioc(bioc); - } - -skip: + ASSERT(dev); for_each_set_bit(sector_nr, &stripe->extent_sector_bitmap, stripe->nr_sectors) { const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits); @@ -933,41 +905,29 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, * output the message of repaired message. */ if (repaired) { - if (dev) { - btrfs_err_rl_in_rcu(fs_info, + btrfs_err_rl_in_rcu(fs_info, "fixed up error at logical %llu on dev %s physical %llu", logical, btrfs_dev_name(dev), physical); - } else { - btrfs_err_rl_in_rcu(fs_info, - "fixed up error at logical %llu on mirror %u", - logical, stripe->mirror_num); - } continue; } /* The remaining are all for unrepaired. */ - if (dev) { - btrfs_err_rl_in_rcu(fs_info, + btrfs_err_rl_in_rcu(fs_info, "unable to fixup (regular) error at logical %llu on dev %s physical %llu", logical, btrfs_dev_name(dev), physical); - } else { - btrfs_err_rl_in_rcu(fs_info, - "unable to fixup (regular) error at logical %llu on mirror %u", - logical, stripe->mirror_num); - } if (test_bit(sector_nr, &stripe->io_error_bitmap)) - if (__ratelimit(&rs) && dev) + if (__ratelimit(&rs)) scrub_print_common_warning("i/o error", dev, logical, physical); if (test_bit(sector_nr, &stripe->csum_error_bitmap)) - if (__ratelimit(&rs) && dev) + if (__ratelimit(&rs)) scrub_print_common_warning("checksum error", dev, logical, physical); if (test_bit(sector_nr, &stripe->meta_error_bitmap)) - if (__ratelimit(&rs) && dev) + if (__ratelimit(&rs)) scrub_print_common_warning("header error", dev, logical, physical); } From c9f2ed5b760df530a950fbcf79c2607f6a95900f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 20 Mar 2024 14:24:55 +1030 Subject: [PATCH 987/991] btrfs: scrub: simplify the inode iteration output The following two output are not really needed: - nlinks Normally file inodes should have nlinks as 1, for those inodes have multiple hard links, the inode/root number is still enough to pin it down to certain inode. - size The size is always fixed to sector size. By removing the nlinks output, we can reduce one inode item lookup. Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 27 +-------------------------- 1 file changed, 1 insertion(+), 26 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 16c13d2434005d..d041b433b799f2 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -388,17 +388,13 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, u64 root, void *warn_ctx) { - u32 nlink; int ret; int i; unsigned nofs_flag; - struct extent_buffer *eb; - struct btrfs_inode_item *inode_item; struct scrub_warning *swarn = warn_ctx; struct btrfs_fs_info *fs_info = swarn->dev->fs_info; struct inode_fs_paths *ipath = NULL; struct btrfs_root *local_root; - struct btrfs_key key; local_root = btrfs_get_fs_root(fs_info, root, true); if (IS_ERR(local_root)) { @@ -406,26 +402,6 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, goto err; } - /* - * this makes the path point to (inum INODE_ITEM ioff) - */ - key.objectid = inum; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - - ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0); - if (ret) { - btrfs_put_root(local_root); - btrfs_release_path(swarn->path); - goto err; - } - - eb = swarn->path->nodes[0]; - inode_item = btrfs_item_ptr(eb, swarn->path->slots[0], - struct btrfs_inode_item); - nlink = btrfs_inode_nlink(eb, inode_item); - btrfs_release_path(swarn->path); - /* * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub * uses GFP_NOFS in this context, so we keep it consistent but it does @@ -451,12 +427,11 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) btrfs_warn_in_rcu(fs_info, -"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)", +"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, path: %s", swarn->errstr, swarn->logical, btrfs_dev_name(swarn->dev), swarn->physical, root, inum, offset, - fs_info->sectorsize, nlink, (char *)(unsigned long)ipath->fspath->val[i]); btrfs_put_root(local_root); From 70bf50a7e175b2a4e1c3afce76c3c0483b58a487 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 20 Mar 2024 14:24:56 +1030 Subject: [PATCH 988/991] btrfs: scrub: ensure we output at least one error message for unrepaired corruption For btrfs scrub error messages, I have hit a lot of support cases on older kernels where no filename is outputted at all, with only error messages like: BTRFS error (device dm-0): bdev /dev/mapper/sys-rootlv errs: wr 0, rd 0, flush 0, corrupt 2823, gen 0 BTRFS error (device dm-0): unable to fixup (regular) error at logical 1563504640 on dev /dev/mapper/sys-rootlv BTRFS error (device dm-0): bdev /dev/mapper/sys-rootlv errs: wr 0, rd 0, flush 0, corrupt 2824, gen 0 BTRFS error (device dm-0): unable to fixup (regular) error at logical 1579016192 on dev /dev/mapper/sys-rootlv BTRFS error (device dm-0): bdev /dev/mapper/sys-rootlv errs: wr 0, rd 0, flush 0, corrupt 2825, gen 0 The "unable to fixup (regular) error" line shows we hit an unrepairable error, then normally we would do data/metadata backref walk to grab the correct info. But we can hit cases like the inode is already orphan (unlinked from any parent inode), or even the subvolume is orphan (unlinked and waiting to be deleted). In that case we're not sure what's the proper way to continue (Is it some critical data/metadata? Would it prevent the system from booting?) To improve the situation, this patch would: - Ensure we at least output one message for each unrepairable error If by somehow we didn't output any error message for the error, we always fallback to the basic logical/physical error output, with its type (data or metadata). Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index d041b433b799f2..c0b4ae1a5e5a6b 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -226,6 +226,7 @@ struct scrub_warning { u64 physical; u64 logical; struct btrfs_device *dev; + bool message_printed; }; static void release_scrub_stripe(struct scrub_stripe *stripe) @@ -425,7 +426,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, * we deliberately ignore the bit ipath might have been too small to * hold all of the paths here */ - for (i = 0; i < ipath->fspath->elem_cnt; ++i) + for (i = 0; i < ipath->fspath->elem_cnt; ++i) { btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, path: %s", swarn->errstr, swarn->logical, @@ -433,6 +434,8 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, swarn->physical, root, inum, offset, (char *)(unsigned long)ipath->fspath->val[i]); + swarn->message_printed = true; + } btrfs_put_root(local_root); free_ipath(ipath); @@ -445,7 +448,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, btrfs_dev_name(swarn->dev), swarn->physical, root, inum, offset, ret); - + swarn->message_printed = true; free_ipath(ipath); return 0; } @@ -471,6 +474,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * swarn.logical = logical; swarn.errstr = errstr; swarn.dev = NULL; + swarn.message_printed = false; ret = extent_from_logical(fs_info, swarn.logical, path, &found_key, &flags); @@ -492,12 +496,8 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, item_size, &ref_root, &ref_level); - if (ret < 0) { - btrfs_warn(fs_info, - "failed to resolve tree backref for logical %llu: %d", - swarn.logical, ret); + if (ret < 0) break; - } if (ret > 0) break; btrfs_warn_in_rcu(fs_info, @@ -505,7 +505,13 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * errstr, swarn.logical, btrfs_dev_name(dev), swarn.physical, (ref_level ? "node" : "leaf"), ref_level, ref_root); + swarn.message_printed = true; } + if (!swarn.message_printed) + btrfs_warn_in_rcu(fs_info, + "%s at metadata, logical %llu on dev %s physical %llu", + errstr, swarn.logical, + btrfs_dev_name(dev), swarn.physical); btrfs_release_path(path); } else { struct btrfs_backref_walk_ctx ctx = { 0 }; @@ -520,6 +526,11 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * swarn.dev = dev; iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn); + if (!swarn.message_printed) + btrfs_warn_in_rcu(fs_info, + "%s at data, filename unresolved, logical %llu on dev %s physical %llu", + errstr, swarn.logical, + btrfs_dev_name(dev), swarn.physical); } out: From f703521daa51d03d6527527aa3cf90bced08ff32 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 20 Mar 2024 14:24:57 +1030 Subject: [PATCH 989/991] btrfs: scrub: use generic ratelimit helpers to output error messages Currently scrub goes different ways to rate limits its error messages: - regular btrfs_err_rl_in_rcu() helper for repaired sectors and the initial message for unrepaired sectors - Manually rate limits scrub_print_common_warning() I'd say the different rate limits could lead to cases where we only got the "unable to fixup (regular) error" messages but the detailed report about that corruption is ratelimited. To make the whole rate limit works more consistently, change the rate limit by: - Always using btrfs_*_rl() helpers - Remove the initial "unable to fixup (regular) error" message Since we're ensured to have at least one error message for each unrepaired sector (before rate limit), there is no need for a duplicated line. And if we hit rate limits, we will rate limit all the error messages together, not treating different error messages differently. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index c0b4ae1a5e5a6b..7edcda731b8ab7 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -427,7 +427,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, * hold all of the paths here */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) { - btrfs_warn_in_rcu(fs_info, + btrfs_warn_rl_in_rcu(fs_info, "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, path: %s", swarn->errstr, swarn->logical, btrfs_dev_name(swarn->dev), @@ -442,7 +442,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, return 0; err: - btrfs_warn_in_rcu(fs_info, + btrfs_warn_rl_in_rcu(fs_info, "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d", swarn->errstr, swarn->logical, btrfs_dev_name(swarn->dev), @@ -500,7 +500,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * break; if (ret > 0) break; - btrfs_warn_in_rcu(fs_info, + btrfs_warn_rl_in_rcu(fs_info, "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", errstr, swarn.logical, btrfs_dev_name(dev), swarn.physical, (ref_level ? "node" : "leaf"), @@ -508,7 +508,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * swarn.message_printed = true; } if (!swarn.message_printed) - btrfs_warn_in_rcu(fs_info, + btrfs_warn_rl_in_rcu(fs_info, "%s at metadata, logical %llu on dev %s physical %llu", errstr, swarn.logical, btrfs_dev_name(dev), swarn.physical); @@ -527,7 +527,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn); if (!swarn.message_printed) - btrfs_warn_in_rcu(fs_info, + btrfs_warn_rl_in_rcu(fs_info, "%s at data, filename unresolved, logical %llu on dev %s physical %llu", errstr, swarn.logical, btrfs_dev_name(dev), swarn.physical); @@ -846,8 +846,6 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, static void scrub_stripe_report_errors(struct scrub_ctx *sctx, struct scrub_stripe *stripe) { - static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_device *dev = stripe->dev; u64 stripe_physical = stripe->physical; @@ -899,22 +897,14 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, } /* The remaining are all for unrepaired. */ - btrfs_err_rl_in_rcu(fs_info, - "unable to fixup (regular) error at logical %llu on dev %s physical %llu", - logical, btrfs_dev_name(dev), - physical); - if (test_bit(sector_nr, &stripe->io_error_bitmap)) - if (__ratelimit(&rs)) - scrub_print_common_warning("i/o error", dev, + scrub_print_common_warning("i/o error", dev, logical, physical); if (test_bit(sector_nr, &stripe->csum_error_bitmap)) - if (__ratelimit(&rs)) - scrub_print_common_warning("checksum error", dev, + scrub_print_common_warning("checksum error", dev, logical, physical); if (test_bit(sector_nr, &stripe->meta_error_bitmap)) - if (__ratelimit(&rs)) - scrub_print_common_warning("header error", dev, + scrub_print_common_warning("header error", dev, logical, physical); } From 986f3fb2f94d0ac470d2e39b6c22e235bbad27cf Mon Sep 17 00:00:00 2001 From: Leo Martins Date: Fri, 30 Aug 2024 13:24:54 -0700 Subject: [PATCH 990/991] btrfs: push cleanup into read_locked_inode() Move btrfs_add_inode_to_root so it can be called from btrfs_read_locked_inode, no changes were made to the function. Move cleanup code from btrfs_iget_path to btrfs_read_locked_inode. This improves readability and improves a leaky abstraction. Previously btrfs_iget_path had to handle a positive error case as a result of a call to btrfs_search_slot, but it makes more sense to handle this closer to the source of the call. Signed-off-by: Leo Martins Signed-off-by: David Sterba --- fs/btrfs/inode.c | 99 +++++++++++++++++++++++++----------------------- 1 file changed, 51 insertions(+), 48 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index edac499fd83d29..f0f12d4a3020c7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3786,8 +3786,39 @@ static int btrfs_init_file_extent_tree(struct btrfs_inode *inode) return 0; } +static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) +{ + struct btrfs_root *root = inode->root; + struct btrfs_inode *existing; + const u64 ino = btrfs_ino(inode); + int ret; + + if (inode_unhashed(&inode->vfs_inode)) + return 0; + + if (prealloc) { + ret = xa_reserve(&root->inodes, ino, GFP_NOFS); + if (ret) + return ret; + } + + existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC); + + if (xa_is_err(existing)) { + ret = xa_err(existing); + ASSERT(ret != -EINVAL); + ASSERT(ret != -ENOMEM); + return ret; + } else if (existing) { + WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); + } + + return 0; +} + /* - * read an inode from the btree into the in-memory inode + * Read a locked inode from the btree into the in-memory inode. + * On failure clean up the inode. */ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *in_path) @@ -3807,7 +3838,7 @@ static int btrfs_read_locked_inode(struct inode *inode, ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); if (ret) - return ret; + goto out; ret = btrfs_fill_inode(inode, &rdev); if (!ret) @@ -3816,7 +3847,7 @@ static int btrfs_read_locked_inode(struct inode *inode, if (!path) { path = btrfs_alloc_path(); if (!path) - return -ENOMEM; + goto out; } btrfs_get_inode_key(BTRFS_I(inode), &location); @@ -3825,7 +3856,13 @@ static int btrfs_read_locked_inode(struct inode *inode, if (ret) { if (path != in_path) btrfs_free_path(path); - return ret; + /* + * ret > 0 can come from btrfs_search_slot called by + * btrfs_lookup_inode(), this means the inode was not found. + */ + if (ret > 0) + ret = -ENOENT; + goto out; } leaf = path->nodes[0]; @@ -3988,7 +4025,15 @@ static int btrfs_read_locked_inode(struct inode *inode, } btrfs_sync_inode_flags_to_i_flags(inode); + + ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); + if (ret) + goto out; + return 0; +out: + iget_failed(inode); + return ret; } /* @@ -5500,35 +5545,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, return err; } -static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) -{ - struct btrfs_root *root = inode->root; - struct btrfs_inode *existing; - const u64 ino = btrfs_ino(inode); - int ret; - - if (inode_unhashed(&inode->vfs_inode)) - return 0; - - if (prealloc) { - ret = xa_reserve(&root->inodes, ino, GFP_NOFS); - if (ret) - return ret; - } - - existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC); - - if (xa_is_err(existing)) { - ret = xa_err(existing); - ASSERT(ret != -EINVAL); - ASSERT(ret != -ENOMEM); - return ret; - } else if (existing) { - WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); - } - return 0; -} static void btrfs_del_inode_from_root(struct btrfs_inode *inode) { @@ -5609,25 +5626,11 @@ struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, return inode; ret = btrfs_read_locked_inode(inode, path); - /* - * ret > 0 can come from btrfs_search_slot called by - * btrfs_read_locked_inode(), this means the inode item was not found. - */ - if (ret > 0) - ret = -ENOENT; - if (ret < 0) - goto error; - - ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); - if (ret < 0) - goto error; + if (ret) + return ERR_PTR(ret); unlock_new_inode(inode); - return inode; -error: - iget_failed(inode); - return ERR_PTR(ret); } struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) From 4e7847f4695636907a76363d64bb246c7c3ddc8f Mon Sep 17 00:00:00 2001 From: Leo Martins Date: Fri, 30 Aug 2024 13:24:55 -0700 Subject: [PATCH 991/991] btrfs: remove conditional path allocation Remove conditional path allocation from btrfs_read_locked_inode. Add an ASSERT(path) to indicate it should never be called with a NULL path. Call btrfs_read_locked_inode directly from btrfs_iget. This causes code duplication between btrfs_iget and btrfs_iget_path, but I think this is justifiable as it removes the need for conditionally allocating the path inside of read_locked_inode. This makes the code easier to reason about and makes it clear who has the responsibility of allocating and freeing the path. Signed-off-by: Leo Martins Signed-off-by: David Sterba --- fs/btrfs/inode.c | 46 +++++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f0f12d4a3020c7..e7e507d6218116 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3821,10 +3821,9 @@ static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) * On failure clean up the inode. */ static int btrfs_read_locked_inode(struct inode *inode, - struct btrfs_path *in_path) + struct btrfs_path *path) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_path *path = in_path; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -3844,18 +3843,12 @@ static int btrfs_read_locked_inode(struct inode *inode, if (!ret) filled = true; - if (!path) { - path = btrfs_alloc_path(); - if (!path) - goto out; - } + ASSERT(path); btrfs_get_inode_key(BTRFS_I(inode), &location); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); if (ret) { - if (path != in_path) - btrfs_free_path(path); /* * ret > 0 can come from btrfs_search_slot called by * btrfs_lookup_inode(), this means the inode was not found. @@ -3997,8 +3990,6 @@ static int btrfs_read_locked_inode(struct inode *inode, btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret); } - if (path != in_path) - btrfs_free_path(path); if (!maybe_acls) cache_no_acl(inode); @@ -5608,9 +5599,8 @@ static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) /* * Get an inode object given its inode number and corresponding root. - * Path can be preallocated to prevent recursing back to iget through - * allocator. NULL is also valid but may require an additional allocation - * later. + * Path is preallocated to prevent recursing back to iget through + * allocator. */ struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, struct btrfs_path *path) @@ -5633,9 +5623,35 @@ struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, return inode; } +/* + * Get an inode object given its inode number and corresponding root. + */ struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) { - return btrfs_iget_path(ino, root, NULL); + struct inode *inode; + struct btrfs_path *path; + int ret; + + inode = btrfs_iget_locked(ino, root); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (!(inode->i_state & I_NEW)) + return inode; + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + + ret = btrfs_read_locked_inode(inode, path); + + btrfs_free_path(path); + + if (ret) + return ERR_PTR(ret); + + unlock_new_inode(inode); + return inode; } static struct inode *new_simple_dir(struct inode *dir,