The following commit has been merged in the master branch: commit 1bcadc80ec6a46fb7193999935aaa299b4916569 Merge: c2d0e416bdd9c83db3c9bb1f19433d5ba34e18c2 3bf5dbc45e65624e7c415209aff4085d1c68e985 Author: Stephen Rothwell sfr@canb.auug.org.au Date: Tue Sep 10 13:52:39 2024 +1000
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git
diff --combined Documentation/admin-guide/media/vivid.rst index ac233b142a279,c9d301ab46a38..034ca7c77fb97 --- a/Documentation/admin-guide/media/vivid.rst +++ b/Documentation/admin-guide/media/vivid.rst @@@ -328,7 -328,7 +328,7 @@@ and an HDMI input, one input for each i detail below.
Special attention has been given to the rate at which new frames become - available. The jitter will be around 1 jiffie (that depends on the HZ + available. The jitter will be around 1 jiffy (that depends on the HZ configuration of your kernel, so usually 1/100, 1/250 or 1/1000 of a second), but the long-term behavior is exactly following the framerate. So a framerate of 59.94 Hz is really different from 60 Hz. If the framerate @@@ -1343,7 -1343,7 +1343,7 @@@ Some Future Improvement Just as a reminder and in no particular order:
- Add a virtual alsa driver to test audio -- Add virtual sub-devices and media controller support +- Add virtual sub-devices - Some support for testing compressed video - Add support to loop raw VBI output to raw VBI input - Add support to loop teletext sliced VBI output to VBI input @@@ -1358,4 -1358,4 +1358,4 @@@ - Make a thread for the RDS generation, that would help in particular for the "Controls" RDS Rx I/O Mode as the read-only RDS controls could be updated in real-time. -- Changing the EDID should cause hotplug detect emulation to happen. +- Changing the EDID doesn't wait 100 ms before setting the HPD signal. diff --combined Documentation/translations/sp_SP/scheduler/sched-design-CFS.rst index c146e5bba8818,731c266beb1a1..dc728c739e28d --- a/Documentation/translations/sp_SP/scheduler/sched-design-CFS.rst +++ b/Documentation/translations/sp_SP/scheduler/sched-design-CFS.rst @@@ -14,10 -14,10 +14,10 @@@ Gestor de tareas CF
CFS viene de las siglas en inglés de "Gestor de tareas totalmente justo" ("Completely Fair Scheduler"), y es el nuevo gestor de tareas de escritorio -implementado por Ingo Molnar e integrado en Linux 2.6.23. Es el sustituto de -el previo gestor de tareas SCHED_OTHER. - -Nota: El planificador EEVDF fue incorporado más recientemente al kernel. +implementado por Ingo Molnar e integrado en Linux 2.6.23. Es el sustituto +del previo gestor de tareas SCHED_OTHER. Hoy en día se está abriendo camino +para el gestor de tareas EEVDF, cuya documentación se puede ver en +Documentation/scheduler/sched-eevdf.rst
El 80% del diseño de CFS puede ser resumido en una única frase: CFS básicamente modela una "CPU ideal, precisa y multi-tarea" sobre hardware @@@ -109,7 -109,7 +109,7 @@@ para que se ejecute, y la tarea en ejec ==================================
CFS usa una granularidad de nanosegundos y no depende de ningún - jiffie o detalles como HZ. De este modo, el gestor de tareas CFS no tiene + jiffy o detalles como HZ. De este modo, el gestor de tareas CFS no tiene noción de "ventanas de tiempo" de la forma en que tenía el gestor de tareas previo, y tampoco tiene heurísticos. Únicamente hay un parámetro central ajustable (se ha de cambiar en CONFIG_SCHED_DEBUG): diff --combined MAINTAINERS index 87c33c5c2aa50,e1d135484da58..d5eaff008ed44 --- a/MAINTAINERS +++ b/MAINTAINERS @@@ -334,7 -334,6 +334,7 @@@ L: linux-acpi@vger.kernel.or L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: drivers/acpi/arm64 +F: include/linux/acpi_iort.h
ACPI FOR RISC-V (ACPI/riscv) M: Sunil V L sunilvl@ventanamicro.com @@@ -538,17 -537,6 +538,17 @@@ F: drivers/leds/leds-adp5520. F: drivers/mfd/adp5520.c F: drivers/video/backlight/adp5520_bl.c
+ADP5585 GPIO EXPANDER, PWM AND KEYPAD CONTROLLER DRIVER +M: Laurent Pinchart laurent.pinchart@ideasonboard.com +L: linux-gpio@vger.kernel.org +L: linux-pwm@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/*/adi,adp5585*.yaml +F: drivers/gpio/gpio-adp5585.c +F: drivers/mfd/adp5585.c +F: drivers/pwm/pwm-adp5585.c +F: include/linux/mfd/adp5585.h + ADP5588 QWERTY KEYPAD AND IO EXPANDER DRIVER (ADP5588/ADP5587) M: Michael Hennerich michael.hennerich@analog.com S: Supported @@@ -1025,13 -1013,6 +1025,13 @@@ S: Supporte T: git https://gitlab.freedesktop.org/agd5f/linux.git F: drivers/gpu/drm/amd/display/
+AMD DISPLAY CORE - DML +M: Chaitanya Dhere chaitanya.dhere@amd.com +M: Jun Lei jun.lei@amd.com +S: Supported +F: drivers/gpu/drm/amd/display/dc/dml/ +F: drivers/gpu/drm/amd/display/dc/dml2/ + AMD FAM15H PROCESSOR POWER MONITORING DRIVER M: Huang Rui ray.huang@amd.com L: linux-hwmon@vger.kernel.org @@@ -1172,13 -1153,6 +1172,13 @@@ S: Supporte F: arch/arm64/boot/dts/amd/amd-seattle-xgbe*.dtsi F: drivers/net/ethernet/amd/xgbe/
+AMLOGIC BLUETOOTH DRIVER +M: Yang Li yang.li@amlogic.com +L: linux-bluetooth@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/net/bluetooth/amlogic,w155s2-bt.yaml +F: drivers/bluetooth/hci_aml.c + AMLOGIC DDR PMU DRIVER M: Jiucheng Xu jiucheng.xu@amlogic.com L: linux-amlogic@lists.infradead.org @@@ -1228,13 -1202,6 +1228,13 @@@ W: https://ez.analog.com/linux-software F: Documentation/devicetree/bindings/iio/dac/adi,ad3552r.yaml F: drivers/iio/dac/ad3552r.c
+ANALOG DEVICES INC AD4000 DRIVER +M: Marcelo Schmitt marcelo.schmitt@analog.com +L: linux-iio@vger.kernel.org +S: Supported +W: https://ez.analog.com/linux-software-drivers +F: Documentation/devicetree/bindings/iio/adc/adi,ad4000.yaml + ANALOG DEVICES INC AD4130 DRIVER M: Cosmin Tanislav cosmin.tanislav@analog.com L: linux-iio@vger.kernel.org @@@ -1642,14 -1609,6 +1642,14 @@@ F: Documentation/admin-guide/perf/xgene F: Documentation/devicetree/bindings/perf/apm-xgene-pmu.txt F: drivers/perf/xgene_pmu.c
+APPLIED MICRO QT2025 PHY DRIVER +M: FUJITA Tomonori fujita.tomonori@gmail.com +R: Trevor Gross tmgross@umich.edu +L: netdev@vger.kernel.org +L: rust-for-linux@vger.kernel.org +S: Maintained +F: drivers/net/phy/qt2025.rs + APTINA CAMERA SENSOR PLL M: Laurent Pinchart Laurent.pinchart@ideasonboard.com L: linux-media@vger.kernel.org @@@ -1778,17 -1737,6 +1778,17 @@@ F: drivers/mtd/maps/physmap-versatile. F: drivers/power/reset/arm-versatile-reboot.c F: drivers/soc/versatile/
+ARM INTERCONNECT PMU DRIVERS +M: Robin Murphy robin.murphy@arm.com +S: Supported +F: Documentation/admin-guide/perf/arm-cmn.rst +F: Documentation/admin-guide/perf/arm-ni.rst +F: Documentation/devicetree/bindings/perf/arm,cmn.yaml +F: Documentation/devicetree/bindings/perf/arm,ni.yaml +F: drivers/perf/arm-cmn.c +F: drivers/perf/arm-ni.c +F: tools/perf/pmu-events/arch/arm64/arm/cmn/ + ARM KOMEDA DRM-KMS DRIVER M: Liviu Dudau liviu.dudau@arm.com S: Supported @@@ -1806,7 -1754,6 +1806,7 @@@ L: dri-devel@lists.freedesktop.or S: Supported T: git https://gitlab.freedesktop.org/drm/misc/kernel.git F: Documentation/gpu/panfrost.rst +F: drivers/gpu/drm/ci/xfails/panfrost* F: drivers/gpu/drm/panfrost/ F: include/uapi/drm/panfrost_drm.h
@@@ -3839,9 -3786,10 +3839,9 @@@ F: Documentation/filesystems/befs.rs F: fs/befs/
BFQ I/O SCHEDULER -M: Paolo Valente paolo.valente@unimore.it -M: Jens Axboe axboe@kernel.dk +M: Yu Kuai yukuai3@huawei.com L: linux-block@vger.kernel.org -S: Maintained +S: Odd Fixes F: Documentation/block/bfq-iosched.rst F: block/bfq-*
@@@ -5153,8 -5101,10 +5153,8 @@@ F: Documentation/devicetree/bindings/me F: drivers/media/cec/platform/cec-gpio/
CELL BROADBAND ENGINE ARCHITECTURE -M: Arnd Bergmann arnd@arndb.de L: linuxppc-dev@lists.ozlabs.org -S: Supported -W: http://www.ibm.com/developerworks/power/cell/ +S: Orphan F: arch/powerpc/include/asm/cell*.h F: arch/powerpc/include/asm/spu*.h F: arch/powerpc/include/uapi/asm/spu*.h @@@ -5247,7 -5197,7 +5247,7 @@@ F: Documentation/dev-tools/checkpatch.r
CHINESE DOCUMENTATION M: Alex Shi alexs@kernel.org -M: Yanteng Si siyanteng@loongson.cn +M: Yanteng Si si.yanteng@linux.dev S: Maintained F: Documentation/translations/zh_CN/
@@@ -5874,9 -5824,6 +5874,9 @@@ CPU POWER MONITORING SUBSYSTE M: Thomas Renninger trenn@suse.com M: Shuah Khan shuah@kernel.org M: Shuah Khan skhan@linuxfoundation.org +M: John B. Wyatt IV jwyatt@redhat.com +M: John B. Wyatt IV sageofredondo@gmail.com +M: John Kacur jkacur@redhat.com L: linux-pm@vger.kernel.org S: Maintained F: tools/power/cpupower/ @@@ -6559,7 -6506,6 +6559,7 @@@ F: Documentation/devicetree/bindings/re F: Documentation/devicetree/bindings/regulator/dlg,da9*.yaml F: Documentation/devicetree/bindings/regulator/dlg,slg51000.yaml F: Documentation/devicetree/bindings/sound/da[79]*.txt +F: Documentation/devicetree/bindings/sound/dlg,da7213.yaml F: Documentation/devicetree/bindings/thermal/dlg,da9062-thermal.yaml F: Documentation/devicetree/bindings/watchdog/dlg,da9062-watchdog.yaml F: Documentation/hwmon/da90??.rst @@@ -6720,7 -6666,6 +6720,7 @@@ F: drivers/dma-buf/dma-heap. F: drivers/dma-buf/heaps/* F: include/linux/dma-heap.h F: include/uapi/linux/dma-heap.h +F: tools/testing/selftests/dmabuf-heaps/
DMC FREQUENCY DRIVER FOR SAMSUNG EXYNOS5422 M: Lukasz Luba lukasz.luba@arm.com @@@ -6772,7 -6717,6 +6772,7 @@@ DOCUMENTATION PROCES M: Jonathan Corbet corbet@lwn.net L: workflows@vger.kernel.org S: Maintained +F: Documentation/dev-tools/ F: Documentation/maintainer/ F: Documentation/process/
@@@ -6780,7 -6724,6 +6780,7 @@@ DOCUMENTATION REPORTING ISSUE M: Thorsten Leemhuis linux@leemhuis.info L: linux-doc@vger.kernel.org S: Maintained +F: Documentation/admin-guide/bug-bisect.rst F: Documentation/admin-guide/quickly-build-trimmed-linux.rst F: Documentation/admin-guide/reporting-issues.rst F: Documentation/admin-guide/verify-bugs-and-bisect-regressions.rst @@@ -7395,10 -7338,10 +7395,10 @@@ F: drivers/gpu/drm/udl
DRM DRIVER FOR VIRTUAL KERNEL MODESETTING (VKMS) M: Rodrigo Siqueira rodrigosiqueiramelo@gmail.com -M: Melissa Wen melissa.srw@gmail.com M: Maíra Canal mairacanal@riseup.net R: Haneen Mohammed hamohammed.sa@gmail.com -R: Daniel Vetter daniel@ffwll.ch +R: Simona Vetter simona@ffwll.ch +R: Melissa Wen melissa.srw@gmail.com L: dri-devel@lists.freedesktop.org S: Maintained T: git https://gitlab.freedesktop.org/drm/misc/kernel.git @@@ -7431,7 -7374,7 +7431,7 @@@ F: drivers/gpu/drm/panel/panel-widechip
DRM DRIVERS M: David Airlie airlied@gmail.com -M: Daniel Vetter daniel@ffwll.ch +M: Simona Vetter simona@ffwll.ch L: dri-devel@lists.freedesktop.org S: Maintained B: https://gitlab.freedesktop.org/drm @@@ -7527,6 -7470,7 +7527,6 @@@ M: Kyungmin Park <kyungmin.park@samsung L: dri-devel@lists.freedesktop.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/daeinki/drm-exynos.git -F: Documentation/devicetree/bindings/display/exynos/ F: Documentation/devicetree/bindings/display/samsung/ F: drivers/gpu/drm/exynos/ F: include/uapi/drm/exynos_drm.h @@@ -8401,7 -8345,6 +8401,7 @@@ F: include/linux/mii. F: include/linux/of_net.h F: include/linux/phy.h F: include/linux/phy_fixed.h +F: include/linux/phy_link_topology.h F: include/linux/phylib_stubs.h F: include/linux/platform_data/mdio-bcm-unimac.h F: include/linux/platform_data/mdio-gpio.h @@@ -8417,7 -8360,6 +8417,7 @@@ L: netdev@vger.kernel.or L: rust-for-linux@vger.kernel.org S: Maintained F: rust/kernel/net/phy.rs +F: rust/kernel/net/phy/reg.rs
EXEC & BINFMT API, ELF R: Eric Biederman ebiederm@xmission.com @@@ -8598,9 -8540,8 +8598,9 @@@ F: drivers/net/wan/farsync. FAULT INJECTION SUPPORT M: Akinobu Mita akinobu.mita@gmail.com S: Supported -F: Documentation/fault-injection/ +F: Documentation/dev-tools/fault-injection/ F: lib/fault-inject.c +F: tools/testing/fault-injection/
FBTFT Framebuffer drivers L: dri-devel@lists.freedesktop.org @@@ -8662,7 -8603,6 +8662,7 @@@ M: Christian Brauner <brauner@kernel.or R: Jan Kara jack@suse.cz L: linux-fsdevel@vger.kernel.org S: Maintained +T: git https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git F: fs/* F: include/linux/fs.h F: include/linux/fs_types.h @@@ -8875,7 -8815,7 +8875,7 @@@ W: https://floatingpoint.billm.au F: arch/x86/math-emu/
FRAMEBUFFER CORE -M: Daniel Vetter daniel@ffwll.ch +M: Simona Vetter simona@ffwll.ch S: Odd Fixes T: git https://gitlab.freedesktop.org/drm/misc/kernel.git F: drivers/video/fbdev/core/ @@@ -9072,7 -9012,6 +9072,7 @@@ M: Herve Codina <herve.codina@bootlin.c L: linuxppc-dev@lists.ozlabs.org S: Maintained F: Documentation/devicetree/bindings/soc/fsl/cpm_qe/fsl,cpm1-scc-qmc.yaml +F: Documentation/devicetree/bindings/soc/fsl/cpm_qe/fsl,qe-ucc-qmc.yaml F: drivers/soc/fsl/qe/qmc.c F: include/soc/fsl/qe/qmc.h
@@@ -9088,11 -9027,9 +9088,11 @@@ M: Herve Codina <herve.codina@bootlin.c L: linuxppc-dev@lists.ozlabs.org S: Maintained F: Documentation/devicetree/bindings/soc/fsl/cpm_qe/fsl,cpm1-tsa.yaml +F: Documentation/devicetree/bindings/soc/fsl/cpm_qe/fsl,qe-tsa.yaml F: drivers/soc/fsl/qe/tsa.c F: drivers/soc/fsl/qe/tsa.h F: include/dt-bindings/soc/cpm1-fsl,tsa.h +F: include/dt-bindings/soc/qe-fsl,tsa.h
FREESCALE QUICC ENGINE UCC ETHERNET DRIVER L: netdev@vger.kernel.org @@@ -10089,10 -10026,12 +10089,12 @@@ S: Maintaine T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core F: Documentation/timers/ F: include/linux/clockchips.h + F: include/linux/delay.h F: include/linux/hrtimer.h F: include/linux/timer.h F: kernel/time/clockevents.c F: kernel/time/hrtimer.c + F: kernel/time/sleep_timeout.c F: kernel/time/timer.c F: kernel/time/timer_list.c F: kernel/time/timer_migration.* @@@ -11042,7 -10981,6 +11044,7 @@@ T: git https://gitlab.freedesktop.org/d F: Documentation/devicetree/bindings/gpu/img,powervr-rogue.yaml F: Documentation/devicetree/bindings/gpu/img,powervr-sgx.yaml F: Documentation/gpu/imagination/ +F: drivers/gpu/drm/ci/xfails/powervr* F: drivers/gpu/drm/imagination/ F: include/uapi/drm/pvr_drm.h
@@@ -11168,17 -11106,10 +11170,17 @@@ F: Documentation/devicetree/bindings/se F: Documentation/input/ F: drivers/input/ F: include/dt-bindings/input/ +F: include/linux/gameport.h +F: include/linux/i8042.h F: include/linux/input.h F: include/linux/input/ +F: include/linux/libps2.h +F: include/linux/serio.h +F: include/uapi/linux/gameport.h F: include/uapi/linux/input-event-codes.h F: include/uapi/linux/input.h +F: include/uapi/linux/serio.h +F: include/uapi/linux/uinput.h
INPUT MULTITOUCH (MT) PROTOCOL M: Henrik Rydberg rydberg@bitmath.org @@@ -11205,16 -11136,6 +11207,16 @@@ T: git git://git.kernel.org/pub/scm/lin F: security/integrity/ F: security/integrity/ima/
+INTEGRITY POLICY ENFORCEMENT (IPE) +M: Fan Wu wufan@linux.microsoft.com +L: linux-security-module@vger.kernel.org +S: Supported +T: git https://github.com/microsoft/ipe.git +F: Documentation/admin-guide/LSM/ipe.rst +F: Documentation/security/ipe.rst +F: scripts/ipe/ +F: security/ipe/ + INTEL 810/815 FRAMEBUFFER DRIVER M: Antonino Daplas adaplas@gmail.com L: linux-fbdev@vger.kernel.org @@@ -11819,7 -11740,6 +11821,7 @@@ T: git git://git.kernel.org/pub/scm/lin F: drivers/iommu/dma-iommu.c F: drivers/iommu/dma-iommu.h F: drivers/iommu/iova.c +F: include/linux/iommu-dma.h F: include/linux/iova.h
IOMMU SUBSYSTEM @@@ -12390,7 -12310,6 +12392,7 @@@ L: kvm@vger.kernel.or L: loongarch@lists.linux.dev S: Maintained T: git git://git.kernel.org/pub/scm/virt/kvm/kvm.git +F: Documentation/virt/kvm/loongarch/ F: arch/loongarch/include/asm/kvm* F: arch/loongarch/include/uapi/asm/kvm* F: arch/loongarch/kvm/ @@@ -13600,7 -13519,7 +13602,7 @@@ S: Maintaine F: Documentation/devicetree/bindings/mfd/marvell,88pm886-a1.yaml F: drivers/input/misc/88pm886-onkey.c F: drivers/mfd/88pm886.c -F: drivers/regulators/88pm886-regulator.c +F: drivers/regulator/88pm886-regulator.c F: include/linux/mfd/88pm886.h
MARVELL ARMADA 3700 PHY DRIVERS @@@ -14311,8 -14230,8 +14313,8 @@@ M: Sean Wang <sean.wang@mediatek.com L: linux-bluetooth@vger.kernel.org L: linux-mediatek@lists.infradead.org (moderated for non-subscribers) S: Maintained +F: Documentation/devicetree/bindings/net/bluetooth/mediatek,bluetooth.txt F: Documentation/devicetree/bindings/net/bluetooth/mediatek,mt7921s-bluetooth.yaml -F: Documentation/devicetree/bindings/net/mediatek-bluetooth.txt F: drivers/bluetooth/btmtkuart.c
MEDIATEK BOARD LEVEL SHUTDOWN DRIVERS @@@ -14590,7 -14509,7 +14592,7 @@@ MELLANOX ETHERNET DRIVER (mlx4_en M: Tariq Toukan tariqt@nvidia.com L: netdev@vger.kernel.org S: Supported -W: http://www.mellanox.com +W: https://www.nvidia.com/networking/ Q: https://patchwork.kernel.org/project/netdevbpf/list/ F: drivers/net/ethernet/mellanox/mlx4/en_*
@@@ -14599,7 -14518,7 +14601,7 @@@ M: Saeed Mahameed <saeedm@nvidia.com M: Tariq Toukan tariqt@nvidia.com L: netdev@vger.kernel.org S: Supported -W: http://www.mellanox.com +W: https://www.nvidia.com/networking/ Q: https://patchwork.kernel.org/project/netdevbpf/list/ F: drivers/net/ethernet/mellanox/mlx5/core/en_*
@@@ -14607,7 -14526,7 +14609,7 @@@ MELLANOX ETHERNET INNOVA DRIVER R: Boris Pismenny borisp@nvidia.com L: netdev@vger.kernel.org S: Supported -W: http://www.mellanox.com +W: https://www.nvidia.com/networking/ Q: https://patchwork.kernel.org/project/netdevbpf/list/ F: drivers/net/ethernet/mellanox/mlx5/core/en_accel/* F: drivers/net/ethernet/mellanox/mlx5/core/fpga/* @@@ -14618,7 -14537,7 +14620,7 @@@ M: Ido Schimmel <idosch@nvidia.com M: Petr Machata petrm@nvidia.com L: netdev@vger.kernel.org S: Supported -W: http://www.mellanox.com +W: https://www.nvidia.com/networking/ Q: https://patchwork.kernel.org/project/netdevbpf/list/ F: drivers/net/ethernet/mellanox/mlxsw/ F: tools/testing/selftests/drivers/net/mlxsw/ @@@ -14627,7 -14546,7 +14629,7 @@@ MELLANOX FIRMWARE FLASH LIBRARY (mlxfw M: mlxsw@nvidia.com L: netdev@vger.kernel.org S: Supported -W: http://www.mellanox.com +W: https://www.nvidia.com/networking/ Q: https://patchwork.kernel.org/project/netdevbpf/list/ F: drivers/net/ethernet/mellanox/mlxfw/
@@@ -14646,7 -14565,7 +14648,7 @@@ M: Tariq Toukan <tariqt@nvidia.com L: netdev@vger.kernel.org L: linux-rdma@vger.kernel.org S: Supported -W: http://www.mellanox.com +W: https://www.nvidia.com/networking/ Q: https://patchwork.kernel.org/project/netdevbpf/list/ F: drivers/net/ethernet/mellanox/mlx4/ F: include/linux/mlx4/ @@@ -14655,7 -14574,7 +14657,7 @@@ MELLANOX MLX4 IB drive M: Yishai Hadas yishaih@nvidia.com L: linux-rdma@vger.kernel.org S: Supported -W: http://www.mellanox.com +W: https://www.nvidia.com/networking/ Q: http://patchwork.kernel.org/project/linux-rdma/list/ F: drivers/infiniband/hw/mlx4/ F: include/linux/mlx4/ @@@ -14668,7 -14587,7 +14670,7 @@@ M: Tariq Toukan <tariqt@nvidia.com L: netdev@vger.kernel.org L: linux-rdma@vger.kernel.org S: Supported -W: http://www.mellanox.com +W: https://www.nvidia.com/networking/ Q: https://patchwork.kernel.org/project/netdevbpf/list/ F: Documentation/networking/device_drivers/ethernet/mellanox/ F: drivers/net/ethernet/mellanox/mlx5/core/ @@@ -14678,7 -14597,7 +14680,7 @@@ MELLANOX MLX5 IB drive M: Leon Romanovsky leonro@nvidia.com L: linux-rdma@vger.kernel.org S: Supported -W: http://www.mellanox.com +W: https://www.nvidia.com/networking/ Q: http://patchwork.kernel.org/project/linux-rdma/list/ F: drivers/infiniband/hw/mlx5/ F: include/linux/mlx5/ @@@ -15301,12 -15220,6 +15303,12 @@@ S: Maintaine F: Documentation/hwmon/surface_fan.rst F: drivers/hwmon/surface_fan.c
+MICROSOFT SURFACE SENSOR THERMAL DRIVER +M: Maximilian Luz luzmaximilian@gmail.com +L: linux-hwmon@vger.kernel.org +S: Maintained +F: drivers/hwmon/surface_temp.c + MICROSOFT SURFACE GPE LID SUPPORT DRIVER M: Maximilian Luz luzmaximilian@gmail.com L: platform-driver-x86@vger.kernel.org @@@ -15559,9 -15472,6 +15561,9 @@@ F: include/dt-bindings/clock/mobileye,e
MODULE SUPPORT M: Luis Chamberlain mcgrof@kernel.org +R: Petr Pavlu petr.pavlu@suse.com +R: Sami Tolvanen samitolvanen@google.com +R: Daniel Gomez da.gomez@samsung.com L: linux-modules@vger.kernel.org L: linux-kernel@vger.kernel.org S: Maintained @@@ -15880,7 -15790,6 +15882,7 @@@ M: Breno Leitao <leitao@debian.org S: Maintained F: Documentation/networking/netconsole.rst F: drivers/net/netconsole.c +F: tools/testing/selftests/drivers/net/netcons_basic.sh
NETDEVSIM M: Jakub Kicinski kuba@kernel.org @@@ -16924,7 -16833,6 +16926,7 @@@ OMNIVISION OG01A1B SENSOR DRIVE M: Sakari Ailus sakari.ailus@linux.intel.com L: linux-media@vger.kernel.org S: Maintained +F: Documentation/devicetree/bindings/media/i2c/ovti,og01a1b.yaml F: drivers/media/i2c/og01a1b.c
OMNIVISION OV01A10 SENSOR DRIVER @@@ -17506,7 -17414,7 +17508,7 @@@ PCI DRIVER FOR ALTERA PCIE I M: Joyce Ooi joyce.ooi@intel.com L: linux-pci@vger.kernel.org S: Supported -F: Documentation/devicetree/bindings/pci/altera-pcie.txt +F: Documentation/devicetree/bindings/pci/altr,pcie-root-port.yaml F: drivers/pci/controller/pcie-altera.c
PCI DRIVER FOR APPLIEDMICRO XGENE @@@ -17738,7 -17646,7 +17740,7 @@@ PCI MSI DRIVER FOR ALTERA MSI I M: Joyce Ooi joyce.ooi@intel.com L: linux-pci@vger.kernel.org S: Supported -F: Documentation/devicetree/bindings/pci/altera-pcie-msi.txt +F: Documentation/devicetree/bindings/pci/altr,msi-controller.yaml F: drivers/pci/controller/pcie-altera-msi.c
PCI MSI DRIVER FOR APPLIEDMICRO XGENE @@@ -18881,7 -18789,7 +18883,7 @@@ M: Bryan O'Donoghue <bryan.odonoghue@li L: linux-media@vger.kernel.org S: Maintained F: Documentation/admin-guide/media/qcom_camss.rst -F: Documentation/devicetree/bindings/media/*camss* +F: Documentation/devicetree/bindings/media/qcom,*camss* F: drivers/media/platform/qcom/camss/
QUALCOMM CLOCK DRIVERS @@@ -18896,6 -18804,7 +18898,6 @@@ F: include/dt-bindings/clock/qcom, QUALCOMM CLOUD AI (QAIC) DRIVER M: Jeffrey Hugo quic_jhugo@quicinc.com R: Carl Vanderlip quic_carlv@quicinc.com -R: Pranjal Ramajor Asha Kanojiya quic_pkanojiy@quicinc.com L: linux-arm-msm@vger.kernel.org L: dri-devel@lists.freedesktop.org S: Supported @@@ -18990,7 -18899,6 +18992,7 @@@ L: linux-arm-msm@vger.kernel.or S: Maintained F: Documentation/devicetree/bindings/interconnect/qcom,msm8998-bwmon.yaml F: drivers/soc/qcom/icc-bwmon.c +F: drivers/soc/qcom/trace_icc-bwmon.h
QUALCOMM IOMMU M: Rob Clark robdclark@gmail.com @@@ -19330,7 -19238,6 +19332,7 @@@ S: Supporte W: https://oss.oracle.com/projects/rds/ F: Documentation/networking/rds.rst F: net/rds/ +F: tools/testing/selftests/net/rds/
RDT - RESOURCE ALLOCATION M: Fenghua Yu fenghua.yu@intel.com @@@ -19790,10 -19697,12 +19792,10 @@@ L: linux-riscv@lists.infradead.or S: Maintained Q: https://patchwork.kernel.org/project/linux-riscv/list/ T: git https://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git/ -F: Documentation/devicetree/bindings/riscv/ -F: arch/riscv/boot/dts/ -X: arch/riscv/boot/dts/allwinner/ -X: arch/riscv/boot/dts/renesas/ -X: arch/riscv/boot/dts/sophgo/ -X: arch/riscv/boot/dts/thead/ +F: arch/riscv/boot/dts/canaan/ +F: arch/riscv/boot/dts/microchip/ +F: arch/riscv/boot/dts/sifive/ +F: arch/riscv/boot/dts/starfive/
RISC-V PMU DRIVERS M: Atish Patra atishp@atishpatra.org @@@ -19831,14 -19740,6 +19833,14 @@@ F: Documentation/ABI/*/sysfs-driver-hid F: drivers/hid/hid-roccat* F: include/linux/hid-roccat*
+ROCKCHIP CAN-FD DRIVER +M: Marc Kleine-Budde mkl@pengutronix.de +R: kernel@pengutronix.de +L: linux-can@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/net/can/rockchip,rk3568v2-canfd.yaml +F: drivers/net/can/rockchip/ + ROCKCHIP CRYPTO DRIVERS M: Corentin Labbe clabbe@baylibre.com L: linux-crypto@vger.kernel.org @@@ -19865,13 -19766,6 +19867,13 @@@ F: Documentation/userspace-api/media/v4 F: drivers/media/platform/rockchip/rkisp1 F: include/uapi/linux/rkisp1-config.h
+ROCKCHIP RK3568 RANDOM NUMBER GENERATOR SUPPORT +M: Daniel Golle daniel@makrotopia.org +M: Aurelien Jarno aurelien@aurel32.net +S: Maintained +F: Documentation/devicetree/bindings/rng/rockchip,rk3568-rng.yaml +F: drivers/char/hw_random/rockchip-rng.c + ROCKCHIP RASTER 2D GRAPHIC ACCELERATION UNIT DRIVER M: Jacob Chen jacob-chen@iotwrt.com M: Ezequiel Garcia ezequiel@vanguardiasur.com.ar @@@ -19988,26 -19882,12 +19990,26 @@@ T: git git://linuxtv.org/media_tree.gi F: Documentation/devicetree/bindings/media/allwinner,sun8i-a83t-de2-rotate.yaml F: drivers/media/platform/sunxi/sun8i-rotate/
+RPMB SUBSYSTEM +M: Jens Wiklander jens.wiklander@linaro.org +L: linux-kernel@vger.kernel.org +S: Supported +F: drivers/misc/rpmb-core.c +F: include/linux/rpmb.h + RPMSG TTY DRIVER M: Arnaud Pouliquen arnaud.pouliquen@foss.st.com L: linux-remoteproc@vger.kernel.org S: Maintained F: drivers/tty/rpmsg_tty.c
+RTASE ETHERNET DRIVER +M: Justin Lai justinlai0215@realtek.com +M: Larry Chiu larry.chiu@realtek.com +L: netdev@vger.kernel.org +S: Maintained +F: drivers/net/ethernet/realtek/rtase/ + RTL2830 MEDIA DRIVER L: linux-media@vger.kernel.org S: Orphan @@@ -20272,16 -20152,6 +20274,16 @@@ B: mailto:linux-samsung-soc@vger.kernel F: Documentation/devicetree/bindings/sound/samsung* F: sound/soc/samsung/
+SAMSUNG EXYNOS850 SoC SUPPORT +M: Sam Protsenko semen.protsenko@linaro.org +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) +L: linux-samsung-soc@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/clock/samsung,exynos850-clock.yaml +F: arch/arm64/boot/dts/exynos/exynos850* +F: drivers/clk/samsung/clk-exynos850.c +F: include/dt-bindings/clock/exynos850.h + SAMSUNG EXYNOS PSEUDO RANDOM NUMBER GENERATOR (RNG) DRIVER M: Krzysztof Kozlowski krzk@kernel.org L: linux-crypto@vger.kernel.org @@@ -21669,8 -21539,10 +21671,8 @@@ F: include/linux/spmi. F: include/trace/events/spmi.h
SPU FILE SYSTEM -M: Jeremy Kerr jk@ozlabs.org L: linuxppc-dev@lists.ozlabs.org -S: Supported -W: http://www.ibm.com/developerworks/power/cell/ +S: Orphan F: Documentation/filesystems/spufs/spufs.rst F: arch/powerpc/platforms/cell/spufs/
@@@ -22599,7 -22471,6 +22601,7 @@@ M: Jens Wiklander <jens.wiklander@linar R: Sumit Garg sumit.garg@linaro.org L: op-tee@lists.trustedfirmware.org S: Maintained +F: Documentation/ABI/testing/sysfs-class-tee F: Documentation/driver-api/tee.rst F: Documentation/tee/ F: Documentation/userspace-api/tee.rst @@@ -22645,7 -22516,6 +22647,7 @@@ M: Thierry Reding <thierry.reding@gmail R: Krishna Reddy vdumpa@nvidia.com L: linux-tegra@vger.kernel.org S: Supported +F: drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c F: drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c F: drivers/iommu/tegra*
@@@ -22751,11 -22621,12 +22753,11 @@@ F: Documentation/devicetree/bindings/so F: Documentation/devicetree/bindings/sound/ti,tas2562.yaml F: Documentation/devicetree/bindings/sound/ti,tas2770.yaml F: Documentation/devicetree/bindings/sound/ti,tas27xx.yaml +F: Documentation/devicetree/bindings/sound/ti,tpa6130a2.yaml F: Documentation/devicetree/bindings/sound/ti,pcm1681.yaml F: Documentation/devicetree/bindings/sound/ti,pcm3168a.yaml F: Documentation/devicetree/bindings/sound/ti,tlv320*.yaml F: Documentation/devicetree/bindings/sound/ti,tlv320adcx140.yaml -F: Documentation/devicetree/bindings/sound/tlv320aic31xx.txt -F: Documentation/devicetree/bindings/sound/tpa6130a2.txt F: include/sound/tas2*.h F: include/sound/tlv320*.h F: include/sound/tpa6130a2-plat.h @@@ -23333,7 -23204,6 +23335,7 @@@ Q: https://patchwork.kernel.org/project T: git git://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-tpmdd.git F: Documentation/devicetree/bindings/tpm/ F: drivers/char/tpm/ +F: tools/testing/selftests/tpm2/
TPS546D24 DRIVER M: Duke Du dukedu83@gmail.com @@@ -23346,8 -23216,9 +23348,8 @@@ TQ SYSTEMS BOARD & DRIVER SUPPOR L: linux@ew.tq-group.com S: Supported W: https://www.tq-group.com/en/products/tq-embedded/ -F: arch/arm/boot/dts/imx*mba*.dts* -F: arch/arm/boot/dts/imx*tqma*.dts* -F: arch/arm/boot/dts/mba*.dtsi +F: arch/arm/boot/dts/nxp/imx/*mba*.dts* +F: arch/arm/boot/dts/nxp/imx/*tqma*.dts* F: arch/arm64/boot/dts/freescale/fsl-*tqml*.dts* F: arch/arm64/boot/dts/freescale/imx*mba*.dts* F: arch/arm64/boot/dts/freescale/imx*tqma*.dts* @@@ -24555,20 -24426,6 +24557,20 @@@ F: include/uapi/linux/vsockmon. F: net/vmw_vsock/ F: tools/testing/vsock/
+VMA +M: Andrew Morton akpm@linux-foundation.org +R: Liam R. Howlett Liam.Howlett@oracle.com +R: Vlastimil Babka vbabka@suse.cz +R: Lorenzo Stoakes lorenzo.stoakes@oracle.com +L: linux-mm@kvack.org +S: Maintained +W: https://www.linux-mm.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +F: mm/vma.c +F: mm/vma.h +F: mm/vma_internal.h +F: tools/testing/vma/ + VMALLOC M: Andrew Morton akpm@linux-foundation.org R: Uladzislau Rezki urezki@gmail.com @@@ -24958,6 -24815,17 +24960,17 @@@ T: git git://git.kernel.org/pub/scm/lin F: Documentation/arch/x86/ F: Documentation/devicetree/bindings/x86/ F: arch/x86/ + F: tools/testing/selftests/x86 + + X86 CPUID DATABASE + M: Borislav Petkov bp@alien8.de + M: Thomas Gleixner tglx@linutronix.de + M: x86@kernel.org + R: Ahmed S. Darwish darwi@linutronix.de + L: x86-cpuid@lists.linux.dev + S: Maintained + W: https://x86-cpuid.org + F: tools/arch/x86/kcpuid/cpuid.csv
X86 ENTRY CODE M: Andy Lutomirski luto@kernel.org @@@ -25400,19 -25268,6 +25413,19 @@@ S: Maintaine F: drivers/spi/spi-xtensa-xtfpga.c F: sound/soc/xtensa/xtfpga-i2s.c
+XZ EMBEDDED +M: Lasse Collin lasse.collin@tukaani.org +S: Maintained +W: https://tukaani.org/xz/embedded.html +B: https://github.com/tukaani-project/xz-embedded/issues +C: irc://irc.libera.chat/tukaani +F: Documentation/staging/xz.rst +F: include/linux/decompress/unxz.h +F: include/linux/xz.h +F: lib/decompress_unxz.c +F: lib/xz/ +F: scripts/xz_wrap.sh + YAM DRIVER FOR AX.25 M: Jean-Paul Roubelat jpr@f6fbb.org L: linux-hams@vger.kernel.org @@@ -25437,6 -25292,7 +25450,6 @@@ F: tools/net/ynl
YEALINK PHONE DRIVER M: Henk Vergonet Henk.Vergonet@gmail.com -L: usbb2k-api-dev@nongnu.org S: Maintained F: Documentation/input/devices/yealink.rst F: drivers/input/misc/yealink.* diff --combined arch/x86/Kconfig index 47a2ff9096dad,d422247b28822..28401c16b0c3f --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@@ -28,7 -28,6 +28,7 @@@ config X86_6 select ARCH_HAS_GIGANTIC_PAGE select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_SUPPORTS_PER_VMA_LOCK + select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE select HAVE_ARCH_SOFT_DIRTY select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE @@@ -80,7 -79,6 +80,7 @@@ config X8 select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE select ARCH_HAS_DEVMEM_IS_ALLOWED + select ARCH_HAS_DMA_OPS if GART_IOMMU || XEN select ARCH_HAS_EARLY_DEBUG if KGDB select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER @@@ -109,6 -107,7 +109,7 @@@ select ARCH_HAS_DEBUG_WX select ARCH_HAS_ZONE_DMA_SET if EXPERT select ARCH_HAVE_NMI_SAFE_CMPXCHG + select ARCH_HAVE_EXTRA_ELF_NOTES select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT @@@ -298,7 -297,6 +299,7 @@@ select NEED_PER_CPU_EMBED_FIRST_CHUNK select NEED_PER_CPU_PAGE_FIRST_CHUNK select NEED_SG_DMA_LENGTH + select NUMA_MEMBLKS if NUMA select PCI_DOMAINS if PCI select PCI_LOCKLESS_CONFIG if PCI select PERF_EVENTS @@@ -946,6 -944,7 +947,6 @@@ config DM
config GART_IOMMU bool "Old AMD GART IOMMU support" - select DMA_OPS select IOMMU_HELPER select SWIOTLB depends on X86_64 && PCI && AMD_NB @@@ -1601,6 -1600,14 +1602,6 @@@ config X86_64_ACPI_NUM help Enable ACPI SRAT based node topology detection.
-config NUMA_EMU - bool "NUMA emulation" - depends on NUMA - help - Enable NUMA emulation. A flat machine will be split - into virtual nodes when booted with "numa=fake=N", where N is the - number of nodes. This is only useful for debugging. - config NODES_SHIFT int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP range 1 10 @@@ -1800,7 -1807,6 +1801,7 @@@ config X86_PA def_bool y prompt "x86 PAT support" if EXPERT depends on MTRR + select ARCH_USES_PG_ARCH_2 help Use PAT attributes to setup page level cache control.
@@@ -1812,6 -1818,10 +1813,6 @@@
If unsure, say Y.
-config ARCH_USES_PG_UNCACHED - def_bool y - depends on X86_PAT - config X86_UMIP def_bool y prompt "User Mode Instruction Prevention" if EXPERT @@@ -1880,10 -1890,6 +1881,10 @@@ config X86_INTEL_MEMORY_PROTECTION_KEY
If unsure, say y.
+config ARCH_PKEY_BITS + int + default 4 + choice prompt "TSX enable mode" depends on CPU_SUP_INTEL @@@ -2421,6 -2427,14 +2422,14 @@@ config CFI_AUTO_DEFAUL
source "kernel/livepatch/Kconfig"
+ config X86_BUS_LOCK_DETECT + bool "Split Lock Detect and Bus Lock Detect support" + depends on CPU_SUP_INTEL || CPU_SUP_AMD + default y + help + Enable Split Lock Detect and Bus Lock Detect functionalities. + See file:Documentation/arch/x86/buslock.rst for more information. + endmenu
config CC_HAS_NAMED_AS @@@ -2605,24 -2619,15 +2614,15 @@@ config MITIGATION_SL against straight line speculation. The kernel image might be slightly larger.
- config MITIGATION_GDS_FORCE - bool "Force GDS Mitigation" + config MITIGATION_GDS + bool "Mitigate Gather Data Sampling" depends on CPU_SUP_INTEL - default n + default y help - Gather Data Sampling (GDS) is a hardware vulnerability which allows - unprivileged speculative access to data which was previously stored in - vector registers. - - This option is equivalent to setting gather_data_sampling=force on the - command line. The microcode mitigation is used if present, otherwise - AVX is disabled as a mitigation. On affected systems that are missing - the microcode any userspace code that unconditionally uses AVX will - break with this option set. - - Setting this option on systems not vulnerable to GDS has no effect. - - If in doubt, say N. + Enable mitigation for Gather Data Sampling (GDS). GDS is a hardware + vulnerability which allows unprivileged speculative access to data + which was previously stored in vector registers. The attacker uses gather + instructions to infer the stale vector register data.
config MITIGATION_RFDS bool "RFDS Mitigation" @@@ -2645,6 -2650,107 +2645,107 @@@ config MITIGATION_SPECTRE_BH indirect branches. See file:Documentation/admin-guide/hw-vuln/spectre.rst
+ config MITIGATION_MDS + bool "Mitigate Microarchitectural Data Sampling (MDS) hardware bug" + depends on CPU_SUP_INTEL + default y + help + Enable mitigation for Microarchitectural Data Sampling (MDS). MDS is + a hardware vulnerability which allows unprivileged speculative access + to data which is available in various CPU internal buffers. + See also file:Documentation/admin-guide/hw-vuln/mds.rst + + config MITIGATION_TAA + bool "Mitigate TSX Asynchronous Abort (TAA) hardware bug" + depends on CPU_SUP_INTEL + default y + help + Enable mitigation for TSX Asynchronous Abort (TAA). TAA is a hardware + vulnerability that allows unprivileged speculative access to data + which is available in various CPU internal buffers by using + asynchronous aborts within an Intel TSX transactional region. + See also file:Documentation/admin-guide/hw-vuln/tsx_async_abort.rst + + config MITIGATION_MMIO_STALE_DATA + bool "Mitigate MMIO Stale Data hardware bug" + depends on CPU_SUP_INTEL + default y + help + Enable mitigation for MMIO Stale Data hardware bugs. Processor MMIO + Stale Data Vulnerabilities are a class of memory-mapped I/O (MMIO) + vulnerabilities that can expose data. The vulnerabilities require the + attacker to have access to MMIO. + See also + file:Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst + + config MITIGATION_L1TF + bool "Mitigate L1 Terminal Fault (L1TF) hardware bug" + depends on CPU_SUP_INTEL + default y + help + Mitigate L1 Terminal Fault (L1TF) hardware bug. L1 Terminal Fault is a + hardware vulnerability which allows unprivileged speculative access to data + available in the Level 1 Data Cache. 
+ See <file:Documentation/admin-guide/hw-vuln/l1tf.rst> + + config MITIGATION_RETBLEED + bool "Mitigate RETBleed hardware bug" + depends on (CPU_SUP_INTEL && MITIGATION_SPECTRE_V2) || MITIGATION_UNRET_ENTRY || MITIGATION_IBPB_ENTRY + default y + help + Enable mitigation for RETBleed (Arbitrary Speculative Code Execution + with Return Instructions) vulnerability. RETBleed is a speculative + execution attack which takes advantage of microarchitectural behavior + in many modern microprocessors, similar to Spectre v2. An + unprivileged attacker can use these flaws to bypass conventional + memory security restrictions to gain read access to privileged memory + that would otherwise be inaccessible. + + config MITIGATION_SPECTRE_V1 + bool "Mitigate SPECTRE V1 hardware bug" + default y + help + Enable mitigation for Spectre V1 (Bounds Check Bypass). Spectre V1 is a + class of side channel attacks that takes advantage of speculative + execution that bypasses conditional branch instructions used for + memory access bounds check. + See also file:Documentation/admin-guide/hw-vuln/spectre.rst + + config MITIGATION_SPECTRE_V2 + bool "Mitigate SPECTRE V2 hardware bug" + default y + help + Enable mitigation for Spectre V2 (Branch Target Injection). Spectre + V2 is a class of side channel attacks that takes advantage of + indirect branch predictors inside the processor. In Spectre variant 2 + attacks, the attacker can steer speculative indirect branches in the + victim to gadget code by poisoning the branch target buffer of a CPU + used for predicting indirect branch addresses. + See also file:Documentation/admin-guide/hw-vuln/spectre.rst + + config MITIGATION_SRBDS + bool "Mitigate Special Register Buffer Data Sampling (SRBDS) hardware bug" + depends on CPU_SUP_INTEL + default y + help + Enable mitigation for Special Register Buffer Data Sampling (SRBDS). 
+ SRBDS is a hardware vulnerability that allows Microarchitectural Data + Sampling (MDS) techniques to infer values returned from special + register accesses. An unprivileged user can extract values returned + from RDRAND and RDSEED executed on another core or sibling thread + using MDS techniques. + See also + file:Documentation/admin-guide/hw-vuln/special-register-buffer-data-sampling.rst + + config MITIGATION_SSB + bool "Mitigate Speculative Store Bypass (SSB) hardware bug" + default y + help + Enable mitigation for Speculative Store Bypass (SSB). SSB is a + hardware security vulnerability and its exploitation takes advantage + of speculative execution in a similar way to the Meltdown and Spectre + security vulnerabilities. + endif
config ARCH_HAS_ADD_PAGES diff --combined arch/x86/include/asm/mmu_context.h index 80f2a3187aa66,19091ebb86338..2886cb668d7fa --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@@ -88,7 -88,13 +88,13 @@@ static inline void switch_ldt(struct mm #ifdef CONFIG_ADDRESS_MASKING static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm) { - return mm->context.lam_cr3_mask; + /* + * When switch_mm_irqs_off() is called for a kthread, it may race with + * LAM enablement. switch_mm_irqs_off() uses the LAM mask to do two + * things: populate CR3 and populate 'cpu_tlbstate.lam'. Make sure it + * reads a single value for both. + */ + return READ_ONCE(mm->context.lam_cr3_mask); }
static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm) @@@ -232,6 -238,11 +238,6 @@@ static inline bool is_64bit_mm(struct m } #endif
-static inline void arch_unmap(struct mm_struct *mm, unsigned long start, - unsigned long end) -{ -} - /* * We only want to enforce protection keys on the current process * because we effectively have no access to PKRU for other diff --combined arch/x86/include/asm/processor.h index 775acbdea1a96,399f7d1c4c61f..4a686f0e5dbf6 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@@ -582,7 -582,8 +582,8 @@@ extern void switch_gdt_and_percpu_base( extern void load_direct_gdt(int); extern void load_fixmap_gdt(int); extern void cpu_init(void); - extern void cpu_init_exception_handling(void); + extern void cpu_init_exception_handling(bool boot_cpu); + extern void cpu_init_replace_early_idt(void); extern void cr4_init(void);
extern void set_task_blockstep(struct task_struct *task, bool on); @@@ -691,6 -692,8 +692,6 @@@ static inline u32 per_cpu_l2c_id(unsign }
#ifdef CONFIG_CPU_SUP_AMD -extern u32 amd_get_highest_perf(void); - /* * Issue a DIV 0/1 insn to clear any division data from previous DIV * operations. @@@ -703,6 -706,7 +704,6 @@@ static __always_inline void amd_clear_d
extern void amd_check_microcode(void); #else -static inline u32 amd_get_highest_perf(void) { return 0; } static inline void amd_clear_divider(void) { } static inline void amd_check_microcode(void) { } #endif diff --combined arch/x86/kernel/cpu/mshyperv.c index ead967479fa63,3d4237f275696..d18078834deda --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@@ -16,7 -16,6 +16,6 @@@ #include <linux/interrupt.h> #include <linux/irq.h> #include <linux/kexec.h> - #include <linux/i8253.h> #include <linux/random.h> #include <asm/processor.h> #include <asm/hypervisor.h> @@@ -199,8 -198,8 +198,8 @@@ static void hv_machine_shutdown(void * Call hv_cpu_die() on all the CPUs, otherwise later the hypervisor * corrupts the old VP Assist Pages and can crash the kexec kernel. */ - if (kexec_in_progress && hyperv_init_cpuhp > 0) - cpuhp_remove_state(hyperv_init_cpuhp); + if (kexec_in_progress) + cpuhp_remove_state(CPUHP_AP_HYPERV_ONLINE);
/* The function calls stop_other_cpus(). */ native_machine_shutdown(); @@@ -424,7 -423,6 +423,7 @@@ static void __init ms_hyperv_init_platf ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { x86_platform.calibrate_tsc = hv_get_tsc_khz; x86_platform.calibrate_cpu = hv_get_tsc_khz; + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); }
if (ms_hyperv.priv_high & HV_ISOLATION) { @@@ -450,23 -448,9 +449,23 @@@ ms_hyperv.hints &= ~HV_X64_APIC_ACCESS_RECOMMENDED;
if (!ms_hyperv.paravisor_present) { - /* To be supported: more work is required. */ + /* + * Mark the Hyper-V TSC page feature as disabled + * in a TDX VM without paravisor so that the + * Invariant TSC, which is a better clocksource + * anyway, is used instead. + */ ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE;
+ /* + * The Invariant TSC is expected to be available + * in a TDX VM without paravisor, but if not, + * print a warning message. The slower Hyper-V MSR-based + * Ref Counter should end up being the clocksource. + */ + if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT)) + pr_warn("Hyper-V: Invariant TSC is unavailable\n"); + /* HV_MSR_CRASH_CTL is unsupported. */ ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
@@@ -537,16 -521,6 +536,6 @@@ if (efi_enabled(EFI_BOOT)) x86_platform.get_nmi_reason = hv_get_nmi_reason;
- /* - * Hyper-V VMs have a PIT emulation quirk such that zeroing the - * counter register during PIT shutdown restarts the PIT. So it - * continues to interrupt @18.2 HZ. Setting i8253_clear_counter - * to false tells pit_shutdown() not to zero the counter so that - * the PIT really is shutdown. Generation 2 VMs don't have a PIT, - * and setting this value has no effect. - */ - i8253_clear_counter_on_shutdown = false; - #if IS_ENABLED(CONFIG_HYPERV) if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) || ms_hyperv.paravisor_present) diff --combined block/blk-mq.c index 3f1f7d0b3ff35,aa28157b1aafc..831c5cf5d8740 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@@ -1128,7 -1128,7 +1128,7 @@@ static void blk_complete_reqs(struct ll rq->q->mq_ops->complete(rq); }
- static __latent_entropy void blk_done_softirq(struct softirq_action *h) + static __latent_entropy void blk_done_softirq(void) { blk_complete_reqs(this_cpu_ptr(&blk_cpu_done)); } @@@ -2753,7 -2753,6 +2753,7 @@@ static void blk_mq_dispatch_plug_list(s void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { struct request *rq; + unsigned int depth;
/* * We may have been called recursively midway through handling @@@ -2764,7 -2763,6 +2764,7 @@@ */ if (plug->rq_count == 0) return; + depth = plug->rq_count; plug->rq_count = 0;
if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { @@@ -2772,7 -2770,6 +2772,7 @@@
rq = rq_list_peek(&plug->mq_list); q = rq->q; + trace_block_unplug(q, depth, true);
/* * Peek first request and see if we have a ->queue_rqs() hook. @@@ -2942,7 -2939,7 +2942,7 @@@ void blk_mq_submit_bio(struct bio *bio struct blk_plug *plug = current->plug; const int is_sync = op_is_sync(bio->bi_opf); struct blk_mq_hw_ctx *hctx; - unsigned int nr_segs = 1; + unsigned int nr_segs; struct request *rq; blk_status_t ret;
@@@ -2984,10 -2981,11 +2984,10 @@@ goto queue_exit; }
- if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { - bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); - if (!bio) - goto queue_exit; - } + bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); + if (!bio) + goto queue_exit; + if (!bio_integrity_prep(bio)) goto queue_exit;
diff --combined drivers/gpu/drm/i915/i915_utils.c index b34a2d3d331d6,f2ba51c20e975..2576f8f6c0f69 --- a/drivers/gpu/drm/i915/i915_utils.c +++ b/drivers/gpu/drm/i915/i915_utils.c @@@ -11,10 -11,51 +11,10 @@@ #include "i915_reg.h" #include "i915_utils.h"
-#define FDO_BUG_MSG "Please file a bug on drm/i915; see " FDO_BUG_URL " for details." - -void -__i915_printk(struct drm_i915_private *dev_priv, const char *level, - const char *fmt, ...) -{ - static bool shown_bug_once; - struct device *kdev = dev_priv->drm.dev; - bool is_error = level[1] <= KERN_ERR[1]; - bool is_debug = level[1] == KERN_DEBUG[1]; - struct va_format vaf; - va_list args; - - if (is_debug && !drm_debug_enabled(DRM_UT_DRIVER)) - return; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - if (is_error) - dev_printk(level, kdev, "%pV", &vaf); - else - dev_printk(level, kdev, "[" DRM_NAME ":%ps] %pV", - __builtin_return_address(0), &vaf); - - va_end(args); - - if (is_error && !shown_bug_once) { - /* - * Ask the user to file a bug report for the error, except - * if they may have caused the bug by fiddling with unsafe - * module parameters. - */ - if (!test_taint(TAINT_USER)) - dev_notice(kdev, "%s", FDO_BUG_MSG); - shown_bug_once = true; - } -} - void add_taint_for_CI(struct drm_i915_private *i915, unsigned int taint) { - __i915_printk(i915, KERN_NOTICE, "CI tainted:%#x by %pS\n", - taint, (void *)_RET_IP_); + drm_notice(&i915->drm, "CI tainted: %#x by %pS\n", + taint, __builtin_return_address(0));
/* Failures that occur during fault injection testing are expected */ if (!i915_error_injected()) @@@ -33,9 -74,9 +33,9 @@@ int __i915_inject_probe_error(struct dr if (++i915_probe_fail_count < i915_modparams.inject_probe_failure) return 0;
- __i915_printk(i915, KERN_INFO, - "Injecting failure %d at checkpoint %u [%s:%d]\n", - err, i915_modparams.inject_probe_failure, func, line); + drm_info(&i915->drm, "Injecting failure %d at checkpoint %u [%s:%d]\n", + err, i915_modparams.inject_probe_failure, func, line); + i915_modparams.inject_probe_failure = 0; return err; } @@@ -69,7 -110,7 +69,7 @@@ void set_timer_ms(struct timer_list *t * Paranoia to make sure the compiler computes the timeout before * loading 'jiffies' as jiffies is volatile and may be updated in * the background by a timer tick. All to reduce the complexity - * of the addition and reduce the risk of losing a jiffie. + * of the addition and reduce the risk of losing a jiffy. */ barrier();
diff --combined drivers/gpu/drm/v3d/v3d_bo.c index ecb80fd75b1a0,9eafe53a8f41a..ebe52bef4ffb8 --- a/drivers/gpu/drm/v3d/v3d_bo.c +++ b/drivers/gpu/drm/v3d/v3d_bo.c @@@ -26,17 -26,6 +26,17 @@@ #include "v3d_drv.h" #include "uapi/drm/v3d_drm.h"
+static enum drm_gem_object_status v3d_gem_status(struct drm_gem_object *obj) +{ + struct v3d_bo *bo = to_v3d_bo(obj); + enum drm_gem_object_status res = 0; + + if (bo->base.pages) + res |= DRM_GEM_OBJECT_RESIDENT; + + return res; +} + /* Called by DRM core on the last userspace/kernel unreference of the * BO. */ @@@ -74,7 -63,6 +74,7 @@@ static const struct drm_gem_object_func .vmap = drm_gem_shmem_object_vmap, .vunmap = drm_gem_shmem_object_vunmap, .mmap = drm_gem_shmem_object_mmap, + .status = v3d_gem_status, .vm_ops = &drm_gem_shmem_vm_ops, };
@@@ -291,7 -279,7 +291,7 @@@ v3d_wait_bo_ioctl(struct drm_device *de else args->timeout_ns = 0;
- /* Asked to wait beyond the jiffie/scheduler precision? */ + /* Asked to wait beyond the jiffy/scheduler precision? */ if (ret == -ETIME && args->timeout_ns) ret = -EAGAIN;
diff --combined drivers/hwmon/k10temp.c index 85a7632f3b50a,f96b91e433126..7dc19c5d62ac3 --- a/drivers/hwmon/k10temp.c +++ b/drivers/hwmon/k10temp.c @@@ -438,21 -438,16 +438,21 @@@ static int k10temp_probe(struct pci_de data->disp_negative = true; }
- if (boot_cpu_data.x86 == 0x15 && + data->is_zen = cpu_feature_enabled(X86_FEATURE_ZEN); + if (data->is_zen) { + data->temp_adjust_mask = ZEN_CUR_TEMP_RANGE_SEL_MASK; + data->read_tempreg = read_tempreg_nb_zen; + } else if (boot_cpu_data.x86 == 0x15 && ((boot_cpu_data.x86_model & 0xf0) == 0x60 || (boot_cpu_data.x86_model & 0xf0) == 0x70)) { data->read_htcreg = read_htcreg_nb_f15; data->read_tempreg = read_tempreg_nb_f15; - } else if (boot_cpu_data.x86 == 0x17 || boot_cpu_data.x86 == 0x18) { - data->temp_adjust_mask = ZEN_CUR_TEMP_RANGE_SEL_MASK; - data->read_tempreg = read_tempreg_nb_zen; - data->is_zen = true; + } else { + data->read_htcreg = read_htcreg_pci; + data->read_tempreg = read_tempreg_pci; + }
+ if (boot_cpu_data.x86 == 0x17 || boot_cpu_data.x86 == 0x18) { switch (boot_cpu_data.x86_model) { case 0x1: /* Zen */ case 0x8: /* Zen+ */ @@@ -474,6 -469,10 +474,6 @@@ break; } } else if (boot_cpu_data.x86 == 0x19) { - data->temp_adjust_mask = ZEN_CUR_TEMP_RANGE_SEL_MASK; - data->read_tempreg = read_tempreg_nb_zen; - data->is_zen = true; - switch (boot_cpu_data.x86_model) { case 0x0 ... 0x1: /* Zen3 SP3/TR */ case 0x8: /* Zen3 TR Chagall */ @@@ -497,6 -496,13 +497,6 @@@ k10temp_get_ccd_support(data, 12); break; } - } else if (boot_cpu_data.x86 == 0x1a) { - data->temp_adjust_mask = ZEN_CUR_TEMP_RANGE_SEL_MASK; - data->read_tempreg = read_tempreg_nb_zen; - data->is_zen = true; - } else { - data->read_htcreg = read_htcreg_pci; - data->read_tempreg = read_tempreg_pci; }
for (i = 0; i < ARRAY_SIZE(tctl_offset_table); i++) { @@@ -542,6 -548,7 +542,7 @@@ static const struct pci_device_id k10te { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) }, + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_1AH_M60H_DF_F3) }, { PCI_VDEVICE(HYGON, PCI_DEVICE_ID_AMD_17H_DF_F3) }, {} }; diff --combined fs/proc/base.c index 1ad51858528f7,632cf1fc8f8c1..b31283d81c52e --- a/fs/proc/base.c +++ b/fs/proc/base.c @@@ -85,7 -85,6 +85,7 @@@ #include <linux/elf.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> +#include <linux/fs_parser.h> #include <linux/fs_struct.h> #include <linux/slab.h> #include <linux/sched/autogroup.h> @@@ -118,40 -117,6 +118,40 @@@ static u8 nlink_tid __ro_after_init; static u8 nlink_tgid __ro_after_init;
+enum proc_mem_force { + PROC_MEM_FORCE_ALWAYS, + PROC_MEM_FORCE_PTRACE, + PROC_MEM_FORCE_NEVER +}; + +static enum proc_mem_force proc_mem_force_override __ro_after_init = + IS_ENABLED(CONFIG_PROC_MEM_NO_FORCE) ? PROC_MEM_FORCE_NEVER : + IS_ENABLED(CONFIG_PROC_MEM_FORCE_PTRACE) ? PROC_MEM_FORCE_PTRACE : + PROC_MEM_FORCE_ALWAYS; + +static const struct constant_table proc_mem_force_table[] __initconst = { + { "always", PROC_MEM_FORCE_ALWAYS }, + { "ptrace", PROC_MEM_FORCE_PTRACE }, + { "never", PROC_MEM_FORCE_NEVER }, + { } +}; + +static int __init early_proc_mem_force_override(char *buf) +{ + if (!buf) + return -EINVAL; + + /* + * lookup_constant() defaults to proc_mem_force_override to preserve + * the initial Kconfig choice in case an invalid param gets passed. + */ + proc_mem_force_override = lookup_constant(proc_mem_force_table, + buf, proc_mem_force_override); + + return 0; +} +early_param("proc_mem.force_override", early_proc_mem_force_override); + struct pid_entry { const char *name; unsigned int len; @@@ -862,31 -827,12 +862,31 @@@ static int __mem_open(struct inode *ino
static int mem_open(struct inode *inode, struct file *file) { - int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH); - - /* OK to pass negative loff_t, we can catch out-of-range */ - file->f_mode |= FMODE_UNSIGNED_OFFSET; + if (WARN_ON_ONCE(!(file->f_op->fop_flags & FOP_UNSIGNED_OFFSET))) + return -EINVAL; + return __mem_open(inode, file, PTRACE_MODE_ATTACH); +}
- return ret; +static bool proc_mem_foll_force(struct file *file, struct mm_struct *mm) +{ + struct task_struct *task; + bool ptrace_active = false; + + switch (proc_mem_force_override) { + case PROC_MEM_FORCE_NEVER: + return false; + case PROC_MEM_FORCE_PTRACE: + task = get_proc_task(file_inode(file)); + if (task) { + ptrace_active = READ_ONCE(task->ptrace) && + READ_ONCE(task->mm) == mm && + READ_ONCE(task->parent) == current; + put_task_struct(task); + } + return ptrace_active; + default: + return true; + } }
static ssize_t mem_rw(struct file *file, char __user *buf, @@@ -909,9 -855,7 +909,9 @@@ if (!mmget_not_zero(mm)) goto free;
- flags = FOLL_FORCE | (write ? FOLL_WRITE : 0); + flags = write ? FOLL_WRITE : 0; + if (proc_mem_foll_force(file, mm)) + flags |= FOLL_FORCE;
while (count > 0) { size_t this_len = min_t(size_t, count, PAGE_SIZE); @@@ -988,7 -932,6 +988,7 @@@ static const struct file_operations pro .write = mem_write, .open = mem_open, .release = mem_release, + .fop_flags = FOP_UNSIGNED_OFFSET, };
static int environ_open(struct inode *inode, struct file *file) @@@ -2333,8 -2276,8 +2333,8 @@@ proc_map_files_instantiate(struct dentr inode->i_op = &proc_map_files_link_inode_operations; inode->i_size = 64;
- d_set_d_op(dentry, &tid_map_files_dentry_operations); - return d_splice_alias(inode, dentry); + return proc_splice_unmountable(inode, dentry, + &tid_map_files_dentry_operations); }
static struct dentry *proc_map_files_lookup(struct inode *dir, @@@ -2513,13 -2456,13 +2513,13 @@@ static void *timers_start(struct seq_fi if (!tp->sighand) return ERR_PTR(-ESRCH);
- return seq_list_start(&tp->task->signal->posix_timers, *pos); + return seq_hlist_start(&tp->task->signal->posix_timers, *pos); }
static void *timers_next(struct seq_file *m, void *v, loff_t *pos) { struct timers_private *tp = m->private; - return seq_list_next(v, &tp->task->signal->posix_timers, pos); + return seq_hlist_next(v, &tp->task->signal->posix_timers, pos); }
static void timers_stop(struct seq_file *m, void *v) @@@ -2548,7 -2491,7 +2548,7 @@@ static int show_timer(struct seq_file * [SIGEV_THREAD] = "thread", };
- timer = list_entry((struct list_head *)v, struct k_itimer, list); + timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list); notify = timer->it_sigev_notify;
seq_printf(m, "ID: %d\n", timer->it_id); @@@ -2626,10 -2569,11 +2626,11 @@@ static ssize_t timerslack_ns_write(stru }
task_lock(p); - if (slack_ns == 0) - p->timer_slack_ns = p->default_timer_slack_ns; - else - p->timer_slack_ns = slack_ns; + if (rt_or_dl_task_policy(p)) + slack_ns = 0; + else if (slack_ns == 0) + slack_ns = p->default_timer_slack_ns; + p->timer_slack_ns = slack_ns; task_unlock(p);
out: @@@ -3927,12 -3871,12 +3928,12 @@@ static int proc_task_readdir(struct fil if (!dir_emit_dots(file, ctx)) return 0;
- /* f_version caches the tgid value that the last readdir call couldn't - * return. lseek aka telldir automagically resets f_version to 0. + /* We cache the tgid value that the last readdir call couldn't + * return and lseek resets it to 0. */ ns = proc_pid_ns(inode->i_sb); - tid = (int)file->f_version; - file->f_version = 0; + tid = (int)(intptr_t)file->private_data; + file->private_data = NULL; for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); task; task = next_tid(task), ctx->pos++) { @@@ -3947,7 -3891,7 +3948,7 @@@ proc_task_instantiate, task, NULL)) { /* returning this tgid failed, save it as the first * pid for the next readdir call */ - file->f_version = (u64)tid; + file->private_data = (void *)(intptr_t)tid; put_task_struct(task); break; } @@@ -3972,24 -3916,6 +3973,24 @@@ static int proc_task_getattr(struct mnt return 0; }
+/* + * proc_task_readdir() set @file->private_data to a positive integer + * value, so casting that to u64 is safe. generic_llseek_cookie() will + * set @cookie to 0, so casting to an int is safe. The WARN_ON_ONCE() is + * here to catch any unexpected change in behavior either in + * proc_task_readdir() or generic_llseek_cookie(). + */ +static loff_t proc_dir_llseek(struct file *file, loff_t offset, int whence) +{ + u64 cookie = (u64)(intptr_t)file->private_data; + loff_t off; + + off = generic_llseek_cookie(file, offset, whence, &cookie); + WARN_ON_ONCE(cookie > INT_MAX); + file->private_data = (void *)(intptr_t)cookie; /* serialized by f_pos_lock */ + return off; +} + static const struct inode_operations proc_task_inode_operations = { .lookup = proc_task_lookup, .getattr = proc_task_getattr, @@@ -4000,7 -3926,7 +4001,7 @@@ static const struct file_operations proc_task_operations = { .read = generic_read_dir, .iterate_shared = proc_task_readdir, - .llseek = generic_file_llseek, + .llseek = proc_dir_llseek, };
void __init set_proc_pid_nlink(void) diff --combined fs/select.c index 1a4849e2afb97,ad171b7a5c11f..cae82e9e0dcc0 --- a/fs/select.c +++ b/fs/select.c @@@ -77,19 -77,16 +77,16 @@@ u64 select_estimate_accuracy(struct tim { u64 ret; struct timespec64 now; + u64 slack = current->timer_slack_ns;
- /* - * Realtime tasks get a slack of 0 for obvious reasons. - */ - - if (rt_task(current)) + if (slack == 0) return 0;
ktime_get_ts64(&now); now = timespec64_sub(*tv, now); ret = __estimate_accuracy(&now); - if (ret < current->timer_slack_ns) - return current->timer_slack_ns; + if (ret < slack) + return slack; return ret; }
@@@ -840,7 -837,7 +837,7 @@@ SYSCALL_DEFINE1(old_select, struct sel_ struct poll_list { struct poll_list *next; unsigned int len; - struct pollfd entries[]; + struct pollfd entries[] __counted_by(len); };
#define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd)) diff --combined include/linux/pci_ids.h index 2c94d4004dd50,91182aa1d2ec5..e4bddb9277956 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@@ -580,6 -580,7 +580,7 @@@ #define PCI_DEVICE_ID_AMD_19H_M78H_DF_F3 0x12fb #define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3 0x12c3 #define PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3 0x16fb + #define PCI_DEVICE_ID_AMD_1AH_M60H_DF_F3 0x124b #define PCI_DEVICE_ID_AMD_1AH_M70H_DF_F3 0x12bb #define PCI_DEVICE_ID_AMD_MI200_DF_F3 0x14d3 #define PCI_DEVICE_ID_AMD_MI300_DF_F3 0x152b @@@ -2661,8 -2662,6 +2662,8 @@@ #define PCI_DEVICE_ID_DCI_PCCOM8 0x0002 #define PCI_DEVICE_ID_DCI_PCCOM2 0x0004
+#define PCI_VENDOR_ID_GLENFLY 0x6766 + #define PCI_VENDOR_ID_INTEL 0x8086 #define PCI_DEVICE_ID_INTEL_EESSC 0x0008 #define PCI_DEVICE_ID_INTEL_HDA_CML_LP 0x02c8 diff --combined include/linux/perf_event.h index e336306b8c08e,701549967c185..7a852832dc120 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@@ -168,6 -168,9 +168,9 @@@ struct hw_perf_event struct hw_perf_event_extra extra_reg; struct hw_perf_event_extra branch_reg; }; + struct { /* aux / Intel-PT */ + u64 aux_config; + }; struct { /* software */ struct hrtimer hrtimer; }; @@@ -963,12 -966,16 +966,16 @@@ struct perf_event_context struct rcu_head rcu_head;
/* - * Sum (event->pending_work + event->pending_work) + * The count of events for which using the switch-out fast path + * should be avoided. + * + * Sum (event->pending_work + events with + * (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))) * * The SIGTRAP is targeted at ctx->task, as such it won't do changing * that until the signal is delivered. */ - local_t nr_pending; + local_t nr_no_switch_fast; };
struct perf_cpu_pmu_context { @@@ -1602,7 -1609,13 +1609,7 @@@ static inline int perf_is_paranoid(void return sysctl_perf_event_paranoid > -1; }
-static inline int perf_allow_kernel(struct perf_event_attr *attr) -{ - if (sysctl_perf_event_paranoid > 1 && !perfmon_capable()) - return -EACCES; - - return security_perf_event_open(attr, PERF_SECURITY_KERNEL); -} +int perf_allow_kernel(struct perf_event_attr *attr);
static inline int perf_allow_cpu(struct perf_event_attr *attr) { diff --combined include/linux/uprobes.h index 493dc95d912c9,2b294bf1881fe..e6f4e73125ffa --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@@ -16,6 -16,7 +16,7 @@@ #include <linux/types.h> #include <linux/wait.h>
+ struct uprobe; struct vm_area_struct; struct mm_struct; struct inode; @@@ -27,22 -28,22 +28,22 @@@ struct page
#define MAX_URETPROBE_DEPTH 64
- enum uprobe_filter_ctx { - UPROBE_FILTER_REGISTER, - UPROBE_FILTER_UNREGISTER, - UPROBE_FILTER_MMAP, - }; - struct uprobe_consumer { + /* + * handler() can return UPROBE_HANDLER_REMOVE to signal the need to + * unregister uprobe for current process. If UPROBE_HANDLER_REMOVE is + * returned, filter() callback has to be implemented as well and it + * should return false to "confirm" the decision to uninstall uprobe + * for the current process. If filter() is omitted or returns true, + * UPROBE_HANDLER_REMOVE is effectively ignored. + */ int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs); int (*ret_handler)(struct uprobe_consumer *self, unsigned long func, struct pt_regs *regs); - bool (*filter)(struct uprobe_consumer *self, - enum uprobe_filter_ctx ctx, - struct mm_struct *mm); + bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm);
- struct uprobe_consumer *next; + struct list_head cons_node; };
#ifdef CONFIG_UPROBES @@@ -76,6 -77,8 +77,8 @@@ struct uprobe_task struct uprobe *active_uprobe; unsigned long xol_vaddr;
+ struct arch_uprobe *auprobe; + struct return_instance *return_instances; unsigned int depth; }; @@@ -110,10 -113,10 +113,10 @@@ extern bool is_trap_insn(uprobe_opcode_ extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); - extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); - extern int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); - extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); - extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); + extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); + extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); + extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); + extern void uprobe_unregister_sync(void); extern int uprobe_mmap(struct vm_area_struct *vma); extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void uprobe_start_dup_mmap(void); @@@ -126,6 -129,7 +129,6 @@@ extern int uprobe_pre_sstep_notifier(st extern void uprobe_notify_resume(struct pt_regs *regs); extern bool uprobe_deny_signal(void); extern bool arch_uprobe_skip_sstep(struct arch_uprobe *aup, struct pt_regs *regs); -extern void uprobe_clear_state(struct mm_struct *mm); extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long addr); extern int arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs); extern int arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs); @@@ -150,22 -154,21 +153,21 @@@ static inline void uprobes_init(void
#define uprobe_get_trap_addr(regs) instruction_pointer(regs)
- static inline int - uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) - { - return -ENOSYS; - } - static inline int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc) + static inline struct uprobe * + uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc) { - return -ENOSYS; + return ERR_PTR(-ENOSYS); } static inline int - uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool add) + uprobe_apply(struct uprobe* uprobe, struct uprobe_consumer *uc, bool add) { return -ENOSYS; } static inline void - uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) + uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc) + { + } + static inline void uprobe_unregister_sync(void) { } static inline int uprobe_mmap(struct vm_area_struct *vma) diff --combined include/uapi/linux/elf.h index 81762ff3c99e1,e30a9b47dc87f..b9935988da5cf --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@@ -411,6 -411,7 +411,7 @@@ typedef struct elf64_shdr #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ /* Old binutils treats 0x203 as a CET state */ #define NT_X86_SHSTK 0x204 /* x86 SHSTK state */ + #define NT_X86_XSAVE_LAYOUT 0x205 /* XSAVE layout description */ #define NT_S390_HIGH_GPRS 0x300 /* s390 upper register halves */ #define NT_S390_TIMER 0x301 /* s390 timer register */ #define NT_S390_TODCMP 0x302 /* s390 TOD clock comparator register */ @@@ -441,7 -442,6 +442,7 @@@ #define NT_ARM_ZA 0x40c /* ARM SME ZA registers */ #define NT_ARM_ZT 0x40d /* ARM SME ZT registers */ #define NT_ARM_FPMR 0x40e /* ARM floating point mode register */ +#define NT_ARM_POE 0x40f /* ARM POE registers */ #define NT_ARC_V2 0x600 /* ARCv2 accumulator/extra registers */ #define NT_VMCOREDD 0x700 /* Vmcore Device Dump Note */ #define NT_MIPS_DSP 0x800 /* MIPS DSP ASE registers */ diff 
--combined kernel/events/core.c index b21c8f24a9876,67e115d4ef968..08b309946fcae --- a/kernel/events/core.c +++ b/kernel/events/core.c @@@ -155,20 -155,55 +155,55 @@@ static int cpu_function_call(int cpu, r return data.ret; }
+ enum event_type_t { + EVENT_FLEXIBLE = 0x01, + EVENT_PINNED = 0x02, + EVENT_TIME = 0x04, + EVENT_FROZEN = 0x08, + /* see ctx_resched() for details */ + EVENT_CPU = 0x10, + EVENT_CGROUP = 0x20, + + /* compound helpers */ + EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, + EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN, + }; + + static inline void __perf_ctx_lock(struct perf_event_context *ctx) + { + raw_spin_lock(&ctx->lock); + WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN); + } + static void perf_ctx_lock(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - raw_spin_lock(&cpuctx->ctx.lock); + __perf_ctx_lock(&cpuctx->ctx); if (ctx) - raw_spin_lock(&ctx->lock); + __perf_ctx_lock(ctx); + } + + static inline void __perf_ctx_unlock(struct perf_event_context *ctx) + { + /* + * If ctx_sched_in() didn't again set any ALL flags, clean up + * after ctx_sched_out() by clearing is_active. + */ + if (ctx->is_active & EVENT_FROZEN) { + if (!(ctx->is_active & EVENT_ALL)) + ctx->is_active = 0; + else + ctx->is_active &= ~EVENT_FROZEN; + } + raw_spin_unlock(&ctx->lock); }
static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { if (ctx) - raw_spin_unlock(&ctx->lock); - raw_spin_unlock(&cpuctx->ctx.lock); + __perf_ctx_unlock(ctx); + __perf_ctx_unlock(&cpuctx->ctx); }
#define TASK_TOMBSTONE ((void *)-1L) @@@ -264,6 -299,7 +299,7 @@@ static void event_function_call(struct { struct perf_event_context *ctx = event->ctx; struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */ + struct perf_cpu_context *cpuctx; struct event_function_struct efs = { .event = event, .func = func, @@@ -291,22 -327,25 +327,25 @@@ again if (!task_function_call(task, event_function, &efs)) return;
- raw_spin_lock_irq(&ctx->lock); + local_irq_disable(); + cpuctx = this_cpu_ptr(&perf_cpu_context); + perf_ctx_lock(cpuctx, ctx); /* * Reload the task pointer, it might have been changed by * a concurrent perf_event_context_sched_out(). */ task = ctx->task; - if (task == TASK_TOMBSTONE) { - raw_spin_unlock_irq(&ctx->lock); - return; - } + if (task == TASK_TOMBSTONE) + goto unlock; if (ctx->is_active) { - raw_spin_unlock_irq(&ctx->lock); + perf_ctx_unlock(cpuctx, ctx); + local_irq_enable(); goto again; } func(event, NULL, ctx, data); - raw_spin_unlock_irq(&ctx->lock); + unlock: + perf_ctx_unlock(cpuctx, ctx); + local_irq_enable(); }
/* @@@ -369,16 -408,6 +408,6 @@@ unlock (PERF_SAMPLE_BRANCH_KERNEL |\ PERF_SAMPLE_BRANCH_HV)
- enum event_type_t { - EVENT_FLEXIBLE = 0x1, - EVENT_PINNED = 0x2, - EVENT_TIME = 0x4, - /* see ctx_resched() for details */ - EVENT_CPU = 0x8, - EVENT_CGROUP = 0x10, - EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, - }; - /* * perf_sched_events : >0 events exist */ @@@ -685,30 -714,32 +714,32 @@@ do { ___p; \ })
+ #define for_each_epc(_epc, _ctx, _pmu, _cgroup) \ + list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \ + if (_cgroup && !_epc->nr_cgroups) \ + continue; \ + else if (_pmu && _epc->pmu != _pmu) \ + continue; \ + else + static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup) { struct perf_event_pmu_context *pmu_ctx;
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - if (cgroup && !pmu_ctx->nr_cgroups) - continue; + for_each_epc(pmu_ctx, ctx, NULL, cgroup) perf_pmu_disable(pmu_ctx->pmu); - } }
static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup) { struct perf_event_pmu_context *pmu_ctx;
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - if (cgroup && !pmu_ctx->nr_cgroups) - continue; + for_each_epc(pmu_ctx, ctx, NULL, cgroup) perf_pmu_enable(pmu_ctx->pmu); - } }
- static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type); - static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type); + static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); + static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
#ifdef CONFIG_CGROUP_PERF
@@@ -865,7 -896,7 +896,7 @@@ static void perf_cgroup_switch(struct t perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_ctx_disable(&cpuctx->ctx, true);
- ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP); + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); /* * must not be done before ctxswout due * to update_cgrp_time_from_cpuctx() in @@@ -877,7 -908,7 +908,7 @@@ * perf_cgroup_set_timestamp() in ctx_sched_in() * to not have to pass task around */ - ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP); + ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
perf_ctx_enable(&cpuctx->ctx, true); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); @@@ -1768,6 -1799,14 +1799,14 @@@ perf_event_groups_next(struct perf_even event = rb_entry_safe(rb_next(&event->group_node), \ typeof(*event), group_node))
+ /* + * Does the event attribute request inherit with PERF_SAMPLE_READ + */ + static inline bool has_inherit_and_sample_read(struct perf_event_attr *attr) + { + return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ); + } + /* * Add an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. @@@ -1798,6 -1837,8 +1837,8 @@@ list_add_event(struct perf_event *event ctx->nr_user++; if (event->attr.inherit_stat) ctx->nr_stat++; + if (has_inherit_and_sample_read(&event->attr)) + local_inc(&ctx->nr_no_switch_fast);
if (event->state > PERF_EVENT_STATE_OFF) perf_cgroup_event_enable(event, ctx); @@@ -2022,6 -2063,8 +2063,8 @@@ list_del_event(struct perf_event *event ctx->nr_user--; if (event->attr.inherit_stat) ctx->nr_stat--; + if (has_inherit_and_sample_read(&event->attr)) + local_dec(&ctx->nr_no_switch_fast);
list_del_rcu(&event->event_entry);
@@@ -2317,6 -2360,45 +2360,45 @@@ group_sched_out(struct perf_event *grou event_sched_out(event, ctx); }
+ static inline void + __ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final) + { + if (ctx->is_active & EVENT_TIME) { + if (ctx->is_active & EVENT_FROZEN) + return; + update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx, final); + } + } + + static inline void + ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) + { + __ctx_time_update(cpuctx, ctx, false); + } + + /* + * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock(). + */ + static inline void + ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) + { + ctx_time_update(cpuctx, ctx); + if (ctx->is_active & EVENT_TIME) + ctx->is_active |= EVENT_FROZEN; + } + + static inline void + ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event) + { + if (ctx->is_active & EVENT_TIME) { + if (ctx->is_active & EVENT_FROZEN) + return; + update_context_time(ctx); + update_cgrp_time_from_event(event); + } + } + #define DETACH_GROUP 0x01UL #define DETACH_CHILD 0x02UL #define DETACH_DEAD 0x04UL @@@ -2336,10 -2418,7 +2418,7 @@@ __perf_remove_from_context(struct perf_ struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx; unsigned long flags = (unsigned long)info;
- if (ctx->is_active & EVENT_TIME) { - update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx, false); - } + ctx_time_update(cpuctx, ctx);
/* * Ensure event_sched_out() switches to OFF, at the very least @@@ -2424,12 -2503,8 +2503,8 @@@ static void __perf_event_disable(struc if (event->state < PERF_EVENT_STATE_INACTIVE) return;
- if (ctx->is_active & EVENT_TIME) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - } - perf_pmu_disable(event->pmu_ctx->pmu); + ctx_time_update_event(ctx, event);
if (event == event->group_leader) group_sched_out(event, ctx); @@@ -2645,7 -2720,8 +2720,8 @@@ static void add_event_to_ctx(struct per }
static void task_ctx_sched_out(struct perf_event_context *ctx, - enum event_type_t event_type) + struct pmu *pmu, + enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
@@@ -2655,18 -2731,19 +2731,19 @@@ if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return;
- ctx_sched_out(ctx, event_type); + ctx_sched_out(ctx, pmu, event_type); }
static void perf_event_sched_in(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) + struct perf_event_context *ctx, + struct pmu *pmu) { - ctx_sched_in(&cpuctx->ctx, EVENT_PINNED); + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED); if (ctx) - ctx_sched_in(ctx, EVENT_PINNED); - ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE); + ctx_sched_in(ctx, pmu, EVENT_PINNED); + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); if (ctx) - ctx_sched_in(ctx, EVENT_FLEXIBLE); + ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE); }
/* @@@ -2684,16 -2761,12 +2761,12 @@@ * event_type is a bit mask of the types of events involved. For CPU events, * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE. */ - /* - * XXX: ctx_resched() reschedule entire perf_event_context while adding new - * event to the context or enabling existing event in the context. We can - * probably optimize it by rescheduling only affected pmu_ctx. - */ static void ctx_resched(struct perf_cpu_context *cpuctx, struct perf_event_context *task_ctx, - enum event_type_t event_type) + struct pmu *pmu, enum event_type_t event_type) { bool cpu_event = !!(event_type & EVENT_CPU); + struct perf_event_pmu_context *epc;
/* * If pinned groups are involved, flexible groups also need to be @@@ -2704,10 -2777,14 +2777,14 @@@
event_type &= EVENT_ALL;
- perf_ctx_disable(&cpuctx->ctx, false); + for_each_epc(epc, &cpuctx->ctx, pmu, false) + perf_pmu_disable(epc->pmu); + if (task_ctx) { - perf_ctx_disable(task_ctx, false); - task_ctx_sched_out(task_ctx, event_type); + for_each_epc(epc, task_ctx, pmu, false) + perf_pmu_disable(epc->pmu); + + task_ctx_sched_out(task_ctx, pmu, event_type); }
/* @@@ -2718,15 -2795,19 +2795,19 @@@ * - otherwise, do nothing more. */ if (cpu_event) - ctx_sched_out(&cpuctx->ctx, event_type); + ctx_sched_out(&cpuctx->ctx, pmu, event_type); else if (event_type & EVENT_PINNED) - ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE); + ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
- perf_event_sched_in(cpuctx, task_ctx); + perf_event_sched_in(cpuctx, task_ctx, pmu);
- perf_ctx_enable(&cpuctx->ctx, false); - if (task_ctx) - perf_ctx_enable(task_ctx, false); + for_each_epc(epc, &cpuctx->ctx, pmu, false) + perf_pmu_enable(epc->pmu); + + if (task_ctx) { + for_each_epc(epc, task_ctx, pmu, false) + perf_pmu_enable(epc->pmu); + } }
void perf_pmu_resched(struct pmu *pmu) @@@ -2735,7 -2816,7 +2816,7 @@@ struct perf_event_context *task_ctx = cpuctx->task_ctx;
perf_ctx_lock(cpuctx, task_ctx); - ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU); + ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU); perf_ctx_unlock(cpuctx, task_ctx); }
@@@ -2791,9 -2872,10 +2872,10 @@@ static int __perf_install_in_context(v #endif
if (reprogram) { - ctx_sched_out(ctx, EVENT_TIME); + ctx_time_freeze(cpuctx, ctx); add_event_to_ctx(event, ctx); - ctx_resched(cpuctx, task_ctx, get_event_type(event)); + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, + get_event_type(event)); } else { add_event_to_ctx(event, ctx); } @@@ -2936,8 -3018,7 +3018,7 @@@ static void __perf_event_enable(struct event->state <= PERF_EVENT_STATE_ERROR) return;
- if (ctx->is_active) - ctx_sched_out(ctx, EVENT_TIME); + ctx_time_freeze(cpuctx, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); perf_cgroup_event_enable(event, ctx); @@@ -2945,25 -3026,21 +3026,21 @@@ if (!ctx->is_active) return;
- if (!event_filter_match(event)) { - ctx_sched_in(ctx, EVENT_TIME); + if (!event_filter_match(event)) return; - }
/* * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on. */ - if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { - ctx_sched_in(ctx, EVENT_TIME); + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) return; - }
task_ctx = cpuctx->task_ctx; if (ctx->task) WARN_ON_ONCE(task_ctx != ctx);
- ctx_resched(cpuctx, task_ctx, get_event_type(event)); + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event)); }
/* @@@ -3231,7 -3308,7 +3308,7 @@@ static void __pmu_ctx_sched_out(struct struct perf_event *event, *tmp; struct pmu *pmu = pmu_ctx->pmu;
- if (ctx->task && !ctx->is_active) { + if (ctx->task && !(ctx->is_active & EVENT_ALL)) { struct perf_cpu_pmu_context *cpc;
cpc = this_cpu_ptr(pmu->cpu_pmu_context); @@@ -3239,7 -3316,7 +3316,7 @@@ cpc->task_epc = NULL; }
- if (!event_type) + if (!(event_type & EVENT_ALL)) return;
perf_pmu_disable(pmu); @@@ -3265,8 -3342,17 +3342,17 @@@ perf_pmu_enable(pmu); }
+ /* + * Be very careful with the @pmu argument since this will change ctx state. + * The @pmu argument works for ctx_resched(), because that is symmetric in + * ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant. + * + * However, if you were to be asymmetrical, you could end up with messed up + * state, eg. ctx->is_active cleared even though most EPCs would still actually + * be active. + */ static void - ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) + ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_pmu_context *pmu_ctx; @@@ -3297,34 -3383,36 +3383,36 @@@ * * would only update time for the pinned events. */ - if (is_active & EVENT_TIME) { - /* update (and stop) ctx time */ - update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx); + __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx); + + /* + * CPU-release for the below ->is_active store, + * see __load_acquire() in perf_event_time_now() + */ + barrier(); + ctx->is_active &= ~event_type; + + if (!(ctx->is_active & EVENT_ALL)) { /* - * CPU-release for the below ->is_active store, - * see __load_acquire() in perf_event_time_now() + * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now() + * does not observe a hole. perf_ctx_unlock() will clean up. */ - barrier(); + if (ctx->is_active & EVENT_FROZEN) + ctx->is_active &= EVENT_TIME_FROZEN; + else + ctx->is_active = 0; }
- ctx->is_active &= ~event_type; - if (!(ctx->is_active & EVENT_ALL)) - ctx->is_active = 0; - if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); - if (!ctx->is_active) + if (!(ctx->is_active & EVENT_ALL)) cpuctx->task_ctx = NULL; }
is_active ^= ctx->is_active; /* changed bits */
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - if (cgroup && !pmu_ctx->nr_cgroups) - continue; + for_each_epc(pmu_ctx, ctx, pmu, cgroup) __pmu_ctx_sched_out(pmu_ctx, is_active); - } }
/* @@@ -3517,12 -3605,17 +3605,17 @@@ perf_event_context_sched_out(struct tas
perf_ctx_disable(ctx, false);
- /* PMIs are disabled; ctx->nr_pending is stable. */ - if (local_read(&ctx->nr_pending) || - local_read(&next_ctx->nr_pending)) { + /* PMIs are disabled; ctx->nr_no_switch_fast is stable. */ + if (local_read(&ctx->nr_no_switch_fast) || + local_read(&next_ctx->nr_no_switch_fast)) { /* * Must not swap out ctx when there's pending * events that rely on the ctx->task relation. + * + * Likewise, when a context contains inherit + + * SAMPLE_READ events they should be switched + * out using the slow path so that they are + * treated as if they were distinct contexts. */ raw_spin_unlock(&next_ctx->lock); rcu_read_unlock(); @@@ -3563,7 -3656,7 +3656,7 @@@ unlock
inside_switch: perf_ctx_sched_task_cb(ctx, false); - task_ctx_sched_out(ctx, EVENT_ALL); + task_ctx_sched_out(ctx, NULL, EVENT_ALL);
perf_ctx_enable(ctx, false); raw_spin_unlock(&ctx->lock); @@@ -3861,29 -3954,22 +3954,22 @@@ static void pmu_groups_sched_in(struct merge_sched_in, &can_add_hw); }
- static void ctx_groups_sched_in(struct perf_event_context *ctx, - struct perf_event_groups *groups, - bool cgroup) + static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx, + enum event_type_t event_type) { - struct perf_event_pmu_context *pmu_ctx; - - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - if (cgroup && !pmu_ctx->nr_cgroups) - continue; - pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu); - } - } + struct perf_event_context *ctx = pmu_ctx->ctx;
- static void __pmu_ctx_sched_in(struct perf_event_context *ctx, - struct pmu *pmu) - { - pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu); + if (event_type & EVENT_PINNED) + pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu); + if (event_type & EVENT_FLEXIBLE) + pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu); }
static void - ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) + ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; bool cgroup = event_type & EVENT_CGROUP;
@@@ -3907,7 -3993,7 +3993,7 @@@
ctx->is_active |= (event_type | EVENT_TIME); if (ctx->task) { - if (!is_active) + if (!(is_active & EVENT_ALL)) cpuctx->task_ctx = ctx; else WARN_ON_ONCE(cpuctx->task_ctx != ctx); @@@ -3919,12 -4005,16 +4005,16 @@@ * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ - if (is_active & EVENT_PINNED) - ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup); + if (is_active & EVENT_PINNED) { + for_each_epc(pmu_ctx, ctx, pmu, cgroup) + __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED); + }
/* Then walk through the lower prio flexible groups */ - if (is_active & EVENT_FLEXIBLE) - ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup); + if (is_active & EVENT_FLEXIBLE) { + for_each_epc(pmu_ctx, ctx, pmu, cgroup) + __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE); + } }
static void perf_event_context_sched_in(struct task_struct *task) @@@ -3967,10 -4057,10 +4057,10 @@@ */ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { perf_ctx_disable(&cpuctx->ctx, false); - ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE); + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE); }
- perf_event_sched_in(cpuctx, ctx); + perf_event_sched_in(cpuctx, ctx, NULL);
perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
@@@ -4093,7 -4183,11 +4183,11 @@@ static void perf_adjust_period(struct p period = perf_calculate_period(event, nsec, count);
delta = (s64)(period - hwc->sample_period); - delta = (delta + 7) / 8; /* low pass filter */ + if (delta >= 0) + delta += 7; + else + delta -= 7; + delta /= 8; /* low pass filter */
sample_period = hwc->sample_period + delta;
@@@ -4311,14 -4405,14 +4405,14 @@@ static bool perf_rotate_context(struct update_context_time(&cpuctx->ctx); __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE); rotate_ctx(&cpuctx->ctx, cpu_event); - __pmu_ctx_sched_in(&cpuctx->ctx, pmu); + __pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE); }
if (task_event) rotate_ctx(task_epc->ctx, task_event);
if (task_event || (task_epc && cpu_event)) - __pmu_ctx_sched_in(task_epc->ctx, pmu); + __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); @@@ -4384,7 -4478,7 +4478,7 @@@ static void perf_event_enable_on_exec(s
cpuctx = this_cpu_ptr(&perf_cpu_context); perf_ctx_lock(cpuctx, ctx); - ctx_sched_out(ctx, EVENT_TIME); + ctx_time_freeze(cpuctx, ctx);
list_for_each_entry(event, &ctx->event_list, event_entry) { enabled |= event_enable_on_exec(event, ctx); @@@ -4396,9 -4490,7 +4490,7 @@@ */ if (enabled) { clone_ctx = unclone_ctx(ctx); - ctx_resched(cpuctx, ctx, event_type); - } else { - ctx_sched_in(ctx, EVENT_TIME); + ctx_resched(cpuctx, ctx, NULL, event_type); } perf_ctx_unlock(cpuctx, ctx);
@@@ -4501,10 -4593,7 +4593,7 @@@ static void __perf_event_read(void *inf return;
raw_spin_lock(&ctx->lock); - if (ctx->is_active & EVENT_TIME) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - } + ctx_time_update_event(ctx, event);
perf_event_update_time(event); if (data->group) @@@ -4539,8 -4628,11 +4628,11 @@@ unlock raw_spin_unlock(&ctx->lock); }
- static inline u64 perf_event_count(struct perf_event *event) + static inline u64 perf_event_count(struct perf_event *event, bool self) { + if (self) + return local64_read(&event->count); + return local64_read(&event->count) + atomic64_read(&event->child_count); }
@@@ -4701,10 -4793,7 +4793,7 @@@ again * May read while context is not active (e.g., thread is * blocked), in that case we cannot update context time */ - if (ctx->is_active & EVENT_TIME) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - } + ctx_time_update_event(ctx, event);
perf_event_update_time(event); if (group) @@@ -5205,7 -5294,7 +5294,7 @@@ static void perf_pending_task_sync(stru */ if (task_work_cancel(current, head)) { event->pending_work = 0; - local_dec(&event->ctx->nr_pending); + local_dec(&event->ctx->nr_no_switch_fast); return; }
@@@ -5499,7 -5588,7 +5588,7 @@@ static u64 __perf_event_read_value(stru mutex_lock(&event->child_mutex);
(void)perf_event_read(event, false); - total += perf_event_count(event); + total += perf_event_count(event, false);
*enabled += event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); @@@ -5508,7 -5597,7 +5597,7 @@@
list_for_each_entry(child, &event->child_list, child_list) { (void)perf_event_read(child, false); - total += perf_event_count(child); + total += perf_event_count(child, false); *enabled += child->total_time_enabled; *running += child->total_time_running; } @@@ -5590,14 -5679,14 +5679,14 @@@ static int __perf_read_group_add(struc /* * Write {count,id} tuples for every sibling. */ - values[n++] += perf_event_count(leader); + values[n++] += perf_event_count(leader, false); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&leader->lost_samples);
for_each_sibling_event(sub, leader) { - values[n++] += perf_event_count(sub); + values[n++] += perf_event_count(sub, false); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); if (read_format & PERF_FORMAT_LOST) @@@ -6177,7 -6266,7 +6266,7 @@@ void perf_event_update_userpage(struct ++userpg->lock; barrier(); userpg->index = perf_event_index(event); - userpg->offset = perf_event_count(event); + userpg->offset = perf_event_count(event, false); if (userpg->index) userpg->offset -= local64_read(&event->hw.prev_count);
@@@ -6874,7 -6963,7 +6963,7 @@@ static void perf_pending_task(struct ca if (event->pending_work) { event->pending_work = 0; perf_sigtrap(event); - local_dec(&event->ctx->nr_pending); + local_dec(&event->ctx->nr_no_switch_fast); rcuwait_wake_up(&event->pending_work_wait); } rcu_read_unlock(); @@@ -7256,7 -7345,7 +7345,7 @@@ static void perf_output_read_one(struc u64 values[5]; int n = 0;
- values[n++] = perf_event_count(event); + values[n++] = perf_event_count(event, has_inherit_and_sample_read(&event->attr)); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { values[n++] = enabled + atomic64_read(&event->child_total_time_enabled); @@@ -7274,14 -7363,15 +7363,15 @@@ }
static void perf_output_read_group(struct perf_output_handle *handle, - struct perf_event *event, - u64 enabled, u64 running) + struct perf_event *event, + u64 enabled, u64 running) { struct perf_event *leader = event->group_leader, *sub; u64 read_format = event->attr.read_format; unsigned long flags; u64 values[6]; int n = 0; + bool self = has_inherit_and_sample_read(&event->attr);
/* * Disabling interrupts avoids all counter scheduling @@@ -7301,7 -7391,7 +7391,7 @@@ (leader->state == PERF_EVENT_STATE_ACTIVE)) leader->pmu->read(leader);
- values[n++] = perf_event_count(leader); + values[n++] = perf_event_count(leader, self); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); if (read_format & PERF_FORMAT_LOST) @@@ -7316,7 -7406,7 +7406,7 @@@ (sub->state == PERF_EVENT_STATE_ACTIVE)) sub->pmu->read(sub);
- values[n++] = perf_event_count(sub); + values[n++] = perf_event_count(sub, self); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); if (read_format & PERF_FORMAT_LOST) @@@ -7337,6 -7427,10 +7427,10 @@@ * The problem is that its both hard and excessively expensive to iterate the * child list, not to mention that its impossible to IPI the children running * on another CPU, from interrupt/NMI context. + * + * Instead the combination of PERF_SAMPLE_READ and inherit will track per-thread + * counts rather than attempting to accumulate some value across all children on + * all cores. */ static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) @@@ -9747,7 -9841,7 +9841,7 @@@ static int __perf_event_overflow(struc if (!event->pending_work && !task_work_add(current, &event->pending_task, notify_mode)) { event->pending_work = pending_id; - local_inc(&event->ctx->nr_pending); + local_inc(&event->ctx->nr_no_switch_fast);
event->pending_addr = 0; if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) @@@ -12064,10 -12158,12 +12158,12 @@@ perf_event_alloc(struct perf_event_att local64_set(&hwc->period_left, hwc->sample_period);
/* - * We currently do not support PERF_SAMPLE_READ on inherited events. + * We do not support PERF_SAMPLE_READ on inherited events unless + * PERF_SAMPLE_TID is also selected, which allows inherited events to + * collect per-thread samples. * See perf_output_read(). */ - if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ)) + if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID)) goto err_ns;
if (!has_branch_stack(event)) @@@ -13091,7 -13187,7 +13187,7 @@@ static void sync_child_event(struct per perf_event_read_event(child_event, task); }
- child_val = perf_event_count(child_event); + child_val = perf_event_count(child_event, false);
/* * Add back the child's count to the parent's count: @@@ -13182,7 -13278,7 +13278,7 @@@ static void perf_event_exit_task_contex * in. */ raw_spin_lock_irq(&child_ctx->lock); - task_ctx_sched_out(child_ctx, EVENT_ALL); + task_ctx_sched_out(child_ctx, NULL, EVENT_ALL);
/* * Now that the context is inactive, destroy the task <-> ctx relation @@@ -13358,15 -13454,6 +13454,15 @@@ const struct perf_event_attr *perf_even return &event->attr; }
+int perf_allow_kernel(struct perf_event_attr *attr) +{ + if (sysctl_perf_event_paranoid > 1 && !perfmon_capable()) + return -EACCES; + + return security_perf_event_open(attr, PERF_SECURITY_KERNEL); +} +EXPORT_SYMBOL_GPL(perf_allow_kernel); + /* * Inherit an event from parent task to child task. * @@@ -13740,7 -13827,7 +13836,7 @@@ static void __perf_event_exit_context(v struct perf_event *event;
raw_spin_lock(&ctx->lock); - ctx_sched_out(ctx, EVENT_TIME); + ctx_sched_out(ctx, NULL, EVENT_TIME); list_for_each_entry(event, &ctx->event_list, event_entry) __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); raw_spin_unlock(&ctx->lock); diff --combined kernel/events/uprobes.c index 5afd00f264314,4b7e590dc428e..86fcb2386ea2f --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@@ -40,6 -40,9 +40,9 @@@ static struct rb_root uprobes_tree = RB #define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */ + static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock); + + DEFINE_STATIC_SRCU(uprobes_srcu);
#define UPROBES_HASH_SZ 13 /* serialize uprobe->pending_list */ @@@ -57,8 -60,9 +60,9 @@@ struct uprobe struct rw_semaphore register_rwsem; struct rw_semaphore consumer_rwsem; struct list_head pending_list; - struct uprobe_consumer *consumers; + struct list_head consumers; struct inode *inode; /* Also hold a ref to inode */ + struct rcu_head rcu; loff_t offset; loff_t ref_ctr_offset; unsigned long flags; @@@ -109,6 -113,11 +113,11 @@@ struct xol_area unsigned long vaddr; /* Page(s) of instruction slots */ };
+ static void uprobe_warn(struct task_struct *t, const char *msg) + { + pr_warn("uprobe: %s:%d failed to %s\n", current->comm, current->pid, msg); + } + /* * valid_vma: Verify if the specified vma is an executable vma * Relax restrictions while unregistering: vm_flags might have @@@ -453,7 -462,7 +462,7 @@@ static int update_ref_ctr(struct uprob * @vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @vaddr. * - * Called with mm->mmap_lock held for write. + * Called with mm->mmap_lock held for read or write. * Return 0 (success) or a negative errno. */ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, @@@ -587,25 -596,63 +596,63 @@@ set_orig_insn(struct arch_uprobe *aupro *(uprobe_opcode_t *)&auprobe->insn); }
+ /* uprobe should have guaranteed positive refcount */ static struct uprobe *get_uprobe(struct uprobe *uprobe) { refcount_inc(&uprobe->ref); return uprobe; }
+ /* + * uprobe should have guaranteed lifetime, which can be either of: + * - caller already has refcount taken (and wants an extra one); + * - uprobe is RCU protected and won't be freed until after grace period; + * - we are holding uprobes_treelock (for read or write, doesn't matter). + */ + static struct uprobe *try_get_uprobe(struct uprobe *uprobe) + { + if (refcount_inc_not_zero(&uprobe->ref)) + return uprobe; + return NULL; + } + + static inline bool uprobe_is_active(struct uprobe *uprobe) + { + return !RB_EMPTY_NODE(&uprobe->rb_node); + } + + static void uprobe_free_rcu(struct rcu_head *rcu) + { + struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); + + kfree(uprobe); + } + static void put_uprobe(struct uprobe *uprobe) { - if (refcount_dec_and_test(&uprobe->ref)) { - /* - * If application munmap(exec_vma) before uprobe_unregister() - * gets called, we don't get a chance to remove uprobe from - * delayed_uprobe_list from remove_breakpoint(). Do it here. - */ - mutex_lock(&delayed_uprobe_lock); - delayed_uprobe_remove(uprobe, NULL); - mutex_unlock(&delayed_uprobe_lock); - kfree(uprobe); + if (!refcount_dec_and_test(&uprobe->ref)) + return; + + write_lock(&uprobes_treelock); + + if (uprobe_is_active(uprobe)) { + write_seqcount_begin(&uprobes_seqcount); + rb_erase(&uprobe->rb_node, &uprobes_tree); + write_seqcount_end(&uprobes_seqcount); } + + write_unlock(&uprobes_treelock); + + /* + * If application munmap(exec_vma) before uprobe_unregister() + * gets called, we don't get a chance to remove uprobe from + * delayed_uprobe_list from remove_breakpoint(). Do it here. + */ + mutex_lock(&delayed_uprobe_lock); + delayed_uprobe_remove(uprobe, NULL); + mutex_unlock(&delayed_uprobe_lock); + + call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu); }
static __always_inline @@@ -647,62 -694,86 +694,86 @@@ static inline int __uprobe_cmp(struct r return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b)); }
- static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) + /* + * Assumes being inside RCU protected region. + * No refcount is taken on returned uprobe. + */ + static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset) { struct __uprobe_key key = { .inode = inode, .offset = offset, }; - struct rb_node *node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key); + struct rb_node *node; + unsigned int seq;
- if (node) - return get_uprobe(__node_2_uprobe(node)); + lockdep_assert(srcu_read_lock_held(&uprobes_srcu)); + + do { + seq = read_seqcount_begin(&uprobes_seqcount); + node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key); + /* + * Lockless RB-tree lookups can result only in false negatives. + * If the element is found, it is correct and can be returned + * under RCU protection. If we find nothing, we need to + * validate that seqcount didn't change. If it did, we have to + * try again as we might have missed the element (false + * negative). If seqcount is unchanged, search truly failed. + */ + if (node) + return __node_2_uprobe(node); + } while (read_seqcount_retry(&uprobes_seqcount, seq));
return NULL; }
/* - * Find a uprobe corresponding to a given inode:offset - * Acquires uprobes_treelock + * Attempt to insert a new uprobe into uprobes_tree. + * + * If uprobe already exists (for given inode+offset), we just increment + * refcount of previously existing uprobe. + * + * If not, a provided new instance of uprobe is inserted into the tree (with + * assumed initial refcount == 1). + * + * In any case, we return a uprobe instance that ends up being in uprobes_tree. + * Caller has to clean up new uprobe instance, if it ended up not being + * inserted into the tree. + * + * We assume that uprobes_treelock is held for writing. */ - static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) - { - struct uprobe *uprobe; - - read_lock(&uprobes_treelock); - uprobe = __find_uprobe(inode, offset); - read_unlock(&uprobes_treelock); - - return uprobe; - } - static struct uprobe *__insert_uprobe(struct uprobe *uprobe) { struct rb_node *node; + again: + node = rb_find_add_rcu(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp); + if (node) { + struct uprobe *u = __node_2_uprobe(node); + + if (!try_get_uprobe(u)) { + rb_erase(node, &uprobes_tree); + RB_CLEAR_NODE(&u->rb_node); + goto again; + }
- node = rb_find_add(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp); - if (node) - return get_uprobe(__node_2_uprobe(node)); + return u; + }
- /* get access + creation ref */ - refcount_set(&uprobe->ref, 2); - return NULL; + return uprobe; }
/* - * Acquire uprobes_treelock. - * Matching uprobe already exists in rbtree; - * increment (access refcount) and return the matching uprobe. - * - * No matching uprobe; insert the uprobe in rb_tree; - * get a double refcount (access + creation) and return NULL. + * Acquire uprobes_treelock and insert uprobe into uprobes_tree + * (or reuse existing one, see __insert_uprobe() comments above). */ static struct uprobe *insert_uprobe(struct uprobe *uprobe) { struct uprobe *u;
write_lock(&uprobes_treelock); + write_seqcount_begin(&uprobes_seqcount); u = __insert_uprobe(uprobe); + write_seqcount_end(&uprobes_seqcount); write_unlock(&uprobes_treelock);
return u; @@@ -725,18 -796,21 +796,21 @@@ static struct uprobe *alloc_uprobe(stru
uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL); if (!uprobe) - return NULL; + return ERR_PTR(-ENOMEM);
uprobe->inode = inode; uprobe->offset = offset; uprobe->ref_ctr_offset = ref_ctr_offset; + INIT_LIST_HEAD(&uprobe->consumers); init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); + RB_CLEAR_NODE(&uprobe->rb_node); + refcount_set(&uprobe->ref, 1);
/* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); /* a uprobe exists for this inode:offset combination */ - if (cur_uprobe) { + if (cur_uprobe != uprobe) { if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) { ref_ctr_mismatch_warn(cur_uprobe, uprobe); put_uprobe(cur_uprobe); @@@ -753,32 -827,19 +827,19 @@@ static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) { down_write(&uprobe->consumer_rwsem); - uc->next = uprobe->consumers; - uprobe->consumers = uc; + list_add_rcu(&uc->cons_node, &uprobe->consumers); up_write(&uprobe->consumer_rwsem); }
/* * For uprobe @uprobe, delete the consumer @uc. - * Return true if the @uc is deleted successfully - * or return false. + * Should never be called with consumer that's not part of @uprobe->consumers. */ - static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) + static void consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) { - struct uprobe_consumer **con; - bool ret = false; - down_write(&uprobe->consumer_rwsem); - for (con = &uprobe->consumers; *con; con = &(*con)->next) { - if (*con == uc) { - *con = uc->next; - ret = true; - break; - } - } + list_del_rcu(&uc->cons_node); up_write(&uprobe->consumer_rwsem); - - return ret; }
static int __copy_insn(struct address_space *mapping, struct file *filp, @@@ -863,21 -924,20 +924,20 @@@ static int prepare_uprobe(struct uprob return ret; }
- static inline bool consumer_filter(struct uprobe_consumer *uc, - enum uprobe_filter_ctx ctx, struct mm_struct *mm) + static inline bool consumer_filter(struct uprobe_consumer *uc, struct mm_struct *mm) { - return !uc->filter || uc->filter(uc, ctx, mm); + return !uc->filter || uc->filter(uc, mm); }
- static bool filter_chain(struct uprobe *uprobe, - enum uprobe_filter_ctx ctx, struct mm_struct *mm) + static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm) { struct uprobe_consumer *uc; bool ret = false;
down_read(&uprobe->consumer_rwsem); - for (uc = uprobe->consumers; uc; uc = uc->next) { - ret = consumer_filter(uc, ctx, mm); + list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node, + srcu_read_lock_held(&uprobes_srcu)) { + ret = consumer_filter(uc, mm); if (ret) break; } @@@ -921,27 -981,6 +981,6 @@@ remove_breakpoint(struct uprobe *uprobe return set_orig_insn(&uprobe->arch, mm, vaddr); }
- static inline bool uprobe_is_active(struct uprobe *uprobe) - { - return !RB_EMPTY_NODE(&uprobe->rb_node); - } - /* - * There could be threads that have already hit the breakpoint. They - * will recheck the current insn and restart if find_uprobe() fails. - * See find_active_uprobe(). - */ - static void delete_uprobe(struct uprobe *uprobe) - { - if (WARN_ON(!uprobe_is_active(uprobe))) - return; - - write_lock(&uprobes_treelock); - rb_erase(&uprobe->rb_node, &uprobes_tree); - write_unlock(&uprobes_treelock); - RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */ - put_uprobe(uprobe); - } - struct map_info { struct map_info *next; struct mm_struct *mm; @@@ -1046,7 -1085,13 +1085,13 @@@ register_for_each_vma(struct uprobe *up
if (err && is_register) goto free; - + /* + * We take mmap_lock for writing to avoid the race with + * find_active_uprobe_rcu() which takes mmap_lock for reading. + * Thus this install_breakpoint() can not make + * is_trap_at_addr() true right after find_uprobe_rcu() + * returns NULL in find_active_uprobe_rcu(). + */ mmap_write_lock(mm); vma = find_vma(mm, info->vaddr); if (!vma || !valid_vma(vma, is_register) || @@@ -1059,12 -1104,10 +1104,10 @@@
if (is_register) { /* consult only the "caller", new consumer. */ - if (consumer_filter(new, - UPROBE_FILTER_REGISTER, mm)) + if (consumer_filter(new, mm)) err = install_breakpoint(uprobe, mm, vma, info->vaddr); } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { - if (!filter_chain(uprobe, - UPROBE_FILTER_UNREGISTER, mm)) + if (!filter_chain(uprobe, mm)) err |= remove_breakpoint(uprobe, mm, info->vaddr); }
@@@ -1079,152 -1122,140 +1122,140 @@@ return err; }
- static void - __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) + /** + * uprobe_unregister_nosync - unregister an already registered probe. + * @uprobe: uprobe to remove + * @uc: identify which probe if multiple probes are colocated. + */ + void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc) { int err;
- if (WARN_ON(!consumer_del(uprobe, uc))) - return; - + down_write(&uprobe->register_rwsem); + consumer_del(uprobe, uc); err = register_for_each_vma(uprobe, NULL); - /* TODO : cant unregister? schedule a worker thread */ - if (!uprobe->consumers && !err) - delete_uprobe(uprobe); - } - - /* - * uprobe_unregister - unregister an already registered probe. - * @inode: the file in which the probe has to be removed. - * @offset: offset from the start of the file. - * @uc: identify which probe if multiple probes are colocated. - */ - void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) - { - struct uprobe *uprobe; + up_write(&uprobe->register_rwsem);
- uprobe = find_uprobe(inode, offset); - if (WARN_ON(!uprobe)) + /* TODO : cant unregister? schedule a worker thread */ + if (unlikely(err)) { + uprobe_warn(current, "unregister, leaking uprobe"); return; + }
- down_write(&uprobe->register_rwsem); - __uprobe_unregister(uprobe, uc); - up_write(&uprobe->register_rwsem); put_uprobe(uprobe); } - EXPORT_SYMBOL_GPL(uprobe_unregister); + EXPORT_SYMBOL_GPL(uprobe_unregister_nosync);
- /* - * __uprobe_register - register a probe + void uprobe_unregister_sync(void) + { + /* + * Now that handler_chain() and handle_uretprobe_chain() iterate over + * uprobe->consumers list under RCU protection without holding + * uprobe->register_rwsem, we need to wait for RCU grace period to + * make sure that we can't call into just unregistered + * uprobe_consumer's callbacks anymore. If we don't do that, fast and + * unlucky enough caller can free consumer's memory and cause + * handler_chain() or handle_uretprobe_chain() to do an use-after-free. + */ + synchronize_srcu(&uprobes_srcu); + } + EXPORT_SYMBOL_GPL(uprobe_unregister_sync); + + /** + * uprobe_register - register a probe * @inode: the file in which the probe has to be placed. * @offset: offset from the start of the file. + * @ref_ctr_offset: offset of SDT marker / reference counter * @uc: information on howto handle the probe.. * - * Apart from the access refcount, __uprobe_register() takes a creation + * Apart from the access refcount, uprobe_register() takes a creation * refcount (thro alloc_uprobe) if and only if this @uprobe is getting * inserted into the rbtree (i.e first consumer for a @inode:@offset * tuple). Creation refcount stops uprobe_unregister from freeing the * @uprobe even before the register operation is complete. Creation * refcount is released when the last @uc for the @uprobe - * unregisters. Caller of __uprobe_register() is required to keep @inode + * unregisters. Caller of uprobe_register() is required to keep @inode * (and the containing mount) referenced. * - * Return errno if it cannot successully install probes - * else return 0 (success) + * Return: pointer to the new uprobe on success or an ERR_PTR on failure. 
*/ - static int __uprobe_register(struct inode *inode, loff_t offset, - loff_t ref_ctr_offset, struct uprobe_consumer *uc) + struct uprobe *uprobe_register(struct inode *inode, + loff_t offset, loff_t ref_ctr_offset, + struct uprobe_consumer *uc) { struct uprobe *uprobe; int ret;
/* Uprobe must have at least one set consumer */ if (!uc->handler && !uc->ret_handler) - return -EINVAL; + return ERR_PTR(-EINVAL);
/* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */ if (!inode->i_mapping->a_ops->read_folio && !shmem_mapping(inode->i_mapping)) - return -EIO; + return ERR_PTR(-EIO); /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) - return -EINVAL; + return ERR_PTR(-EINVAL);
/* * This ensures that copy_from_page(), copy_to_page() and * __update_ref_ctr() can't cross page boundary. */ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) - return -EINVAL; + return ERR_PTR(-EINVAL); if (!IS_ALIGNED(ref_ctr_offset, sizeof(short))) - return -EINVAL; + return ERR_PTR(-EINVAL);
- retry: uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); - if (!uprobe) - return -ENOMEM; if (IS_ERR(uprobe)) - return PTR_ERR(uprobe); + return uprobe;
- /* - * We can race with uprobe_unregister()->delete_uprobe(). - * Check uprobe_is_active() and retry if it is false. - */ down_write(&uprobe->register_rwsem); - ret = -EAGAIN; - if (likely(uprobe_is_active(uprobe))) { - consumer_add(uprobe, uc); - ret = register_for_each_vma(uprobe, uc); - if (ret) - __uprobe_unregister(uprobe, uc); - } + consumer_add(uprobe, uc); + ret = register_for_each_vma(uprobe, uc); up_write(&uprobe->register_rwsem); - put_uprobe(uprobe);
- if (unlikely(ret == -EAGAIN)) - goto retry; - return ret; - } + if (ret) { + uprobe_unregister_nosync(uprobe, uc); + /* + * Registration might have partially succeeded, so we can have + * this consumer being called right at this time. We need to + * sync here. It's ok, it's unlikely slow path. + */ + uprobe_unregister_sync(); + return ERR_PTR(ret); + }
- int uprobe_register(struct inode *inode, loff_t offset, - struct uprobe_consumer *uc) - { - return __uprobe_register(inode, offset, 0, uc); + return uprobe; } EXPORT_SYMBOL_GPL(uprobe_register);
- int uprobe_register_refctr(struct inode *inode, loff_t offset, - loff_t ref_ctr_offset, struct uprobe_consumer *uc) - { - return __uprobe_register(inode, offset, ref_ctr_offset, uc); - } - EXPORT_SYMBOL_GPL(uprobe_register_refctr); - - /* - * uprobe_apply - unregister an already registered probe. - * @inode: the file in which the probe has to be removed. - * @offset: offset from the start of the file. + /** + * uprobe_apply - add or remove the breakpoints according to @uc->filter + * @uprobe: uprobe which "owns" the breakpoint * @uc: consumer which wants to add more or remove some breakpoints * @add: add or remove the breakpoints + * Return: 0 on success or negative error code. */ - int uprobe_apply(struct inode *inode, loff_t offset, - struct uprobe_consumer *uc, bool add) + int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add) { - struct uprobe *uprobe; struct uprobe_consumer *con; - int ret = -ENOENT; - - uprobe = find_uprobe(inode, offset); - if (WARN_ON(!uprobe)) - return ret; + int ret = -ENOENT, srcu_idx;
down_write(&uprobe->register_rwsem); - for (con = uprobe->consumers; con && con != uc ; con = con->next) - ; - if (con) - ret = register_for_each_vma(uprobe, add ? uc : NULL); + + srcu_idx = srcu_read_lock(&uprobes_srcu); + list_for_each_entry_srcu(con, &uprobe->consumers, cons_node, + srcu_read_lock_held(&uprobes_srcu)) { + if (con == uc) { + ret = register_for_each_vma(uprobe, add ? uc : NULL); + break; + } + } + srcu_read_unlock(&uprobes_srcu, srcu_idx); + up_write(&uprobe->register_rwsem); - put_uprobe(uprobe);
return ret; } @@@ -1305,15 -1336,17 +1336,17 @@@ static void build_probe_list(struct ino u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset < min) break; - list_add(&u->pending_list, head); - get_uprobe(u); + /* if uprobe went away, it's safe to ignore it */ + if (try_get_uprobe(u)) + list_add(&u->pending_list, head); } for (t = n; (t = rb_next(t)); ) { u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset > max) break; - list_add(&u->pending_list, head); - get_uprobe(u); + /* if uprobe went away, it's safe to ignore it */ + if (try_get_uprobe(u)) + list_add(&u->pending_list, head); } } read_unlock(&uprobes_treelock); @@@ -1384,7 -1417,7 +1417,7 @@@ int uprobe_mmap(struct vm_area_struct * */ list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { if (!fatal_signal_pending(current) && - filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) { + filter_chain(uprobe, vma->vm_mm)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); } @@@ -1482,22 -1515,6 +1515,22 @@@ void * __weak arch_uprobe_trampoline(un return &insn; }
+/* + * uprobe_clear_state - Free the area allocated for slots. + */ +static void uprobe_clear_state(const struct vm_special_mapping *sm, struct vm_area_struct *vma) +{ + struct xol_area *area = container_of(vma->vm_private_data, struct xol_area, xol_mapping); + + mutex_lock(&delayed_uprobe_lock); + delayed_uprobe_remove(NULL, vma->vm_mm); + mutex_unlock(&delayed_uprobe_lock); + + put_page(area->pages[0]); + kfree(area->bitmap); + kfree(area); +} + static struct xol_area *__create_xol_area(unsigned long vaddr) { struct mm_struct *mm = current->mm; @@@ -1515,7 -1532,6 +1548,7 @@@ goto free_area;
area->xol_mapping.name = "[uprobes]"; + area->xol_mapping.close = uprobe_clear_state; area->xol_mapping.pages = area->pages; area->pages[0] = alloc_page(GFP_HIGHUSER); if (!area->pages[0]) @@@ -1561,6 -1577,25 +1594,6 @@@ static struct xol_area *get_xol_area(vo return area; }
-/* - * uprobe_clear_state - Free the area allocated for slots. - */ -void uprobe_clear_state(struct mm_struct *mm) -{ - struct xol_area *area = mm->uprobes_state.xol_area; - - mutex_lock(&delayed_uprobe_lock); - delayed_uprobe_remove(NULL, mm); - mutex_unlock(&delayed_uprobe_lock); - - if (!area) - return; - - put_page(area->pages[0]); - kfree(area->bitmap); - kfree(area); -} - void uprobe_start_dup_mmap(void) { percpu_down_read(&dup_mmap_sem); @@@ -1768,6 -1803,12 +1801,12 @@@ static int dup_utask(struct task_struc return -ENOMEM;
*n = *o; + /* + * uprobe's refcnt has to be positive at this point, kept by + * utask->return_instances items; return_instances can't be + * removed right now, as task is blocked due to duping; so + * get_uprobe() is safe to use here. + */ get_uprobe(n->uprobe); n->next = NULL;
@@@ -1779,12 -1820,6 +1818,6 @@@ return 0; }
- static void uprobe_warn(struct task_struct *t, const char *msg) - { - pr_warn("uprobe: %s:%d failed to %s\n", - current->comm, current->pid, msg); - } - static void dup_xol_work(struct callback_head *work) { if (current->flags & PF_EXITING) @@@ -1881,9 -1916,13 +1914,13 @@@ static void prepare_uretprobe(struct up return; }
+ /* we need to bump refcount to store uprobe in utask */ + if (!try_get_uprobe(uprobe)) + return; + ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL); if (!ri) - return; + goto fail;
trampoline_vaddr = uprobe_get_trampoline_vaddr(); orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); @@@ -1910,8 -1949,7 +1947,7 @@@ } orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; } - - ri->uprobe = get_uprobe(uprobe); + ri->uprobe = uprobe; ri->func = instruction_pointer(regs); ri->stack = user_stack_pointer(regs); ri->orig_ret_vaddr = orig_ret_vaddr; @@@ -1922,8 -1960,9 +1958,9 @@@ utask->return_instances = ri;
return; - fail: + fail: kfree(ri); + put_uprobe(uprobe); }
/* Prepare to single-step probed instruction out of line. */ @@@ -1938,9 -1977,14 +1975,14 @@@ pre_ssout(struct uprobe *uprobe, struc if (!utask) return -ENOMEM;
+ if (!try_get_uprobe(uprobe)) + return -EINVAL; + xol_vaddr = xol_get_insn_slot(uprobe); - if (!xol_vaddr) - return -ENOMEM; + if (!xol_vaddr) { + err = -ENOMEM; + goto err_out; + }
utask->xol_vaddr = xol_vaddr; utask->vaddr = bp_vaddr; @@@ -1948,12 -1992,15 +1990,15 @@@ err = arch_uprobe_pre_xol(&uprobe->arch, regs); if (unlikely(err)) { xol_free_insn_slot(current); - return err; + goto err_out; }
utask->active_uprobe = uprobe; utask->state = UTASK_SSTEP; return 0; + err_out: + put_uprobe(uprobe); + return err; }
/* @@@ -2026,13 -2073,7 +2071,7 @@@ static int is_trap_at_addr(struct mm_st if (likely(result == 0)) goto out;
- /* - * The NULL 'tsk' here ensures that any faults that occur here - * will not be accounted to the task. 'mm' *is* current->mm, - * but we treat this as a 'remote' access since it is - * essentially a kernel access to the memory. - */ - result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page, NULL); + result = get_user_pages(vaddr, 1, FOLL_FORCE, &page); if (result < 0) return result;
@@@ -2043,7 -2084,8 +2082,8 @@@ return is_trap_insn(&opcode); }
- static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) + /* assumes being inside RCU protected region */ + static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp) { struct mm_struct *mm = current->mm; struct uprobe *uprobe = NULL; @@@ -2056,7 -2098,7 +2096,7 @@@ struct inode *inode = file_inode(vma->vm_file); loff_t offset = vaddr_to_offset(vma, bp_vaddr);
- uprobe = find_uprobe(inode, offset); + uprobe = find_uprobe_rcu(inode, offset); }
if (!uprobe) @@@ -2077,9 -2119,12 +2117,12 @@@ static void handler_chain(struct uprob struct uprobe_consumer *uc; int remove = UPROBE_HANDLER_REMOVE; bool need_prep = false; /* prepare return uprobe, when needed */ + bool has_consumers = false; + + current->utask->auprobe = &uprobe->arch;
- down_read(&uprobe->register_rwsem); - for (uc = uprobe->consumers; uc; uc = uc->next) { + list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node, + srcu_read_lock_held(&uprobes_srcu)) { int rc = 0;
if (uc->handler) { @@@ -2092,16 -2137,24 +2135,24 @@@ need_prep = true;
remove &= rc; + has_consumers = true; } + current->utask->auprobe = NULL;
if (need_prep && !remove) prepare_uretprobe(uprobe, regs); /* put bp at return */
- if (remove && uprobe->consumers) { - WARN_ON(!uprobe_is_active(uprobe)); - unapply_uprobe(uprobe, current->mm); + if (remove && has_consumers) { + down_read(&uprobe->register_rwsem); + + /* re-check that removal is still required, this time under lock */ + if (!filter_chain(uprobe, current->mm)) { + WARN_ON(!uprobe_is_active(uprobe)); + unapply_uprobe(uprobe, current->mm); + } + + up_read(&uprobe->register_rwsem); } - up_read(&uprobe->register_rwsem); }
static void @@@ -2109,13 -2162,15 +2160,15 @@@ handle_uretprobe_chain(struct return_in { struct uprobe *uprobe = ri->uprobe; struct uprobe_consumer *uc; + int srcu_idx;
- down_read(&uprobe->register_rwsem); - for (uc = uprobe->consumers; uc; uc = uc->next) { + srcu_idx = srcu_read_lock(&uprobes_srcu); + list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node, + srcu_read_lock_held(&uprobes_srcu)) { if (uc->ret_handler) uc->ret_handler(uc, ri->func, regs); } - up_read(&uprobe->register_rwsem); + srcu_read_unlock(&uprobes_srcu, srcu_idx); }
static struct return_instance *find_next_ret_chain(struct return_instance *ri) @@@ -2200,13 -2255,15 +2253,15 @@@ static void handle_swbp(struct pt_regs { struct uprobe *uprobe; unsigned long bp_vaddr; - int is_swbp; + int is_swbp, srcu_idx;
bp_vaddr = uprobe_get_swbp_addr(regs); if (bp_vaddr == uprobe_get_trampoline_vaddr()) return uprobe_handle_trampoline(regs);
- uprobe = find_active_uprobe(bp_vaddr, &is_swbp); + srcu_idx = srcu_read_lock(&uprobes_srcu); + + uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); if (!uprobe) { if (is_swbp > 0) { /* No matching uprobe; signal SIGTRAP. */ @@@ -2222,7 -2279,7 +2277,7 @@@ */ instruction_pointer_set(regs, bp_vaddr); } - return; + goto out; }
/* change it in advance for ->handler() and restart */ @@@ -2257,12 -2314,12 +2312,12 @@@ if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out;
- if (!pre_ssout(uprobe, regs, bp_vaddr)) - return; + if (pre_ssout(uprobe, regs, bp_vaddr)) + goto out;
- /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ out: - put_uprobe(uprobe); + /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ + srcu_read_unlock(&uprobes_srcu, srcu_idx); }
/* diff --combined kernel/fork.c index 0241a2ff1d336,0b71fc9fa750d..cd3f92f0a13d6 --- a/kernel/fork.c +++ b/kernel/fork.c @@@ -832,7 -832,7 +832,7 @@@ static void check_mm(struct mm_struct * pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", mm_pgtables_bytes(mm));
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS) VM_BUG_ON_MM(mm->pmd_huge_pte, mm); #endif } @@@ -1182,7 -1182,7 +1182,7 @@@ static struct task_struct *dup_task_str tsk->active_memcg = NULL; #endif
- #ifdef CONFIG_CPU_SUP_INTEL + #ifdef CONFIG_X86_BUS_LOCK_DETECT tsk->reported_split_lock = 0; #endif
@@@ -1276,7 -1276,7 +1276,7 @@@ static struct mm_struct *mm_init(struc RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_subscriptions_init(mm); init_tlb_flush_pending(mm); -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS) mm->pmd_huge_pte = NULL; #endif mm_init_uprobes_state(mm); @@@ -1338,6 -1338,7 +1338,6 @@@ static inline void __mmput(struct mm_st { VM_BUG_ON(atomic_read(&mm->mm_users));
- uprobe_clear_state(mm); exit_aio(mm); ksm_exit(mm); khugepaged_exit(mm); /* must run before exit_mmap */ @@@ -1753,30 -1754,33 +1753,30 @@@ static int copy_files(unsigned long clo int no_files) { struct files_struct *oldf, *newf; - int error = 0;
/* * A background process may not have any files ... */ oldf = current->files; if (!oldf) - goto out; + return 0;
if (no_files) { tsk->files = NULL; - goto out; + return 0; }
if (clone_flags & CLONE_FILES) { atomic_inc(&oldf->count); - goto out; + return 0; }
- newf = dup_fd(oldf, NR_OPEN_MAX, &error); - if (!newf) - goto out; + newf = dup_fd(oldf, NULL); + if (IS_ERR(newf)) + return PTR_ERR(newf);
tsk->files = newf; - error = 0; -out: - return error; + return 0; }
static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) @@@ -1857,7 -1861,7 +1857,7 @@@ static int copy_signal(unsigned long cl prev_cputime_init(&sig->prev_cputime);
#ifdef CONFIG_POSIX_TIMERS - INIT_LIST_HEAD(&sig->posix_timers); + INIT_HLIST_HEAD(&sig->posix_timers); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; #endif @@@ -3228,16 -3232,17 +3228,16 @@@ static int unshare_fs(unsigned long uns /* * Unshare file descriptor table if it is being shared */ -int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, - struct files_struct **new_fdp) +static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) { struct files_struct *fd = current->files; - int error = 0;
if ((unshare_flags & CLONE_FILES) && (fd && atomic_read(&fd->count) > 1)) { - *new_fdp = dup_fd(fd, max_fds, &error); - if (!*new_fdp) - return error; + fd = dup_fd(fd, NULL); + if (IS_ERR(fd)) + return PTR_ERR(fd); + *new_fdp = fd; }
return 0; @@@ -3295,7 -3300,7 +3295,7 @@@ int ksys_unshare(unsigned long unshare_ err = unshare_fs(unshare_flags, &new_fs); if (err) goto bad_unshare_out; - err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd); + err = unshare_fd(unshare_flags, &new_fd); if (err) goto bad_unshare_cleanup_fs; err = unshare_userns(unshare_flags, &new_cred); @@@ -3387,7 -3392,7 +3387,7 @@@ int unshare_files(void struct files_struct *old, *copy = NULL; int error;
- error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy); + error = unshare_fd(CLONE_FILES, &copy); if (error || !copy) return error;
diff --combined kernel/irq/msi.c index ca6e2ae6d6fc0,1c7e5159064cc..3a24d6b5f559c --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@@ -82,7 -82,7 +82,7 @@@ static struct msi_desc *msi_alloc_desc( desc->dev = dev; desc->nvec_used = nvec; if (affinity) { - desc->affinity = kmemdup(affinity, nvec * sizeof(*desc->affinity), GFP_KERNEL); + desc->affinity = kmemdup_array(affinity, nvec, sizeof(*desc->affinity), GFP_KERNEL); if (!desc->affinity) { kfree(desc); return NULL; @@@ -832,7 -832,7 +832,7 @@@ static void msi_domain_update_chip_ops( struct irq_chip *chip = info->chip;
BUG_ON(!chip || !chip->irq_mask || !chip->irq_unmask); - if (!chip->irq_set_affinity) + if (!chip->irq_set_affinity && !(info->flags & MSI_FLAG_NO_AFFINITY)) chip->irq_set_affinity = msi_domain_set_affinity; }
diff --combined kernel/locking/lockdep.c index 7963deac33c31,364ae0b55beea..536bd471557f5 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@@ -56,7 -56,6 +56,7 @@@ #include <linux/kprobes.h> #include <linux/lockdep.h> #include <linux/context_tracking.h> +#include <linux/console.h>
#include <asm/sections.h>
@@@ -574,10 -573,8 +574,10 @@@ static struct lock_trace *save_trace(vo if (!debug_locks_off_graph_unlock()) return NULL;
+ nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); dump_stack(); + nbcon_cpu_emergency_exit();
return NULL; } @@@ -788,7 -785,7 +788,7 @@@ static void lockdep_print_held_locks(st printk("no locks held by %s/%d.\n", p->comm, task_pid_nr(p)); else printk("%d lock%s held by %s/%d:\n", depth, - depth > 1 ? "s" : "", p->comm, task_pid_nr(p)); + str_plural(depth), p->comm, task_pid_nr(p)); /* * It's not reliable to print a task's held locks if it's not sleeping * and it's not the current task. @@@ -890,13 -887,11 +890,13 @@@ look_up_lock_class(const struct lockdep if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { instrumentation_begin(); debug_locks_off(); + nbcon_cpu_emergency_enter(); printk(KERN_ERR "BUG: looking up invalid subclass: %u\n", subclass); printk(KERN_ERR "turning off the locking correctness validator.\n"); dump_stack(); + nbcon_cpu_emergency_exit(); instrumentation_end(); return NULL; } @@@ -973,13 -968,11 +973,13 @@@ static bool assign_lock_key(struct lock else { /* Debug-check: all keys must be persistent! */ debug_locks_off(); + nbcon_cpu_emergency_enter(); pr_err("INFO: trying to register non-static key.\n"); pr_err("The code is fine but needs lockdep annotation, or maybe\n"); pr_err("you didn't initialize this object before use?\n"); pr_err("turning off the locking correctness validator.\n"); dump_stack(); + nbcon_cpu_emergency_exit(); return false; }
@@@ -1323,10 -1316,8 +1323,10 @@@ register_lock_class(struct lockdep_map return NULL; }
+ nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!"); dump_stack(); + nbcon_cpu_emergency_exit(); return NULL; } nr_lock_classes++; @@@ -1358,13 -1349,11 +1358,13 @@@ if (verbose(class)) { graph_unlock();
+ nbcon_cpu_emergency_enter(); printk("\nnew class %px: %s", class->key, class->name); if (class->name_version > 1) printk(KERN_CONT "#%d", class->name_version); printk(KERN_CONT "\n"); dump_stack(); + nbcon_cpu_emergency_exit();
if (!graph_lock()) { return NULL; @@@ -1403,10 -1392,8 +1403,10 @@@ static struct lock_list *alloc_list_ent if (!debug_locks_off_graph_unlock()) return NULL;
+ nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!"); dump_stack(); + nbcon_cpu_emergency_exit(); return NULL; } nr_list_entries++; @@@ -2052,8 -2039,6 +2052,8 @@@ static noinline void print_circular_bug
depth = get_lock_depth(target);
+ nbcon_cpu_emergency_enter(); + print_circular_bug_header(target, depth, check_src, check_tgt);
parent = get_lock_parent(target); @@@ -2072,8 -2057,6 +2072,8 @@@
printk("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); }
static noinline void print_bfs_bug(int ret) @@@ -2084,6 -2067,9 +2084,9 @@@ /* * Breadth-first-search failed, graph got corrupted? */ + if (ret == BFS_EQUEUEFULL) + pr_warn("Increase LOCKDEP_CIRCULAR_QUEUE_BITS to avoid this warning:\n"); + WARN(1, "lockdep bfs error:%d\n", ret); }
@@@ -2586,8 -2572,6 +2589,8 @@@ print_bad_irq_dependency(struct task_st if (!debug_locks_off_graph_unlock() || debug_locks_silent) return;
+ nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("=====================================================\n"); pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n", @@@ -2637,13 -2621,11 +2640,13 @@@ pr_warn(" and %s-irq-unsafe lock:\n", irqclass); next_root->trace = save_trace(); if (!next_root->trace) - return; + goto out; print_shortest_lock_dependencies(forwards_entry, next_root);
pr_warn("\nstack backtrace:\n"); dump_stack(); +out: + nbcon_cpu_emergency_exit(); }
static const char *state_names[] = { @@@ -3008,8 -2990,6 +3011,8 @@@ print_deadlock_bug(struct task_struct * if (!debug_locks_off_graph_unlock() || debug_locks_silent) return;
+ nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("============================================\n"); pr_warn("WARNING: possible recursive locking detected\n"); @@@ -3032,8 -3012,6 +3035,8 @@@
pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); }
/* @@@ -3631,8 -3609,6 +3634,8 @@@ static void print_collision(struct task struct held_lock *hlock_next, struct lock_chain *chain) { + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("============================\n"); pr_warn("WARNING: chain_key collision\n"); @@@ -3649,8 -3625,6 +3652,8 @@@
pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } #endif
@@@ -3741,10 -3715,8 +3744,10 @@@ static inline int add_chain_cache(struc if (!debug_locks_off_graph_unlock()) return 0;
+ nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!"); dump_stack(); + nbcon_cpu_emergency_exit(); return 0; } chain->chain_key = chain_key; @@@ -3761,10 -3733,8 +3764,10 @@@ if (!debug_locks_off_graph_unlock()) return 0;
+ nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!"); dump_stack(); + nbcon_cpu_emergency_exit(); return 0; }
@@@ -4003,8 -3973,6 +4006,8 @@@ print_usage_bug(struct task_struct *cur if (!debug_locks_off() || debug_locks_silent) return;
+ nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("================================\n"); pr_warn("WARNING: inconsistent lock state\n"); @@@ -4033,8 -4001,6 +4036,8 @@@
pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); }
/* @@@ -4069,8 -4035,6 +4072,8 @@@ print_irq_inversion_bug(struct task_str if (!debug_locks_off_graph_unlock() || debug_locks_silent) return;
+ nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("========================================================\n"); pr_warn("WARNING: possible irq lock inversion dependency detected\n"); @@@ -4111,13 -4075,11 +4114,13 @@@ pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); root->trace = save_trace(); if (!root->trace) - return; + goto out; print_shortest_lock_dependencies(other, root);
pr_warn("\nstack backtrace:\n"); dump_stack(); +out: + nbcon_cpu_emergency_exit(); }
/* @@@ -4194,8 -4156,6 +4197,8 @@@ void print_irqtrace_events(struct task_ { const struct irqtrace_events *trace = &curr->irqtrace;
+ nbcon_cpu_emergency_enter(); + printk("irq event stamp: %u\n", trace->irq_events); printk("hardirqs last enabled at (%u): [<%px>] %pS\n", trace->hardirq_enable_event, (void *)trace->hardirq_enable_ip, @@@ -4209,8 -4169,6 +4212,8 @@@ printk("softirqs last disabled at (%u): [<%px>] %pS\n", trace->softirq_disable_event, (void *)trace->softirq_disable_ip, (void *)trace->softirq_disable_ip); + + nbcon_cpu_emergency_exit(); }
static int HARDIRQ_verbose(struct lock_class *class) @@@ -4731,12 -4689,10 +4734,12 @@@ unlock * We must printk outside of the graph_lock: */ if (ret == 2) { + nbcon_cpu_emergency_enter(); printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); print_lock(this); print_irqtrace_events(curr); dump_stack(); + nbcon_cpu_emergency_exit(); }
return ret; @@@ -4777,8 -4733,6 +4780,8 @@@ print_lock_invalid_wait_context(struct if (debug_locks_silent) return 0;
+ nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("=============================\n"); pr_warn("[ BUG: Invalid wait context ]\n"); @@@ -4798,8 -4752,6 +4801,8 @@@ pr_warn("stack backtrace:\n"); dump_stack();
+ nbcon_cpu_emergency_exit(); + return 0; }
@@@ -5007,8 -4959,6 +5010,8 @@@ print_lock_nested_lock_not_held(struct if (debug_locks_silent) return;
+ nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("==================================\n"); pr_warn("WARNING: Nested lock was not taken\n"); @@@ -5029,8 -4979,6 +5032,8 @@@
pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); }
static int __lock_is_held(const struct lockdep_map *lock, int read); @@@ -5079,13 -5027,11 +5082,13 @@@ static int __lock_acquire(struct lockde debug_class_ops_inc(class);
if (very_verbose(class)) { + nbcon_cpu_emergency_enter(); printk("\nacquire class [%px] %s", class->key, class->name); if (class->name_version > 1) printk(KERN_CONT "#%d", class->name_version); printk(KERN_CONT "\n"); dump_stack(); + nbcon_cpu_emergency_exit(); }
/* @@@ -5212,7 -5158,6 +5215,7 @@@ #endif if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { debug_locks_off(); + nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!"); printk(KERN_DEBUG "depth: %i max: %lu!\n", curr->lockdep_depth, MAX_LOCK_DEPTH); @@@ -5220,7 -5165,6 +5223,7 @@@ lockdep_print_held_locks(current); debug_show_all_locks(); dump_stack(); + nbcon_cpu_emergency_exit();
return 0; } @@@ -5240,8 -5184,6 +5243,8 @@@ static void print_unlock_imbalance_bug( if (debug_locks_silent) return;
+ nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("=====================================\n"); pr_warn("WARNING: bad unlock balance detected!\n"); @@@ -5258,8 -5200,6 +5261,8 @@@
pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); }
static noinstr int match_held_lock(const struct held_lock *hlock, @@@ -5964,8 -5904,6 +5967,8 @@@ static void print_lock_contention_bug(s if (debug_locks_silent) return;
+ nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("=================================\n"); pr_warn("WARNING: bad contention detected!\n"); @@@ -5982,8 -5920,6 +5985,8 @@@
pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); }
static void @@@ -6263,25 -6199,27 +6266,27 @@@ static struct pending_free *get_pending static void free_zapped_rcu(struct rcu_head *cb);
/* - * Schedule an RCU callback if no RCU callback is pending. Must be called with - * the graph lock held. - */ - static void call_rcu_zapped(struct pending_free *pf) + * See if we need to queue an RCU callback, must be called with + * the lockdep lock held, returns false if either we don't have + * any pending free or the callback is already scheduled. + * Otherwise, a call_rcu() must follow this function call. + */ + static bool prepare_call_rcu_zapped(struct pending_free *pf) { WARN_ON_ONCE(inside_selftest());
if (list_empty(&pf->zapped)) - return; + return false;
if (delayed_free.scheduled) - return; + return false;
delayed_free.scheduled = true;
WARN_ON_ONCE(delayed_free.pf + delayed_free.index != pf); delayed_free.index ^= 1;
- call_rcu(&delayed_free.rcu_head, free_zapped_rcu); + return true; }
/* The caller must hold the graph lock. May be called from RCU context. */ @@@ -6307,6 -6245,7 +6312,7 @@@ static void free_zapped_rcu(struct rcu_ { struct pending_free *pf; unsigned long flags; + bool need_callback;
if (WARN_ON_ONCE(ch != &delayed_free.rcu_head)) return; @@@ -6318,14 -6257,18 +6324,18 @@@ pf = delayed_free.pf + (delayed_free.index ^ 1); __free_zapped_classes(pf); delayed_free.scheduled = false; + need_callback = + prepare_call_rcu_zapped(delayed_free.pf + delayed_free.index); + lockdep_unlock(); + raw_local_irq_restore(flags);
/* - * If there's anything on the open list, close and start a new callback. - */ - call_rcu_zapped(delayed_free.pf + delayed_free.index); + * If there's pending free and its callback has not been scheduled, + * queue an RCU callback. + */ + if (need_callback) + call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
- lockdep_unlock(); - raw_local_irq_restore(flags); }
/* @@@ -6365,6 -6308,7 +6375,7 @@@ static void lockdep_free_key_range_reg( { struct pending_free *pf; unsigned long flags; + bool need_callback;
init_data_structures_once();
@@@ -6372,10 -6316,11 +6383,11 @@@ lockdep_lock(); pf = get_pending_free(); __lockdep_free_key_range(pf, start, size); - call_rcu_zapped(pf); + need_callback = prepare_call_rcu_zapped(pf); lockdep_unlock(); raw_local_irq_restore(flags); - + if (need_callback) + call_rcu(&delayed_free.rcu_head, free_zapped_rcu); /* * Wait for any possible iterators from look_up_lock_class() to pass * before continuing to free the memory they refer to. @@@ -6469,6 -6414,7 +6481,7 @@@ static void lockdep_reset_lock_reg(stru struct pending_free *pf; unsigned long flags; int locked; + bool need_callback = false;
raw_local_irq_save(flags); locked = graph_lock(); @@@ -6477,11 -6423,13 +6490,13 @@@
pf = get_pending_free(); __lockdep_reset_lock(pf, lock); - call_rcu_zapped(pf); + need_callback = prepare_call_rcu_zapped(pf);
graph_unlock(); out_irq: raw_local_irq_restore(flags); + if (need_callback) + call_rcu(&delayed_free.rcu_head, free_zapped_rcu); }
/* @@@ -6525,6 -6473,7 +6540,7 @@@ void lockdep_unregister_key(struct lock struct pending_free *pf; unsigned long flags; bool found = false; + bool need_callback = false;
might_sleep();
@@@ -6545,11 -6494,14 +6561,14 @@@ if (found) { pf = get_pending_free(); __lockdep_free_key_range(pf, key, 1); - call_rcu_zapped(pf); + need_callback = prepare_call_rcu_zapped(pf); } lockdep_unlock(); raw_local_irq_restore(flags);
+ if (need_callback) + call_rcu(&delayed_free.rcu_head, free_zapped_rcu); + /* Wait until is_dynamic_key() has finished accessing k->hash_entry. */ synchronize_rcu(); } @@@ -6603,8 -6555,6 +6622,8 @@@ print_freed_lock_bug(struct task_struc if (debug_locks_silent) return;
+ nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("=========================\n"); pr_warn("WARNING: held lock freed!\n"); @@@ -6617,8 -6567,6 +6636,8 @@@
pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); }
static inline int not_in_range(const void* mem_from, unsigned long mem_len, @@@ -6665,8 -6613,6 +6684,8 @@@ static void print_held_locks_bug(void if (debug_locks_silent) return;
+ nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("====================================\n"); pr_warn("WARNING: %s/%d still has locks held!\n", @@@ -6676,8 -6622,6 +6695,8 @@@ lockdep_print_held_locks(current); pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); }
void debug_check_no_locks_held(void) @@@ -6735,7 -6679,6 +6754,7 @@@ asmlinkage __visible void lockdep_sys_e if (unlikely(curr->lockdep_depth)) { if (!debug_locks_off()) return; + nbcon_cpu_emergency_enter(); pr_warn("\n"); pr_warn("================================================\n"); pr_warn("WARNING: lock held when returning to user space!\n"); @@@ -6744,7 -6687,6 +6763,7 @@@ pr_warn("%s/%d is leaving the kernel with locks still held!\n", curr->comm, curr->pid); lockdep_print_held_locks(curr); + nbcon_cpu_emergency_exit(); }
/* @@@ -6761,7 -6703,6 +6780,7 @@@ void lockdep_rcu_suspicious(const char bool rcu = warn_rcu_enter();
/* Note: the following can be executed concurrently, so be careful. */ + nbcon_cpu_emergency_enter(); pr_warn("\n"); pr_warn("=============================\n"); pr_warn("WARNING: suspicious RCU usage\n"); @@@ -6800,7 -6741,6 +6819,7 @@@ lockdep_print_held_locks(curr); pr_warn("\nstack backtrace:\n"); dump_stack(); + nbcon_cpu_emergency_exit(); warn_rcu_exit(rcu); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); diff --combined kernel/sched/core.c index 43e701f540130,ffcd637dc8e42..5a8446a69b6d3 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@@ -163,7 -163,10 +163,10 @@@ static inline int __task_prio(const str if (p->sched_class == &stop_sched_class) /* trumps deadline */ return -2;
- if (rt_prio(p->prio)) /* includes deadline */ + if (p->dl_server) + return -1; /* deadline */ + + if (rt_or_dl_prio(p->prio)) return p->prio; /* [-1, 99] */
if (p->sched_class == &idle_sched_class) @@@ -192,8 -195,24 +195,24 @@@ static inline bool prio_less(const stru if (-pb < -pa) return false;
- if (pa == -1) /* dl_prio() doesn't work because of stop_class above */ - return !dl_time_before(a->dl.deadline, b->dl.deadline); + if (pa == -1) { /* dl_prio() doesn't work because of stop_class above */ + const struct sched_dl_entity *a_dl, *b_dl; + + a_dl = &a->dl; + /* + * Since 'a' and 'b' can be CFS tasks served by DL server, + * __task_prio() can return -1 (for DL) even for those. In that + * case, get to the dl_server's DL entity. + */ + if (a->dl_server) + a_dl = a->dl_server; + + b_dl = &b->dl; + if (b->dl_server) + b_dl = b->dl_server; + + return !dl_time_before(a_dl->deadline, b_dl->deadline); + }
if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */ return cfs_prio_less(a, b, in_fi); @@@ -1269,7 -1288,7 +1288,7 @@@ bool sched_can_stop_tick(struct rq *rq * dequeued by migrating while the constrained task continues to run. * E.g. going from 2->1 without going through pick_next_task(). */ - if (sched_feat(HZ_BW) && __need_bw_check(rq, rq->curr)) { + if (__need_bw_check(rq, rq->curr)) { if (cfs_task_bw_constrained(rq->curr)) return false; } @@@ -1672,6 -1691,9 +1691,9 @@@ static inline void uclamp_rq_inc(struc if (unlikely(!p->sched_class->uclamp_enabled)) return;
+ if (p->se.sched_delayed) + return; + for_each_clamp_id(clamp_id) uclamp_rq_inc_id(rq, p, clamp_id);
@@@ -1696,6 -1718,9 +1718,9 @@@ static inline void uclamp_rq_dec(struc if (unlikely(!p->sched_class->uclamp_enabled)) return;
+ if (p->se.sched_delayed) + return; + for_each_clamp_id(clamp_id) uclamp_rq_dec_id(rq, p, clamp_id); } @@@ -1975,14 -2000,21 +2000,21 @@@ void enqueue_task(struct rq *rq, struc psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)); }
- uclamp_rq_inc(rq, p); p->sched_class->enqueue_task(rq, p, flags); + /* + * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear + * ->sched_delayed. + */ + uclamp_rq_inc(rq, p);
if (sched_core_enabled(rq)) sched_core_enqueue(rq, p); }
- void dequeue_task(struct rq *rq, struct task_struct *p, int flags) + /* + * Must only return false when DEQUEUE_SLEEP. + */ + inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) { if (sched_core_enabled(rq)) sched_core_dequeue(rq, p, flags); @@@ -1995,8 -2027,12 +2027,12 @@@ psi_dequeue(p, flags & DEQUEUE_SLEEP); }
+ /* + * Must be before ->dequeue_task() because ->dequeue_task() can 'fail' + * and mark the task ->sched_delayed. + */ uclamp_rq_dec(rq, p); - p->sched_class->dequeue_task(rq, p, flags); + return p->sched_class->dequeue_task(rq, p, flags); }
void activate_task(struct rq *rq, struct task_struct *p, int flags) @@@ -2014,12 -2050,25 +2050,25 @@@
void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { - WRITE_ONCE(p->on_rq, (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING); + SCHED_WARN_ON(flags & DEQUEUE_SLEEP); + + WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); ASSERT_EXCLUSIVE_WRITER(p->on_rq);
+ /* + * Code explicitly relies on TASK_ON_RQ_MIGRATING being set *before* + * dequeue_task() and cleared *after* enqueue_task(). + */ + dequeue_task(rq, p, flags); }
+ static void block_task(struct rq *rq, struct task_struct *p, int flags) + { + if (dequeue_task(rq, p, DEQUEUE_SLEEP | flags)) + __block_task(rq, p); + } + /** * task_curr - is this task currently executing on a CPU? * @p: the task in question. @@@ -2233,6 -2282,12 +2282,12 @@@ void migrate_disable(void struct task_struct *p = current;
if (p->migration_disabled) { + #ifdef CONFIG_DEBUG_PREEMPT + /* + * Warn about overflow half-way through the range. + */ + WARN_ON_ONCE((s16)p->migration_disabled < 0); + #endif p->migration_disabled++; return; } @@@ -2251,14 -2306,20 +2306,20 @@@ void migrate_enable(void .flags = SCA_MIGRATE_ENABLE, };
+ #ifdef CONFIG_DEBUG_PREEMPT + /* + * Check both overflow from migrate_disable() and superfluous + * migrate_enable(). + */ + if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) + return; + #endif + if (p->migration_disabled > 1) { p->migration_disabled--; return; }
- if (WARN_ON_ONCE(!p->migration_disabled)) - return; - /* * Ensure stop_task runs either before or after this, and that * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). @@@ -3607,8 -3668,6 +3668,6 @@@ ttwu_do_activate(struct rq *rq, struct rq->idle_stamp = 0; } #endif - - p->dl_server = NULL; }
/* @@@ -3644,12 -3703,14 +3703,14 @@@ static int ttwu_runnable(struct task_st
rq = __task_rq_lock(p, &rf); if (task_on_rq_queued(p)) { + update_rq_clock(rq); + if (p->se.sched_delayed) + enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED); if (!task_on_cpu(rq, p)) { /* * When on_rq && !on_cpu the task is preempted, see if * it should preempt the task that is current now. */ - update_rq_clock(rq); wakeup_preempt(rq, p, wake_flags); } ttwu_do_wakeup(p); @@@ -4029,11 -4090,16 +4090,16 @@@ int try_to_wake_up(struct task_struct * * case the whole 'p->on_rq && ttwu_runnable()' case below * without taking any locks. * + * Specifically, given current runs ttwu() we must be before + * schedule()'s block_task(), as such this must not observe + * sched_delayed. + * * In particular: * - we rely on Program-Order guarantees for all the ordering, * - we're serialized against set_special_state() by virtue of * it disabling IRQs (this allows not taking ->pi_lock). */ + SCHED_WARN_ON(p->se.sched_delayed); if (!ttwu_state_match(p, state, &success)) goto out;
@@@ -4322,9 -4388,11 +4388,11 @@@ static void __sched_fork(unsigned long p->se.nr_migrations = 0; p->se.vruntime = 0; p->se.vlag = 0; - p->se.slice = sysctl_sched_base_slice; INIT_LIST_HEAD(&p->se.group_node);
+ /* A delayed task cannot be in clone(). */ + SCHED_WARN_ON(p->se.sched_delayed); + #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; #endif @@@ -4572,6 -4640,8 +4640,8 @@@ int sched_fork(unsigned long clone_flag
p->prio = p->normal_prio = p->static_prio; set_load_weight(p, false); + p->se.custom_slice = 0; + p->se.slice = sysctl_sched_base_slice;
/* * We don't need the reset flag anymore after the fork. It has @@@ -4686,7 -4756,7 +4756,7 @@@ void wake_up_new_task(struct task_struc update_rq_clock(rq); post_init_entity_util_avg(p);
- activate_task(rq, p, ENQUEUE_NOCLOCK); + activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL); trace_sched_wakeup_new(p); wakeup_preempt(rq, p, WF_FORK); #ifdef CONFIG_SMP @@@ -5769,8 -5839,8 +5839,8 @@@ static inline void schedule_debug(struc schedstat_inc(this_rq()->sched_count); }
- static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, - struct rq_flags *rf) + static void prev_balance(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf) { #ifdef CONFIG_SMP const struct sched_class *class; @@@ -5787,8 -5857,6 +5857,6 @@@ break; } #endif - - put_prev_task(rq, prev); }
/* @@@ -5800,6 -5868,8 +5868,8 @@@ __pick_next_task(struct rq *rq, struct const struct sched_class *class; struct task_struct *p;
+ rq->dl_server = NULL; + /* * Optimization: we know that if all tasks are in the fair class we can * call that function directly, but only if the @prev task wasn't of a @@@ -5815,35 -5885,28 +5885,28 @@@
/* Assume the next prioritized class is idle_sched_class */ if (!p) { - put_prev_task(rq, prev); - p = pick_next_task_idle(rq); + p = pick_task_idle(rq); + put_prev_set_next_task(rq, prev, p); }
- /* - * This is the fast path; it cannot be a DL server pick; - * therefore even if @p == @prev, ->dl_server must be NULL. - */ - if (p->dl_server) - p->dl_server = NULL; - return p; }
restart: - put_prev_task_balance(rq, prev, rf); - - /* - * We've updated @prev and no longer need the server link, clear it. - * Must be done before ->pick_next_task() because that can (re)set - * ->dl_server. - */ - if (prev->dl_server) - prev->dl_server = NULL; + prev_balance(rq, prev, rf);
for_each_class(class) { - p = class->pick_next_task(rq); - if (p) - return p; + if (class->pick_next_task) { + p = class->pick_next_task(rq, prev); + if (p) + return p; + } else { + p = class->pick_task(rq); + if (p) { + put_prev_set_next_task(rq, prev, p); + return p; + } + } }
BUG(); /* The idle class should always have a runnable task. */ @@@ -5873,6 -5936,8 +5936,8 @@@ static inline struct task_struct *pick_ const struct sched_class *class; struct task_struct *p;
+ rq->dl_server = NULL; + for_each_class(class) { p = class->pick_task(rq); if (p) @@@ -5911,6 -5976,7 +5976,7 @@@ pick_next_task(struct rq *rq, struct ta * another cpu during offline. */ rq->core_pick = NULL; + rq->core_dl_server = NULL; return __pick_next_task(rq, prev, rf); }
@@@ -5929,16 -5995,13 +5995,13 @@@ WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
next = rq->core_pick; - if (next != prev) { - put_prev_task(rq, prev); - set_next_task(rq, next); - } - + rq->dl_server = rq->core_dl_server; rq->core_pick = NULL; - goto out; + rq->core_dl_server = NULL; + goto out_set_next; }
- put_prev_task_balance(rq, prev, rf); + prev_balance(rq, prev, rf);
smt_mask = cpu_smt_mask(cpu); need_sync = !!rq->core->core_cookie; @@@ -5979,6 -6042,7 +6042,7 @@@ next = pick_task(rq); if (!next->core_cookie) { rq->core_pick = NULL; + rq->core_dl_server = NULL; /* * For robustness, update the min_vruntime_fi for * unconstrained picks as well. @@@ -6006,7 -6070,9 +6070,9 @@@ if (i != cpu && (rq_i != rq->core || !core_clock_updated)) update_rq_clock(rq_i);
- p = rq_i->core_pick = pick_task(rq_i); + rq_i->core_pick = p = pick_task(rq_i); + rq_i->core_dl_server = rq_i->dl_server; + if (!max || prio_less(max, p, fi_before)) max = p; } @@@ -6030,6 -6096,7 +6096,7 @@@ }
rq_i->core_pick = p; + rq_i->core_dl_server = NULL;
if (p == rq_i->idle) { if (rq_i->nr_running) { @@@ -6090,6 -6157,7 +6157,7 @@@
if (i == cpu) { rq_i->core_pick = NULL; + rq_i->core_dl_server = NULL; continue; }
@@@ -6098,6 -6166,7 +6166,7 @@@
if (rq_i->curr == rq_i->core_pick) { rq_i->core_pick = NULL; + rq_i->core_dl_server = NULL; continue; }
@@@ -6105,8 -6174,7 +6174,7 @@@ }
out_set_next: - set_next_task(rq, next); - out: + put_prev_set_next_task(rq, prev, next); if (rq->core->core_forceidle_count && next == rq->idle) queue_core_balance(rq);
@@@ -6452,13 -6520,15 +6520,15 @@@ static void __sched notrace __schedule( if (signal_pending_state(prev_state, prev)) { WRITE_ONCE(prev->__state, TASK_RUNNING); } else { + int flags = DEQUEUE_NOCLOCK; + prev->sched_contributes_to_load = (prev_state & TASK_UNINTERRUPTIBLE) && !(prev_state & TASK_NOLOAD) && !(prev_state & TASK_FROZEN);
- if (prev->sched_contributes_to_load) - rq->nr_uninterruptible++; + if (unlikely(is_special_task_state(prev_state))) + flags |= DEQUEUE_SPECIAL;
/* * __schedule() ttwu() @@@ -6471,12 -6541,7 +6541,7 @@@ * * After this, schedule() must not care about p->state any more. */ - deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); - - if (prev->in_iowait) { - atomic_inc(&rq->nr_iowait); - delayacct_blkio_start(); - } + block_task(rq, prev, flags); } switch_count = &prev->nvcsw; } @@@ -7405,7 -7470,7 +7470,7 @@@ EXPORT_SYMBOL(io_schedule)
void sched_show_task(struct task_struct *p) { - unsigned long free = 0; + unsigned long free; int ppid;
if (!try_get_task_stack(p)) @@@ -7415,7 -7480,9 +7480,7 @@@
if (task_is_running(p)) pr_cont(" running task "); -#ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); -#endif ppid = 0; rcu_read_lock(); if (pid_alive(p)) @@@ -8226,8 -8293,6 +8291,6 @@@ void __init sched_init(void #endif /* CONFIG_RT_GROUP_SCHED */ }
- init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); - #ifdef CONFIG_SMP init_defrootdomain(); #endif @@@ -8282,8 -8347,13 +8345,13 @@@ init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */
- rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED + /* + * This is required for init cpu because rt.c:__enable_runtime() + * starts working after scheduler_running, which is not the case + * yet. + */ + rq->rt.rt_runtime = global_rt_runtime(); init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif #ifdef CONFIG_SMP @@@ -8315,10 -8385,12 +8383,12 @@@ #endif /* CONFIG_SMP */ hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); + fair_server_init(rq);
#ifdef CONFIG_SCHED_CORE rq->core = rq; rq->core_pick = NULL; + rq->core_dl_server = NULL; rq->core_enabled = 0; rq->core_tree = RB_ROOT; rq->core_forceidle_count = 0; @@@ -8331,6 -8403,7 +8401,7 @@@ }
set_load_weight(&init_task, false); + init_task.se.slice = sysctl_sched_base_slice,
/* * The boot idle thread does lazy MMU switching as well: @@@ -8546,7 -8619,7 +8617,7 @@@ void normalize_rt_tasks(void schedstat_set(p->stats.sleep_start, 0); schedstat_set(p->stats.block_start, 0);
- if (!dl_task(p) && !rt_task(p)) { + if (!rt_or_dl_task(p)) { /* * Renice negative nice level userspace * tasks back to 0: diff --combined kernel/sched/fair.c index a1b756f927b23,11e890486c1b2..503d1c2ffd690 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@@ -511,7 -511,7 +511,7 @@@ static int cfs_rq_is_idle(struct cfs_r
static int se_is_idle(struct sched_entity *se) { - return 0; + return task_has_idle_policy(task_of(se)); }
#endif /* CONFIG_FAIR_GROUP_SCHED */ @@@ -779,8 -779,22 +779,22 @@@ static void update_min_vruntime(struct }
/* ensure we never gain time by being placed backwards. */ - u64_u32_store(cfs_rq->min_vruntime, - __update_min_vruntime(cfs_rq, vruntime)); + cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime); + } + + static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq) + { + struct sched_entity *root = __pick_root_entity(cfs_rq); + struct sched_entity *curr = cfs_rq->curr; + u64 min_slice = ~0ULL; + + if (curr && curr->on_rq) + min_slice = curr->slice; + + if (root) + min_slice = min(min_slice, root->min_slice); + + return min_slice; }
static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) @@@ -799,19 -813,34 +813,34 @@@ static inline void __min_vruntime_updat } }
+ static inline void __min_slice_update(struct sched_entity *se, struct rb_node *node) + { + if (node) { + struct sched_entity *rse = __node_2_se(node); + if (rse->min_slice < se->min_slice) + se->min_slice = rse->min_slice; + } + } + /* * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime) */ static inline bool min_vruntime_update(struct sched_entity *se, bool exit) { u64 old_min_vruntime = se->min_vruntime; + u64 old_min_slice = se->min_slice; struct rb_node *node = &se->run_node;
se->min_vruntime = se->vruntime; __min_vruntime_update(se, node->rb_right); __min_vruntime_update(se, node->rb_left);
- return se->min_vruntime == old_min_vruntime; + se->min_slice = se->slice; + __min_slice_update(se, node->rb_right); + __min_slice_update(se, node->rb_left); + + return se->min_vruntime == old_min_vruntime && + se->min_slice == old_min_slice; }
RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, @@@ -824,6 -853,7 +853,7 @@@ static void __enqueue_entity(struct cfs { avg_vruntime_add(cfs_rq, se); se->min_vruntime = se->vruntime; + se->min_slice = se->slice; rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less, &min_vruntime_cb); } @@@ -974,17 -1004,18 +1004,18 @@@ static void clear_buddies(struct cfs_r * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i * this is probably good enough. */ - static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) + static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) { if ((s64)(se->vruntime - se->deadline) < 0) - return; + return false;
/* * For EEVDF the virtual time slope is determined by w_i (iow. * nice) while the request time r_i is determined by * sysctl_sched_base_slice. */ - se->slice = sysctl_sched_base_slice; + if (!se->custom_slice) + se->slice = sysctl_sched_base_slice;
/* * EEVDF: vd_i = ve_i + r_i / w_i @@@ -994,10 -1025,7 +1025,7 @@@ /* * The task has consumed its request, reschedule. */ - if (cfs_rq->nr_running > 1) { - resched_curr(rq_of(cfs_rq)); - clear_buddies(cfs_rq, se); - } + return true; }
#include "pelt.h" @@@ -1135,6 -1163,38 +1163,38 @@@ static inline void update_curr_task(str dl_server_update(p->dl_server, delta_exec); }
+ static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr) + { + if (!sched_feat(PREEMPT_SHORT)) + return false; + + if (curr->vlag == curr->deadline) + return false; + + return !entity_eligible(cfs_rq, curr); + } + + static inline bool do_preempt_short(struct cfs_rq *cfs_rq, + struct sched_entity *pse, struct sched_entity *se) + { + if (!sched_feat(PREEMPT_SHORT)) + return false; + + if (pse->slice >= se->slice) + return false; + + if (!entity_eligible(cfs_rq, pse)) + return false; + + if (entity_before(pse, se)) + return true; + + if (!entity_eligible(cfs_rq, se)) + return true; + + return false; + } + /* * Used by other classes to account runtime. */ @@@ -1156,23 -1216,44 +1216,44 @@@ s64 update_curr_common(struct rq *rq static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; + struct rq *rq = rq_of(cfs_rq); s64 delta_exec; + bool resched;
if (unlikely(!curr)) return;
- delta_exec = update_curr_se(rq_of(cfs_rq), curr); + delta_exec = update_curr_se(rq, curr); if (unlikely(delta_exec <= 0)) return;
curr->vruntime += calc_delta_fair(delta_exec, curr); - update_deadline(cfs_rq, curr); + resched = update_deadline(cfs_rq, curr); update_min_vruntime(cfs_rq);
- if (entity_is_task(curr)) - update_curr_task(task_of(curr), delta_exec); + if (entity_is_task(curr)) { + struct task_struct *p = task_of(curr); + + update_curr_task(p, delta_exec); + + /* + * Any fair task that runs outside of fair_server should + * account against fair_server such that it can account for + * this time and possibly avoid running this period. + */ + if (p->dl_server != &rq->fair_server) + dl_server_update(&rq->fair_server, delta_exec); + }
account_cfs_rq_runtime(cfs_rq, delta_exec); + + if (rq->nr_running == 1) + return; + + if (resched || did_preempt_short(cfs_rq, curr)) { + resched_curr(rq); + clear_buddies(cfs_rq, curr); + } }
static void update_curr_fair(struct rq *rq) @@@ -1742,7 -1823,7 +1823,7 @@@ static bool pgdat_free_space_enough(str continue;
if (zone_watermark_ok(zone, 0, - wmark_pages(zone, WMARK_PROMO) + enough_wmark, + promo_wmark_pages(zone) + enough_wmark, ZONE_MOVABLE, 0)) return true; } @@@ -1840,7 -1921,8 +1921,7 @@@ bool should_numa_migrate_memory(struct * The pages in slow memory node should be migrated according * to hot/cold instead of private/shared. */ - if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && - !node_is_toptier(src_nid)) { + if (folio_use_access_time(folio)) { struct pglist_data *pgdat; unsigned long rate_limit; unsigned int latency, th, def_th; @@@ -3187,15 -3269,6 +3268,15 @@@ static bool vma_is_accessed(struct mm_s return true; }
+ /* + * This vma has not been accessed for a while, and if the number + * of threads in the same process is low, which means no other + * threads can help scan this vma, force a vma scan. + */ + if (READ_ONCE(mm->numa_scan_seq) > + (vma->numab_state->prev_scan_seq + get_nr_threads(current))) + return true; + return false; }
@@@ -5186,7 -5259,8 +5267,8 @@@ place_entity(struct cfs_rq *cfs_rq, str u64 vslice, vruntime = avg_vruntime(cfs_rq); s64 lag = 0;
- se->slice = sysctl_sched_base_slice; + if (!se->custom_slice) + se->slice = sysctl_sched_base_slice; vslice = calc_delta_fair(se->slice, se);
/* @@@ -5267,6 -5341,12 +5349,12 @@@
se->vruntime = vruntime - lag;
+ if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) { + se->deadline += se->vruntime; + se->rel_deadline = 0; + return; + } + /* * When joining the competition; the existing tasks will be, * on average, halfway through their slice, as such start tasks @@@ -5286,6 -5366,9 +5374,9 @@@ static inline int cfs_rq_throttled(stru
static inline bool cfs_bandwidth_used(void);
+ static void + requeue_delayed_entity(struct sched_entity *se); + static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { @@@ -5373,19 -5456,47 +5464,47 @@@ static void clear_buddies(struct cfs_r
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
- static void + static inline void finish_delayed_dequeue_entity(struct sched_entity *se) + { + se->sched_delayed = 0; + if (sched_feat(DELAY_ZERO) && se->vlag > 0) + se->vlag = 0; + } + + static bool dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - int action = UPDATE_TG; + bool sleep = flags & DEQUEUE_SLEEP; + + update_curr(cfs_rq); + + if (flags & DEQUEUE_DELAYED) { + SCHED_WARN_ON(!se->sched_delayed); + } else { + bool delay = sleep; + /* + * DELAY_DEQUEUE relies on spurious wakeups, special task + * states must not suffer spurious wakeups, exempt them. + */ + if (flags & DEQUEUE_SPECIAL) + delay = false;
+ SCHED_WARN_ON(delay && se->sched_delayed); + + if (sched_feat(DELAY_DEQUEUE) && delay && + !entity_eligible(cfs_rq, se)) { + if (cfs_rq->next == se) + cfs_rq->next = NULL; + update_load_avg(cfs_rq, se, 0); + se->sched_delayed = 1; + return false; + } + } + + int action = UPDATE_TG; if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) action |= DO_DETACH;
- /* - * Update run-time statistics of the 'current'. - */ - update_curr(cfs_rq); - /* * When dequeuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. @@@ -5403,6 -5514,11 +5522,11 @@@ clear_buddies(cfs_rq, se);
update_entity_lag(cfs_rq, se); + if (sched_feat(PLACE_REL_DEADLINE) && !sleep) { + se->deadline -= se->vruntime; + se->rel_deadline = 1; + } + if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->on_rq = 0; @@@ -5422,8 -5538,13 +5546,13 @@@ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) update_min_vruntime(cfs_rq);
+ if (flags & DEQUEUE_DELAYED) + finish_delayed_dequeue_entity(se); + if (cfs_rq->nr_running == 0) update_idle_cfs_rq_clock_pelt(cfs_rq); + + return true; }
static void @@@ -5449,6 -5570,7 +5578,7 @@@ set_next_entity(struct cfs_rq *cfs_rq, }
update_stats_curr_start(cfs_rq, se); + SCHED_WARN_ON(cfs_rq->curr); cfs_rq->curr = se;
/* @@@ -5469,6 -5591,8 +5599,8 @@@ se->prev_sum_exec_runtime = se->sum_exec_runtime; }
+ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags); + /* * Pick the next process, keeping these things in mind, in this order: * 1) keep things fair between processes/task groups @@@ -5477,16 -5601,26 +5609,26 @@@ * 4) do not run the "skip" process, if something else is available */ static struct sched_entity * - pick_next_entity(struct cfs_rq *cfs_rq) + pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) { /* * Enabling NEXT_BUDDY will affect latency but not fairness. */ if (sched_feat(NEXT_BUDDY) && - cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) + cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { + /* ->next will never be delayed */ + SCHED_WARN_ON(cfs_rq->next->sched_delayed); return cfs_rq->next; + }
- return pick_eevdf(cfs_rq); + struct sched_entity *se = pick_eevdf(cfs_rq); + if (se->sched_delayed) { + dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + SCHED_WARN_ON(se->sched_delayed); + SCHED_WARN_ON(se->on_rq); + return NULL; + } + return se; }
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); @@@ -5510,6 -5644,7 +5652,7 @@@ static void put_prev_entity(struct cfs_ /* in !on_rq case, update occurred at dequeue */ update_load_avg(cfs_rq, prev, 0); } + SCHED_WARN_ON(cfs_rq->curr != prev); cfs_rq->curr = NULL; }
@@@ -5773,6 -5908,7 +5916,7 @@@ static bool throttle_cfs_rq(struct cfs_ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta, dequeue = 1; + long rq_h_nr_running = rq->cfs.h_nr_running;
raw_spin_lock(&cfs_b->lock); /* This will start the period timer if necessary */ @@@ -5806,11 -5942,21 +5950,21 @@@ idle_task_delta = cfs_rq->idle_h_nr_running; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); + int flags; + /* throttled entity or throttle-on-deactivate */ if (!se->on_rq) goto done;
- dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); + /* + * Abuse SPECIAL to avoid delayed dequeue in this instance. + * This avoids teaching dequeue_entities() about throttled + * entities and keeps things relatively simple. + */ + flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL; + if (se->sched_delayed) + flags |= DEQUEUE_DELAYED; + dequeue_entity(qcfs_rq, se, flags);
if (cfs_rq_is_idle(group_cfs_rq(se))) idle_task_delta = cfs_rq->h_nr_running; @@@ -5844,6 -5990,9 +5998,9 @@@ /* At this point se is NULL and we are at root level*/ sub_nr_running(rq, task_delta);
+ /* Stop the fair server if throttling resulted in no runnable tasks */ + if (rq_h_nr_running && !rq->cfs.h_nr_running) + dl_server_stop(&rq->fair_server); done: /* * Note: distribution will already see us throttled via the @@@ -5862,6 -6011,7 +6019,7 @@@ void unthrottle_cfs_rq(struct cfs_rq *c struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta; + long rq_h_nr_running = rq->cfs.h_nr_running;
se = cfs_rq->tg->se[cpu_of(rq)];
@@@ -5899,8 -6049,10 +6057,10 @@@ for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- if (se->on_rq) + if (se->on_rq) { + SCHED_WARN_ON(se->sched_delayed); break; + } enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
if (cfs_rq_is_idle(group_cfs_rq(se))) @@@ -5931,6 -6083,10 +6091,10 @@@ goto unthrottle_throttle; }
+ /* Start the fair server if un-throttling resulted in new runnable tasks */ + if (!rq_h_nr_running && rq->cfs.h_nr_running) + dl_server_start(&rq->fair_server); + /* At this point se is NULL and we are at root level*/ add_nr_running(rq, task_delta);
@@@ -6563,7 -6719,7 +6727,7 @@@ static void sched_fair_update_stop_tick { int cpu = cpu_of(rq);
- if (!sched_feat(HZ_BW) || !cfs_bandwidth_used()) + if (!cfs_bandwidth_used()) return;
if (!tick_nohz_full_cpu(cpu)) @@@ -6746,6 -6902,37 +6910,37 @@@ static int sched_idle_cpu(int cpu } #endif
+ static void + requeue_delayed_entity(struct sched_entity *se) + { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* + * se->sched_delayed should imply: se->on_rq == 1. + * Because a delayed entity is one that is still on + * the runqueue competing until eligibility. + */ + SCHED_WARN_ON(!se->sched_delayed); + SCHED_WARN_ON(!se->on_rq); + + if (sched_feat(DELAY_ZERO)) { + update_entity_lag(cfs_rq, se); + if (se->vlag > 0) { + cfs_rq->nr_running--; + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + se->vlag = 0; + place_entity(cfs_rq, se, 0); + if (se != cfs_rq->curr) + __enqueue_entity(cfs_rq, se); + cfs_rq->nr_running++; + } + } + + update_load_avg(cfs_rq, se, 0); + se->sched_delayed = 0; + } + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@@ -6758,6 -6945,13 +6953,13 @@@ enqueue_task_fair(struct rq *rq, struc struct sched_entity *se = &p->se; int idle_h_nr_running = task_has_idle_policy(p); int task_new = !(flags & ENQUEUE_WAKEUP); + int rq_h_nr_running = rq->cfs.h_nr_running; + u64 slice = 0; + + if (flags & ENQUEUE_DELAYED) { + requeue_delayed_entity(se); + return; + }
/* * The code below (indirectly) updates schedutil which looks at @@@ -6776,10 -6970,24 +6978,24 @@@ cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
for_each_sched_entity(se) { - if (se->on_rq) + if (se->on_rq) { + if (se->sched_delayed) + requeue_delayed_entity(se); break; + } cfs_rq = cfs_rq_of(se); + + /* + * Basically set the slice of group entries to the min_slice of + * their respective cfs_rq. This ensures the group can service + * its entities in the desired time-frame. + */ + if (slice) { + se->slice = slice; + se->custom_slice = 1; + } enqueue_entity(cfs_rq, se, flags); + slice = cfs_rq_min_slice(cfs_rq);
cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; @@@ -6801,6 -7009,9 +7017,9 @@@ se_update_runnable(se); update_cfs_group(se);
+ se->slice = slice; + slice = cfs_rq_min_slice(cfs_rq); + cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running;
@@@ -6812,6 -7023,13 +7031,13 @@@ goto enqueue_throttle; }
+ if (!rq_h_nr_running && rq->cfs.h_nr_running) { + /* Account for idle runtime */ + if (!rq->nr_running) + dl_server_update_idle_time(rq, rq->curr); + dl_server_start(&rq->fair_server); + } + /* At this point se is NULL and we are at root level*/ add_nr_running(rq, 1);
@@@ -6841,36 -7059,59 +7067,59 @@@ enqueue_throttle static void set_next_buddy(struct sched_entity *se);
/* - * The dequeue_task method is called before nr_running is - * decreased. We remove the task from the rbtree and - * update the fair scheduling stats: + * Basically dequeue_task_fair(), except it can deal with dequeue_entity() + * failing half-way through and resume the dequeue later. + * + * Returns: + * -1 - dequeue delayed + * 0 - dequeue throttled + * 1 - dequeue complete */ - static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) { - struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se; - int task_sleep = flags & DEQUEUE_SLEEP; - int idle_h_nr_running = task_has_idle_policy(p); bool was_sched_idle = sched_idle_rq(rq); + int rq_h_nr_running = rq->cfs.h_nr_running; + bool task_sleep = flags & DEQUEUE_SLEEP; + bool task_delayed = flags & DEQUEUE_DELAYED; + struct task_struct *p = NULL; + int idle_h_nr_running = 0; + int h_nr_running = 0; + struct cfs_rq *cfs_rq; + u64 slice = 0;
- util_est_dequeue(&rq->cfs, p); + if (entity_is_task(se)) { + p = task_of(se); + h_nr_running = 1; + idle_h_nr_running = task_has_idle_policy(p); + } else { + cfs_rq = group_cfs_rq(se); + slice = cfs_rq_min_slice(cfs_rq); + }
for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - dequeue_entity(cfs_rq, se, flags);
- cfs_rq->h_nr_running--; + if (!dequeue_entity(cfs_rq, se, flags)) { + if (p && &p->se == se) + return -1; + + break; + } + + cfs_rq->h_nr_running -= h_nr_running; cfs_rq->idle_h_nr_running -= idle_h_nr_running;
if (cfs_rq_is_idle(cfs_rq)) - idle_h_nr_running = 1; + idle_h_nr_running = h_nr_running;
/* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) - goto dequeue_throttle; + return 0;
/* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { + slice = cfs_rq_min_slice(cfs_rq); + /* Avoid re-evaluating load for this entity: */ se = parent_entity(se); /* @@@ -6882,6 -7123,7 +7131,7 @@@ break; } flags |= DEQUEUE_SLEEP; + flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL); }
for_each_sched_entity(se) { @@@ -6891,28 -7133,60 +7141,60 @@@ se_update_runnable(se); update_cfs_group(se);
- cfs_rq->h_nr_running--; + se->slice = slice; + slice = cfs_rq_min_slice(cfs_rq); + + cfs_rq->h_nr_running -= h_nr_running; cfs_rq->idle_h_nr_running -= idle_h_nr_running;
if (cfs_rq_is_idle(cfs_rq)) - idle_h_nr_running = 1; + idle_h_nr_running = h_nr_running;
/* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) - goto dequeue_throttle; - + return 0; }
- /* At this point se is NULL and we are at root level*/ - sub_nr_running(rq, 1); + sub_nr_running(rq, h_nr_running); + + if (rq_h_nr_running && !rq->cfs.h_nr_running) + dl_server_stop(&rq->fair_server);
/* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) rq->next_balance = jiffies;
- dequeue_throttle: - util_est_update(&rq->cfs, p, task_sleep); + if (p && task_delayed) { + SCHED_WARN_ON(!task_sleep); + SCHED_WARN_ON(p->on_rq != 1); + + /* Fix-up what dequeue_task_fair() skipped */ + hrtick_update(rq); + + /* Fix-up what block_task() skipped. */ + __block_task(rq, p); + } + + return 1; + } + + /* + * The dequeue_task method is called before nr_running is + * decreased. We remove the task from the rbtree and + * update the fair scheduling stats: + */ + static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + { + util_est_dequeue(&rq->cfs, p); + + if (dequeue_entities(rq, &p->se, flags) < 0) { + util_est_update(&rq->cfs, p, DEQUEUE_SLEEP); + return false; + } + + util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); hrtick_update(rq); + return true; }
#ifdef CONFIG_SMP @@@ -8294,7 -8568,21 +8576,21 @@@ static void migrate_task_rq_fair(struc
static void task_dead_fair(struct task_struct *p) { - remove_entity_load_avg(&p->se); + struct sched_entity *se = &p->se; + + if (se->sched_delayed) { + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + if (se->sched_delayed) { + update_rq_clock(rq); + dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + } + task_rq_unlock(rq, p, &rf); + } + + remove_entity_load_avg(se); }
/* @@@ -8330,7 -8618,7 +8626,7 @@@ static void set_cpus_allowed_fair(struc static int balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - if (rq->nr_running) + if (sched_fair_runnable(rq)) return 1;
return sched_balance_newidle(rq, rf) != 0; @@@ -8389,16 -8677,7 +8685,7 @@@ static void check_preempt_wakeup_fair(s if (test_tsk_need_resched(curr)) return;
- /* Idle tasks are by definition preempted by non-idle tasks. */ - if (unlikely(task_has_idle_policy(curr)) && - likely(!task_has_idle_policy(p))) - goto preempt; - - /* - * Batch and idle tasks do not preempt non-idle tasks (their preemption - * is driven by the tick): - */ - if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) + if (!sched_feat(WAKEUP_PREEMPTION)) return;
find_matching_se(&se, &pse); @@@ -8408,7 -8687,7 +8695,7 @@@ pse_is_idle = se_is_idle(pse);
/* - * Preempt an idle group in favor of a non-idle group (and don't preempt + * Preempt an idle entity in favor of a non-idle entity (and don't preempt * in the inverse case). */ if (cse_is_idle && !pse_is_idle) @@@ -8416,11 -8695,26 +8703,26 @@@ if (cse_is_idle != pse_is_idle) return;
+ /* + * BATCH and IDLE tasks do not preempt others. + */ + if (unlikely(p->policy != SCHED_NORMAL)) + return; + cfs_rq = cfs_rq_of(se); update_curr(cfs_rq); + /* + * If @p has a shorter slice than current and @p is eligible, override + * current's slice protection in order to allow preemption. + * + * Note that even if @p does not turn out to be the most eligible + * task at this moment, current's slice protection will be lost. + */ + if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline) + se->vlag = se->deadline + 1;
/* - * XXX pick_eevdf(cfs_rq) != se ? + * If @p has become the most eligible task, force preemption. */ if (pick_eevdf(cfs_rq) == pse) goto preempt; @@@ -8431,7 -8725,6 +8733,6 @@@ preempt resched_curr(rq); }
- #ifdef CONFIG_SMP static struct task_struct *pick_task_fair(struct rq *rq) { struct sched_entity *se; @@@ -8443,95 -8736,58 +8744,58 @@@ again return NULL;
do { - struct sched_entity *curr = cfs_rq->curr; + /* Might not have done put_prev_entity() */ + if (cfs_rq->curr && cfs_rq->curr->on_rq) + update_curr(cfs_rq);
- /* When we pick for a remote RQ, we'll not have done put_prev_entity() */ - if (curr) { - if (curr->on_rq) - update_curr(cfs_rq); - else - curr = NULL; + if (unlikely(check_cfs_rq_runtime(cfs_rq))) + goto again;
- if (unlikely(check_cfs_rq_runtime(cfs_rq))) - goto again; - } - - se = pick_next_entity(cfs_rq); + se = pick_next_entity(rq, cfs_rq); + if (!se) + goto again; cfs_rq = group_cfs_rq(se); } while (cfs_rq);
return task_of(se); } - #endif + + static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); + static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
struct task_struct * pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; struct task_struct *p; int new_tasks;
again: - if (!sched_fair_runnable(rq)) + p = pick_task_fair(rq); + if (!p) goto idle; + se = &p->se;
#ifdef CONFIG_FAIR_GROUP_SCHED - if (!prev || prev->sched_class != &fair_sched_class) + if (prev->sched_class != &fair_sched_class) goto simple;
+ __put_prev_set_next_dl_server(rq, prev, p); + /* * Because of the set_next_buddy() in dequeue_task_fair() it is rather * likely that a next task is from the same cgroup as the current. * * Therefore attempt to avoid putting and setting the entire cgroup * hierarchy, only change the part that actually changes. - */ - - do { - struct sched_entity *curr = cfs_rq->curr; - - /* - * Since we got here without doing put_prev_entity() we also - * have to consider cfs_rq->curr. If it is still a runnable - * entity, update_curr() will update its vruntime, otherwise - * forget we've ever seen it. - */ - if (curr) { - if (curr->on_rq) - update_curr(cfs_rq); - else - curr = NULL; - - /* - * This call to check_cfs_rq_runtime() will do the - * throttle and dequeue its entity in the parent(s). - * Therefore the nr_running test will indeed - * be correct. - */ - if (unlikely(check_cfs_rq_runtime(cfs_rq))) { - cfs_rq = &rq->cfs; - - if (!cfs_rq->nr_running) - goto idle; - - goto simple; - } - } - - se = pick_next_entity(cfs_rq); - cfs_rq = group_cfs_rq(se); - } while (cfs_rq); - - p = task_of(se); - - /* + * * Since we haven't yet done put_prev_entity and if the selected task * is a different task than we started out with, try and touch the * least amount of cfs_rqs. */ if (prev != p) { struct sched_entity *pse = &prev->se; + struct cfs_rq *cfs_rq;
while (!(cfs_rq = is_same_group(se, pse))) { int se_depth = se->depth; @@@ -8549,38 -8805,15 +8813,15 @@@
put_prev_entity(cfs_rq, pse); set_next_entity(cfs_rq, se); - }
- goto done; - simple: - #endif - if (prev) - put_prev_task(rq, prev); - - do { - se = pick_next_entity(cfs_rq); - set_next_entity(cfs_rq, se); - cfs_rq = group_cfs_rq(se); - } while (cfs_rq); + __set_next_task_fair(rq, p, true); + }
- p = task_of(se); + return p;
- done: __maybe_unused; - #ifdef CONFIG_SMP - /* - * Move the next running task to the front of - * the list, so our cfs_tasks list becomes MRU - * one. - */ - list_move(&p->se.group_node, &rq->cfs_tasks); + simple: #endif - - if (hrtick_enabled_fair(rq)) - hrtick_start_fair(rq, p); - - update_misfit_status(p, rq); - sched_fair_update_stop_tick(rq, p); - + put_prev_set_next_task(rq, prev, p); return p;
idle: @@@ -8609,15 -8842,34 +8850,34 @@@ return NULL; }
- static struct task_struct *__pick_next_task_fair(struct rq *rq) + static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev) + { + return pick_next_task_fair(rq, prev, NULL); + } + + static bool fair_server_has_tasks(struct sched_dl_entity *dl_se) + { + return !!dl_se->rq->cfs.nr_running; + } + + static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) + { + return pick_task_fair(dl_se->rq); + } + + void fair_server_init(struct rq *rq) { - return pick_next_task_fair(rq, NULL, NULL); + struct sched_dl_entity *dl_se = &rq->fair_server; + + init_dl_entity(dl_se); + + dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task); }
/* * Account for a descheduled task: */ - static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) + static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct task_struct *next) { struct sched_entity *se = &prev->se; struct cfs_rq *cfs_rq; @@@ -12491,7 -12743,7 +12751,7 @@@ out * - indirectly from a remote scheduler_tick() for NOHZ idle balancing * through the SMP cross-call nohz_csd_func() */ - static __latent_entropy void sched_balance_softirq(struct softirq_action *h) + static __latent_entropy void sched_balance_softirq(void) { struct rq *this_rq = this_rq(); enum cpu_idle_type idle = this_rq->idle_balance; @@@ -12710,22 -12962,7 +12970,7 @@@ static void task_tick_fair(struct rq *r */ static void task_fork_fair(struct task_struct *p) { - struct sched_entity *se = &p->se, *curr; - struct cfs_rq *cfs_rq; - struct rq *rq = this_rq(); - struct rq_flags rf; - - rq_lock(rq, &rf); - update_rq_clock(rq); - set_task_max_allowed_capacity(p); - - cfs_rq = task_cfs_rq(current); - curr = cfs_rq->curr; - if (curr) - update_curr(cfs_rq); - place_entity(cfs_rq, se, ENQUEUE_INITIAL); - rq_unlock(rq, &rf); }
/* @@@ -12837,10 -13074,28 +13082,28 @@@ static void attach_task_cfs_rq(struct t static void switched_from_fair(struct rq *rq, struct task_struct *p) { detach_task_cfs_rq(p); + /* + * Since this is called after changing class, this is a little weird + * and we cannot use DEQUEUE_DELAYED. + */ + if (p->se.sched_delayed) { + /* First, dequeue it from its new class' structures */ + dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP); + /* + * Now, clean up the fair_sched_class side of things + * related to sched_delayed being true and that wasn't done + * due to the generic dequeue not using DEQUEUE_DELAYED. + */ + finish_delayed_dequeue_entity(&p->se); + p->se.rel_deadline = 0; + __block_task(rq, p); + } }
static void switched_to_fair(struct rq *rq, struct task_struct *p) { + SCHED_WARN_ON(p->se.sched_delayed); + attach_task_cfs_rq(p);
set_task_max_allowed_capacity(p); @@@ -12858,12 -13113,7 +13121,7 @@@ } }
- /* Account for a task changing its policy or group. - * - * This routine is mostly called to set cfs_rq->curr field when a task - * migrates between groups/classes. - */ - static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) + static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) { struct sched_entity *se = &p->se;
@@@ -12876,6 -13126,27 +13134,27 @@@ list_move(&se->group_node, &rq->cfs_tasks); } #endif + if (!first) + return; + + SCHED_WARN_ON(se->sched_delayed); + + if (hrtick_enabled_fair(rq)) + hrtick_start_fair(rq, p); + + update_misfit_status(p, rq); + sched_fair_update_stop_tick(rq, p); + } + + /* + * Account for a task changing its policy or group. + * + * This routine is mostly called to set cfs_rq->curr field when a task + * migrates between groups/classes. + */ + static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) + { + struct sched_entity *se = &p->se;
for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@@ -12884,12 -13155,14 +13163,14 @@@ /* ensure bandwidth has been allocated on our new cfs_rq */ account_cfs_rq_runtime(cfs_rq, 0); } + + __set_next_task_fair(rq, p, first); }
void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT_CACHED; - u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20))); + cfs_rq->min_vruntime = (u64)(-(1LL << 20)); #ifdef CONFIG_SMP raw_spin_lock_init(&cfs_rq->removed.lock); #endif @@@ -12991,28 -13264,35 +13272,35 @@@ void online_fair_sched_group(struct tas
void unregister_fair_sched_group(struct task_group *tg) { - unsigned long flags; - struct rq *rq; int cpu;
destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
for_each_possible_cpu(cpu) { - if (tg->se[cpu]) - remove_entity_load_avg(tg->se[cpu]); + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; + struct sched_entity *se = tg->se[cpu]; + struct rq *rq = cpu_rq(cpu); + + if (se) { + if (se->sched_delayed) { + guard(rq_lock_irqsave)(rq); + if (se->sched_delayed) { + update_rq_clock(rq); + dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + } + list_del_leaf_cfs_rq(cfs_rq); + } + remove_entity_load_avg(se); + }
/* * Only empty task groups can be destroyed; so we can speculatively * check on_list without danger of it being re-added. */ - if (!tg->cfs_rq[cpu]->on_list) - continue; - - rq = cpu_rq(cpu); - - raw_spin_rq_lock_irqsave(rq, flags); - list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); - raw_spin_rq_unlock_irqrestore(rq, flags); + if (cfs_rq->on_list) { + guard(rq_lock_irqsave)(rq); + list_del_leaf_cfs_rq(cfs_rq); + } } }
@@@ -13202,13 -13482,13 +13490,13 @@@ DEFINE_SCHED_CLASS(fair) =
.wakeup_preempt = check_preempt_wakeup_fair,
+ .pick_task = pick_task_fair, .pick_next_task = __pick_next_task_fair, .put_prev_task = put_prev_task_fair, .set_next_task = set_next_task_fair,
#ifdef CONFIG_SMP .balance = balance_fair, - .pick_task = pick_task_fair, .select_task_rq = select_task_rq_fair, .migrate_task_rq = migrate_task_rq_fair,
diff --combined kernel/trace/bpf_trace.c index 98e395f1baae2,ac0a01cc8634a..14719c0c116b7 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@@ -24,6 -24,7 +24,6 @@@ #include <linux/key.h> #include <linux/verification.h> #include <linux/namei.h> -#include <linux/fileattr.h>
#include <net/bpf_sk_storage.h>
@@@ -797,6 -798,29 +797,6 @@@ const struct bpf_func_proto bpf_task_pt .ret_btf_id = &bpf_task_pt_regs_ids[0], };
-BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) -{ - struct bpf_array *array = container_of(map, struct bpf_array, map); - struct cgroup *cgrp; - - if (unlikely(idx >= array->map.max_entries)) - return -E2BIG; - - cgrp = READ_ONCE(array->ptrs[idx]); - if (unlikely(!cgrp)) - return -EAGAIN; - - return task_under_cgroup_hierarchy(current, cgrp); -} - -static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { - .func = bpf_current_task_under_cgroup, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_ANYTHING, -}; - struct send_signal_irq_work { struct irq_work irq_work; struct task_struct *task; @@@ -1415,6 -1439,73 +1415,6 @@@ static int __init bpf_key_sig_kfuncs_in late_initcall(bpf_key_sig_kfuncs_init); #endif /* CONFIG_KEYS */
-/* filesystem kfuncs */ -__bpf_kfunc_start_defs(); - -/** - * bpf_get_file_xattr - get xattr of a file - * @file: file to get xattr from - * @name__str: name of the xattr - * @value_p: output buffer of the xattr value - * - * Get xattr *name__str* of *file* and store the output in *value_ptr*. - * - * For security reasons, only *name__str* with prefix "user." is allowed. - * - * Return: 0 on success, a negative value on error. - */ -__bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str, - struct bpf_dynptr *value_p) -{ - struct bpf_dynptr_kern *value_ptr = (struct bpf_dynptr_kern *)value_p; - struct dentry *dentry; - u32 value_len; - void *value; - int ret; - - if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) - return -EPERM; - - value_len = __bpf_dynptr_size(value_ptr); - value = __bpf_dynptr_data_rw(value_ptr, value_len); - if (!value) - return -EINVAL; - - dentry = file_dentry(file); - ret = inode_permission(&nop_mnt_idmap, dentry->d_inode, MAY_READ); - if (ret) - return ret; - return __vfs_getxattr(dentry, dentry->d_inode, name__str, value, value_len); -} - -__bpf_kfunc_end_defs(); - -BTF_KFUNCS_START(fs_kfunc_set_ids) -BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) -BTF_KFUNCS_END(fs_kfunc_set_ids) - -static int bpf_get_file_xattr_filter(const struct bpf_prog *prog, u32 kfunc_id) -{ - if (!btf_id_set8_contains(&fs_kfunc_set_ids, kfunc_id)) - return 0; - - /* Only allow to attach from LSM hooks, to avoid recursion */ - return prog->type != BPF_PROG_TYPE_LSM ? 
-EACCES : 0; -} - -static const struct btf_kfunc_id_set bpf_fs_kfunc_set = { - .owner = THIS_MODULE, - .set = &fs_kfunc_set_ids, - .filter = bpf_get_file_xattr_filter, -}; - -static int __init bpf_fs_kfuncs_init(void) -{ - return register_btf_kfunc_id_set(BPF_PROG_TYPE_LSM, &bpf_fs_kfunc_set); -} - -late_initcall(bpf_fs_kfuncs_init); - static const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@@ -1457,6 -1548,8 +1457,6 @@@ return &bpf_get_numa_node_id_proto; case BPF_FUNC_perf_event_read: return &bpf_perf_event_read_proto; - case BPF_FUNC_current_task_under_cgroup: - return &bpf_current_task_under_cgroup_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_probe_write_user: @@@ -1485,8 -1578,6 +1485,8 @@@ return &bpf_cgrp_storage_get_proto; case BPF_FUNC_cgrp_storage_delete: return &bpf_cgrp_storage_delete_proto; + case BPF_FUNC_current_task_under_cgroup: + return &bpf_current_task_under_cgroup_proto; #endif case BPF_FUNC_send_signal: return &bpf_send_signal_proto; @@@ -3069,6 -3160,7 +3069,7 @@@ struct bpf_uprobe loff_t offset; unsigned long ref_ctr_offset; u64 cookie; + struct uprobe *uprobe; struct uprobe_consumer consumer; };
@@@ -3087,15 -3179,15 +3088,15 @@@ struct bpf_uprobe_multi_run_ctx struct bpf_uprobe *uprobe; };
- static void bpf_uprobe_unregister(struct path *path, struct bpf_uprobe *uprobes, - u32 cnt) + static void bpf_uprobe_unregister(struct bpf_uprobe *uprobes, u32 cnt) { u32 i;
- for (i = 0; i < cnt; i++) { - uprobe_unregister(d_real_inode(path->dentry), uprobes[i].offset, - &uprobes[i].consumer); - } + for (i = 0; i < cnt; i++) + uprobe_unregister_nosync(uprobes[i].uprobe, &uprobes[i].consumer); + + if (cnt) + uprobe_unregister_sync(); }
static void bpf_uprobe_multi_link_release(struct bpf_link *link) @@@ -3103,7 -3195,7 +3104,7 @@@ struct bpf_uprobe_multi_link *umulti_link;
umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); - bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt); + bpf_uprobe_unregister(umulti_link->uprobes, umulti_link->cnt); if (umulti_link->task) put_task_struct(umulti_link->task); path_put(&umulti_link->path); @@@ -3207,7 -3299,7 +3208,7 @@@ static int uprobe_prog_run(struct bpf_u struct bpf_run_ctx *old_run_ctx; int err = 0;
- if (link->task && current->mm != link->task->mm) + if (link->task && !same_thread_group(current, link->task)) return 0;
if (sleepable) @@@ -3231,8 -3323,7 +3232,7 @@@ }
static bool - uprobe_multi_link_filter(struct uprobe_consumer *con, enum uprobe_filter_ctx ctx, - struct mm_struct *mm) + uprobe_multi_link_filter(struct uprobe_consumer *con, struct mm_struct *mm) { struct bpf_uprobe *uprobe;
@@@ -3389,22 -3480,26 +3389,26 @@@ int bpf_uprobe_multi_link_attach(const &bpf_uprobe_multi_link_lops, prog);
for (i = 0; i < cnt; i++) { - err = uprobe_register_refctr(d_real_inode(link->path.dentry), - uprobes[i].offset, - uprobes[i].ref_ctr_offset, - &uprobes[i].consumer); - if (err) { - bpf_uprobe_unregister(&path, uprobes, i); - goto error_free; + uprobes[i].uprobe = uprobe_register(d_real_inode(link->path.dentry), + uprobes[i].offset, + uprobes[i].ref_ctr_offset, + &uprobes[i].consumer); + if (IS_ERR(uprobes[i].uprobe)) { + err = PTR_ERR(uprobes[i].uprobe); + link->cnt = i; + goto error_unregister; } }
err = bpf_link_prime(&link->link, &link_primer); if (err) - goto error_free; + goto error_unregister;
return bpf_link_settle(&link_primer);
+ error_unregister: + bpf_uprobe_unregister(uprobes, link->cnt); + error_free: kvfree(uprobes); kfree(link); diff --combined lib/Kconfig.debug index 7d4bd97c99b41,26354671b37df..1256180bee31c --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@@ -97,7 -97,7 +97,7 @@@ config BOOT_PRINTK_DELA using "boot_delay=N".
It is likely that you would also need to use "lpj=M" to preset - the "loops per jiffie" value. + the "loops per jiffy" value. See a previous boot log for the "lpj" value to use for your system, and then set "lpj=M" before setting "boot_delay=N". NOTE: Using this option may adversely affect SMP systems. @@@ -1515,7 -1515,7 +1515,7 @@@ config LOCKDEP_BIT config LOCKDEP_CHAINS_BITS int "Bitsize for MAX_LOCKDEP_CHAINS" depends on LOCKDEP && !LOCKDEP_SMALL - range 10 30 + range 10 21 default 16 help Try increasing this value if you hit "BUG: MAX_LOCKDEP_CHAINS too low!" message. @@@ -2019,7 -2019,7 +2019,7 @@@ config FAULT_INJECTIO depends on DEBUG_KERNEL help Provide fault-injection framework. - For more details, see Documentation/fault-injection/. + For more details, see Documentation/dev-tools/fault-injection/.
config FAILSLAB bool "Fault-injection capability for kmalloc" @@@ -2173,6 -2173,14 +2173,14 @@@ config KCOV_IRQ_AREA_SIZ soft interrupts. This specifies the size of those areas in the number of unsigned long words.
+ config KCOV_SELFTEST + bool "Perform short selftests on boot" + depends on KCOV + help + Run short KCOV coverage collection selftests on boot. + On test failure, causes the kernel to panic. Recommended to be + enabled, ensuring critical functionality works as intended. + menuconfig RUNTIME_TESTING_MENU bool "Runtime Testing" default y @@@ -2225,7 -2233,7 +2233,7 @@@ config LKDT called lkdtm.
Documentation on how to use the module can be found in - Documentation/fault-injection/provoke-crashes.rst + Documentation/dev-tools/fault-injection/provoke-crashes.rst
config CPUMASK_KUNIT_TEST tristate "KUnit test for cpumask" if !KUNIT_ALL_TESTS @@@ -2280,16 -2288,6 +2288,16 @@@ config TEST_DIV6
If unsure, say N.
+config TEST_MULDIV64 + tristate "mul_u64_u64_div_u64() test" + depends on DEBUG_KERNEL || m + help + Enable this to turn on 'mul_u64_u64_div_u64()' function test. + This test is executed only once during system boot (so affects + only boot time), or at module load time. + + If unsure, say N. + config TEST_IOV_ITER tristate "Test iov_iter operation" if !KUNIT_ALL_TESTS depends on KUNIT @@@ -2626,7 -2624,6 +2634,7 @@@ config RESOURCE_KUNIT_TES tristate "KUnit test for resource API" if !KUNIT_ALL_TESTS depends on KUNIT default KUNIT_ALL_TESTS + select GET_FREE_REGION help This builds the resource API unit test. Tests the logic of API provided by resource.c and ioport.h. diff --combined mm/damon/core.c index a83f3b736d51e,94fe2f1f9b0e3..c725c78b43f01 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@@ -552,13 -552,7 +552,13 @@@ static unsigned int damon_accesses_bp_t return accesses_bp * damon_max_nr_accesses(attrs) / 10000; }
-/* convert nr_accesses to access ratio in bp (per 10,000) */ +/* + * Convert nr_accesses to access ratio in bp (per 10,000). + * + * Callers should ensure attrs.aggr_interval is not zero, like + * damon_update_monitoring_results() does. Otherwise, divide-by-zero would + * happen. + */ static unsigned int damon_nr_accesses_to_accesses_bp( unsigned int nr_accesses, struct damon_attrs *attrs) { @@@ -1588,16 -1582,13 +1588,16 @@@ static void damos_adjust_quota(struct d return;
/* Fill up the score histogram */ - memset(quota->histogram, 0, sizeof(quota->histogram)); + memset(c->regions_score_histogram, 0, + sizeof(*c->regions_score_histogram) * + (DAMOS_MAX_SCORE + 1)); damon_for_each_target(t, c) { damon_for_each_region(r, t) { if (!__damos_valid_target(r, s)) continue; score = c->ops.get_scheme_score(c, t, r, s); - quota->histogram[score] += damon_sz_region(r); + c->regions_score_histogram[score] += + damon_sz_region(r); if (score > max_score) max_score = score; } @@@ -1605,7 -1596,7 +1605,7 @@@
/* Set the min score limit */ for (cumulated_sz = 0, score = max_score; ; score--) { - cumulated_sz += quota->histogram[score]; + cumulated_sz += c->regions_score_histogram[score]; if (cumulated_sz >= quota->esz || !score) break; } @@@ -1896,7 -1887,7 +1896,7 @@@ static void kdamond_usleep(unsigned lon if (usecs > 20 * USEC_PER_MSEC) schedule_timeout_idle(usecs_to_jiffies(usecs)); else - usleep_idle_range(usecs, usecs + 1); + usleep_range_idle(usecs, usecs + 1); }
/* Returns negative error code if it's not activated but should return */ @@@ -1966,10 -1957,6 +1966,10 @@@ static int kdamond_fn(void *data ctx->ops.init(ctx); if (ctx->callback.before_start && ctx->callback.before_start(ctx)) goto done; + ctx->regions_score_histogram = kmalloc_array(DAMOS_MAX_SCORE + 1, + sizeof(*ctx->regions_score_histogram), GFP_KERNEL); + if (!ctx->regions_score_histogram) + goto done;
sz_limit = damon_region_sz_limit(ctx);
@@@ -2047,7 -2034,6 +2047,7 @@@ done ctx->callback.before_terminate(ctx); if (ctx->ops.cleanup) ctx->ops.cleanup(ctx); + kfree(ctx->regions_score_histogram);
pr_debug("kdamond (%d) finishes\n", current->pid); mutex_lock(&ctx->kdamond_lock); @@@ -2219,4 -2205,4 +2219,4 @@@ static int __init damon_init(void
subsys_initcall(damon_init);
-#include "core-test.h" +#include "tests/core-kunit.h" diff --combined mm/page-writeback.c index f5448311c89eb,7a04cb1918fd5..fcd4c1439cb9c --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@@ -418,7 -418,7 +418,7 @@@ static void domain_dirty_limits(struct bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;
tsk = current; - if (rt_task(tsk)) { + if (rt_or_dl_task(tsk)) { bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; } @@@ -477,7 -477,7 +477,7 @@@ static unsigned long node_dirty_limit(s else dirty = vm_dirty_ratio * node_memory / 100;
- if (rt_task(tsk)) + if (rt_or_dl_task(tsk)) dirty += dirty / 4;
/* @@@ -2612,7 -2612,7 +2612,7 @@@ struct folio *writeback_iter(struct add
done: if (wbc->range_cyclic) - mapping->writeback_index = folio->index + folio_nr_pages(folio); + mapping->writeback_index = folio_next_index(folio); folio_batch_release(&wbc->fbatch); return NULL; } diff --combined mm/page_alloc.c index aebc4529d5fcd,0aefae4a26b20..0f33dab6d344f --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@@ -286,7 -286,9 +286,7 @@@ EXPORT_SYMBOL(nr_online_nodes) #endif
static bool page_contains_unaccepted(struct page *page, unsigned int order); -static void accept_page(struct page *page, unsigned int order); static bool cond_accept_memory(struct zone *zone, unsigned int order); -static inline bool has_unaccepted_memory(void); static bool __free_unaccepted(struct page *page);
int page_group_by_mobility_disabled __read_mostly; @@@ -320,11 -322,6 +320,11 @@@ static inline bool deferred_pages_enabl { return false; } + +static inline bool _deferred_grow_zone(struct zone *zone, unsigned int order) +{ + return false; +} #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
/* Return a pointer to the bitmap storing bits affecting a block of pages */ @@@ -961,9 -958,8 +961,9 @@@ static int free_tail_page_prepare(struc break; case 2: /* the second tail page: deferred_list overlaps ->mapping */ - if (unlikely(!list_empty(&folio->_deferred_list))) { - bad_page(page, "on deferred list"); + if (unlikely(!list_empty(&folio->_deferred_list) && + folio_test_partially_mapped(folio))) { + bad_page(page, "partially mapped folio on deferred list"); goto out; } break; @@@ -1091,11 -1087,8 +1091,11 @@@ __always_inline bool free_pages_prepare (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; } } - if (PageMappingFlags(page)) + if (PageMappingFlags(page)) { + if (PageAnon(page)) + mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); page->mapping = NULL; + } if (is_check_pages_enabled()) { if (free_page_is_bad(page)) bad++; @@@ -1206,39 -1199,17 +1206,39 @@@ static void free_pcppages_bulk(struct z spin_unlock_irqrestore(&zone->lock, flags); }
+/* Split a multi-block free page into its individual pageblocks. */ +static void split_large_buddy(struct zone *zone, struct page *page, + unsigned long pfn, int order, fpi_t fpi) +{ + unsigned long end = pfn + (1 << order); + + VM_WARN_ON_ONCE(!IS_ALIGNED(pfn, 1 << order)); + /* Caller removed page from freelist, buddy info cleared! */ + VM_WARN_ON_ONCE(PageBuddy(page)); + + if (order > pageblock_order) + order = pageblock_order; + + while (pfn != end) { + int mt = get_pfnblock_migratetype(page, pfn); + + __free_one_page(page, pfn, zone, order, mt, fpi); + pfn += 1 << order; + page = pfn_to_page(pfn); + } +} + static void free_one_page(struct zone *zone, struct page *page, unsigned long pfn, unsigned int order, fpi_t fpi_flags) { unsigned long flags; - int migratetype;
spin_lock_irqsave(&zone->lock, flags); - migratetype = get_pfnblock_migratetype(page, pfn); - __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); + split_large_buddy(zone, page, pfn, order, fpi_flags); spin_unlock_irqrestore(&zone->lock, flags); + + __count_vm_events(PGFREE, 1 << order); }
static void __free_pages_ok(struct page *page, unsigned int order, @@@ -1247,8 -1218,12 +1247,8 @@@ unsigned long pfn = page_to_pfn(page); struct zone *zone = page_zone(page);
- if (!free_pages_prepare(page, order)) - return; - - free_one_page(zone, page, pfn, order, fpi_flags); - - __count_vm_events(PGFREE, 1 << order); + if (free_pages_prepare(page, order)) + free_one_page(zone, page, pfn, order, fpi_flags); }
void __meminit __free_pages_core(struct page *page, unsigned int order, @@@ -1295,7 -1270,7 +1295,7 @@@ if (order == MAX_PAGE_ORDER && __free_unaccepted(page)) return;
- accept_page(page, order); + accept_memory(page_to_phys(page), PAGE_SIZE << order); }
/* @@@ -1371,11 -1346,11 +1371,11 @@@ struct page *__pageblock_pfn_to_page(un * * -- nyc */ -static inline void expand(struct zone *zone, struct page *page, - int low, int high, int migratetype) +static inline unsigned int expand(struct zone *zone, struct page *page, int low, + int high, int migratetype) { - unsigned long size = 1 << high; - unsigned long nr_added = 0; + unsigned int size = 1 << high; + unsigned int nr_added = 0;
while (high > low) { high--; @@@ -1395,19 -1370,7 +1395,19 @@@ set_buddy_order(&page[size], high); nr_added += size; } - account_freepages(zone, nr_added, migratetype); + + return nr_added; +} + +static __always_inline void page_del_and_expand(struct zone *zone, + struct page *page, int low, + int high, int migratetype) +{ + int nr_pages = 1 << high; + + __del_page_from_free_list(page, zone, high, migratetype); + nr_pages -= expand(zone, page, low, high, migratetype); + account_freepages(zone, -nr_pages, migratetype); }
static void check_new_page_bad(struct page *page) @@@ -1577,9 -1540,8 +1577,9 @@@ struct page *__rmqueue_smallest(struct page = get_page_from_free_area(area, migratetype); if (!page) continue; - del_page_from_free_list(page, zone, current_order, migratetype); - expand(zone, page, order, current_order, migratetype); + + page_del_and_expand(zone, page, order, current_order, + migratetype); trace_mm_page_alloc_zone_locked(page, order, migratetype, pcp_allowed_order(order) && migratetype < MIGRATE_PCPTYPES); @@@ -1738,6 -1700,27 +1738,6 @@@ static unsigned long find_large_buddy(u return start_pfn; }
-/* Split a multi-block free page into its individual pageblocks */ -static void split_large_buddy(struct zone *zone, struct page *page, - unsigned long pfn, int order) -{ - unsigned long end_pfn = pfn + (1 << order); - - VM_WARN_ON_ONCE(order <= pageblock_order); - VM_WARN_ON_ONCE(pfn & (pageblock_nr_pages - 1)); - - /* Caller removed page from freelist, buddy info cleared! */ - VM_WARN_ON_ONCE(PageBuddy(page)); - - while (pfn != end_pfn) { - int mt = get_pfnblock_migratetype(page, pfn); - - __free_one_page(page, pfn, zone, pageblock_order, mt, FPI_NONE); - pfn += pageblock_nr_pages; - page = pfn_to_page(pfn); - } -} - /** * move_freepages_block_isolate - move free pages in block for page isolation * @zone: the zone @@@ -1778,7 -1761,7 +1778,7 @@@ bool move_freepages_block_isolate(struc del_page_from_free_list(buddy, zone, order, get_pfnblock_migratetype(buddy, pfn)); set_pageblock_migratetype(page, migratetype); - split_large_buddy(zone, buddy, pfn, order); + split_large_buddy(zone, buddy, pfn, order, FPI_NONE); return true; }
@@@ -1789,7 -1772,7 +1789,7 @@@ del_page_from_free_list(page, zone, order, get_pfnblock_migratetype(page, pfn)); set_pageblock_migratetype(page, migratetype); - split_large_buddy(zone, page, pfn, order); + split_large_buddy(zone, page, pfn, order, FPI_NONE); return true; } move: @@@ -1909,12 -1892,9 +1909,12 @@@ steal_suitable_fallback(struct zone *zo
/* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { + unsigned int nr_added; + del_page_from_free_list(page, zone, current_order, block_type); change_pageblock_range(page, current_order, start_type); - expand(zone, page, order, current_order, start_type); + nr_added = expand(zone, page, order, current_order, start_type); + account_freepages(zone, nr_added, start_type); return page; }
@@@ -1967,7 -1947,8 +1967,7 @@@ }
single_page: - del_page_from_free_list(page, zone, current_order, block_type); - expand(zone, page, order, current_order, block_type); + page_del_and_expand(zone, page, order, current_order, block_type); return page; }
@@@ -2236,43 -2217,6 +2236,43 @@@ do_steal return page; }
+#ifdef CONFIG_CMA +/* + * GFP_MOVABLE allocation could drain UNMOVABLE & RECLAIMABLE page blocks via + * the help of CMA which makes GFP_KERNEL failed. Checking if zone_watermark_ok + * again without ALLOC_CMA to see if to use CMA first. + */ +static bool use_cma_first(struct zone *zone, unsigned int order, unsigned int alloc_flags) +{ + unsigned long watermark; + bool cma_first = false; + + watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); + /* check if GFP_MOVABLE pass previous zone_watermark_ok via the help of CMA */ + if (zone_watermark_ok(zone, order, watermark, 0, alloc_flags & (~ALLOC_CMA))) { + /* + * Balance movable allocations between regular and CMA areas by + * allocating from CMA when over half of the zone's free memory + * is in the CMA area. + */ + cma_first = (zone_page_state(zone, NR_FREE_CMA_PAGES) > + zone_page_state(zone, NR_FREE_PAGES) / 2); + } else { + /* + * watermark failed means UNMOVABLE & RECLAIMBLE is not enough + * now, we should use cma first to keep them stay around the + * corresponding watermark + */ + cma_first = true; + } + return cma_first; +} +#else +static bool use_cma_first(struct zone *zone, unsigned int order, unsigned int alloc_flags) +{ + return false; +} +#endif /* * Do the hard work of removing an element from the buddy allocator. * Call me with the zone->lock already held. @@@ -2286,11 -2230,12 +2286,11 @@@ __rmqueue(struct zone *zone, unsigned i if (IS_ENABLED(CONFIG_CMA)) { /* * Balance movable allocations between regular and CMA areas by - * allocating from CMA when over half of the zone's free memory - * is in the CMA area. 
+ * allocating from CMA base on judging zone_watermark_ok again + * to see if the latest check got pass via the help of CMA */ if (alloc_flags & ALLOC_CMA && - zone_page_state(zone, NR_FREE_CMA_PAGES) > - zone_page_state(zone, NR_FREE_PAGES) / 2) { + use_cma_first(zone, order, alloc_flags)) { page = __rmqueue_cma_fallback(zone, order); if (page) return page; @@@ -2819,7 -2764,7 +2819,7 @@@ void split_page(struct page *page, unsi for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); split_page_owner(page, order, 0); - pgalloc_tag_split(page, 1 << order); + pgalloc_tag_split(page_folio(page), order, 0); split_page_memcg(page, order, 0); } EXPORT_SYMBOL_GPL(split_page); @@@ -3088,6 -3033,12 +3088,6 @@@ struct page *rmqueue(struct zone *prefe { struct page *page;
- /* - * We most definitely don't want callers attempting to - * allocate greater than order-1 page units with __GFP_NOFAIL. - */ - WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); - if (likely(pcp_allowed_order(order))) { page = rmqueue_pcplist(preferred_zone, zone, order, migratetype, alloc_flags); @@@ -3406,7 -3357,7 +3406,7 @@@ retry }
if (no_fallback && nr_online_nodes > 1 && - zone != ac->preferred_zoneref->zone) { + zone != zonelist_zone(ac->preferred_zoneref)) { int local_nid;
/* @@@ -3414,7 -3365,7 +3414,7 @@@ * fragmenting fallbacks. Locality is more important * than fragmentation avoidance. */ - local_nid = zone_to_nid(ac->preferred_zoneref->zone); + local_nid = zonelist_node_idx(ac->preferred_zoneref); if (zone_to_nid(zone) != local_nid) { alloc_flags &= ~ALLOC_NOFRAGMENT; goto retry; @@@ -3451,6 -3402,7 +3451,6 @@@ check_alloc_wmark if (cond_accept_memory(zone, order)) goto try_this_zone;
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* * Watermark failed for this zone, but see if we can * grow this zone if it contains deferred pages. @@@ -3459,13 -3411,14 +3459,13 @@@ if (_deferred_grow_zone(zone, order)) goto try_this_zone; } -#endif /* Checked here to keep the fast path fast */ BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (alloc_flags & ALLOC_NO_WATERMARKS) goto try_this_zone;
if (!node_reclaim_enabled() || - !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) + !zone_allows_reclaim(zonelist_zone(ac->preferred_zoneref), zone)) continue;
ret = node_reclaim(zone->zone_pgdat, gfp_mask, order); @@@ -3487,7 -3440,7 +3487,7 @@@ }
try_this_zone: - page = rmqueue(ac->preferred_zoneref->zone, zone, order, + page = rmqueue(zonelist_zone(ac->preferred_zoneref), zone, order, gfp_mask, alloc_flags, ac->migratetype); if (page) { prep_new_page(page, order, gfp_mask, alloc_flags); @@@ -3504,11 -3457,13 +3504,11 @@@ if (cond_accept_memory(zone, order)) goto try_this_zone;
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* Try again if zone has deferred pages */ if (deferred_pages_enabled()) { if (_deferred_grow_zone(zone, order)) goto try_this_zone; } -#endif } }
@@@ -4049,7 -4004,7 +4049,7 @@@ gfp_to_alloc_flags(gfp_t gfp_mask, unsi */ if (alloc_flags & ALLOC_MIN_RESERVE) alloc_flags &= ~ALLOC_CPUSET; - } else if (unlikely(rt_task(current)) && in_task()) + } else if (unlikely(rt_or_dl_task(current)) && in_task()) alloc_flags |= ALLOC_MIN_RESERVE;
alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); @@@ -4145,11 -4100,6 +4145,11 @@@ should_reclaim_retry(gfp_t gfp_mask, un unsigned long min_wmark = min_wmark_pages(zone); bool wmark;
+ if (cpusets_enabled() && + (alloc_flags & ALLOC_CPUSET) && + !__cpuset_zone_allowed(zone, gfp_mask)) + continue; + available = reclaimable = zone_reclaimable_pages(zone); available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
@@@ -4225,7 -4175,6 +4225,7 @@@ __alloc_pages_slowpath(gfp_t gfp_mask, { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; bool can_compact = gfp_compaction_allowed(gfp_mask); + bool nofail = gfp_mask & __GFP_NOFAIL; const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; struct page *page = NULL; unsigned int alloc_flags; @@@ -4238,25 -4187,6 +4238,25 @@@ unsigned int zonelist_iter_cookie; int reserve_flags;
+ if (unlikely(nofail)) { + /* + * We most definitely don't want callers attempting to + * allocate greater than order-1 page units with __GFP_NOFAIL. + */ + WARN_ON_ONCE(order > 1); + /* + * Also we don't support __GFP_NOFAIL without __GFP_DIRECT_RECLAIM, + * otherwise, we may result in lockup. + */ + WARN_ON_ONCE(!can_direct_reclaim); + /* + * PF_MEMALLOC request from this context is rather bizarre + * because we cannot reclaim anything and only can loop waiting + * for somebody to do a work for us. + */ + WARN_ON_ONCE(current->flags & PF_MEMALLOC); + } + restart: compaction_retries = 0; no_progress_loops = 0; @@@ -4279,7 -4209,7 +4279,7 @@@ */ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->highest_zoneidx, ac->nodemask); - if (!ac->preferred_zoneref->zone) + if (!zonelist_zone(ac->preferred_zoneref)) goto nopage;
/* @@@ -4291,7 -4221,7 +4291,7 @@@ struct zoneref *z = first_zones_zonelist(ac->zonelist, ac->highest_zoneidx, &cpuset_current_mems_allowed); - if (!z->zone) + if (!zonelist_zone(z)) goto nopage; }
@@@ -4474,15 -4404,29 +4474,15 @@@ nopage * Make sure that __GFP_NOFAIL request doesn't leak out and make sure * we always retry */ - if (gfp_mask & __GFP_NOFAIL) { + if (unlikely(nofail)) { /* - * All existing users of the __GFP_NOFAIL are blockable, so warn - * of any new users that actually require GFP_NOWAIT + * Lacking direct_reclaim we can't do anything to reclaim memory, + * we disregard these unreasonable nofail requests and still + * return NULL */ - if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask)) + if (!can_direct_reclaim) goto fail;
- /* - * PF_MEMALLOC request from this context is rather bizarre - * because we cannot reclaim anything and only can loop waiting - * for somebody to do a work for us - */ - WARN_ON_ONCE_GFP(current->flags & PF_MEMALLOC, gfp_mask); - - /* - * non failing costly orders are a hard requirement which we - * are not prepared for much so let's warn about these users - * so that we can identify them and convert them to something - * else. - */ - WARN_ON_ONCE_GFP(costly_order, gfp_mask); - /* * Help non-failing allocations by giving some access to memory * reserves normally used for high priority non-blocking @@@ -4634,28 -4578,17 +4634,28 @@@ unsigned long alloc_pages_bulk_noprof(g continue; }
- if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone && - zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) { + if (nr_online_nodes > 1 && zone != zonelist_zone(ac.preferred_zoneref) && + zone_to_nid(zone) != zonelist_node_idx(ac.preferred_zoneref)) { goto failed; }
+ cond_accept_memory(zone, 0); +retry_this_zone: mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages; if (zone_watermark_fast(zone, 0, mark, zonelist_zone_idx(ac.preferred_zoneref), alloc_flags, gfp)) { break; } + + if (cond_accept_memory(zone, 0)) + goto retry_this_zone; + + /* Try again if zone has deferred pages */ + if (deferred_pages_enabled()) { + if (_deferred_grow_zone(zone, 0)) + goto retry_this_zone; + } }
/* @@@ -4705,7 -4638,7 +4705,7 @@@ pcp_trylock_finish(UP_flags);
__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); - zone_statistics(ac.preferred_zoneref->zone, zone, nr_account); + zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_account);
out: return nr_populated; @@@ -4763,7 -4696,7 +4763,7 @@@ struct page *__alloc_pages_noprof(gfp_ * Forbid the first pass from falling back to types that fragment * memory until all local zones are considered. */ - alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp); + alloc_flags |= alloc_flags_nofragment(zonelist_zone(ac.preferred_zoneref), gfp);
/* First allocation attempt */ page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac); @@@ -5017,7 -4950,7 +5017,7 @@@ static void *make_alloc_exact(unsigned struct page *last = page + nr;
split_page_owner(page, order, 0); - pgalloc_tag_split(page, 1 << order); + pgalloc_tag_split(page_folio(page), order, 0); split_page_memcg(page, order, 0); while (page < --last) set_page_refcounted(last); @@@ -5368,7 -5301,7 +5368,7 @@@ int local_memory_node(int node z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL), gfp_zone(GFP_KERNEL), NULL); - return zone_to_nid(z->zone); + return zonelist_node_idx(z); } #endif
@@@ -6500,31 -6433,6 +6500,31 @@@ int __alloc_contig_migrate_range(struc return (ret < 0) ? ret : 0; }
+static void split_free_pages(struct list_head *list) +{ + int order; + + for (order = 0; order < NR_PAGE_ORDERS; order++) { + struct page *page, *next; + int nr_pages = 1 << order; + + list_for_each_entry_safe(page, next, &list[order], lru) { + int i; + + post_alloc_hook(page, order, __GFP_MOVABLE); + if (!order) + continue; + + split_page(page, order); + + /* Add all subpages to the order-0 head, in sequence. */ + list_del(&page->lru); + for (i = 0; i < nr_pages; i++) + list_add_tail(&page[i].lru, &list[0]); + } + } +} + /** * alloc_contig_range() -- tries to allocate given range of pages * @start: start PFN to allocate @@@ -6637,25 -6545,12 +6637,25 @@@ int alloc_contig_range_noprof(unsigned goto done; }
- /* Free head and tail (if any) */ - if (start != outer_start) - free_contig_range(outer_start, start - outer_start); - if (end != outer_end) - free_contig_range(end, outer_end - end); + if (!(gfp_mask & __GFP_COMP)) { + split_free_pages(cc.freepages);
+ /* Free head and tail (if any) */ + if (start != outer_start) + free_contig_range(outer_start, start - outer_start); + if (end != outer_end) + free_contig_range(end, outer_end - end); + } else if (start == outer_start && end == outer_end && is_power_of_2(end - start)) { + struct page *head = pfn_to_page(start); + int order = ilog2(end - start); + + check_new_pages(head, order); + prep_new_page(head, order, gfp_mask, 0); + } else { + ret = -EINVAL; + WARN(true, "PFN range: requested [%lu, %lu), allocated [%lu, %lu)\n", + start, end, outer_start, outer_end); + } done: undo_isolate_page_range(start, end, migratetype); return ret; @@@ -6764,18 -6659,6 +6764,18 @@@ struct page *alloc_contig_pages_noprof( void free_contig_range(unsigned long pfn, unsigned long nr_pages) { unsigned long count = 0; + struct folio *folio = pfn_folio(pfn); + + if (folio_test_large(folio)) { + int expected = folio_nr_pages(folio); + + if (nr_pages == expected) + folio_put(folio); + else + WARN(true, "PFN %lu: nr_pages %lu != expected %d\n", + pfn, nr_pages, expected); + return; + }
for (; nr_pages--; pfn++) { struct page *page = pfn_to_page(pfn); @@@ -7044,50 -6927,23 +7044,50 @@@ early_param("accept_memory", accept_mem static bool page_contains_unaccepted(struct page *page, unsigned int order) { phys_addr_t start = page_to_phys(page); - phys_addr_t end = start + (PAGE_SIZE << order);
- return range_contains_unaccepted_memory(start, end); + return range_contains_unaccepted_memory(start, PAGE_SIZE << order); }
-static void accept_page(struct page *page, unsigned int order) +static void __accept_page(struct zone *zone, unsigned long *flags, + struct page *page) { - phys_addr_t start = page_to_phys(page); + bool last; + + list_del(&page->lru); + last = list_empty(&zone->unaccepted_pages); + + account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); + __ClearPageUnaccepted(page); + spin_unlock_irqrestore(&zone->lock, *flags); + + accept_memory(page_to_phys(page), PAGE_SIZE << MAX_PAGE_ORDER); + + __free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL);
- accept_memory(start, start + (PAGE_SIZE << order)); + if (last) + static_branch_dec(&zones_with_unaccepted_pages); +} + +void accept_page(struct page *page) +{ + struct zone *zone = page_zone(page); + unsigned long flags; + + spin_lock_irqsave(&zone->lock, flags); + if (!PageUnaccepted(page)) { + spin_unlock_irqrestore(&zone->lock, flags); + return; + } + + /* Unlocks zone->lock */ + __accept_page(zone, &flags, page); }
static bool try_to_accept_memory_one(struct zone *zone) { unsigned long flags; struct page *page; - bool last;
spin_lock_irqsave(&zone->lock, flags); page = list_first_entry_or_null(&zone->unaccepted_pages, @@@ -7097,17 -6953,23 +7097,17 @@@ return false; }
- list_del(&page->lru); - last = list_empty(&zone->unaccepted_pages); - - account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); - __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); - spin_unlock_irqrestore(&zone->lock, flags); - - accept_page(page, MAX_PAGE_ORDER); - - __free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL); - - if (last) - static_branch_dec(&zones_with_unaccepted_pages); + /* Unlocks zone->lock */ + __accept_page(zone, &flags, page);
return true; }
+static inline bool has_unaccepted_memory(void) +{ + return static_branch_unlikely(&zones_with_unaccepted_pages); +} + static bool cond_accept_memory(struct zone *zone, unsigned int order) { long to_accept; @@@ -7119,8 -6981,8 +7119,8 @@@ if (list_empty(&zone->unaccepted_pages)) return false;
- /* How much to accept to get to high watermark? */ - to_accept = high_wmark_pages(zone) - + /* How much to accept to get to promo watermark? */ + to_accept = promo_wmark_pages(zone) - (zone_page_state(zone, NR_FREE_PAGES) - __zone_watermark_unusable_free(zone, order, 0) - zone_page_state(zone, NR_UNACCEPTED)); @@@ -7135,6 -6997,11 +7135,6 @@@ return ret; }
-static inline bool has_unaccepted_memory(void) -{ - return static_branch_unlikely(&zones_with_unaccepted_pages); -} - static bool __free_unaccepted(struct page *page) { struct zone *zone = page_zone(page); @@@ -7149,7 -7016,6 +7149,7 @@@ list_add_tail(&page->lru, &zone->unaccepted_pages); account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES); + __SetPageUnaccepted(page); spin_unlock_irqrestore(&zone->lock, flags);
if (first) @@@ -7165,11 -7031,20 +7165,11 @@@ static bool page_contains_unaccepted(st return false; }
-static void accept_page(struct page *page, unsigned int order) -{ -} - static bool cond_accept_memory(struct zone *zone, unsigned int order) { return false; }
-static inline bool has_unaccepted_memory(void) -{ - return false; -} - static bool __free_unaccepted(struct page *page) { BUILD_BUG(); diff --combined net/core/dev.c index 8f4dead64284f,e4d5e9bdd09e2..91e8f5d0d3b16 --- a/net/core/dev.c +++ b/net/core/dev.c @@@ -158,7 -158,6 +158,7 @@@ #include <net/page_pool/types.h> #include <net/page_pool/helpers.h> #include <net/rps.h> +#include <linux/phy_link_topology.h>
#include "dev.h" #include "net-sysfs.h" @@@ -3387,7 -3386,6 +3387,7 @@@ int skb_crc32c_csum_help(struct sk_buf out: return ret; } +EXPORT_SYMBOL(skb_crc32c_csum_help);
__be16 skb_network_protocol(struct sk_buff *skb, int *depth) { @@@ -3707,7 -3705,7 +3707,7 @@@ struct sk_buff *validate_xmit_skb_list( next = skb->next; skb_mark_not_on_list(skb);
- /* in case skb wont be segmented, point to itself */ + /* in case skb won't be segmented, point to itself */ skb->prev = skb;
skb = validate_xmit_skb(skb, dev, again); @@@ -4247,6 -4245,13 +4247,6 @@@ u16 dev_pick_tx_zero(struct net_device } EXPORT_SYMBOL(dev_pick_tx_zero);
-u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, - struct net_device *sb_dev) -{ - return (u16)raw_smp_processor_id() % dev->real_num_tx_queues; -} -EXPORT_SYMBOL(dev_pick_tx_cpu_id); - u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { @@@ -5243,7 -5248,7 +5243,7 @@@ int netif_rx(struct sk_buff *skb } EXPORT_SYMBOL(netif_rx);
- static __latent_entropy void net_tx_action(struct softirq_action *h) + static __latent_entropy void net_tx_action(void) { struct softnet_data *sd = this_cpu_ptr(&softnet_data);
@@@ -5720,9 -5725,10 +5720,9 @@@ static void __netif_receive_skb_list_co struct packet_type *pt_curr = NULL; /* Current (common) orig_dev of sublist */ struct net_device *od_curr = NULL; - struct list_head sublist; struct sk_buff *skb, *next; + LIST_HEAD(sublist);
- INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *orig_dev = skb->dev; struct packet_type *pt_prev = NULL; @@@ -5860,8 -5866,9 +5860,8 @@@ static int netif_receive_skb_internal(s void netif_receive_skb_list_internal(struct list_head *head) { struct sk_buff *skb, *next; - struct list_head sublist; + LIST_HEAD(sublist);
- INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); @@@ -6914,7 -6921,7 +6914,7 @@@ static int napi_threaded_poll(void *dat return 0; }
- static __latent_entropy void net_rx_action(struct softirq_action *h) + static __latent_entropy void net_rx_action(void) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + @@@ -9265,7 -9272,7 +9265,7 @@@ EXPORT_SYMBOL(netdev_port_same_parent_i */ int dev_change_proto_down(struct net_device *dev, bool proto_down) { - if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN)) + if (!dev->change_proto_down) return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; @@@ -9362,15 -9369,6 +9362,15 @@@ u8 dev_xdp_prog_count(struct net_devic } EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
+int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf) +{ + if (!dev->netdev_ops->ndo_bpf) + return -EOPNOTSUPP; + + return dev->netdev_ops->ndo_bpf(dev, bpf); +} +EXPORT_SYMBOL_GPL(dev_xdp_propagate); + u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) { struct bpf_prog *prog = dev_xdp_prog(dev, mode); @@@ -10323,17 -10321,6 +10323,17 @@@ static void netdev_do_free_pcpu_stats(s } }
+static void netdev_free_phy_link_topology(struct net_device *dev) +{ + struct phy_link_topology *topo = dev->link_topo; + + if (IS_ENABLED(CONFIG_PHYLIB) && topo) { + xa_destroy(&topo->phys); + kfree(topo); + dev->link_topo = NULL; + } +} + /** * register_netdevice() - register a network device * @dev: device to register @@@ -10881,7 -10868,7 +10881,7 @@@ noinline void netdev_core_stats_inc(str return; }
- field = (__force unsigned long __percpu *)((__force void *)p + offset); + field = (unsigned long __percpu *)((void __percpu *)p + offset); this_cpu_inc(*field); } EXPORT_SYMBOL_GPL(netdev_core_stats_inc); @@@ -11112,7 -11099,6 +11112,7 @@@ struct net_device *alloc_netdev_mqs(in #ifdef CONFIG_NET_SCHED hash_init(dev->qdisc_hash); #endif + dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev);
@@@ -11134,7 -11120,7 +11134,7 @@@ if (!dev->ethtool) goto free_all;
- strcpy(dev->name, name); + strscpy(dev->name, name); dev->name_assign_type = name_assign_type; dev->group = INIT_NETDEV_GROUP; if (!dev->ethtool_ops) @@@ -11205,8 -11191,6 +11205,8 @@@ void free_netdev(struct net_device *dev free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL;
+ netdev_free_phy_link_topology(dev); + /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED || dev->reg_state == NETREG_DUMMY) { @@@ -11423,7 -11407,7 +11423,7 @@@ void unregister_netdevice_many_notify(s * @head: list of devices * * Note: As most callers use a stack allocated list_head, - * we force a list_del() to make sure stack wont be corrupted later. + * we force a list_del() to make sure stack won't be corrupted later. */ void unregister_netdevice_many(struct list_head *head) { @@@ -11478,10 -11462,10 +11478,10 @@@ int __dev_change_net_namespace(struct n
/* Don't allow namespace local devices to be moved. */ err = -EINVAL; - if (dev->features & NETIF_F_NETNS_LOCAL) + if (dev->netns_local) goto out;
- /* Ensure the device has been registrered */ + /* Ensure the device has been registered */ if (dev->reg_state != NETREG_REGISTERED) goto out;
@@@ -11860,7 -11844,7 +11860,7 @@@ static void __net_exit default_device_e char fb_name[IFNAMSIZ];
/* Ignore unmoveable devices (i.e. loopback) */ - if (dev->features & NETIF_F_NETNS_LOCAL) + if (dev->netns_local) continue;
/* Leave virtual devices for the generic cleanup */ @@@ -11921,7 -11905,7 +11921,7 @@@ static struct pernet_operations __net_i static void __init net_dev_struct_check(void) { /* TX read-mostly hotpath */ - CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags_fast); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx); diff --combined tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index c73d04bc9e9de,1fc16657cf425..6ab61c473c14c --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@@ -17,7 -17,6 +17,7 @@@ #include <linux/in.h> #include <linux/in6.h> #include <linux/un.h> +#include <linux/filter.h> #include <net/sock.h> #include <linux/namei.h> #include "bpf_testmod.h" @@@ -142,12 -141,13 +142,12 @@@ bpf_testmod_test_mod_kfunc(int i
__bpf_kfunc int bpf_iter_testmod_seq_new(struct bpf_iter_testmod_seq *it, s64 value, int cnt) { - if (cnt < 0) { - it->cnt = 0; + it->cnt = cnt; + + if (cnt < 0) return -EINVAL; - }
it->value = value; - it->cnt = cnt;
return 0; } @@@ -162,14 -162,6 +162,14 @@@ __bpf_kfunc s64 *bpf_iter_testmod_seq_n return &it->value; }
+__bpf_kfunc s64 bpf_iter_testmod_seq_value(int val, struct bpf_iter_testmod_seq* it__iter) +{ + if (it__iter->cnt < 0) + return 0; + + return val + it__iter->value; +} + __bpf_kfunc void bpf_iter_testmod_seq_destroy(struct bpf_iter_testmod_seq *it) { it->cnt = 0; @@@ -184,36 -176,6 +184,36 @@@ __bpf_kfunc void bpf_kfunc_dynptr_test( { }
+__bpf_kfunc struct sk_buff *bpf_kfunc_nested_acquire_nonzero_offset_test(struct sk_buff_head *ptr) +{ + return NULL; +} + +__bpf_kfunc struct sk_buff *bpf_kfunc_nested_acquire_zero_offset_test(struct sock_common *ptr) +{ + return NULL; +} + +__bpf_kfunc void bpf_kfunc_nested_release_test(struct sk_buff *ptr) +{ +} + +__bpf_kfunc void bpf_kfunc_trusted_vma_test(struct vm_area_struct *ptr) +{ +} + +__bpf_kfunc void bpf_kfunc_trusted_task_test(struct task_struct *ptr) +{ +} + +__bpf_kfunc void bpf_kfunc_trusted_num_test(int *ptr) +{ +} + +__bpf_kfunc void bpf_kfunc_rcu_task_test(struct task_struct *ptr) +{ +} + __bpf_kfunc struct bpf_testmod_ctx * bpf_testmod_ctx_create(int *err) { @@@ -470,7 -432,7 +470,7 @@@ uprobe_ret_handler(struct uprobe_consum
struct testmod_uprobe { struct path path; - loff_t offset; + struct uprobe *uprobe; struct uprobe_consumer consumer; };
@@@ -484,25 -446,25 +484,25 @@@ static int testmod_register_uprobe(loff { int err = -EBUSY;
- if (uprobe.offset) + if (uprobe.uprobe) return -EBUSY;
mutex_lock(&testmod_uprobe_mutex);
- if (uprobe.offset) + if (uprobe.uprobe) goto out;
err = kern_path("/proc/self/exe", LOOKUP_FOLLOW, &uprobe.path); if (err) goto out;
- err = uprobe_register_refctr(d_real_inode(uprobe.path.dentry), - offset, 0, &uprobe.consumer); - if (err) + uprobe.uprobe = uprobe_register(d_real_inode(uprobe.path.dentry), + offset, 0, &uprobe.consumer); + if (IS_ERR(uprobe.uprobe)) { + err = PTR_ERR(uprobe.uprobe); path_put(&uprobe.path); - else - uprobe.offset = offset; - + uprobe.uprobe = NULL; + } out: mutex_unlock(&testmod_uprobe_mutex); return err; @@@ -512,10 -474,11 +512,11 @@@ static void testmod_unregister_uprobe(v { mutex_lock(&testmod_uprobe_mutex);
- if (uprobe.offset) { - uprobe_unregister(d_real_inode(uprobe.path.dentry), - uprobe.offset, &uprobe.consumer); - uprobe.offset = 0; + if (uprobe.uprobe) { + uprobe_unregister_nosync(uprobe.uprobe, &uprobe.consumer); + uprobe_unregister_sync(); + path_put(&uprobe.path); + uprobe.uprobe = NULL; }
mutex_unlock(&testmod_uprobe_mutex); @@@ -569,16 -532,8 +570,16 @@@ BTF_KFUNCS_START(bpf_testmod_common_kfu BTF_ID_FLAGS(func, bpf_iter_testmod_seq_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_testmod_seq_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_testmod_seq_destroy, KF_ITER_DESTROY) +BTF_ID_FLAGS(func, bpf_iter_testmod_seq_value) BTF_ID_FLAGS(func, bpf_kfunc_common_test) BTF_ID_FLAGS(func, bpf_kfunc_dynptr_test) +BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_nonzero_offset_test, KF_ACQUIRE) +BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_zero_offset_test, KF_ACQUIRE) +BTF_ID_FLAGS(func, bpf_kfunc_nested_release_test, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_vma_test, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_task_test, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_num_test, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_rcu_task_test, KF_RCU) BTF_ID_FLAGS(func, bpf_testmod_ctx_create, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_testmod_ctx_release, KF_RELEASE) BTF_KFUNCS_END(bpf_testmod_common_kfunc_ids) @@@ -966,51 -921,6 +967,51 @@@ out return err; }
+static DEFINE_MUTEX(st_ops_mutex); +static struct bpf_testmod_st_ops *st_ops; + +__bpf_kfunc int bpf_kfunc_st_ops_test_prologue(struct st_ops_args *args) +{ + int ret = -1; + + mutex_lock(&st_ops_mutex); + if (st_ops && st_ops->test_prologue) + ret = st_ops->test_prologue(args); + mutex_unlock(&st_ops_mutex); + + return ret; +} + +__bpf_kfunc int bpf_kfunc_st_ops_test_epilogue(struct st_ops_args *args) +{ + int ret = -1; + + mutex_lock(&st_ops_mutex); + if (st_ops && st_ops->test_epilogue) + ret = st_ops->test_epilogue(args); + mutex_unlock(&st_ops_mutex); + + return ret; +} + +__bpf_kfunc int bpf_kfunc_st_ops_test_pro_epilogue(struct st_ops_args *args) +{ + int ret = -1; + + mutex_lock(&st_ops_mutex); + if (st_ops && st_ops->test_pro_epilogue) + ret = st_ops->test_pro_epilogue(args); + mutex_unlock(&st_ops_mutex); + + return ret; +} + +__bpf_kfunc int bpf_kfunc_st_ops_inc10(struct st_ops_args *args) +{ + args->a += 10; + return args->a; +} + BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc) BTF_ID_FLAGS(func, bpf_kfunc_call_test1) @@@ -1047,10 -957,6 +1048,10 @@@ BTF_ID_FLAGS(func, bpf_kfunc_call_kerne BTF_ID_FLAGS(func, bpf_kfunc_call_sock_sendmsg, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getsockname, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getpeername, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_prologue, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids)
static int bpf_testmod_ops_init(struct btf *btf) @@@ -1119,11 -1025,6 +1120,11 @@@ static void bpf_testmod_test_2(int a, i { }
+static int bpf_testmod_tramp(int value) +{ + return 0; +} + static int bpf_testmod_ops__test_maybe_null(int dummy, struct task_struct *task__nullable) { @@@ -1170,144 -1071,6 +1171,144 @@@ struct bpf_struct_ops bpf_testmod_ops2 .owner = THIS_MODULE, };
+static int bpf_test_mod_st_ops__test_prologue(struct st_ops_args *args) +{ + return 0; +} + +static int bpf_test_mod_st_ops__test_epilogue(struct st_ops_args *args) +{ + return 0; +} + +static int bpf_test_mod_st_ops__test_pro_epilogue(struct st_ops_args *args) +{ + return 0; +} + +static int st_ops_gen_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + if (strcmp(prog->aux->attach_func_name, "test_prologue") && + strcmp(prog->aux->attach_func_name, "test_pro_epilogue")) + return 0; + + /* r6 = r1[0]; // r6 will be "struct st_ops *args". r1 is "u64 *ctx". + * r7 = r6->a; + * r7 += 1000; + * r6->a = r7; + */ + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_6, offsetof(struct st_ops_args, a)); + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 1000); + *insn++ = BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_7, offsetof(struct st_ops_args, a)); + *insn++ = prog->insnsi[0]; + + return insn - insn_buf; +} + +static int st_ops_gen_epilogue(struct bpf_insn *insn_buf, const struct bpf_prog *prog, + s16 ctx_stack_off) +{ + struct bpf_insn *insn = insn_buf; + + if (strcmp(prog->aux->attach_func_name, "test_epilogue") && + strcmp(prog->aux->attach_func_name, "test_pro_epilogue")) + return 0; + + /* r1 = stack[ctx_stack_off]; // r1 will be "u64 *ctx" + * r1 = r1[0]; // r1 will be "struct st_ops *args" + * r6 = r1->a; + * r6 += 10000; + * r1->a = r6; + * r0 = r6; + * r0 *= 2; + * BPF_EXIT; + */ + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_FP, ctx_stack_off); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, offsetof(struct st_ops_args, a)); + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 10000); + *insn++ = BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, offsetof(struct st_ops_args, a)); + *insn++ = BPF_MOV64_REG(BPF_REG_0, BPF_REG_6); + *insn++ = BPF_ALU64_IMM(BPF_MUL, BPF_REG_0, 2); + *insn++ 
= BPF_EXIT_INSN(); + + return insn - insn_buf; +} + +static int st_ops_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size) +{ + if (off < 0 || off + size > sizeof(struct st_ops_args)) + return -EACCES; + return 0; +} + +static const struct bpf_verifier_ops st_ops_verifier_ops = { + .is_valid_access = bpf_testmod_ops_is_valid_access, + .btf_struct_access = st_ops_btf_struct_access, + .gen_prologue = st_ops_gen_prologue, + .gen_epilogue = st_ops_gen_epilogue, + .get_func_proto = bpf_base_func_proto, +}; + +static struct bpf_testmod_st_ops st_ops_cfi_stubs = { + .test_prologue = bpf_test_mod_st_ops__test_prologue, + .test_epilogue = bpf_test_mod_st_ops__test_epilogue, + .test_pro_epilogue = bpf_test_mod_st_ops__test_pro_epilogue, +}; + +static int st_ops_reg(void *kdata, struct bpf_link *link) +{ + int err = 0; + + mutex_lock(&st_ops_mutex); + if (st_ops) { + pr_err("st_ops has already been registered\n"); + err = -EEXIST; + goto unlock; + } + st_ops = kdata; + +unlock: + mutex_unlock(&st_ops_mutex); + return err; +} + +static void st_ops_unreg(void *kdata, struct bpf_link *link) +{ + mutex_lock(&st_ops_mutex); + st_ops = NULL; + mutex_unlock(&st_ops_mutex); +} + +static int st_ops_init(struct btf *btf) +{ + return 0; +} + +static int st_ops_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + return 0; +} + +static struct bpf_struct_ops testmod_st_ops = { + .verifier_ops = &st_ops_verifier_ops, + .init = st_ops_init, + .init_member = st_ops_init_member, + .reg = st_ops_reg, + .unreg = st_ops_unreg, + .cfi_stubs = &st_ops_cfi_stubs, + .name = "bpf_testmod_st_ops", + .owner = THIS_MODULE, +}; + extern int bpf_fentry_test1(int a);
static int bpf_testmod_init(void) @@@ -1318,17 -1081,14 +1319,17 @@@ .kfunc_btf_id = bpf_testmod_dtor_ids[1] }, }; + void **tramp; int ret;
ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &bpf_testmod_common_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_testmod_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_testmod_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &bpf_testmod_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_testmod_kfunc_set); ret = ret ?: register_bpf_struct_ops(&bpf_bpf_testmod_ops, bpf_testmod_ops); ret = ret ?: register_bpf_struct_ops(&bpf_testmod_ops2, bpf_testmod_ops2); + ret = ret ?: register_bpf_struct_ops(&testmod_st_ops, bpf_testmod_st_ops); ret = ret ?: register_btf_id_dtor_kfuncs(bpf_testmod_dtors, ARRAY_SIZE(bpf_testmod_dtors), THIS_MODULE); @@@ -1344,14 -1104,6 +1345,14 @@@ ret = register_bpf_testmod_uprobe(); if (ret < 0) return ret; + + /* Ensure nothing is between tramp_1..tramp_40 */ + BUILD_BUG_ON(offsetof(struct bpf_testmod_ops, tramp_1) + 40 * sizeof(long) != + offsetofend(struct bpf_testmod_ops, tramp_40)); + tramp = (void **)&__bpf_testmod_ops.tramp_1; + while (tramp <= (void **)&__bpf_testmod_ops.tramp_40) + *tramp++ = bpf_testmod_tramp; + return 0; }
diff --combined tools/testing/selftests/mm/Makefile index d7a85059c27bd,4ea188be0588a..02e1204971b0a --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@@ -90,6 -90,7 +90,7 @@@ CAN_BUILD_X86_64 := $(shell ./../x86/ch CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_program.c -no-pie)
VMTARGETS := protection_keys + VMTARGETS += pkey_sighandler_tests BINARIES_32 := $(VMTARGETS:%=%_32) BINARIES_64 := $(VMTARGETS:%=%_64)
@@@ -106,13 -107,13 +107,13 @@@ TEST_GEN_FILES += $(BINARIES_64 endif else
-ifneq (,$(findstring $(ARCH),powerpc)) +ifneq (,$(filter $(ARCH),arm64 powerpc)) TEST_GEN_FILES += protection_keys endif
endif
-ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64 s390)) +ifneq (,$(filter $(ARCH),arm64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64 s390)) TEST_GEN_FILES += va_high_addr_switch TEST_GEN_FILES += virtual_address_range TEST_GEN_FILES += write_to_hugetlbfs diff --combined tools/testing/selftests/mm/pkey-helpers.h index 15608350fc017,4d31a309a46b5..9ab6a3ee153b5 --- a/tools/testing/selftests/mm/pkey-helpers.h +++ b/tools/testing/selftests/mm/pkey-helpers.h @@@ -79,7 -79,18 +79,18 @@@ extern void abort_hooks(void) } \ } while (0)
- __attribute__((noinline)) int read_ptr(int *ptr); + #define barrier() __asm__ __volatile__("": : :"memory") + #ifndef noinline + # define noinline __attribute__((noinline)) + #endif + + noinline int read_ptr(int *ptr) + { + /* Keep GCC from optimizing this away somehow */ + barrier(); + return *ptr; + } + void expected_pkey_fault(int pkey); int sys_pkey_alloc(unsigned long flags, unsigned long init_val); int sys_pkey_free(unsigned long pkey); @@@ -91,17 -102,12 +102,17 @@@ void record_pkey_malloc(void *ptr, lon #include "pkey-x86.h" #elif defined(__powerpc64__) /* arch */ #include "pkey-powerpc.h" +#elif defined(__aarch64__) /* arch */ +#include "pkey-arm64.h" #else /* arch */ #error Architecture not supported #endif /* arch */
+#ifndef PKEY_MASK #define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) +#endif
+#ifndef set_pkey_bits static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) { u32 shift = pkey_bit_position(pkey); @@@ -111,9 -117,7 +122,9 @@@ reg |= (flags & PKEY_MASK) << shift; return reg; } +#endif
+#ifndef get_pkey_bits static inline u64 get_pkey_bits(u64 reg, int pkey) { u32 shift = pkey_bit_position(pkey); @@@ -123,7 -127,6 +134,7 @@@ */ return ((reg >> shift) & PKEY_MASK); } +#endif
extern u64 shadow_pkey_reg;
diff --combined tools/testing/selftests/mm/protection_keys.c index 0789981b72b95,cc6de1644360a..4990f7ab4cb72 --- a/tools/testing/selftests/mm/protection_keys.c +++ b/tools/testing/selftests/mm/protection_keys.c @@@ -147,7 -147,7 +147,7 @@@ void abort_hooks(void * will then fault, which makes sure that the fault code handles * execute-only memory properly. */ -#ifdef __powerpc64__ +#if defined(__powerpc64__) || defined(__aarch64__) /* This way, both 4K and 64K alignment are maintained */ __attribute__((__aligned__(65536))) #else @@@ -212,6 -212,7 +212,6 @@@ void pkey_disable_set(int pkey, int fla unsigned long syscall_flags = 0; int ret; int pkey_rights; - u64 orig_pkey_reg = read_pkey_reg();
dprintf1("START->%s(%d, 0x%x)\n", __func__, pkey, flags); @@@ -241,6 -242,8 +241,6 @@@
dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, pkey, read_pkey_reg()); - if (flags) - pkey_assert(read_pkey_reg() >= orig_pkey_reg); dprintf1("END<---%s(%d, 0x%x)\n", __func__, pkey, flags); } @@@ -250,6 -253,7 +250,6 @@@ void pkey_disable_clear(int pkey, int f unsigned long syscall_flags = 0; int ret; int pkey_rights = hw_pkey_get(pkey, syscall_flags); - u64 orig_pkey_reg = read_pkey_reg();
pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
@@@ -269,6 -273,8 +269,6 @@@
dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, pkey, read_pkey_reg()); - if (flags) - assert(read_pkey_reg() <= orig_pkey_reg); }
void pkey_write_allow(int pkey) @@@ -308,9 -314,7 +308,9 @@@ void signal_handler(int signum, siginfo ucontext_t *uctxt = vucontext; int trapno; unsigned long ip; +#ifdef MCONTEXT_FPREGS char *fpregs; +#endif #if defined(__i386__) || defined(__x86_64__) /* arch */ u32 *pkey_reg_ptr; int pkey_reg_offset; @@@ -324,11 -328,9 +324,11 @@@ __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg);
- trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; - ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; + trapno = MCONTEXT_TRAPNO(uctxt->uc_mcontext); + ip = MCONTEXT_IP(uctxt->uc_mcontext); +#ifdef MCONTEXT_FPREGS fpregs = (char *) uctxt->uc_mcontext.fpregs; +#endif
dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n", __func__, trapno, ip, si_code_str(si->si_code), @@@ -357,9 -359,7 +357,9 @@@ #endif /* arch */
dprintf1("siginfo: %p\n", si); +#ifdef MCONTEXT_FPREGS dprintf1(" fpregs: %p\n", fpregs); +#endif
if ((si->si_code == SEGV_MAPERR) || (si->si_code == SEGV_ACCERR) || @@@ -389,8 -389,6 +389,8 @@@ #elif defined(__powerpc64__) /* arch */ /* restore access and let the faulting instruction continue */ pkey_access_allow(siginfo_pkey); +#elif defined(__aarch64__) + aarch64_write_signal_pkey(uctxt, PKEY_ALLOW_ALL); #endif /* arch */ pkey_faults++; dprintf1("<<<<==================================================\n"); @@@ -904,9 -902,7 +904,9 @@@ void expected_pkey_fault(int pkey * test program continue. We now have to restore it. */ if (__read_pkey_reg() != 0) -#else /* arch */ +#elif defined(__aarch64__) + if (__read_pkey_reg() != PKEY_ALLOW_ALL) +#else if (__read_pkey_reg() != shadow_pkey_reg) #endif /* arch */ pkey_assert(0); @@@ -954,16 -950,6 +954,6 @@@ void close_test_fds(void nr_test_fds = 0; }
- #define barrier() __asm__ __volatile__("": : :"memory") - __attribute__((noinline)) int read_ptr(int *ptr) - { - /* - * Keep GCC from optimizing this away somehow - */ - barrier(); - return *ptr; - } - void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) { int i, err; @@@ -1496,11 -1482,6 +1486,11 @@@ void test_executing_on_unreadable_memor lots_o_noops_around_write(&scratch); do_not_expect_pkey_fault("executing on PROT_EXEC memory"); expect_fault_on_read_execonly_key(p1, pkey); + + // Reset back to PROT_EXEC | PROT_READ for architectures that support + // non-PKEY execute-only permissions. + ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC | PROT_READ, (u64)pkey); + pkey_assert(!ret); }
void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) @@@ -1674,84 -1655,6 +1664,84 @@@ void test_ptrace_modifies_pkru(int *ptr } #endif
+#if defined(__aarch64__) +void test_ptrace_modifies_pkru(int *ptr, u16 pkey) +{ + pid_t child; + int status, ret; + struct iovec iov; + u64 trace_pkey; + /* Just a random pkey value.. */ + u64 new_pkey = (POE_X << PKEY_BITS_PER_PKEY * 2) | + (POE_NONE << PKEY_BITS_PER_PKEY) | + POE_RWX; + + child = fork(); + pkey_assert(child >= 0); + dprintf3("[%d] fork() ret: %d\n", getpid(), child); + if (!child) { + ptrace(PTRACE_TRACEME, 0, 0, 0); + + /* Stop and allow the tracer to modify PKRU directly */ + raise(SIGSTOP); + + /* + * need __read_pkey_reg() version so we do not do shadow_pkey_reg + * checking + */ + if (__read_pkey_reg() != new_pkey) + exit(1); + + raise(SIGSTOP); + + exit(0); + } + + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); + + iov.iov_base = &trace_pkey; + iov.iov_len = 8; + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_ARM_POE, &iov); + pkey_assert(ret == 0); + pkey_assert(trace_pkey == read_pkey_reg()); + + trace_pkey = new_pkey; + + ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_ARM_POE, &iov); + pkey_assert(ret == 0); + + /* Test that the modification is visible in ptrace before any execution */ + memset(&trace_pkey, 0, sizeof(trace_pkey)); + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_ARM_POE, &iov); + pkey_assert(ret == 0); + pkey_assert(trace_pkey == new_pkey); + + /* Execute the tracee */ + ret = ptrace(PTRACE_CONT, child, 0, 0); + pkey_assert(ret == 0); + + /* Test that the tracee saw the PKRU value change */ + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); + + /* Test that the modification is visible in ptrace after execution */ + memset(&trace_pkey, 0, sizeof(trace_pkey)); + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_ARM_POE, &iov); + pkey_assert(ret 
== 0); + pkey_assert(trace_pkey == new_pkey); + + ret = ptrace(PTRACE_CONT, child, 0, 0); + pkey_assert(ret == 0); + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFEXITED(status)); + pkey_assert(WEXITSTATUS(status) == 0); +} +#endif + void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) { int size = PAGE_SIZE; @@@ -1787,7 -1690,7 +1777,7 @@@ void (*pkey_tests[])(int *ptr, u16 pkey test_pkey_syscalls_bad_args, test_pkey_alloc_exhaust, test_pkey_alloc_free_attach_pkey0, -#if defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__) test_ptrace_modifies_pkru, #endif };