The following commit has been merged in the master branch:

commit f56caedaf94f9ced5dbfcdb0060a3e788d2078af
Merge: a33f5c380c4bd3fa5278d690421b72052456d9fe 76fd0285b447991267e838842c0be7395eb454bb
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Sat Jan 15 20:37:06 2022 +0200
Merge branch 'akpm' (patches from Andrew)
Merge misc updates from Andrew Morton: "146 patches.
Subsystems affected by this patch series: kthread, ia64, scripts, ntfs, squashfs, ocfs2, vfs, and mm (slab-generic, slab, kmemleak, dax, kasan, debug, pagecache, gup, shmem, frontswap, memremap, memcg, selftests, pagemap, dma, vmalloc, memory-failure, hugetlb, userfaultfd, vmscan, mempolicy, oom-kill, hugetlbfs, migration, thp, ksm, page-poison, percpu, rmap, zswap, zram, cleanups, hmm, and damon)"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (146 commits)
  mm/damon: hide kernel pointer from tracepoint event
  mm/damon/vaddr: hide kernel pointer from damon_va_three_regions() failure log
  mm/damon/vaddr: use pr_debug() for damon_va_three_regions() failure logging
  mm/damon/dbgfs: remove an unnecessary variable
  mm/damon: move the implementation of damon_insert_region to damon.h
  mm/damon: add access checking for hugetlb pages
  Docs/admin-guide/mm/damon/usage: update for schemes statistics
  mm/damon/dbgfs: support all DAMOS stats
  Docs/admin-guide/mm/damon/reclaim: document statistics parameters
  mm/damon/reclaim: provide reclamation statistics
  mm/damon/schemes: account how many times quota limit has exceeded
  mm/damon/schemes: account scheme actions that successfully applied
  mm/damon: remove a mistakenly added comment for a future feature
  Docs/admin-guide/mm/damon/usage: update for kdamond_pid and (mk|rm)_contexts
  Docs/admin-guide/mm/damon/usage: mention tracepoint at the beginning
  Docs/admin-guide/mm/damon/usage: remove redundant information
  Docs/admin-guide/mm/damon/usage: update for scheme quotas and watermarks
  mm/damon: convert macro functions to static inline functions
  mm/damon: modify damon_rand() macro to static inline function
  mm/damon: move damon_rand() definition into damon.h
  ...
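Two of the DAMON cleanups listed above ("modify damon_rand() macro to static inline function" and "move damon_rand() definition into damon.h") are the usual macro-to-inline conversion. As a rough, hedged sketch of that shape — the helper name and new location come from the patch titles, while the body below is assumed from the common prandom-based pattern rather than quoted from the series:

    /*
     * Sketch only: the old form was a function-like macro along the lines of
     *   #define damon_rand(l, h) (l + prandom_u32_max(h - l))
     * As a static inline in include/linux/damon.h the helper gains argument
     * type checking and single evaluation of its operands (body assumed).
     */
    #include <linux/prandom.h>

    static inline unsigned long damon_rand(unsigned long l, unsigned long r)
    {
            /* Pick a pseudo-random value in [l, r). */
            return l + prandom_u32_max(r - l);
    }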
diff --combined MAINTAINERS index 5d0cd537803a,fbdb860c0b8b..474966314383 --- a/MAINTAINERS +++ b/MAINTAINERS @@@ -966,7 -966,6 +966,7 @@@ F: drivers/gpu/drm/amd/include/kgd_kfd_ F: drivers/gpu/drm/amd/include/v9_structs.h F: drivers/gpu/drm/amd/include/vi_structs.h F: include/uapi/linux/kfd_ioctl.h +F: include/uapi/linux/kfd_sysfs.h
AMD SPI DRIVER M: Sanjay R Mehta sanju.mehta@amd.com @@@ -994,13 -993,6 +994,13 @@@ S: Supporte T: git https://gitlab.freedesktop.org/agd5f/linux.git F: drivers/gpu/drm/amd/pm/
+AMD PSTATE DRIVER +M: Huang Rui ray.huang@amd.com +L: linux-pm@vger.kernel.org +S: Supported +F: Documentation/admin-guide/pm/amd-pstate.rst +F: drivers/cpufreq/amd-pstate* + AMD PTDMA DRIVER M: Sanjay R Mehta sanju.mehta@amd.com L: dmaengine@vger.kernel.org @@@ -1077,15 -1069,6 +1077,15 @@@ W: http://ez.analog.com/community/linux F: Documentation/devicetree/bindings/iio/adc/adi,ad7780.yaml F: drivers/iio/adc/ad7780.c
+ANALOG DEVICES INC AD74413R DRIVER +M: Cosmin Tanislav cosmin.tanislav@analog.com +L: linux-iio@vger.kernel.org +S: Supported +W: http://ez.analog.com/community/linux-device-drivers +F: Documentation/devicetree/bindings/iio/addac/adi,ad74413r.yaml +F: drivers/iio/addac/ad74413r.c +F: include/dt-bindings/iio/addac/adi,ad74413r.h + ANALOG DEVICES INC AD9389B DRIVER M: Hans Verkuil hverkuil-cisco@xs4all.nl L: linux-media@vger.kernel.org @@@ -1156,7 -1139,6 +1156,7 @@@ ANALOG DEVICES INC ADV748X DRIVE M: Kieran Bingham kieran.bingham@ideasonboard.com L: linux-media@vger.kernel.org S: Maintained +F: Documentation/devicetree/bindings/media/i2c/adv748x.yaml F: drivers/media/i2c/adv748x/*
ANALOG DEVICES INC ADV7511 DRIVER @@@ -1763,21 -1745,17 +1763,21 @@@ B: https://github.com/AsahiLinux/linux/ C: irc://irc.oftc.net/asahi-dev T: git https://github.com/AsahiLinux/linux.git F: Documentation/devicetree/bindings/arm/apple.yaml +F: Documentation/devicetree/bindings/arm/apple/* F: Documentation/devicetree/bindings/i2c/apple,i2c.yaml F: Documentation/devicetree/bindings/interrupt-controller/apple,aic.yaml F: Documentation/devicetree/bindings/mailbox/apple,mailbox.yaml F: Documentation/devicetree/bindings/pci/apple,pcie.yaml F: Documentation/devicetree/bindings/pinctrl/apple,pinctrl.yaml +F: Documentation/devicetree/bindings/power/apple* +F: Documentation/devicetree/bindings/watchdog/apple,wdt.yaml F: arch/arm64/boot/dts/apple/ F: drivers/i2c/busses/i2c-pasemi-core.c F: drivers/i2c/busses/i2c-pasemi-platform.c F: drivers/irqchip/irq-apple-aic.c F: drivers/mailbox/apple-mailbox.c F: drivers/pinctrl/pinctrl-apple-gpio.c +F: drivers/soc/apple/* F: include/dt-bindings/interrupt-controller/apple-aic.h F: include/dt-bindings/pinctrl/apple.h F: include/linux/apple-mailbox.h @@@ -1912,7 -1890,6 +1912,7 @@@ F: Documentation/trace/coresight/ F: drivers/hwtracing/coresight/* F: include/dt-bindings/arm/coresight-cti-dt.h F: include/linux/coresight* +F: samples/coresight/* F: tools/perf/arch/arm/util/auxtrace.c F: tools/perf/arch/arm/util/cs-etm.c F: tools/perf/arch/arm/util/cs-etm.h @@@ -2314,7 -2291,6 +2314,7 @@@ F: Documentation/devicetree/bindings/gp F: arch/arm/boot/dts/mstar-* F: arch/arm/mach-mstar/ F: drivers/clk/mstar/ +F: drivers/clocksource/timer-msc313e.c F: drivers/gpio/gpio-msc313.c F: drivers/rtc/rtc-msc313.c F: drivers/watchdog/msc313e_wdt.c @@@ -2575,7 -2551,6 +2575,7 @@@ Q: https://patchwork.kernel.org/project F: Documentation/arm/samsung/ F: Documentation/devicetree/bindings/arm/samsung/ F: Documentation/devicetree/bindings/power/pd-samsung.yaml +F: Documentation/devicetree/bindings/soc/samsung/ F: arch/arm/boot/dts/exynos* F: arch/arm/boot/dts/s3c* F: arch/arm/boot/dts/s5p* @@@ -2602,7 -2577,7 +2602,7 @@@ N: s3c64x N: s5pv210
ARM/SAMSUNG S5P SERIES 2D GRAPHICS ACCELERATION (G2D) SUPPORT -M: Andrzej Hajda a.hajda@samsung.com +M: Łukasz Stelmach l.stelmach@samsung.com L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: linux-media@vger.kernel.org S: Maintained @@@ -2626,8 -2601,7 +2626,8 @@@ S: Maintaine F: drivers/media/platform/s5p-jpeg/
ARM/SAMSUNG S5P SERIES Multi Format Codec (MFC) SUPPORT -M: Andrzej Hajda a.hajda@samsung.com +M: Marek Szyprowski m.szyprowski@samsung.com +M: Andrzej Hajda andrzej.hajda@intel.com L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: linux-media@vger.kernel.org S: Maintained @@@ -2818,15 -2792,12 +2818,15 @@@ L: linux-arm-kernel@lists.infradead.or S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/iwamatsu/linux-visconti.git F: Documentation/devicetree/bindings/arm/toshiba.yaml +F: Documentation/devicetree/bindings/clock/toshiba,tmpv770x-pipllct.yaml +F: Documentation/devicetree/bindings/clock/toshiba,tmpv770x-pismu.yaml F: Documentation/devicetree/bindings/net/toshiba,visconti-dwmac.yaml F: Documentation/devicetree/bindings/gpio/toshiba,gpio-visconti.yaml F: Documentation/devicetree/bindings/pci/toshiba,visconti-pcie.yaml F: Documentation/devicetree/bindings/pinctrl/toshiba,visconti-pinctrl.yaml F: Documentation/devicetree/bindings/watchdog/toshiba,visconti-wdt.yaml F: arch/arm64/boot/dts/toshiba/ +F: drivers/clk/visconti/ F: drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c F: drivers/gpio/gpio-visconti.c F: drivers/pci/controller/dwc/pcie-visconti.c @@@ -3027,27 -2998,6 +3027,27 @@@ W: http://acpi4asus.sf.ne F: drivers/platform/x86/asus*.c F: drivers/platform/x86/eeepc*.c
+ASUS TF103C DOCK DRIVER +M: Hans de Goede hdegoede@redhat.com +L: platform-driver-x86@vger.kernel.org +S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/pdx86/platform-drivers-x86.git +F: drivers/platform/x86/asus-tf103c-dock.c + +ASUS WMI HARDWARE MONITOR DRIVER +M: Ed Brindley kernel@maidavale.org +M: Denis Pauk pauk.denis@gmail.com +L: linux-hwmon@vger.kernel.org +S: Maintained +F: drivers/hwmon/asus_wmi_sensors.c + +ASUS WMI EC HARDWARE MONITOR DRIVER +M: Eugene Shalygin eugene.shalygin@gmail.com +M: Denis Pauk pauk.denis@gmail.com +L: linux-hwmon@vger.kernel.org +S: Maintained +F: drivers/hwmon/asus_wmi_ec_sensors.c + ASUS WIRELESS RADIO CONTROL DRIVER M: João Paulo Rechi Vita jprvita@gmail.com L: platform-driver-x86@vger.kernel.org @@@ -3430,8 -3380,6 +3430,8 @@@ M: Jens Axboe <axboe@kernel.dk L: linux-block@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git +F: Documentation/ABI/stable/sysfs-block +F: Documentation/block/ F: block/ F: drivers/block/ F: include/linux/blk* @@@ -3621,7 -3569,7 +3621,7 @@@ R: Florent Revest <revest@chromium.org R: Brendan Jackman jackmanb@chromium.org L: bpf@vger.kernel.org S: Maintained -F: Documentation/bpf/bpf_lsm.rst +F: Documentation/bpf/prog_lsm.rst F: include/linux/bpf_lsm.h F: kernel/bpf/bpf_lsm.c F: security/bpf/ @@@ -3688,7 -3636,6 +3688,7 @@@ F: drivers/net/ethernet/broadcom/bcm490 F: drivers/net/ethernet/broadcom/unimac.h
BROADCOM BCM5301X ARM ARCHITECTURE +M: Florian Fainelli f.fainelli@gmail.com M: Hauke Mehrtens hauke@hauke-m.de M: Rafał Miłecki zajec5@gmail.com M: bcm-kernel-feedback-list@broadcom.com @@@ -3700,7 -3647,6 +3700,7 @@@ F: arch/arm/boot/dts/bcm953012 F: arch/arm/mach-bcm/bcm_5301x.c
BROADCOM BCM53573 ARM ARCHITECTURE +M: Florian Fainelli f.fainelli@gmail.com M: Rafał Miłecki rafal@milecki.pl L: bcm-kernel-feedback-list@broadcom.com L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) @@@ -3744,7 -3690,7 +3744,7 @@@ M: Al Cooper <alcooperx@gmail.com L: linux-usb@vger.kernel.org L: bcm-kernel-feedback-list@broadcom.com S: Maintained -F: Documentation/devicetree/bindings/usb/brcm,bdc.txt +F: Documentation/devicetree/bindings/usb/brcm,bdc.yaml F: drivers/usb/gadget/udc/bdc/
BROADCOM BMIPS CPUFREQ DRIVER @@@ -3827,7 -3773,7 +3827,7 @@@ M: Doug Berger <opendmb@gmail.com M: Florian Fainelli f.fainelli@gmail.com L: bcm-kernel-feedback-list@broadcom.com S: Supported -F: Documentation/devicetree/bindings/gpio/brcm,brcmstb-gpio.txt +F: Documentation/devicetree/bindings/gpio/brcm,brcmstb-gpio.yaml F: drivers/gpio/gpio-brcmstb.c
BROADCOM BRCMSTB I2C DRIVER @@@ -3885,7 -3831,7 +3885,7 @@@ M: Florian Fainelli <f.fainelli@gmail.c L: bcm-kernel-feedback-list@broadcom.com L: netdev@vger.kernel.org S: Supported -F: Documentation/devicetree/bindings/net/brcm,bcmgenet.txt +F: Documentation/devicetree/bindings/net/brcm,bcmgenet.yaml F: Documentation/devicetree/bindings/net/brcm,unimac-mdio.yaml F: drivers/net/ethernet/broadcom/genet/ F: drivers/net/ethernet/broadcom/unimac.h @@@ -3927,7 -3873,7 +3927,7 @@@ M: Rafał Miłecki <rafal@milecki.pl M: bcm-kernel-feedback-list@broadcom.com L: netdev@vger.kernel.org S: Maintained -F: Documentation/devicetree/bindings/net/brcm,amac.txt +F: Documentation/devicetree/bindings/net/brcm,amac.yaml F: drivers/net/ethernet/broadcom/bgmac* F: drivers/net/ethernet/broadcom/unimac.h
@@@ -4002,7 -3948,7 +4002,7 @@@ M: Markus Mayer <mmayer@broadcom.com M: bcm-kernel-feedback-list@broadcom.com L: linux-pm@vger.kernel.org S: Maintained -F: Documentation/devicetree/bindings/thermal/brcm,avs-tmon.txt +F: Documentation/devicetree/bindings/thermal/brcm,avs-tmon.yaml F: drivers/thermal/broadcom/brcmstb*
BROADCOM STB DPFE DRIVER @@@ -4038,7 -3984,6 +4038,7 @@@ L: netdev@vger.kernel.or S: Supported F: drivers/net/ethernet/broadcom/bcmsysport.* F: drivers/net/ethernet/broadcom/unimac.h +F: Documentation/devicetree/bindings/net/brcm,systemport.yaml
BROADCOM TG3 GIGABIT ETHERNET DRIVER M: Siva Reddy Kallam siva.kallam@broadcom.com @@@ -4578,12 -4523,9 +4578,12 @@@ F: drivers/media/cec/i2c/ch7322. CIRRUS LOGIC AUDIO CODEC DRIVERS M: James Schulman james.schulman@cirrus.com M: David Rhodes david.rhodes@cirrus.com +M: Lucas Tanure tanureal@opensource.cirrus.com L: alsa-devel@alsa-project.org (moderated for non-subscribers) L: patches@opensource.cirrus.com S: Maintained +F: Documentation/devicetree/bindings/sound/cirrus,cs* +F: sound/pci/hda/cs* F: sound/soc/codecs/cs*
CIRRUS LOGIC DSP FIRMWARE DRIVER @@@ -4783,8 -4725,6 +4783,8 @@@ M: Ian Abbott <abbotti@mev.co.uk M: H Hartley Sweeten hsweeten@visionengravers.com S: Odd Fixes F: drivers/comedi/ +F: include/linux/comedi/ +F: include/uapi/linux/comedi.h
COMMON CLK FRAMEWORK M: Michael Turquette mturquette@baylibre.com @@@ -5483,12 -5423,6 +5483,12 @@@ W: https://linuxtv.or T: git git://linuxtv.org/media_tree.git F: drivers/media/platform/sti/delta
+DELTA AHE-50DC FAN CONTROL MODULE DRIVER +M: Zev Weiss zev@bewilderbeest.net +L: linux-hwmon@vger.kernel.org +S: Maintained +F: drivers/hwmon/pmbus/delta-ahe50dc-fan.c + DELTA DPS920AB PSU DRIVER M: Robert Marko robert.marko@sartura.hr L: linux-hwmon@vger.kernel.org @@@ -6116,7 -6050,6 +6116,7 @@@ F: drivers/gpu/drm/tiny/mi0283qt. DRM DRIVER FOR MSM ADRENO GPU M: Rob Clark robdclark@gmail.com M: Sean Paul sean@poorly.run +R: Abhinav Kumar quic_abhinavk@quicinc.com L: linux-arm-msm@vger.kernel.org L: dri-devel@lists.freedesktop.org L: freedreno@lists.freedesktop.org @@@ -6142,17 -6075,10 +6142,17 @@@ F: drivers/gpu/drm/panel/panel-novatek-
DRM DRIVER FOR NVIDIA GEFORCE/QUADRO GPUS M: Ben Skeggs bskeggs@redhat.com +M: Karol Herbst kherbst@redhat.com +M: Lyude Paul lyude@redhat.com L: dri-devel@lists.freedesktop.org L: nouveau@lists.freedesktop.org S: Supported -T: git git://github.com/skeggsb/linux +W: https://nouveau.freedesktop.org/ +Q: https://patchwork.freedesktop.org/project/nouveau/ +Q: https://gitlab.freedesktop.org/drm/nouveau/-/merge_requests +B: https://gitlab.freedesktop.org/drm/nouveau/-/issues +C: irc://irc.oftc.net/nouveau +T: git https://gitlab.freedesktop.org/drm/nouveau.git F: drivers/gpu/drm/nouveau/ F: include/uapi/drm/nouveau_drm.h
@@@ -6385,7 -6311,7 +6385,7 @@@ F: Documentation/devicetree/bindings/di F: drivers/gpu/drm/atmel-hlcdc/
DRM DRIVERS FOR BRIDGE CHIPS -M: Andrzej Hajda a.hajda@samsung.com +M: Andrzej Hajda andrzej.hajda@intel.com M: Neil Armstrong narmstrong@baylibre.com M: Robert Foss robert.foss@linaro.org R: Laurent Pinchart Laurent.pinchart@ideasonboard.com @@@ -6492,7 -6418,6 +6492,7 @@@ L: dri-devel@lists.freedesktop.or L: linux-renesas-soc@vger.kernel.org S: Supported T: git git://linuxtv.org/pinchartl/media drm/du/next +F: Documentation/devicetree/bindings/display/bridge/renesas,dsi-csi2-tx.yaml F: Documentation/devicetree/bindings/display/bridge/renesas,dw-hdmi.yaml F: Documentation/devicetree/bindings/display/bridge/renesas,lvds.yaml F: Documentation/devicetree/bindings/display/renesas,du.yaml @@@ -6611,14 -6536,6 +6611,14 @@@ F: drivers/gpu/drm/drm_panel. F: drivers/gpu/drm/panel/ F: include/drm/drm_panel.h
+DRM PRIVACY-SCREEN CLASS +M: Hans de Goede hdegoede@redhat.com +L: dri-devel@lists.freedesktop.org +S: Maintained +T: git git://anongit.freedesktop.org/drm/drm-misc +F: drivers/gpu/drm/drm_privacy_screen* +F: include/drm/drm_privacy_screen* + DRM TTM SUBSYSTEM M: Christian Koenig christian.koenig@amd.com M: Huang Rui ray.huang@amd.com @@@ -7096,7 -7013,9 +7096,7 @@@ S: Maintaine F: drivers/mmc/host/cqhci*
EMULEX 10Gbps iSCSI - OneConnect DRIVER -M: Subbu Seetharaman subbu.seetharaman@broadcom.com M: Ketan Mukadam ketan.mukadam@broadcom.com -M: Jitendra Bhivare jitendra.bhivare@broadcom.com L: linux-scsi@vger.kernel.org S: Supported W: http://www.broadcom.com @@@ -7507,6 -7426,12 +7507,6 @@@ F: Documentation/firmware_class F: drivers/base/firmware_loader/ F: include/linux/firmware.h
-FLASH ADAPTER DRIVER (IBM Flash Adapter 900GB Full Height PCI Flash Card) -M: Joshua Morris josh.h.morris@us.ibm.com -M: Philip Kelleher pjk1939@linux.ibm.com -S: Maintained -F: drivers/block/rsxx/ - FLEXTIMER FTM-QUADDEC DRIVER M: Patrick Havelange patrick.havelange@essensium.com L: linux-iio@vger.kernel.org @@@ -7598,7 -7523,6 +7598,7 @@@ F: include/video FREESCALE CAAM (Cryptographic Acceleration and Assurance Module) DRIVER M: Horia Geantă horia.geanta@nxp.com M: Pankaj Gupta pankaj.gupta@nxp.com +M: Gaurav Jain gaurav.jain@nxp.com L: linux-crypto@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/crypto/fsl-sec4.txt @@@ -8563,12 -8487,6 +8563,12 @@@ F: drivers/hid F: include/linux/hid* F: include/uapi/linux/hid*
+HID LOGITECH DRIVERS +R: Filipe Laíns lains@riseup.net +L: linux-input@vger.kernel.org +S: Maintained +F: drivers/hid/hid-logitech-* + HID PLAYSTATION DRIVER M: Roderick Colenbrander roderick.colenbrander@sony.com L: linux-input@vger.kernel.org @@@ -8690,10 -8608,8 +8690,10 @@@ F: drivers/misc/hisi_hikey_usb.
HISILICON PMU DRIVER M: Shaokun Zhang zhangshaokun@hisilicon.com +M: Qi Liu liuqi115@huawei.com S: Supported W: http://www.hisilicon.com +F: Documentation/admin-guide/perf/hisi-pcie-pmu.rst F: Documentation/admin-guide/perf/hisi-pmu.rst F: drivers/perf/hisilicon
@@@ -8724,7 -8640,6 +8724,7 @@@ F: drivers/scsi/hisi_sas
HISILICON SECURITY ENGINE V2 DRIVER (SEC2) M: Zaibo Xu xuzaibo@huawei.com +M: Kai Ye yekai13@huawei.com L: linux-crypto@vger.kernel.org S: Maintained F: Documentation/ABI/testing/debugfs-hisi-sec @@@ -9575,7 -9490,6 +9575,7 @@@ INTEL DRM DRIVERS (excluding Poulsbo, M M: Jani Nikula jani.nikula@linux.intel.com M: Joonas Lahtinen joonas.lahtinen@linux.intel.com M: Rodrigo Vivi rodrigo.vivi@intel.com +M: Tvrtko Ursulin tvrtko.ursulin@linux.intel.com L: intel-gfx@lists.freedesktop.org S: Supported W: https://01.org/linuxgraphics/ @@@ -9770,6 -9684,7 +9770,6 @@@ F: Documentation/devicetree/bindings/cr F: drivers/crypto/keembay/Kconfig F: drivers/crypto/keembay/Makefile F: drivers/crypto/keembay/keembay-ocs-ecc.c -F: drivers/crypto/keembay/ocs-ecc-curve-defs.h
INTEL KEEM BAY OCS HCU CRYPTO DRIVER M: Daniele Alessandrelli daniele.alessandrelli@intel.com @@@ -9782,13 -9697,6 +9782,13 @@@ F: drivers/crypto/keembay/keembay-ocs-h F: drivers/crypto/keembay/ocs-hcu.c F: drivers/crypto/keembay/ocs-hcu.h
+INTEL THUNDER BAY EMMC PHY DRIVER +M: Nandhini Srikandan nandhini.srikandan@intel.com +M: Rashmi A rashmi.a@intel.com +S: Maintained +F: Documentation/devicetree/bindings/phy/intel,phy-thunderbay-emmc.yaml +F: drivers/phy/intel/phy-intel-thunderbay-emmc.c + INTEL MANAGEMENT ENGINE (mei) M: Tomas Winkler tomas.winkler@intel.com L: linux-kernel@vger.kernel.org @@@ -9844,9 -9752,10 +9844,9 @@@ S: Maintaine F: drivers/mfd/intel_soc_pmic* F: include/linux/mfd/intel_soc_pmic*
-INTEL PMT DRIVER -M: "David E. Box" david.e.box@linux.intel.com -S: Maintained -F: drivers/mfd/intel_pmt.c +INTEL PMT DRIVERS +M: David E. Box david.e.box@linux.intel.com +S: Supported F: drivers/platform/x86/intel/pmt/
INTEL PRO/WIRELESS 2100, 2200BG, 2915ABG NETWORK CONNECTION SUPPORT @@@ -9913,11 -9822,6 +9913,11 @@@ L: platform-driver-x86@vger.kernel.or S: Maintained F: drivers/platform/x86/intel/uncore-frequency.c
+INTEL VENDOR SPECIFIC EXTENDED CAPABILITIES DRIVER +M: David E. Box david.e.box@linux.intel.com +S: Supported +F: drivers/platform/x86/intel/vsec.* + INTEL VIRTUAL BUTTON DRIVER M: AceLan Kao acelan.kao@canonical.com L: platform-driver-x86@vger.kernel.org @@@ -10842,13 -10746,6 +10842,13 @@@ S: Maintaine W: http://legousb.sourceforge.net/ F: drivers/usb/misc/legousbtower.c
+LETSKETCH HID TABLET DRIVER +M: Hans de Goede hdegoede@redhat.com +L: linux-input@vger.kernel.org +S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/hid/hid.git +F: drivers/hid/hid-letsketch.c + LG LAPTOP EXTRAS M: Matan Ziv-Av matan@svgalib.org L: platform-driver-x86@vger.kernel.org @@@ -11637,12 -11534,6 +11637,12 @@@ S: Maintaine F: Documentation/devicetree/bindings/media/i2c/maxim,max9286.yaml F: drivers/media/i2c/max9286.c
+MAX96712 QUAD GMSL2 DESERIALIZER DRIVER +M: Niklas Söderlund niklas.soderlund@ragnatech.se +L: linux-media@vger.kernel.org +S: Maintained +F: drivers/staging/media/max96712/max96712.c + MAX9860 MONO AUDIO VOICE CODEC DRIVER M: Peter Rosin peda@axentia.se L: alsa-devel@alsa-project.org (moderated for non-subscribers) @@@ -11678,13 -11569,6 +11678,13 @@@ S: Maintaine F: Documentation/devicetree/bindings/power/supply/maxim,max17042.yaml F: drivers/power/supply/max17042_battery.c
+MAXIM MAX20086 CAMERA POWER PROTECTOR DRIVER +M: Laurent Pinchart laurent.pinchart@ideasonboard.com +L: linux-kernel@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/regulator/maxim,max20086.yaml +F: drivers/regulator/max20086-regulator.c + MAXIM MAX77650 PMIC MFD DRIVER M: Bartosz Golaszewski brgl@bgdev.pl L: linux-kernel@vger.kernel.org @@@ -11707,12 -11591,6 +11707,12 @@@ F: Documentation/devicetree/bindings/*/ F: drivers/regulator/max77802-regulator.c F: include/dt-bindings/*/*max77802.h
+MAXIM MAX77976 BATTERY CHARGER +M: Luca Ceresoli luca@lucaceresoli.net +S: Supported +F: Documentation/devicetree/bindings/power/supply/maxim,max77976.yaml +F: drivers/power/supply/max77976_charger.c + MAXIM MUIC CHARGER DRIVERS FOR EXYNOS BASED BOARDS M: Krzysztof Kozlowski krzysztof.kozlowski@canonical.com M: Bartlomiej Zolnierkiewicz b.zolnierkie@samsung.com @@@ -11727,7 -11605,7 +11727,7 @@@ M: Krzysztof Kozlowski <krzysztof.kozlo M: Bartlomiej Zolnierkiewicz b.zolnierkie@samsung.com L: linux-kernel@vger.kernel.org S: Supported -F: Documentation/devicetree/bindings/*/max77686.txt +F: Documentation/devicetree/bindings/*/maxim,max77686.yaml F: Documentation/devicetree/bindings/clock/maxim,max77686.txt F: Documentation/devicetree/bindings/mfd/max14577.txt F: Documentation/devicetree/bindings/mfd/max77693.txt @@@ -12646,13 -12524,6 +12646,13 @@@ L: netdev@vger.kernel.or S: Maintained F: drivers/net/ethernet/microchip/lan743x_*
+MICROCHIP LAN966X ETHERNET DRIVER +M: Horatiu Vultur horatiu.vultur@microchip.com +M: UNGLinuxDriver@microchip.com +L: netdev@vger.kernel.org +S: Maintained +F: drivers/net/ethernet/microchip/lan966x/* + MICROCHIP LCDFB DRIVER M: Nicolas Ferre nicolas.ferre@microchip.com L: linux-fbdev@vger.kernel.org @@@ -13836,24 -13707,12 +13836,24 @@@ F: Documentation/devicetree/bindings/di F: drivers/gpu/drm/imx/dcss/
NXP i.MX 8QXP ADC DRIVER -M: Cai Huoqing caihuoqing@baidu.com +M: Cai Huoqing cai.huoqing@linux.dev +M: Haibo Chen haibo.chen@nxp.com +L: linux-imx@nxp.com L: linux-iio@vger.kernel.org -S: Supported +S: Maintained F: Documentation/devicetree/bindings/iio/adc/nxp,imx8qxp-adc.yaml F: drivers/iio/adc/imx8qxp-adc.c
+NXP i.MX 7D/6SX/6UL AND VF610 ADC DRIVER +M: Haibo Chen haibo.chen@nxp.com +L: linux-iio@vger.kernel.org +L: linux-imx@nxp.com +S: Maintained +F: Documentation/devicetree/bindings/iio/adc/fsl,imx7d-adc.yaml +F: Documentation/devicetree/bindings/iio/adc/fsl,vf610-adc.yaml +F: drivers/iio/adc/imx7d_adc.c +F: drivers/iio/adc/vf610_adc.c + NXP PF8100/PF8121A/PF8200 PMIC REGULATOR DEVICE DRIVER M: Jagan Teki jagan@amarulasolutions.com S: Maintained @@@ -13927,13 -13786,6 +13927,13 @@@ S: Maintaine F: Documentation/hwmon/nzxt-kraken2.rst F: drivers/hwmon/nzxt-kraken2.c
+NZXT-SMART2 HARDWARE MONITORING DRIVER +M: Aleksandr Mezin mezin.alexander@gmail.com +L: linux-hwmon@vger.kernel.org +S: Maintained +F: Documentation/hwmon/nzxt-smart2.rst +F: drivers/hwmon/nzxt-smart2.c + OBJAGG M: Jiri Pirko jiri@nvidia.com L: netdev@vger.kernel.org @@@ -14246,6 -14098,7 +14246,6 @@@ F: drivers/media/i2c/ov5647.
OMNIVISION OV5670 SENSOR DRIVER M: Chiranjeevi Rapolu chiranjeevi.rapolu@intel.com -M: Hyungwoo Yang hyungwoo.yang@intel.com L: linux-media@vger.kernel.org S: Maintained T: git git://linuxtv.org/media_tree.git @@@ -14258,13 -14111,6 +14258,13 @@@ S: Maintaine T: git git://linuxtv.org/media_tree.git F: drivers/media/i2c/ov5675.c
+OMNIVISION OV5693 SENSOR DRIVER +M: Daniel Scally djrscally@gmail.com +L: linux-media@vger.kernel.org +S: Maintained +T: git git://linuxtv.org/media_tree.git +F: drivers/media/i2c/ov5693.c + OMNIVISION OV5695 SENSOR DRIVER M: Shunqian Zheng zhengsq@rock-chips.com L: linux-media@vger.kernel.org @@@ -14541,6 -14387,15 +14541,15 @@@ F: include/net/page_pool. F: include/trace/events/page_pool.h F: net/core/page_pool.c
+ PAGE TABLE CHECK + M: Pasha Tatashin pasha.tatashin@soleen.com + M: Andrew Morton akpm@linux-foundation.org + L: linux-mm@kvack.org + S: Maintained + F: Documentation/vm/page_table_check.rst + F: include/linux/page_table_check.h + F: mm/page_table_check.c + PANASONIC LAPTOP ACPI EXTRAS DRIVER M: Kenneth Chan kenneth.t.chan@gmail.com L: platform-driver-x86@vger.kernel.org @@@ -15051,7 -14906,7 +15060,7 @@@ F: drivers/pci/controller/dwc/*spear PCMCIA SUBSYSTEM M: Dominik Brodowski linux@dominikbrodowski.net S: Odd Fixes -T: git git://git.kernel.org/pub/scm/linux/kernel/git/brodo/pcmcia.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/brodo/linux.git F: Documentation/pcmcia/ F: drivers/pcmcia/ F: include/pcmcia/ @@@ -15282,11 -15137,6 +15291,11 @@@ L: linux-omap@vger.kernel.or S: Maintained F: drivers/pinctrl/pinctrl-single.c
+PIN CONTROLLER - THUNDERBAY +M: Lakshmi Sowjanya D lakshmi.sowjanya.d@intel.com +S: Supported +F: drivers/pinctrl/pinctrl-thunderbay.c + PKTCDVD DRIVER M: linux-block@vger.kernel.org S: Orphan @@@ -15499,7 -15349,6 +15508,7 @@@ M: Sergey Senozhatsky <senozhatsky@chro R: Steven Rostedt rostedt@goodmis.org R: John Ogness john.ogness@linutronix.de S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/printk/linux.git F: include/linux/printk.h F: kernel/printk/
@@@ -15887,14 -15736,6 +15896,14 @@@ W: https://wireless.wiki.kernel.org/en/ F: Documentation/devicetree/bindings/net/wireless/qca,ath9k.yaml F: drivers/net/wireless/ath/ath9k/
+QUALCOMM BAM-DMUX WWAN NETWORK DRIVER +M: Stephan Gerhold stephan@gerhold.net +L: netdev@vger.kernel.org +L: linux-arm-msm@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/net/qcom,bam-dmux.yaml +F: drivers/net/wwan/qcom_bam_dmux.c + QUALCOMM CAMERA SUBSYSTEM DRIVER M: Robert Foss robert.foss@linaro.org M: Todor Tomov todor.too@gmail.com @@@ -15904,15 -15745,6 +15913,15 @@@ F: Documentation/admin-guide/media/qcom F: Documentation/devicetree/bindings/media/*camss* F: drivers/media/platform/qcom/camss/
+QUALCOMM CLOCK DRIVERS +M: Bjorn Andersson bjorn.andersson@linaro.org +L: linux-arm-msm@vger.kernel.org +S: Supported +T: git git://git.kernel.org/pub/scm/linux/kernel/git/qcom/linux.git +F: Documentation/devicetree/bindings/clock/qcom,* +F: drivers/clk/qcom/ +F: include/dt-bindings/clock/qcom,* + QUALCOMM CORE POWER REDUCTION (CPR) AVS DRIVER M: Niklas Cassel nks@flawful.org L: linux-pm@vger.kernel.org @@@ -16166,7 -15998,6 +16175,7 @@@ F: arch/mips/generic/board-ranchu. RANDOM NUMBER DRIVER M: "Theodore Ts'o" tytso@mit.edu M: Jason A. Donenfeld Jason@zx2c4.com +T: git https://git.kernel.org/pub/scm/linux/kernel/git/crng/random.git S: Maintained F: drivers/char/random.c
@@@ -16474,14 -16305,6 +16483,14 @@@ S: Supporte F: Documentation/devicetree/bindings/iio/adc/renesas,rzg2l-adc.yaml F: drivers/iio/adc/rzg2l_adc.c
+RENESAS R-CAR GEN3 & RZ/N1 NAND CONTROLLER DRIVER +M: Miquel Raynal miquel.raynal@bootlin.com +L: linux-mtd@lists.infradead.org +L: linux-renesas-soc@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/mtd/renesas-nandc.yaml +F: drivers/mtd/nand/raw/renesas-nand-controller.c + RESET CONTROLLER FRAMEWORK M: Philipp Zabel p.zabel@pengutronix.de S: Maintained @@@ -16652,19 -16475,27 +16661,19 @@@ ROHM POWER MANAGEMENT IC DEVICE DRIVER R: Matti Vaittinen matti.vaittinen@fi.rohmeurope.com L: linux-power@fi.rohmeurope.com S: Supported -F: Documentation/devicetree/bindings/mfd/rohm,bd70528-pmic.txt -F: Documentation/devicetree/bindings/regulator/rohm,bd70528-regulator.txt F: drivers/clk/clk-bd718x7.c -F: drivers/gpio/gpio-bd70528.c F: drivers/gpio/gpio-bd71815.c F: drivers/gpio/gpio-bd71828.c -F: drivers/mfd/rohm-bd70528.c F: drivers/mfd/rohm-bd71828.c F: drivers/mfd/rohm-bd718x7.c F: drivers/mfd/rohm-bd9576.c -F: drivers/power/supply/bd70528-charger.c -F: drivers/regulator/bd70528-regulator.c F: drivers/regulator/bd71815-regulator.c F: drivers/regulator/bd71828-regulator.c F: drivers/regulator/bd718x7-regulator.c F: drivers/regulator/bd9576-regulator.c F: drivers/regulator/rohm-regulator.c F: drivers/rtc/rtc-bd70528.c -F: drivers/watchdog/bd70528_wdt.c F: drivers/watchdog/bd9576_wdt.c -F: include/linux/mfd/rohm-bd70528.h F: include/linux/mfd/rohm-bd71815.h F: include/linux/mfd/rohm-bd71828.h F: include/linux/mfd/rohm-bd718x7.h @@@ -17015,15 -16846,13 +17024,15 @@@ F: Documentation/devicetree/bindings/ne F: drivers/nfc/s3fwrn5
SAMSUNG S5C73M3 CAMERA DRIVER -M: Andrzej Hajda a.hajda@samsung.com +M: Sylwester Nawrocki s.nawrocki@samsung.com +M: Andrzej Hajda andrzej.hajda@intel.com L: linux-media@vger.kernel.org S: Supported F: drivers/media/i2c/s5c73m3/*
SAMSUNG S5K5BAF CAMERA DRIVER -M: Andrzej Hajda a.hajda@samsung.com +M: Sylwester Nawrocki s.nawrocki@samsung.com +M: Andrzej Hajda andrzej.hajda@intel.com L: linux-media@vger.kernel.org S: Supported F: drivers/media/i2c/s5k5baf.c @@@ -17052,8 -16881,10 +17061,8 @@@ M: Chanwoo Choi <cw00.choi@samsung.com L: linux-samsung-soc@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/snawrocki/clk.git -F: Documentation/devicetree/bindings/clock/exynos*.txt F: Documentation/devicetree/bindings/clock/samsung,*.yaml F: Documentation/devicetree/bindings/clock/samsung,s3c* -F: Documentation/devicetree/bindings/clock/samsung,s5p* F: drivers/clk/samsung/ F: include/dt-bindings/clock/exynos*.h F: include/dt-bindings/clock/s3c*.h @@@ -17300,13 -17131,6 +17309,13 @@@ L: linux-mmc@vger.kernel.or S: Maintained F: drivers/mmc/host/sdhci-omap.c
+SECURE DIGITAL HOST CONTROLLER INTERFACE (SDHCI) NXP i.MX DRIVER +M: Haibo Chen haibo.chen@nxp.com +L: linux-imx@nxp.com +L: linux-mmc@vger.kernel.org +S: Maintained +F: drivers/mmc/host/sdhci-esdhc-imx.c + SECURE ENCRYPTING DEVICE (SED) OPAL DRIVER M: Jonathan Derrick jonathan.derrick@intel.com M: Revanth Rajashekar revanth.rajashekar@intel.com @@@ -17852,17 -17676,12 +17861,17 @@@ F: drivers/firmware/arm_sdei. F: include/linux/arm_sdei.h F: include/uapi/linux/arm_sdei.h
-SOFTWARE NODES +SOFTWARE NODES AND DEVICE PROPERTIES R: Andy Shevchenko andriy.shevchenko@linux.intel.com +R: Daniel Scally djrscally@gmail.com R: Heikki Krogerus heikki.krogerus@linux.intel.com +R: Sakari Ailus sakari.ailus@linux.intel.com L: linux-acpi@vger.kernel.org S: Maintained +F: drivers/base/property.c F: drivers/base/swnode.c +F: include/linux/fwnode.h +F: include/linux/property.h
SOFTWARE RAID (Multiple Disks) SUPPORT M: Song Liu song@kernel.org @@@ -18022,7 -17841,6 +18031,7 @@@ F: Documentation/sound F: include/sound/ F: include/uapi/sound/ F: sound/ +F: tools/testing/selftests/alsa
SOUND - COMPRESSED AUDIO M: Vinod Koul vkoul@kernel.org @@@ -18042,13 -17860,6 +18051,13 @@@ F: include/sound/dmaengine_pcm. F: sound/core/pcm_dmaengine.c F: sound/soc/soc-generic-dmaengine-pcm.c
+SOUND - ALSA SELFTESTS +M: Mark Brown broonie@kernel.org +L: alsa-devel@alsa-project.org (moderated for non-subscribers) +L: linux-kselftest@vger.kernel.org +S: Supported +F: tools/testing/selftests/alsa + SOUND - SOC LAYER / DYNAMIC AUDIO POWER MANAGEMENT (ASoC) M: Liam Girdwood lgirdwood@gmail.com M: Mark Brown broonie@kernel.org @@@ -18155,8 -17966,8 +18164,8 @@@ F: drivers/pinctrl/spear
SPI NOR SUBSYSTEM M: Tudor Ambarus tudor.ambarus@microchip.com +M: Pratyush Yadav p.yadav@ti.com R: Michael Walle michael@walle.cc -R: Pratyush Yadav p.yadav@ti.com L: linux-mtd@lists.infradead.org S: Maintained W: http://www.linux-mtd.infradead.org/ @@@ -18355,28 -18166,6 +18364,28 @@@ M: Ion Badulescu <ionut@badula.org S: Odd Fixes F: drivers/net/ethernet/adaptec/starfire*
+STARFIVE JH7100 CLOCK DRIVER +M: Emil Renner Berthing kernel@esmil.dk +S: Maintained +F: Documentation/devicetree/bindings/clock/starfive,jh7100-clkgen.yaml +F: drivers/clk/starfive/clk-starfive-jh7100.c +F: include/dt-bindings/clock/starfive-jh7100.h + +STARFIVE JH7100 PINCTRL DRIVER +M: Emil Renner Berthing kernel@esmil.dk +L: linux-gpio@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/pinctrl/starfive,jh7100-pinctrl.yaml +F: drivers/pinctrl/pinctrl-starfive.c +F: include/dt-bindings/pinctrl/pinctrl-starfive.h + +STARFIVE JH7100 RESET CONTROLLER DRIVER +M: Emil Renner Berthing kernel@esmil.dk +S: Maintained +F: Documentation/devicetree/bindings/reset/starfive,jh7100-reset.yaml +F: drivers/reset/reset-starfive-jh7100.c +F: include/dt-bindings/reset/starfive-jh7100.h + STATIC BRANCH/CALL M: Peter Zijlstra peterz@infradead.org M: Josh Poimboeuf jpoimboe@redhat.com @@@ -18538,7 -18327,6 +18547,7 @@@ M: Vineet Gupta <vgupta@kernel.org L: linux-snps-arc@lists.infradead.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc.git +F: Documentation/arc/ F: Documentation/devicetree/bindings/arc/* F: Documentation/devicetree/bindings/interrupt-controller/snps,arc* F: arch/arc/ @@@ -19556,6 -19344,12 +19565,6 @@@ W: https://github.com/srcres258/linux-d T: git git://github.com/srcres258/linux-doc.git doc-zh-tw F: Documentation/translations/zh_TW/
-TRIVIAL PATCHES -M: Jiri Kosina trivial@kernel.org -S: Maintained -T: git git://git.kernel.org/pub/scm/linux/kernel/git/jikos/trivial.git -K: ^Subject:.*(?i)trivial - TTY LAYER M: Greg Kroah-Hartman gregkh@linuxfoundation.org M: Jiri Slaby jirislaby@kernel.org @@@ -19660,7 -19454,6 +19669,7 @@@ S: Supporte W: http://www.linux-mtd.infradead.org/doc/ubifs.html T: git git://git.kernel.org/pub/scm/linux/kernel/git/rw/ubifs.git next T: git git://git.kernel.org/pub/scm/linux/kernel/git/rw/ubifs.git fixes +F: Documentation/ABI/testing/sysfs-fs-ubifs F: Documentation/filesystems/ubifs-authentication.rst F: Documentation/filesystems/ubifs.rst F: fs/ubifs/ @@@ -20412,8 -20205,6 +20421,8 @@@ F: include/uapi/linux/virtio_gpio. VIRTIO GPU DRIVER M: David Airlie airlied@linux.ie M: Gerd Hoffmann kraxel@redhat.com +R: Gurchetan Singh gurchetansingh@chromium.org +R: Chia-I Wu olvaffe@gmail.com L: dri-devel@lists.freedesktop.org L: virtualization@lists.linux-foundation.org S: Maintained @@@ -20647,7 -20438,7 +20656,7 @@@ M: Sergey Senozhatsky <senozhatsky@chro R: Andy Shevchenko andriy.shevchenko@linux.intel.com R: Rasmus Villemoes linux@rasmusvillemoes.dk S: Maintained -T: git git://git.kernel.org/pub/scm/linux/kernel/git/pmladek/printk.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/printk/linux.git F: Documentation/core-api/printk-formats.rst F: lib/test_printf.c F: lib/test_scanf.c @@@ -20915,13 -20706,6 +20924,13 @@@ S: Maintaine T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/mm F: arch/x86/mm/
+X86 PLATFORM ANDROID TABLETS DSDT FIXUP DRIVER +M: Hans de Goede hdegoede@redhat.com +L: platform-driver-x86@vger.kernel.org +S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/pdx86/platform-drivers-x86.git +F: drivers/platform/x86/x86-android-tablets.c + X86 PLATFORM DRIVERS M: Hans de Goede hdegoede@redhat.com M: Mark Gross markgross@kernel.org @@@ -21085,14 -20869,6 +21094,14 @@@ F: drivers/scsi/xen-scsifront. F: drivers/xen/xen-scsiback.c F: include/xen/interface/io/vscsiif.h
+XEN PVUSB DRIVER +M: Juergen Gross jgross@suse.com +L: xen-devel@lists.xenproject.org (moderated for non-subscribers) +L: linux-usb@vger.kernel.org +S: Supported +F: drivers/usb/host/xen* +F: include/xen/interface/io/usbif.h + XEN SOUND FRONTEND DRIVER M: Oleksandr Andrushchenko oleksandr_andrushchenko@epam.com L: xen-devel@lists.xenproject.org (moderated for non-subscribers) @@@ -21125,13 -20901,6 +21134,13 @@@ F: fs/xfs F: include/uapi/linux/dqblk_xfs.h F: include/uapi/linux/fsmap.h
+XILINX AMS DRIVER +M: Anand Ashok Dumbre anand.ashok.dumbre@xilinx.com +L: linux-iio@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/iio/adc/xlnx,zynqmp-ams.yaml +F: drivers/iio/adc/xilinx-ams.c + XILINX AXI ETHERNET DRIVER M: Radhey Shyam Pandey radhey.shyam.pandey@xilinx.com S: Maintained @@@ -21200,12 -20969,6 +21209,12 @@@ T: git https://github.com/Xilinx/linux- F: Documentation/devicetree/bindings/phy/xlnx,zynqmp-psgtr.yaml F: drivers/phy/xilinx/phy-zynqmp.c
+XILINX EVENT MANAGEMENT DRIVER +M: Abhyuday Godhasara abhyuday.godhasara@xilinx.com +S: Maintained +F: drivers/soc/xilinx/xlnx_event_manager.c +F: include/linux/firmware/xlnx-event-manager.h + XILLYBUS DRIVER M: Eli Billauer eli.billauer@gmail.com L: linux-kernel@vger.kernel.org diff --combined arch/Kconfig index 847fde3d22cd,4568b6b70b5d..5a1692392a4d --- a/arch/Kconfig +++ b/arch/Kconfig @@@ -1297,6 -1297,9 +1297,9 @@@ config HAVE_ARCH_PFN_VALI config ARCH_SUPPORTS_DEBUG_PAGEALLOC bool
+ config ARCH_SUPPORTS_PAGE_TABLE_CHECK + bool + config ARCH_SPLIT_ARG64 bool help @@@ -1312,10 -1315,6 +1315,10 @@@ config ARCH_HAS_PARANOID_L1D_FLUS config DYNAMIC_SIGFRAME bool
+# Select, if arch has a named attribute group bound to NUMA device nodes. +config HAVE_ARCH_NODE_DEV_GROUP + bool + source "kernel/gcov/Kconfig"
source "scripts/gcc-plugins/Kconfig" diff --combined arch/arm/mm/fault.c index a1cebe363ed5,c7326a521a69..13949510772a --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@@ -17,7 -17,6 +17,7 @@@ #include <linux/sched/debug.h> #include <linux/highmem.h> #include <linux/perf_event.h> +#include <linux/kfence.h>
#include <asm/system_misc.h> #include <asm/system_info.h> @@@ -100,11 -99,6 +100,11 @@@ void show_pte(const char *lvl, struct m { } #endif /* CONFIG_MMU */
+static inline bool is_write_fault(unsigned int fsr) +{ + return (fsr & FSR_WRITE) && !(fsr & FSR_CM); +} + static void die_kernel_fault(const char *msg, struct mm_struct *mm, unsigned long addr, unsigned int fsr, struct pt_regs *regs) @@@ -137,14 -131,10 +137,14 @@@ __do_kernel_fault(struct mm_struct *mm /* * No handler, we'll have to terminate things with extreme prejudice. */ - if (addr < PAGE_SIZE) + if (addr < PAGE_SIZE) { msg = "NULL pointer dereference"; - else + } else { + if (kfence_handle_page_fault(addr, is_write_fault(fsr), regs)) + return; + msg = "paging request"; + }
die_kernel_fault(msg, mm, addr, fsr, regs); } @@@ -201,8 -191,8 +201,8 @@@ void do_bad_area(unsigned long addr, un }
#ifdef CONFIG_MMU -#define VM_FAULT_BADMAP 0x010000 -#define VM_FAULT_BADACCESS 0x020000 +#define VM_FAULT_BADMAP ((__force vm_fault_t)0x010000) +#define VM_FAULT_BADACCESS ((__force vm_fault_t)0x020000)
static inline bool is_permission_fault(unsigned int fsr) { @@@ -271,7 -261,7 +271,7 @@@ do_page_fault(unsigned long addr, unsig if (user_mode(regs)) flags |= FAULT_FLAG_USER;
- if ((fsr & FSR_WRITE) && !(fsr & FSR_CM)) { + if (is_write_fault(fsr)) { flags |= FAULT_FLAG_WRITE; vm_flags = VM_WRITE; } @@@ -322,7 -312,7 +322,7 @@@ retry return 0; }
- if (!(fault & VM_FAULT_ERROR) && flags & FAULT_FLAG_ALLOW_RETRY) { + if (!(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED; goto retry; diff --combined arch/arm64/mm/fault.c index 9a9e7675b187,a8fb54fccde0..11e04cca0f4f --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@@ -297,8 -297,6 +297,8 @@@ static void die_kernel_fault(const cha pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg, addr);
+ kasan_non_canonical_hook(addr); + mem_abort_decode(esr);
show_pte(addr); @@@ -608,10 -606,8 +608,8 @@@ retry }
if (fault & VM_FAULT_RETRY) { - if (mm_flags & FAULT_FLAG_ALLOW_RETRY) { - mm_flags |= FAULT_FLAG_TRIED; - goto retry; - } + mm_flags |= FAULT_FLAG_TRIED; + goto retry; } mmap_read_unlock(mm);
@@@ -815,8 -811,11 +813,8 @@@ void do_mem_abort(unsigned long far, un if (!inf->fn(far, esr, regs)) return;
- if (!user_mode(regs)) { - pr_alert("Unhandled fault at 0x%016lx\n", addr); - mem_abort_decode(esr); - show_pte(addr); - } + if (!user_mode(regs)) + die_kernel_fault(inf->name, addr, esr, regs);
/* * At this point we have an unrecognized fault type whose tag bits may diff --combined arch/parisc/mm/fault.c index 147868427b7c,360b627645cc..e9eabf8f14d7 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@@ -148,11 -148,11 +148,11 @@@ int fixup_exception(struct pt_regs *reg * Fix up get_user() and put_user(). * ASM_EXCEPTIONTABLE_ENTRY_EFAULT() sets the least-significant * bit in the relative address of the fixup routine to indicate - * that %r8 should be loaded with -EFAULT to report a userspace - * access error. + * that gr[ASM_EXCEPTIONTABLE_REG] should be loaded with + * -EFAULT to report a userspace access error. */ if (fix->fixup & 1) { - regs->gr[8] = -EFAULT; + regs->gr[ASM_EXCEPTIONTABLE_REG] = -EFAULT;
/* zero target register for get_user() */ if (parisc_acctyp(0, regs->iir) == VM_READ) { @@@ -266,14 -266,14 +266,14 @@@ void do_page_fault(struct pt_regs *regs unsigned long acc_type; vm_fault_t fault = 0; unsigned int flags; - - if (faulthandler_disabled()) - goto no_context; + char *msg;
tsk = current; mm = tsk->mm; - if (!mm) + if (!mm) { + msg = "Page fault: no context"; goto no_context; + }
flags = FAULT_FLAG_DEFAULT; if (user_mode(regs)) @@@ -324,16 -324,14 +324,14 @@@ good_area goto bad_area; BUG(); } - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_RETRY) { - /* - * No need to mmap_read_unlock(mm) as we would - * have already released it in __lock_page_or_retry - * in mm/filemap.c. - */ - flags |= FAULT_FLAG_TRIED; - goto retry; - } + if (fault & VM_FAULT_RETRY) { + /* + * No need to mmap_read_unlock(mm) as we would + * have already released it in __lock_page_or_retry + * in mm/filemap.c. + */ + flags |= FAULT_FLAG_TRIED; + goto retry; } mmap_read_unlock(mm); return; @@@ -409,7 -407,6 +407,7 @@@ bad_area force_sig_fault(signo, si_code, (void __user *) address); return; } + msg = "Page fault: bad address";
no_context:
@@@ -417,13 -414,11 +415,13 @@@ return; }
- parisc_terminate("Bad Address (null pointer deref?)", regs, code, address); + parisc_terminate(msg, regs, code, address);
- out_of_memory: +out_of_memory: mmap_read_unlock(mm); - if (!user_mode(regs)) + if (!user_mode(regs)) { + msg = "Page fault: out of memory"; goto no_context; + } pagefault_out_of_memory(); } diff --combined arch/powerpc/mm/fault.c index 2d4a411c7c85,ebcc61e47d62..eb8ecd7343a9 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@@ -35,7 -35,6 +35,7 @@@ #include <linux/kfence.h> #include <linux/pkeys.h>
+#include <asm/asm-prototypes.h> #include <asm/firmware.h> #include <asm/interrupt.h> #include <asm/page.h> @@@ -517,10 -516,8 +517,8 @@@ retry * case. */ if (unlikely(fault & VM_FAULT_RETRY)) { - if (flags & FAULT_FLAG_ALLOW_RETRY) { - flags |= FAULT_FLAG_TRIED; - goto retry; - } + flags |= FAULT_FLAG_TRIED; + goto retry; }
mmap_read_unlock(current->mm); @@@ -621,27 -618,4 +619,27 @@@ DEFINE_INTERRUPT_HANDLER(do_bad_page_fa { bad_page_fault(regs, SIGSEGV); } + +/* + * In radix, segment interrupts indicate the EA is not addressable by the + * page table geometry, so they are always sent here. + * + * In hash, this is called if do_slb_fault returns error. Typically it is + * because the EA was outside the region allowed by software. + */ +DEFINE_INTERRUPT_HANDLER(do_bad_segment_interrupt) +{ + int err = regs->result; + + if (err == -EFAULT) { + if (user_mode(regs)) + _exception(SIGSEGV, regs, SEGV_BNDERR, regs->dar); + else + bad_page_fault(regs, SIGSEGV); + } else if (err == -EINVAL) { + unrecoverable_exception(regs); + } else { + BUG(); + } +} #endif diff --combined arch/s390/mm/fault.c index 6ed2886fc014,d7d6be283d94..ff16ce0d04ee --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@@ -115,7 -115,7 +115,7 @@@ static void dump_pagetable(unsigned lon pr_cont("R1:%016lx ", *table); if (*table & _REGION_ENTRY_INVALID) goto out; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_REGION2: table += (address & _REGION2_INDEX) >> _REGION2_SHIFT; @@@ -124,7 -124,7 +124,7 @@@ pr_cont("R2:%016lx ", *table); if (*table & _REGION_ENTRY_INVALID) goto out; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_REGION3: table += (address & _REGION3_INDEX) >> _REGION3_SHIFT; @@@ -133,7 -133,7 +133,7 @@@ pr_cont("R3:%016lx ", *table); if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE)) goto out; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_SEGMENT: table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; @@@ -142,7 -142,7 +142,7 @@@ pr_cont("S:%016lx ", *table); if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE)) goto out; - table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); + table = __va(*table & _SEGMENT_ENTRY_ORIGIN); } table += (address & _PAGE_INDEX) >> _PAGE_SHIFT; if (bad_address(table)) @@@ -452,21 -452,21 +452,21 @@@ retry if (unlikely(fault & VM_FAULT_ERROR)) goto out_up;
- if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_RETRY) { - if (IS_ENABLED(CONFIG_PGSTE) && gmap && - (flags & FAULT_FLAG_RETRY_NOWAIT)) { - /* FAULT_FLAG_RETRY_NOWAIT has been set, - * mmap_lock has not been released */ - current->thread.gmap_pfault = 1; - fault = VM_FAULT_PFAULT; - goto out_up; - } - flags &= ~FAULT_FLAG_RETRY_NOWAIT; - flags |= FAULT_FLAG_TRIED; - mmap_read_lock(mm); - goto retry; + if (fault & VM_FAULT_RETRY) { + if (IS_ENABLED(CONFIG_PGSTE) && gmap && + (flags & FAULT_FLAG_RETRY_NOWAIT)) { + /* + * FAULT_FLAG_RETRY_NOWAIT has been set, mmap_lock has + * not been released + */ + current->thread.gmap_pfault = 1; + fault = VM_FAULT_PFAULT; + goto out_up; } + flags &= ~FAULT_FLAG_RETRY_NOWAIT; + flags |= FAULT_FLAG_TRIED; + mmap_read_lock(mm); + goto retry; } if (IS_ENABLED(CONFIG_PGSTE) && gmap) { address = __gmap_link(gmap, current->thread.gmap_addr, diff --combined arch/um/kernel/trap.c index 561a2b03c3cf,193503484af5..d1d5d0be0308 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@@ -87,12 -87,10 +87,10 @@@ good_area } BUG(); } - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_RETRY) { - flags |= FAULT_FLAG_TRIED; + if (fault & VM_FAULT_RETRY) { + flags |= FAULT_FLAG_TRIED;
- goto retry; - } + goto retry; }
pmd = pmd_off(mm, address); @@@ -127,6 -125,7 +125,6 @@@ out_of_memory pagefault_out_of_memory(); return 0; } -EXPORT_SYMBOL(handle_page_fault);
static void show_segv_info(struct uml_pt_regs *regs) { diff --combined arch/x86/Kconfig index 976dd6b532bf,d0628415b93e..407533c835fe --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@@ -104,6 -104,7 +104,7 @@@ config X8 select ARCH_SUPPORTS_ACPI select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC + select ARCH_SUPPORTS_PAGE_TABLE_CHECK if X86_64 select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 select ARCH_SUPPORTS_LTO_CLANG @@@ -269,7 -270,6 +270,7 @@@ select HAVE_ARCH_KCSAN if X86_64 select X86_FEATURE_NAMES if PROC_FS select PROC_PID_ARCH_STATUS if PROC_FS + select HAVE_ARCH_NODE_DEV_GROUP if X86_SGX imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI
config INSTRUCTION_DECODER @@@ -473,18 -473,6 +474,18 @@@ config RETPOLIN branches. Requires a compiler with -mindirect-branch=thunk-extern support for full protection. The kernel may run slower.
+config CC_HAS_SLS + def_bool $(cc-option,-mharden-sls=all) + +config SLS + bool "Mitigate Straight-Line-Speculation" + depends on CC_HAS_SLS && X86_64 + default n + help + Compile the kernel with straight-line-speculation options to guard + against straight line speculation. The kernel image might be slightly + larger. + config X86_CPU_RESCTRL bool "x86 CPU resource control support" depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD) @@@ -1536,20 -1524,16 +1537,20 @@@ config X86_CPA_STATISTIC helps to determine the effectiveness of preserving large and huge page mappings when mapping protections are changed.
+config X86_MEM_ENCRYPT + select ARCH_HAS_FORCE_DMA_UNENCRYPTED + select DYNAMIC_PHYSICAL_MASK + select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS + def_bool n + config AMD_MEM_ENCRYPT bool "AMD Secure Memory Encryption (SME) support" depends on X86_64 && CPU_SUP_AMD select DMA_COHERENT_POOL - select DYNAMIC_PHYSICAL_MASK select ARCH_USE_MEMREMAP_PROT - select ARCH_HAS_FORCE_DMA_UNENCRYPTED select INSTRUCTION_DECODER - select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS select ARCH_HAS_CC_PLATFORM + select X86_MEM_ENCRYPT help Say yes to enable support for the encryption of system memory. This requires an AMD processor that supports Secure Memory @@@ -1934,7 -1918,6 +1935,7 @@@ config X86_SG select SRCU select MMU_NOTIFIER select NUMA_KEEP_MEMINFO if NUMA + select XARRAY_MULTI help Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions that can be used by applications to set aside private regions of code @@@ -1964,7 -1947,7 +1965,7 @@@ config EF
config EFI_STUB bool "EFI stub support" - depends on EFI && !X86_USE_3DNOW + depends on EFI depends on $(cc-option,-mabi=ms) || X86_32 select RELOCATABLE help diff --combined arch/x86/include/asm/pgtable.h index a34430b7af4a,d7d287ac1018..8a9432fb3802 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@@ -22,11 -22,11 +22,12 @@@ #define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot)))
#ifndef __ASSEMBLY__ +#include <linux/spinlock.h> #include <asm/x86_init.h> #include <asm/pkru.h> #include <asm/fpu/api.h> #include <asm-generic/pgtable_uffd.h> + #include <linux/page_table_check.h>
extern pgd_t early_top_pgt[PTRS_PER_PGD]; bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd); @@@ -753,7 -753,7 +754,7 @@@ static inline bool pte_accessible(struc return true;
if ((pte_flags(a) & _PAGE_PROTNONE) && - mm_tlb_flush_pending(mm)) + atomic_read(&mm->tlb_flush_pending)) return true;
return false; @@@ -1007,18 -1007,21 +1008,21 @@@ static inline pud_t native_local_pudp_g static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { + page_table_check_pte_set(mm, addr, ptep, pte); set_pte(ptep, pte); }
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { + page_table_check_pmd_set(mm, addr, pmdp, pmd); set_pmd(pmdp, pmd); }
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { + page_table_check_pud_set(mm, addr, pudp, pud); native_set_pud(pudp, pud); }
@@@ -1049,6 -1052,7 +1053,7 @@@ static inline pte_t ptep_get_and_clear( pte_t *ptep) { pte_t pte = native_ptep_get_and_clear(ptep); + page_table_check_pte_clear(mm, addr, pte); return pte; }
@@@ -1064,12 -1068,23 +1069,23 @@@ static inline pte_t ptep_get_and_clear_ * care about updates and native needs no locking */ pte = native_local_ptep_get_and_clear(ptep); + page_table_check_pte_clear(mm, addr, pte); } else { pte = ptep_get_and_clear(mm, addr, ptep); } return pte; }
+ #define __HAVE_ARCH_PTEP_CLEAR + static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) + { + if (IS_ENABLED(CONFIG_PAGE_TABLE_CHECK)) + ptep_get_and_clear(mm, addr, ptep); + else + pte_clear(mm, addr, ptep); + } + #define __HAVE_ARCH_PTEP_SET_WRPROTECT static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) @@@ -1110,14 -1125,22 +1126,22 @@@ static inline int pmd_write(pmd_t pmd static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - return native_pmdp_get_and_clear(pmdp); + pmd_t pmd = native_pmdp_get_and_clear(pmdp); + + page_table_check_pmd_clear(mm, addr, pmd); + + return pmd; }
#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pud_t *pudp) { - return native_pudp_get_and_clear(pudp); + pud_t pud = native_pudp_get_and_clear(pudp); + + page_table_check_pud_clear(mm, addr, pud); + + return pud; }
#define __HAVE_ARCH_PMDP_SET_WRPROTECT @@@ -1138,6 -1161,7 +1162,7 @@@ static inline int pud_write(pud_t pud static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { + page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); if (IS_ENABLED(CONFIG_SMP)) { return xchg(pmdp, pmd); } else { diff --combined drivers/block/zram/zram_drv.c index f6da5293b913,9a46b2ef6951..cb253d80d72b --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@@ -1903,14 -1903,7 +1903,7 @@@ static struct attribute *zram_disk_attr NULL, };
- static const struct attribute_group zram_disk_attr_group = { - .attrs = zram_disk_attrs, - }; - - static const struct attribute_group *zram_disk_attr_groups[] = { - &zram_disk_attr_group, - NULL, - }; + ATTRIBUTE_GROUPS(zram_disk);
/* * Allocate and initialize new zram device. the function returns @@@ -1947,7 -1940,6 +1940,7 @@@ static int zram_add(void zram->disk->major = zram_major; zram->disk->first_minor = device_id; zram->disk->minors = 1; + zram->disk->flags |= GENHD_FL_NO_PART; zram->disk->fops = &zram_devops; zram->disk->private_data = zram; snprintf(zram->disk->disk_name, 16, "zram%d", device_id); @@@ -1983,7 -1975,7 +1976,7 @@@ blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX);
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue); - ret = device_add_disk(NULL, zram->disk, zram_disk_attr_groups); + ret = device_add_disk(NULL, zram->disk, zram_disk_groups); if (ret) goto out_cleanup_disk;
diff --combined drivers/dax/bus.c index ee4568ef757c,a22350e822fa..1dad813ee4a6 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@@ -10,6 -10,8 +10,6 @@@ #include "dax-private.h" #include "bus.h"
-static struct class *dax_class; - static DEFINE_MUTEX(dax_bus_lock);
#define DAX_NAME_LEN 30 @@@ -127,11 -129,35 +127,35 @@@ ATTRIBUTE_GROUPS(dax_drv)
static int dax_bus_match(struct device *dev, struct device_driver *drv);
+ /* + * Static dax regions are regions created by an external subsystem + * nvdimm where a single range is assigned. Its boundaries are by the external + * subsystem and are usually limited to one physical memory range. For example, + * for PMEM it is usually defined by NVDIMM Namespace boundaries (i.e. a + * single contiguous range) + * + * On dynamic dax regions, the assigned region can be partitioned by dax core + * into multiple subdivisions. A subdivision is represented into one + * /dev/daxN.M device composed by one or more potentially discontiguous ranges. + * + * When allocating a dax region, drivers must set whether it's static + * (IORESOURCE_DAX_STATIC). On static dax devices, the @pgmap is pre-assigned + * to dax core when calling devm_create_dev_dax(), whereas in dynamic dax + * devices it is NULL but afterwards allocated by dax core on device ->probe(). + * Care is needed to make sure that dynamic dax devices are torn down with a + * cleared @pgmap field (see kill_dev_dax()). + */ static bool is_static(struct dax_region *dax_region) { return (dax_region->res.flags & IORESOURCE_DAX_STATIC) != 0; }
+ bool static_dev_dax(struct dev_dax *dev_dax) + { + return is_static(dev_dax->region); + } + EXPORT_SYMBOL_GPL(static_dev_dax); + static u64 dev_dax_size(struct dev_dax *dev_dax) { u64 size = 0; @@@ -361,6 -387,14 +385,14 @@@ void kill_dev_dax(struct dev_dax *dev_d
kill_dax(dax_dev); unmap_mapping_range(inode->i_mapping, 0, 0, 1); + + /* + * Dynamic dax region have the pgmap allocated via dev_kzalloc() + * and thus freed by devm. Clear the pgmap to not have stale pgmap + * ranges on probe() from previous reconfigurations of region devices. + */ + if (!static_dev_dax(dev_dax)) + dev_dax->pgmap = NULL; } EXPORT_SYMBOL_GPL(kill_dev_dax);
@@@ -1321,17 -1355,14 +1353,17 @@@ struct dev_dax *devm_create_dev_dax(str }
/* - * No 'host' or dax_operations since there is no access to this - * device outside of mmap of the resulting character device. + * No dax_operations since there is no access to this device outside of + * mmap of the resulting character device. */ - dax_dev = alloc_dax(dev_dax, NULL, NULL, DAXDEV_F_SYNC); + dax_dev = alloc_dax(dev_dax, NULL); if (IS_ERR(dax_dev)) { rc = PTR_ERR(dax_dev); goto err_alloc_dax; } + set_dax_synchronous(dax_dev); + set_dax_nocache(dax_dev); + set_dax_nomc(dax_dev);
/* a device_dax instance is dead while the driver is not attached */ kill_dax(dax_dev); @@@ -1344,7 -1375,10 +1376,7 @@@
inode = dax_inode(dax_dev); dev->devt = inode->i_rdev; - if (data->subsys == DEV_DAX_BUS) - dev->bus = &dax_bus_type; - else - dev->class = dax_class; + dev->bus = &dax_bus_type; dev->parent = parent; dev->type = &dev_dax_type;
@@@ -1443,10 -1477,22 +1475,10 @@@ EXPORT_SYMBOL_GPL(dax_driver_unregister
int __init dax_bus_init(void) { - int rc; - - if (IS_ENABLED(CONFIG_DEV_DAX_PMEM_COMPAT)) { - dax_class = class_create(THIS_MODULE, "dax"); - if (IS_ERR(dax_class)) - return PTR_ERR(dax_class); - } - - rc = bus_register(&dax_bus_type); - if (rc) - class_destroy(dax_class); - return rc; + return bus_register(&dax_bus_type); }
void __exit dax_bus_exit(void) { bus_unregister(&dax_bus_type); - class_destroy(dax_class); } diff --combined drivers/dax/bus.h index 381cec9ff05c,4acdfee7dd59..fbb940293d6d --- a/drivers/dax/bus.h +++ b/drivers/dax/bus.h @@@ -16,15 -16,24 +16,15 @@@ struct dax_region *alloc_dax_region(str struct range *range, int target_node, unsigned int align, unsigned long flags);
-enum dev_dax_subsys { - DEV_DAX_BUS = 0, /* zeroed dev_dax_data picks this by default */ - DEV_DAX_CLASS, -}; - struct dev_dax_data { struct dax_region *dax_region; struct dev_pagemap *pgmap; - enum dev_dax_subsys subsys; resource_size_t size; int id; };
struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data);
-/* to be deleted when DEV_DAX_CLASS is removed */ -struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys); - struct dax_device_driver { struct device_driver drv; struct list_head ids; @@@ -39,7 -48,12 +39,8 @@@ int __dax_driver_register(struct dax_de __dax_driver_register(driver, THIS_MODULE, KBUILD_MODNAME) void dax_driver_unregister(struct dax_device_driver *dax_drv); void kill_dev_dax(struct dev_dax *dev_dax); + bool static_dev_dax(struct dev_dax *dev_dax);
-#if IS_ENABLED(CONFIG_DEV_DAX_PMEM_COMPAT) -int dev_dax_probe(struct dev_dax *dev_dax); -#endif - /* * While run_dax() is potentially a generic operation that could be * defined in include/linux/dax.h we don't want to grow any users diff --combined drivers/dax/device.c index e58d597f0415,591f293d326f..d33a0613ed0c --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@@ -73,11 -73,39 +73,39 @@@ __weak phys_addr_t dax_pgoff_to_phys(st return -1; }
+ static void dax_set_mapping(struct vm_fault *vmf, pfn_t pfn, + unsigned long fault_size) + { + unsigned long i, nr_pages = fault_size / PAGE_SIZE; + struct file *filp = vmf->vma->vm_file; + struct dev_dax *dev_dax = filp->private_data; + pgoff_t pgoff; + + /* mapping is only set on the head */ + if (dev_dax->pgmap->vmemmap_shift) + nr_pages = 1; + + pgoff = linear_page_index(vmf->vma, + ALIGN(vmf->address, fault_size)); + + for (i = 0; i < nr_pages; i++) { + struct page *page = pfn_to_page(pfn_t_to_pfn(pfn) + i); + + page = compound_head(page); + if (page->mapping) + continue; + + page->mapping = filp->f_mapping; + page->index = pgoff + i; + } + } + static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax, - struct vm_fault *vmf, pfn_t *pfn) + struct vm_fault *vmf) { struct device *dev = &dev_dax->dev; phys_addr_t phys; + pfn_t pfn; unsigned int fault_size = PAGE_SIZE;
if (check_vma(dev_dax, vmf->vma, __func__)) @@@ -98,18 -126,21 +126,21 @@@ return VM_FAULT_SIGBUS; }
- *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); + pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
- return vmf_insert_mixed(vmf->vma, vmf->address, *pfn); + dax_set_mapping(vmf, pfn, fault_size); + + return vmf_insert_mixed(vmf->vma, vmf->address, pfn); }
static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax, - struct vm_fault *vmf, pfn_t *pfn) + struct vm_fault *vmf) { unsigned long pmd_addr = vmf->address & PMD_MASK; struct device *dev = &dev_dax->dev; phys_addr_t phys; pgoff_t pgoff; + pfn_t pfn; unsigned int fault_size = PMD_SIZE;
if (check_vma(dev_dax, vmf->vma, __func__)) @@@ -138,19 -169,22 +169,22 @@@ return VM_FAULT_SIGBUS; }
- *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); + pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
- return vmf_insert_pfn_pmd(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE); + dax_set_mapping(vmf, pfn, fault_size); + + return vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE); }
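To make the head-page shortcut in dax_set_mapping() concrete, a worked example assuming 4 KiB base pages (the usual x86-64 configuration):

	/* PMD fault: fault_size = PMD_SIZE = 2 MiB */
	unsigned long nr_pages = PMD_SIZE / PAGE_SIZE;	/* 512 base pages */

	/*
	 * With pgmap->vmemmap_shift == 9 the metadata is a compound page,
	 * so dax_set_mapping() only initializes ->mapping/->index on the
	 * head page (nr_pages is forced to 1); with a zero shift all 512
	 * struct pages are walked.
	 */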
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, - struct vm_fault *vmf, pfn_t *pfn) + struct vm_fault *vmf) { unsigned long pud_addr = vmf->address & PUD_MASK; struct device *dev = &dev_dax->dev; phys_addr_t phys; pgoff_t pgoff; + pfn_t pfn; unsigned int fault_size = PUD_SIZE;
@@@ -180,13 -214,15 +214,15 @@@ return VM_FAULT_SIGBUS; }
- *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); + pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
- return vmf_insert_pfn_pud(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE); + dax_set_mapping(vmf, pfn, fault_size); + + return vmf_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE); } #else static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, - struct vm_fault *vmf, pfn_t *pfn) + struct vm_fault *vmf) { return VM_FAULT_FALLBACK; } @@@ -196,10 -232,8 +232,8 @@@ static vm_fault_t dev_dax_huge_fault(st enum page_entry_size pe_size) { struct file *filp = vmf->vma->vm_file; - unsigned long fault_size; vm_fault_t rc = VM_FAULT_SIGBUS; int id; - pfn_t pfn; struct dev_dax *dev_dax = filp->private_data;
dev_dbg(&dev_dax->dev, "%s: %s (%#lx - %#lx) size = %d\n", current->comm, @@@ -209,43 -243,18 +243,18 @@@ id = dax_read_lock(); switch (pe_size) { case PE_SIZE_PTE: - fault_size = PAGE_SIZE; - rc = __dev_dax_pte_fault(dev_dax, vmf, &pfn); + rc = __dev_dax_pte_fault(dev_dax, vmf); break; case PE_SIZE_PMD: - fault_size = PMD_SIZE; - rc = __dev_dax_pmd_fault(dev_dax, vmf, &pfn); + rc = __dev_dax_pmd_fault(dev_dax, vmf); break; case PE_SIZE_PUD: - fault_size = PUD_SIZE; - rc = __dev_dax_pud_fault(dev_dax, vmf, &pfn); + rc = __dev_dax_pud_fault(dev_dax, vmf); break; default: rc = VM_FAULT_SIGBUS; }
- if (rc == VM_FAULT_NOPAGE) { - unsigned long i; - pgoff_t pgoff; - - /* - * In the device-dax case the only possibility for a - * VM_FAULT_NOPAGE result is when device-dax capacity is - * mapped. No need to consider the zero page, or racing - * conflicting mappings. - */ - pgoff = linear_page_index(vmf->vma, vmf->address - & ~(fault_size - 1)); - for (i = 0; i < fault_size / PAGE_SIZE; i++) { - struct page *page; - - page = pfn_to_page(pfn_t_to_pfn(pfn) + i); - if (page->mapping) - continue; - page->mapping = filp->f_mapping; - page->index = pgoff + i; - } - } dax_read_unlock(id);
return rc; @@@ -398,17 -407,34 +407,34 @@@ int dev_dax_probe(struct dev_dax *dev_d void *addr; int rc, i;
- pgmap = dev_dax->pgmap; - if (dev_WARN_ONCE(dev, pgmap && dev_dax->nr_range > 1, - "static pgmap / multi-range device conflict\n")) - return -EINVAL; + if (static_dev_dax(dev_dax)) { + if (dev_dax->nr_range > 1) { + dev_warn(dev, + "static pgmap / multi-range device conflict\n"); + return -EINVAL; + }
- if (!pgmap) { - pgmap = devm_kzalloc(dev, sizeof(*pgmap) + sizeof(struct range) - * (dev_dax->nr_range - 1), GFP_KERNEL); + pgmap = dev_dax->pgmap; + } else { + if (dev_dax->pgmap) { + dev_warn(dev, + "dynamic-dax with pre-populated page map\n"); + return -EINVAL; + } + + pgmap = devm_kzalloc(dev, + struct_size(pgmap, ranges, dev_dax->nr_range - 1), + GFP_KERNEL); if (!pgmap) return -ENOMEM; + pgmap->nr_range = dev_dax->nr_range; + dev_dax->pgmap = pgmap; + + for (i = 0; i < dev_dax->nr_range; i++) { + struct range *range = &dev_dax->ranges[i].range; + pgmap->ranges[i] = *range; + } }
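The struct_size() helper replaces the open-coded sizeof arithmetic of the removed branch. Assuming dev_pagemap keeps its trailing flexible array of ranges (one struct range is already included in sizeof(*pgmap), hence the nr_range - 1), the allocation is roughly equivalent to:

	/* like struct_size(), minus its overflow saturation */
	size_t bytes = sizeof(struct dev_pagemap) +
		       (dev_dax->nr_range - 1) * sizeof(struct range);

	pgmap = devm_kzalloc(dev, bytes, GFP_KERNEL);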
for (i = 0; i < dev_dax->nr_range; i++) { @@@ -420,12 -446,12 +446,12 @@@ i, range->start, range->end); return -EBUSY; } - /* don't update the range for static pgmap */ - if (!dev_dax->pgmap) - pgmap->ranges[i] = *range; }
pgmap->type = MEMORY_DEVICE_GENERIC; + if (dev_dax->align > PAGE_SIZE) + pgmap->vmemmap_shift = + order_base_2(dev_dax->align >> PAGE_SHIFT); addr = devm_memremap_pages(dev, pgmap); if (IS_ERR(addr)) return PTR_ERR(addr); @@@ -433,7 -459,11 +459,7 @@@ inode = dax_inode(dax_dev); cdev = inode->i_cdev; cdev_init(cdev, &dax_fops); - if (dev->class) { - /* for the CONFIG_DEV_DAX_PMEM_COMPAT case */ - cdev->owner = dev->parent->driver->owner; - } else - cdev->owner = dev->driver->owner; + cdev->owner = dev->driver->owner; cdev_set_parent(cdev, &dev->kobj); rc = cdev_add(cdev, dev->devt, 1); if (rc) diff --combined drivers/of/fdt.c index ca2cfb3012a4,116c582fea7a..ad85ff6474ff --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@@ -26,6 -26,7 +26,7 @@@ #include <linux/serial_core.h> #include <linux/sysfs.h> #include <linux/random.h> + #include <linux/kmemleak.h>
#include <asm/setup.h> /* for COMMAND_LINE_SIZE */ #include <asm/page.h> @@@ -482,11 -483,9 +483,11 @@@ static int __init early_init_dt_reserve if (nomap) { /* * If the memory is already reserved (by another region), we - * should not allow it to be marked nomap. + * should not allow it to be marked nomap, but don't worry + * if the region isn't memory as it won't be mapped. */ - if (memblock_is_region_reserved(base, size)) + if (memblock_overlaps_region(&memblock.memory, base, size) && + memblock_is_region_reserved(base, size)) return -EBUSY;
return memblock_mark_nomap(base, size); @@@ -524,9 -523,12 +525,12 @@@ static int __init __reserved_mem_reserv size = dt_mem_next_cell(dt_root_size_cells, &prop);
if (size && - early_init_dt_reserve_memory_arch(base, size, nomap) == 0) + early_init_dt_reserve_memory_arch(base, size, nomap) == 0) { pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n", uname, &base, (unsigned long)(size / SZ_1M)); + if (!nomap) + kmemleak_alloc_phys(base, size, 0, 0); + } else pr_info("Reserved memory: failed to reserve memory for node '%s': base %pa, size %lu MiB\n", uname, &base, (unsigned long)(size / SZ_1M)); @@@ -967,22 -969,18 +971,22 @@@ static void __init early_init_dt_check_ elfcorehdr_addr, elfcorehdr_size); }
-static phys_addr_t cap_mem_addr; -static phys_addr_t cap_mem_size; +static unsigned long chosen_node_offset = -FDT_ERR_NOTFOUND;
/** * early_init_dt_check_for_usable_mem_range - Decode usable memory range * location from flat tree - * @node: reference to node containing usable memory range location ('chosen') */ -static void __init early_init_dt_check_for_usable_mem_range(unsigned long node) +void __init early_init_dt_check_for_usable_mem_range(void) { const __be32 *prop; int len; + phys_addr_t cap_mem_addr; + phys_addr_t cap_mem_size; + unsigned long node = chosen_node_offset; + + if ((long)node < 0) + return;
pr_debug("Looking for usable-memory-range property... ");
@@@ -995,8 -993,6 +999,8 @@@
pr_debug("cap_mem_start=%pa cap_mem_size=%pa\n", &cap_mem_addr, &cap_mem_size); + + memblock_cap_memory_range(cap_mem_addr, cap_mem_size); }
#ifdef CONFIG_SERIAL_EARLYCON @@@ -1050,14 -1046,13 +1054,14 @@@ int __init early_init_dt_scan_chosen_st /* * early_init_dt_scan_root - fetch the top level address and size cells */ -int __init early_init_dt_scan_root(unsigned long node, const char *uname, - int depth, void *data) +int __init early_init_dt_scan_root(void) { const __be32 *prop; + const void *fdt = initial_boot_params; + int node = fdt_path_offset(fdt, "/");
- if (depth != 0) - return 0; + if (node < 0) + return -ENODEV;
dt_root_size_cells = OF_ROOT_NODE_SIZE_CELLS_DEFAULT; dt_root_addr_cells = OF_ROOT_NODE_ADDR_CELLS_DEFAULT; @@@ -1072,7 -1067,8 +1076,7 @@@ dt_root_addr_cells = be32_to_cpup(prop); pr_debug("dt_root_addr_cells = %x\n", dt_root_addr_cells);
- /* break now */ - return 1; + return 0; }
u64 __init dt_mem_next_cell(int s, const __be32 **cellp) @@@ -1086,78 -1082,73 +1090,78 @@@ /* * early_init_dt_scan_memory - Look for and parse memory nodes */ -int __init early_init_dt_scan_memory(unsigned long node, const char *uname, - int depth, void *data) +int __init early_init_dt_scan_memory(void) { - const char *type = of_get_flat_dt_prop(node, "device_type", NULL); - const __be32 *reg, *endp; - int l; - bool hotpluggable; + int node; + const void *fdt = initial_boot_params;
- /* We are scanning "memory" nodes only */ - if (type == NULL || strcmp(type, "memory") != 0) - return 0; + fdt_for_each_subnode(node, fdt, 0) { + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be32 *reg, *endp; + int l; + bool hotpluggable;
- reg = of_get_flat_dt_prop(node, "linux,usable-memory", &l); - if (reg == NULL) - reg = of_get_flat_dt_prop(node, "reg", &l); - if (reg == NULL) - return 0; + /* We are scanning "memory" nodes only */ + if (type == NULL || strcmp(type, "memory") != 0) + continue;
- endp = reg + (l / sizeof(__be32)); - hotpluggable = of_get_flat_dt_prop(node, "hotpluggable", NULL); + reg = of_get_flat_dt_prop(node, "linux,usable-memory", &l); + if (reg == NULL) + reg = of_get_flat_dt_prop(node, "reg", &l); + if (reg == NULL) + continue;
- pr_debug("memory scan node %s, reg size %d,\n", uname, l); + endp = reg + (l / sizeof(__be32)); + hotpluggable = of_get_flat_dt_prop(node, "hotpluggable", NULL);
- while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) { - u64 base, size; + pr_debug("memory scan node %s, reg size %d,\n", + fdt_get_name(fdt, node, NULL), l);
- base = dt_mem_next_cell(dt_root_addr_cells, ®); - size = dt_mem_next_cell(dt_root_size_cells, ®); + while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) { + u64 base, size;
- if (size == 0) - continue; - pr_debug(" - %llx, %llx\n", base, size); + base = dt_mem_next_cell(dt_root_addr_cells, ®); + size = dt_mem_next_cell(dt_root_size_cells, ®);
- early_init_dt_add_memory_arch(base, size); + if (size == 0) + continue; + pr_debug(" - %llx, %llx\n", base, size);
- if (!hotpluggable) - continue; + early_init_dt_add_memory_arch(base, size);
- if (memblock_mark_hotplug(base, size)) - pr_warn("failed to mark hotplug range 0x%llx - 0x%llx\n", - base, base + size); - } + if (!hotpluggable) + continue;
+ if (memblock_mark_hotplug(base, size)) + pr_warn("failed to mark hotplug range 0x%llx - 0x%llx\n", + base, base + size); + } + } return 0; }
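The memory scan is thus driven by libfdt iteration instead of of_scan_flat_dt() callbacks. A minimal, illustrative sketch of the same pattern in isolation:

	const void *fdt = initial_boot_params;
	int node;

	fdt_for_each_subnode(node, fdt, 0 /* root node offset */) {
		const char *type =
			of_get_flat_dt_prop(node, "device_type", NULL);

		if (type && !strcmp(type, "memory"))
			;	/* parse "linux,usable-memory" or "reg" here */
	}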
-int __init early_init_dt_scan_chosen(unsigned long node, const char *uname, - int depth, void *data) +int __init early_init_dt_scan_chosen(char *cmdline) { - int l; + int l, node; const char *p; const void *rng_seed; + const void *fdt = initial_boot_params;
- pr_debug("search "chosen", depth: %d, uname: %s\n", depth, uname); + node = fdt_path_offset(fdt, "/chosen"); + if (node < 0) + node = fdt_path_offset(fdt, "/chosen@0"); + if (node < 0) + return -ENOENT;
- if (depth != 1 || !data || - (strcmp(uname, "chosen") != 0 && strcmp(uname, "chosen@0") != 0)) - return 0; + chosen_node_offset = node;
early_init_dt_check_for_initrd(node); early_init_dt_check_for_elfcorehdr(node); - early_init_dt_check_for_usable_mem_range(node);
/* Retrieve command line */ p = of_get_flat_dt_prop(node, "bootargs", &l); if (p != NULL && l > 0) - strlcpy(data, p, min(l, COMMAND_LINE_SIZE)); + strlcpy(cmdline, p, min(l, COMMAND_LINE_SIZE));
/* * CONFIG_CMDLINE is meant to be a default in case nothing else * chose to modify the command line. @@@ -1166,18 -1157,18 +1170,18 @@@ */ #ifdef CONFIG_CMDLINE #if defined(CONFIG_CMDLINE_EXTEND) - strlcat(data, " ", COMMAND_LINE_SIZE); - strlcat(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE); + strlcat(cmdline, " ", COMMAND_LINE_SIZE); + strlcat(cmdline, CONFIG_CMDLINE, COMMAND_LINE_SIZE); #elif defined(CONFIG_CMDLINE_FORCE) - strlcpy(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE); + strlcpy(cmdline, CONFIG_CMDLINE, COMMAND_LINE_SIZE); #else /* No arguments from boot loader, use kernel's cmdline */ - if (!((char *)data)[0]) - strlcpy(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE); + if (!((char *)cmdline)[0]) + strlcpy(cmdline, CONFIG_CMDLINE, COMMAND_LINE_SIZE); #endif #endif /* CONFIG_CMDLINE */
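For reference, the outcome of the three CONFIG_CMDLINE variants, using a hypothetical bootargs of "root=/dev/sda1" and CONFIG_CMDLINE of "quiet":

/*
 *   CONFIG_CMDLINE_EXTEND:  "root=/dev/sda1 quiet"  (bootargs, then append)
 *   CONFIG_CMDLINE_FORCE:   "quiet"                 (bootargs ignored)
 *   neither:                "root=/dev/sda1"        (CONFIG_CMDLINE used only
 *                                                    when bootargs is empty)
 */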
- pr_debug("Command line is: %s\n", (char *)data); + pr_debug("Command line is: %s\n", (char *)cmdline);
rng_seed = of_get_flat_dt_prop(node, "rng-seed", &l); if (rng_seed && l > 0) { @@@ -1191,7 -1182,8 +1195,7 @@@ fdt_totalsize(initial_boot_params)); }
- /* break now */ - return 1; + return 0; }
#ifndef MIN_MEMBLOCK_ADDR @@@ -1273,21 -1265,21 +1277,21 @@@ bool __init early_init_dt_verify(void *
void __init early_init_dt_scan_nodes(void) { - int rc = 0; + int rc;
/* Initialize {size,address}-cells info */ - of_scan_flat_dt(early_init_dt_scan_root, NULL); + early_init_dt_scan_root();
/* Retrieve various information from the /chosen node */ - rc = of_scan_flat_dt(early_init_dt_scan_chosen, boot_command_line); - if (!rc) + rc = early_init_dt_scan_chosen(boot_command_line); + if (rc) pr_warn("No chosen node found, continuing without\n");
/* Setup memory, calling early_init_dt_add_memory_arch */ - of_scan_flat_dt(early_init_dt_scan_memory, NULL); + early_init_dt_scan_memory();
/* Handle linux,usable-memory-range property */ - memblock_cap_memory_range(cap_mem_addr, cap_mem_size); + early_init_dt_check_for_usable_mem_range(); }
bool __init early_init_dt_scan(void *params) diff --combined fs/ext4/extents.c index 1077ce7e189f,5582fba36b44..74c91da585d7 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@@ -27,8 -27,8 +27,8 @@@ #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/fiemap.h> - #include <linux/backing-dev.h> #include <linux/iomap.h> + #include <linux/sched/mm.h> #include "ext4_jbd2.h" #include "ext4_extents.h" #include "xattr.h" @@@ -1496,7 -1496,8 +1496,7 @@@ static int ext4_ext_search_left(struct EXT4_ERROR_INODE(inode, "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", ix != NULL ? le32_to_cpu(ix->ei_block) : 0, - EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ? - le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0, + le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block), depth); return -EFSCORRUPTED; } @@@ -2024,6 -2025,7 +2024,6 @@@ int ext4_ext_insert_extent(handle_t *ha + ext4_ext_get_actual_len(newext)); if (unwritten) ext4_ext_mark_unwritten(ex); - eh = path[depth].p_hdr; nearex = ex; goto merge; } @@@ -2052,6 -2054,7 +2052,6 @@@ prepend + ext4_ext_get_actual_len(newext)); if (unwritten) ext4_ext_mark_unwritten(ex); - eh = path[depth].p_hdr; nearex = ex; goto merge; } @@@ -4404,8 -4407,7 +4404,7 @@@ retry err = ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); if (err == -ENOMEM) { - cond_resched(); - congestion_wait(BLK_RW_ASYNC, HZ/50); + memalloc_retry_wait(GFP_ATOMIC); goto retry; } if (err) @@@ -4413,8 -4415,7 +4412,7 @@@ retry_remove_space: err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); if (err == -ENOMEM) { - cond_resched(); - congestion_wait(BLK_RW_ASYNC, HZ/50); + memalloc_retry_wait(GFP_ATOMIC); goto retry_remove_space; } return err; @@@ -4644,6 -4645,8 +4642,6 @@@ static long ext4_zero_range(struct fil ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) goto out_handle; - ext4_fc_track_range(handle, inode, offset >> inode->i_sb->s_blocksize_bits, - (offset + len - 1) >> inode->i_sb->s_blocksize_bits); /* Zero out partial block at the edges of the range */ ret = ext4_zero_partial_blocks(handle, inode, offset, len); if (ret >= 0) @@@ -4692,6 -4695,8 +4690,6 @@@ long ext4_fallocate(struct file *file, FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP;
- ext4_fc_start_update(inode); - if (mode & FALLOC_FL_PUNCH_HOLE) { ret = ext4_punch_hole(inode, offset, len); goto exit; @@@ -4755,6 -4760,7 +4753,6 @@@ out inode_unlock(inode); trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); exit: - ext4_fc_stop_update(inode); return ret; }
@@@ -5336,7 -5342,7 +5334,7 @@@ static int ext4_collapse_range(struct i ret = PTR_ERR(handle); goto out_mmap; } - ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);
down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode, 0); @@@ -5375,6 -5381,7 +5373,6 @@@
out_stop: ext4_journal_stop(handle); - ext4_fc_stop_ineligible(sb); out_mmap: filemap_invalidate_unlock(mapping); out_mutex: @@@ -5476,7 -5483,7 +5474,7 @@@ static int ext4_insert_range(struct ino ret = PTR_ERR(handle); goto out_mmap; } - ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);
/* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; @@@ -5551,6 -5558,7 +5549,6 @@@
out_stop: ext4_journal_stop(handle); - ext4_fc_stop_ineligible(sb); out_mmap: filemap_invalidate_unlock(mapping); out_mutex: diff --combined fs/xfs/xfs_buf.c index bbb0fbd34e64,6c45e3fa56f4..b45e0d50a405 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@@ -394,7 -394,7 +394,7 @@@ xfs_buf_alloc_pages }
XFS_STATS_INC(bp->b_mount, xb_page_retries); - congestion_wait(BLK_RW_ASYNC, HZ / 50); + memalloc_retry_wait(gfp_mask); } return 0; } @@@ -1892,7 -1892,6 +1892,7 @@@ xfs_free_buftarg list_lru_destroy(&btp->bt_lru);
blkdev_issue_flush(btp->bt_bdev); + fs_put_dax(btp->bt_daxdev);
kmem_free(btp); } @@@ -1933,10 -1932,11 +1933,10 @@@ xfs_setsize_buftarg_early return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev)); }
-xfs_buftarg_t * +struct xfs_buftarg * xfs_alloc_buftarg( struct xfs_mount *mp, - struct block_device *bdev, - struct dax_device *dax_dev) + struct block_device *bdev) { xfs_buftarg_t *btp;
@@@ -1945,7 -1945,7 +1945,7 @@@ btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; btp->bt_bdev = bdev; - btp->bt_daxdev = dax_dev; + btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off);
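The buftarg now resolves its own dax_device rather than having the mount path pass one in. A hedged sketch of the lookup pattern for any block-device-backed filesystem; error handling is elided and the variable names are illustrative:

	u64 dax_part_off = 0;
	struct dax_device *dax_dev;

	/* NULL when the block device has no usable dax device behind it */
	dax_dev = fs_dax_get_by_bdev(bdev, &dax_part_off);
	if (dax_dev)
		;	/* dax_part_off is the partition start offset to
			   apply to offsets passed to the dax_device */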
/* * Buffer IO error rate limiting. Limit it to no more than 10 messages diff --combined include/linux/fs.h index f5d3bf5b69a6,5315fa68f751..42ab6d71291c --- a/include/linux/fs.h +++ b/include/linux/fs.h @@@ -41,7 -41,6 +41,7 @@@ #include <linux/stddef.h> #include <linux/mount.h> #include <linux/cred.h> +#include <linux/mnt_idmapping.h>
#include <asm/byteorder.h> #include <uapi/linux/fs.h> @@@ -1600,11 -1599,6 +1600,11 @@@ struct super_block struct list_head s_inodes_wb; /* writeback inodes */ } __randomize_layout;
+static inline struct user_namespace *i_user_ns(const struct inode *inode) +{ + return inode->i_sb->s_user_ns; +} + /* Helper functions so that in most cases filesystems will * not need to deal directly with kuid_t and kgid_t and can * instead deal with the raw numeric values that are stored @@@ -1612,22 -1606,50 +1612,22 @@@ */ static inline uid_t i_uid_read(const struct inode *inode) { - return from_kuid(inode->i_sb->s_user_ns, inode->i_uid); + return from_kuid(i_user_ns(inode), inode->i_uid); }
static inline gid_t i_gid_read(const struct inode *inode) { - return from_kgid(inode->i_sb->s_user_ns, inode->i_gid); + return from_kgid(i_user_ns(inode), inode->i_gid); }
static inline void i_uid_write(struct inode *inode, uid_t uid) { - inode->i_uid = make_kuid(inode->i_sb->s_user_ns, uid); + inode->i_uid = make_kuid(i_user_ns(inode), uid); }
static inline void i_gid_write(struct inode *inode, gid_t gid) { - inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid); -} - -/** - * kuid_into_mnt - map a kuid down into a mnt_userns - * @mnt_userns: user namespace of the relevant mount - * @kuid: kuid to be mapped - * - * Return: @kuid mapped according to @mnt_userns. - * If @kuid has no mapping INVALID_UID is returned. - */ -static inline kuid_t kuid_into_mnt(struct user_namespace *mnt_userns, - kuid_t kuid) -{ - return make_kuid(mnt_userns, __kuid_val(kuid)); -} - -/** - * kgid_into_mnt - map a kgid down into a mnt_userns - * @mnt_userns: user namespace of the relevant mount - * @kgid: kgid to be mapped - * - * Return: @kgid mapped according to @mnt_userns. - * If @kgid has no mapping INVALID_GID is returned. - */ -static inline kgid_t kgid_into_mnt(struct user_namespace *mnt_userns, - kgid_t kgid) -{ - return make_kgid(mnt_userns, __kgid_val(kgid)); + inode->i_gid = make_kgid(i_user_ns(inode), gid); }
/** @@@ -1641,7 -1663,7 +1641,7 @@@ static inline kuid_t i_uid_into_mnt(struct user_namespace *mnt_userns, const struct inode *inode) { - return kuid_into_mnt(mnt_userns, inode->i_uid); + return mapped_kuid_fs(mnt_userns, i_user_ns(inode), inode->i_uid); }
/** @@@ -1655,7 -1677,69 +1655,7 @@@ static inline kgid_t i_gid_into_mnt(struct user_namespace *mnt_userns, const struct inode *inode) { - return kgid_into_mnt(mnt_userns, inode->i_gid); -} - -/** - * kuid_from_mnt - map a kuid up into a mnt_userns - * @mnt_userns: user namespace of the relevant mount - * @kuid: kuid to be mapped - * - * Return: @kuid mapped up according to @mnt_userns. - * If @kuid has no mapping INVALID_UID is returned. - */ -static inline kuid_t kuid_from_mnt(struct user_namespace *mnt_userns, - kuid_t kuid) -{ - return KUIDT_INIT(from_kuid(mnt_userns, kuid)); -} - -/** - * kgid_from_mnt - map a kgid up into a mnt_userns - * @mnt_userns: user namespace of the relevant mount - * @kgid: kgid to be mapped - * - * Return: @kgid mapped up according to @mnt_userns. - * If @kgid has no mapping INVALID_GID is returned. - */ -static inline kgid_t kgid_from_mnt(struct user_namespace *mnt_userns, - kgid_t kgid) -{ - return KGIDT_INIT(from_kgid(mnt_userns, kgid)); -} - -/** - * mapped_fsuid - return caller's fsuid mapped up into a mnt_userns - * @mnt_userns: user namespace of the relevant mount - * - * Use this helper to initialize a new vfs or filesystem object based on - * the caller's fsuid. A common example is initializing the i_uid field of - * a newly allocated inode triggered by a creation event such as mkdir or - * O_CREAT. Other examples include the allocation of quotas for a specific - * user. - * - * Return: the caller's current fsuid mapped up according to @mnt_userns. - */ -static inline kuid_t mapped_fsuid(struct user_namespace *mnt_userns) -{ - return kuid_from_mnt(mnt_userns, current_fsuid()); -} - -/** - * mapped_fsgid - return caller's fsgid mapped up into a mnt_userns - * @mnt_userns: user namespace of the relevant mount - * - * Use this helper to initialize a new vfs or filesystem object based on - * the caller's fsgid. A common example is initializing the i_gid field of - * a newly allocated inode triggered by a creation event such as mkdir or - * O_CREAT. Other examples include the allocation of quotas for a specific - * user. - * - * Return: the caller's current fsgid mapped up according to @mnt_userns. - */ -static inline kgid_t mapped_fsgid(struct user_namespace *mnt_userns) -{ - return kgid_from_mnt(mnt_userns, current_fsgid()); + return mapped_kgid_fs(mnt_userns, i_user_ns(inode), inode->i_gid); }
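As a small usage illustration of these idmapped-mount helpers, the kind of ownership check the VFS permission code performs (a sketch, not code from this patch):

static bool caller_owns(struct user_namespace *mnt_userns,
			const struct inode *inode)
{
	/* inode owner as seen through the (possibly idmapped) mount */
	kuid_t owner = i_uid_into_mnt(mnt_userns, inode);

	return uid_eq(current_fsuid(), owner);
}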
/** @@@ -1669,7 -1753,7 +1669,7 @@@ static inline void inode_fsuid_set(struct inode *inode, struct user_namespace *mnt_userns) { - inode->i_uid = mapped_fsuid(mnt_userns); + inode->i_uid = mapped_fsuid(mnt_userns, i_user_ns(inode)); }
/** @@@ -1683,7 -1767,7 +1683,7 @@@ static inline void inode_fsgid_set(struct inode *inode, struct user_namespace *mnt_userns) { - inode->i_gid = mapped_fsgid(mnt_userns); + inode->i_gid = mapped_fsgid(mnt_userns, i_user_ns(inode)); }
/** @@@ -1700,18 -1784,10 +1700,18 @@@ static inline bool fsuidgid_has_mapping(struct super_block *sb, struct user_namespace *mnt_userns) { - struct user_namespace *s_user_ns = sb->s_user_ns; + struct user_namespace *fs_userns = sb->s_user_ns; + kuid_t kuid; + kgid_t kgid;
- return kuid_has_mapping(s_user_ns, mapped_fsuid(mnt_userns)) && - kgid_has_mapping(s_user_ns, mapped_fsgid(mnt_userns)); + kuid = mapped_fsuid(mnt_userns, fs_userns); + if (!uid_valid(kuid)) + return false; + kgid = mapped_fsgid(mnt_userns, fs_userns); + if (!gid_valid(kgid)) + return false; + return kuid_has_mapping(fs_userns, kuid) && + kgid_has_mapping(fs_userns, kgid); }
extern struct timespec64 current_time(struct inode *inode); @@@ -2173,7 -2249,6 +2173,7 @@@ struct super_operations #define S_ENCRYPTED (1 << 14) /* Encrypted file (using fs/crypto/) */ #define S_CASEFOLD (1 << 15) /* Casefolded file */ #define S_VERITY (1 << 16) /* Verity file (using fs/verity/) */ +#define S_KERNEL_FILE (1 << 17) /* File is in use by the kernel (eg. fs/cachefiles) */
/* * Note that nosuid etc flags are inode-specific: setting some file-system @@@ -2343,8 -2418,6 +2343,8 @@@ static inline void kiocb_clone(struct k * Used to detect that mark_inode_dirty() should not move * inode between dirty lists. * + * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback. + * * Q: What is the difference between I_WILL_FREE and I_FREEING? */ #define I_DIRTY_SYNC (1 << 0) @@@ -2367,7 -2440,6 +2367,7 @@@ #define I_CREATING (1 << 15) #define I_DONTCACHE (1 << 16) #define I_SYNC_QUEUED (1 << 17) +#define I_PINNING_FSCACHE_WB (1 << 18)
#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) @@@ -2652,21 -2724,6 +2652,21 @@@ static inline struct user_namespace *fi { return mnt_user_ns(file->f_path.mnt); } + +/** + * is_idmapped_mnt - check whether a mount is mapped + * @mnt: the mount to check + * + * If @mnt has an idmapping attached different from the + * filesystem's idmapping then @mnt is mapped. + * + * Return: true if mount is mapped, false if not. + */ +static inline bool is_idmapped_mnt(const struct vfsmount *mnt) +{ + return mnt_user_ns(mnt) != mnt->mnt_sb->s_user_ns; +} + extern long vfs_truncate(const struct path *, loff_t); int do_truncate(struct user_namespace *, struct dentry *, loff_t start, unsigned int time_attrs, struct file *filp); @@@ -2790,6 -2847,8 +2790,6 @@@ static inline int filemap_fdatawait(str
extern bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend); -extern bool filemap_range_needs_writeback(struct address_space *, - loff_t lstart, loff_t lend); extern int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); extern int __filemap_fdatawrite_range(struct address_space *mapping, @@@ -3093,6 -3152,7 +3093,7 @@@ extern void unlock_new_inode(struct ino extern void discard_new_inode(struct inode *); extern unsigned int get_next_ino(void); extern void evict_inodes(struct super_block *sb); + void dump_mapping(const struct address_space *);
/* * Userspace may rely on the inode number being non-zero. For example, glibc diff --combined include/linux/kasan.h index fb78108d694e,89c99e5e67de..4a45562d8893 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@@ -9,7 -9,6 +9,7 @@@
struct kmem_cache; struct page; +struct slab; struct vm_struct; struct task_struct;
@@@ -194,11 -193,11 +194,11 @@@ static __always_inline size_t kasan_met return 0; }
-void __kasan_poison_slab(struct page *page); -static __always_inline void kasan_poison_slab(struct page *page) +void __kasan_poison_slab(struct slab *slab); +static __always_inline void kasan_poison_slab(struct slab *slab) { if (kasan_enabled()) - __kasan_poison_slab(page); + __kasan_poison_slab(slab); }
void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object); @@@ -323,7 -322,7 +323,7 @@@ static inline void kasan_cache_create(s slab_flags_t *flags) {} static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {} static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } -static inline void kasan_poison_slab(struct page *page) {} +static inline void kasan_poison_slab(struct slab *slab) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) {} static inline void kasan_poison_object_data(struct kmem_cache *cache, @@@ -475,12 -474,12 +475,12 @@@ static inline void kasan_populate_early * allocations with real shadow memory. With KASAN vmalloc, the special * case is unnecessary, as the work is handled in the generic case. */ - int kasan_module_alloc(void *addr, size_t size); + int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask); void kasan_free_shadow(const struct vm_struct *vm);
#else /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */
- static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } + static inline int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) { return 0; } static inline void kasan_free_shadow(const struct vm_struct *vm) {}
#endif /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ diff --combined include/linux/memcontrol.h index e34112f6a369,0131e5574c88..b72d75141e12 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@@ -33,6 -33,7 +33,7 @@@ enum memcg_stat_item MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS, MEMCG_SOCK, MEMCG_PERCPU_B, + MEMCG_VMALLOC, MEMCG_NR_STAT, };
@@@ -42,6 -43,7 +43,7 @@@ enum memcg_memory_event MEMCG_MAX, MEMCG_OOM, MEMCG_OOM_KILL, + MEMCG_OOM_GROUP_KILL, MEMCG_SWAP_HIGH, MEMCG_SWAP_MAX, MEMCG_SWAP_FAIL, @@@ -536,6 -538,45 +538,6 @@@ static inline bool folio_memcg_kmem(str return folio->memcg_data & MEMCG_DATA_KMEM; }
-/* - * page_objcgs - get the object cgroups vector associated with a page - * @page: a pointer to the page struct - * - * Returns a pointer to the object cgroups vector associated with the page, - * or NULL. This function assumes that the page is known to have an - * associated object cgroups vector. It's not safe to call this function - * against pages, which might have an associated memory cgroup: e.g. - * kernel stack pages. - */ -static inline struct obj_cgroup **page_objcgs(struct page *page) -{ - unsigned long memcg_data = READ_ONCE(page->memcg_data); - - VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), page); - VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page); - - return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); -} - -/* - * page_objcgs_check - get the object cgroups vector associated with a page - * @page: a pointer to the page struct - * - * Returns a pointer to the object cgroups vector associated with the page, - * or NULL. This function is safe to use if the page can be directly associated - * with a memory cgroup. - */ -static inline struct obj_cgroup **page_objcgs_check(struct page *page) -{ - unsigned long memcg_data = READ_ONCE(page->memcg_data); - - if (!memcg_data || !(memcg_data & MEMCG_DATA_OBJCGS)) - return NULL; - - VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page); - - return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); -}
#else static inline bool folio_memcg_kmem(struct folio *folio) @@@ -543,6 -584,15 +545,6 @@@ return false; }
-static inline struct obj_cgroup **page_objcgs(struct page *page) -{ - return NULL; -} - -static inline struct obj_cgroup **page_objcgs_check(struct page *page) -{ - return NULL; -} #endif
static inline bool PageMemcgKmem(struct page *page) @@@ -943,6 -993,21 +945,21 @@@ static inline void mod_memcg_state(stru local_irq_restore(flags); }
+ static inline void mod_memcg_page_state(struct page *page, + int idx, int val) + { + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return; + + rcu_read_lock(); + memcg = page_memcg(page); + if (memcg) + mod_memcg_state(memcg, idx, val); + rcu_read_unlock(); + } + static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { return READ_ONCE(memcg->vmstats.state[idx]); @@@ -1398,6 -1463,11 +1415,11 @@@ static inline void mod_memcg_state(stru { }
+ static inline void mod_memcg_page_state(struct page *page, + int idx, int val) + { + } + static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { return 0; diff --combined include/linux/memremap.h index a8bc588fe7aa,61a6a0e27359..1fafcc38acba --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@@ -72,6 -72,16 +72,6 @@@ struct dev_pagemap_ops */ void (*page_free)(struct page *page);
- /* - * Transition the refcount in struct dev_pagemap to the dead state. - */ - void (*kill)(struct dev_pagemap *pgmap); - - /* - * Wait for refcount in struct dev_pagemap to be idle and reap it. - */ - void (*cleanup)(struct dev_pagemap *pgmap); - /* * Used for private (un-addressable) device memory only. Must migrate * the page back to a CPU accessible page. @@@ -85,9 -95,15 +85,14 @@@ * struct dev_pagemap - metadata for ZONE_DEVICE mappings * @altmap: pre-allocated/reserved memory for vmemmap allocations * @ref: reference count that pins the devm_memremap_pages() mapping - * @internal_ref: internal reference if @ref is not provided by the caller - * @done: completion for @internal_ref + * @done: completion for @ref * @type: memory type: see MEMORY_* in memory_hotplug.h * @flags: PGMAP_* flags to specify defailed behavior + * @vmemmap_shift: structural definition of how the vmemmap page metadata + * is populated, specifically the metadata page order. + * A zero value (default) uses base pages as the vmemmap metadata + * representation. A bigger value will set up compound struct pages + * of the requested order value. * @ops: method table * @owner: an opaque pointer identifying the entity that manages this * instance. Used by various helpers to make sure that no @@@ -98,10 -114,12 +103,11 @@@ */ struct dev_pagemap { struct vmem_altmap altmap; - struct percpu_ref *ref; - struct percpu_ref internal_ref; + struct percpu_ref ref; struct completion done; enum memory_type type; unsigned int flags; + unsigned long vmemmap_shift; const struct dev_pagemap_ops *ops; void *owner; int nr_range; @@@ -118,6 -136,11 +124,11 @@@ static inline struct vmem_altmap *pgmap return NULL; }
+ static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap) + { + return 1 << pgmap->vmemmap_shift; + } + #ifdef CONFIG_ZONE_DEVICE void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); @@@ -179,7 -202,7 +190,7 @@@ static inline unsigned long memremap_co static inline void put_dev_pagemap(struct dev_pagemap *pgmap) { if (pgmap) - percpu_ref_put(pgmap->ref); + percpu_ref_put(&pgmap->ref); }
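A quick numeric illustration of the new vmemmap_shift field, assuming 4 KiB base pages and a 2 MiB device-dax alignment (example values only):

	pgmap->vmemmap_shift = order_base_2(SZ_2M >> PAGE_SHIFT);	/* 9 */
	/*
	 * pgmap_vmemmap_nr(pgmap) == 1 << 9 == 512 base pages per compound
	 * metadata page; a zero shift keeps the per-base-page representation.
	 */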
#endif /* _LINUX_MEMREMAP_H_ */ diff --combined include/linux/mm.h index c768a7c81b0b,d4fb49a5d60d..aa47705191bc --- a/include/linux/mm.h +++ b/include/linux/mm.h @@@ -424,51 -424,6 +424,6 @@@ extern unsigned int kobjsize(const voi */ extern pgprot_t protection_map[16];
- /** - * enum fault_flag - Fault flag definitions. - * @FAULT_FLAG_WRITE: Fault was a write fault. - * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE. - * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked. - * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying. - * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region. - * @FAULT_FLAG_TRIED: The fault has been tried once. - * @FAULT_FLAG_USER: The fault originated in userspace. - * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. - * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. - * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. - * - * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify - * whether we would allow page faults to retry by specifying these two - * fault flags correctly. Currently there can be three legal combinations: - * - * (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and - * this is the first try - * - * (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and - * we've already tried at least once - * - * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry - * - * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never - * be used. Note that page faults can be allowed to retry for multiple times, - * in which case we'll have an initial fault with flags (a) then later on - * continuous faults with flags (b). We should always try to detect pending - * signals before a retry to make sure the continuous page faults can still be - * interrupted if necessary. - */ - enum fault_flag { - FAULT_FLAG_WRITE = 1 << 0, - FAULT_FLAG_MKWRITE = 1 << 1, - FAULT_FLAG_ALLOW_RETRY = 1 << 2, - FAULT_FLAG_RETRY_NOWAIT = 1 << 3, - FAULT_FLAG_KILLABLE = 1 << 4, - FAULT_FLAG_TRIED = 1 << 5, - FAULT_FLAG_USER = 1 << 6, - FAULT_FLAG_REMOTE = 1 << 7, - FAULT_FLAG_INSTRUCTION = 1 << 8, - FAULT_FLAG_INTERRUPTIBLE = 1 << 9, - }; - /* * The default fault flags that should be used by most of the * arch-specific page fault handlers. @@@ -577,6 -532,10 +532,10 @@@ enum page_entry_size */ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); + /** + * @close: Called when the VMA is being removed from the MM. + * Context: User context. May sleep. Caller holds mmap_lock. + */ void (*close)(struct vm_area_struct * area); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *area, unsigned long addr); @@@ -714,27 -673,6 +673,27 @@@ int vma_is_stack_for_current(struct vm_ struct mmu_gather; struct inode;
+static inline unsigned int compound_order(struct page *page) +{ + if (!PageHead(page)) + return 0; + return page[1].compound_order; +} + +/** + * folio_order - The allocation order of a folio. + * @folio: The folio. + * + * A folio is composed of 2^order pages. See get_order() for the definition + * of order. + * + * Return: The order of the folio. + */ +static inline unsigned int folio_order(struct folio *folio) +{ + return compound_order(&folio->page); +} + #include <linux/huge_mm.h>
/* @@@ -861,19 -799,15 +820,15 @@@ static inline int page_mapcount(struct
#ifdef CONFIG_TRANSPARENT_HUGEPAGE int total_mapcount(struct page *page); - int page_trans_huge_mapcount(struct page *page, int *total_mapcount); + int page_trans_huge_mapcount(struct page *page); #else static inline int total_mapcount(struct page *page) { return page_mapcount(page); } - static inline int page_trans_huge_mapcount(struct page *page, - int *total_mapcount) + static inline int page_trans_huge_mapcount(struct page *page) { - int mapcount = page_mapcount(page); - if (total_mapcount) - *total_mapcount = mapcount; - return mapcount; + return page_mapcount(page); } #endif
@@@ -884,13 -818,6 +839,13 @@@ static inline struct page *virt_to_head return compound_head(page); }
+static inline struct folio *virt_to_folio(const void *x) +{ + struct page *page = virt_to_page(x); + + return page_folio(page); +} + void __put_page(struct page *page);
void put_pages_list(struct list_head *pages); @@@ -934,6 -861,27 +889,6 @@@ static inline void destroy_compound_pag compound_page_dtors[page[1].compound_dtor](page); }
-static inline unsigned int compound_order(struct page *page) -{ - if (!PageHead(page)) - return 0; - return page[1].compound_order; -} - -/** - * folio_order - The allocation order of a folio. - * @folio: The folio. - * - * A folio is composed of 2^order pages. See get_order() for the definition - * of order. - * - * Return: The order of the folio. - */ -static inline unsigned int folio_order(struct folio *folio) -{ - return compound_order(&folio->page); -} - static inline bool hpage_pincount_available(struct page *page) { /* @@@ -1760,11 -1708,6 +1715,11 @@@ void page_address_init(void) #define page_address_init() do { } while(0) #endif
+static inline void *folio_address(const struct folio *folio) +{ + return page_address(&folio->page); +} + extern void *page_rmapping(struct page *page); extern struct anon_vma *page_anon_vma(struct page *page); extern pgoff_t __page_file_index(struct page *page); @@@ -1837,6 -1780,28 +1792,6 @@@ static inline bool can_do_mlock(void) extern int user_shm_lock(size_t, struct ucounts *); extern void user_shm_unlock(size_t, struct ucounts *);
-/* - * Parameter block passed down to zap_pte_range in exceptional cases. - */ -struct zap_details { - struct address_space *zap_mapping; /* Check page->mapping if set */ - struct page *single_page; /* Locked page to be unmapped */ -}; - -/* - * We set details->zap_mappings when we want to unmap shared but keep private - * pages. Return true if skip zapping this page, false otherwise. - */ -static inline bool -zap_skip_check_mapping(struct zap_details *details, struct page *page) -{ - if (!details || !page) - return false; - - return details->zap_mapping && - (details->zap_mapping != page_rmapping(page)); -} - struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte); struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, @@@ -1871,6 -1836,7 +1826,6 @@@ extern void truncate_pagecache(struct i extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); -int truncate_inode_page(struct address_space *mapping, struct page *page); int generic_error_remove_page(struct address_space *mapping, struct page *page); int invalidate_inode_page(struct page *page);
@@@ -1881,6 -1847,7 +1836,6 @@@ extern vm_fault_t handle_mm_fault(struc extern int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); -void unmap_mapping_page(struct page *page); void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows); void unmap_mapping_range(struct address_space *mapping, @@@ -1901,6 -1868,7 +1856,6 @@@ static inline int fixup_user_fault(stru BUG(); return -EFAULT; } -static inline void unmap_mapping_page(struct page *page) { } static inline void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows) { } static inline void unmap_mapping_range(struct address_space *mapping, @@@ -1957,6 -1925,7 +1912,6 @@@ int get_kernel_pages(const struct kvec struct page **pages); struct page *get_dump_page(unsigned long addr);
-extern int try_to_release_page(struct page * page, gfp_t gfp_mask); extern void do_invalidatepage(struct page *page, unsigned int offset, unsigned int length);
@@@ -2644,7 -2613,7 +2599,7 @@@ static inline int vma_adjust(struct vm_ extern struct vm_area_struct *vma_merge(struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *, struct vm_userfaultfd_ctx); + struct mempolicy *, struct vm_userfaultfd_ctx, const char *); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int __split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int new_below); @@@ -3153,7 -3122,6 +3108,6 @@@ int drop_caches_sysctl_handler(struct c #endif
void drop_slab(void); - void drop_slab_node(int nid);
#ifndef CONFIG_MMU #define randomize_va_space 0 @@@ -3206,6 -3174,7 +3160,7 @@@ enum mf_flags MF_ACTION_REQUIRED = 1 << 1, MF_MUST_KILL = 1 << 2, MF_SOFT_OFFLINE = 1 << 3, + MF_UNPOISON = 1 << 4, }; extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue(unsigned long pfn, int flags); @@@ -3217,19 -3186,6 +3172,19 @@@ extern void shake_page(struct page *p) extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags);
+#ifndef arch_memory_failure +static inline int arch_memory_failure(unsigned long pfn, int flags) +{ + return -ENXIO; +} +#endif + +#ifndef arch_is_platform_page +static inline bool arch_is_platform_page(u64 paddr) +{ + return false; +} +#endif
/* * Error handlers for various types of pages. @@@ -3246,7 -3202,6 +3201,6 @@@ enum mf_action_page_type MF_MSG_KERNEL_HIGH_ORDER, MF_MSG_SLAB, MF_MSG_DIFFERENT_COMPOUND, - MF_MSG_POISONED_HUGE, MF_MSG_HUGE, MF_MSG_FREE_HUGE, MF_MSG_NON_PMD_HUGE, @@@ -3261,7 -3216,6 +3215,6 @@@ MF_MSG_CLEAN_LRU, MF_MSG_TRUNCATED_LRU, MF_MSG_BUDDY, - MF_MSG_BUDDY_2ND, MF_MSG_DAX, MF_MSG_UNSPLIT_THP, MF_MSG_UNKNOWN, @@@ -3390,5 -3344,16 +3343,16 @@@ static inline int seal_check_future_wri return 0; }
+ #ifdef CONFIG_ANON_VMA_NAME + int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, const char *name); + #else + static inline int + madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, const char *name) { + return 0; + } + #endif + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --combined include/linux/mm_types.h index 1ae3537c7920,e3b0476a4fda..3764c1b51b02 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@@ -5,6 -5,7 +5,7 @@@ #include <linux/mm_types_task.h>
#include <linux/auxvec.h> + #include <linux/kref.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/rbtree.h> @@@ -56,11 -57,11 +57,11 @@@ struct mem_cgroup * in each subpage, but you may need to restore some of their values * afterwards. * - * SLUB uses cmpxchg_double() to atomically update its freelist and - * counters. That requires that freelist & counters be adjacent and - * double-word aligned. We align all struct pages to double-word - * boundaries, and ensure that 'freelist' is aligned within the - * struct. + * SLUB uses cmpxchg_double() to atomically update its freelist and counters. + * That requires that freelist & counters in struct slab be adjacent and + * double-word aligned. Because struct slab currently just reinterprets the + * bits of struct page, we align all struct pages to double-word boundaries, + * and ensure that 'freelist' is aligned within struct slab. */ #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE #define _struct_page_alignment __aligned(2 * sizeof(unsigned long)) @@@ -386,6 -387,12 +387,12 @@@ struct vm_userfaultfd_ctx struct vm_userfaultfd_ctx {}; #endif /* CONFIG_USERFAULTFD */
+ struct anon_vma_name { + struct kref kref; + /* The name needs to be at the end because it is dynamically sized. */ + char name[]; + }; + /* * This struct describes a virtual memory area. There is one of these * per VM-area/task. A VM area is any part of the process virtual memory @@@ -426,11 -433,19 +433,19 @@@ struct vm_area_struct /* * For areas with an address space and backing store, * linkage into the address_space->i_mmap interval tree. + * + * For private anonymous mappings, a pointer to a null terminated string + * containing the name given to the vma, or NULL if unnamed. */ - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } shared; + + union { + struct { + struct rb_node rb; + unsigned long rb_subtree_last; + } shared; + /* Serialized by mmap_sem. */ + struct anon_vma_name *anon_name; + };
/* * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma @@@ -632,7 -647,7 +647,7 @@@ struct mm_struct atomic_t tlb_flush_pending; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* See flush_tlb_batched_pending() */ - bool tlb_flush_batched; + atomic_t tlb_flush_batched; #endif struct uprobes_state uprobes_state; #ifdef CONFIG_PREEMPT_RT @@@ -677,90 -692,6 +692,6 @@@ extern void tlb_gather_mmu(struct mmu_g extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_finish_mmu(struct mmu_gather *tlb);
- static inline void init_tlb_flush_pending(struct mm_struct *mm) - { - atomic_set(&mm->tlb_flush_pending, 0); - } - - static inline void inc_tlb_flush_pending(struct mm_struct *mm) - { - atomic_inc(&mm->tlb_flush_pending); - /* - * The only time this value is relevant is when there are indeed pages - * to flush. And we'll only flush pages after changing them, which - * requires the PTL. - * - * So the ordering here is: - * - * atomic_inc(&mm->tlb_flush_pending); - * spin_lock(&ptl); - * ... - * set_pte_at(); - * spin_unlock(&ptl); - * - * spin_lock(&ptl) - * mm_tlb_flush_pending(); - * .... - * spin_unlock(&ptl); - * - * flush_tlb_range(); - * atomic_dec(&mm->tlb_flush_pending); - * - * Where the increment if constrained by the PTL unlock, it thus - * ensures that the increment is visible if the PTE modification is - * visible. After all, if there is no PTE modification, nobody cares - * about TLB flushes either. - * - * This very much relies on users (mm_tlb_flush_pending() and - * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and - * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc - * locks (PPC) the unlock of one doesn't order against the lock of - * another PTL. - * - * The decrement is ordered by the flush_tlb_range(), such that - * mm_tlb_flush_pending() will not return false unless all flushes have - * completed. - */ - } - - static inline void dec_tlb_flush_pending(struct mm_struct *mm) - { - /* - * See inc_tlb_flush_pending(). - * - * This cannot be smp_mb__before_atomic() because smp_mb() simply does - * not order against TLB invalidate completion, which is what we need. - * - * Therefore we must rely on tlb_flush_*() to guarantee order. - */ - atomic_dec(&mm->tlb_flush_pending); - } - - static inline bool mm_tlb_flush_pending(struct mm_struct *mm) - { - /* - * Must be called after having acquired the PTL; orders against that - * PTLs release and therefore ensures that if we observe the modified - * PTE we must also observe the increment from inc_tlb_flush_pending(). - * - * That is, it only guarantees to return true if there is a flush - * pending for _this_ PTL. - */ - return atomic_read(&mm->tlb_flush_pending); - } - - static inline bool mm_tlb_flush_nested(struct mm_struct *mm) - { - /* - * Similar to mm_tlb_flush_pending(), we must have acquired the PTL - * for which there is a TLB flush pending in order to guarantee - * we've seen both that PTE modification and the increment. - * - * (no requirement on actually still holding the PTL, that is irrelevant) - */ - return atomic_read(&mm->tlb_flush_pending) > 1; - } - struct vm_fault;
/** @@@ -875,4 -806,49 +806,49 @@@ typedef struct unsigned long val; } swp_entry_t;
+ /** + * enum fault_flag - Fault flag definitions. + * @FAULT_FLAG_WRITE: Fault was a write fault. + * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE. + * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked. + * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying. + * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region. + * @FAULT_FLAG_TRIED: The fault has been tried once. + * @FAULT_FLAG_USER: The fault originated in userspace. + * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. + * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. + * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. + * + * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify + * whether we would allow page faults to retry by specifying these two + * fault flags correctly. Currently there can be three legal combinations: + * + * (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and + * this is the first try + * + * (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and + * we've already tried at least once + * + * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry + * + * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never + * be used. Note that page faults can be allowed to retry for multiple times, + * in which case we'll have an initial fault with flags (a) then later on + * continuous faults with flags (b). We should always try to detect pending + * signals before a retry to make sure the continuous page faults can still be + * interrupted if necessary. + */ + enum fault_flag { + FAULT_FLAG_WRITE = 1 << 0, + FAULT_FLAG_MKWRITE = 1 << 1, + FAULT_FLAG_ALLOW_RETRY = 1 << 2, + FAULT_FLAG_RETRY_NOWAIT = 1 << 3, + FAULT_FLAG_KILLABLE = 1 << 4, + FAULT_FLAG_TRIED = 1 << 5, + FAULT_FLAG_USER = 1 << 6, + FAULT_FLAG_REMOTE = 1 << 7, + FAULT_FLAG_INSTRUCTION = 1 << 8, + FAULT_FLAG_INTERRUPTIBLE = 1 << 9, + }; + #endif /* _LINUX_MM_TYPES_H */ diff --combined include/linux/page-flags.h index b3d353d537e2,7e2b90dc7d3f..129421002443 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@@ -68,6 -68,9 +68,6 @@@ * might lose their PG_swapbacked flag when they simply can be dropped (e.g. as * a result of MADV_FREE). * - * PG_uptodate tells whether the page's contents is valid. When a read - * completes, the page becomes uptodate, unless a disk I/O error happened. - * * PG_referenced, PG_reclaim are used for page reclaim for anonymous and * file-backed pagecache (see mm/vmscan.c). * @@@ -380,7 -383,7 +380,7 @@@ static __always_inline int TestClearPag TESTCLEARFLAG(uname, lname, policy)
#define TESTPAGEFLAG_FALSE(uname, lname) \ - static inline bool folio_test_##lname(const struct folio *folio) { return 0; } \ + static inline bool folio_test_##lname(const struct folio *folio) { return false; } \ static inline int Page##uname(const struct page *page) { return 0; }
#define SETPAGEFLAG_NOOP(uname, lname) \ @@@ -519,7 -522,11 +519,11 @@@ PAGEFLAG_FALSE(Uncached, uncached PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) + #define MAGIC_HWPOISON 0x48575053U /* HWPS */ + extern void SetPageHWPoisonTakenOff(struct page *page); + extern void ClearPageHWPoisonTakenOff(struct page *page); extern bool take_page_off_buddy(struct page *page); + extern bool put_page_back_buddy(struct page *page); #else PAGEFLAG_FALSE(HWPoison, hwpoison) #define __PG_HWPOISON 0 @@@ -612,16 -619,6 +616,16 @@@ TESTPAGEFLAG_FALSE(Ksm, ksm
u64 stable_page_flags(struct page *page);
+/** + * folio_test_uptodate - Is this folio up to date? + * @folio: The folio. + * + * The uptodate flag is set on a folio when every byte in the folio is + * at least as new as the corresponding bytes on storage. Anonymous + * and CoW folios are always uptodate. If the folio is not uptodate, + * some of the bytes in it may be; see the is_partially_uptodate() + * address_space operation. + */ static inline bool folio_test_uptodate(struct folio *folio) { bool ret = test_bit(PG_uptodate, folio_flags(folio, 0)); diff --combined kernel/fork.c index 3161d7980155,75737e566441..1c989cc4208a --- a/kernel/fork.c +++ b/kernel/fork.c @@@ -42,6 -42,7 +42,7 @@@ #include <linux/mmu_notifier.h> #include <linux/fs.h> #include <linux/mm.h> + #include <linux/mm_inline.h> #include <linux/vmacache.h> #include <linux/nsproxy.h> #include <linux/capability.h> @@@ -365,12 -366,14 +366,14 @@@ struct vm_area_struct *vm_area_dup(stru *new = data_race(*orig); INIT_LIST_HEAD(&new->anon_vma_chain); new->vm_next = new->vm_prev = NULL; + dup_vma_anon_name(orig, new); } return new; }
void vm_area_free(struct vm_area_struct *vma) { + free_vma_anon_name(vma); kmem_cache_free(vm_area_cachep, vma); }
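The anon_name handling above ties into the anonymous-VMA naming interface added elsewhere in this series. The userspace side is assumed to look roughly like the following prctl() call; PR_SET_VMA, PR_SET_VMA_ANON_NAME and the /proc/<pid>/maps rendering are stated here as assumptions about that interface, not something shown in this diff:

#include <sys/mman.h>
#include <sys/prctl.h>
#include <linux/prctl.h>	/* PR_SET_VMA, PR_SET_VMA_ANON_NAME */

static void *alloc_named_anon(size_t len, const char *name)
{
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	/* expected to appear as "[anon:<name>]" in /proc/<pid>/maps */
	if (p != MAP_FAILED)
		prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
		      (unsigned long)p, len, (unsigned long)name);
	return p;
}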
@@@ -1556,6 -1559,32 +1559,6 @@@ out return error; }
-static int copy_io(unsigned long clone_flags, struct task_struct *tsk) -{ -#ifdef CONFIG_BLOCK - struct io_context *ioc = current->io_context; - struct io_context *new_ioc; - - if (!ioc) - return 0; - /* - * Share io context with parent, if CLONE_IO is set - */ - if (clone_flags & CLONE_IO) { - ioc_task_link(ioc); - tsk->io_context = ioc; - } else if (ioprio_valid(ioc->ioprio)) { - new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); - if (unlikely(!new_ioc)) - return -ENOMEM; - - new_ioc->ioprio = ioc->ioprio; - put_io_context(new_ioc); - } -#endif - return 0; -} - static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; diff --combined kernel/rcu/rcutorture.c index 33ea446101b3,42bc66a2f170..422f7e4cc08d --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@@ -46,7 -46,6 +46,7 @@@ #include <linux/oom.h> #include <linux/tick.h> #include <linux/rcupdate_trace.h> +#include <linux/nmi.h>
#include "rcu.h"
@@@ -54,18 -53,15 +54,18 @@@ MODULE_LICENSE("GPL") MODULE_AUTHOR("Paul E. McKenney paulmck@linux.ibm.com and Josh Triplett josh@joshtriplett.org");
/* Bits for ->extendables field, extendables param, and related definitions. */ -#define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */ -#define RCUTORTURE_RDR_MASK ((1 << RCUTORTURE_RDR_SHIFT) - 1) +#define RCUTORTURE_RDR_SHIFT_1 8 /* Put SRCU index in upper bits. */ +#define RCUTORTURE_RDR_MASK_1 (1 << RCUTORTURE_RDR_SHIFT_1) +#define RCUTORTURE_RDR_SHIFT_2 9 /* Put SRCU index in upper bits. */ +#define RCUTORTURE_RDR_MASK_2 (1 << RCUTORTURE_RDR_SHIFT_2) #define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */ #define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */ #define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */ #define RCUTORTURE_RDR_RBH 0x08 /* ... rcu_read_lock_bh(). */ #define RCUTORTURE_RDR_SCHED 0x10 /* ... rcu_read_lock_sched(). */ -#define RCUTORTURE_RDR_RCU 0x20 /* ... entering another RCU reader. */ -#define RCUTORTURE_RDR_NBITS 6 /* Number of bits defined above. */ +#define RCUTORTURE_RDR_RCU_1 0x20 /* ... entering another RCU reader. */ +#define RCUTORTURE_RDR_RCU_2 0x40 /* ... entering another RCU reader. */ +#define RCUTORTURE_RDR_NBITS 7 /* Number of bits defined above. */ #define RCUTORTURE_MAX_EXTEND \ (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \ RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED) @@@ -79,7 -75,7 +79,7 @@@ torture_param(int, fqs_duration, 0 "Duration of fqs bursts (us), 0 to disable"); torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)"); -torture_param(bool, fwd_progress, 1, "Test grace-period forward progress"); +torture_param(int, fwd_progress, 1, "Test grace-period forward progress"); torture_param(int, fwd_progress_div, 4, "Fraction of CPU stall to wait"); torture_param(int, fwd_progress_holdoff, 60, "Time between forward-progress tests (s)"); @@@ -113,8 -109,6 +113,8 @@@ torture_param(int, shutdown_secs, 0, "S torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s)."); +torture_param(bool, stall_no_softlockup, false, + "Avoid softlockup warning during cpu stall."); torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling."); torture_param(int, stall_cpu_block, 0, "Sleep while stalling."); torture_param(int, stall_gp_kthread, 0, @@@ -146,7 -140,7 +146,7 @@@ static struct task_struct *stats_task static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; static struct task_struct *stall_task; -static struct task_struct *fwd_prog_task; +static struct task_struct **fwd_prog_tasks; static struct task_struct **barrier_cbs_tasks; static struct task_struct *barrier_task; static struct task_struct *read_exit_task; @@@ -348,12 -342,10 +348,12 @@@ struct rcu_torture_ops void (*gp_kthread_dbg)(void); bool (*check_boost_failed)(unsigned long gp_state, int *cpup); int (*stall_dur)(void); + long cbflood_max; int irq_capable; int can_boost; int extendables; int slow_gps; + int no_pi_lock; const char *name; };
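The two shift/mask pairs above keep the SRCU reader indices in bits 8 and 9 of the reader-state word while the extendables flags stay in the low bits. A stand-alone model of that packing, with the constants copied only for illustration:

/* Stand-alone model of the reader-state packing above (illustrative only). */
#include <assert.h>
#include <stdio.h>

#define SHIFT_1   8      /* RCUTORTURE_RDR_SHIFT_1 */
#define SHIFT_2   9      /* RCUTORTURE_RDR_SHIFT_2 */
#define RDR_RCU_1 0x20
#define RDR_RCU_2 0x40

static unsigned int pack(unsigned int idx1, unsigned int idx2, unsigned int flags)
{
        return ((idx1 & 0x1) << SHIFT_1) | ((idx2 & 0x1) << SHIFT_2) | flags;
}

int main(void)
{
        unsigned int readstate = pack(1, 0, RDR_RCU_1 | RDR_RCU_2);

        assert(((readstate >> SHIFT_1) & 0x1) == 1);  /* outer SRCU index */
        assert(((readstate >> SHIFT_2) & 0x1) == 0);  /* nested SRCU index */
        printf("readstate = %#x\n", readstate);
        return 0;
}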
@@@ -675,7 -667,6 +675,7 @@@ static struct rcu_torture_ops srcu_ops .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, .irq_capable = 1, + .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .name = "srcu" };
@@@ -709,7 -700,6 +709,7 @@@ static struct rcu_torture_ops srcud_op .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, .irq_capable = 1, + .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .name = "srcud" };
@@@ -730,7 -720,6 +730,7 @@@ static struct rcu_torture_ops busted_sr .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, .irq_capable = 1, + .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .extendables = RCUTORTURE_MAX_EXTEND, .name = "busted_srcud" }; @@@ -842,7 -831,6 +842,7 @@@ static struct rcu_torture_ops tasks_rud .call = call_rcu_tasks_rude, .cb_barrier = rcu_barrier_tasks_rude, .gp_kthread_dbg = show_rcu_tasks_rude_gp_kthread, + .cbflood_max = 50000, .fqs = NULL, .stats = NULL, .irq_capable = 1, @@@ -883,7 -871,6 +883,7 @@@ static struct rcu_torture_ops tasks_tra .call = call_rcu_tasks_trace, .cb_barrier = rcu_barrier_tasks_trace, .gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread, + .cbflood_max = 50000, .fqs = NULL, .stats = NULL, .irq_capable = 1, @@@ -1433,15 -1420,13 +1433,15 @@@ static void rcutorture_one_extend(int * struct rt_read_seg *rtrsp) { unsigned long flags; - int idxnew = -1; - int idxold = *readstate; + int idxnew1 = -1; + int idxnew2 = -1; + int idxold1 = *readstate; + int idxold2 = idxold1; int statesnew = ~*readstate & newstate; int statesold = *readstate & ~newstate;
- WARN_ON_ONCE(idxold < 0); - WARN_ON_ONCE((idxold >> RCUTORTURE_RDR_SHIFT) > 1); + WARN_ON_ONCE(idxold2 < 0); + WARN_ON_ONCE((idxold2 >> RCUTORTURE_RDR_SHIFT_2) > 1); rtrsp->rt_readstate = newstate;
/* First, put new protection in place to avoid critical-section gap. */ @@@ -1455,10 -1440,8 +1455,10 @@@ preempt_disable(); if (statesnew & RCUTORTURE_RDR_SCHED) rcu_read_lock_sched(); - if (statesnew & RCUTORTURE_RDR_RCU) - idxnew = cur_ops->readlock() << RCUTORTURE_RDR_SHIFT; + if (statesnew & RCUTORTURE_RDR_RCU_1) + idxnew1 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_1; + if (statesnew & RCUTORTURE_RDR_RCU_2) + idxnew2 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_2;
/* * Next, remove old protection, in decreasing order of strength @@@ -1477,20 -1460,12 +1477,20 @@@ local_bh_enable(); if (statesold & RCUTORTURE_RDR_RBH) rcu_read_unlock_bh(); - if (statesold & RCUTORTURE_RDR_RCU) { - bool lockit = !statesnew && !(torture_random(trsp) & 0xffff); + if (statesold & RCUTORTURE_RDR_RCU_2) { + cur_ops->readunlock((idxold2 >> RCUTORTURE_RDR_SHIFT_2) & 0x1); + WARN_ON_ONCE(idxnew2 != -1); + idxold2 = 0; + } + if (statesold & RCUTORTURE_RDR_RCU_1) { + bool lockit;
+ lockit = !cur_ops->no_pi_lock && !statesnew && !(torture_random(trsp) & 0xffff); if (lockit) raw_spin_lock_irqsave(&current->pi_lock, flags); - cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT); + cur_ops->readunlock((idxold1 >> RCUTORTURE_RDR_SHIFT_1) & 0x1); + WARN_ON_ONCE(idxnew1 != -1); + idxold1 = 0; if (lockit) raw_spin_unlock_irqrestore(&current->pi_lock, flags); } @@@ -1500,19 -1475,13 +1500,19 @@@ cur_ops->read_delay(trsp, rtrsp);
/* Update the reader state. */ - if (idxnew == -1) - idxnew = idxold & ~RCUTORTURE_RDR_MASK; - WARN_ON_ONCE(idxnew < 0); - WARN_ON_ONCE((idxnew >> RCUTORTURE_RDR_SHIFT) > 1); - *readstate = idxnew | newstate; - WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) < 0); - WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) > 1); + if (idxnew1 == -1) + idxnew1 = idxold1 & RCUTORTURE_RDR_MASK_1; + WARN_ON_ONCE(idxnew1 < 0); + if (WARN_ON_ONCE((idxnew1 >> RCUTORTURE_RDR_SHIFT_1) > 1)) + pr_info("Unexpected idxnew1 value of %#x\n", idxnew1); + if (idxnew2 == -1) + idxnew2 = idxold2 & RCUTORTURE_RDR_MASK_2; + WARN_ON_ONCE(idxnew2 < 0); + WARN_ON_ONCE((idxnew2 >> RCUTORTURE_RDR_SHIFT_2) > 1); + *readstate = idxnew1 | idxnew2 | newstate; + WARN_ON_ONCE(*readstate < 0); + if (WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT_2) > 1)) + pr_info("Unexpected idxnew2 value of %#x\n", idxnew2); }
/* Return the biggest extendables mask given current RCU and boot parameters. */ @@@ -1522,7 -1491,7 +1522,7 @@@ static int rcutorture_extend_mask_max(v
WARN_ON_ONCE(extendables & ~RCUTORTURE_MAX_EXTEND); mask = extendables & RCUTORTURE_MAX_EXTEND & cur_ops->extendables; - mask = mask | RCUTORTURE_RDR_RCU; + mask = mask | RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2; return mask; }
@@@ -1537,21 -1506,13 +1537,21 @@@ rcutorture_extend_mask(int oldmask, str unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ; unsigned long bhs = RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
- WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT); + WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT_1); /* Mostly only one bit (need preemption!), sometimes lots of bits. */ if (!(randmask1 & 0x7)) mask = mask & randmask2; else mask = mask & (1 << (randmask2 % RCUTORTURE_RDR_NBITS));
+ // Can't have nested RCU reader without outer RCU reader. + if (!(mask & RCUTORTURE_RDR_RCU_1) && (mask & RCUTORTURE_RDR_RCU_2)) { + if (oldmask & RCUTORTURE_RDR_RCU_1) + mask &= ~RCUTORTURE_RDR_RCU_2; + else + mask |= RCUTORTURE_RDR_RCU_1; + } + /* * Can't enable bh w/irq disabled. */ @@@ -1571,7 -1532,7 +1571,7 @@@ mask |= oldmask & bhs; }
- return mask ?: RCUTORTURE_RDR_RCU; + return mask ?: RCUTORTURE_RDR_RCU_1; }
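The nested-reader fixup introduced above enforces the rule that RCUTORTURE_RDR_RCU_2 never appears in the new state without RCUTORTURE_RDR_RCU_1. A stand-alone sketch of that rule; the fixup() helper and the asserts are illustrative only.

/* Stand-alone sketch of the nested-reader rule above (illustrative only). */
#include <assert.h>

#define RDR_RCU_1 0x20
#define RDR_RCU_2 0x40

static unsigned int fixup(unsigned int mask, unsigned int oldmask)
{
        if (!(mask & RDR_RCU_1) && (mask & RDR_RCU_2)) {
                if (oldmask & RDR_RCU_1)
                        mask &= ~RDR_RCU_2;   /* outer reader goes away, so do not nest */
                else
                        mask |= RDR_RCU_1;    /* request the outer reader as well */
        }
        return mask;
}

int main(void)
{
        assert(fixup(RDR_RCU_2, 0) == (RDR_RCU_1 | RDR_RCU_2));
        assert(fixup(RDR_RCU_2, RDR_RCU_1) == 0);
        return 0;
}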
/* @@@ -1665,7 -1626,7 +1665,7 @@@ static bool rcu_torture_one_read(struc rcu_torture_writer_state, cookie, cur_ops->get_gp_state()); rcutorture_one_extend(&readstate, 0, trsp, rtrsp); - WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK); + WARN_ON_ONCE(readstate); // This next splat is expected behavior if leakpointer, especially // for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels. WARN_ON_ONCE(leakpointer && READ_ONCE(p->rtort_pipe_count) > 1); @@@ -2031,9 -1992,8 +2031,8 @@@ static int rcutorture_booster_init(unsi mutex_lock(&boost_mutex); rcu_torture_disable_rt_throttle(); VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task"); - boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, - cpu_to_node(cpu), - "rcu_torture_boost"); + boost_tasks[cpu] = kthread_run_on_cpu(rcu_torture_boost, NULL, + cpu, "rcu_torture_boost_%u"); if (IS_ERR(boost_tasks[cpu])) { retval = PTR_ERR(boost_tasks[cpu]); VERBOSE_TOROUT_STRING("rcu_torture_boost task create failed"); @@@ -2042,8 -2002,6 +2041,6 @@@ mutex_unlock(&boost_mutex); return retval; } - kthread_bind(boost_tasks[cpu], cpu); - wake_up_process(boost_tasks[cpu]); mutex_unlock(&boost_mutex); return 0; } @@@ -2091,8 -2049,6 +2088,8 @@@ static int rcu_torture_stall(void *args #else schedule_timeout_uninterruptible(HZ); #endif + } else if (stall_no_softlockup) { + touch_softlockup_watchdog(); } if (stall_cpu_irqsoff) local_irq_enable(); @@@ -2164,13 -2120,10 +2161,13 @@@ struct rcu_fwd unsigned long rcu_fwd_startat; struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST]; unsigned long rcu_launder_gp_seq_start; + int rcu_fwd_id; };
static DEFINE_MUTEX(rcu_fwd_mutex); static struct rcu_fwd *rcu_fwds; +static unsigned long rcu_fwd_seq; +static atomic_long_t rcu_fwd_max_cbs; static bool rcu_fwd_emergency_stop;
static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) @@@ -2183,9 -2136,8 +2180,9 @@@ for (i = ARRAY_SIZE(rfp->n_launders_hist) - 1; i > 0; i--) if (rfp->n_launders_hist[i].n_launders > 0) break; - pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):", - __func__, jiffies - rfp->rcu_fwd_startat); + mutex_lock(&rcu_fwd_mutex); // Serialize histograms. + pr_alert("%s: Callback-invocation histogram %d (duration %lu jiffies):", + __func__, rfp->rcu_fwd_id, jiffies - rfp->rcu_fwd_startat); gps_old = rfp->rcu_launder_gp_seq_start; for (j = 0; j <= i; j++) { gps = rfp->n_launders_hist[j].launder_gp_seq; @@@ -2196,7 -2148,6 +2193,7 @@@ gps_old = gps; } pr_cont("\n"); + mutex_unlock(&rcu_fwd_mutex); }
/* Callback function for continuous-flood RCU callbacks. */ @@@ -2322,8 -2273,7 +2319,8 @@@ static void rcu_torture_fwd_prog_nr(str cver = READ_ONCE(rcu_torture_current_version) - cver; gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); WARN_ON(!cver && gps < 2); - pr_alert("%s: Duration %ld cver %ld gps %ld\n", __func__, dur, cver, gps); + pr_alert("%s: %d Duration %ld cver %ld gps %ld\n", __func__, + rfp->rcu_fwd_id, dur, cver, gps); } if (selfpropcb) { WRITE_ONCE(fcs.stop, 1); @@@ -2391,7 -2341,7 +2388,7 @@@ static void rcu_torture_fwd_prog_cr(str rfp->rcu_fwd_cb_head = rfcpn; n_launders++; n_launders_sa++; - } else { + } else if (!cur_ops->cbflood_max || cur_ops->cbflood_max > n_max_cbs) { rfcp = kmalloc(sizeof(*rfcp), GFP_KERNEL); if (WARN_ON_ONCE(!rfcp)) { schedule_timeout_interruptible(1); @@@ -2401,11 -2351,8 +2398,11 @@@ n_launders_sa = 0; rfcp->rfc_gps = 0; rfcp->rfc_rfp = rfp; + } else { + rfcp = NULL; } - cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); + if (rfcp) + cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs); if (tick_nohz_full_enabled()) { local_irq_save(flags); @@@ -2429,7 -2376,6 +2426,7 @@@ n_launders + n_max_cbs - n_launders_cb_snap, n_launders, n_launders_sa, n_max_gps, n_max_cbs, cver, gps); + atomic_long_add(n_max_cbs, &rcu_fwd_max_cbs); rcu_torture_fwd_cb_hist(rfp); } schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */ @@@ -2445,8 -2391,6 +2442,8 @@@ static int rcutorture_oom_notify(struct notifier_block *self, unsigned long notused, void *nfreed) { + int i; + long ncbs; struct rcu_fwd *rfp;
mutex_lock(&rcu_fwd_mutex); @@@ -2457,26 -2401,18 +2454,26 @@@ } WARN(1, "%s invoked upon OOM during forward-progress testing.\n", __func__); - rcu_torture_fwd_cb_hist(rfp); - rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp->rcu_fwd_startat)) / 2); + for (i = 0; i < fwd_progress; i++) { + rcu_torture_fwd_cb_hist(&rfp[i]); + rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp[i].rcu_fwd_startat)) / 2); + } WRITE_ONCE(rcu_fwd_emergency_stop, true); smp_mb(); /* Emergency stop before free and wait to avoid hangs. */ - pr_info("%s: Freed %lu RCU callbacks.\n", - __func__, rcu_torture_fwd_prog_cbfree(rfp)); + ncbs = 0; + for (i = 0; i < fwd_progress; i++) + ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); + pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs); rcu_barrier(); - pr_info("%s: Freed %lu RCU callbacks.\n", - __func__, rcu_torture_fwd_prog_cbfree(rfp)); + ncbs = 0; + for (i = 0; i < fwd_progress; i++) + ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); + pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs); rcu_barrier(); - pr_info("%s: Freed %lu RCU callbacks.\n", - __func__, rcu_torture_fwd_prog_cbfree(rfp)); + ncbs = 0; + for (i = 0; i < fwd_progress; i++) + ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); + pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs); smp_mb(); /* Frees before return to avoid redoing OOM. */ (*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */ pr_info("%s returning after OOM processing.\n", __func__); @@@ -2491,10 -2427,7 +2488,10 @@@ static struct notifier_block rcutorture /* Carry out grace-period forward-progress testing. */ static int rcu_torture_fwd_prog(void *args) { + bool firsttime = true; + long max_cbs; int oldnice = task_nice(current); + unsigned long oldseq = READ_ONCE(rcu_fwd_seq); struct rcu_fwd *rfp = args; int tested = 0; int tested_tries = 0; @@@ -2504,38 -2437,21 +2501,38 @@@ if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST)) set_user_nice(current, MAX_NICE); do { - schedule_timeout_interruptible(fwd_progress_holdoff * HZ); - WRITE_ONCE(rcu_fwd_emergency_stop, false); - if (!IS_ENABLED(CONFIG_TINY_RCU) || - rcu_inkernel_boot_has_ended()) - rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries); - if (rcu_inkernel_boot_has_ended()) + if (!rfp->rcu_fwd_id) { + schedule_timeout_interruptible(fwd_progress_holdoff * HZ); + WRITE_ONCE(rcu_fwd_emergency_stop, false); + if (!firsttime) { + max_cbs = atomic_long_xchg(&rcu_fwd_max_cbs, 0); + pr_alert("%s n_max_cbs: %ld\n", __func__, max_cbs); + } + firsttime = false; + WRITE_ONCE(rcu_fwd_seq, rcu_fwd_seq + 1); + } else { + while (READ_ONCE(rcu_fwd_seq) == oldseq) + schedule_timeout_interruptible(1); + oldseq = READ_ONCE(rcu_fwd_seq); + } + pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id); + if (rcu_inkernel_boot_has_ended() && torture_num_online_cpus() > rfp->rcu_fwd_id) rcu_torture_fwd_prog_cr(rfp); + if ((cur_ops->stall_dur && cur_ops->stall_dur() > 0) && + (!IS_ENABLED(CONFIG_TINY_RCU) || + (rcu_inkernel_boot_has_ended() && + torture_num_online_cpus() > rfp->rcu_fwd_id))) + rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries);
/* Avoid slow periods, better to test when busy. */ if (stutter_wait("rcu_torture_fwd_prog")) sched_set_normal(current, oldnice); } while (!torture_must_stop()); /* Short runs might not contain a valid forward-progress attempt. */ - WARN_ON(!tested && tested_tries >= 5); - pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries); + if (!rfp->rcu_fwd_id) { + WARN_ON(!tested && tested_tries >= 5); + pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries); + } torture_kthread_stopping("rcu_torture_fwd_prog"); return 0; } @@@ -2543,28 -2459,17 +2540,28 @@@ /* If forward-progress checking is requested and feasible, spawn the thread. */ static int __init rcu_torture_fwd_prog_init(void) { + int i; + int ret = 0; struct rcu_fwd *rfp;
if (!fwd_progress) return 0; /* Not requested, so don't do it. */ + if (fwd_progress >= nr_cpu_ids) { + VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Limiting fwd_progress to # CPUs.\n"); + fwd_progress = nr_cpu_ids; + } else if (fwd_progress < 0) { + fwd_progress = nr_cpu_ids; + } if ((!cur_ops->sync && !cur_ops->call) || - !cur_ops->stall_dur || cur_ops->stall_dur() <= 0 || cur_ops == &rcu_busted_ops) { + (!cur_ops->cbflood_max && (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0)) || + cur_ops == &rcu_busted_ops) { VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, unsupported by RCU flavor under test"); + fwd_progress = 0; return 0; } if (stall_cpu > 0) { VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall testing"); + fwd_progress = 0; if (IS_MODULE(CONFIG_RCU_TORTURE_TEST)) return -EINVAL; /* In module, can fail back to user. */ WARN_ON(1); /* Make sure rcutorture notices conflict. */ @@@ -2574,51 -2479,29 +2571,51 @@@ fwd_progress_holdoff = 1; if (fwd_progress_div <= 0) fwd_progress_div = 4; - rfp = kzalloc(sizeof(*rfp), GFP_KERNEL); - if (!rfp) + rfp = kcalloc(fwd_progress, sizeof(*rfp), GFP_KERNEL); + fwd_prog_tasks = kcalloc(fwd_progress, sizeof(*fwd_prog_tasks), GFP_KERNEL); + if (!rfp || !fwd_prog_tasks) { + kfree(rfp); + kfree(fwd_prog_tasks); + fwd_prog_tasks = NULL; + fwd_progress = 0; return -ENOMEM; - spin_lock_init(&rfp->rcu_fwd_lock); - rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head; + } + for (i = 0; i < fwd_progress; i++) { + spin_lock_init(&rfp[i].rcu_fwd_lock); + rfp[i].rcu_fwd_cb_tail = &rfp[i].rcu_fwd_cb_head; + rfp[i].rcu_fwd_id = i; + } mutex_lock(&rcu_fwd_mutex); rcu_fwds = rfp; mutex_unlock(&rcu_fwd_mutex); register_oom_notifier(&rcutorture_oom_nb); - return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task); + for (i = 0; i < fwd_progress; i++) { + ret = torture_create_kthread(rcu_torture_fwd_prog, &rcu_fwds[i], fwd_prog_tasks[i]); + if (ret) { + fwd_progress = i; + return ret; + } + } + return 0; }
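A minimal sketch of how the fwd_progress normalization above behaves, assuming only what the hunk shows (negative means one kthread per CPU, and values above nr_cpu_ids are capped); the helper name is invented.

/* Stand-alone model of the fwd_progress normalization above (illustrative). */
#include <assert.h>

static int normalize_fwd_progress(int fwd_progress, int nr_cpu_ids)
{
        if (fwd_progress >= nr_cpu_ids || fwd_progress < 0)
                return nr_cpu_ids;
        return fwd_progress;   /* 0 ("disabled") has already returned earlier */
}

int main(void)
{
        assert(normalize_fwd_progress(-1, 8) == 8);    /* one kthread per CPU */
        assert(normalize_fwd_progress(100, 8) == 8);   /* capped at CPU count */
        assert(normalize_fwd_progress(3, 8) == 3);
        return 0;
}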
static void rcu_torture_fwd_prog_cleanup(void) { + int i; struct rcu_fwd *rfp;
- torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); - rfp = rcu_fwds; + if (!rcu_fwds || !fwd_prog_tasks) + return; + for (i = 0; i < fwd_progress; i++) + torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_tasks[i]); + unregister_oom_notifier(&rcutorture_oom_nb); mutex_lock(&rcu_fwd_mutex); + rfp = rcu_fwds; rcu_fwds = NULL; mutex_unlock(&rcu_fwd_mutex); - unregister_oom_notifier(&rcutorture_oom_nb); kfree(rfp); + kfree(fwd_prog_tasks); + fwd_prog_tasks = NULL; }
/* Callback function for RCU barrier testing. */ @@@ -2855,7 -2738,7 +2852,7 @@@ static int rcu_torture_read_exit(void * &trs, "%s", "rcu_torture_read_exit_child"); if (IS_ERR(tsp)) { - VERBOSE_TOROUT_ERRSTRING("out of memory"); + TOROUT_ERRSTRING("out of memory"); errexit = true; tsp = NULL; break; @@@ -3182,7 -3065,7 +3179,7 @@@ rcu_torture_init(void sizeof(fakewriter_tasks[0]), GFP_KERNEL); if (fakewriter_tasks == NULL) { - VERBOSE_TOROUT_ERRSTRING("out of memory"); + TOROUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } @@@ -3198,7 -3081,7 +3195,7 @@@ rcu_torture_reader_mbchk = kcalloc(nrealreaders, sizeof(*rcu_torture_reader_mbchk), GFP_KERNEL); if (!reader_tasks || !rcu_torture_reader_mbchk) { - VERBOSE_TOROUT_ERRSTRING("out of memory"); + TOROUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } @@@ -3217,7 -3100,7 +3214,7 @@@ if (nrealnocbers > 0) { nocb_tasks = kcalloc(nrealnocbers, sizeof(nocb_tasks[0]), GFP_KERNEL); if (nocb_tasks == NULL) { - VERBOSE_TOROUT_ERRSTRING("out of memory"); + TOROUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } diff --combined kernel/sysctl.c index d7ed1dffa426,2ab4edb6e450..ef77be575d87 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@@ -33,7 -33,6 +33,7 @@@ #include <linux/security.h> #include <linux/ctype.h> #include <linux/kmemleak.h> +#include <linux/filter.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> @@@ -123,6 -122,7 +123,7 @@@ static unsigned long long_max = LONG_MA static int one_hundred = 100; static int two_hundred = 200; static int one_thousand = 1000; + static int three_thousand = 3000; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; #endif @@@ -2960,7 -2960,7 +2961,7 @@@ static struct ctl_table vm_table[] = .mode = 0644, .proc_handler = watermark_scale_factor_sysctl_handler, .extra1 = SYSCTL_ONE, - .extra2 = &one_thousand, + .extra2 = &three_thousand, }, { .procname = "percpu_pagelist_high_fraction", diff --combined mm/Makefile index 7919cd7f13f2,5c5a3a480fa6..588d3113f3b0 --- a/mm/Makefile +++ b/mm/Makefile @@@ -15,8 -15,6 +15,8 @@@ KCSAN_SANITIZE_slab_common.o := KCSAN_SANITIZE_slab.o := n KCSAN_SANITIZE_slub.o := n KCSAN_SANITIZE_page_alloc.o := n +# But enable explicit instrumentation for memory barriers. +KCSAN_INSTRUMENT_BARRIERS := y
# These files are disabled because they produce non-interesting and/or # flaky coverage that is not a function of syscall inputs. E.g. slab is out of @@@ -114,6 -112,7 +114,7 @@@ obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o + obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o obj-$(CONFIG_SECRETMEM) += secretmem.o obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o diff --combined mm/huge_memory.c index f58524394dc1,6ed86a8f6a5b..406a3c28c026 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@@ -1322,7 -1322,7 +1322,7 @@@ vm_fault_t do_huge_pmd_wp_page(struct v * We can only reuse the page if nobody else maps the huge page or it's * part. */ - if (reuse_swap_page(page, NULL)) { + if (reuse_swap_page(page)) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); @@@ -2542,38 -2542,28 +2542,28 @@@ int total_mapcount(struct page *page * need full accuracy to avoid breaking page pinning, because * page_trans_huge_mapcount() is slower than page_mapcount(). */ - int page_trans_huge_mapcount(struct page *page, int *total_mapcount) + int page_trans_huge_mapcount(struct page *page) { - int i, ret, _total_mapcount, mapcount; + int i, ret;
/* hugetlbfs shouldn't call it */ VM_BUG_ON_PAGE(PageHuge(page), page);
- if (likely(!PageTransCompound(page))) { - mapcount = atomic_read(&page->_mapcount) + 1; - if (total_mapcount) - *total_mapcount = mapcount; - return mapcount; - } + if (likely(!PageTransCompound(page))) + return atomic_read(&page->_mapcount) + 1;
page = compound_head(page);
- _total_mapcount = ret = 0; + ret = 0; for (i = 0; i < thp_nr_pages(page); i++) { - mapcount = atomic_read(&page[i]._mapcount) + 1; + int mapcount = atomic_read(&page[i]._mapcount) + 1; ret = max(ret, mapcount); - _total_mapcount += mapcount; } - if (PageDoubleMap(page)) { + + if (PageDoubleMap(page)) ret -= 1; - _total_mapcount -= thp_nr_pages(page); - } - mapcount = compound_mapcount(page); - ret += mapcount; - _total_mapcount += mapcount; - if (total_mapcount) - *total_mapcount = _total_mapcount; - return ret; + + return ret + compound_mapcount(page); }
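The simplified page_trans_huge_mapcount() above reduces to: the largest per-subpage mapcount, minus one if the THP is double-mapped, plus the compound mapcount. A stand-alone arithmetic model, taking the already-adjusted mapcounts as plain inputs:

/* Stand-alone model of the simplified mapcount arithmetic above. */
#include <assert.h>
#include <stdbool.h>

static int thp_mapcount(const int *sub_mapcount, int nr_pages,
                        bool double_map, int compound_mapcount)
{
        int i, ret = 0;

        for (i = 0; i < nr_pages; i++)
                if (sub_mapcount[i] > ret)
                        ret = sub_mapcount[i];
        if (double_map)
                ret -= 1;
        return ret + compound_mapcount;
}

int main(void)
{
        int sub[4] = { 1, 2, 1, 1 };

        assert(thp_mapcount(sub, 4, false, 0) == 2);
        assert(thp_mapcount(sub, 4, true, 1) == 2);   /* 2 - 1 + 1 */
        return 0;
}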
/* Racy check whether the huge page can be split */ @@@ -2614,7 -2604,6 +2604,7 @@@ int split_huge_page_to_list(struct pag { struct page *head = compound_head(page); struct deferred_split *ds_queue = get_deferred_split_queue(head); + XA_STATE(xas, &head->mapping->i_pages, head->index); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; int extra_pins, ret; @@@ -2653,13 -2642,6 +2643,13 @@@ goto out; }
+ xas_split_alloc(&xas, head, compound_order(head), + mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK); + if (xas_error(&xas)) { + ret = xas_error(&xas); + goto out; + } + anon_vma = NULL; i_mmap_lock_read(mapping);
@@@ -2689,12 -2671,13 +2679,12 @@@ /* block interrupt reentry in xa_lock and spinlock */ local_irq_disable(); if (mapping) { - XA_STATE(xas, &mapping->i_pages, page_index(head)); - /* * Check if the head page is present in page cache. * We assume all tail are present too, if head is there. */ - xa_lock(&mapping->i_pages); + xas_lock(&xas); + xas_reset(&xas); if (xas_load(&xas) != head) goto fail; } @@@ -2710,7 -2693,6 +2700,7 @@@ if (mapping) { int nr = thp_nr_pages(head);
+ xas_split(&xas, head, thp_order(head)); if (PageSwapBacked(head)) { __mod_lruvec_page_state(head, NR_SHMEM_THPS, -nr); @@@ -2727,7 -2709,7 +2717,7 @@@ spin_unlock(&ds_queue->split_queue_lock); fail: if (mapping) - xa_unlock(&mapping->i_pages); + xas_unlock(&xas); local_irq_enable(); remap_page(head, thp_nr_pages(head)); ret = -EBUSY; @@@ -2741,8 -2723,6 +2731,8 @@@ out_unlock if (mapping) i_mmap_unlock_read(mapping); out: + /* Free any memory we didn't use */ + xas_nomem(&xas, 0); count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); return ret; } diff --combined mm/internal.h index 26af8a5a5be3,c5834cc28a44..d80300392a19 --- a/mm/internal.h +++ b/mm/internal.h @@@ -12,8 -12,6 +12,8 @@@ #include <linux/pagemap.h> #include <linux/tracepoint-defs.h>
+struct folio_batch; + /* * The set of flags that only affect watermark checking and reclaim * behaviour. This is used by the MM to obey the caller constraints @@@ -23,7 -21,7 +23,7 @@@ #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ - __GFP_ATOMIC) + __GFP_ATOMIC|__GFP_NOLOCKDEP)
/* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) @@@ -76,7 -74,6 +76,7 @@@ static inline bool can_madv_lru_vma(str return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); }
+struct zap_details; void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, @@@ -93,13 -90,7 +93,13 @@@ static inline void force_page_cache_rea }
unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, - pgoff_t end, struct pagevec *pvec, pgoff_t *indices); + pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); +unsigned find_get_entries(struct address_space *mapping, pgoff_t start, + pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); +void filemap_free_folio(struct address_space *mapping, struct folio *folio); +int truncate_inode_folio(struct address_space *mapping, struct folio *folio); +bool truncate_inode_partial_folio(struct folio *folio, loff_t start, + loff_t end);
/** * folio_evictable - Test whether a folio is evictable. @@@ -166,11 -157,6 +166,6 @@@ extern void reclaim_throttle(pg_data_t */ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
- /* - * in mm/memcontrol.c: - */ - extern bool cgroup_memory_nokmem; - /* * in mm/page_alloc.c */ @@@ -397,7 -383,6 +392,7 @@@ void __vma_link_list(struct mm_struct * void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma);
#ifdef CONFIG_MMU +void unmap_mapping_folio(struct folio *folio); extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked); extern long faultin_vma_page_range(struct vm_area_struct *vma, @@@ -501,8 -486,8 +496,8 @@@ static inline struct file *maybe_unlock } return fpin; } - #else /* !CONFIG_MMU */ +static inline void unmap_mapping_folio(struct folio *folio) { } static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page) { } static inline void vunmap_range_noflush(unsigned long start, unsigned long end) diff --combined mm/kasan/quarantine.c index 587da8995f2d,47ed4fc33a29..08291ed33e93 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@@ -117,7 -117,7 +117,7 @@@ static unsigned long quarantine_batch_s
static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink) { - return virt_to_head_page(qlink)->slab_cache; + return virt_to_slab(qlink)->slab_cache; }
static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache) @@@ -132,11 -132,22 +132,22 @@@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache) { void *object = qlink_to_object(qlink, cache); + struct kasan_free_meta *meta = kasan_get_free_meta(cache, object); unsigned long flags;
if (IS_ENABLED(CONFIG_SLAB)) local_irq_save(flags);
+ /* + * If init_on_free is enabled and KASAN's free metadata is stored in + * the object, zero the metadata. Otherwise, the object's memory will + * not be properly zeroed, as KASAN saves the metadata after the slab + * allocator zeroes the object. + */ + if (slab_want_init_on_free(cache) && + cache->kasan_info.free_meta_offset == 0) + memzero_explicit(meta, sizeof(*meta)); + /* * As the object now gets freed from the quarantine, assume that its * free track is no longer valid. diff --combined mm/khugepaged.c index 2e1911cc3466,7af84bac6fc2..35f14d0a00a6 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@@ -618,6 -618,7 +618,7 @@@ static int __collapse_huge_page_isolate continue; } else { result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); goto out; } } @@@ -636,6 -637,7 +637,7 @@@ if (page_mapcount(page) > 1 && ++shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out; }
@@@ -681,7 -683,7 +683,7 @@@ goto out; } if (!pte_write(pteval) && PageSwapCache(page) && - !reuse_swap_page(page, NULL)) { + !reuse_swap_page(page)) { /* * Page is in the swap cache and cannot be re-used. * It cannot be collapsed into a THP. @@@ -756,11 -758,7 +758,7 @@@ static void __collapse_huge_page_copy(p * ptl mostly unnecessary. */ spin_lock(ptl); - /* - * paravirt calls inside pte_clear here are - * superfluous. - */ - pte_clear(vma->vm_mm, address, _pte); + ptep_clear(vma->vm_mm, address, _pte); spin_unlock(ptl); } } else { @@@ -774,11 -772,7 +772,7 @@@ * inside page_remove_rmap(). */ spin_lock(ptl); - /* - * paravirt calls inside pte_clear here are - * superfluous. - */ - pte_clear(vma->vm_mm, address, _pte); + ptep_clear(vma->vm_mm, address, _pte); page_remove_rmap(src_page, false); spin_unlock(ptl); free_page_and_swap_cache(src_page); @@@ -1261,6 -1255,7 +1255,7 @@@ static int khugepaged_scan_pmd(struct m continue; } else { result = SCAN_EXCEED_SWAP_PTE; + count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); goto out_unmap; } } @@@ -1270,6 -1265,7 +1265,7 @@@ continue; } else { result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); goto out_unmap; } } @@@ -1298,6 -1294,7 +1294,7 @@@ if (page_mapcount(page) > 1 && ++shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out_unmap; }
@@@ -1306,7 -1303,7 +1303,7 @@@ /* * Record which node the original page is from and save this * information to khugepaged_node_load[]. - * Khupaged will allocate hugepage from the node has the max + * Khugepaged will allocate hugepage from the node that has the max * hit record. */ node = page_to_nid(page); @@@ -1667,10 -1664,7 +1664,10 @@@ static void collapse_file(struct mm_str } count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
- /* This will be less messy when we use multi-index entries */ + /* + * Ensure we have slots for all the pages in the range. This is + * almost certainly a no-op because most of the pages must be present + */ do { xas_lock_irq(&xas); xas_create_range(&xas); @@@ -1895,9 -1889,6 +1892,9 @@@ out_unlock __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none); }
+ /* Join all the small entries into a single multi-index entry */ + xas_set_order(&xas, start, HPAGE_PMD_ORDER); + xas_store(&xas, new_page); xa_locked: xas_unlock_irq(&xas); xa_unlocked: @@@ -2014,15 -2005,12 +2011,16 @@@ static void khugepaged_scan_file(struc if (xa_is_value(page)) { if (++swap > khugepaged_max_ptes_swap) { result = SCAN_EXCEED_SWAP_PTE; + count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); break; } continue; }
+ /* + * XXX: khugepaged should compact smaller compound pages + * into a PMD sized page + */ if (PageTransCompound(page)) { result = SCAN_PAGE_COMPOUND; break; @@@ -2064,6 -2052,7 +2062,7 @@@ if (result == SCAN_SUCCEED) { if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { node = khugepaged_find_target_node(); collapse_file(mm, file, start, hpage, node); diff --combined mm/memcontrol.c index 4a7b3ebf8e48,c9ddd02dc5de..09d342c7cbd0 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@@ -84,7 -84,7 +84,7 @@@ EXPORT_PER_CPU_SYMBOL_GPL(int_active_me static bool cgroup_memory_nosocket __ro_after_init;
/* Kernel memory accounting disabled? */ - bool cgroup_memory_nokmem __ro_after_init; + static bool cgroup_memory_nokmem __ro_after_init;
/* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP @@@ -629,11 -629,17 +629,17 @@@ static DEFINE_SPINLOCK(stats_flush_lock static DEFINE_PER_CPU(unsigned int, stats_updates); static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
- static inline void memcg_rstat_updated(struct mem_cgroup *memcg) + static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) { + unsigned int x; + cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); - if (!(__this_cpu_inc_return(stats_updates) % MEMCG_CHARGE_BATCH)) - atomic_inc(&stats_flush_threshold); + + x = __this_cpu_add_return(stats_updates, abs(val)); + if (x > MEMCG_CHARGE_BATCH) { + atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold); + __this_cpu_write(stats_updates, 0); + } }
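The reworked memcg_rstat_updated() above accumulates per-CPU update magnitudes and converts them into flush-threshold units of MEMCG_CHARGE_BATCH. A stand-alone model of that accounting; the batch size of 64 is an assumption for the sketch, and the two globals stand in for the per-CPU and atomic counters.

/* Stand-alone model of the batched flush-threshold accounting above. */
#include <stdio.h>
#include <stdlib.h>

#define CHARGE_BATCH 64          /* assumed value of MEMCG_CHARGE_BATCH */

static unsigned int stats_updates;   /* stands in for the per-CPU counter */
static long flush_threshold;         /* stands in for the atomic threshold */

static void rstat_updated(int val)
{
        stats_updates += (unsigned int)abs(val);
        if (stats_updates > CHARGE_BATCH) {
                flush_threshold += stats_updates / CHARGE_BATCH;
                stats_updates = 0;
        }
}

int main(void)
{
        rstat_updated(40);
        rstat_updated(-40);            /* magnitude counts, not the sign */
        printf("threshold contributions: %ld\n", flush_threshold);  /* 1 */
        return 0;
}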
static void __mem_cgroup_flush_stats(void) @@@ -656,7 -662,7 +662,7 @@@ void mem_cgroup_flush_stats(void
static void flush_memcg_stats_dwork(struct work_struct *w) { - mem_cgroup_flush_stats(); + __mem_cgroup_flush_stats(); queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); }
@@@ -672,7 -678,7 +678,7 @@@ void __mod_memcg_state(struct mem_cgrou return;
__this_cpu_add(memcg->vmstats_percpu->state[idx], val); - memcg_rstat_updated(memcg); + memcg_rstat_updated(memcg, val); }
/* idx can be of type enum memcg_stat_item or node_stat_item. */ @@@ -705,7 -711,7 +711,7 @@@ void __mod_memcg_lruvec_state(struct lr /* Update lruvec */ __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
- memcg_rstat_updated(memcg); + memcg_rstat_updated(memcg, val); }
/** @@@ -789,7 -795,7 +795,7 @@@ void __count_memcg_events(struct mem_cg return;
__this_cpu_add(memcg->vmstats_percpu->events[idx], count); - memcg_rstat_updated(memcg); + memcg_rstat_updated(memcg, count); }
static unsigned long memcg_events(struct mem_cgroup *memcg, int event) @@@ -1369,6 -1375,7 +1375,7 @@@ static const struct memory_stat memory_ { "pagetables", NR_PAGETABLE }, { "percpu", MEMCG_PERCPU_B }, { "sock", MEMCG_SOCK }, + { "vmalloc", MEMCG_VMALLOC }, { "shmem", NR_SHMEM }, { "file_mapped", NR_FILE_MAPPED }, { "file_dirty", NR_FILE_DIRTY }, @@@ -2816,31 -2823,31 +2823,31 @@@ static inline void mod_objcg_mlstate(st rcu_read_unlock(); }
-int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, - gfp_t gfp, bool new_page) +int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, + gfp_t gfp, bool new_slab) { - unsigned int objects = objs_per_slab_page(s, page); + unsigned int objects = objs_per_slab(s, slab); unsigned long memcg_data; void *vec;
gfp &= ~OBJCGS_CLEAR_MASK; vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp, - page_to_nid(page)); + slab_nid(slab)); if (!vec) return -ENOMEM;
memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS; - if (new_page) { + if (new_slab) { /* - * If the slab page is brand new and nobody can yet access - * it's memcg_data, no synchronization is required and - * memcg_data can be simply assigned. + * If the slab is brand new and nobody can yet access its + * memcg_data, no synchronization is required and memcg_data can + * be simply assigned. */ - page->memcg_data = memcg_data; - } else if (cmpxchg(&page->memcg_data, 0, memcg_data)) { + slab->memcg_data = memcg_data; + } else if (cmpxchg(&slab->memcg_data, 0, memcg_data)) { /* - * If the slab page is already in use, somebody can allocate - * and assign obj_cgroups in parallel. In this case the existing + * If the slab is already in use, somebody can allocate and + * assign obj_cgroups in parallel. In this case the existing * objcg vector should be reused. */ kfree(vec); @@@ -2865,43 -2872,38 +2872,43 @@@ */ struct mem_cgroup *mem_cgroup_from_obj(void *p) { - struct page *page; + struct folio *folio;
if (mem_cgroup_disabled()) return NULL;
- page = virt_to_head_page(p); + folio = virt_to_folio(p);
/* * Slab objects are accounted individually, not per-page. * Memcg membership data for each individual object is saved in - * the page->obj_cgroups. + * slab->memcg_data. */ - if (page_objcgs_check(page)) { - struct obj_cgroup *objcg; + if (folio_test_slab(folio)) { + struct obj_cgroup **objcgs; + struct slab *slab; unsigned int off;
- off = obj_to_index(page->slab_cache, page, p); - objcg = page_objcgs(page)[off]; - if (objcg) - return obj_cgroup_memcg(objcg); + slab = folio_slab(folio); + objcgs = slab_objcgs(slab); + if (!objcgs) + return NULL; + + off = obj_to_index(slab->slab_cache, slab, p); + if (objcgs[off]) + return obj_cgroup_memcg(objcgs[off]);
return NULL; }
/* - * page_memcg_check() is used here, because page_has_obj_cgroups() - * check above could fail because the object cgroups vector wasn't set - * at that moment, but it can be set concurrently. + * page_memcg_check() is used here, because in theory we can encounter + * a folio where the slab flag has been cleared already, but + * slab->memcg_data has not been freed yet * page_memcg_check(page) will guarantee that a proper memory * cgroup pointer or NULL will be returned. */ - return page_memcg_check(page); + return page_memcg_check(folio_page(folio, 0)); }
__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) @@@ -4850,6 -4852,17 +4857,17 @@@ out_kfree return ret; }
+ #if defined(CONFIG_MEMCG_KMEM) && (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) + static int mem_cgroup_slab_show(struct seq_file *m, void *p) + { + /* + * Deprecated. + * Please, take a look at tools/cgroup/slabinfo.py . + */ + return 0; + } + #endif + static struct cftype mem_cgroup_legacy_files[] = { { .name = "usage_in_bytes", @@@ -4950,7 -4963,7 +4968,7 @@@ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) { .name = "kmem.slabinfo", - .seq_show = memcg_slab_show, + .seq_show = mem_cgroup_slab_show, }, #endif { @@@ -5110,15 -5123,11 +5128,11 @@@ static void mem_cgroup_free(struct mem_ static struct mem_cgroup *mem_cgroup_alloc(void) { struct mem_cgroup *memcg; - unsigned int size; int node; int __maybe_unused i; long error = -ENOMEM;
- size = sizeof(struct mem_cgroup); - size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); - - memcg = kzalloc(size, GFP_KERNEL); + memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL); if (!memcg) return ERR_PTR(error);
@@@ -6312,6 -6321,8 +6326,8 @@@ static void __memory_events_show(struc seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM])); seq_printf(m, "oom_kill %lu\n", atomic_long_read(&events[MEMCG_OOM_KILL])); + seq_printf(m, "oom_group_kill %lu\n", + atomic_long_read(&events[MEMCG_OOM_GROUP_KILL])); }
static int memory_events_show(struct seq_file *m, void *v) diff --combined mm/memory-failure.c index f1c389f7e669,373837bb94cb..14ae5c18e776 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@@ -58,6 -58,7 +58,7 @@@ #include <linux/ratelimit.h> #include <linux/page-isolation.h> #include <linux/pagewalk.h> + #include <linux/shmem_fs.h> #include "internal.h" #include "ras/ras_event.h"
@@@ -722,7 -723,6 +723,6 @@@ static const char * const action_page_t [MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page", [MF_MSG_SLAB] = "kernel slab page", [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking", - [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned", [MF_MSG_HUGE] = "huge page", [MF_MSG_FREE_HUGE] = "free huge page", [MF_MSG_NON_PMD_HUGE] = "non-pmd-sized huge page", @@@ -737,7 -737,6 +737,6 @@@ [MF_MSG_CLEAN_LRU] = "clean LRU page", [MF_MSG_TRUNCATED_LRU] = "already truncated LRU page", [MF_MSG_BUDDY] = "free buddy page", - [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)", [MF_MSG_DAX] = "dax page", [MF_MSG_UNSPLIT_THP] = "unsplit thp", [MF_MSG_UNKNOWN] = "unknown page", @@@ -867,6 -866,7 +866,7 @@@ static int me_pagecache_clean(struct pa { int ret; struct address_space *mapping; + bool extra_pins;
delete_from_lru_cache(p);
@@@ -895,18 -895,24 +895,24 @@@ goto out; }
+ /* + * The shmem page is kept in page cache instead of truncating + * so is expected to have an extra refcount after error-handling. + */ + extra_pins = shmem_mapping(mapping); + /* * Truncation is a bit tricky. Enable it per file system for now. * * Open: to take i_rwsem or not for this? Right now we don't. */ ret = truncate_error_page(p, page_to_pfn(p), mapping); + if (has_extra_refcount(ps, p, extra_pins)) + ret = MF_FAILED; + out: unlock_page(p);
- if (has_extra_refcount(ps, p, false)) - ret = MF_FAILED; - return ret; }
@@@ -1154,6 -1160,22 +1160,22 @@@ static int page_action(struct page_stat return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY; }
+ static inline bool PageHWPoisonTakenOff(struct page *page) + { + return PageHWPoison(page) && page_private(page) == MAGIC_HWPOISON; + } + + void SetPageHWPoisonTakenOff(struct page *page) + { + set_page_private(page, MAGIC_HWPOISON); + } + + void ClearPageHWPoisonTakenOff(struct page *page) + { + if (PageHWPoison(page)) + set_page_private(page, 0); + } + /* * Return true if a page type of a given page is supported by hwpoison * mechanism (while handling could fail), otherwise false. This function @@@ -1256,6 -1278,27 +1278,27 @@@ out return ret; }
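The MAGIC_HWPOISON value used by the taken-off helpers above is simply the ASCII string "HWPS" packed into a word, as the /* HWPS */ comment hints. A stand-alone check:

/* Stand-alone check of the MAGIC_HWPOISON encoding above. */
#include <stdio.h>

int main(void)
{
        unsigned int magic = 0x48575053U;   /* MAGIC_HWPOISON */
        char s[5] = { (char)(magic >> 24), (char)(magic >> 16),
                      (char)(magic >> 8), (char)magic, '\0' };

        puts(s);   /* prints HWPS */
        return 0;
}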
+ static int __get_unpoison_page(struct page *page) + { + struct page *head = compound_head(page); + int ret = 0; + bool hugetlb = false; + + ret = get_hwpoison_huge_page(head, &hugetlb); + if (hugetlb) + return ret; + + /* + * PageHWPoisonTakenOff pages are not only marked as PG_hwpoison, + * but also isolated from buddy freelist, so need to identify the + * state and have to cancel both operations to unpoison. + */ + if (PageHWPoisonTakenOff(page)) + return -EHWPOISON; + + return get_page_unless_zero(page) ? 1 : 0; + } + /** * get_hwpoison_page() - Get refcount for memory error handling * @p: Raw error page (hit by memory error) @@@ -1263,7 -1306,7 +1306,7 @@@ * * get_hwpoison_page() takes a page refcount of an error page to handle memory * error on it, after checking that the error page is in a well-defined state - * (defined as a page-type we can successfully handle the memor error on it, + * (defined as a page-type we can successfully handle the memory error on it, * such as LRU page and hugetlb page). * * Memory error handling could be triggered at any time on any type of page, @@@ -1272,18 -1315,26 +1315,26 @@@ * extra care for the error page's state (as done in __get_hwpoison_page()), * and has some retry logic in get_any_page(). * + * When called from unpoison_memory(), the caller should already ensure that + * the given page has PG_hwpoison. So it's never reused for other page + * allocations, and __get_unpoison_page() never races with them. + * * Return: 0 on failure, * 1 on success for in-use pages in a well-defined state, * -EIO for pages on which we can not handle memory errors, * -EBUSY when get_hwpoison_page() has raced with page lifecycle - * operations like allocation and free. + * operations like allocation and free, + * -EHWPOISON when the page is hwpoisoned and taken off from buddy. */ static int get_hwpoison_page(struct page *p, unsigned long flags) { int ret;
zone_pcp_disable(page_zone(p)); - ret = get_any_page(p, flags); + if (flags & MF_UNPOISON) + ret = __get_unpoison_page(p); + else + ret = get_any_page(p, flags); zone_pcp_enable(page_zone(p));
return ret; @@@ -1494,14 -1545,6 +1545,6 @@@ static int memory_failure_hugetlb(unsig lock_page(head); page_flags = head->flags;
- if (!PageHWPoison(head)) { - pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); - num_poisoned_pages_dec(); - unlock_page(head); - put_page(head); - return 0; - } - /* * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so * simply disable it. In order to make it work properly, we need @@@ -1615,6 -1658,8 +1658,8 @@@ out return rc; }
+ static DEFINE_MUTEX(mf_mutex); + /** * memory_failure - Handle memory failure of a page. * @pfn: Page Number of the corrupted page @@@ -1641,33 -1686,25 +1686,32 @@@ int memory_failure(unsigned long pfn, i int res = 0; unsigned long page_flags; bool retry = true; - static DEFINE_MUTEX(mf_mutex);
if (!sysctl_memory_failure_recovery) panic("Memory failure on page %lx", pfn);
+ mutex_lock(&mf_mutex); + p = pfn_to_online_page(pfn); if (!p) { + res = arch_memory_failure(pfn, flags); + if (res == 0) + goto unlock_mutex; + if (pfn_valid(pfn)) { pgmap = get_dev_pagemap(pfn, NULL); - if (pgmap) - return memory_failure_dev_pagemap(pfn, flags, - pgmap); + if (pgmap) { + res = memory_failure_dev_pagemap(pfn, flags, + pgmap); + goto unlock_mutex; + } } pr_err("Memory failure: %#lx: memory outside kernel control\n", pfn); - return -ENXIO; + res = -ENXIO; + goto unlock_mutex; }
- mutex_lock(&mf_mutex); - try_again: if (PageHuge(p)) { res = memory_failure_hugetlb(pfn, flags); @@@ -1782,16 -1819,6 +1826,6 @@@ */ page_flags = p->flags;
- /* - * unpoison always clear PG_hwpoison inside page lock - */ - if (!PageHWPoison(p)) { - pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); - num_poisoned_pages_dec(); - unlock_page(p); - put_page(p); - goto unlock_mutex; - } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); @@@ -1955,6 -1982,28 +1989,28 @@@ core_initcall(memory_failure_init) pr_info(fmt, pfn); \ })
+ static inline int clear_page_hwpoison(struct ratelimit_state *rs, struct page *p) + { + if (TestClearPageHWPoison(p)) { + unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", + page_to_pfn(p), rs); + num_poisoned_pages_dec(); + return 1; + } + return 0; + } + + static inline int unpoison_taken_off_page(struct ratelimit_state *rs, + struct page *p) + { + if (put_page_back_buddy(p)) { + unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", + page_to_pfn(p), rs); + return 0; + } + return -EBUSY; + } + /** * unpoison_memory - Unpoison a previously poisoned page * @pfn: Page number of the to be unpoisoned page @@@ -1971,8 -2020,7 +2027,7 @@@ int unpoison_memory(unsigned long pfn { struct page *page; struct page *p; - int freeit = 0; - unsigned long flags = 0; + int ret = -EBUSY; static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
@@@ -1982,69 -2030,60 +2037,60 @@@ p = pfn_to_page(pfn); page = compound_head(p);
+ mutex_lock(&mf_mutex); + if (!PageHWPoison(p)) { unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; }
if (page_count(page) > 1) { unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; }
if (page_mapped(page)) { unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; }
if (page_mapping(page)) { unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n", pfn, &unpoison_rs); - return 0; - } - - /* - * unpoison_memory() can encounter thp only when the thp is being - * worked by memory_failure() and the page lock is not held yet. - * In such case, we yield to memory_failure() and make unpoison fail. - */ - if (!PageHuge(page) && PageTransHuge(page)) { - unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n", - pfn, &unpoison_rs); - return 0; + goto unlock_mutex; }
- if (!get_hwpoison_page(p, flags)) { - if (TestClearPageHWPoison(p)) - num_poisoned_pages_dec(); - unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n", - pfn, &unpoison_rs); - return 0; - } + if (PageSlab(page) || PageTable(page)) + goto unlock_mutex;
- lock_page(page); - /* - * This test is racy because PG_hwpoison is set outside of page lock. - * That's acceptable because that won't trigger kernel panic. Instead, - * the PG_hwpoison page will be caught and isolated on the entrance to - * the free buddy page pool. - */ - if (TestClearPageHWPoison(page)) { - unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", - pfn, &unpoison_rs); - num_poisoned_pages_dec(); - freeit = 1; - } - unlock_page(page); + ret = get_hwpoison_page(p, MF_UNPOISON); + if (!ret) { + if (clear_page_hwpoison(&unpoison_rs, page)) + ret = 0; + else + ret = -EBUSY; + } else if (ret < 0) { + if (ret == -EHWPOISON) { + ret = unpoison_taken_off_page(&unpoison_rs, p); + } else + unpoison_pr_info("Unpoison: failed to grab page %#lx\n", + pfn, &unpoison_rs); + } else { + int freeit = clear_page_hwpoison(&unpoison_rs, p);
- put_page(page); - if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) put_page(page); + if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) { + put_page(page); + ret = 0; + } + }
- return 0; + unlock_mutex: + mutex_unlock(&mf_mutex); + return ret; } EXPORT_SYMBOL(unpoison_memory);
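A stand-alone sketch of how the rewritten unpoison_memory() above reacts to the return values of get_hwpoison_page(..., MF_UNPOISON); the enum and the dispatch() helper are invented for illustration, and only the errno value of EHWPOISON is assumed.

/* Stand-alone sketch of the return-value handling above (illustrative only). */
#include <assert.h>

#define EHWPOISON 133   /* Linux errno value, assumed here for the sketch */

enum unpoison_action {
        CLEAR_FLAG_ONLY,     /* ret == 0: no reference taken, page is free      */
        PUT_BACK_TO_BUDDY,   /* ret == -EHWPOISON: page was taken off the buddy */
        REPORT_FAILURE,      /* other ret < 0: could not grab the page          */
        CLEAR_AND_DROP_REF,  /* ret > 0: in-use page, drop the reference taken  */
};

static enum unpoison_action dispatch(int ret)
{
        if (ret == 0)
                return CLEAR_FLAG_ONLY;
        if (ret == -EHWPOISON)
                return PUT_BACK_TO_BUDDY;
        if (ret < 0)
                return REPORT_FAILURE;
        return CLEAR_AND_DROP_REF;
}

int main(void)
{
        assert(dispatch(0) == CLEAR_FLAG_ONLY);
        assert(dispatch(-EHWPOISON) == PUT_BACK_TO_BUDDY);
        assert(dispatch(1) == CLEAR_AND_DROP_REF);
        return 0;
}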
@@@ -2225,9 -2264,12 +2271,12 @@@ int soft_offline_page(unsigned long pfn return -EIO; }
+ mutex_lock(&mf_mutex); + if (PageHWPoison(page)) { pr_info("%s: %#lx page already poisoned\n", __func__, pfn); put_ref_page(ref_page); + mutex_unlock(&mf_mutex); return 0; }
@@@ -2246,5 -2288,7 +2295,7 @@@ retry } }
+ mutex_unlock(&mf_mutex); + return ret; } diff --combined mm/memory.c index 23f2f1300d42,571d02f419ba..f306e698a1e3 --- a/mm/memory.c +++ b/mm/memory.c @@@ -41,6 -41,7 +41,7 @@@
#include <linux/kernel_stat.h> #include <linux/mm.h> + #include <linux/mm_inline.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/numa_balancing.h> @@@ -719,8 -720,6 +720,6 @@@ static void restore_exclusive_pte(struc else if (is_writable_device_exclusive_entry(entry)) pte = maybe_mkwrite(pte_mkdirty(pte), vma);
- set_pte_at(vma->vm_mm, address, ptep, pte); - /* * No need to take a page reference as one was already * created when the swap entry was made. @@@ -734,6 -733,8 +733,8 @@@ */ WARN_ON_ONCE(!PageAnon(page));
+ set_pte_at(vma->vm_mm, address, ptep, pte); + if (vma->vm_flags & VM_LOCKED) mlock_vma_page(page);
@@@ -1304,28 -1305,6 +1305,28 @@@ copy_page_range(struct vm_area_struct * return ret; }
+/* + * Parameter block passed down to zap_pte_range in exceptional cases. + */ +struct zap_details { + struct address_space *zap_mapping; /* Check page->mapping if set */ + struct folio *single_folio; /* Locked folio to be unmapped */ +}; + +/* + * We set details->zap_mapping when we want to unmap shared but keep private + * pages. Return true if skip zapping this page, false otherwise. + */ +static inline bool +zap_skip_check_mapping(struct zap_details *details, struct page *page) +{ + if (!details || !page) + return false; + + return details->zap_mapping && + (details->zap_mapping != page_rmapping(page)); +} + static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, @@@ -1465,8 -1444,8 +1466,8 @@@ static inline unsigned long zap_pmd_ran else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; /* fall through */ - } else if (details && details->single_page && - PageTransCompound(details->single_page) && + } else if (details && details->single_folio && + folio_test_pmd_mappable(details->single_folio) && next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) { spinlock_t *ptl = pmd_lock(tlb->mm, pmd); /* @@@ -3354,30 -3333,31 +3355,30 @@@ static inline void unmap_mapping_range_ }
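The zap_details/zap_skip_check_mapping() pair above means: when a target mapping is given, pages that belong to a different mapping (for example private COW copies) are skipped. A stand-alone model of that predicate with invented stand-in types:

/* Stand-alone model of the skip predicate above, with invented stand-ins. */
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

struct mapping { int id; };                        /* stand-in for address_space */
struct page_model { struct mapping *mapping; };
struct details_model { struct mapping *zap_mapping; };

static bool skip_page(const struct details_model *details,
                      const struct page_model *page)
{
        if (!details || !page)
                return false;
        return details->zap_mapping && details->zap_mapping != page->mapping;
}

int main(void)
{
        struct mapping f = { 1 }, g = { 2 };
        struct page_model shared = { &f }, other = { &g };
        struct details_model d = { &f };

        assert(!skip_page(&d, &shared));   /* same mapping: zap it */
        assert(skip_page(&d, &other));     /* different mapping: keep it */
        assert(!skip_page(NULL, &shared)); /* no details: zap everything */
        return 0;
}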
/** - * unmap_mapping_page() - Unmap single page from processes. - * @page: The locked page to be unmapped. + * unmap_mapping_folio() - Unmap single folio from processes. + * @folio: The locked folio to be unmapped. * - * Unmap this page from any userspace process which still has it mmaped. + * Unmap this folio from any userspace process which still has it mmaped. * Typically, for efficiency, the range of nearby pages has already been * unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once - * truncation or invalidation holds the lock on a page, it may find that - * the page has been remapped again: and then uses unmap_mapping_page() + * truncation or invalidation holds the lock on a folio, it may find that + * the page has been remapped again: and then uses unmap_mapping_folio() * to unmap it finally. */ -void unmap_mapping_page(struct page *page) +void unmap_mapping_folio(struct folio *folio) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; struct zap_details details = { }; pgoff_t first_index; pgoff_t last_index;
- VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(PageTail(page)); + VM_BUG_ON(!folio_test_locked(folio));
- first_index = page->index; - last_index = page->index + thp_nr_pages(page) - 1; + first_index = folio->index; + last_index = folio->index + folio_nr_pages(folio) - 1;
details.zap_mapping = mapping; - details.single_page = page; + details.single_folio = folio;
i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) @@@ -3647,7 -3627,7 +3648,7 @@@ vm_fault_t do_swap_page(struct vm_faul inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); - if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { + if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); vmf->flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; @@@ -3660,8 -3640,6 +3661,6 @@@ pte = pte_mkuffd_wp(pte); pte = pte_wrprotect(pte); } - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); - arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); vmf->orig_pte = pte;
/* ksm created a completely new copy */ @@@ -3672,6 -3650,9 +3671,9 @@@ do_page_add_anon_rmap(page, vma, vmf->address, exclusive); }
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); + arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); + swap_free(entry); if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) diff --combined mm/memremap.c index 643965da13a6,a2869d8519a2..6aa5f0c2d11f --- a/mm/memremap.c +++ b/mm/memremap.c @@@ -102,16 -102,47 +102,23 @@@ static unsigned long pfn_end(struct dev return (range->start + range_len(range)) >> PAGE_SHIFT; }
- static unsigned long pfn_next(unsigned long pfn) + static unsigned long pfn_next(struct dev_pagemap *pgmap, unsigned long pfn) { - if (pfn % 1024 == 0) + if (pfn % (1024 << pgmap->vmemmap_shift)) cond_resched(); - return pfn + 1; + return pfn + pgmap_vmemmap_nr(pgmap); + } + + static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id) + { + return (pfn_end(pgmap, range_id) - + pfn_first(pgmap, range_id)) >> pgmap->vmemmap_shift; }
#define for_each_device_pfn(pfn, map, i) \ - for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn)) + for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); \ + pfn = pfn_next(map, pfn))
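With a non-zero vmemmap_shift, the device-pfn walk above advances by one compound page per step and pfn_len() counts those steps rather than raw pfns. A stand-alone model of the arithmetic; the 2MiB/shift-9 figures are an x86-64 example, not taken from this hunk.

/* Stand-alone model of the compound-page pfn walk above (illustrative only). */
#include <assert.h>

static unsigned long pfn_step(unsigned long pfn, unsigned int vmemmap_shift)
{
        return pfn + (1UL << vmemmap_shift);      /* pgmap_vmemmap_nr() */
}

static unsigned long range_steps(unsigned long first, unsigned long end,
                                 unsigned int vmemmap_shift)
{
        return (end - first) >> vmemmap_shift;    /* pfn_len() */
}

int main(void)
{
        /* Example: 2MiB compound pages, i.e. vmemmap_shift == 9, 512 pfns each. */
        unsigned long first = 0x100000, end = first + 4 * 512;

        assert(pfn_step(first, 9) == first + 512);
        assert(range_steps(first, end, 9) == 4);
        assert(range_steps(first, end, 0) == 4 * 512);  /* shift 0: every pfn */
        return 0;
}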
-static void dev_pagemap_kill(struct dev_pagemap *pgmap) -{ - if (pgmap->ops && pgmap->ops->kill) - pgmap->ops->kill(pgmap); - else - percpu_ref_kill(pgmap->ref); -} - -static void dev_pagemap_cleanup(struct dev_pagemap *pgmap) -{ - if (pgmap->ops && pgmap->ops->cleanup) { - pgmap->ops->cleanup(pgmap); - } else { - wait_for_completion(&pgmap->done); - percpu_ref_exit(pgmap->ref); - } - /* - * Undo the pgmap ref assignment for the internal case as the - * caller may re-enable the same pgmap. - */ - if (pgmap->ref == &pgmap->internal_ref) - pgmap->ref = NULL; -} - static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) { struct range *range = &pgmap->ranges[range_id]; @@@ -143,12 -174,11 +150,12 @@@ void memunmap_pages(struct dev_pagemap unsigned long pfn; int i;
- dev_pagemap_kill(pgmap); + percpu_ref_kill(&pgmap->ref); for (i = 0; i < pgmap->nr_range; i++) for_each_device_pfn(pfn, pgmap, i) put_page(pfn_to_page(pfn)); - dev_pagemap_cleanup(pgmap); + wait_for_completion(&pgmap->done); + percpu_ref_exit(&pgmap->ref);
for (i = 0; i < pgmap->nr_range; i++) pageunmap_range(pgmap, i); @@@ -165,7 -195,8 +172,7 @@@ static void devm_memremap_pages_release
static void dev_pagemap_percpu_release(struct percpu_ref *ref) { - struct dev_pagemap *pgmap = - container_of(ref, struct dev_pagemap, internal_ref); + struct dev_pagemap *pgmap = container_of(ref, struct dev_pagemap, ref);
complete(&pgmap->done); } @@@ -271,8 -302,7 +278,7 @@@ static int pagemap_range(struct dev_pag memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], PHYS_PFN(range->start), PHYS_PFN(range_len(range)), pgmap); - percpu_ref_get_many(&pgmap->ref, - pfn_end(pgmap, range_id) - pfn_first(pgmap, range_id)); - percpu_ref_get_many(pgmap->ref, pfn_len(pgmap, range_id)); ++ percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id)); return 0;
err_add_memory: @@@ -338,11 -368,22 +344,11 @@@ void *memremap_pages(struct dev_pagema break; }
- if (!pgmap->ref) { - if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup)) - return ERR_PTR(-EINVAL); - - init_completion(&pgmap->done); - error = percpu_ref_init(&pgmap->internal_ref, - dev_pagemap_percpu_release, 0, GFP_KERNEL); - if (error) - return ERR_PTR(error); - pgmap->ref = &pgmap->internal_ref; - } else { - if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) { - WARN(1, "Missing reference count teardown definition\n"); - return ERR_PTR(-EINVAL); - } - } + init_completion(&pgmap->done); + error = percpu_ref_init(&pgmap->ref, dev_pagemap_percpu_release, 0, + GFP_KERNEL); + if (error) + return ERR_PTR(error);
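After this simplification the reference count is always the embedded pgmap->ref, initialized by memremap_pages() itself, and the ops->kill/ops->cleanup callbacks and the external-ref plumbing are gone. A hedged sketch of what a caller now provides (the range values are hypothetical; the point is only that the caller no longer touches the refcount):

    /* Sketch only: device memory registered with the simplified refcounting. */
    static struct dev_pagemap demo_pgmap = {
        .type = MEMORY_DEVICE_GENERIC,
        .nr_range = 1,
        .range = {
            .start = 0x100000000ULL,   /* hypothetical physical range */
            .end   = 0x13fffffffULL,
        },
    };

    static void *demo_map(struct device *dev)
    {
        /* memremap_pages()/devm_memremap_pages() now initialize and tear
         * down demo_pgmap.ref internally (see memunmap_pages() above). */
        return devm_memremap_pages(dev, &demo_pgmap);
    }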
devmap_managed_enable_get(pgmap);
@@@ -451,7 -492,7 +457,7 @@@ struct dev_pagemap *get_dev_pagemap(uns /* fall back to slow path lookup */ rcu_read_lock(); pgmap = xa_load(&pgmap_array, PHYS_PFN(phys)); - if (pgmap && !percpu_ref_tryget_live(pgmap->ref)) + if (pgmap && !percpu_ref_tryget_live(&pgmap->ref)) pgmap = NULL; rcu_read_unlock();
diff --combined mm/migrate.c index 7079e6b7dbe7,05af2b2336b9..18ce840914f0 --- a/mm/migrate.c +++ b/mm/migrate.c @@@ -50,6 -50,7 +50,7 @@@ #include <linux/ptrace.h> #include <linux/oom.h> #include <linux/memory.h> + #include <linux/random.h>
#include <asm/tlbflush.h>
@@@ -236,20 -237,19 +237,19 @@@ static bool remove_migration_pte(struc
pte = pte_mkhuge(pte); pte = arch_make_huge_pte(pte, shift, vma->vm_flags); - set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); if (PageAnon(new)) hugepage_add_anon_rmap(new, vma, pvmw.address); else page_dup_rmap(new, true); + set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } else #endif { - set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); - if (PageAnon(new)) page_add_anon_rmap(new, vma, pvmw.address, false); else page_add_file_rmap(new, false); + set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) mlock_vma_page(new); @@@ -291,7 -291,7 +291,7 @@@ void __migration_entry_wait(struct mm_s { pte_t pte; swp_entry_t entry; - struct page *page; + struct folio *folio;
spin_lock(ptl); pte = *ptep; @@@ -302,17 -302,18 +302,17 @@@ if (!is_migration_entry(entry)) goto out;
- page = pfn_swap_entry_to_page(entry); - page = compound_head(page); + folio = page_folio(pfn_swap_entry_to_page(entry));
/* * Once page cache replacement of page migration started, page_count - * is zero; but we must not call put_and_wait_on_page_locked() without - * a ref. Use get_page_unless_zero(), and just fault again if it fails. + * is zero; but we must not call folio_put_wait_locked() without + * a ref. Use folio_try_get(), and just fault again if it fails. */ - if (!get_page_unless_zero(page)) + if (!folio_try_get(folio)) goto out; pte_unmap_unlock(ptep, ptl); - put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); + folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE); return; out: pte_unmap_unlock(ptep, ptl); @@@ -337,16 -338,16 +337,16 @@@ void migration_entry_wait_huge(struct v void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) { spinlock_t *ptl; - struct page *page; + struct folio *folio;
ptl = pmd_lock(mm, pmd); if (!is_pmd_migration_entry(*pmd)) goto unlock; - page = pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd)); - if (!get_page_unless_zero(page)) + folio = page_folio(pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd))); + if (!folio_try_get(folio)) goto unlock; spin_unlock(ptl); - put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); + folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE); return; unlock: spin_unlock(ptl); @@@ -433,6 -434,14 +433,6 @@@ int folio_migrate_mapping(struct addres }
xas_store(&xas, newfolio); - if (nr > 1) { - int i; - - for (i = 1; i < nr; i++) { - xas_next(&xas); - xas_store(&xas, newfolio); - } - }
/* * Drop cache reference from old page by unfreezing @@@ -1084,80 -1093,6 +1084,6 @@@ out return rc; }
- - /* - * node_demotion[] example: - * - * Consider a system with two sockets. Each socket has - * three classes of memory attached: fast, medium and slow. - * Each memory class is placed in its own NUMA node. The - * CPUs are placed in the node with the "fast" memory. The - * 6 NUMA nodes (0-5) might be split among the sockets like - * this: - * - * Socket A: 0, 1, 2 - * Socket B: 3, 4, 5 - * - * When Node 0 fills up, its memory should be migrated to - * Node 1. When Node 1 fills up, it should be migrated to - * Node 2. The migration path start on the nodes with the - * processors (since allocations default to this node) and - * fast memory, progress through medium and end with the - * slow memory: - * - * 0 -> 1 -> 2 -> stop - * 3 -> 4 -> 5 -> stop - * - * This is represented in the node_demotion[] like this: - * - * { 1, // Node 0 migrates to 1 - * 2, // Node 1 migrates to 2 - * -1, // Node 2 does not migrate - * 4, // Node 3 migrates to 4 - * 5, // Node 4 migrates to 5 - * -1} // Node 5 does not migrate - */ - - /* - * Writes to this array occur without locking. Cycles are - * not allowed: Node X demotes to Y which demotes to X... - * - * If multiple reads are performed, a single rcu_read_lock() - * must be held over all reads to ensure that no cycles are - * observed. - */ - static int node_demotion[MAX_NUMNODES] __read_mostly = - {[0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE}; - - /** - * next_demotion_node() - Get the next node in the demotion path - * @node: The starting node to lookup the next node - * - * Return: node id for next memory node in the demotion path hierarchy - * from @node; NUMA_NO_NODE if @node is terminal. This does not keep - * @node online or guarantee that it *continues* to be the next demotion - * target. - */ - int next_demotion_node(int node) - { - int target; - - /* - * node_demotion[] is updated without excluding this - * function from running. RCU doesn't provide any - * compiler barriers, so the READ_ONCE() is required - * to avoid compiler reordering or read merging. - * - * Make sure to use RCU over entire code blocks if - * node_demotion[] reads need to be consistent. - */ - rcu_read_lock(); - target = READ_ONCE(node_demotion[node]); - rcu_read_unlock(); - - return target; - } - /* * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. @@@ -1413,7 -1348,7 +1339,7 @@@ static inline int try_split_thp(struct * @mode: The migration mode that specifies the constraints for * page migration, if any. * @reason: The reason for page migration. - * @ret_succeeded: Set to the number of pages migrated successfully if + * @ret_succeeded: Set to the number of normal pages migrated successfully if * the caller passes a non-NULL pointer. * * The function returns after 10 attempts or if no pages are movable any more @@@ -1421,7 -1356,9 +1347,9 @@@ * It is caller's responsibility to call putback_movable_pages() to return pages * to the LRU or free list only if ret != 0. * - * Returns the number of pages that were not migrated, or an error code. + * Returns the number of {normal page, THP, hugetlb} that were not migrated, or + * an error code. The number of THP splits will be considered as the number of + * non-migrated THP, no matter how many subpages of the THP are migrated successfully. 
*/ int migrate_pages(struct list_head *from, new_page_t get_new_page, free_page_t put_new_page, unsigned long private, @@@ -1430,6 -1367,7 +1358,7 @@@ int retry = 1; int thp_retry = 1; int nr_failed = 0; + int nr_failed_pages = 0; int nr_succeeded = 0; int nr_thp_succeeded = 0; int nr_thp_failed = 0; @@@ -1441,13 -1379,16 +1370,16 @@@ int swapwrite = current->flags & PF_SWAPWRITE; int rc, nr_subpages; LIST_HEAD(ret_pages); + LIST_HEAD(thp_split_pages); bool nosplit = (reason == MR_NUMA_MISPLACED); + bool no_subpage_counting = false;
trace_mm_migrate_pages_start(mode, reason);
if (!swapwrite) current->flags |= PF_SWAPWRITE;
+ thp_subpage_migration: for (pass = 0; pass < 10 && (retry || thp_retry); pass++) { retry = 0; thp_retry = 0; @@@ -1460,7 -1401,7 +1392,7 @@@ retry * during migration. */ is_thp = PageTransHuge(page) && !PageHuge(page); - nr_subpages = thp_nr_pages(page); + nr_subpages = compound_nr(page); cond_resched();
if (PageHuge(page)) @@@ -1496,18 -1437,20 +1428,20 @@@ case -ENOSYS: /* THP migration is unsupported */ if (is_thp) { - if (!try_split_thp(page, &page2, from)) { + nr_thp_failed++; + if (!try_split_thp(page, &page2, &thp_split_pages)) { nr_thp_split++; goto retry; }
- nr_thp_failed++; - nr_failed += nr_subpages; + nr_failed_pages += nr_subpages; break; }
/* Hugetlb migration is unsupported */ - nr_failed++; + if (!no_subpage_counting) + nr_failed++; + nr_failed_pages += nr_subpages; break; case -ENOMEM: /* @@@ -1516,16 -1459,19 +1450,19 @@@ * THP NUMA faulting doesn't split THP to retry. */ if (is_thp && !nosplit) { - if (!try_split_thp(page, &page2, from)) { + nr_thp_failed++; + if (!try_split_thp(page, &page2, &thp_split_pages)) { nr_thp_split++; goto retry; }
- nr_thp_failed++; - nr_failed += nr_subpages; + nr_failed_pages += nr_subpages; goto out; } - nr_failed++; + + if (!no_subpage_counting) + nr_failed++; + nr_failed_pages += nr_subpages; goto out; case -EAGAIN: if (is_thp) { @@@ -1535,12 -1481,11 +1472,11 @@@ retry++; break; case MIGRATEPAGE_SUCCESS: + nr_succeeded += nr_subpages; if (is_thp) { nr_thp_succeeded++; - nr_succeeded += nr_subpages; break; } - nr_succeeded++; break; default: /* @@@ -1551,17 -1496,37 +1487,37 @@@ */ if (is_thp) { nr_thp_failed++; - nr_failed += nr_subpages; + nr_failed_pages += nr_subpages; break; } - nr_failed++; + + if (!no_subpage_counting) + nr_failed++; + nr_failed_pages += nr_subpages; break; } } } - nr_failed += retry + thp_retry; + nr_failed += retry; nr_thp_failed += thp_retry; - rc = nr_failed; + /* + * Try to migrate subpages of fail-to-migrate THPs, no nr_failed + * counting in this round, since all subpages of a THP is counted + * as 1 failure in the first round. + */ + if (!list_empty(&thp_split_pages)) { + /* + * Move non-migrated pages (after 10 retries) to ret_pages + * to avoid migrating them again. + */ + list_splice_init(from, &ret_pages); + list_splice_init(&thp_split_pages, from); + no_subpage_counting = true; + retry = 1; + goto thp_subpage_migration; + } + + rc = nr_failed + nr_thp_failed; out: /* * Put the permanent failure page back to migration list, they @@@ -1570,11 -1535,11 +1526,11 @@@ list_splice(&ret_pages, from);
count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); - count_vm_events(PGMIGRATE_FAIL, nr_failed); + count_vm_events(PGMIGRATE_FAIL, nr_failed_pages); count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded); count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed); count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split); - trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded, + trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded, nr_thp_failed, nr_thp_split, mode, reason);
if (!swapwrite) @@@ -2516,8 -2481,7 +2472,7 @@@ static bool migrate_vma_check_page(stru static void migrate_vma_unmap(struct migrate_vma *migrate) { const unsigned long npages = migrate->npages; - const unsigned long start = migrate->start; - unsigned long addr, i, restore = 0; + unsigned long i, restore = 0; bool allow_drain = true;
lru_add_drain(); @@@ -2563,7 -2527,7 +2518,7 @@@ } }
- for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) { + for (i = 0; i < npages && restore; i++) { struct page *page = migrate_pfn_to_page(migrate->src[i]);
if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) @@@ -2961,14 -2925,152 +2916,152 @@@ void migrate_vma_finalize(struct migrat EXPORT_SYMBOL(migrate_vma_finalize); #endif /* CONFIG_DEVICE_PRIVATE */
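The migrate_pages() hunks above change what the counters mean: the return value is now the number of {normal page, THP, hugetlb} units that were not migrated (a split THP still counts as one failed THP), while *ret_succeeded and the PGMIGRATE_* vmstat events count individual base pages. A hedged caller-side sketch (the allocation callback is hypothetical; the call sequence follows the kernel-doc above):

    /* Sketch: interpreting migrate_pages() results after this series. */
    static struct page *demo_alloc_target(struct page *page, unsigned long private)
    {
        /* a real callback would use @page/@private to pick a target node */
        return alloc_page(GFP_HIGHUSER_MOVABLE);
    }

    static void demo_migrate(struct list_head *pages)
    {
        unsigned int nr_pages_ok = 0;
        int nr_units_failed;

        nr_units_failed = migrate_pages(pages, demo_alloc_target, NULL, 0,
                                        MIGRATE_SYNC, MR_NUMA_MISPLACED,
                                        &nr_pages_ok);
        if (nr_units_failed)        /* THPs and hugetlb count once each here */
            putback_movable_pages(pages);
        /* nr_pages_ok counts base pages, including subpages of migrated THPs */
    }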
+ /* + * node_demotion[] example: + * + * Consider a system with two sockets. Each socket has + * three classes of memory attached: fast, medium and slow. + * Each memory class is placed in its own NUMA node. The + * CPUs are placed in the node with the "fast" memory. The + * 6 NUMA nodes (0-5) might be split among the sockets like + * this: + * + * Socket A: 0, 1, 2 + * Socket B: 3, 4, 5 + * + * When Node 0 fills up, its memory should be migrated to + * Node 1. When Node 1 fills up, it should be migrated to + * Node 2. The migration path start on the nodes with the + * processors (since allocations default to this node) and + * fast memory, progress through medium and end with the + * slow memory: + * + * 0 -> 1 -> 2 -> stop + * 3 -> 4 -> 5 -> stop + * + * This is represented in the node_demotion[] like this: + * + * { nr=1, nodes[0]=1 }, // Node 0 migrates to 1 + * { nr=1, nodes[0]=2 }, // Node 1 migrates to 2 + * { nr=0, nodes[0]=-1 }, // Node 2 does not migrate + * { nr=1, nodes[0]=4 }, // Node 3 migrates to 4 + * { nr=1, nodes[0]=5 }, // Node 4 migrates to 5 + * { nr=0, nodes[0]=-1 }, // Node 5 does not migrate + * + * Moreover some systems may have multiple slow memory nodes. + * Suppose a system has one socket with 3 memory nodes, node 0 + * is fast memory type, and node 1/2 both are slow memory + * type, and the distance between fast memory node and slow + * memory node is same. So the migration path should be: + * + * 0 -> 1/2 -> stop + * + * This is represented in the node_demotion[] like this: + * { nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2 + * { nr=0, nodes[0]=-1, }, // Node 1 dose not migrate + * { nr=0, nodes[0]=-1, }, // Node 2 does not migrate + */ + + /* + * Writes to this array occur without locking. Cycles are + * not allowed: Node X demotes to Y which demotes to X... + * + * If multiple reads are performed, a single rcu_read_lock() + * must be held over all reads to ensure that no cycles are + * observed. + */ + #define DEFAULT_DEMOTION_TARGET_NODES 15 + + #if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES + #define DEMOTION_TARGET_NODES (MAX_NUMNODES - 1) + #else + #define DEMOTION_TARGET_NODES DEFAULT_DEMOTION_TARGET_NODES + #endif + + struct demotion_nodes { + unsigned short nr; + short nodes[DEMOTION_TARGET_NODES]; + }; + + static struct demotion_nodes *node_demotion __read_mostly; + + /** + * next_demotion_node() - Get the next node in the demotion path + * @node: The starting node to lookup the next node + * + * Return: node id for next memory node in the demotion path hierarchy + * from @node; NUMA_NO_NODE if @node is terminal. This does not keep + * @node online or guarantee that it *continues* to be the next demotion + * target. + */ + int next_demotion_node(int node) + { + struct demotion_nodes *nd; + unsigned short target_nr, index; + int target; + + if (!node_demotion) + return NUMA_NO_NODE; + + nd = &node_demotion[node]; + + /* + * node_demotion[] is updated without excluding this + * function from running. RCU doesn't provide any + * compiler barriers, so the READ_ONCE() is required + * to avoid compiler reordering or read merging. + * + * Make sure to use RCU over entire code blocks if + * node_demotion[] reads need to be consistent. + */ + rcu_read_lock(); + target_nr = READ_ONCE(nd->nr); + + switch (target_nr) { + case 0: + target = NUMA_NO_NODE; + goto out; + case 1: + index = 0; + break; + default: + /* + * If there are multiple target nodes, just select one + * target node randomly. 
+ * + * In addition, we can also use round-robin to select + * target node, but we should introduce another variable + * for node_demotion[] to record last selected target node, + * that may cause cache ping-pong due to the changing of + * last target node. Or introducing per-cpu data to avoid + * caching issue, which seems more complicated. So selecting + * target node randomly seems better until now. + */ + index = get_random_int() % target_nr; + break; + } + + target = READ_ONCE(nd->nodes[index]); + + out: + rcu_read_unlock(); + return target; + } + #if defined(CONFIG_HOTPLUG_CPU) /* Disable reclaim-based migration. */ static void __disable_all_migrate_targets(void) { - int node; + int node, i; + + if (!node_demotion) + return;
- for_each_online_node(node) - node_demotion[node] = NUMA_NO_NODE; + for_each_online_node(node) { + node_demotion[node].nr = 0; + for (i = 0; i < DEMOTION_TARGET_NODES; i++) + node_demotion[node].nodes[i] = NUMA_NO_NODE; + } }
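node_demotion[] now holds a struct demotion_nodes per source node, so a node can demote to several targets that sit at the same distance, and next_demotion_node() picks one of them at random under rcu_read_lock(). A small stand-alone C model of that selection policy (rand() stands in for get_random_int(); the node numbering is invented):

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    #define NUMA_NO_NODE (-1)
    #define DEMOTION_TARGET_NODES 15

    struct demotion_nodes {
        unsigned short nr;
        short nodes[DEMOTION_TARGET_NODES];
    };

    /* Mirror of the policy above: 0 targets -> terminal, 1 -> fixed, N -> random. */
    static int pick_demotion_target(const struct demotion_nodes *nd)
    {
        switch (nd->nr) {
        case 0:
            return NUMA_NO_NODE;
        case 1:
            return nd->nodes[0];
        default:
            return nd->nodes[rand() % nd->nr];
        }
    }

    int main(void)
    {
        /* Node 0 with two equally distant slow nodes, as in the comment above. */
        struct demotion_nodes nd = { .nr = 2, .nodes = { 1, 2 } };

        srand(time(NULL));
        for (int i = 0; i < 5; i++)
            printf("demote to node %d\n", pick_demotion_target(&nd));
        return 0;
    }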
static void disable_all_migrate_targets(void) @@@ -2995,26 -3097,40 +3088,40 @@@ * Failing here is OK. It might just indicate * being at the end of a chain. */ - static int establish_migrate_target(int node, nodemask_t *used) + static int establish_migrate_target(int node, nodemask_t *used, + int best_distance) { - int migration_target; + int migration_target, index, val; + struct demotion_nodes *nd;
- /* - * Can not set a migration target on a - * node with it already set. - * - * No need for READ_ONCE() here since this - * in the write path for node_demotion[]. - * This should be the only thread writing. - */ - if (node_demotion[node] != NUMA_NO_NODE) + if (!node_demotion) return NUMA_NO_NODE;
+ nd = &node_demotion[node]; + migration_target = find_next_best_node(node, used); if (migration_target == NUMA_NO_NODE) return NUMA_NO_NODE;
- node_demotion[node] = migration_target; + /* + * If the node has been set a migration target node before, + * which means it's the best distance between them. Still + * check if this node can be demoted to other target nodes + * if they have a same best distance. + */ + if (best_distance != -1) { + val = node_distance(node, migration_target); + if (val > best_distance) + return NUMA_NO_NODE; + } + + index = nd->nr; + if (WARN_ONCE(index >= DEMOTION_TARGET_NODES, + "Exceeds maximum demotion target nodes\n")) + return NUMA_NO_NODE; + + nd->nodes[index] = migration_target; + nd->nr++;
return migration_target; } @@@ -3030,7 -3146,9 +3137,9 @@@ * * The difference here is that cycles must be avoided. If * node0 migrates to node1, then neither node1, nor anything - * node1 migrates to can migrate to node0. + * node1 migrates to can migrate to node0. Also one node can + * be migrated to multiple nodes if the target nodes all have + * a same best-distance against the source node. * * This function can run simultaneously with readers of * node_demotion[]. However, it can not run simultaneously @@@ -3042,7 -3160,7 +3151,7 @@@ static void __set_migration_target_node nodemask_t next_pass = NODE_MASK_NONE; nodemask_t this_pass = NODE_MASK_NONE; nodemask_t used_targets = NODE_MASK_NONE; - int node; + int node, best_distance;
/* * Avoid any oddities like cycles that could occur @@@ -3071,18 -3189,33 +3180,33 @@@ again * multiple source nodes to share a destination. */ nodes_or(used_targets, used_targets, this_pass); - for_each_node_mask(node, this_pass) { - int target_node = establish_migrate_target(node, &used_targets);
- if (target_node == NUMA_NO_NODE) - continue; + for_each_node_mask(node, this_pass) { + best_distance = -1;
/* - * Visit targets from this pass in the next pass. - * Eventually, every node will have been part of - * a pass, and will become set in 'used_targets'. + * Try to set up the migration path for the node, and the target + * migration nodes can be multiple, so doing a loop to find all + * the target nodes if they all have a best node distance. */ - node_set(target_node, next_pass); + do { + int target_node = + establish_migrate_target(node, &used_targets, + best_distance); + + if (target_node == NUMA_NO_NODE) + break; + + if (best_distance == -1) + best_distance = node_distance(node, target_node); + + /* + * Visit targets from this pass in the next pass. + * Eventually, every node will have been part of + * a pass, and will become set in 'used_targets'. + */ + node_set(target_node, next_pass); + } while (1); } /* * 'next_pass' contains nodes which became migration @@@ -3183,6 -3316,11 +3307,11 @@@ static int __init migrate_on_reclaim_in { int ret;
+ node_demotion = kmalloc_array(nr_node_ids, + sizeof(struct demotion_nodes), + GFP_KERNEL); + WARN_ON(!node_demotion); + ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline", NULL, migration_offline_cpu); /* diff --combined mm/shmem.c index 28d627444a24,0700e9acf53b..66909efd0a1b --- a/mm/shmem.c +++ b/mm/shmem.c @@@ -554,7 -554,7 +554,7 @@@ static unsigned long shmem_unused_huge_ struct shmem_inode_info *info; struct page *page; unsigned long batch = sc ? sc->nr_to_scan : 128; - int removed = 0, split = 0; + int split = 0;
if (list_empty(&sbinfo->shrinklist)) return SHRINK_STOP; @@@ -569,7 -569,6 +569,6 @@@ /* inode is about to be evicted */ if (!inode) { list_del_init(&info->shrinklist); - removed++; goto next; }
@@@ -577,12 -576,12 +576,12 @@@ if (round_up(inode->i_size, PAGE_SIZE) == round_up(inode->i_size, HPAGE_PMD_SIZE)) { list_move(&info->shrinklist, &to_remove); - removed++; goto next; }
list_move(&info->shrinklist, &list); next: + sbinfo->shrinklist_len--; if (!--batch) break; } @@@ -602,7 -601,7 +601,7 @@@ inode = &info->vfs_inode;
if (nr_to_split && split >= nr_to_split) - goto leave; + goto move_back;
page = find_get_page(inode->i_mapping, (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT); @@@ -616,38 -615,44 +615,44 @@@ }
/* - * Leave the inode on the list if we failed to lock - * the page at this time. + * Move the inode on the list back to shrinklist if we failed + * to lock the page at this time. * * Waiting for the lock may lead to deadlock in the * reclaim path. */ if (!trylock_page(page)) { put_page(page); - goto leave; + goto move_back; }
ret = split_huge_page(page); unlock_page(page); put_page(page);
- /* If split failed leave the inode on the list */ + /* If split failed move the inode on the list back to shrinklist */ if (ret) - goto leave; + goto move_back;
split++; drop: list_del_init(&info->shrinklist); - removed++; - leave: + goto put; + move_back: + /* + * Make sure the inode is either on the global list or deleted + * from any local list before iput() since it could be deleted + * in another thread once we put the inode (then the local list + * is corrupted). + */ + spin_lock(&sbinfo->shrinklist_lock); + list_move(&info->shrinklist, &sbinfo->shrinklist); + sbinfo->shrinklist_len++; + spin_unlock(&sbinfo->shrinklist_lock); + put: iput(inode); }
- spin_lock(&sbinfo->shrinklist_lock); - list_splice_tail(&list, &sbinfo->shrinklist); - sbinfo->shrinklist_len -= removed; - spin_unlock(&sbinfo->shrinklist_lock); - return split; }
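The move_back change above is about object lifetime: an inode taken off the global shrinklist must be put back on it, under shrinklist_lock, before iput() drops the reference, otherwise a concurrent final iput() could free the inode while it is still linked into the function-local list. Condensed into a sketch of the ordering the hunk enforces (names taken from the code above, not a new interface):

    /* Sketch of the move_back ordering in shmem_unused_huge_shrink(). */
    spin_lock(&sbinfo->shrinklist_lock);
    list_move(&info->shrinklist, &sbinfo->shrinklist); /* back on the global list */
    sbinfo->shrinklist_len++;
    spin_unlock(&sbinfo->shrinklist_lock);
    iput(inode);    /* only now: the inode is no longer reachable solely
                     * through the local list, so eviction cannot corrupt it */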
@@@ -694,6 -699,7 +699,6 @@@ static int shmem_add_to_page_cache(stru struct mm_struct *charge_mm) { XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page)); - unsigned long i = 0; unsigned long nr = compound_nr(page); int error;
@@@ -720,18 -726,20 +725,18 @@@ cgroup_throttle_swaprate(page, gfp);
do { - void *entry; xas_lock_irq(&xas); - entry = xas_find_conflict(&xas); - if (entry != expected) + if (expected != xas_find_conflict(&xas)) { xas_set_err(&xas, -EEXIST); - xas_create_range(&xas); - if (xas_error(&xas)) goto unlock; -next: - xas_store(&xas, page); - if (++i < nr) { - xas_next(&xas); - goto next; } + if (expected && xas_find_conflict(&xas)) { + xas_set_err(&xas, -EEXIST); + goto unlock; + } + xas_store(&xas, page); + if (xas_error(&xas)) + goto unlock; if (PageTransHuge(page)) { count_vm_event(THP_FILE_ALLOC); __mod_lruvec_page_state(page, NR_SHMEM_THPS, nr); @@@ -877,26 -885,30 +882,26 @@@ void shmem_unlock_mapping(struct addres } }
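shmem_add_to_page_cache() can drop its per-subpage store loop because the XA_STATE_ORDER() above makes the xa_state cover compound_order(page) indices, and a single xas_store() then installs the entry over the whole range. A hedged sketch of that multi-index idiom (mapping, index, order, page and gfp are placeholders):

    /* Sketch: store one compound page as a single multi-index XArray entry. */
    XA_STATE_ORDER(xas, &mapping->i_pages, index, order);

    do {
        xas_lock_irq(&xas);
        xas_store(&xas, page);      /* covers all 1 << order indices at once */
        xas_unlock_irq(&xas);
    } while (xas_nomem(&xas, gfp)); /* retry if a node allocation was needed */

    if (xas_error(&xas))
        return xas_error(&xas);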
-/* - * Check whether a hole-punch or truncation needs to split a huge page, - * returning true if no split was required, or the split has been successful. - * - * Eviction (or truncation to 0 size) should never need to split a huge page; - * but in rare cases might do so, if shmem_undo_range() failed to trylock on - * head, and then succeeded to trylock on tail. - * - * A split can only succeed when there are no additional references on the - * huge page: so the split below relies upon find_get_entries() having stopped - * when it found a subpage of the huge page, without getting further references. - */ -static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end) +static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) { - if (!PageTransCompound(page)) - return true; - - /* Just proceed to delete a huge page wholly within the range punched */ - if (PageHead(page) && - page->index >= start && page->index + HPAGE_PMD_NR <= end) - return true; + struct folio *folio; + struct page *page;
- /* Try to split huge page, so we can truly punch the hole or truncate */ - return split_huge_page(page) >= 0; + /* + * At first avoid shmem_getpage(,,,SGP_READ): that fails + * beyond i_size, and reports fallocated pages as holes. + */ + folio = __filemap_get_folio(inode->i_mapping, index, + FGP_ENTRY | FGP_LOCK, 0); + if (!xa_is_value(folio)) + return folio; + /* + * But read a page back from swap if any of it is within i_size + * (although in some cases this is just a waste of time). + */ + page = NULL; + shmem_getpage(inode, index, &page, SGP_READ); + return page ? page_folio(page) : NULL; }
/* @@@ -910,10 -922,10 +915,10 @@@ static void shmem_undo_range(struct ino struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; pgoff_t end = (lend + 1) >> PAGE_SHIFT; - unsigned int partial_start = lstart & (PAGE_SIZE - 1); - unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1); - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t indices[PAGEVEC_SIZE]; + struct folio *folio; + bool same_folio; long nr_swaps_freed = 0; pgoff_t index; int i; @@@ -924,64 -936,67 +929,64 @@@ if (info->fallocend > start && info->fallocend <= end && !unfalloc) info->fallocend = start;
- pagevec_init(&pvec); + folio_batch_init(&fbatch); index = start; while (index < end && find_lock_entries(mapping, index, end - 1, - &pvec, indices)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + &fbatch, indices)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + folio = fbatch.folios[i];
index = indices[i];
- if (xa_is_value(page)) { + if (xa_is_value(folio)) { if (unfalloc) continue; nr_swaps_freed += !shmem_free_swap(mapping, - index, page); + index, folio); continue; } - index += thp_nr_pages(page) - 1; + index += folio_nr_pages(folio) - 1;
- if (!unfalloc || !PageUptodate(page)) - truncate_inode_page(mapping, page); - unlock_page(page); + if (!unfalloc || !folio_test_uptodate(folio)) + truncate_inode_folio(mapping, folio); + folio_unlock(folio); } - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); + folio_batch_remove_exceptionals(&fbatch); + folio_batch_release(&fbatch); cond_resched(); index++; }
- if (partial_start) { - struct page *page = NULL; - shmem_getpage(inode, start - 1, &page, SGP_READ); - if (page) { - unsigned int top = PAGE_SIZE; - if (start > end) { - top = partial_end; - partial_end = 0; - } - zero_user_segment(page, partial_start, top); - set_page_dirty(page); - unlock_page(page); - put_page(page); + same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); + folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT); + if (folio) { + same_folio = lend < folio_pos(folio) + folio_size(folio); + folio_mark_dirty(folio); + if (!truncate_inode_partial_folio(folio, lstart, lend)) { + start = folio->index + folio_nr_pages(folio); + if (same_folio) + end = folio->index; } + folio_unlock(folio); + folio_put(folio); + folio = NULL; } - if (partial_end) { - struct page *page = NULL; - shmem_getpage(inode, end, &page, SGP_READ); - if (page) { - zero_user_segment(page, 0, partial_end); - set_page_dirty(page); - unlock_page(page); - put_page(page); - } + + if (!same_folio) + folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT); + if (folio) { + folio_mark_dirty(folio); + if (!truncate_inode_partial_folio(folio, lstart, lend)) + end = folio->index; + folio_unlock(folio); + folio_put(folio); } - if (start >= end) - return;
index = start; while (index < end) { cond_resched();
- if (!find_get_entries(mapping, index, end - 1, &pvec, + if (!find_get_entries(mapping, index, end - 1, &fbatch, indices)) { /* If all gone or hole-punch or unfalloc, we're done */ if (index == start || end != -1) @@@ -990,14 -1005,14 +995,14 @@@ index = start; continue; } - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < folio_batch_count(&fbatch); i++) { + folio = fbatch.folios[i];
index = indices[i]; - if (xa_is_value(page)) { + if (xa_is_value(folio)) { if (unfalloc) continue; - if (shmem_free_swap(mapping, index, page)) { + if (shmem_free_swap(mapping, index, folio)) { /* Swap was replaced by page: retry */ index--; break; @@@ -1006,24 -1021,32 +1011,24 @@@ continue; }
- lock_page(page); + folio_lock(folio);
- if (!unfalloc || !PageUptodate(page)) { - if (page_mapping(page) != mapping) { + if (!unfalloc || !folio_test_uptodate(folio)) { + if (folio_mapping(folio) != mapping) { /* Page was replaced by swap: retry */ - unlock_page(page); + folio_unlock(folio); index--; break; } - VM_BUG_ON_PAGE(PageWriteback(page), page); - if (shmem_punch_compound(page, start, end)) - truncate_inode_page(mapping, page); - else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - /* Wipe the page and don't get stuck */ - clear_highpage(page); - flush_dcache_page(page); - set_page_dirty(page); - if (index < - round_up(start, HPAGE_PMD_NR)) - start = index + 1; - } + VM_BUG_ON_FOLIO(folio_test_writeback(folio), + folio); + truncate_inode_folio(mapping, folio); } - unlock_page(page); + index = folio->index + folio_nr_pages(folio) - 1; + folio_unlock(folio); } - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); + folio_batch_remove_exceptionals(&fbatch); + folio_batch_release(&fbatch); index++; }
@@@ -1541,8 -1564,7 +1546,7 @@@ static struct page *shmem_alloc_hugepag return NULL;
shmem_pseudo_vma_init(&pvma, info, hindex); - page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), - true); + page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, true); shmem_pseudo_vma_destroy(&pvma); if (page) prep_transhuge_page(page); @@@ -2439,6 -2461,7 +2443,7 @@@ shmem_write_begin(struct file *file, st struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t index = pos >> PAGE_SHIFT; + int ret = 0;
/* i_rwsem is held by caller */ if (unlikely(info->seals & (F_SEAL_GROW | @@@ -2449,7 -2472,19 +2454,19 @@@ return -EPERM; }
- return shmem_getpage(inode, index, pagep, SGP_WRITE); + ret = shmem_getpage(inode, index, pagep, SGP_WRITE); + + if (ret) + return ret; + + if (PageHWPoison(*pagep)) { + unlock_page(*pagep); + put_page(*pagep); + *pagep = NULL; + return -EIO; + } + + return 0; }
static int @@@ -2536,6 -2571,12 +2553,12 @@@ static ssize_t shmem_file_read_iter(str if (sgp == SGP_CACHE) set_page_dirty(page); unlock_page(page); + + if (PageHWPoison(page)) { + put_page(page); + error = -EIO; + break; + } }
/* @@@ -3075,7 -3116,8 +3098,8 @@@ static const char *shmem_get_link(struc page = find_get_page(inode->i_mapping, 0); if (!page) return ERR_PTR(-ECHILD); - if (!PageUptodate(page)) { + if (PageHWPoison(page) || + !PageUptodate(page)) { put_page(page); return ERR_PTR(-ECHILD); } @@@ -3083,6 -3125,13 +3107,13 @@@ error = shmem_getpage(inode, 0, &page, SGP_READ); if (error) return ERR_PTR(error); + if (!page) + return ERR_PTR(-ECHILD); + if (PageHWPoison(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-ECHILD); + } unlock_page(page); } set_delayed_call(done, shmem_put_link, page); @@@ -3733,6 -3782,13 +3764,13 @@@ static void shmem_destroy_inodecache(vo kmem_cache_destroy(shmem_inode_cachep); }
+ /* Keep the page in page cache instead of truncating it */ + static int shmem_error_remove_page(struct address_space *mapping, + struct page *page) + { + return 0; + } + const struct address_space_operations shmem_aops = { .writepage = shmem_writepage, .set_page_dirty = __set_page_dirty_no_writeback, @@@ -3743,7 -3799,7 +3781,7 @@@ #ifdef CONFIG_MIGRATION .migratepage = migrate_page, #endif - .error_remove_page = generic_error_remove_page, + .error_remove_page = shmem_error_remove_page, }; EXPORT_SYMBOL(shmem_aops);
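Taken together, the shmem hunks above stop handing out hardware-poisoned pages: lookups that used to return the page now fail with -EIO (or -ECHILD for symlinks), and shmem_error_remove_page() returns 0 so memory-failure handling keeps the poisoned page in the page cache instead of punching a silent hole. The recurring check reduces to this pattern (condensed from the hunks above, not a new API):

    /* Sketch of the post-lookup check the shmem paths above now apply. */
    ret = shmem_getpage(inode, index, &page, SGP_WRITE);
    if (ret)
        return ret;
    if (PageHWPoison(page)) {
        unlock_page(page);
        put_page(page);
        return -EIO;        /* report the poison rather than expose the page */
    }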
@@@ -4151,9 -4207,14 +4189,14 @@@ struct page *shmem_read_mapping_page_gf error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL, NULL, NULL); if (error) - page = ERR_PTR(error); - else - unlock_page(page); + return ERR_PTR(error); + + unlock_page(page); + if (PageHWPoison(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + return page; #else /* diff --combined mm/slab.h index 95b9a74a2d51,053eefaf6cbd..7edb7d23f141 --- a/mm/slab.h +++ b/mm/slab.h @@@ -5,197 -5,6 +5,197 @@@ * Internal slab definitions */
+/* Reuses the bits in struct page */ +struct slab { + unsigned long __page_flags; + +#if defined(CONFIG_SLAB) + + union { + struct list_head slab_list; + struct rcu_head rcu_head; + }; + struct kmem_cache *slab_cache; + void *freelist; /* array of free object indexes */ + void *s_mem; /* first object */ + unsigned int active; + +#elif defined(CONFIG_SLUB) + + union { + struct list_head slab_list; + struct rcu_head rcu_head; +#ifdef CONFIG_SLUB_CPU_PARTIAL + struct { + struct slab *next; + int slabs; /* Nr of slabs left */ + }; +#endif + }; + struct kmem_cache *slab_cache; + /* Double-word boundary */ + void *freelist; /* first free object */ + union { + unsigned long counters; + struct { + unsigned inuse:16; + unsigned objects:15; + unsigned frozen:1; + }; + }; + unsigned int __unused; + +#elif defined(CONFIG_SLOB) + + struct list_head slab_list; + void *__unused_1; + void *freelist; /* first free block */ + long units; + unsigned int __unused_2; + +#else +#error "Unexpected slab allocator configured" +#endif + + atomic_t __page_refcount; +#ifdef CONFIG_MEMCG + unsigned long memcg_data; +#endif +}; + +#define SLAB_MATCH(pg, sl) \ + static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl)) +SLAB_MATCH(flags, __page_flags); +SLAB_MATCH(compound_head, slab_list); /* Ensure bit 0 is clear */ +SLAB_MATCH(slab_list, slab_list); +#ifndef CONFIG_SLOB +SLAB_MATCH(rcu_head, rcu_head); +SLAB_MATCH(slab_cache, slab_cache); +#endif +#ifdef CONFIG_SLAB +SLAB_MATCH(s_mem, s_mem); +SLAB_MATCH(active, active); +#endif +SLAB_MATCH(_refcount, __page_refcount); +#ifdef CONFIG_MEMCG +SLAB_MATCH(memcg_data, memcg_data); +#endif +#undef SLAB_MATCH +static_assert(sizeof(struct slab) <= sizeof(struct page)); + +/** + * folio_slab - Converts from folio to slab. + * @folio: The folio. + * + * Currently struct slab is a different representation of a folio where + * folio_test_slab() is true. + * + * Return: The slab which contains this folio. + */ +#define folio_slab(folio) (_Generic((folio), \ + const struct folio *: (const struct slab *)(folio), \ + struct folio *: (struct slab *)(folio))) + +/** + * slab_folio - The folio allocated for a slab + * @slab: The slab. + * + * Slabs are allocated as folios that contain the individual objects and are + * using some fields in the first struct page of the folio - those fields are + * now accessed by struct slab. It is occasionally necessary to convert back to + * a folio in order to communicate with the rest of the mm. Please use this + * helper function instead of casting yourself, as the implementation may change + * in the future. + */ +#define slab_folio(s) (_Generic((s), \ + const struct slab *: (const struct folio *)s, \ + struct slab *: (struct folio *)s)) + +/** + * page_slab - Converts from first struct page to slab. + * @p: The first (either head of compound or single) page of slab. + * + * A temporary wrapper to convert struct page to struct slab in situations where + * we know the page is the compound head, or single order-0 page. + * + * Long-term ideally everything would work with struct slab directly or go + * through folio to struct slab. + * + * Return: The slab which contains this page + */ +#define page_slab(p) (_Generic((p), \ + const struct page *: (const struct slab *)(p), \ + struct page *: (struct slab *)(p))) + +/** + * slab_page - The first struct page allocated for a slab + * @slab: The slab. 
+ * + * A convenience wrapper for converting slab to the first struct page of the + * underlying folio, to communicate with code not yet converted to folio or + * struct slab. + */ +#define slab_page(s) folio_page(slab_folio(s), 0) + +/* + * If network-based swap is enabled, sl*b must keep track of whether pages + * were allocated from pfmemalloc reserves. + */ +static inline bool slab_test_pfmemalloc(const struct slab *slab) +{ + return folio_test_active((struct folio *)slab_folio(slab)); +} + +static inline void slab_set_pfmemalloc(struct slab *slab) +{ + folio_set_active(slab_folio(slab)); +} + +static inline void slab_clear_pfmemalloc(struct slab *slab) +{ + folio_clear_active(slab_folio(slab)); +} + +static inline void __slab_clear_pfmemalloc(struct slab *slab) +{ + __folio_clear_active(slab_folio(slab)); +} + +static inline void *slab_address(const struct slab *slab) +{ + return folio_address(slab_folio(slab)); +} + +static inline int slab_nid(const struct slab *slab) +{ + return folio_nid(slab_folio(slab)); +} + +static inline pg_data_t *slab_pgdat(const struct slab *slab) +{ + return folio_pgdat(slab_folio(slab)); +} + +static inline struct slab *virt_to_slab(const void *addr) +{ + struct folio *folio = virt_to_folio(addr); + + if (!folio_test_slab(folio)) + return NULL; + + return folio_slab(folio); +} + +static inline int slab_order(const struct slab *slab) +{ + return folio_order((struct folio *)slab_folio(slab)); +} + +static inline size_t slab_size(const struct slab *slab) +{ + return PAGE_SIZE << slab_order(slab); +} + #ifdef CONFIG_SLOB /* * Common fields provided in kmem_cache by all slab allocators @@@ -436,33 -245,15 +436,33 @@@ static inline bool kmem_cache_debug_fla }
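The struct slab definition above deliberately reuses the memory layout of struct page, and the SLAB_MATCH() static_asserts plus the trailing sizeof check are what keep the two views in lockstep as either structure changes. The same compile-time technique can be shown with a stand-alone C11 toy (the two structs here are invented for illustration, not kernel types):

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Toy "page": the canonical layout owned by someone else. */
    struct toy_page {
        unsigned long flags;
        void *lru_next, *lru_prev;
        unsigned long private_data;
    };

    /* Toy "slab": a different view that must overlay toy_page exactly. */
    struct toy_slab {
        unsigned long __page_flags;
        void *slab_list_next, *slab_list_prev;
        unsigned long freelist;
    };

    /* Same trick as SLAB_MATCH(): break the build if the overlay drifts. */
    #define TOY_MATCH(pg, sl) \
        static_assert(offsetof(struct toy_page, pg) == offsetof(struct toy_slab, sl), \
                      "layout mismatch: " #pg " vs " #sl)

    TOY_MATCH(flags, __page_flags);
    TOY_MATCH(lru_next, slab_list_next);
    TOY_MATCH(private_data, freelist);
    static_assert(sizeof(struct toy_slab) <= sizeof(struct toy_page),
                  "toy_slab must not outgrow toy_page");

    int main(void)
    {
        printf("offsets verified at compile time\n");
        return 0;
    }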
#ifdef CONFIG_MEMCG_KMEM -int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, - gfp_t gfp, bool new_page); +/* + * slab_objcgs - get the object cgroups vector associated with a slab + * @slab: a pointer to the slab struct + * + * Returns a pointer to the object cgroups vector associated with the slab, + * or NULL if no such vector has been associated yet. + */ +static inline struct obj_cgroup **slab_objcgs(struct slab *slab) +{ + unsigned long memcg_data = READ_ONCE(slab->memcg_data); + + VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), + slab_page(slab)); + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, slab_page(slab)); + + return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); +} + +int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, + gfp_t gfp, bool new_slab); void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr);
-static inline void memcg_free_page_obj_cgroups(struct page *page) +static inline void memcg_free_slab_cgroups(struct slab *slab) { - kfree(page_objcgs(page)); - page->memcg_data = 0; + kfree(slab_objcgs(slab)); + slab->memcg_data = 0; }
static inline size_t obj_full_size(struct kmem_cache *s) @@@ -507,7 -298,7 +507,7 @@@ static inline void memcg_slab_post_allo gfp_t flags, size_t size, void **p) { - struct page *page; + struct slab *slab; unsigned long off; size_t i;
@@@ -516,19 -307,19 +516,19 @@@
for (i = 0; i < size; i++) { if (likely(p[i])) { - page = virt_to_head_page(p[i]); + slab = virt_to_slab(p[i]);
- if (!page_objcgs(page) && - memcg_alloc_page_obj_cgroups(page, s, flags, + if (!slab_objcgs(slab) && + memcg_alloc_slab_cgroups(slab, s, flags, false)) { obj_cgroup_uncharge(objcg, obj_full_size(s)); continue; }
- off = obj_to_index(s, page, p[i]); + off = obj_to_index(s, slab, p[i]); obj_cgroup_get(objcg); - page_objcgs(page)[off] = objcg; - mod_objcg_state(objcg, page_pgdat(page), + slab_objcgs(slab)[off] = objcg; + mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), obj_full_size(s)); } else { obj_cgroup_uncharge(objcg, obj_full_size(s)); @@@ -543,7 -334,7 +543,7 @@@ static inline void memcg_slab_free_hook struct kmem_cache *s; struct obj_cgroup **objcgs; struct obj_cgroup *objcg; - struct page *page; + struct slab *slab; unsigned int off; int i;
@@@ -554,52 -345,43 +554,52 @@@ if (unlikely(!p[i])) continue;
- page = virt_to_head_page(p[i]); - objcgs = page_objcgs_check(page); + slab = virt_to_slab(p[i]); + /* we could be given a kmalloc_large() object, skip those */ + if (!slab) + continue; + + objcgs = slab_objcgs(slab); if (!objcgs) continue;
if (!s_orig) - s = page->slab_cache; + s = slab->slab_cache; else s = s_orig;
- off = obj_to_index(s, page, p[i]); + off = obj_to_index(s, slab, p[i]); objcg = objcgs[off]; if (!objcg) continue;
objcgs[off] = NULL; obj_cgroup_uncharge(objcg, obj_full_size(s)); - mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s), + mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), -obj_full_size(s)); obj_cgroup_put(objcg); } }
#else /* CONFIG_MEMCG_KMEM */ +static inline struct obj_cgroup **slab_objcgs(struct slab *slab) +{ + return NULL; +} + static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) { return NULL; }
-static inline int memcg_alloc_page_obj_cgroups(struct page *page, +static inline int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, gfp_t gfp, - bool new_page) + bool new_slab) { return 0; }
-static inline void memcg_free_page_obj_cgroups(struct page *page) +static inline void memcg_free_slab_cgroups(struct slab *slab) { }
@@@ -623,35 -405,35 +623,35 @@@ static inline void memcg_slab_free_hook } #endif /* CONFIG_MEMCG_KMEM */
+#ifndef CONFIG_SLOB static inline struct kmem_cache *virt_to_cache(const void *obj) { - struct page *page; + struct slab *slab;
- page = virt_to_head_page(obj); - if (WARN_ONCE(!PageSlab(page), "%s: Object is not a Slab page!\n", + slab = virt_to_slab(obj); + if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n", __func__)) return NULL; - return page->slab_cache; + return slab->slab_cache; }
-static __always_inline void account_slab_page(struct page *page, int order, - struct kmem_cache *s, - gfp_t gfp) +static __always_inline void account_slab(struct slab *slab, int order, + struct kmem_cache *s, gfp_t gfp) { if (memcg_kmem_enabled() && (s->flags & SLAB_ACCOUNT)) - memcg_alloc_page_obj_cgroups(page, s, gfp, true); + memcg_alloc_slab_cgroups(slab, s, gfp, true);
- mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), + mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), PAGE_SIZE << order); }
-static __always_inline void unaccount_slab_page(struct page *page, int order, - struct kmem_cache *s) +static __always_inline void unaccount_slab(struct slab *slab, int order, + struct kmem_cache *s) { if (memcg_kmem_enabled()) - memcg_free_page_obj_cgroups(page); + memcg_free_slab_cgroups(slab);
- mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), + mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), -(PAGE_SIZE << order)); }
@@@ -670,7 -452,6 +670,7 @@@ static inline struct kmem_cache *cache_ print_tracking(cachep, x); return cachep; } +#endif /* CONFIG_SLOB */
static inline size_t slab_ksize(const struct kmem_cache *s) { @@@ -794,11 -575,6 +794,6 @@@ static inline struct kmem_cache_node *g
#endif
- void *slab_start(struct seq_file *m, loff_t *pos); - void *slab_next(struct seq_file *m, void *p, loff_t *pos); - void slab_stop(struct seq_file *m, void *p); - int memcg_slab_show(struct seq_file *m, void *p); - #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) void dump_unreclaimable_slab(void); #else @@@ -854,7 -630,7 +849,7 @@@ static inline void debugfs_slab_release #define KS_ADDRS_COUNT 16 struct kmem_obj_info { void *kp_ptr; - struct page *kp_page; + struct slab *kp_slab; void *kp_objp; unsigned long kp_data_offset; struct kmem_cache *kp_slab_cache; @@@ -862,18 -638,7 +857,18 @@@ void *kp_stack[KS_ADDRS_COUNT]; void *kp_free_stack[KS_ADDRS_COUNT]; }; -void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page); +void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab); +#endif + +#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR +void __check_heap_object(const void *ptr, unsigned long n, + const struct slab *slab, bool to_user); +#else +static inline +void __check_heap_object(const void *ptr, unsigned long n, + const struct slab *slab, bool to_user) +{ +} #endif
#endif /* MM_SLAB_H */ diff --combined mm/slab_common.c index dc15566141d4,9513244457e6..23f2ab0713b7 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@@ -489,9 -489,7 +489,7 @@@ void slab_kmem_cache_release(struct kme
void kmem_cache_destroy(struct kmem_cache *s) { - int err; - - if (unlikely(!s)) + if (unlikely(!s) || !kasan_check_byte(s)) return;
cpus_read_lock(); @@@ -501,12 -499,9 +499,9 @@@ if (s->refcount) goto out_unlock;
- err = shutdown_cache(s); - if (err) { - pr_err("%s %s: Slab cache still has objects\n", - __func__, s->name); - dump_stack(); - } + WARN(shutdown_cache(s), + "%s %s: Slab cache still has objects when called from %pS", + __func__, s->name, (void *)_RET_IP_); out_unlock: mutex_unlock(&slab_mutex); cpus_read_unlock(); @@@ -550,13 -545,13 +545,13 @@@ bool slab_is_available(void */ bool kmem_valid_obj(void *object) { - struct page *page; + struct folio *folio;
/* Some arches consider ZERO_SIZE_PTR to be a valid address. */ if (object < (void *)PAGE_SIZE || !virt_addr_valid(object)) return false; - page = virt_to_head_page(object); - return PageSlab(page); + folio = virt_to_folio(object); + return folio_test_slab(folio); } EXPORT_SYMBOL_GPL(kmem_valid_obj);
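kmem_valid_obj() above (and kmem_dump_obj() just below) now resolve objects through virt_to_folio()/virt_to_slab() rather than struct page, but their external contract is unchanged; a debugging caller still looks like this (the pointer's origin is hypothetical):

    /* Sketch: report provenance of a suspicious pointer when debugging. */
    static void report_pointer(void *ptr)
    {
        if (kmem_valid_obj(ptr))
            kmem_dump_obj(ptr);   /* cache name, offset, alloc/free stacks */
        else
            pr_info("%px is not a slab-managed object\n", ptr);
    }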
@@@ -579,18 -574,18 +574,18 @@@ void kmem_dump_obj(void *object { char *cp = IS_ENABLED(CONFIG_MMU) ? "" : "/vmalloc"; int i; - struct page *page; + struct slab *slab; unsigned long ptroffset; struct kmem_obj_info kp = { };
if (WARN_ON_ONCE(!virt_addr_valid(object))) return; - page = virt_to_head_page(object); - if (WARN_ON_ONCE(!PageSlab(page))) { + slab = virt_to_slab(object); + if (WARN_ON_ONCE(!slab)) { pr_cont(" non-slab memory.\n"); return; } - kmem_obj_info(&kp, object, page); + kmem_obj_info(&kp, object, slab); if (kp.kp_slab_cache) pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name); else @@@ -824,7 -819,7 +819,7 @@@ void __init setup_kmalloc_cache_index_t
if (KMALLOC_MIN_SIZE >= 64) { /* - * The 96 byte size cache is not used if the alignment + * The 96 byte sized cache is not used if the alignment * is 64 byte. */ for (i = 64 + 8; i <= 96; i += 8) @@@ -849,7 -844,7 +844,7 @@@ new_kmalloc_cache(int idx, enum kmalloc if (type == KMALLOC_RECLAIM) { flags |= SLAB_RECLAIM_ACCOUNT; } else if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_CGROUP)) { - if (cgroup_memory_nokmem) { + if (mem_cgroup_kmem_disabled()) { kmalloc_caches[type][idx] = kmalloc_caches[KMALLOC_NORMAL][idx]; return; } @@@ -1044,18 -1039,18 +1039,18 @@@ static void print_slabinfo_header(struc seq_putc(m, '\n'); }
- void *slab_start(struct seq_file *m, loff_t *pos) + static void *slab_start(struct seq_file *m, loff_t *pos) { mutex_lock(&slab_mutex); return seq_list_start(&slab_caches, *pos); }
- void *slab_next(struct seq_file *m, void *p, loff_t *pos) + static void *slab_next(struct seq_file *m, void *p, loff_t *pos) { return seq_list_next(p, &slab_caches, pos); }
- void slab_stop(struct seq_file *m, void *p) + static void slab_stop(struct seq_file *m, void *p) { mutex_unlock(&slab_mutex); } @@@ -1123,17 -1118,6 +1118,6 @@@ void dump_unreclaimable_slab(void mutex_unlock(&slab_mutex); }
- #if defined(CONFIG_MEMCG_KMEM) - int memcg_slab_show(struct seq_file *m, void *p) - { - /* - * Deprecated. - * Please, take a look at tools/cgroup/slabinfo.py . - */ - return 0; - } - #endif - /* * slabinfo_op - iterator that generates /proc/slabinfo * diff --combined mm/swap.c index 74f6b311d7ee,b461814ce0cb..bcf3ac288b56 --- a/mm/swap.c +++ b/mm/swap.c @@@ -882,7 -882,7 +882,7 @@@ void lru_cache_disable(void * all online CPUs so any calls of lru_cache_disabled wrapped by * local_lock or preemption disabled would be ordered by that. * The atomic operation doesn't need to have stronger ordering - * requirements because that is enforeced by the scheduling + * requirements because that is enforced by the scheduling * guarantees. */ __lru_add_drain_all(true); @@@ -1077,24 -1077,24 +1077,24 @@@ void __pagevec_lru_add(struct pagevec * }
/** - * pagevec_remove_exceptionals - pagevec exceptionals pruning - * @pvec: The pagevec to prune + * folio_batch_remove_exceptionals() - Prune non-folios from a batch. + * @fbatch: The batch to prune * - * find_get_entries() fills both pages and XArray value entries (aka - * exceptional entries) into the pagevec. This function prunes all - * exceptionals from @pvec without leaving holes, so that it can be - * passed on to page-only pagevec operations. + * find_get_entries() fills a batch with both folios and shadow/swap/DAX + * entries. This function prunes all the non-folio entries from @fbatch + * without leaving holes, so that it can be passed on to folio-only batch + * operations. */ -void pagevec_remove_exceptionals(struct pagevec *pvec) +void folio_batch_remove_exceptionals(struct folio_batch *fbatch) { - int i, j; + unsigned int i, j;
- for (i = 0, j = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - if (!xa_is_value(page)) - pvec->pages[j++] = page; + for (i = 0, j = 0; i < folio_batch_count(fbatch); i++) { + struct folio *folio = fbatch->folios[i]; + if (!xa_is_value(folio)) + fbatch->folios[j++] = folio; } - pvec->nr = j; + fbatch->nr = j; }
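Callers that mix folios with shadow/swap/DAX entries follow the same shape throughout this series: fill a folio_batch from find_get_entries() or find_lock_entries(), handle the value entries, then prune and release. A hedged composite of that loop (mapping, start and end are placeholders; the calls and their argument order are the ones used in the truncate and shmem hunks):

    /* Sketch: the folio_batch walking pattern used throughout this series. */
    pgoff_t indices[PAGEVEC_SIZE];
    struct folio_batch fbatch;
    pgoff_t index = start;
    unsigned int i;

    folio_batch_init(&fbatch);
    while (find_get_entries(mapping, index, end, &fbatch, indices)) {
        for (i = 0; i < folio_batch_count(&fbatch); i++) {
            struct folio *folio = fbatch.folios[i];

            index = indices[i];
            if (xa_is_value(folio))
                continue;           /* shadow/swap/DAX entry, not a folio */
            /* ... operate on the folio ... */
        }
        folio_batch_remove_exceptionals(&fbatch);  /* drop the value entries */
        folio_batch_release(&fbatch);              /* put the folio references */
        cond_resched();
        index++;
    }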
/** diff --combined mm/truncate.c index 5c87cdc70e7b,41b8249b3b4a..5e243d7269c0 --- a/mm/truncate.c +++ b/mm/truncate.c @@@ -56,11 -56,11 +56,11 @@@ static void clear_shadow_entry(struct a
/* * Unconditionally remove exceptional entries. Usually called from truncate - * path. Note that the pagevec may be altered by this function by removing - * exceptional entries similar to what pagevec_remove_exceptionals does. + * path. Note that the folio_batch may be altered by this function by removing + * exceptional entries similar to what folio_batch_remove_exceptionals() does. */ -static void truncate_exceptional_pvec_entries(struct address_space *mapping, - struct pagevec *pvec, pgoff_t *indices) +static void truncate_folio_batch_exceptionals(struct address_space *mapping, + struct folio_batch *fbatch, pgoff_t *indices) { int i, j; bool dax; @@@ -69,11 -69,11 +69,11 @@@ if (shmem_mapping(mapping)) return;
- for (j = 0; j < pagevec_count(pvec); j++) - if (xa_is_value(pvec->pages[j])) + for (j = 0; j < folio_batch_count(fbatch); j++) + if (xa_is_value(fbatch->folios[j])) break;
- if (j == pagevec_count(pvec)) + if (j == folio_batch_count(fbatch)) return;
dax = dax_mapping(mapping); @@@ -82,12 -82,12 +82,12 @@@ xa_lock_irq(&mapping->i_pages); }
- for (i = j; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; + for (i = j; i < folio_batch_count(fbatch); i++) { + struct folio *folio = fbatch->folios[i]; pgoff_t index = indices[i];
- if (!xa_is_value(page)) { - pvec->pages[j++] = page; + if (!xa_is_value(folio)) { + fbatch->folios[j++] = folio; continue; }
@@@ -96,7 -96,7 +96,7 @@@ continue; }
- __clear_shadow_entry(mapping, index, page); + __clear_shadow_entry(mapping, index, folio); }
if (!dax) { @@@ -105,7 -105,7 +105,7 @@@ inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); } - pvec->nr = j; + fbatch->nr = j; }
/* @@@ -177,21 -177,21 +177,21 @@@ void do_invalidatepage(struct page *pag * its lock, b) when a concurrent invalidate_mapping_pages got there first and * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. */ -static void truncate_cleanup_page(struct page *page) +static void truncate_cleanup_folio(struct folio *folio) { - if (page_mapped(page)) - unmap_mapping_page(page); + if (folio_mapped(folio)) + unmap_mapping_folio(folio);
- if (page_has_private(page)) - do_invalidatepage(page, 0, thp_size(page)); + if (folio_has_private(folio)) + do_invalidatepage(&folio->page, 0, folio_size(folio));
/* * Some filesystems seem to re-dirty the page even after * the VM has canceled the dirty bit (eg ext3 journaling). * Hence dirty accounting check is placed after invalidation. */ - cancel_dirty_page(page); - ClearPageMappedToDisk(page); + folio_cancel_dirty(folio); + folio_clear_mappedtodisk(folio); }
/* @@@ -205,7 -205,6 +205,6 @@@ static int invalidate_complete_page(struct address_space *mapping, struct page *page) { - int ret;
if (page->mapping != mapping) return 0; @@@ -213,80 -212,26 +212,78 @@@ if (page_has_private(page) && !try_to_release_page(page, 0)) return 0;
- ret = remove_mapping(mapping, page); - - return ret; + return remove_mapping(mapping, page); }
-int truncate_inode_page(struct address_space *mapping, struct page *page) +int truncate_inode_folio(struct address_space *mapping, struct folio *folio) { - VM_BUG_ON_PAGE(PageTail(page), page); - - if (page->mapping != mapping) + if (folio->mapping != mapping) return -EIO;
- truncate_cleanup_page(page); - delete_from_page_cache(page); + truncate_cleanup_folio(folio); + filemap_remove_folio(folio); return 0; }
+/* + * Handle partial folios. The folio may be entirely within the + * range if a split has raced with us. If not, we zero the part of the + * folio that's within the [start, end] range, and then split the folio if + * it's large. split_page_range() will discard pages which now lie beyond + * i_size, and we rely on the caller to discard pages which lie within a + * newly created hole. + * + * Returns false if splitting failed so the caller can avoid + * discarding the entire folio which is stubbornly unsplit. + */ +bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) +{ + loff_t pos = folio_pos(folio); + unsigned int offset, length; + + if (pos < start) + offset = start - pos; + else + offset = 0; + length = folio_size(folio); + if (pos + length <= (u64)end) + length = length - offset; + else + length = end + 1 - pos - offset; + + folio_wait_writeback(folio); + if (length == folio_size(folio)) { + truncate_inode_folio(folio->mapping, folio); + return true; + } + + /* + * We may be zeroing pages we're about to discard, but it avoids + * doing a complex calculation here, and then doing the zeroing + * anyway if the page split fails. + */ + folio_zero_range(folio, offset, length); + + cleancache_invalidate_page(folio->mapping, &folio->page); + if (folio_has_private(folio)) + do_invalidatepage(&folio->page, offset, length); + if (!folio_test_large(folio)) + return true; + if (split_huge_page(&folio->page) == 0) + return true; + if (folio_test_dirty(folio)) + return false; + truncate_inode_folio(folio->mapping, folio); + return true; +} + /* * Used to get rid of pages on hardware memory corruption. */ int generic_error_remove_page(struct address_space *mapping, struct page *page) { + VM_BUG_ON_PAGE(PageTail(page), page); + if (!mapping) return -EINVAL; /* @@@ -295,7 -240,7 +292,7 @@@ */ if (!S_ISREG(mapping->host->i_mode)) return -EIO; - return truncate_inode_page(mapping, page); + return truncate_inode_folio(mapping, page_folio(page)); } EXPORT_SYMBOL(generic_error_remove_page);
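The offset/length arithmetic in truncate_inode_partial_folio() above decides how much of an edge folio a punch actually covers before zeroing and (possibly) splitting it. The calculation is easy to check in isolation; a stand-alone C model (byte positions are invented, folio size fixed at 2MiB for the example):

    #include <stdio.h>

    /* Mirror of the partial-folio arithmetic: pos is the folio's byte offset
     * in the file, [start, end] is the inclusive byte range being punched. */
    static void partial_range(long long pos, long long folio_size,
                              long long start, long long end)
    {
        long long offset, length;

        offset = pos < start ? start - pos : 0;
        length = folio_size;
        if (pos + length <= end)              /* folio ends before the punch end */
            length = length - offset;
        else
            length = end + 1 - pos - offset;  /* punch ends inside this folio */

        printf("zero %lld bytes at offset %lld; whole folio? %s\n",
               length, offset, length == folio_size ? "yes" : "no");
    }

    int main(void)
    {
        long long sz = 2 << 20;               /* pretend 2MiB folio at pos 0 */

        partial_range(0, sz, 4096, 8191);     /* hole inside the folio */
        partial_range(0, sz, 0, sz - 1);      /* exactly the whole folio */
        return 0;
    }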
@@@ -346,16 -291,20 +343,16 @@@ void truncate_inode_pages_range(struct { pgoff_t start; /* inclusive */ pgoff_t end; /* exclusive */ - unsigned int partial_start; /* inclusive */ - unsigned int partial_end; /* exclusive */ - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t indices[PAGEVEC_SIZE]; pgoff_t index; int i; + struct folio *folio; + bool same_folio;
if (mapping_empty(mapping)) goto out;
- /* Offsets within partial pages */ - partial_start = lstart & (PAGE_SIZE - 1); - partial_end = (lend + 1) & (PAGE_SIZE - 1); - /* * 'start' and 'end' always covers the range of pages to be fully * truncated. Partial pages are covered with 'partial_start' at the @@@ -373,49 -322,64 +370,49 @@@ else end = (lend + 1) >> PAGE_SHIFT;
- pagevec_init(&pvec); + folio_batch_init(&fbatch); index = start; while (index < end && find_lock_entries(mapping, index, end - 1, - &pvec, indices)) { - index = indices[pagevec_count(&pvec) - 1] + 1; - truncate_exceptional_pvec_entries(mapping, &pvec, indices); - for (i = 0; i < pagevec_count(&pvec); i++) - truncate_cleanup_page(pvec.pages[i]); - delete_from_page_cache_batch(mapping, &pvec); - for (i = 0; i < pagevec_count(&pvec); i++) - unlock_page(pvec.pages[i]); - pagevec_release(&pvec); + &fbatch, indices)) { + index = indices[folio_batch_count(&fbatch) - 1] + 1; + truncate_folio_batch_exceptionals(mapping, &fbatch, indices); + for (i = 0; i < folio_batch_count(&fbatch); i++) + truncate_cleanup_folio(fbatch.folios[i]); + delete_from_page_cache_batch(mapping, &fbatch); + for (i = 0; i < folio_batch_count(&fbatch); i++) + folio_unlock(fbatch.folios[i]); + folio_batch_release(&fbatch); cond_resched(); }
- if (partial_start) { - struct page *page = find_lock_page(mapping, start - 1); - if (page) { - unsigned int top = PAGE_SIZE; - if (start > end) { - /* Truncation within a single page */ - top = partial_end; - partial_end = 0; - } - wait_on_page_writeback(page); - zero_user_segment(page, partial_start, top); - cleancache_invalidate_page(mapping, page); - if (page_has_private(page)) - do_invalidatepage(page, partial_start, - top - partial_start); - unlock_page(page); - put_page(page); + same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); + folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0); + if (folio) { + same_folio = lend < folio_pos(folio) + folio_size(folio); + if (!truncate_inode_partial_folio(folio, lstart, lend)) { + start = folio->index + folio_nr_pages(folio); + if (same_folio) + end = folio->index; } + folio_unlock(folio); + folio_put(folio); + folio = NULL; } - if (partial_end) { - struct page *page = find_lock_page(mapping, end); - if (page) { - wait_on_page_writeback(page); - zero_user_segment(page, 0, partial_end); - cleancache_invalidate_page(mapping, page); - if (page_has_private(page)) - do_invalidatepage(page, 0, - partial_end); - unlock_page(page); - put_page(page); - } + + if (!same_folio) + folio = __filemap_get_folio(mapping, lend >> PAGE_SHIFT, + FGP_LOCK, 0); + if (folio) { + if (!truncate_inode_partial_folio(folio, lstart, lend)) + end = folio->index; + folio_unlock(folio); + folio_put(folio); } - /* - * If the truncation happened within a single page no pages - * will be released, just zeroed, so we can bail out now. - */ - if (start >= end) - goto out;
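[Editor's note] The hunk above replaces the old partial_start/partial_end bookkeeping with at most two folio lookups: a page-granularity guess of whether both ends share a folio, refined once the (possibly multi-page) folio covering lstart is found. A hedged userspace sketch of that decision, assuming 4 KiB base pages; same_folio_guess() and same_folio_exact() are made-up names for illustration only.

        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        #define PAGE_SHIFT 12   /* assumption: 4 KiB base pages */

        /* first guess: do lstart and lend land on the same base page? */
        static bool same_folio_guess(uint64_t lstart, uint64_t lend)
        {
                return (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
        }

        /* refinement: fpos/fsize describe the folio actually found at lstart */
        static bool same_folio_exact(uint64_t fpos, uint64_t fsize, uint64_t lend)
        {
                return lend < fpos + fsize;     /* lend (inclusive) still inside it? */
        }

        int main(void)
        {
                uint64_t lstart = 1000, lend = 69999;   /* truncating bytes [1000, 70000) */

                printf("guess: %d\n", same_folio_guess(lstart, lend));          /* 0 */
                /* a 128 KiB folio at byte 0 covers both boundaries after all */
                printf("exact: %d\n", same_folio_exact(0, 128 * 1024, lend));   /* 1 */
                return 0;
        }

When the refined check finds both ends inside one large folio, the second lookup at lend's page index is skipped entirely, which is what the same_folio flag in the hunk controls.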
index = start; - for ( ; ; ) { + while (index < end) { cond_resched(); - if (!find_get_entries(mapping, index, end - 1, &pvec, + if (!find_get_entries(mapping, index, end - 1, &fbatch, indices)) { /* If all gone from start onwards, we're done */ if (index == start) @@@ -425,24 -389,23 +422,24 @@@ continue; }
- for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i];
/* We rely upon deletion not changing page->index */ index = indices[i];
- if (xa_is_value(page)) + if (xa_is_value(folio)) continue;
- lock_page(page); - WARN_ON(page_to_index(page) != index); - wait_on_page_writeback(page); - truncate_inode_page(mapping, page); - unlock_page(page); + folio_lock(folio); + VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); + folio_wait_writeback(folio); + truncate_inode_folio(mapping, folio); + folio_unlock(folio); + index = folio_index(folio) + folio_nr_pages(folio) - 1; } - truncate_exceptional_pvec_entries(mapping, &pvec, indices); - pagevec_release(&pvec); + truncate_folio_batch_exceptionals(mapping, &fbatch, indices); + folio_batch_release(&fbatch); index++; }
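[Editor's note] One behavioural detail of the rewritten loop above is the index advance: after truncating a folio, the index jumps past all of its pages instead of stepping one page at a time. A small userspace model of that control flow; fake_folio and the sizes are illustrative, not kernel API.

        #include <stdio.h>

        struct fake_folio {
                unsigned long index;    /* first page index covered by this folio */
                unsigned long nr_pages; /* how many base pages it spans */
        };

        int main(void)
        {
                struct fake_folio folios[] = { {0, 1}, {1, 512}, {513, 1} };
                unsigned long index = 0, end = 1024;
                unsigned int i = 0;

                while (index < end && i < sizeof(folios) / sizeof(folios[0])) {
                        struct fake_folio *f = &folios[i++];

                        printf("truncate folio at index %lu spanning %lu pages\n",
                               f->index, f->nr_pages);
                        /* mirrors index = folio_index() + folio_nr_pages() - 1, then index++ */
                        index = f->index + f->nr_pages;
                }
                return 0;
        }

The 512-page folio in the middle is handled once and never revisited page by page.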
@@@ -513,16 -476,16 +510,16 @@@ static unsigned long __invalidate_mappi pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) { pgoff_t indices[PAGEVEC_SIZE]; - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t index = start; unsigned long ret; unsigned long count = 0; int i;
- pagevec_init(&pvec); - while (find_lock_entries(mapping, index, end, &pvec, indices)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + folio_batch_init(&fbatch); + while (find_lock_entries(mapping, index, end, &fbatch, indices)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct page *page = &fbatch.folios[i]->page;
/* We rely upon deletion not changing page->index */ index = indices[i]; @@@ -549,8 -512,8 +546,8 @@@ } count += ret; } - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); + folio_batch_remove_exceptionals(&fbatch); + folio_batch_release(&fbatch); cond_resched(); index++; } @@@ -602,29 -565,31 +599,29 @@@ void invalidate_mapping_pagevec(struct * shrink_page_list() has a temp ref on them, or because they're transiently * sitting in the lru_cache_add() pagevecs. */ -static int -invalidate_complete_page2(struct address_space *mapping, struct page *page) +static int invalidate_complete_folio2(struct address_space *mapping, + struct folio *folio) { - if (page->mapping != mapping) + if (folio->mapping != mapping) return 0;
- if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) + if (folio_has_private(folio) && + !filemap_release_folio(folio, GFP_KERNEL)) return 0;
spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); - if (PageDirty(page)) + if (folio_test_dirty(folio)) goto failed;
- BUG_ON(page_has_private(page)); - __delete_from_page_cache(page, NULL); + BUG_ON(folio_has_private(folio)); + __filemap_remove_folio(folio, NULL); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock);
- if (mapping->a_ops->freepage) - mapping->a_ops->freepage(page); - - put_page(page); /* pagecache ref */ + filemap_free_folio(mapping, folio); return 1; failed: xa_unlock_irq(&mapping->i_pages); @@@ -632,13 -597,13 +629,13 @@@ return 0; }
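[Editor's note] The rewritten invalidate_complete_folio2() keeps the same check-then-remove discipline as before, only on folio types: take the locks, back off if the folio turned dirty, otherwise unlink while still locked and free afterwards. A generic pthread-based userspace sketch of that pattern; the entry/cache names are invented and do not model the kernel's actual locking or data structures.

        #include <pthread.h>
        #include <stdbool.h>
        #include <stdio.h>
        #include <stdlib.h>

        struct entry {
                bool dirty;
                struct entry *next;
        };

        static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

        /* returns 1 on success, 0 if the entry could not be invalidated */
        static int invalidate_entry(struct entry **linkp, struct entry *e)
        {
                pthread_mutex_lock(&cache_lock);
                if (e->dirty) {                 /* raced with a writer: back off */
                        pthread_mutex_unlock(&cache_lock);
                        return 0;
                }
                *linkp = e->next;               /* unlink under the lock */
                pthread_mutex_unlock(&cache_lock);
                free(e);                        /* drop the cache's reference */
                return 1;
        }

        int main(void)
        {
                struct entry *head = calloc(1, sizeof(*head));

                printf("invalidated: %d\n", invalidate_entry(&head, head));
                return 0;
        }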
-static int do_launder_page(struct address_space *mapping, struct page *page) +static int do_launder_folio(struct address_space *mapping, struct folio *folio) { - if (!PageDirty(page)) + if (!folio_test_dirty(folio)) return 0; - if (page->mapping != mapping || mapping->a_ops->launder_page == NULL) + if (folio->mapping != mapping || mapping->a_ops->launder_page == NULL) return 0; - return mapping->a_ops->launder_page(page); + return mapping->a_ops->launder_page(&folio->page); }
/** @@@ -656,7 -621,7 +653,7 @@@ int invalidate_inode_pages2_range(struc pgoff_t start, pgoff_t end) { pgoff_t indices[PAGEVEC_SIZE]; - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t index; int i; int ret = 0; @@@ -666,25 -631,25 +663,25 @@@ if (mapping_empty(mapping)) goto out;
- pagevec_init(&pvec); + folio_batch_init(&fbatch); index = start; - while (find_get_entries(mapping, index, end, &pvec, indices)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + while (find_get_entries(mapping, index, end, &fbatch, indices)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i];
- /* We rely upon deletion not changing page->index */ + /* We rely upon deletion not changing folio->index */ index = indices[i];
- if (xa_is_value(page)) { + if (xa_is_value(folio)) { if (!invalidate_exceptional_entry2(mapping, - index, page)) + index, folio)) ret = -EBUSY; continue; }
- if (!did_range_unmap && page_mapped(page)) { + if (!did_range_unmap && folio_mapped(folio)) { /* - * If page is mapped, before taking its lock, + * If folio is mapped, before taking its lock, * zap the rest of the file in one hit. */ unmap_mapping_pages(mapping, index, @@@ -692,29 -657,29 +689,29 @@@ did_range_unmap = 1; }
- lock_page(page); - WARN_ON(page_to_index(page) != index); - if (page->mapping != mapping) { - unlock_page(page); + folio_lock(folio); + VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); + if (folio->mapping != mapping) { + folio_unlock(folio); continue; } - wait_on_page_writeback(page); + folio_wait_writeback(folio);
- if (page_mapped(page)) - unmap_mapping_page(page); - BUG_ON(page_mapped(page)); + if (folio_mapped(folio)) + unmap_mapping_folio(folio); + BUG_ON(folio_mapped(folio));
- ret2 = do_launder_page(mapping, page); + ret2 = do_launder_folio(mapping, folio); if (ret2 == 0) { - if (!invalidate_complete_page2(mapping, page)) + if (!invalidate_complete_folio2(mapping, folio)) ret2 = -EBUSY; } if (ret2 < 0) ret = ret2; - unlock_page(page); + folio_unlock(folio); } - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); + folio_batch_remove_exceptionals(&fbatch); + folio_batch_release(&fbatch); cond_resched(); index++; }
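[Editor's note] The did_range_unmap logic carried through the hunk above is a one-shot optimisation: the first mapped folio encountered triggers an unmap of everything from that index to the end of the range, so later mapped folios do not pay for it again. A tiny userspace model of that flag; expensive_range_op() and the indices are illustrative only.

        #include <stdbool.h>
        #include <stdio.h>

        static void expensive_range_op(unsigned long from, unsigned long to)
        {
                printf("unmap range [%lu, %lu]\n", from, to);
        }

        int main(void)
        {
                bool did_range_unmap = false;
                unsigned long end = 100;
                unsigned long mapped[] = { 7, 20, 55 };  /* indices found mapped */

                for (unsigned int i = 0; i < 3; i++) {
                        if (!did_range_unmap) {
                                /* zap everything from here to the end, once */
                                expensive_range_op(mapped[i], end);
                                did_range_unmap = true;
                        }
                }
                return 0;
        }

Only the first mapped index (7) causes a range operation; the later ones rely on it having already run.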