The following commit has been merged in the master branch:

commit f56caedaf94f9ced5dbfcdb0060a3e788d2078af
Merge: a33f5c380c4bd3fa5278d690421b72052456d9fe 76fd0285b447991267e838842c0be7395eb454bb
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Sat Jan 15 20:37:06 2022 +0200
Merge branch 'akpm' (patches from Andrew)
Merge misc updates from Andrew Morton: "146 patches.
Subsystems affected by this patch series: kthread, ia64, scripts, ntfs, squashfs, ocfs2, vfs, and mm (slab-generic, slab, kmemleak, dax, kasan, debug, pagecache, gup, shmem, frontswap, memremap, memcg, selftests, pagemap, dma, vmalloc, memory-failure, hugetlb, userfaultfd, vmscan, mempolicy, oom-kill, hugetlbfs, migration, thp, ksm, page-poison, percpu, rmap, zswap, zram, cleanups, hmm, and damon)"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (146 commits)
  mm/damon: hide kernel pointer from tracepoint event
  mm/damon/vaddr: hide kernel pointer from damon_va_three_regions() failure log
  mm/damon/vaddr: use pr_debug() for damon_va_three_regions() failure logging
  mm/damon/dbgfs: remove an unnecessary variable
  mm/damon: move the implementation of damon_insert_region to damon.h
  mm/damon: add access checking for hugetlb pages
  Docs/admin-guide/mm/damon/usage: update for schemes statistics
  mm/damon/dbgfs: support all DAMOS stats
  Docs/admin-guide/mm/damon/reclaim: document statistics parameters
  mm/damon/reclaim: provide reclamation statistics
  mm/damon/schemes: account how many times quota limit has exceeded
  mm/damon/schemes: account scheme actions that successfully applied
  mm/damon: remove a mistakenly added comment for a future feature
  Docs/admin-guide/mm/damon/usage: update for kdamond_pid and (mk|rm)_contexts
  Docs/admin-guide/mm/damon/usage: mention tracepoint at the beginning
  Docs/admin-guide/mm/damon/usage: remove redundant information
  Docs/admin-guide/mm/damon/usage: update for scheme quotas and watermarks
  mm/damon: convert macro functions to static inline functions
  mm/damon: modify damon_rand() macro to static inline function
  mm/damon: move damon_rand() definition into damon.h
  ...
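Two of the DAMON cleanups listed above ("modify damon_rand() macro to static inline function" and "move damon_rand() definition into damon.h") are the usual macro-to-inline conversion. As a rough, hedged sketch of that shape — the helper name and new location come from the patch titles, while the body below is assumed from the common prandom-based pattern rather than quoted from the series:

    /*
     * Sketch only: the old form was a function-like macro along the lines of
     *   #define damon_rand(l, h) (l + prandom_u32_max(h - l))
     * As a static inline in include/linux/damon.h the helper gains argument
     * type checking and single evaluation of its operands (body assumed).
     */
    #include <linux/prandom.h>

    static inline unsigned long damon_rand(unsigned long l, unsigned long r)
    {
            /* Pick a pseudo-random value in [l, r). */
            return l + prandom_u32_max(r - l);
    }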
diff --combined MAINTAINERS index 5d0cd537803a,fbdb860c0b8b..474966314383 --- a/MAINTAINERS +++ b/MAINTAINERS @@@ -966,7 -966,6 +966,7 @@@ F: drivers/gpu/drm/amd/include/kgd_kfd_ F: drivers/gpu/drm/amd/include/v9_structs.h F: drivers/gpu/drm/amd/include/vi_structs.h F: include/uapi/linux/kfd_ioctl.h +F: include/uapi/linux/kfd_sysfs.h
AMD SPI DRIVER M: Sanjay R Mehta sanju.mehta@amd.com @@@ -994,13 -993,6 +994,13 @@@ S: Supporte T: git https://gitlab.freedesktop.org/agd5f/linux.git F: drivers/gpu/drm/amd/pm/
+AMD PSTATE DRIVER +M: Huang Rui ray.huang@amd.com +L: linux-pm@vger.kernel.org +S: Supported +F: Documentation/admin-guide/pm/amd-pstate.rst +F: drivers/cpufreq/amd-pstate* + AMD PTDMA DRIVER M: Sanjay R Mehta sanju.mehta@amd.com L: dmaengine@vger.kernel.org @@@ -1077,15 -1069,6 +1077,15 @@@ W: http://ez.analog.com/community/linux F: Documentation/devicetree/bindings/iio/adc/adi,ad7780.yaml F: drivers/iio/adc/ad7780.c
+ANALOG DEVICES INC AD74413R DRIVER +M: Cosmin Tanislav cosmin.tanislav@analog.com +L: linux-iio@vger.kernel.org +S: Supported +W: http://ez.analog.com/community/linux-device-drivers +F: Documentation/devicetree/bindings/iio/addac/adi,ad74413r.yaml +F: drivers/iio/addac/ad74413r.c +F: include/dt-bindings/iio/addac/adi,ad74413r.h + ANALOG DEVICES INC AD9389B DRIVER M: Hans Verkuil hverkuil-cisco@xs4all.nl L: linux-media@vger.kernel.org @@@ -1156,7 -1139,6 +1156,7 @@@ ANALOG DEVICES INC ADV748X DRIVE M: Kieran Bingham kieran.bingham@ideasonboard.com L: linux-media@vger.kernel.org S: Maintained +F: Documentation/devicetree/bindings/media/i2c/adv748x.yaml F: drivers/media/i2c/adv748x/*
ANALOG DEVICES INC ADV7511 DRIVER @@@ -1763,21 -1745,17 +1763,21 @@@ B: https://github.com/AsahiLinux/linux/ C: irc://irc.oftc.net/asahi-dev T: git https://github.com/AsahiLinux/linux.git F: Documentation/devicetree/bindings/arm/apple.yaml +F: Documentation/devicetree/bindings/arm/apple/* F: Documentation/devicetree/bindings/i2c/apple,i2c.yaml F: Documentation/devicetree/bindings/interrupt-controller/apple,aic.yaml F: Documentation/devicetree/bindings/mailbox/apple,mailbox.yaml F: Documentation/devicetree/bindings/pci/apple,pcie.yaml F: Documentation/devicetree/bindings/pinctrl/apple,pinctrl.yaml +F: Documentation/devicetree/bindings/power/apple* +F: Documentation/devicetree/bindings/watchdog/apple,wdt.yaml F: arch/arm64/boot/dts/apple/ F: drivers/i2c/busses/i2c-pasemi-core.c F: drivers/i2c/busses/i2c-pasemi-platform.c F: drivers/irqchip/irq-apple-aic.c F: drivers/mailbox/apple-mailbox.c F: drivers/pinctrl/pinctrl-apple-gpio.c +F: drivers/soc/apple/* F: include/dt-bindings/interrupt-controller/apple-aic.h F: include/dt-bindings/pinctrl/apple.h F: include/linux/apple-mailbox.h @@@ -1912,7 -1890,6 +1912,7 @@@ F: Documentation/trace/coresight/ F: drivers/hwtracing/coresight/* F: include/dt-bindings/arm/coresight-cti-dt.h F: include/linux/coresight* +F: samples/coresight/* F: tools/perf/arch/arm/util/auxtrace.c F: tools/perf/arch/arm/util/cs-etm.c F: tools/perf/arch/arm/util/cs-etm.h @@@ -2314,7 -2291,6 +2314,7 @@@ F: Documentation/devicetree/bindings/gp F: arch/arm/boot/dts/mstar-* F: arch/arm/mach-mstar/ F: drivers/clk/mstar/ +F: drivers/clocksource/timer-msc313e.c F: drivers/gpio/gpio-msc313.c F: drivers/rtc/rtc-msc313.c F: drivers/watchdog/msc313e_wdt.c @@@ -2575,7 -2551,6 +2575,7 @@@ Q: https://patchwork.kernel.org/project F: Documentation/arm/samsung/ F: Documentation/devicetree/bindings/arm/samsung/ F: Documentation/devicetree/bindings/power/pd-samsung.yaml +F: Documentation/devicetree/bindings/soc/samsung/ F: arch/arm/boot/dts/exynos* F: arch/arm/boot/dts/s3c* F: arch/arm/boot/dts/s5p* @@@ -2602,7 -2577,7 +2602,7 @@@ N: s3c64x N: s5pv210
ARM/SAMSUNG S5P SERIES 2D GRAPHICS ACCELERATION (G2D) SUPPORT -M: Andrzej Hajda a.hajda@samsung.com +M: Łukasz Stelmach l.stelmach@samsung.com L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: linux-media@vger.kernel.org S: Maintained @@@ -2626,8 -2601,7 +2626,8 @@@ S: Maintaine F: drivers/media/platform/s5p-jpeg/
ARM/SAMSUNG S5P SERIES Multi Format Codec (MFC) SUPPORT -M: Andrzej Hajda a.hajda@samsung.com +M: Marek Szyprowski m.szyprowski@samsung.com +M: Andrzej Hajda andrzej.hajda@intel.com L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: linux-media@vger.kernel.org S: Maintained @@@ -2818,15 -2792,12 +2818,15 @@@ L: linux-arm-kernel@lists.infradead.or S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/iwamatsu/linux-visconti.git F: Documentation/devicetree/bindings/arm/toshiba.yaml +F: Documentation/devicetree/bindings/clock/toshiba,tmpv770x-pipllct.yaml +F: Documentation/devicetree/bindings/clock/toshiba,tmpv770x-pismu.yaml F: Documentation/devicetree/bindings/net/toshiba,visconti-dwmac.yaml F: Documentation/devicetree/bindings/gpio/toshiba,gpio-visconti.yaml F: Documentation/devicetree/bindings/pci/toshiba,visconti-pcie.yaml F: Documentation/devicetree/bindings/pinctrl/toshiba,visconti-pinctrl.yaml F: Documentation/devicetree/bindings/watchdog/toshiba,visconti-wdt.yaml F: arch/arm64/boot/dts/toshiba/ +F: drivers/clk/visconti/ F: drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c F: drivers/gpio/gpio-visconti.c F: drivers/pci/controller/dwc/pcie-visconti.c @@@ -3027,27 -2998,6 +3027,27 @@@ W: http://acpi4asus.sf.ne F: drivers/platform/x86/asus*.c F: drivers/platform/x86/eeepc*.c
+ASUS TF103C DOCK DRIVER +M: Hans de Goede hdegoede@redhat.com +L: platform-driver-x86@vger.kernel.org +S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/pdx86/platform-drivers-x86.git +F: drivers/platform/x86/asus-tf103c-dock.c + +ASUS WMI HARDWARE MONITOR DRIVER +M: Ed Brindley kernel@maidavale.org +M: Denis Pauk pauk.denis@gmail.com +L: linux-hwmon@vger.kernel.org +S: Maintained +F: drivers/hwmon/asus_wmi_sensors.c + +ASUS WMI EC HARDWARE MONITOR DRIVER +M: Eugene Shalygin eugene.shalygin@gmail.com +M: Denis Pauk pauk.denis@gmail.com +L: linux-hwmon@vger.kernel.org +S: Maintained +F: drivers/hwmon/asus_wmi_ec_sensors.c + ASUS WIRELESS RADIO CONTROL DRIVER M: João Paulo Rechi Vita jprvita@gmail.com L: platform-driver-x86@vger.kernel.org @@@ -3430,8 -3380,6 +3430,8 @@@ M: Jens Axboe <axboe@kernel.dk L: linux-block@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git +F: Documentation/ABI/stable/sysfs-block +F: Documentation/block/ F: block/ F: drivers/block/ F: include/linux/blk* @@@ -3621,7 -3569,7 +3621,7 @@@ R: Florent Revest <revest@chromium.org R: Brendan Jackman jackmanb@chromium.org L: bpf@vger.kernel.org S: Maintained -F: Documentation/bpf/bpf_lsm.rst +F: Documentation/bpf/prog_lsm.rst F: include/linux/bpf_lsm.h F: kernel/bpf/bpf_lsm.c F: security/bpf/ @@@ -3688,7 -3636,6 +3688,7 @@@ F: drivers/net/ethernet/broadcom/bcm490 F: drivers/net/ethernet/broadcom/unimac.h
BROADCOM BCM5301X ARM ARCHITECTURE +M: Florian Fainelli f.fainelli@gmail.com M: Hauke Mehrtens hauke@hauke-m.de M: Rafał Miłecki zajec5@gmail.com M: bcm-kernel-feedback-list@broadcom.com @@@ -3700,7 -3647,6 +3700,7 @@@ F: arch/arm/boot/dts/bcm953012 F: arch/arm/mach-bcm/bcm_5301x.c
BROADCOM BCM53573 ARM ARCHITECTURE +M: Florian Fainelli f.fainelli@gmail.com M: Rafał Miłecki rafal@milecki.pl L: bcm-kernel-feedback-list@broadcom.com L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) @@@ -3744,7 -3690,7 +3744,7 @@@ M: Al Cooper <alcooperx@gmail.com L: linux-usb@vger.kernel.org L: bcm-kernel-feedback-list@broadcom.com S: Maintained -F: Documentation/devicetree/bindings/usb/brcm,bdc.txt +F: Documentation/devicetree/bindings/usb/brcm,bdc.yaml F: drivers/usb/gadget/udc/bdc/
BROADCOM BMIPS CPUFREQ DRIVER @@@ -3827,7 -3773,7 +3827,7 @@@ M: Doug Berger <opendmb@gmail.com M: Florian Fainelli f.fainelli@gmail.com L: bcm-kernel-feedback-list@broadcom.com S: Supported -F: Documentation/devicetree/bindings/gpio/brcm,brcmstb-gpio.txt +F: Documentation/devicetree/bindings/gpio/brcm,brcmstb-gpio.yaml F: drivers/gpio/gpio-brcmstb.c
BROADCOM BRCMSTB I2C DRIVER @@@ -3885,7 -3831,7 +3885,7 @@@ M: Florian Fainelli <f.fainelli@gmail.c L: bcm-kernel-feedback-list@broadcom.com L: netdev@vger.kernel.org S: Supported -F: Documentation/devicetree/bindings/net/brcm,bcmgenet.txt +F: Documentation/devicetree/bindings/net/brcm,bcmgenet.yaml F: Documentation/devicetree/bindings/net/brcm,unimac-mdio.yaml F: drivers/net/ethernet/broadcom/genet/ F: drivers/net/ethernet/broadcom/unimac.h @@@ -3927,7 -3873,7 +3927,7 @@@ M: Rafał Miłecki <rafal@milecki.pl M: bcm-kernel-feedback-list@broadcom.com L: netdev@vger.kernel.org S: Maintained -F: Documentation/devicetree/bindings/net/brcm,amac.txt +F: Documentation/devicetree/bindings/net/brcm,amac.yaml F: drivers/net/ethernet/broadcom/bgmac* F: drivers/net/ethernet/broadcom/unimac.h
@@@ -4002,7 -3948,7 +4002,7 @@@ M: Markus Mayer <mmayer@broadcom.com M: bcm-kernel-feedback-list@broadcom.com L: linux-pm@vger.kernel.org S: Maintained -F: Documentation/devicetree/bindings/thermal/brcm,avs-tmon.txt +F: Documentation/devicetree/bindings/thermal/brcm,avs-tmon.yaml F: drivers/thermal/broadcom/brcmstb*
BROADCOM STB DPFE DRIVER @@@ -4038,7 -3984,6 +4038,7 @@@ L: netdev@vger.kernel.or S: Supported F: drivers/net/ethernet/broadcom/bcmsysport.* F: drivers/net/ethernet/broadcom/unimac.h +F: Documentation/devicetree/bindings/net/brcm,systemport.yaml
BROADCOM TG3 GIGABIT ETHERNET DRIVER M: Siva Reddy Kallam siva.kallam@broadcom.com @@@ -4578,12 -4523,9 +4578,12 @@@ F: drivers/media/cec/i2c/ch7322. CIRRUS LOGIC AUDIO CODEC DRIVERS M: James Schulman james.schulman@cirrus.com M: David Rhodes david.rhodes@cirrus.com +M: Lucas Tanure tanureal@opensource.cirrus.com L: alsa-devel@alsa-project.org (moderated for non-subscribers) L: patches@opensource.cirrus.com S: Maintained +F: Documentation/devicetree/bindings/sound/cirrus,cs* +F: sound/pci/hda/cs* F: sound/soc/codecs/cs*
CIRRUS LOGIC DSP FIRMWARE DRIVER @@@ -4783,8 -4725,6 +4783,8 @@@ M: Ian Abbott <abbotti@mev.co.uk M: H Hartley Sweeten hsweeten@visionengravers.com S: Odd Fixes F: drivers/comedi/ +F: include/linux/comedi/ +F: include/uapi/linux/comedi.h
COMMON CLK FRAMEWORK M: Michael Turquette mturquette@baylibre.com @@@ -5483,12 -5423,6 +5483,12 @@@ W: https://linuxtv.or T: git git://linuxtv.org/media_tree.git F: drivers/media/platform/sti/delta
+DELTA AHE-50DC FAN CONTROL MODULE DRIVER +M: Zev Weiss zev@bewilderbeest.net +L: linux-hwmon@vger.kernel.org +S: Maintained +F: drivers/hwmon/pmbus/delta-ahe50dc-fan.c + DELTA DPS920AB PSU DRIVER M: Robert Marko robert.marko@sartura.hr L: linux-hwmon@vger.kernel.org @@@ -6116,7 -6050,6 +6116,7 @@@ F: drivers/gpu/drm/tiny/mi0283qt. DRM DRIVER FOR MSM ADRENO GPU M: Rob Clark robdclark@gmail.com M: Sean Paul sean@poorly.run +R: Abhinav Kumar quic_abhinavk@quicinc.com L: linux-arm-msm@vger.kernel.org L: dri-devel@lists.freedesktop.org L: freedreno@lists.freedesktop.org @@@ -6142,17 -6075,10 +6142,17 @@@ F: drivers/gpu/drm/panel/panel-novatek-
DRM DRIVER FOR NVIDIA GEFORCE/QUADRO GPUS M: Ben Skeggs bskeggs@redhat.com +M: Karol Herbst kherbst@redhat.com +M: Lyude Paul lyude@redhat.com L: dri-devel@lists.freedesktop.org L: nouveau@lists.freedesktop.org S: Supported -T: git git://github.com/skeggsb/linux +W: https://nouveau.freedesktop.org/ +Q: https://patchwork.freedesktop.org/project/nouveau/ +Q: https://gitlab.freedesktop.org/drm/nouveau/-/merge_requests +B: https://gitlab.freedesktop.org/drm/nouveau/-/issues +C: irc://irc.oftc.net/nouveau +T: git https://gitlab.freedesktop.org/drm/nouveau.git F: drivers/gpu/drm/nouveau/ F: include/uapi/drm/nouveau_drm.h
@@@ -6385,7 -6311,7 +6385,7 @@@ F: Documentation/devicetree/bindings/di F: drivers/gpu/drm/atmel-hlcdc/
DRM DRIVERS FOR BRIDGE CHIPS -M: Andrzej Hajda a.hajda@samsung.com +M: Andrzej Hajda andrzej.hajda@intel.com M: Neil Armstrong narmstrong@baylibre.com M: Robert Foss robert.foss@linaro.org R: Laurent Pinchart Laurent.pinchart@ideasonboard.com @@@ -6492,7 -6418,6 +6492,7 @@@ L: dri-devel@lists.freedesktop.or L: linux-renesas-soc@vger.kernel.org S: Supported T: git git://linuxtv.org/pinchartl/media drm/du/next +F: Documentation/devicetree/bindings/display/bridge/renesas,dsi-csi2-tx.yaml F: Documentation/devicetree/bindings/display/bridge/renesas,dw-hdmi.yaml F: Documentation/devicetree/bindings/display/bridge/renesas,lvds.yaml F: Documentation/devicetree/bindings/display/renesas,du.yaml @@@ -6611,14 -6536,6 +6611,14 @@@ F: drivers/gpu/drm/drm_panel. F: drivers/gpu/drm/panel/ F: include/drm/drm_panel.h
+DRM PRIVACY-SCREEN CLASS +M: Hans de Goede hdegoede@redhat.com +L: dri-devel@lists.freedesktop.org +S: Maintained +T: git git://anongit.freedesktop.org/drm/drm-misc +F: drivers/gpu/drm/drm_privacy_screen* +F: include/drm/drm_privacy_screen* + DRM TTM SUBSYSTEM M: Christian Koenig christian.koenig@amd.com M: Huang Rui ray.huang@amd.com @@@ -7096,7 -7013,9 +7096,7 @@@ S: Maintaine F: drivers/mmc/host/cqhci*
EMULEX 10Gbps iSCSI - OneConnect DRIVER -M: Subbu Seetharaman subbu.seetharaman@broadcom.com M: Ketan Mukadam ketan.mukadam@broadcom.com -M: Jitendra Bhivare jitendra.bhivare@broadcom.com L: linux-scsi@vger.kernel.org S: Supported W: http://www.broadcom.com @@@ -7507,6 -7426,12 +7507,6 @@@ F: Documentation/firmware_class F: drivers/base/firmware_loader/ F: include/linux/firmware.h
-FLASH ADAPTER DRIVER (IBM Flash Adapter 900GB Full Height PCI Flash Card) -M: Joshua Morris josh.h.morris@us.ibm.com -M: Philip Kelleher pjk1939@linux.ibm.com -S: Maintained -F: drivers/block/rsxx/ - FLEXTIMER FTM-QUADDEC DRIVER M: Patrick Havelange patrick.havelange@essensium.com L: linux-iio@vger.kernel.org @@@ -7598,7 -7523,6 +7598,7 @@@ F: include/video FREESCALE CAAM (Cryptographic Acceleration and Assurance Module) DRIVER M: Horia Geantă horia.geanta@nxp.com M: Pankaj Gupta pankaj.gupta@nxp.com +M: Gaurav Jain gaurav.jain@nxp.com L: linux-crypto@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/crypto/fsl-sec4.txt @@@ -8563,12 -8487,6 +8563,12 @@@ F: drivers/hid F: include/linux/hid* F: include/uapi/linux/hid*
+HID LOGITECH DRIVERS +R: Filipe Laíns lains@riseup.net +L: linux-input@vger.kernel.org +S: Maintained +F: drivers/hid/hid-logitech-* + HID PLAYSTATION DRIVER M: Roderick Colenbrander roderick.colenbrander@sony.com L: linux-input@vger.kernel.org @@@ -8690,10 -8608,8 +8690,10 @@@ F: drivers/misc/hisi_hikey_usb.
HISILICON PMU DRIVER M: Shaokun Zhang zhangshaokun@hisilicon.com +M: Qi Liu liuqi115@huawei.com S: Supported W: http://www.hisilicon.com +F: Documentation/admin-guide/perf/hisi-pcie-pmu.rst F: Documentation/admin-guide/perf/hisi-pmu.rst F: drivers/perf/hisilicon
@@@ -8724,7 -8640,6 +8724,7 @@@ F: drivers/scsi/hisi_sas
HISILICON SECURITY ENGINE V2 DRIVER (SEC2) M: Zaibo Xu xuzaibo@huawei.com +M: Kai Ye yekai13@huawei.com L: linux-crypto@vger.kernel.org S: Maintained F: Documentation/ABI/testing/debugfs-hisi-sec @@@ -9575,7 -9490,6 +9575,7 @@@ INTEL DRM DRIVERS (excluding Poulsbo, M M: Jani Nikula jani.nikula@linux.intel.com M: Joonas Lahtinen joonas.lahtinen@linux.intel.com M: Rodrigo Vivi rodrigo.vivi@intel.com +M: Tvrtko Ursulin tvrtko.ursulin@linux.intel.com L: intel-gfx@lists.freedesktop.org S: Supported W: https://01.org/linuxgraphics/ @@@ -9770,6 -9684,7 +9770,6 @@@ F: Documentation/devicetree/bindings/cr F: drivers/crypto/keembay/Kconfig F: drivers/crypto/keembay/Makefile F: drivers/crypto/keembay/keembay-ocs-ecc.c -F: drivers/crypto/keembay/ocs-ecc-curve-defs.h
INTEL KEEM BAY OCS HCU CRYPTO DRIVER M: Daniele Alessandrelli daniele.alessandrelli@intel.com @@@ -9782,13 -9697,6 +9782,13 @@@ F: drivers/crypto/keembay/keembay-ocs-h F: drivers/crypto/keembay/ocs-hcu.c F: drivers/crypto/keembay/ocs-hcu.h
+INTEL THUNDER BAY EMMC PHY DRIVER +M: Nandhini Srikandan nandhini.srikandan@intel.com +M: Rashmi A rashmi.a@intel.com +S: Maintained +F: Documentation/devicetree/bindings/phy/intel,phy-thunderbay-emmc.yaml +F: drivers/phy/intel/phy-intel-thunderbay-emmc.c + INTEL MANAGEMENT ENGINE (mei) M: Tomas Winkler tomas.winkler@intel.com L: linux-kernel@vger.kernel.org @@@ -9844,9 -9752,10 +9844,9 @@@ S: Maintaine F: drivers/mfd/intel_soc_pmic* F: include/linux/mfd/intel_soc_pmic*
-INTEL PMT DRIVER -M: "David E. Box" david.e.box@linux.intel.com -S: Maintained -F: drivers/mfd/intel_pmt.c +INTEL PMT DRIVERS +M: David E. Box david.e.box@linux.intel.com +S: Supported F: drivers/platform/x86/intel/pmt/
INTEL PRO/WIRELESS 2100, 2200BG, 2915ABG NETWORK CONNECTION SUPPORT @@@ -9913,11 -9822,6 +9913,11 @@@ L: platform-driver-x86@vger.kernel.or S: Maintained F: drivers/platform/x86/intel/uncore-frequency.c
+INTEL VENDOR SPECIFIC EXTENDED CAPABILITIES DRIVER +M: David E. Box david.e.box@linux.intel.com +S: Supported +F: drivers/platform/x86/intel/vsec.* + INTEL VIRTUAL BUTTON DRIVER M: AceLan Kao acelan.kao@canonical.com L: platform-driver-x86@vger.kernel.org @@@ -10842,13 -10746,6 +10842,13 @@@ S: Maintaine W: http://legousb.sourceforge.net/ F: drivers/usb/misc/legousbtower.c
+LETSKETCH HID TABLET DRIVER +M: Hans de Goede hdegoede@redhat.com +L: linux-input@vger.kernel.org +S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/hid/hid.git +F: drivers/hid/hid-letsketch.c + LG LAPTOP EXTRAS M: Matan Ziv-Av matan@svgalib.org L: platform-driver-x86@vger.kernel.org @@@ -11637,12 -11534,6 +11637,12 @@@ S: Maintaine F: Documentation/devicetree/bindings/media/i2c/maxim,max9286.yaml F: drivers/media/i2c/max9286.c
+MAX96712 QUAD GMSL2 DESERIALIZER DRIVER +M: Niklas Söderlund niklas.soderlund@ragnatech.se +L: linux-media@vger.kernel.org +S: Maintained +F: drivers/staging/media/max96712/max96712.c + MAX9860 MONO AUDIO VOICE CODEC DRIVER M: Peter Rosin peda@axentia.se L: alsa-devel@alsa-project.org (moderated for non-subscribers) @@@ -11678,13 -11569,6 +11678,13 @@@ S: Maintaine F: Documentation/devicetree/bindings/power/supply/maxim,max17042.yaml F: drivers/power/supply/max17042_battery.c
+MAXIM MAX20086 CAMERA POWER PROTECTOR DRIVER +M: Laurent Pinchart laurent.pinchart@ideasonboard.com +L: linux-kernel@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/regulator/maxim,max20086.yaml +F: drivers/regulator/max20086-regulator.c + MAXIM MAX77650 PMIC MFD DRIVER M: Bartosz Golaszewski brgl@bgdev.pl L: linux-kernel@vger.kernel.org @@@ -11707,12 -11591,6 +11707,12 @@@ F: Documentation/devicetree/bindings/*/ F: drivers/regulator/max77802-regulator.c F: include/dt-bindings/*/*max77802.h
+MAXIM MAX77976 BATTERY CHARGER +M: Luca Ceresoli luca@lucaceresoli.net +S: Supported +F: Documentation/devicetree/bindings/power/supply/maxim,max77976.yaml +F: drivers/power/supply/max77976_charger.c + MAXIM MUIC CHARGER DRIVERS FOR EXYNOS BASED BOARDS M: Krzysztof Kozlowski krzysztof.kozlowski@canonical.com M: Bartlomiej Zolnierkiewicz b.zolnierkie@samsung.com @@@ -11727,7 -11605,7 +11727,7 @@@ M: Krzysztof Kozlowski <krzysztof.kozlo M: Bartlomiej Zolnierkiewicz b.zolnierkie@samsung.com L: linux-kernel@vger.kernel.org S: Supported -F: Documentation/devicetree/bindings/*/max77686.txt +F: Documentation/devicetree/bindings/*/maxim,max77686.yaml F: Documentation/devicetree/bindings/clock/maxim,max77686.txt F: Documentation/devicetree/bindings/mfd/max14577.txt F: Documentation/devicetree/bindings/mfd/max77693.txt @@@ -12646,13 -12524,6 +12646,13 @@@ L: netdev@vger.kernel.or S: Maintained F: drivers/net/ethernet/microchip/lan743x_*
+MICROCHIP LAN966X ETHERNET DRIVER +M: Horatiu Vultur horatiu.vultur@microchip.com +M: UNGLinuxDriver@microchip.com +L: netdev@vger.kernel.org +S: Maintained +F: drivers/net/ethernet/microchip/lan966x/* + MICROCHIP LCDFB DRIVER M: Nicolas Ferre nicolas.ferre@microchip.com L: linux-fbdev@vger.kernel.org @@@ -13836,24 -13707,12 +13836,24 @@@ F: Documentation/devicetree/bindings/di F: drivers/gpu/drm/imx/dcss/
NXP i.MX 8QXP ADC DRIVER -M: Cai Huoqing caihuoqing@baidu.com +M: Cai Huoqing cai.huoqing@linux.dev +M: Haibo Chen haibo.chen@nxp.com +L: linux-imx@nxp.com L: linux-iio@vger.kernel.org -S: Supported +S: Maintained F: Documentation/devicetree/bindings/iio/adc/nxp,imx8qxp-adc.yaml F: drivers/iio/adc/imx8qxp-adc.c
+NXP i.MX 7D/6SX/6UL AND VF610 ADC DRIVER +M: Haibo Chen haibo.chen@nxp.com +L: linux-iio@vger.kernel.org +L: linux-imx@nxp.com +S: Maintained +F: Documentation/devicetree/bindings/iio/adc/fsl,imx7d-adc.yaml +F: Documentation/devicetree/bindings/iio/adc/fsl,vf610-adc.yaml +F: drivers/iio/adc/imx7d_adc.c +F: drivers/iio/adc/vf610_adc.c + NXP PF8100/PF8121A/PF8200 PMIC REGULATOR DEVICE DRIVER M: Jagan Teki jagan@amarulasolutions.com S: Maintained @@@ -13927,13 -13786,6 +13927,13 @@@ S: Maintaine F: Documentation/hwmon/nzxt-kraken2.rst F: drivers/hwmon/nzxt-kraken2.c
+NZXT-SMART2 HARDWARE MONITORING DRIVER +M: Aleksandr Mezin mezin.alexander@gmail.com +L: linux-hwmon@vger.kernel.org +S: Maintained +F: Documentation/hwmon/nzxt-smart2.rst +F: drivers/hwmon/nzxt-smart2.c + OBJAGG M: Jiri Pirko jiri@nvidia.com L: netdev@vger.kernel.org @@@ -14246,6 -14098,7 +14246,6 @@@ F: drivers/media/i2c/ov5647.
OMNIVISION OV5670 SENSOR DRIVER M: Chiranjeevi Rapolu chiranjeevi.rapolu@intel.com -M: Hyungwoo Yang hyungwoo.yang@intel.com L: linux-media@vger.kernel.org S: Maintained T: git git://linuxtv.org/media_tree.git @@@ -14258,13 -14111,6 +14258,13 @@@ S: Maintaine T: git git://linuxtv.org/media_tree.git F: drivers/media/i2c/ov5675.c
+OMNIVISION OV5693 SENSOR DRIVER +M: Daniel Scally djrscally@gmail.com +L: linux-media@vger.kernel.org +S: Maintained +T: git git://linuxtv.org/media_tree.git +F: drivers/media/i2c/ov5693.c + OMNIVISION OV5695 SENSOR DRIVER M: Shunqian Zheng zhengsq@rock-chips.com L: linux-media@vger.kernel.org @@@ -14541,6 -14387,15 +14541,15 @@@ F: include/net/page_pool. F: include/trace/events/page_pool.h F: net/core/page_pool.c
+ PAGE TABLE CHECK + M: Pasha Tatashin pasha.tatashin@soleen.com + M: Andrew Morton akpm@linux-foundation.org + L: linux-mm@kvack.org + S: Maintained + F: Documentation/vm/page_table_check.rst + F: include/linux/page_table_check.h + F: mm/page_table_check.c + PANASONIC LAPTOP ACPI EXTRAS DRIVER M: Kenneth Chan kenneth.t.chan@gmail.com L: platform-driver-x86@vger.kernel.org @@@ -15051,7 -14906,7 +15060,7 @@@ F: drivers/pci/controller/dwc/*spear PCMCIA SUBSYSTEM M: Dominik Brodowski linux@dominikbrodowski.net S: Odd Fixes -T: git git://git.kernel.org/pub/scm/linux/kernel/git/brodo/pcmcia.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/brodo/linux.git F: Documentation/pcmcia/ F: drivers/pcmcia/ F: include/pcmcia/ @@@ -15282,11 -15137,6 +15291,11 @@@ L: linux-omap@vger.kernel.or S: Maintained F: drivers/pinctrl/pinctrl-single.c
+PIN CONTROLLER - THUNDERBAY +M: Lakshmi Sowjanya D lakshmi.sowjanya.d@intel.com +S: Supported +F: drivers/pinctrl/pinctrl-thunderbay.c + PKTCDVD DRIVER M: linux-block@vger.kernel.org S: Orphan @@@ -15499,7 -15349,6 +15508,7 @@@ M: Sergey Senozhatsky <senozhatsky@chro R: Steven Rostedt rostedt@goodmis.org R: John Ogness john.ogness@linutronix.de S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/printk/linux.git F: include/linux/printk.h F: kernel/printk/
@@@ -15887,14 -15736,6 +15896,14 @@@ W: https://wireless.wiki.kernel.org/en/ F: Documentation/devicetree/bindings/net/wireless/qca,ath9k.yaml F: drivers/net/wireless/ath/ath9k/
+QUALCOMM BAM-DMUX WWAN NETWORK DRIVER +M: Stephan Gerhold stephan@gerhold.net +L: netdev@vger.kernel.org +L: linux-arm-msm@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/net/qcom,bam-dmux.yaml +F: drivers/net/wwan/qcom_bam_dmux.c + QUALCOMM CAMERA SUBSYSTEM DRIVER M: Robert Foss robert.foss@linaro.org M: Todor Tomov todor.too@gmail.com @@@ -15904,15 -15745,6 +15913,15 @@@ F: Documentation/admin-guide/media/qcom F: Documentation/devicetree/bindings/media/*camss* F: drivers/media/platform/qcom/camss/
+QUALCOMM CLOCK DRIVERS +M: Bjorn Andersson bjorn.andersson@linaro.org +L: linux-arm-msm@vger.kernel.org +S: Supported +T: git git://git.kernel.org/pub/scm/linux/kernel/git/qcom/linux.git +F: Documentation/devicetree/bindings/clock/qcom,* +F: drivers/clk/qcom/ +F: include/dt-bindings/clock/qcom,* + QUALCOMM CORE POWER REDUCTION (CPR) AVS DRIVER M: Niklas Cassel nks@flawful.org L: linux-pm@vger.kernel.org @@@ -16166,7 -15998,6 +16175,7 @@@ F: arch/mips/generic/board-ranchu. RANDOM NUMBER DRIVER M: "Theodore Ts'o" tytso@mit.edu M: Jason A. Donenfeld Jason@zx2c4.com +T: git https://git.kernel.org/pub/scm/linux/kernel/git/crng/random.git S: Maintained F: drivers/char/random.c
@@@ -16474,14 -16305,6 +16483,14 @@@ S: Supporte F: Documentation/devicetree/bindings/iio/adc/renesas,rzg2l-adc.yaml F: drivers/iio/adc/rzg2l_adc.c
+RENESAS R-CAR GEN3 & RZ/N1 NAND CONTROLLER DRIVER +M: Miquel Raynal miquel.raynal@bootlin.com +L: linux-mtd@lists.infradead.org +L: linux-renesas-soc@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/mtd/renesas-nandc.yaml +F: drivers/mtd/nand/raw/renesas-nand-controller.c + RESET CONTROLLER FRAMEWORK M: Philipp Zabel p.zabel@pengutronix.de S: Maintained @@@ -16652,19 -16475,27 +16661,19 @@@ ROHM POWER MANAGEMENT IC DEVICE DRIVER R: Matti Vaittinen matti.vaittinen@fi.rohmeurope.com L: linux-power@fi.rohmeurope.com S: Supported -F: Documentation/devicetree/bindings/mfd/rohm,bd70528-pmic.txt -F: Documentation/devicetree/bindings/regulator/rohm,bd70528-regulator.txt F: drivers/clk/clk-bd718x7.c -F: drivers/gpio/gpio-bd70528.c F: drivers/gpio/gpio-bd71815.c F: drivers/gpio/gpio-bd71828.c -F: drivers/mfd/rohm-bd70528.c F: drivers/mfd/rohm-bd71828.c F: drivers/mfd/rohm-bd718x7.c F: drivers/mfd/rohm-bd9576.c -F: drivers/power/supply/bd70528-charger.c -F: drivers/regulator/bd70528-regulator.c F: drivers/regulator/bd71815-regulator.c F: drivers/regulator/bd71828-regulator.c F: drivers/regulator/bd718x7-regulator.c F: drivers/regulator/bd9576-regulator.c F: drivers/regulator/rohm-regulator.c F: drivers/rtc/rtc-bd70528.c -F: drivers/watchdog/bd70528_wdt.c F: drivers/watchdog/bd9576_wdt.c -F: include/linux/mfd/rohm-bd70528.h F: include/linux/mfd/rohm-bd71815.h F: include/linux/mfd/rohm-bd71828.h F: include/linux/mfd/rohm-bd718x7.h @@@ -17015,15 -16846,13 +17024,15 @@@ F: Documentation/devicetree/bindings/ne F: drivers/nfc/s3fwrn5
SAMSUNG S5C73M3 CAMERA DRIVER -M: Andrzej Hajda a.hajda@samsung.com +M: Sylwester Nawrocki s.nawrocki@samsung.com +M: Andrzej Hajda andrzej.hajda@intel.com L: linux-media@vger.kernel.org S: Supported F: drivers/media/i2c/s5c73m3/*
SAMSUNG S5K5BAF CAMERA DRIVER -M: Andrzej Hajda a.hajda@samsung.com +M: Sylwester Nawrocki s.nawrocki@samsung.com +M: Andrzej Hajda andrzej.hajda@intel.com L: linux-media@vger.kernel.org S: Supported F: drivers/media/i2c/s5k5baf.c @@@ -17052,8 -16881,10 +17061,8 @@@ M: Chanwoo Choi <cw00.choi@samsung.com L: linux-samsung-soc@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/snawrocki/clk.git -F: Documentation/devicetree/bindings/clock/exynos*.txt F: Documentation/devicetree/bindings/clock/samsung,*.yaml F: Documentation/devicetree/bindings/clock/samsung,s3c* -F: Documentation/devicetree/bindings/clock/samsung,s5p* F: drivers/clk/samsung/ F: include/dt-bindings/clock/exynos*.h F: include/dt-bindings/clock/s3c*.h @@@ -17300,13 -17131,6 +17309,13 @@@ L: linux-mmc@vger.kernel.or S: Maintained F: drivers/mmc/host/sdhci-omap.c
+SECURE DIGITAL HOST CONTROLLER INTERFACE (SDHCI) NXP i.MX DRIVER +M: Haibo Chen haibo.chen@nxp.com +L: linux-imx@nxp.com +L: linux-mmc@vger.kernel.org +S: Maintained +F: drivers/mmc/host/sdhci-esdhc-imx.c + SECURE ENCRYPTING DEVICE (SED) OPAL DRIVER M: Jonathan Derrick jonathan.derrick@intel.com M: Revanth Rajashekar revanth.rajashekar@intel.com @@@ -17852,17 -17676,12 +17861,17 @@@ F: drivers/firmware/arm_sdei. F: include/linux/arm_sdei.h F: include/uapi/linux/arm_sdei.h
-SOFTWARE NODES +SOFTWARE NODES AND DEVICE PROPERTIES R: Andy Shevchenko andriy.shevchenko@linux.intel.com +R: Daniel Scally djrscally@gmail.com R: Heikki Krogerus heikki.krogerus@linux.intel.com +R: Sakari Ailus sakari.ailus@linux.intel.com L: linux-acpi@vger.kernel.org S: Maintained +F: drivers/base/property.c F: drivers/base/swnode.c +F: include/linux/fwnode.h +F: include/linux/property.h
SOFTWARE RAID (Multiple Disks) SUPPORT M: Song Liu song@kernel.org @@@ -18022,7 -17841,6 +18031,7 @@@ F: Documentation/sound F: include/sound/ F: include/uapi/sound/ F: sound/ +F: tools/testing/selftests/alsa
SOUND - COMPRESSED AUDIO M: Vinod Koul vkoul@kernel.org @@@ -18042,13 -17860,6 +18051,13 @@@ F: include/sound/dmaengine_pcm. F: sound/core/pcm_dmaengine.c F: sound/soc/soc-generic-dmaengine-pcm.c
+SOUND - ALSA SELFTESTS +M: Mark Brown broonie@kernel.org +L: alsa-devel@alsa-project.org (moderated for non-subscribers) +L: linux-kselftest@vger.kernel.org +S: Supported +F: tools/testing/selftests/alsa + SOUND - SOC LAYER / DYNAMIC AUDIO POWER MANAGEMENT (ASoC) M: Liam Girdwood lgirdwood@gmail.com M: Mark Brown broonie@kernel.org @@@ -18155,8 -17966,8 +18164,8 @@@ F: drivers/pinctrl/spear
SPI NOR SUBSYSTEM M: Tudor Ambarus tudor.ambarus@microchip.com +M: Pratyush Yadav p.yadav@ti.com R: Michael Walle michael@walle.cc -R: Pratyush Yadav p.yadav@ti.com L: linux-mtd@lists.infradead.org S: Maintained W: http://www.linux-mtd.infradead.org/ @@@ -18355,28 -18166,6 +18364,28 @@@ M: Ion Badulescu <ionut@badula.org S: Odd Fixes F: drivers/net/ethernet/adaptec/starfire*
+STARFIVE JH7100 CLOCK DRIVER +M: Emil Renner Berthing kernel@esmil.dk +S: Maintained +F: Documentation/devicetree/bindings/clock/starfive,jh7100-clkgen.yaml +F: drivers/clk/starfive/clk-starfive-jh7100.c +F: include/dt-bindings/clock/starfive-jh7100.h + +STARFIVE JH7100 PINCTRL DRIVER +M: Emil Renner Berthing kernel@esmil.dk +L: linux-gpio@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/pinctrl/starfive,jh7100-pinctrl.yaml +F: drivers/pinctrl/pinctrl-starfive.c +F: include/dt-bindings/pinctrl/pinctrl-starfive.h + +STARFIVE JH7100 RESET CONTROLLER DRIVER +M: Emil Renner Berthing kernel@esmil.dk +S: Maintained +F: Documentation/devicetree/bindings/reset/starfive,jh7100-reset.yaml +F: drivers/reset/reset-starfive-jh7100.c +F: include/dt-bindings/reset/starfive-jh7100.h + STATIC BRANCH/CALL M: Peter Zijlstra peterz@infradead.org M: Josh Poimboeuf jpoimboe@redhat.com @@@ -18538,7 -18327,6 +18547,7 @@@ M: Vineet Gupta <vgupta@kernel.org L: linux-snps-arc@lists.infradead.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc.git +F: Documentation/arc/ F: Documentation/devicetree/bindings/arc/* F: Documentation/devicetree/bindings/interrupt-controller/snps,arc* F: arch/arc/ @@@ -19556,6 -19344,12 +19565,6 @@@ W: https://github.com/srcres258/linux-d T: git git://github.com/srcres258/linux-doc.git doc-zh-tw F: Documentation/translations/zh_TW/
-TRIVIAL PATCHES -M: Jiri Kosina trivial@kernel.org -S: Maintained -T: git git://git.kernel.org/pub/scm/linux/kernel/git/jikos/trivial.git -K: ^Subject:.*(?i)trivial - TTY LAYER M: Greg Kroah-Hartman gregkh@linuxfoundation.org M: Jiri Slaby jirislaby@kernel.org @@@ -19660,7 -19454,6 +19669,7 @@@ S: Supporte W: http://www.linux-mtd.infradead.org/doc/ubifs.html T: git git://git.kernel.org/pub/scm/linux/kernel/git/rw/ubifs.git next T: git git://git.kernel.org/pub/scm/linux/kernel/git/rw/ubifs.git fixes +F: Documentation/ABI/testing/sysfs-fs-ubifs F: Documentation/filesystems/ubifs-authentication.rst F: Documentation/filesystems/ubifs.rst F: fs/ubifs/ @@@ -20412,8 -20205,6 +20421,8 @@@ F: include/uapi/linux/virtio_gpio. VIRTIO GPU DRIVER M: David Airlie airlied@linux.ie M: Gerd Hoffmann kraxel@redhat.com +R: Gurchetan Singh gurchetansingh@chromium.org +R: Chia-I Wu olvaffe@gmail.com L: dri-devel@lists.freedesktop.org L: virtualization@lists.linux-foundation.org S: Maintained @@@ -20647,7 -20438,7 +20656,7 @@@ M: Sergey Senozhatsky <senozhatsky@chro R: Andy Shevchenko andriy.shevchenko@linux.intel.com R: Rasmus Villemoes linux@rasmusvillemoes.dk S: Maintained -T: git git://git.kernel.org/pub/scm/linux/kernel/git/pmladek/printk.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/printk/linux.git F: Documentation/core-api/printk-formats.rst F: lib/test_printf.c F: lib/test_scanf.c @@@ -20915,13 -20706,6 +20924,13 @@@ S: Maintaine T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/mm F: arch/x86/mm/
+X86 PLATFORM ANDROID TABLETS DSDT FIXUP DRIVER +M: Hans de Goede hdegoede@redhat.com +L: platform-driver-x86@vger.kernel.org +S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/pdx86/platform-drivers-x86.git +F: drivers/platform/x86/x86-android-tablets.c + X86 PLATFORM DRIVERS M: Hans de Goede hdegoede@redhat.com M: Mark Gross markgross@kernel.org @@@ -21085,14 -20869,6 +21094,14 @@@ F: drivers/scsi/xen-scsifront. F: drivers/xen/xen-scsiback.c F: include/xen/interface/io/vscsiif.h
+XEN PVUSB DRIVER +M: Juergen Gross jgross@suse.com +L: xen-devel@lists.xenproject.org (moderated for non-subscribers) +L: linux-usb@vger.kernel.org +S: Supported +F: drivers/usb/host/xen* +F: include/xen/interface/io/usbif.h + XEN SOUND FRONTEND DRIVER M: Oleksandr Andrushchenko oleksandr_andrushchenko@epam.com L: xen-devel@lists.xenproject.org (moderated for non-subscribers) @@@ -21125,13 -20901,6 +21134,13 @@@ F: fs/xfs F: include/uapi/linux/dqblk_xfs.h F: include/uapi/linux/fsmap.h
+XILINX AMS DRIVER +M: Anand Ashok Dumbre anand.ashok.dumbre@xilinx.com +L: linux-iio@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/iio/adc/xlnx,zynqmp-ams.yaml +F: drivers/iio/adc/xilinx-ams.c + XILINX AXI ETHERNET DRIVER M: Radhey Shyam Pandey radhey.shyam.pandey@xilinx.com S: Maintained @@@ -21200,12 -20969,6 +21209,12 @@@ T: git https://github.com/Xilinx/linux- F: Documentation/devicetree/bindings/phy/xlnx,zynqmp-psgtr.yaml F: drivers/phy/xilinx/phy-zynqmp.c
+XILINX EVENT MANAGEMENT DRIVER +M: Abhyuday Godhasara abhyuday.godhasara@xilinx.com +S: Maintained +F: drivers/soc/xilinx/xlnx_event_manager.c +F: include/linux/firmware/xlnx-event-manager.h + XILLYBUS DRIVER M: Eli Billauer eli.billauer@gmail.com L: linux-kernel@vger.kernel.org diff --combined arch/Kconfig index 847fde3d22cd,4568b6b70b5d..5a1692392a4d --- a/arch/Kconfig +++ b/arch/Kconfig @@@ -1297,6 -1297,9 +1297,9 @@@ config HAVE_ARCH_PFN_VALI config ARCH_SUPPORTS_DEBUG_PAGEALLOC bool
+ config ARCH_SUPPORTS_PAGE_TABLE_CHECK + bool + config ARCH_SPLIT_ARG64 bool help @@@ -1312,10 -1315,6 +1315,10 @@@ config ARCH_HAS_PARANOID_L1D_FLUS config DYNAMIC_SIGFRAME bool
+# Select, if arch has a named attribute group bound to NUMA device nodes. +config HAVE_ARCH_NODE_DEV_GROUP + bool + source "kernel/gcov/Kconfig"
source "scripts/gcc-plugins/Kconfig" diff --combined arch/arm/mm/fault.c index a1cebe363ed5,c7326a521a69..13949510772a --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@@ -17,7 -17,6 +17,7 @@@ #include <linux/sched/debug.h> #include <linux/highmem.h> #include <linux/perf_event.h> +#include <linux/kfence.h>
#include <asm/system_misc.h> #include <asm/system_info.h> @@@ -100,11 -99,6 +100,11 @@@ void show_pte(const char *lvl, struct m { } #endif /* CONFIG_MMU */
+static inline bool is_write_fault(unsigned int fsr) +{ + return (fsr & FSR_WRITE) && !(fsr & FSR_CM); +} + static void die_kernel_fault(const char *msg, struct mm_struct *mm, unsigned long addr, unsigned int fsr, struct pt_regs *regs) @@@ -137,14 -131,10 +137,14 @@@ __do_kernel_fault(struct mm_struct *mm /* * No handler, we'll have to terminate things with extreme prejudice. */ - if (addr < PAGE_SIZE) + if (addr < PAGE_SIZE) { msg = "NULL pointer dereference"; - else + } else { + if (kfence_handle_page_fault(addr, is_write_fault(fsr), regs)) + return; + msg = "paging request"; + }
die_kernel_fault(msg, mm, addr, fsr, regs); } @@@ -201,8 -191,8 +201,8 @@@ void do_bad_area(unsigned long addr, un }
#ifdef CONFIG_MMU -#define VM_FAULT_BADMAP 0x010000 -#define VM_FAULT_BADACCESS 0x020000 +#define VM_FAULT_BADMAP ((__force vm_fault_t)0x010000) +#define VM_FAULT_BADACCESS ((__force vm_fault_t)0x020000)
static inline bool is_permission_fault(unsigned int fsr) { @@@ -271,7 -261,7 +271,7 @@@ do_page_fault(unsigned long addr, unsig if (user_mode(regs)) flags |= FAULT_FLAG_USER;
- if ((fsr & FSR_WRITE) && !(fsr & FSR_CM)) { + if (is_write_fault(fsr)) { flags |= FAULT_FLAG_WRITE; vm_flags = VM_WRITE; } @@@ -322,7 -312,7 +322,7 @@@ retry return 0; }
- if (!(fault & VM_FAULT_ERROR) && flags & FAULT_FLAG_ALLOW_RETRY) { + if (!(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED; goto retry; diff --combined arch/arm64/mm/fault.c index 9a9e7675b187,a8fb54fccde0..11e04cca0f4f --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@@ -297,8 -297,6 +297,8 @@@ static void die_kernel_fault(const cha pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg, addr);
+ kasan_non_canonical_hook(addr); + mem_abort_decode(esr);
show_pte(addr); @@@ -608,10 -606,8 +608,8 @@@ retry }
if (fault & VM_FAULT_RETRY) { - if (mm_flags & FAULT_FLAG_ALLOW_RETRY) { - mm_flags |= FAULT_FLAG_TRIED; - goto retry; - } + mm_flags |= FAULT_FLAG_TRIED; + goto retry; } mmap_read_unlock(mm);
@@@ -815,8 -811,11 +813,8 @@@ void do_mem_abort(unsigned long far, un if (!inf->fn(far, esr, regs)) return;
- if (!user_mode(regs)) { - pr_alert("Unhandled fault at 0x%016lx\n", addr); - mem_abort_decode(esr); - show_pte(addr); - } + if (!user_mode(regs)) + die_kernel_fault(inf->name, addr, esr, regs);
/* * At this point we have an unrecognized fault type whose tag bits may diff --combined arch/parisc/mm/fault.c index 147868427b7c,360b627645cc..e9eabf8f14d7 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@@ -148,11 -148,11 +148,11 @@@ int fixup_exception(struct pt_regs *reg * Fix up get_user() and put_user(). * ASM_EXCEPTIONTABLE_ENTRY_EFAULT() sets the least-significant * bit in the relative address of the fixup routine to indicate - * that %r8 should be loaded with -EFAULT to report a userspace - * access error. + * that gr[ASM_EXCEPTIONTABLE_REG] should be loaded with + * -EFAULT to report a userspace access error. */ if (fix->fixup & 1) { - regs->gr[8] = -EFAULT; + regs->gr[ASM_EXCEPTIONTABLE_REG] = -EFAULT;
/* zero target register for get_user() */ if (parisc_acctyp(0, regs->iir) == VM_READ) { @@@ -266,14 -266,14 +266,14 @@@ void do_page_fault(struct pt_regs *regs unsigned long acc_type; vm_fault_t fault = 0; unsigned int flags; - - if (faulthandler_disabled()) - goto no_context; + char *msg;
tsk = current; mm = tsk->mm; - if (!mm) + if (!mm) { + msg = "Page fault: no context"; goto no_context; + }
flags = FAULT_FLAG_DEFAULT; if (user_mode(regs)) @@@ -324,16 -324,14 +324,14 @@@ good_area goto bad_area; BUG(); } - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_RETRY) { - /* - * No need to mmap_read_unlock(mm) as we would - * have already released it in __lock_page_or_retry - * in mm/filemap.c. - */ - flags |= FAULT_FLAG_TRIED; - goto retry; - } + if (fault & VM_FAULT_RETRY) { + /* + * No need to mmap_read_unlock(mm) as we would + * have already released it in __lock_page_or_retry + * in mm/filemap.c. + */ + flags |= FAULT_FLAG_TRIED; + goto retry; } mmap_read_unlock(mm); return; @@@ -409,7 -407,6 +407,7 @@@ bad_area force_sig_fault(signo, si_code, (void __user *) address); return; } + msg = "Page fault: bad address";
no_context:
@@@ -417,13 -414,11 +415,13 @@@ return; }
- parisc_terminate("Bad Address (null pointer deref?)", regs, code, address); + parisc_terminate(msg, regs, code, address);
- out_of_memory: +out_of_memory: mmap_read_unlock(mm); - if (!user_mode(regs)) + if (!user_mode(regs)) { + msg = "Page fault: out of memory"; goto no_context; + } pagefault_out_of_memory(); } diff --combined arch/powerpc/mm/fault.c index 2d4a411c7c85,ebcc61e47d62..eb8ecd7343a9 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@@ -35,7 -35,6 +35,7 @@@ #include <linux/kfence.h> #include <linux/pkeys.h>
+#include <asm/asm-prototypes.h> #include <asm/firmware.h> #include <asm/interrupt.h> #include <asm/page.h> @@@ -517,10 -516,8 +517,8 @@@ retry * case. */ if (unlikely(fault & VM_FAULT_RETRY)) { - if (flags & FAULT_FLAG_ALLOW_RETRY) { - flags |= FAULT_FLAG_TRIED; - goto retry; - } + flags |= FAULT_FLAG_TRIED; + goto retry; }
mmap_read_unlock(current->mm); @@@ -621,27 -618,4 +619,27 @@@ DEFINE_INTERRUPT_HANDLER(do_bad_page_fa { bad_page_fault(regs, SIGSEGV); } + +/* + * In radix, segment interrupts indicate the EA is not addressable by the + * page table geometry, so they are always sent here. + * + * In hash, this is called if do_slb_fault returns error. Typically it is + * because the EA was outside the region allowed by software. + */ +DEFINE_INTERRUPT_HANDLER(do_bad_segment_interrupt) +{ + int err = regs->result; + + if (err == -EFAULT) { + if (user_mode(regs)) + _exception(SIGSEGV, regs, SEGV_BNDERR, regs->dar); + else + bad_page_fault(regs, SIGSEGV); + } else if (err == -EINVAL) { + unrecoverable_exception(regs); + } else { + BUG(); + } +} #endif diff --combined arch/s390/mm/fault.c index 6ed2886fc014,d7d6be283d94..ff16ce0d04ee --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@@ -115,7 -115,7 +115,7 @@@ static void dump_pagetable(unsigned lon pr_cont("R1:%016lx ", *table); if (*table & _REGION_ENTRY_INVALID) goto out; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_REGION2: table += (address & _REGION2_INDEX) >> _REGION2_SHIFT; @@@ -124,7 -124,7 +124,7 @@@ pr_cont("R2:%016lx ", *table); if (*table & _REGION_ENTRY_INVALID) goto out; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_REGION3: table += (address & _REGION3_INDEX) >> _REGION3_SHIFT; @@@ -133,7 -133,7 +133,7 @@@ pr_cont("R3:%016lx ", *table); if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE)) goto out; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_SEGMENT: table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; @@@ -142,7 -142,7 +142,7 @@@ pr_cont("S:%016lx ", *table); if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE)) goto out; - table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); + table = __va(*table & _SEGMENT_ENTRY_ORIGIN); } table += (address & _PAGE_INDEX) >> _PAGE_SHIFT; if (bad_address(table)) @@@ -452,21 -452,21 +452,21 @@@ retry if (unlikely(fault & VM_FAULT_ERROR)) goto out_up;
- if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_RETRY) { - if (IS_ENABLED(CONFIG_PGSTE) && gmap && - (flags & FAULT_FLAG_RETRY_NOWAIT)) { - /* FAULT_FLAG_RETRY_NOWAIT has been set, - * mmap_lock has not been released */ - current->thread.gmap_pfault = 1; - fault = VM_FAULT_PFAULT; - goto out_up; - } - flags &= ~FAULT_FLAG_RETRY_NOWAIT; - flags |= FAULT_FLAG_TRIED; - mmap_read_lock(mm); - goto retry; + if (fault & VM_FAULT_RETRY) { + if (IS_ENABLED(CONFIG_PGSTE) && gmap && + (flags & FAULT_FLAG_RETRY_NOWAIT)) { + /* + * FAULT_FLAG_RETRY_NOWAIT has been set, mmap_lock has + * not been released + */ + current->thread.gmap_pfault = 1; + fault = VM_FAULT_PFAULT; + goto out_up; } + flags &= ~FAULT_FLAG_RETRY_NOWAIT; + flags |= FAULT_FLAG_TRIED; + mmap_read_lock(mm); + goto retry; } if (IS_ENABLED(CONFIG_PGSTE) && gmap) { address = __gmap_link(gmap, current->thread.gmap_addr, diff --combined arch/um/kernel/trap.c index 561a2b03c3cf,193503484af5..d1d5d0be0308 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@@ -87,12 -87,10 +87,10 @@@ good_area } BUG(); } - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_RETRY) { - flags |= FAULT_FLAG_TRIED; + if (fault & VM_FAULT_RETRY) { + flags |= FAULT_FLAG_TRIED;
- goto retry; - } + goto retry; }
pmd = pmd_off(mm, address); @@@ -127,6 -125,7 +125,6 @@@ out_of_memory pagefault_out_of_memory(); return 0; } -EXPORT_SYMBOL(handle_page_fault);
static void show_segv_info(struct uml_pt_regs *regs) { diff --combined arch/x86/Kconfig index 976dd6b532bf,d0628415b93e..407533c835fe --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@@ -104,6 -104,7 +104,7 @@@ config X8 select ARCH_SUPPORTS_ACPI select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC + select ARCH_SUPPORTS_PAGE_TABLE_CHECK if X86_64 select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 select ARCH_SUPPORTS_LTO_CLANG @@@ -269,7 -270,6 +270,7 @@@ select HAVE_ARCH_KCSAN if X86_64 select X86_FEATURE_NAMES if PROC_FS select PROC_PID_ARCH_STATUS if PROC_FS + select HAVE_ARCH_NODE_DEV_GROUP if X86_SGX imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI
config INSTRUCTION_DECODER @@@ -473,18 -473,6 +474,18 @@@ config RETPOLIN branches. Requires a compiler with -mindirect-branch=thunk-extern support for full protection. The kernel may run slower.
+config CC_HAS_SLS + def_bool $(cc-option,-mharden-sls=all) + +config SLS + bool "Mitigate Straight-Line-Speculation" + depends on CC_HAS_SLS && X86_64 + default n + help + Compile the kernel with straight-line-speculation options to guard + against straight line speculation. The kernel image might be slightly + larger. + config X86_CPU_RESCTRL bool "x86 CPU resource control support" depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD) @@@ -1536,20 -1524,16 +1537,20 @@@ config X86_CPA_STATISTIC helps to determine the effectiveness of preserving large and huge page mappings when mapping protections are changed.
+config X86_MEM_ENCRYPT + select ARCH_HAS_FORCE_DMA_UNENCRYPTED + select DYNAMIC_PHYSICAL_MASK + select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS + def_bool n + config AMD_MEM_ENCRYPT bool "AMD Secure Memory Encryption (SME) support" depends on X86_64 && CPU_SUP_AMD select DMA_COHERENT_POOL - select DYNAMIC_PHYSICAL_MASK select ARCH_USE_MEMREMAP_PROT - select ARCH_HAS_FORCE_DMA_UNENCRYPTED select INSTRUCTION_DECODER - select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS select ARCH_HAS_CC_PLATFORM + select X86_MEM_ENCRYPT help Say yes to enable support for the encryption of system memory. This requires an AMD processor that supports Secure Memory @@@ -1934,7 -1918,6 +1935,7 @@@ config X86_SG select SRCU select MMU_NOTIFIER select NUMA_KEEP_MEMINFO if NUMA + select XARRAY_MULTI help Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions that can be used by applications to set aside private regions of code @@@ -1964,7 -1947,7 +1965,7 @@@ config EF
config EFI_STUB bool "EFI stub support" - depends on EFI && !X86_USE_3DNOW + depends on EFI depends on $(cc-option,-mabi=ms) || X86_32 select RELOCATABLE help diff --combined arch/x86/include/asm/pgtable.h index a34430b7af4a,d7d287ac1018..8a9432fb3802 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@@ -22,11 -22,11 +22,12 @@@ #define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot)))
#ifndef __ASSEMBLY__ +#include <linux/spinlock.h> #include <asm/x86_init.h> #include <asm/pkru.h> #include <asm/fpu/api.h> #include <asm-generic/pgtable_uffd.h> + #include <linux/page_table_check.h>
extern pgd_t early_top_pgt[PTRS_PER_PGD]; bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd); @@@ -753,7 -753,7 +754,7 @@@ static inline bool pte_accessible(struc return true;
if ((pte_flags(a) & _PAGE_PROTNONE) && - mm_tlb_flush_pending(mm)) + atomic_read(&mm->tlb_flush_pending)) return true;
return false; @@@ -1007,18 -1007,21 +1008,21 @@@ static inline pud_t native_local_pudp_g static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { + page_table_check_pte_set(mm, addr, ptep, pte); set_pte(ptep, pte); }
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { + page_table_check_pmd_set(mm, addr, pmdp, pmd); set_pmd(pmdp, pmd); }
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { + page_table_check_pud_set(mm, addr, pudp, pud); native_set_pud(pudp, pud); }
@@@ -1049,6 -1052,7 +1053,7 @@@ static inline pte_t ptep_get_and_clear( pte_t *ptep) { pte_t pte = native_ptep_get_and_clear(ptep); + page_table_check_pte_clear(mm, addr, pte); return pte; }
@@@ -1064,12 -1068,23 +1069,23 @@@ static inline pte_t ptep_get_and_clear_ * care about updates and native needs no locking */ pte = native_local_ptep_get_and_clear(ptep); + page_table_check_pte_clear(mm, addr, pte); } else { pte = ptep_get_and_clear(mm, addr, ptep); } return pte; }
+ #define __HAVE_ARCH_PTEP_CLEAR + static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) + { + if (IS_ENABLED(CONFIG_PAGE_TABLE_CHECK)) + ptep_get_and_clear(mm, addr, ptep); + else + pte_clear(mm, addr, ptep); + } + #define __HAVE_ARCH_PTEP_SET_WRPROTECT static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) @@@ -1110,14 -1125,22 +1126,22 @@@ static inline int pmd_write(pmd_t pmd static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - return native_pmdp_get_and_clear(pmdp); + pmd_t pmd = native_pmdp_get_and_clear(pmdp); + + page_table_check_pmd_clear(mm, addr, pmd); + + return pmd; }
#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pud_t *pudp) { - return native_pudp_get_and_clear(pudp); + pud_t pud = native_pudp_get_and_clear(pudp); + + page_table_check_pud_clear(mm, addr, pud); + + return pud; }
#define __HAVE_ARCH_PMDP_SET_WRPROTECT @@@ -1138,6 -1161,7 +1162,7 @@@ static inline int pud_write(pud_t pud static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { + page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); if (IS_ENABLED(CONFIG_SMP)) { return xchg(pmdp, pmd); } else { diff --combined drivers/block/zram/zram_drv.c index f6da5293b913,9a46b2ef6951..cb253d80d72b --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@@ -1903,14 -1903,7 +1903,7 @@@ static struct attribute *zram_disk_attr NULL, };
- static const struct attribute_group zram_disk_attr_group = { - .attrs = zram_disk_attrs, - }; - - static const struct attribute_group *zram_disk_attr_groups[] = { - &zram_disk_attr_group, - NULL, - }; + ATTRIBUTE_GROUPS(zram_disk);
/* * Allocate and initialize new zram device. the function returns @@@ -1947,7 -1940,6 +1940,7 @@@ static int zram_add(void zram->disk->major = zram_major; zram->disk->first_minor = device_id; zram->disk->minors = 1; + zram->disk->flags |= GENHD_FL_NO_PART; zram->disk->fops = &zram_devops; zram->disk->private_data = zram; snprintf(zram->disk->disk_name, 16, "zram%d", device_id); @@@ -1983,7 -1975,7 +1976,7 @@@ blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX);
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue); - ret = device_add_disk(NULL, zram->disk, zram_disk_attr_groups); + ret = device_add_disk(NULL, zram->disk, zram_disk_groups); if (ret) goto out_cleanup_disk;
diff --combined drivers/dax/bus.c index ee4568ef757c,a22350e822fa..1dad813ee4a6 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@@ -10,6 -10,8 +10,6 @@@ #include "dax-private.h" #include "bus.h"
-static struct class *dax_class; - static DEFINE_MUTEX(dax_bus_lock);
#define DAX_NAME_LEN 30 @@@ -127,11 -129,35 +127,35 @@@ ATTRIBUTE_GROUPS(dax_drv)
static int dax_bus_match(struct device *dev, struct device_driver *drv);
+ /* + * Static dax regions are regions created by an external subsystem + * nvdimm where a single range is assigned. Its boundaries are by the external + * subsystem and are usually limited to one physical memory range. For example, + * for PMEM it is usually defined by NVDIMM Namespace boundaries (i.e. a + * single contiguous range) + * + * On dynamic dax regions, the assigned region can be partitioned by dax core + * into multiple subdivisions. A subdivision is represented into one + * /dev/daxN.M device composed by one or more potentially discontiguous ranges. + * + * When allocating a dax region, drivers must set whether it's static + * (IORESOURCE_DAX_STATIC). On static dax devices, the @pgmap is pre-assigned + * to dax core when calling devm_create_dev_dax(), whereas in dynamic dax + * devices it is NULL but afterwards allocated by dax core on device ->probe(). + * Care is needed to make sure that dynamic dax devices are torn down with a + * cleared @pgmap field (see kill_dev_dax()). + */ static bool is_static(struct dax_region *dax_region) { return (dax_region->res.flags & IORESOURCE_DAX_STATIC) != 0; }
+ bool static_dev_dax(struct dev_dax *dev_dax) + { + return is_static(dev_dax->region); + } + EXPORT_SYMBOL_GPL(static_dev_dax); + static u64 dev_dax_size(struct dev_dax *dev_dax) { u64 size = 0; @@@ -361,6 -387,14 +385,14 @@@ void kill_dev_dax(struct dev_dax *dev_d
kill_dax(dax_dev); unmap_mapping_range(inode->i_mapping, 0, 0, 1); + + /* + * Dynamic dax region have the pgmap allocated via dev_kzalloc() + * and thus freed by devm. Clear the pgmap to not have stale pgmap + * ranges on probe() from previous reconfigurations of region devices. + */ + if (!static_dev_dax(dev_dax)) + dev_dax->pgmap = NULL; } EXPORT_SYMBOL_GPL(kill_dev_dax);
@@@ -1321,17 -1355,14 +1353,17 @@@ struct dev_dax *devm_create_dev_dax(str }
/* - * No 'host' or dax_operations since there is no access to this - * device outside of mmap of the resulting character device. + * No dax_operations since there is no access to this device outside of + * mmap of the resulting character device. */ - dax_dev = alloc_dax(dev_dax, NULL, NULL, DAXDEV_F_SYNC); + dax_dev = alloc_dax(dev_dax, NULL); if (IS_ERR(dax_dev)) { rc = PTR_ERR(dax_dev); goto err_alloc_dax; } + set_dax_synchronous(dax_dev); + set_dax_nocache(dax_dev); + set_dax_nomc(dax_dev);
/* a device_dax instance is dead while the driver is not attached */ kill_dax(dax_dev); @@@ -1344,7 -1375,10 +1376,7 @@@
inode = dax_inode(dax_dev); dev->devt = inode->i_rdev; - if (data->subsys == DEV_DAX_BUS) - dev->bus = &dax_bus_type; - else - dev->class = dax_class; + dev->bus = &dax_bus_type; dev->parent = parent; dev->type = &dev_dax_type;
@@@ -1443,10 -1477,22 +1475,10 @@@ EXPORT_SYMBOL_GPL(dax_driver_unregister
int __init dax_bus_init(void) { - int rc; - - if (IS_ENABLED(CONFIG_DEV_DAX_PMEM_COMPAT)) { - dax_class = class_create(THIS_MODULE, "dax"); - if (IS_ERR(dax_class)) - return PTR_ERR(dax_class); - } - - rc = bus_register(&dax_bus_type); - if (rc) - class_destroy(dax_class); - return rc; + return bus_register(&dax_bus_type); }
void __exit dax_bus_exit(void) { bus_unregister(&dax_bus_type); - class_destroy(dax_class); } diff --combined drivers/dax/bus.h index 381cec9ff05c,4acdfee7dd59..fbb940293d6d --- a/drivers/dax/bus.h +++ b/drivers/dax/bus.h @@@ -16,15 -16,24 +16,15 @@@ struct dax_region *alloc_dax_region(str struct range *range, int target_node, unsigned int align, unsigned long flags);
-enum dev_dax_subsys { - DEV_DAX_BUS = 0, /* zeroed dev_dax_data picks this by default */ - DEV_DAX_CLASS, -}; - struct dev_dax_data { struct dax_region *dax_region; struct dev_pagemap *pgmap; - enum dev_dax_subsys subsys; resource_size_t size; int id; };
struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data);
-/* to be deleted when DEV_DAX_CLASS is removed */ -struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys); - struct dax_device_driver { struct device_driver drv; struct list_head ids; @@@ -39,7 -48,12 +39,8 @@@ int __dax_driver_register(struct dax_de __dax_driver_register(driver, THIS_MODULE, KBUILD_MODNAME) void dax_driver_unregister(struct dax_device_driver *dax_drv); void kill_dev_dax(struct dev_dax *dev_dax); + bool static_dev_dax(struct dev_dax *dev_dax);
-#if IS_ENABLED(CONFIG_DEV_DAX_PMEM_COMPAT) -int dev_dax_probe(struct dev_dax *dev_dax); -#endif - /* * While run_dax() is potentially a generic operation that could be * defined in include/linux/dax.h we don't want to grow any users diff --combined drivers/dax/device.c index e58d597f0415,591f293d326f..d33a0613ed0c --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@@ -73,11 -73,39 +73,39 @@@ __weak phys_addr_t dax_pgoff_to_phys(st return -1; }
+ static void dax_set_mapping(struct vm_fault *vmf, pfn_t pfn, + unsigned long fault_size) + { + unsigned long i, nr_pages = fault_size / PAGE_SIZE; + struct file *filp = vmf->vma->vm_file; + struct dev_dax *dev_dax = filp->private_data; + pgoff_t pgoff; + + /* mapping is only set on the head */ + if (dev_dax->pgmap->vmemmap_shift) + nr_pages = 1; + + pgoff = linear_page_index(vmf->vma, + ALIGN(vmf->address, fault_size)); + + for (i = 0; i < nr_pages; i++) { + struct page *page = pfn_to_page(pfn_t_to_pfn(pfn) + i); + + page = compound_head(page); + if (page->mapping) + continue; + + page->mapping = filp->f_mapping; + page->index = pgoff + i; + } + } + static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax, - struct vm_fault *vmf, pfn_t *pfn) + struct vm_fault *vmf) { struct device *dev = &dev_dax->dev; phys_addr_t phys; + pfn_t pfn; unsigned int fault_size = PAGE_SIZE;
if (check_vma(dev_dax, vmf->vma, __func__)) @@@ -98,18 -126,21 +126,21 @@@ return VM_FAULT_SIGBUS; }
- *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); + pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
- return vmf_insert_mixed(vmf->vma, vmf->address, *pfn); + dax_set_mapping(vmf, pfn, fault_size); + + return vmf_insert_mixed(vmf->vma, vmf->address, pfn); }
static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax, - struct vm_fault *vmf, pfn_t *pfn) + struct vm_fault *vmf) { unsigned long pmd_addr = vmf->address & PMD_MASK; struct device *dev = &dev_dax->dev; phys_addr_t phys; pgoff_t pgoff; + pfn_t pfn; unsigned int fault_size = PMD_SIZE;
if (check_vma(dev_dax, vmf->vma, __func__)) @@@ -138,19 -169,22 +169,22 @@@ return VM_FAULT_SIGBUS; }
- *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); + pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
- return vmf_insert_pfn_pmd(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE); + dax_set_mapping(vmf, pfn, fault_size); + + return vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE); }
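To make the head-page shortcut in dax_set_mapping() concrete, a worked example assuming 4 KiB base pages (the usual x86-64 configuration):

	/* PMD fault: fault_size = PMD_SIZE = 2 MiB */
	unsigned long nr_pages = PMD_SIZE / PAGE_SIZE;	/* 512 base pages */

	/*
	 * With pgmap->vmemmap_shift == 9 the metadata is a compound page,
	 * so dax_set_mapping() only initializes ->mapping/->index on the
	 * head page (nr_pages is forced to 1); with a zero shift all 512
	 * struct pages are walked.
	 */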
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, - struct vm_fault *vmf, pfn_t *pfn) + struct vm_fault *vmf) { unsigned long pud_addr = vmf->address & PUD_MASK; struct device *dev = &dev_dax->dev; phys_addr_t phys; pgoff_t pgoff; + pfn_t pfn; unsigned int fault_size = PUD_SIZE;
@@@ -180,13 -214,15 +214,15 @@@ return VM_FAULT_SIGBUS; }
- *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); + pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
- return vmf_insert_pfn_pud(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE); + dax_set_mapping(vmf, pfn, fault_size); + + return vmf_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE); } #else static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, - struct vm_fault *vmf, pfn_t *pfn) + struct vm_fault *vmf) { return VM_FAULT_FALLBACK; } @@@ -196,10 -232,8 +232,8 @@@ static vm_fault_t dev_dax_huge_fault(st enum page_entry_size pe_size) { struct file *filp = vmf->vma->vm_file; - unsigned long fault_size; vm_fault_t rc = VM_FAULT_SIGBUS; int id; - pfn_t pfn; struct dev_dax *dev_dax = filp->private_data;
dev_dbg(&dev_dax->dev, "%s: %s (%#lx - %#lx) size = %d\n", current->comm, @@@ -209,43 -243,18 +243,18 @@@ id = dax_read_lock(); switch (pe_size) { case PE_SIZE_PTE: - fault_size = PAGE_SIZE; - rc = __dev_dax_pte_fault(dev_dax, vmf, &pfn); + rc = __dev_dax_pte_fault(dev_dax, vmf); break; case PE_SIZE_PMD: - fault_size = PMD_SIZE; - rc = __dev_dax_pmd_fault(dev_dax, vmf, &pfn); + rc = __dev_dax_pmd_fault(dev_dax, vmf); break; case PE_SIZE_PUD: - fault_size = PUD_SIZE; - rc = __dev_dax_pud_fault(dev_dax, vmf, &pfn); + rc = __dev_dax_pud_fault(dev_dax, vmf); break; default: rc = VM_FAULT_SIGBUS; }
- if (rc == VM_FAULT_NOPAGE) { - unsigned long i; - pgoff_t pgoff; - - /* - * In the device-dax case the only possibility for a - * VM_FAULT_NOPAGE result is when device-dax capacity is - * mapped. No need to consider the zero page, or racing - * conflicting mappings. - */ - pgoff = linear_page_index(vmf->vma, vmf->address - & ~(fault_size - 1)); - for (i = 0; i < fault_size / PAGE_SIZE; i++) { - struct page *page; - - page = pfn_to_page(pfn_t_to_pfn(pfn) + i); - if (page->mapping) - continue; - page->mapping = filp->f_mapping; - page->index = pgoff + i; - } - } dax_read_unlock(id);
return rc; @@@ -398,17 -407,34 +407,34 @@@ int dev_dax_probe(struct dev_dax *dev_d void *addr; int rc, i;
- pgmap = dev_dax->pgmap; - if (dev_WARN_ONCE(dev, pgmap && dev_dax->nr_range > 1, - "static pgmap / multi-range device conflict\n")) - return -EINVAL; + if (static_dev_dax(dev_dax)) { + if (dev_dax->nr_range > 1) { + dev_warn(dev, + "static pgmap / multi-range device conflict\n"); + return -EINVAL; + }
- if (!pgmap) { - pgmap = devm_kzalloc(dev, sizeof(*pgmap) + sizeof(struct range) - * (dev_dax->nr_range - 1), GFP_KERNEL); + pgmap = dev_dax->pgmap; + } else { + if (dev_dax->pgmap) { + dev_warn(dev, + "dynamic-dax with pre-populated page map\n"); + return -EINVAL; + } + + pgmap = devm_kzalloc(dev, + struct_size(pgmap, ranges, dev_dax->nr_range - 1), + GFP_KERNEL); if (!pgmap) return -ENOMEM; + pgmap->nr_range = dev_dax->nr_range; + dev_dax->pgmap = pgmap; + + for (i = 0; i < dev_dax->nr_range; i++) { + struct range *range = &dev_dax->ranges[i].range; + pgmap->ranges[i] = *range; + } }
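The struct_size() helper replaces the open-coded sizeof arithmetic of the removed branch. Assuming dev_pagemap keeps its trailing flexible array of ranges (one struct range is already included in sizeof(*pgmap), hence the nr_range - 1), the allocation is roughly equivalent to:

	/* like struct_size(), minus its overflow saturation */
	size_t bytes = sizeof(struct dev_pagemap) +
		       (dev_dax->nr_range - 1) * sizeof(struct range);

	pgmap = devm_kzalloc(dev, bytes, GFP_KERNEL);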
for (i = 0; i < dev_dax->nr_range; i++) { @@@ -420,12 -446,12 +446,12 @@@ i, range->start, range->end); return -EBUSY; } - /* don't update the range for static pgmap */ - if (!dev_dax->pgmap) - pgmap->ranges[i] = *range; }
pgmap->type = MEMORY_DEVICE_GENERIC; + if (dev_dax->align > PAGE_SIZE) + pgmap->vmemmap_shift = + order_base_2(dev_dax->align >> PAGE_SHIFT); addr = devm_memremap_pages(dev, pgmap); if (IS_ERR(addr)) return PTR_ERR(addr); @@@ -433,7 -459,11 +459,7 @@@ inode = dax_inode(dax_dev); cdev = inode->i_cdev; cdev_init(cdev, &dax_fops); - if (dev->class) { - /* for the CONFIG_DEV_DAX_PMEM_COMPAT case */ - cdev->owner = dev->parent->driver->owner; - } else - cdev->owner = dev->driver->owner; + cdev->owner = dev->driver->owner; cdev_set_parent(cdev, &dev->kobj); rc = cdev_add(cdev, dev->devt, 1); if (rc) diff --combined drivers/of/fdt.c index ca2cfb3012a4,116c582fea7a..ad85ff6474ff --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@@ -26,6 -26,7 +26,7 @@@ #include <linux/serial_core.h> #include <linux/sysfs.h> #include <linux/random.h> + #include <linux/kmemleak.h>
#include <asm/setup.h> /* for COMMAND_LINE_SIZE */ #include <asm/page.h> @@@ -482,11 -483,9 +483,11 @@@ static int __init early_init_dt_reserve if (nomap) { /* * If the memory is already reserved (by another region), we - * should not allow it to be marked nomap. + * should not allow it to be marked nomap, but don't worry + * if the region isn't memory as it won't be mapped. */ - if (memblock_is_region_reserved(base, size)) + if (memblock_overlaps_region(&memblock.memory, base, size) && + memblock_is_region_reserved(base, size)) return -EBUSY;
return memblock_mark_nomap(base, size); @@@ -524,9 -523,12 +525,12 @@@ static int __init __reserved_mem_reserv size = dt_mem_next_cell(dt_root_size_cells, &prop);
if (size && - early_init_dt_reserve_memory_arch(base, size, nomap) == 0) + early_init_dt_reserve_memory_arch(base, size, nomap) == 0) { pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n", uname, &base, (unsigned long)(size / SZ_1M)); + if (!nomap) + kmemleak_alloc_phys(base, size, 0, 0); + } else pr_info("Reserved memory: failed to reserve memory for node '%s': base %pa, size %lu MiB\n", uname, &base, (unsigned long)(size / SZ_1M)); @@@ -967,22 -969,18 +971,22 @@@ static void __init early_init_dt_check_ elfcorehdr_addr, elfcorehdr_size); }
-static phys_addr_t cap_mem_addr; -static phys_addr_t cap_mem_size; +static unsigned long chosen_node_offset = -FDT_ERR_NOTFOUND;
/** * early_init_dt_check_for_usable_mem_range - Decode usable memory range * location from flat tree - * @node: reference to node containing usable memory range location ('chosen') */ -static void __init early_init_dt_check_for_usable_mem_range(unsigned long node) +void __init early_init_dt_check_for_usable_mem_range(void) { const __be32 *prop; int len; + phys_addr_t cap_mem_addr; + phys_addr_t cap_mem_size; + unsigned long node = chosen_node_offset; + + if ((long)node < 0) + return;
pr_debug("Looking for usable-memory-range property... ");
@@@ -995,8 -993,6 +999,8 @@@
pr_debug("cap_mem_start=%pa cap_mem_size=%pa\n", &cap_mem_addr, &cap_mem_size); + + memblock_cap_memory_range(cap_mem_addr, cap_mem_size); }
#ifdef CONFIG_SERIAL_EARLYCON @@@ -1050,14 -1046,13 +1054,14 @@@ int __init early_init_dt_scan_chosen_st /* * early_init_dt_scan_root - fetch the top level address and size cells */ -int __init early_init_dt_scan_root(unsigned long node, const char *uname, - int depth, void *data) +int __init early_init_dt_scan_root(void) { const __be32 *prop; + const void *fdt = initial_boot_params; + int node = fdt_path_offset(fdt, "/");
- if (depth != 0) - return 0; + if (node < 0) + return -ENODEV;
dt_root_size_cells = OF_ROOT_NODE_SIZE_CELLS_DEFAULT; dt_root_addr_cells = OF_ROOT_NODE_ADDR_CELLS_DEFAULT; @@@ -1072,7 -1067,8 +1076,7 @@@ dt_root_addr_cells = be32_to_cpup(prop); pr_debug("dt_root_addr_cells = %x\n", dt_root_addr_cells);
- /* break now */ - return 1; + return 0; }
u64 __init dt_mem_next_cell(int s, const __be32 **cellp) @@@ -1086,78 -1082,73 +1090,78 @@@ /* * early_init_dt_scan_memory - Look for and parse memory nodes */ -int __init early_init_dt_scan_memory(unsigned long node, const char *uname, - int depth, void *data) +int __init early_init_dt_scan_memory(void) { - const char *type = of_get_flat_dt_prop(node, "device_type", NULL); - const __be32 *reg, *endp; - int l; - bool hotpluggable; + int node; + const void *fdt = initial_boot_params;
- /* We are scanning "memory" nodes only */ - if (type == NULL || strcmp(type, "memory") != 0) - return 0; + fdt_for_each_subnode(node, fdt, 0) { + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be32 *reg, *endp; + int l; + bool hotpluggable;
- reg = of_get_flat_dt_prop(node, "linux,usable-memory", &l); - if (reg == NULL) - reg = of_get_flat_dt_prop(node, "reg", &l); - if (reg == NULL) - return 0; + /* We are scanning "memory" nodes only */ + if (type == NULL || strcmp(type, "memory") != 0) + continue;
- endp = reg + (l / sizeof(__be32)); - hotpluggable = of_get_flat_dt_prop(node, "hotpluggable", NULL); + reg = of_get_flat_dt_prop(node, "linux,usable-memory", &l); + if (reg == NULL) + reg = of_get_flat_dt_prop(node, "reg", &l); + if (reg == NULL) + continue;
- pr_debug("memory scan node %s, reg size %d,\n", uname, l); + endp = reg + (l / sizeof(__be32)); + hotpluggable = of_get_flat_dt_prop(node, "hotpluggable", NULL);
- while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) { - u64 base, size; + pr_debug("memory scan node %s, reg size %d,\n", + fdt_get_name(fdt, node, NULL), l);
- base = dt_mem_next_cell(dt_root_addr_cells, ®); - size = dt_mem_next_cell(dt_root_size_cells, ®); + while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) { + u64 base, size;
- if (size == 0) - continue; - pr_debug(" - %llx, %llx\n", base, size); + base = dt_mem_next_cell(dt_root_addr_cells, ®); + size = dt_mem_next_cell(dt_root_size_cells, ®);
- early_init_dt_add_memory_arch(base, size); + if (size == 0) + continue; + pr_debug(" - %llx, %llx\n", base, size);
- if (!hotpluggable) - continue; + early_init_dt_add_memory_arch(base, size);
- if (memblock_mark_hotplug(base, size)) - pr_warn("failed to mark hotplug range 0x%llx - 0x%llx\n", - base, base + size); - } + if (!hotpluggable) + continue;
+ if (memblock_mark_hotplug(base, size)) + pr_warn("failed to mark hotplug range 0x%llx - 0x%llx\n", + base, base + size); + } + } return 0; }
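The memory scan is thus driven by libfdt iteration instead of of_scan_flat_dt() callbacks. A minimal, illustrative sketch of the same pattern in isolation:

	const void *fdt = initial_boot_params;
	int node;

	fdt_for_each_subnode(node, fdt, 0 /* root node offset */) {
		const char *type =
			of_get_flat_dt_prop(node, "device_type", NULL);

		if (type && !strcmp(type, "memory"))
			;	/* parse "linux,usable-memory" or "reg" here */
	}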
-int __init early_init_dt_scan_chosen(unsigned long node, const char *uname, - int depth, void *data) +int __init early_init_dt_scan_chosen(char *cmdline) { - int l; + int l, node; const char *p; const void *rng_seed; + const void *fdt = initial_boot_params;
- pr_debug("search "chosen", depth: %d, uname: %s\n", depth, uname); + node = fdt_path_offset(fdt, "/chosen"); + if (node < 0) + node = fdt_path_offset(fdt, "/chosen@0"); + if (node < 0) + return -ENOENT;
- if (depth != 1 || !data || - (strcmp(uname, "chosen") != 0 && strcmp(uname, "chosen@0") != 0)) - return 0; + chosen_node_offset = node;
early_init_dt_check_for_initrd(node); early_init_dt_check_for_elfcorehdr(node); - early_init_dt_check_for_usable_mem_range(node);
/* Retrieve command line */ p = of_get_flat_dt_prop(node, "bootargs", &l); if (p != NULL && l > 0) - strlcpy(data, p, min(l, COMMAND_LINE_SIZE)); + strlcpy(cmdline, p, min(l, COMMAND_LINE_SIZE));
/* * CONFIG_CMDLINE is meant to be a default in case nothing else * chose to modify the command line. @@@ -1166,18 -1157,18 +1170,18 @@@ */ #ifdef CONFIG_CMDLINE #if defined(CONFIG_CMDLINE_EXTEND) - strlcat(data, " ", COMMAND_LINE_SIZE); - strlcat(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE); + strlcat(cmdline, " ", COMMAND_LINE_SIZE); + strlcat(cmdline, CONFIG_CMDLINE, COMMAND_LINE_SIZE); #elif defined(CONFIG_CMDLINE_FORCE) - strlcpy(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE); + strlcpy(cmdline, CONFIG_CMDLINE, COMMAND_LINE_SIZE); #else /* No arguments from boot loader, use kernel's cmdline */ - if (!((char *)data)[0]) - strlcpy(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE); + if (!((char *)cmdline)[0]) + strlcpy(cmdline, CONFIG_CMDLINE, COMMAND_LINE_SIZE); #endif #endif /* CONFIG_CMDLINE */
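For reference, the outcome of the three CONFIG_CMDLINE variants, using a hypothetical bootargs of "root=/dev/sda1" and CONFIG_CMDLINE of "quiet":

/*
 *   CONFIG_CMDLINE_EXTEND:  "root=/dev/sda1 quiet"  (bootargs, then append)
 *   CONFIG_CMDLINE_FORCE:   "quiet"                 (bootargs ignored)
 *   neither:                "root=/dev/sda1"        (CONFIG_CMDLINE used only
 *                                                    when bootargs is empty)
 */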
- pr_debug("Command line is: %s\n", (char *)data); + pr_debug("Command line is: %s\n", (char *)cmdline);
rng_seed = of_get_flat_dt_prop(node, "rng-seed", &l); if (rng_seed && l > 0) { @@@ -1191,7 -1182,8 +1195,7 @@@ fdt_totalsize(initial_boot_params)); }
- /* break now */ - return 1; + return 0; }
#ifndef MIN_MEMBLOCK_ADDR @@@ -1273,21 -1265,21 +1277,21 @@@ bool __init early_init_dt_verify(void *
void __init early_init_dt_scan_nodes(void) { - int rc = 0; + int rc;
/* Initialize {size,address}-cells info */ - of_scan_flat_dt(early_init_dt_scan_root, NULL); + early_init_dt_scan_root();
/* Retrieve various information from the /chosen node */ - rc = of_scan_flat_dt(early_init_dt_scan_chosen, boot_command_line); - if (!rc) + rc = early_init_dt_scan_chosen(boot_command_line); + if (rc) pr_warn("No chosen node found, continuing without\n");
/* Setup memory, calling early_init_dt_add_memory_arch */ - of_scan_flat_dt(early_init_dt_scan_memory, NULL); + early_init_dt_scan_memory();
/* Handle linux,usable-memory-range property */ - memblock_cap_memory_range(cap_mem_addr, cap_mem_size); + early_init_dt_check_for_usable_mem_range(); }
bool __init early_init_dt_scan(void *params) diff --combined fs/ext4/extents.c index 1077ce7e189f,5582fba36b44..74c91da585d7 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@@ -27,8 -27,8 +27,8 @@@ #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/fiemap.h> - #include <linux/backing-dev.h> #include <linux/iomap.h> + #include <linux/sched/mm.h> #include "ext4_jbd2.h" #include "ext4_extents.h" #include "xattr.h" @@@ -1496,7 -1496,8 +1496,7 @@@ static int ext4_ext_search_left(struct EXT4_ERROR_INODE(inode, "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", ix != NULL ? le32_to_cpu(ix->ei_block) : 0, - EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ? - le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0, + le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block), depth); return -EFSCORRUPTED; } @@@ -2024,6 -2025,7 +2024,6 @@@ int ext4_ext_insert_extent(handle_t *ha + ext4_ext_get_actual_len(newext)); if (unwritten) ext4_ext_mark_unwritten(ex); - eh = path[depth].p_hdr; nearex = ex; goto merge; } @@@ -2052,6 -2054,7 +2052,6 @@@ prepend + ext4_ext_get_actual_len(newext)); if (unwritten) ext4_ext_mark_unwritten(ex); - eh = path[depth].p_hdr; nearex = ex; goto merge; } @@@ -4404,8 -4407,7 +4404,7 @@@ retry err = ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); if (err == -ENOMEM) { - cond_resched(); - congestion_wait(BLK_RW_ASYNC, HZ/50); + memalloc_retry_wait(GFP_ATOMIC); goto retry; } if (err) @@@ -4413,8 -4415,7 +4412,7 @@@ retry_remove_space: err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); if (err == -ENOMEM) { - cond_resched(); - congestion_wait(BLK_RW_ASYNC, HZ/50); + memalloc_retry_wait(GFP_ATOMIC); goto retry_remove_space; } return err; @@@ -4644,6 -4645,8 +4642,6 @@@ static long ext4_zero_range(struct fil ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) goto out_handle; - ext4_fc_track_range(handle, inode, offset >> inode->i_sb->s_blocksize_bits, - (offset + len - 1) >> inode->i_sb->s_blocksize_bits); /* Zero out partial block at the edges of the range */ ret = ext4_zero_partial_blocks(handle, inode, offset, len); if (ret >= 0) @@@ -4692,6 -4695,8 +4690,6 @@@ long ext4_fallocate(struct file *file, FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP;
- ext4_fc_start_update(inode); - if (mode & FALLOC_FL_PUNCH_HOLE) { ret = ext4_punch_hole(inode, offset, len); goto exit; @@@ -4755,6 -4760,7 +4753,6 @@@ out inode_unlock(inode); trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); exit: - ext4_fc_stop_update(inode); return ret; }
@@@ -5336,7 -5342,7 +5334,7 @@@ static int ext4_collapse_range(struct i ret = PTR_ERR(handle); goto out_mmap; } - ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);
down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode, 0); @@@ -5375,6 -5381,7 +5373,6 @@@
out_stop: ext4_journal_stop(handle); - ext4_fc_stop_ineligible(sb); out_mmap: filemap_invalidate_unlock(mapping); out_mutex: @@@ -5476,7 -5483,7 +5474,7 @@@ static int ext4_insert_range(struct ino ret = PTR_ERR(handle); goto out_mmap; } - ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);
/* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; @@@ -5551,6 -5558,7 +5549,6 @@@
out_stop: ext4_journal_stop(handle); - ext4_fc_stop_ineligible(sb); out_mmap: filemap_invalidate_unlock(mapping); out_mutex: diff --combined fs/xfs/xfs_buf.c index bbb0fbd34e64,6c45e3fa56f4..b45e0d50a405 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@@ -394,7 -394,7 +394,7 @@@ xfs_buf_alloc_pages }
XFS_STATS_INC(bp->b_mount, xb_page_retries); - congestion_wait(BLK_RW_ASYNC, HZ / 50); + memalloc_retry_wait(gfp_mask); } return 0; } @@@ -1892,7 -1892,6 +1892,7 @@@ xfs_free_buftarg list_lru_destroy(&btp->bt_lru);
blkdev_issue_flush(btp->bt_bdev); + fs_put_dax(btp->bt_daxdev);
kmem_free(btp); } @@@ -1933,10 -1932,11 +1933,10 @@@ xfs_setsize_buftarg_early return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev)); }
-xfs_buftarg_t * +struct xfs_buftarg * xfs_alloc_buftarg( struct xfs_mount *mp, - struct block_device *bdev, - struct dax_device *dax_dev) + struct block_device *bdev) { xfs_buftarg_t *btp;
@@@ -1945,7 -1945,7 +1945,7 @@@ btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; btp->bt_bdev = bdev; - btp->bt_daxdev = dax_dev; + btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off);
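The buftarg now resolves its own dax_device rather than having the mount path pass one in. A hedged sketch of the lookup pattern for any block-device-backed filesystem; error handling is elided and the variable names are illustrative:

	u64 dax_part_off = 0;
	struct dax_device *dax_dev;

	/* NULL when the block device has no usable dax device behind it */
	dax_dev = fs_dax_get_by_bdev(bdev, &dax_part_off);
	if (dax_dev)
		;	/* dax_part_off is the partition start offset to
			   apply to offsets passed to the dax_device */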
/* * Buffer IO error rate limiting. Limit it to no more than 10 messages diff --combined include/linux/fs.h index f5d3bf5b69a6,5315fa68f751..42ab6d71291c --- a/include/linux/fs.h +++ b/include/linux/fs.h @@@ -41,7 -41,6 +41,7 @@@ #include <linux/stddef.h> #include <linux/mount.h> #include <linux/cred.h> +#include <linux/mnt_idmapping.h>
#include <asm/byteorder.h> #include <uapi/linux/fs.h> @@@ -1600,11 -1599,6 +1600,11 @@@ struct super_block struct list_head s_inodes_wb; /* writeback inodes */ } __randomize_layout;
+static inline struct user_namespace *i_user_ns(const struct inode *inode) +{ + return inode->i_sb->s_user_ns; +} + /* Helper functions so that in most cases filesystems will * not need to deal directly with kuid_t and kgid_t and can * instead deal with the raw numeric values that are stored @@@ -1612,22 -1606,50 +1612,22 @@@ */ static inline uid_t i_uid_read(const struct inode *inode) { - return from_kuid(inode->i_sb->s_user_ns, inode->i_uid); + return from_kuid(i_user_ns(inode), inode->i_uid); }
static inline gid_t i_gid_read(const struct inode *inode) { - return from_kgid(inode->i_sb->s_user_ns, inode->i_gid); + return from_kgid(i_user_ns(inode), inode->i_gid); }
static inline void i_uid_write(struct inode *inode, uid_t uid) { - inode->i_uid = make_kuid(inode->i_sb->s_user_ns, uid); + inode->i_uid = make_kuid(i_user_ns(inode), uid); }
static inline void i_gid_write(struct inode *inode, gid_t gid) { - inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid); -} - -/** - * kuid_into_mnt - map a kuid down into a mnt_userns - * @mnt_userns: user namespace of the relevant mount - * @kuid: kuid to be mapped - * - * Return: @kuid mapped according to @mnt_userns. - * If @kuid has no mapping INVALID_UID is returned. - */ -static inline kuid_t kuid_into_mnt(struct user_namespace *mnt_userns, - kuid_t kuid) -{ - return make_kuid(mnt_userns, __kuid_val(kuid)); -} - -/** - * kgid_into_mnt - map a kgid down into a mnt_userns - * @mnt_userns: user namespace of the relevant mount - * @kgid: kgid to be mapped - * - * Return: @kgid mapped according to @mnt_userns. - * If @kgid has no mapping INVALID_GID is returned. - */ -static inline kgid_t kgid_into_mnt(struct user_namespace *mnt_userns, - kgid_t kgid) -{ - return make_kgid(mnt_userns, __kgid_val(kgid)); + inode->i_gid = make_kgid(i_user_ns(inode), gid); }
/** @@@ -1641,7 -1663,7 +1641,7 @@@ static inline kuid_t i_uid_into_mnt(struct user_namespace *mnt_userns, const struct inode *inode) { - return kuid_into_mnt(mnt_userns, inode->i_uid); + return mapped_kuid_fs(mnt_userns, i_user_ns(inode), inode->i_uid); }
/** @@@ -1655,7 -1677,69 +1655,7 @@@ static inline kgid_t i_gid_into_mnt(struct user_namespace *mnt_userns, const struct inode *inode) { - return kgid_into_mnt(mnt_userns, inode->i_gid); -} - -/** - * kuid_from_mnt - map a kuid up into a mnt_userns - * @mnt_userns: user namespace of the relevant mount - * @kuid: kuid to be mapped - * - * Return: @kuid mapped up according to @mnt_userns. - * If @kuid has no mapping INVALID_UID is returned. - */ -static inline kuid_t kuid_from_mnt(struct user_namespace *mnt_userns, - kuid_t kuid) -{ - return KUIDT_INIT(from_kuid(mnt_userns, kuid)); -} - -/** - * kgid_from_mnt - map a kgid up into a mnt_userns - * @mnt_userns: user namespace of the relevant mount - * @kgid: kgid to be mapped - * - * Return: @kgid mapped up according to @mnt_userns. - * If @kgid has no mapping INVALID_GID is returned. - */ -static inline kgid_t kgid_from_mnt(struct user_namespace *mnt_userns, - kgid_t kgid) -{ - return KGIDT_INIT(from_kgid(mnt_userns, kgid)); -} - -/** - * mapped_fsuid - return caller's fsuid mapped up into a mnt_userns - * @mnt_userns: user namespace of the relevant mount - * - * Use this helper to initialize a new vfs or filesystem object based on - * the caller's fsuid. A common example is initializing the i_uid field of - * a newly allocated inode triggered by a creation event such as mkdir or - * O_CREAT. Other examples include the allocation of quotas for a specific - * user. - * - * Return: the caller's current fsuid mapped up according to @mnt_userns. - */ -static inline kuid_t mapped_fsuid(struct user_namespace *mnt_userns) -{ - return kuid_from_mnt(mnt_userns, current_fsuid()); -} - -/** - * mapped_fsgid - return caller's fsgid mapped up into a mnt_userns - * @mnt_userns: user namespace of the relevant mount - * - * Use this helper to initialize a new vfs or filesystem object based on - * the caller's fsgid. A common example is initializing the i_gid field of - * a newly allocated inode triggered by a creation event such as mkdir or - * O_CREAT. Other examples include the allocation of quotas for a specific - * user. - * - * Return: the caller's current fsgid mapped up according to @mnt_userns. - */ -static inline kgid_t mapped_fsgid(struct user_namespace *mnt_userns) -{ - return kgid_from_mnt(mnt_userns, current_fsgid()); + return mapped_kgid_fs(mnt_userns, i_user_ns(inode), inode->i_gid); }
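As a small usage illustration of these idmapped-mount helpers, the kind of ownership check the VFS permission code performs (a sketch, not code from this patch):

static bool caller_owns(struct user_namespace *mnt_userns,
			const struct inode *inode)
{
	/* inode owner as seen through the (possibly idmapped) mount */
	kuid_t owner = i_uid_into_mnt(mnt_userns, inode);

	return uid_eq(current_fsuid(), owner);
}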
/** @@@ -1669,7 -1753,7 +1669,7 @@@ static inline void inode_fsuid_set(struct inode *inode, struct user_namespace *mnt_userns) { - inode->i_uid = mapped_fsuid(mnt_userns); + inode->i_uid = mapped_fsuid(mnt_userns, i_user_ns(inode)); }
/** @@@ -1683,7 -1767,7 +1683,7 @@@ static inline void inode_fsgid_set(struct inode *inode, struct user_namespace *mnt_userns) { - inode->i_gid = mapped_fsgid(mnt_userns); + inode->i_gid = mapped_fsgid(mnt_userns, i_user_ns(inode)); }
/** @@@ -1700,18 -1784,10 +1700,18 @@@ static inline bool fsuidgid_has_mapping(struct super_block *sb, struct user_namespace *mnt_userns) { - struct user_namespace *s_user_ns = sb->s_user_ns; + struct user_namespace *fs_userns = sb->s_user_ns; + kuid_t kuid; + kgid_t kgid;
- return kuid_has_mapping(s_user_ns, mapped_fsuid(mnt_userns)) && - kgid_has_mapping(s_user_ns, mapped_fsgid(mnt_userns)); + kuid = mapped_fsuid(mnt_userns, fs_userns); + if (!uid_valid(kuid)) + return false; + kgid = mapped_fsgid(mnt_userns, fs_userns); + if (!gid_valid(kgid)) + return false; + return kuid_has_mapping(fs_userns, kuid) && + kgid_has_mapping(fs_userns, kgid); }
extern struct timespec64 current_time(struct inode *inode); @@@ -2173,7 -2249,6 +2173,7 @@@ struct super_operations #define S_ENCRYPTED (1 << 14) /* Encrypted file (using fs/crypto/) */ #define S_CASEFOLD (1 << 15) /* Casefolded file */ #define S_VERITY (1 << 16) /* Verity file (using fs/verity/) */ +#define S_KERNEL_FILE (1 << 17) /* File is in use by the kernel (eg. fs/cachefiles) */
/* * Note that nosuid etc flags are inode-specific: setting some file-system @@@ -2343,8 -2418,6 +2343,8 @@@ static inline void kiocb_clone(struct k * Used to detect that mark_inode_dirty() should not move * inode between dirty lists. * + * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback. + * * Q: What is the difference between I_WILL_FREE and I_FREEING? */ #define I_DIRTY_SYNC (1 << 0) @@@ -2367,7 -2440,6 +2367,7 @@@ #define I_CREATING (1 << 15) #define I_DONTCACHE (1 << 16) #define I_SYNC_QUEUED (1 << 17) +#define I_PINNING_FSCACHE_WB (1 << 18)
#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) @@@ -2652,21 -2724,6 +2652,21 @@@ static inline struct user_namespace *fi { return mnt_user_ns(file->f_path.mnt); } + +/** + * is_idmapped_mnt - check whether a mount is mapped + * @mnt: the mount to check + * + * If @mnt has an idmapping attached different from the + * filesystem's idmapping then @mnt is mapped. + * + * Return: true if mount is mapped, false if not. + */ +static inline bool is_idmapped_mnt(const struct vfsmount *mnt) +{ + return mnt_user_ns(mnt) != mnt->mnt_sb->s_user_ns; +} + extern long vfs_truncate(const struct path *, loff_t); int do_truncate(struct user_namespace *, struct dentry *, loff_t start, unsigned int time_attrs, struct file *filp); @@@ -2790,6 -2847,8 +2790,6 @@@ static inline int filemap_fdatawait(str
extern bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend); -extern bool filemap_range_needs_writeback(struct address_space *, - loff_t lstart, loff_t lend); extern int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); extern int __filemap_fdatawrite_range(struct address_space *mapping, @@@ -3093,6 -3152,7 +3093,7 @@@ extern void unlock_new_inode(struct ino extern void discard_new_inode(struct inode *); extern unsigned int get_next_ino(void); extern void evict_inodes(struct super_block *sb); + void dump_mapping(const struct address_space *);
/* * Userspace may rely on the inode number being non-zero. For example, glibc diff --combined include/linux/kasan.h index fb78108d694e,89c99e5e67de..4a45562d8893 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@@ -9,7 -9,6 +9,7 @@@
struct kmem_cache; struct page; +struct slab; struct vm_struct; struct task_struct;
@@@ -194,11 -193,11 +194,11 @@@ static __always_inline size_t kasan_met return 0; }
-void __kasan_poison_slab(struct page *page); -static __always_inline void kasan_poison_slab(struct page *page) +void __kasan_poison_slab(struct slab *slab); +static __always_inline void kasan_poison_slab(struct slab *slab) { if (kasan_enabled()) - __kasan_poison_slab(page); + __kasan_poison_slab(slab); }
void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object); @@@ -323,7 -322,7 +323,7 @@@ static inline void kasan_cache_create(s slab_flags_t *flags) {} static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {} static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } -static inline void kasan_poison_slab(struct page *page) {} +static inline void kasan_poison_slab(struct slab *slab) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) {} static inline void kasan_poison_object_data(struct kmem_cache *cache, @@@ -475,12 -474,12 +475,12 @@@ static inline void kasan_populate_early * allocations with real shadow memory. With KASAN vmalloc, the special * case is unnecessary, as the work is handled in the generic case. */ - int kasan_module_alloc(void *addr, size_t size); + int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask); void kasan_free_shadow(const struct vm_struct *vm);
#else /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */
- static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } + static inline int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) { return 0; } static inline void kasan_free_shadow(const struct vm_struct *vm) {}
#endif /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ diff --combined include/linux/memcontrol.h index e34112f6a369,0131e5574c88..b72d75141e12 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@@ -33,6 -33,7 +33,7 @@@ enum memcg_stat_item MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS, MEMCG_SOCK, MEMCG_PERCPU_B, + MEMCG_VMALLOC, MEMCG_NR_STAT, };
@@@ -42,6 -43,7 +43,7 @@@ enum memcg_memory_event MEMCG_MAX, MEMCG_OOM, MEMCG_OOM_KILL, + MEMCG_OOM_GROUP_KILL, MEMCG_SWAP_HIGH, MEMCG_SWAP_MAX, MEMCG_SWAP_FAIL, @@@ -536,6 -538,45 +538,6 @@@ static inline bool folio_memcg_kmem(str return folio->memcg_data & MEMCG_DATA_KMEM; }
-/* - * page_objcgs - get the object cgroups vector associated with a page - * @page: a pointer to the page struct - * - * Returns a pointer to the object cgroups vector associated with the page, - * or NULL. This function assumes that the page is known to have an - * associated object cgroups vector. It's not safe to call this function - * against pages, which might have an associated memory cgroup: e.g. - * kernel stack pages. - */ -static inline struct obj_cgroup **page_objcgs(struct page *page) -{ - unsigned long memcg_data = READ_ONCE(page->memcg_data); - - VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), page); - VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page); - - return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); -} - -/* - * page_objcgs_check - get the object cgroups vector associated with a page - * @page: a pointer to the page struct - * - * Returns a pointer to the object cgroups vector associated with the page, - * or NULL. This function is safe to use if the page can be directly associated - * with a memory cgroup. - */ -static inline struct obj_cgroup **page_objcgs_check(struct page *page) -{ - unsigned long memcg_data = READ_ONCE(page->memcg_data); - - if (!memcg_data || !(memcg_data & MEMCG_DATA_OBJCGS)) - return NULL; - - VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page); - - return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); -}
#else static inline bool folio_memcg_kmem(struct folio *folio) @@@ -543,6 -584,15 +545,6 @@@ return false; }
-static inline struct obj_cgroup **page_objcgs(struct page *page) -{ - return NULL; -} - -static inline struct obj_cgroup **page_objcgs_check(struct page *page) -{ - return NULL; -} #endif
static inline bool PageMemcgKmem(struct page *page) @@@ -943,6 -993,21 +945,21 @@@ static inline void mod_memcg_state(stru local_irq_restore(flags); }
+ static inline void mod_memcg_page_state(struct page *page, + int idx, int val) + { + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return; + + rcu_read_lock(); + memcg = page_memcg(page); + if (memcg) + mod_memcg_state(memcg, idx, val); + rcu_read_unlock(); + } + static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { return READ_ONCE(memcg->vmstats.state[idx]); @@@ -1398,6 -1463,11 +1415,11 @@@ static inline void mod_memcg_state(stru { }
+ static inline void mod_memcg_page_state(struct page *page, + int idx, int val) + { + } + static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { return 0; diff --combined include/linux/memremap.h index a8bc588fe7aa,61a6a0e27359..1fafcc38acba --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@@ -72,6 -72,16 +72,6 @@@ struct dev_pagemap_ops */ void (*page_free)(struct page *page);
- /* - * Transition the refcount in struct dev_pagemap to the dead state. - */ - void (*kill)(struct dev_pagemap *pgmap); - - /* - * Wait for refcount in struct dev_pagemap to be idle and reap it. - */ - void (*cleanup)(struct dev_pagemap *pgmap); - /* * Used for private (un-addressable) device memory only. Must migrate * the page back to a CPU accessible page. @@@ -85,9 -95,15 +85,14 @@@ * struct dev_pagemap - metadata for ZONE_DEVICE mappings * @altmap: pre-allocated/reserved memory for vmemmap allocations * @ref: reference count that pins the devm_memremap_pages() mapping - * @internal_ref: internal reference if @ref is not provided by the caller - * @done: completion for @internal_ref + * @done: completion for @ref * @type: memory type: see MEMORY_* in memory_hotplug.h * @flags: PGMAP_* flags to specify defailed behavior + * @vmemmap_shift: structural definition of how the vmemmap page metadata + * is populated, specifically the metadata page order. + * A zero value (default) uses base pages as the vmemmap metadata + * representation. A bigger value will set up compound struct pages + * of the requested order value. * @ops: method table * @owner: an opaque pointer identifying the entity that manages this * instance. Used by various helpers to make sure that no @@@ -98,10 -114,12 +103,11 @@@ */ struct dev_pagemap { struct vmem_altmap altmap; - struct percpu_ref *ref; - struct percpu_ref internal_ref; + struct percpu_ref ref; struct completion done; enum memory_type type; unsigned int flags; + unsigned long vmemmap_shift; const struct dev_pagemap_ops *ops; void *owner; int nr_range; @@@ -118,6 -136,11 +124,11 @@@ static inline struct vmem_altmap *pgmap return NULL; }
+ static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap) + { + return 1 << pgmap->vmemmap_shift; + } + #ifdef CONFIG_ZONE_DEVICE void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); @@@ -179,7 -202,7 +190,7 @@@ static inline unsigned long memremap_co static inline void put_dev_pagemap(struct dev_pagemap *pgmap) { if (pgmap) - percpu_ref_put(pgmap->ref); + percpu_ref_put(&pgmap->ref); }
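A quick numeric illustration of the new vmemmap_shift field, assuming 4 KiB base pages and a 2 MiB device-dax alignment (example values only):

	pgmap->vmemmap_shift = order_base_2(SZ_2M >> PAGE_SHIFT);	/* 9 */
	/*
	 * pgmap_vmemmap_nr(pgmap) == 1 << 9 == 512 base pages per compound
	 * metadata page; a zero shift keeps the per-base-page representation.
	 */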
#endif /* _LINUX_MEMREMAP_H_ */ diff --combined include/linux/mm.h index c768a7c81b0b,d4fb49a5d60d..aa47705191bc --- a/include/linux/mm.h +++ b/include/linux/mm.h @@@ -424,51 -424,6 +424,6 @@@ extern unsigned int kobjsize(const voi */ extern pgprot_t protection_map[16];
- /** - * enum fault_flag - Fault flag definitions. - * @FAULT_FLAG_WRITE: Fault was a write fault. - * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE. - * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked. - * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying. - * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region. - * @FAULT_FLAG_TRIED: The fault has been tried once. - * @FAULT_FLAG_USER: The fault originated in userspace. - * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. - * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. - * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. - * - * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify - * whether we would allow page faults to retry by specifying these two - * fault flags correctly. Currently there can be three legal combinations: - * - * (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and - * this is the first try - * - * (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and - * we've already tried at least once - * - * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry - * - * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never - * be used. Note that page faults can be allowed to retry for multiple times, - * in which case we'll have an initial fault with flags (a) then later on - * continuous faults with flags (b). We should always try to detect pending - * signals before a retry to make sure the continuous page faults can still be - * interrupted if necessary. - */ - enum fault_flag { - FAULT_FLAG_WRITE = 1 << 0, - FAULT_FLAG_MKWRITE = 1 << 1, - FAULT_FLAG_ALLOW_RETRY = 1 << 2, - FAULT_FLAG_RETRY_NOWAIT = 1 << 3, - FAULT_FLAG_KILLABLE = 1 << 4, - FAULT_FLAG_TRIED = 1 << 5, - FAULT_FLAG_USER = 1 << 6, - FAULT_FLAG_REMOTE = 1 << 7, - FAULT_FLAG_INSTRUCTION = 1 << 8, - FAULT_FLAG_INTERRUPTIBLE = 1 << 9, - }; - /* * The default fault flags that should be used by most of the * arch-specific page fault handlers. @@@ -577,6 -532,10 +532,10 @@@ enum page_entry_size */ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); + /** + * @close: Called when the VMA is being removed from the MM. + * Context: User context. May sleep. Caller holds mmap_lock. + */ void (*close)(struct vm_area_struct * area); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *area, unsigned long addr); @@@ -714,27 -673,6 +673,27 @@@ int vma_is_stack_for_current(struct vm_ struct mmu_gather; struct inode;
+static inline unsigned int compound_order(struct page *page) +{ + if (!PageHead(page)) + return 0; + return page[1].compound_order; +} + +/** + * folio_order - The allocation order of a folio. + * @folio: The folio. + * + * A folio is composed of 2^order pages. See get_order() for the definition + * of order. + * + * Return: The order of the folio. + */ +static inline unsigned int folio_order(struct folio *folio) +{ + return compound_order(&folio->page); +} + #include <linux/huge_mm.h>
/* @@@ -861,19 -799,15 +820,15 @@@ static inline int page_mapcount(struct
#ifdef CONFIG_TRANSPARENT_HUGEPAGE int total_mapcount(struct page *page); - int page_trans_huge_mapcount(struct page *page, int *total_mapcount); + int page_trans_huge_mapcount(struct page *page); #else static inline int total_mapcount(struct page *page) { return page_mapcount(page); } - static inline int page_trans_huge_mapcount(struct page *page, - int *total_mapcount) + static inline int page_trans_huge_mapcount(struct page *page) { - int mapcount = page_mapcount(page); - if (total_mapcount) - *total_mapcount = mapcount; - return mapcount; + return page_mapcount(page); } #endif
@@@ -884,13 -818,6 +839,13 @@@ static inline struct page *virt_to_head return compound_head(page); }
+static inline struct folio *virt_to_folio(const void *x) +{ + struct page *page = virt_to_page(x); + + return page_folio(page); +} + void __put_page(struct page *page);
void put_pages_list(struct list_head *pages); @@@ -934,6 -861,27 +889,6 @@@ static inline void destroy_compound_pag compound_page_dtors[page[1].compound_dtor](page); }
-static inline unsigned int compound_order(struct page *page) -{ - if (!PageHead(page)) - return 0; - return page[1].compound_order; -} - -/** - * folio_order - The allocation order of a folio. - * @folio: The folio. - * - * A folio is composed of 2^order pages. See get_order() for the definition - * of order. - * - * Return: The order of the folio. - */ -static inline unsigned int folio_order(struct folio *folio) -{ - return compound_order(&folio->page); -} - static inline bool hpage_pincount_available(struct page *page) { /* @@@ -1760,11 -1708,6 +1715,11 @@@ void page_address_init(void) #define page_address_init() do { } while(0) #endif
+static inline void *folio_address(const struct folio *folio) +{ + return page_address(&folio->page); +} + extern void *page_rmapping(struct page *page); extern struct anon_vma *page_anon_vma(struct page *page); extern pgoff_t __page_file_index(struct page *page); @@@ -1837,6 -1780,28 +1792,6 @@@ static inline bool can_do_mlock(void) extern int user_shm_lock(size_t, struct ucounts *); extern void user_shm_unlock(size_t, struct ucounts *);
-/* - * Parameter block passed down to zap_pte_range in exceptional cases. - */ -struct zap_details { - struct address_space *zap_mapping; /* Check page->mapping if set */ - struct page *single_page; /* Locked page to be unmapped */ -}; - -/* - * We set details->zap_mappings when we want to unmap shared but keep private - * pages. Return true if skip zapping this page, false otherwise. - */ -static inline bool -zap_skip_check_mapping(struct zap_details *details, struct page *page) -{ - if (!details || !page) - return false; - - return details->zap_mapping && - (details->zap_mapping != page_rmapping(page)); -} - struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte); struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, @@@ -1871,6 -1836,7 +1826,6 @@@ extern void truncate_pagecache(struct i extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); -int truncate_inode_page(struct address_space *mapping, struct page *page); int generic_error_remove_page(struct address_space *mapping, struct page *page); int invalidate_inode_page(struct page *page);
@@@ -1881,6 -1847,7 +1836,6 @@@ extern vm_fault_t handle_mm_fault(struc extern int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); -void unmap_mapping_page(struct page *page); void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows); void unmap_mapping_range(struct address_space *mapping, @@@ -1901,6 -1868,7 +1856,6 @@@ static inline int fixup_user_fault(stru BUG(); return -EFAULT; } -static inline void unmap_mapping_page(struct page *page) { } static inline void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows) { } static inline void unmap_mapping_range(struct address_space *mapping, @@@ -1957,6 -1925,7 +1912,6 @@@ int get_kernel_pages(const struct kvec struct page **pages); struct page *get_dump_page(unsigned long addr);
-extern int try_to_release_page(struct page * page, gfp_t gfp_mask); extern void do_invalidatepage(struct page *page, unsigned int offset, unsigned int length);
@@@ -2644,7 -2613,7 +2599,7 @@@ static inline int vma_adjust(struct vm_ extern struct vm_area_struct *vma_merge(struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *, struct vm_userfaultfd_ctx); + struct mempolicy *, struct vm_userfaultfd_ctx, const char *); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int __split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int new_below); @@@ -3153,7 -3122,6 +3108,6 @@@ int drop_caches_sysctl_handler(struct c #endif
void drop_slab(void); - void drop_slab_node(int nid);
#ifndef CONFIG_MMU #define randomize_va_space 0 @@@ -3206,6 -3174,7 +3160,7 @@@ enum mf_flags MF_ACTION_REQUIRED = 1 << 1, MF_MUST_KILL = 1 << 2, MF_SOFT_OFFLINE = 1 << 3, + MF_UNPOISON = 1 << 4, }; extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue(unsigned long pfn, int flags); @@@ -3217,19 -3186,6 +3172,19 @@@ extern void shake_page(struct page *p) extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags);
+#ifndef arch_memory_failure +static inline int arch_memory_failure(unsigned long pfn, int flags) +{ + return -ENXIO; +} +#endif + +#ifndef arch_is_platform_page +static inline bool arch_is_platform_page(u64 paddr) +{ + return false; +} +#endif
/* * Error handlers for various types of pages. @@@ -3246,7 -3202,6 +3201,6 @@@ enum mf_action_page_type MF_MSG_KERNEL_HIGH_ORDER, MF_MSG_SLAB, MF_MSG_DIFFERENT_COMPOUND, - MF_MSG_POISONED_HUGE, MF_MSG_HUGE, MF_MSG_FREE_HUGE, MF_MSG_NON_PMD_HUGE, @@@ -3261,7 -3216,6 +3215,6 @@@ MF_MSG_CLEAN_LRU, MF_MSG_TRUNCATED_LRU, MF_MSG_BUDDY, - MF_MSG_BUDDY_2ND, MF_MSG_DAX, MF_MSG_UNSPLIT_THP, MF_MSG_UNKNOWN, @@@ -3390,5 -3344,16 +3343,16 @@@ static inline int seal_check_future_wri return 0; }
+ #ifdef CONFIG_ANON_VMA_NAME + int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, const char *name); + #else + static inline int + madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, const char *name) { + return 0; + } + #endif + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --combined include/linux/mm_types.h index 1ae3537c7920,e3b0476a4fda..3764c1b51b02 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@@ -5,6 -5,7 +5,7 @@@ #include <linux/mm_types_task.h>
#include <linux/auxvec.h> + #include <linux/kref.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/rbtree.h> @@@ -56,11 -57,11 +57,11 @@@ struct mem_cgroup * in each subpage, but you may need to restore some of their values * afterwards. * - * SLUB uses cmpxchg_double() to atomically update its freelist and - * counters. That requires that freelist & counters be adjacent and - * double-word aligned. We align all struct pages to double-word - * boundaries, and ensure that 'freelist' is aligned within the - * struct. + * SLUB uses cmpxchg_double() to atomically update its freelist and counters. + * That requires that freelist & counters in struct slab be adjacent and + * double-word aligned. Because struct slab currently just reinterprets the + * bits of struct page, we align all struct pages to double-word boundaries, + * and ensure that 'freelist' is aligned within struct slab. */ #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE #define _struct_page_alignment __aligned(2 * sizeof(unsigned long)) @@@ -386,6 -387,12 +387,12 @@@ struct vm_userfaultfd_ctx struct vm_userfaultfd_ctx {}; #endif /* CONFIG_USERFAULTFD */
+ struct anon_vma_name { + struct kref kref; + /* The name needs to be at the end because it is dynamically sized. */ + char name[]; + }; + /* * This struct describes a virtual memory area. There is one of these * per VM-area/task. A VM area is any part of the process virtual memory @@@ -426,11 -433,19 +433,19 @@@ struct vm_area_struct /* * For areas with an address space and backing store, * linkage into the address_space->i_mmap interval tree. + * + * For private anonymous mappings, a pointer to a null terminated string + * containing the name given to the vma, or NULL if unnamed. */ - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } shared; + + union { + struct { + struct rb_node rb; + unsigned long rb_subtree_last; + } shared; + /* Serialized by mmap_sem. */ + struct anon_vma_name *anon_name; + };
/* * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma @@@ -632,7 -647,7 +647,7 @@@ struct mm_struct atomic_t tlb_flush_pending; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* See flush_tlb_batched_pending() */ - bool tlb_flush_batched; + atomic_t tlb_flush_batched; #endif struct uprobes_state uprobes_state; #ifdef CONFIG_PREEMPT_RT @@@ -677,90 -692,6 +692,6 @@@ extern void tlb_gather_mmu(struct mmu_g extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_finish_mmu(struct mmu_gather *tlb);
- static inline void init_tlb_flush_pending(struct mm_struct *mm) - { - atomic_set(&mm->tlb_flush_pending, 0); - } - - static inline void inc_tlb_flush_pending(struct mm_struct *mm) - { - atomic_inc(&mm->tlb_flush_pending); - /* - * The only time this value is relevant is when there are indeed pages - * to flush. And we'll only flush pages after changing them, which - * requires the PTL. - * - * So the ordering here is: - * - * atomic_inc(&mm->tlb_flush_pending); - * spin_lock(&ptl); - * ... - * set_pte_at(); - * spin_unlock(&ptl); - * - * spin_lock(&ptl) - * mm_tlb_flush_pending(); - * .... - * spin_unlock(&ptl); - * - * flush_tlb_range(); - * atomic_dec(&mm->tlb_flush_pending); - * - * Where the increment if constrained by the PTL unlock, it thus - * ensures that the increment is visible if the PTE modification is - * visible. After all, if there is no PTE modification, nobody cares - * about TLB flushes either. - * - * This very much relies on users (mm_tlb_flush_pending() and - * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and - * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc - * locks (PPC) the unlock of one doesn't order against the lock of - * another PTL. - * - * The decrement is ordered by the flush_tlb_range(), such that - * mm_tlb_flush_pending() will not return false unless all flushes have - * completed. - */ - } - - static inline void dec_tlb_flush_pending(struct mm_struct *mm) - { - /* - * See inc_tlb_flush_pending(). - * - * This cannot be smp_mb__before_atomic() because smp_mb() simply does - * not order against TLB invalidate completion, which is what we need. - * - * Therefore we must rely on tlb_flush_*() to guarantee order. - */ - atomic_dec(&mm->tlb_flush_pending); - } - - static inline bool mm_tlb_flush_pending(struct mm_struct *mm) - { - /* - * Must be called after having acquired the PTL; orders against that - * PTLs release and therefore ensures that if we observe the modified - * PTE we must also observe the increment from inc_tlb_flush_pending(). - * - * That is, it only guarantees to return true if there is a flush - * pending for _this_ PTL. - */ - return atomic_read(&mm->tlb_flush_pending); - } - - static inline bool mm_tlb_flush_nested(struct mm_struct *mm) - { - /* - * Similar to mm_tlb_flush_pending(), we must have acquired the PTL - * for which there is a TLB flush pending in order to guarantee - * we've seen both that PTE modification and the increment. - * - * (no requirement on actually still holding the PTL, that is irrelevant) - */ - return atomic_read(&mm->tlb_flush_pending) > 1; - } - struct vm_fault;
/** @@@ -875,4 -806,49 +806,49 @@@ typedef struct unsigned long val; } swp_entry_t;
+ /** + * enum fault_flag - Fault flag definitions. + * @FAULT_FLAG_WRITE: Fault was a write fault. + * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE. + * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked. + * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying. + * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region. + * @FAULT_FLAG_TRIED: The fault has been tried once. + * @FAULT_FLAG_USER: The fault originated in userspace. + * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. + * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. + * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. + * + * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify + * whether we would allow page faults to retry by specifying these two + * fault flags correctly. Currently there can be three legal combinations: + * + * (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and + * this is the first try + * + * (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and + * we've already tried at least once + * + * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry + * + * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never + * be used. Note that page faults can be allowed to retry for multiple times, + * in which case we'll have an initial fault with flags (a) then later on + * continuous faults with flags (b). We should always try to detect pending + * signals before a retry to make sure the continuous page faults can still be + * interrupted if necessary. + */ + enum fault_flag { + FAULT_FLAG_WRITE = 1 << 0, + FAULT_FLAG_MKWRITE = 1 << 1, + FAULT_FLAG_ALLOW_RETRY = 1 << 2, + FAULT_FLAG_RETRY_NOWAIT = 1 << 3, + FAULT_FLAG_KILLABLE = 1 << 4, + FAULT_FLAG_TRIED = 1 << 5, + FAULT_FLAG_USER = 1 << 6, + FAULT_FLAG_REMOTE = 1 << 7, + FAULT_FLAG_INSTRUCTION = 1 << 8, + FAULT_FLAG_INTERRUPTIBLE = 1 << 9, + }; + #endif /* _LINUX_MM_TYPES_H */ diff --combined include/linux/page-flags.h index b3d353d537e2,7e2b90dc7d3f..129421002443 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@@ -68,6 -68,9 +68,6 @@@ * might lose their PG_swapbacked flag when they simply can be dropped (e.g. as * a result of MADV_FREE). * - * PG_uptodate tells whether the page's contents is valid. When a read - * completes, the page becomes uptodate, unless a disk I/O error happened. - * * PG_referenced, PG_reclaim are used for page reclaim for anonymous and * file-backed pagecache (see mm/vmscan.c). * @@@ -380,7 -383,7 +380,7 @@@ static __always_inline int TestClearPag TESTCLEARFLAG(uname, lname, policy)
#define TESTPAGEFLAG_FALSE(uname, lname) \ - static inline bool folio_test_##lname(const struct folio *folio) { return 0; } \ + static inline bool folio_test_##lname(const struct folio *folio) { return false; } \ static inline int Page##uname(const struct page *page) { return 0; }
#define SETPAGEFLAG_NOOP(uname, lname) \ @@@ -519,7 -522,11 +519,11 @@@ PAGEFLAG_FALSE(Uncached, uncached PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) + #define MAGIC_HWPOISON 0x48575053U /* HWPS */ + extern void SetPageHWPoisonTakenOff(struct page *page); + extern void ClearPageHWPoisonTakenOff(struct page *page); extern bool take_page_off_buddy(struct page *page); + extern bool put_page_back_buddy(struct page *page); #else PAGEFLAG_FALSE(HWPoison, hwpoison) #define __PG_HWPOISON 0 @@@ -612,16 -619,6 +616,16 @@@ TESTPAGEFLAG_FALSE(Ksm, ksm
u64 stable_page_flags(struct page *page);
+/** + * folio_test_uptodate - Is this folio up to date? + * @folio: The folio. + * + * The uptodate flag is set on a folio when every byte in the folio is + * at least as new as the corresponding bytes on storage. Anonymous + * and CoW folios are always uptodate. If the folio is not uptodate, + * some of the bytes in it may be; see the is_partially_uptodate() + * address_space operation. + */ static inline bool folio_test_uptodate(struct folio *folio) { bool ret = test_bit(PG_uptodate, folio_flags(folio, 0)); diff --combined kernel/fork.c index 3161d7980155,75737e566441..1c989cc4208a --- a/kernel/fork.c +++ b/kernel/fork.c @@@ -42,6 -42,7 +42,7 @@@ #include <linux/mmu_notifier.h> #include <linux/fs.h> #include <linux/mm.h> + #include <linux/mm_inline.h> #include <linux/vmacache.h> #include <linux/nsproxy.h> #include <linux/capability.h> @@@ -365,12 -366,14 +366,14 @@@ struct vm_area_struct *vm_area_dup(stru *new = data_race(*orig); INIT_LIST_HEAD(&new->anon_vma_chain); new->vm_next = new->vm_prev = NULL; + dup_vma_anon_name(orig, new); } return new; }
void vm_area_free(struct vm_area_struct *vma) { + free_vma_anon_name(vma); kmem_cache_free(vm_area_cachep, vma); }
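The anon_name handling above ties into the anonymous-VMA naming interface added elsewhere in this series. The userspace side is assumed to look roughly like the following prctl() call; PR_SET_VMA, PR_SET_VMA_ANON_NAME and the /proc/<pid>/maps rendering are stated here as assumptions about that interface, not something shown in this diff:

#include <sys/mman.h>
#include <sys/prctl.h>
#include <linux/prctl.h>	/* PR_SET_VMA, PR_SET_VMA_ANON_NAME */

static void *alloc_named_anon(size_t len, const char *name)
{
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	/* expected to appear as "[anon:<name>]" in /proc/<pid>/maps */
	if (p != MAP_FAILED)
		prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
		      (unsigned long)p, len, (unsigned long)name);
	return p;
}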
@@@ -1556,6 -1559,32 +1559,6 @@@ out return error; }
-static int copy_io(unsigned long clone_flags, struct task_struct *tsk) -{ -#ifdef CONFIG_BLOCK - struct io_context *ioc = current->io_context; - struct io_context *new_ioc; - - if (!ioc) - return 0; - /* - * Share io context with parent, if CLONE_IO is set - */ - if (clone_flags & CLONE_IO) { - ioc_task_link(ioc); - tsk->io_context = ioc; - } else if (ioprio_valid(ioc->ioprio)) { - new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); - if (unlikely(!new_ioc)) - return -ENOMEM; - - new_ioc->ioprio = ioc->ioprio; - put_io_context(new_ioc); - } -#endif - return 0; -} - static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; diff --combined kernel/rcu/rcutorture.c index 33ea446101b3,42bc66a2f170..422f7e4cc08d --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@@ -46,7 -46,6 +46,7 @@@ #include <linux/oom.h> #include <linux/tick.h> #include <linux/rcupdate_trace.h> +#include <linux/nmi.h>
#include "rcu.h"
@@@ -54,18 -53,15 +54,18 @@@ MODULE_LICENSE("GPL") MODULE_AUTHOR("Paul E. McKenney paulmck@linux.ibm.com and Josh Triplett josh@joshtriplett.org");
/* Bits for ->extendables field, extendables param, and related definitions. */ -#define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */ -#define RCUTORTURE_RDR_MASK ((1 << RCUTORTURE_RDR_SHIFT) - 1) +#define RCUTORTURE_RDR_SHIFT_1 8 /* Put SRCU index in upper bits. */ +#define RCUTORTURE_RDR_MASK_1 (1 << RCUTORTURE_RDR_SHIFT_1) +#define RCUTORTURE_RDR_SHIFT_2 9 /* Put SRCU index in upper bits. */ +#define RCUTORTURE_RDR_MASK_2 (1 << RCUTORTURE_RDR_SHIFT_2) #define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */ #define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */ #define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */ #define RCUTORTURE_RDR_RBH 0x08 /* ... rcu_read_lock_bh(). */ #define RCUTORTURE_RDR_SCHED 0x10 /* ... rcu_read_lock_sched(). */ -#define RCUTORTURE_RDR_RCU 0x20 /* ... entering another RCU reader. */ -#define RCUTORTURE_RDR_NBITS 6 /* Number of bits defined above. */ +#define RCUTORTURE_RDR_RCU_1 0x20 /* ... entering another RCU reader. */ +#define RCUTORTURE_RDR_RCU_2 0x40 /* ... entering another RCU reader. */ +#define RCUTORTURE_RDR_NBITS 7 /* Number of bits defined above. */ #define RCUTORTURE_MAX_EXTEND \ (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \ RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED) @@@ -79,7 -75,7 +79,7 @@@ torture_param(int, fqs_duration, 0 "Duration of fqs bursts (us), 0 to disable"); torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)"); -torture_param(bool, fwd_progress, 1, "Test grace-period forward progress"); +torture_param(int, fwd_progress, 1, "Test grace-period forward progress"); torture_param(int, fwd_progress_div, 4, "Fraction of CPU stall to wait"); torture_param(int, fwd_progress_holdoff, 60, "Time between forward-progress tests (s)"); @@@ -113,8 -109,6 +113,8 @@@ torture_param(int, shutdown_secs, 0, "S torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s)."); +torture_param(bool, stall_no_softlockup, false, + "Avoid softlockup warning during cpu stall."); torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling."); torture_param(int, stall_cpu_block, 0, "Sleep while stalling."); torture_param(int, stall_gp_kthread, 0, @@@ -146,7 -140,7 +146,7 @@@ static struct task_struct *stats_task static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; static struct task_struct *stall_task; -static struct task_struct *fwd_prog_task; +static struct task_struct **fwd_prog_tasks; static struct task_struct **barrier_cbs_tasks; static struct task_struct *barrier_task; static struct task_struct *read_exit_task; @@@ -348,12 -342,10 +348,12 @@@ struct rcu_torture_ops void (*gp_kthread_dbg)(void); bool (*check_boost_failed)(unsigned long gp_state, int *cpup); int (*stall_dur)(void); + long cbflood_max; int irq_capable; int can_boost; int extendables; int slow_gps; + int no_pi_lock; const char *name; };
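The two shift/mask pairs above keep the SRCU reader indices in bits 8 and 9 of the reader-state word while the extendables flags stay in the low bits. A stand-alone model of that packing, with the constants copied only for illustration:

/* Stand-alone model of the reader-state packing above (illustrative only). */
#include <assert.h>
#include <stdio.h>

#define SHIFT_1   8      /* RCUTORTURE_RDR_SHIFT_1 */
#define SHIFT_2   9      /* RCUTORTURE_RDR_SHIFT_2 */
#define RDR_RCU_1 0x20
#define RDR_RCU_2 0x40

static unsigned int pack(unsigned int idx1, unsigned int idx2, unsigned int flags)
{
        return ((idx1 & 0x1) << SHIFT_1) | ((idx2 & 0x1) << SHIFT_2) | flags;
}

int main(void)
{
        unsigned int readstate = pack(1, 0, RDR_RCU_1 | RDR_RCU_2);

        assert(((readstate >> SHIFT_1) & 0x1) == 1);  /* outer SRCU index */
        assert(((readstate >> SHIFT_2) & 0x1) == 0);  /* nested SRCU index */
        printf("readstate = %#x\n", readstate);
        return 0;
}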
@@@ -675,7 -667,6 +675,7 @@@ static struct rcu_torture_ops srcu_ops .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, .irq_capable = 1, + .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .name = "srcu" };
@@@ -709,7 -700,6 +709,7 @@@ static struct rcu_torture_ops srcud_op .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, .irq_capable = 1, + .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .name = "srcud" };
@@@ -730,7 -720,6 +730,7 @@@ static struct rcu_torture_ops busted_sr .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, .irq_capable = 1, + .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .extendables = RCUTORTURE_MAX_EXTEND, .name = "busted_srcud" }; @@@ -842,7 -831,6 +842,7 @@@ static struct rcu_torture_ops tasks_rud .call = call_rcu_tasks_rude, .cb_barrier = rcu_barrier_tasks_rude, .gp_kthread_dbg = show_rcu_tasks_rude_gp_kthread, + .cbflood_max = 50000, .fqs = NULL, .stats = NULL, .irq_capable = 1, @@@ -883,7 -871,6 +883,7 @@@ static struct rcu_torture_ops tasks_tra .call = call_rcu_tasks_trace, .cb_barrier = rcu_barrier_tasks_trace, .gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread, + .cbflood_max = 50000, .fqs = NULL, .stats = NULL, .irq_capable = 1, @@@ -1433,15 -1420,13 +1433,15 @@@ static void rcutorture_one_extend(int * struct rt_read_seg *rtrsp) { unsigned long flags; - int idxnew = -1; - int idxold = *readstate; + int idxnew1 = -1; + int idxnew2 = -1; + int idxold1 = *readstate; + int idxold2 = idxold1; int statesnew = ~*readstate & newstate; int statesold = *readstate & ~newstate;
- WARN_ON_ONCE(idxold < 0); - WARN_ON_ONCE((idxold >> RCUTORTURE_RDR_SHIFT) > 1); + WARN_ON_ONCE(idxold2 < 0); + WARN_ON_ONCE((idxold2 >> RCUTORTURE_RDR_SHIFT_2) > 1); rtrsp->rt_readstate = newstate;
/* First, put new protection in place to avoid critical-section gap. */ @@@ -1455,10 -1440,8 +1455,10 @@@ preempt_disable(); if (statesnew & RCUTORTURE_RDR_SCHED) rcu_read_lock_sched(); - if (statesnew & RCUTORTURE_RDR_RCU) - idxnew = cur_ops->readlock() << RCUTORTURE_RDR_SHIFT; + if (statesnew & RCUTORTURE_RDR_RCU_1) + idxnew1 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_1; + if (statesnew & RCUTORTURE_RDR_RCU_2) + idxnew2 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_2;
/* * Next, remove old protection, in decreasing order of strength @@@ -1477,20 -1460,12 +1477,20 @@@ local_bh_enable(); if (statesold & RCUTORTURE_RDR_RBH) rcu_read_unlock_bh(); - if (statesold & RCUTORTURE_RDR_RCU) { - bool lockit = !statesnew && !(torture_random(trsp) & 0xffff); + if (statesold & RCUTORTURE_RDR_RCU_2) { + cur_ops->readunlock((idxold2 >> RCUTORTURE_RDR_SHIFT_2) & 0x1); + WARN_ON_ONCE(idxnew2 != -1); + idxold2 = 0; + } + if (statesold & RCUTORTURE_RDR_RCU_1) { + bool lockit;
+ lockit = !cur_ops->no_pi_lock && !statesnew && !(torture_random(trsp) & 0xffff); if (lockit) raw_spin_lock_irqsave(&current->pi_lock, flags); - cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT); + cur_ops->readunlock((idxold1 >> RCUTORTURE_RDR_SHIFT_1) & 0x1); + WARN_ON_ONCE(idxnew1 != -1); + idxold1 = 0; if (lockit) raw_spin_unlock_irqrestore(&current->pi_lock, flags); } @@@ -1500,19 -1475,13 +1500,19 @@@ cur_ops->read_delay(trsp, rtrsp);
/* Update the reader state. */ - if (idxnew == -1) - idxnew = idxold & ~RCUTORTURE_RDR_MASK; - WARN_ON_ONCE(idxnew < 0); - WARN_ON_ONCE((idxnew >> RCUTORTURE_RDR_SHIFT) > 1); - *readstate = idxnew | newstate; - WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) < 0); - WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) > 1); + if (idxnew1 == -1) + idxnew1 = idxold1 & RCUTORTURE_RDR_MASK_1; + WARN_ON_ONCE(idxnew1 < 0); + if (WARN_ON_ONCE((idxnew1 >> RCUTORTURE_RDR_SHIFT_1) > 1)) + pr_info("Unexpected idxnew1 value of %#x\n", idxnew1); + if (idxnew2 == -1) + idxnew2 = idxold2 & RCUTORTURE_RDR_MASK_2; + WARN_ON_ONCE(idxnew2 < 0); + WARN_ON_ONCE((idxnew2 >> RCUTORTURE_RDR_SHIFT_2) > 1); + *readstate = idxnew1 | idxnew2 | newstate; + WARN_ON_ONCE(*readstate < 0); + if (WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT_2) > 1)) + pr_info("Unexpected idxnew2 value of %#x\n", idxnew2); }
/* Return the biggest extendables mask given current RCU and boot parameters. */ @@@ -1522,7 -1491,7 +1522,7 @@@ static int rcutorture_extend_mask_max(v
WARN_ON_ONCE(extendables & ~RCUTORTURE_MAX_EXTEND); mask = extendables & RCUTORTURE_MAX_EXTEND & cur_ops->extendables; - mask = mask | RCUTORTURE_RDR_RCU; + mask = mask | RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2; return mask; }
@@@ -1537,21 -1506,13 +1537,21 @@@ rcutorture_extend_mask(int oldmask, str unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ; unsigned long bhs = RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
- WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT); + WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT_1); /* Mostly only one bit (need preemption!), sometimes lots of bits. */ if (!(randmask1 & 0x7)) mask = mask & randmask2; else mask = mask & (1 << (randmask2 % RCUTORTURE_RDR_NBITS));
+ // Can't have nested RCU reader without outer RCU reader. + if (!(mask & RCUTORTURE_RDR_RCU_1) && (mask & RCUTORTURE_RDR_RCU_2)) { + if (oldmask & RCUTORTURE_RDR_RCU_1) + mask &= ~RCUTORTURE_RDR_RCU_2; + else + mask |= RCUTORTURE_RDR_RCU_1; + } + /* * Can't enable bh w/irq disabled. */ @@@ -1571,7 -1532,7 +1571,7 @@@ mask |= oldmask & bhs; }
- return mask ?: RCUTORTURE_RDR_RCU; + return mask ?: RCUTORTURE_RDR_RCU_1; }
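The nested-reader fixup introduced above enforces the rule that RCUTORTURE_RDR_RCU_2 never appears in the new state without RCUTORTURE_RDR_RCU_1. A stand-alone sketch of that rule; the fixup() helper and the asserts are illustrative only.

/* Stand-alone sketch of the nested-reader rule above (illustrative only). */
#include <assert.h>

#define RDR_RCU_1 0x20
#define RDR_RCU_2 0x40

static unsigned int fixup(unsigned int mask, unsigned int oldmask)
{
        if (!(mask & RDR_RCU_1) && (mask & RDR_RCU_2)) {
                if (oldmask & RDR_RCU_1)
                        mask &= ~RDR_RCU_2;   /* outer reader goes away, so do not nest */
                else
                        mask |= RDR_RCU_1;    /* request the outer reader as well */
        }
        return mask;
}

int main(void)
{
        assert(fixup(RDR_RCU_2, 0) == (RDR_RCU_1 | RDR_RCU_2));
        assert(fixup(RDR_RCU_2, RDR_RCU_1) == 0);
        return 0;
}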
/* @@@ -1665,7 -1626,7 +1665,7 @@@ static bool rcu_torture_one_read(struc rcu_torture_writer_state, cookie, cur_ops->get_gp_state()); rcutorture_one_extend(&readstate, 0, trsp, rtrsp); - WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK); + WARN_ON_ONCE(readstate); // This next splat is expected behavior if leakpointer, especially // for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels. WARN_ON_ONCE(leakpointer && READ_ONCE(p->rtort_pipe_count) > 1); @@@ -2031,9 -1992,8 +2031,8 @@@ static int rcutorture_booster_init(unsi mutex_lock(&boost_mutex); rcu_torture_disable_rt_throttle(); VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task"); - boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, - cpu_to_node(cpu), - "rcu_torture_boost"); + boost_tasks[cpu] = kthread_run_on_cpu(rcu_torture_boost, NULL, + cpu, "rcu_torture_boost_%u"); if (IS_ERR(boost_tasks[cpu])) { retval = PTR_ERR(boost_tasks[cpu]); VERBOSE_TOROUT_STRING("rcu_torture_boost task create failed"); @@@ -2042,8 -2002,6 +2041,6 @@@ mutex_unlock(&boost_mutex); return retval; } - kthread_bind(boost_tasks[cpu], cpu); - wake_up_process(boost_tasks[cpu]); mutex_unlock(&boost_mutex); return 0; } @@@ -2091,8 -2049,6 +2088,8 @@@ static int rcu_torture_stall(void *args #else schedule_timeout_uninterruptible(HZ); #endif + } else if (stall_no_softlockup) { + touch_softlockup_watchdog(); } if (stall_cpu_irqsoff) local_irq_enable(); @@@ -2164,13 -2120,10 +2161,13 @@@ struct rcu_fwd unsigned long rcu_fwd_startat; struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST]; unsigned long rcu_launder_gp_seq_start; + int rcu_fwd_id; };
static DEFINE_MUTEX(rcu_fwd_mutex); static struct rcu_fwd *rcu_fwds; +static unsigned long rcu_fwd_seq; +static atomic_long_t rcu_fwd_max_cbs; static bool rcu_fwd_emergency_stop;
static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) @@@ -2183,9 -2136,8 +2180,9 @@@ for (i = ARRAY_SIZE(rfp->n_launders_hist) - 1; i > 0; i--) if (rfp->n_launders_hist[i].n_launders > 0) break; - pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):", - __func__, jiffies - rfp->rcu_fwd_startat); + mutex_lock(&rcu_fwd_mutex); // Serialize histograms. + pr_alert("%s: Callback-invocation histogram %d (duration %lu jiffies):", + __func__, rfp->rcu_fwd_id, jiffies - rfp->rcu_fwd_startat); gps_old = rfp->rcu_launder_gp_seq_start; for (j = 0; j <= i; j++) { gps = rfp->n_launders_hist[j].launder_gp_seq; @@@ -2196,7 -2148,6 +2193,7 @@@ gps_old = gps; } pr_cont("\n"); + mutex_unlock(&rcu_fwd_mutex); }
/* Callback function for continuous-flood RCU callbacks. */ @@@ -2322,8 -2273,7 +2319,8 @@@ static void rcu_torture_fwd_prog_nr(str cver = READ_ONCE(rcu_torture_current_version) - cver; gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); WARN_ON(!cver && gps < 2); - pr_alert("%s: Duration %ld cver %ld gps %ld\n", __func__, dur, cver, gps); + pr_alert("%s: %d Duration %ld cver %ld gps %ld\n", __func__, + rfp->rcu_fwd_id, dur, cver, gps); } if (selfpropcb) { WRITE_ONCE(fcs.stop, 1); @@@ -2391,7 -2341,7 +2388,7 @@@ static void rcu_torture_fwd_prog_cr(str rfp->rcu_fwd_cb_head = rfcpn; n_launders++; n_launders_sa++; - } else { + } else if (!cur_ops->cbflood_max || cur_ops->cbflood_max > n_max_cbs) { rfcp = kmalloc(sizeof(*rfcp), GFP_KERNEL); if (WARN_ON_ONCE(!rfcp)) { schedule_timeout_interruptible(1); @@@ -2401,11 -2351,8 +2398,11 @@@ n_launders_sa = 0; rfcp->rfc_gps = 0; rfcp->rfc_rfp = rfp; + } else { + rfcp = NULL; } - cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); + if (rfcp) + cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs); if (tick_nohz_full_enabled()) { local_irq_save(flags); @@@ -2429,7 -2376,6 +2426,7 @@@ n_launders + n_max_cbs - n_launders_cb_snap, n_launders, n_launders_sa, n_max_gps, n_max_cbs, cver, gps); + atomic_long_add(n_max_cbs, &rcu_fwd_max_cbs); rcu_torture_fwd_cb_hist(rfp); } schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */ @@@ -2445,8 -2391,6 +2442,8 @@@ static int rcutorture_oom_notify(struct notifier_block *self, unsigned long notused, void *nfreed) { + int i; + long ncbs; struct rcu_fwd *rfp;
mutex_lock(&rcu_fwd_mutex); @@@ -2457,26 -2401,18 +2454,26 @@@ } WARN(1, "%s invoked upon OOM during forward-progress testing.\n", __func__); - rcu_torture_fwd_cb_hist(rfp); - rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp->rcu_fwd_startat)) / 2); + for (i = 0; i < fwd_progress; i++) { + rcu_torture_fwd_cb_hist(&rfp[i]); + rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp[i].rcu_fwd_startat)) / 2); + } WRITE_ONCE(rcu_fwd_emergency_stop, true); smp_mb(); /* Emergency stop before free and wait to avoid hangs. */ - pr_info("%s: Freed %lu RCU callbacks.\n", - __func__, rcu_torture_fwd_prog_cbfree(rfp)); + ncbs = 0; + for (i = 0; i < fwd_progress; i++) + ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); + pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs); rcu_barrier(); - pr_info("%s: Freed %lu RCU callbacks.\n", - __func__, rcu_torture_fwd_prog_cbfree(rfp)); + ncbs = 0; + for (i = 0; i < fwd_progress; i++) + ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); + pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs); rcu_barrier(); - pr_info("%s: Freed %lu RCU callbacks.\n", - __func__, rcu_torture_fwd_prog_cbfree(rfp)); + ncbs = 0; + for (i = 0; i < fwd_progress; i++) + ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); + pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs); smp_mb(); /* Frees before return to avoid redoing OOM. */ (*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */ pr_info("%s returning after OOM processing.\n", __func__); @@@ -2491,10 -2427,7 +2488,10 @@@ static struct notifier_block rcutorture /* Carry out grace-period forward-progress testing. */ static int rcu_torture_fwd_prog(void *args) { + bool firsttime = true; + long max_cbs; int oldnice = task_nice(current); + unsigned long oldseq = READ_ONCE(rcu_fwd_seq); struct rcu_fwd *rfp = args; int tested = 0; int tested_tries = 0; @@@ -2504,38 -2437,21 +2501,38 @@@ if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST)) set_user_nice(current, MAX_NICE); do { - schedule_timeout_interruptible(fwd_progress_holdoff * HZ); - WRITE_ONCE(rcu_fwd_emergency_stop, false); - if (!IS_ENABLED(CONFIG_TINY_RCU) || - rcu_inkernel_boot_has_ended()) - rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries); - if (rcu_inkernel_boot_has_ended()) + if (!rfp->rcu_fwd_id) { + schedule_timeout_interruptible(fwd_progress_holdoff * HZ); + WRITE_ONCE(rcu_fwd_emergency_stop, false); + if (!firsttime) { + max_cbs = atomic_long_xchg(&rcu_fwd_max_cbs, 0); + pr_alert("%s n_max_cbs: %ld\n", __func__, max_cbs); + } + firsttime = false; + WRITE_ONCE(rcu_fwd_seq, rcu_fwd_seq + 1); + } else { + while (READ_ONCE(rcu_fwd_seq) == oldseq) + schedule_timeout_interruptible(1); + oldseq = READ_ONCE(rcu_fwd_seq); + } + pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id); + if (rcu_inkernel_boot_has_ended() && torture_num_online_cpus() > rfp->rcu_fwd_id) rcu_torture_fwd_prog_cr(rfp); + if ((cur_ops->stall_dur && cur_ops->stall_dur() > 0) && + (!IS_ENABLED(CONFIG_TINY_RCU) || + (rcu_inkernel_boot_has_ended() && + torture_num_online_cpus() > rfp->rcu_fwd_id))) + rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries);
/* Avoid slow periods, better to test when busy. */ if (stutter_wait("rcu_torture_fwd_prog")) sched_set_normal(current, oldnice); } while (!torture_must_stop()); /* Short runs might not contain a valid forward-progress attempt. */ - WARN_ON(!tested && tested_tries >= 5); - pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries); + if (!rfp->rcu_fwd_id) { + WARN_ON(!tested && tested_tries >= 5); + pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries); + } torture_kthread_stopping("rcu_torture_fwd_prog"); return 0; } @@@ -2543,28 -2459,17 +2540,28 @@@ /* If forward-progress checking is requested and feasible, spawn the thread. */ static int __init rcu_torture_fwd_prog_init(void) { + int i; + int ret = 0; struct rcu_fwd *rfp;
if (!fwd_progress) return 0; /* Not requested, so don't do it. */ + if (fwd_progress >= nr_cpu_ids) { + VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Limiting fwd_progress to # CPUs.\n"); + fwd_progress = nr_cpu_ids; + } else if (fwd_progress < 0) { + fwd_progress = nr_cpu_ids; + } if ((!cur_ops->sync && !cur_ops->call) || - !cur_ops->stall_dur || cur_ops->stall_dur() <= 0 || cur_ops == &rcu_busted_ops) { + (!cur_ops->cbflood_max && (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0)) || + cur_ops == &rcu_busted_ops) { VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, unsupported by RCU flavor under test"); + fwd_progress = 0; return 0; } if (stall_cpu > 0) { VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall testing"); + fwd_progress = 0; if (IS_MODULE(CONFIG_RCU_TORTURE_TEST)) return -EINVAL; /* In module, can fail back to user. */ WARN_ON(1); /* Make sure rcutorture notices conflict. */ @@@ -2574,51 -2479,29 +2571,51 @@@ fwd_progress_holdoff = 1; if (fwd_progress_div <= 0) fwd_progress_div = 4; - rfp = kzalloc(sizeof(*rfp), GFP_KERNEL); - if (!rfp) + rfp = kcalloc(fwd_progress, sizeof(*rfp), GFP_KERNEL); + fwd_prog_tasks = kcalloc(fwd_progress, sizeof(*fwd_prog_tasks), GFP_KERNEL); + if (!rfp || !fwd_prog_tasks) { + kfree(rfp); + kfree(fwd_prog_tasks); + fwd_prog_tasks = NULL; + fwd_progress = 0; return -ENOMEM; - spin_lock_init(&rfp->rcu_fwd_lock); - rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head; + } + for (i = 0; i < fwd_progress; i++) { + spin_lock_init(&rfp[i].rcu_fwd_lock); + rfp[i].rcu_fwd_cb_tail = &rfp[i].rcu_fwd_cb_head; + rfp[i].rcu_fwd_id = i; + } mutex_lock(&rcu_fwd_mutex); rcu_fwds = rfp; mutex_unlock(&rcu_fwd_mutex); register_oom_notifier(&rcutorture_oom_nb); - return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task); + for (i = 0; i < fwd_progress; i++) { + ret = torture_create_kthread(rcu_torture_fwd_prog, &rcu_fwds[i], fwd_prog_tasks[i]); + if (ret) { + fwd_progress = i; + return ret; + } + } + return 0; }
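A minimal sketch of how the fwd_progress normalization above behaves, assuming only what the hunk shows (negative means one kthread per CPU, and values above nr_cpu_ids are capped); the helper name is invented.

/* Stand-alone model of the fwd_progress normalization above (illustrative). */
#include <assert.h>

static int normalize_fwd_progress(int fwd_progress, int nr_cpu_ids)
{
        if (fwd_progress >= nr_cpu_ids || fwd_progress < 0)
                return nr_cpu_ids;
        return fwd_progress;   /* 0 ("disabled") has already returned earlier */
}

int main(void)
{
        assert(normalize_fwd_progress(-1, 8) == 8);    /* one kthread per CPU */
        assert(normalize_fwd_progress(100, 8) == 8);   /* capped at CPU count */
        assert(normalize_fwd_progress(3, 8) == 3);
        return 0;
}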
static void rcu_torture_fwd_prog_cleanup(void) { + int i; struct rcu_fwd *rfp;
- torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); - rfp = rcu_fwds; + if (!rcu_fwds || !fwd_prog_tasks) + return; + for (i = 0; i < fwd_progress; i++) + torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_tasks[i]); + unregister_oom_notifier(&rcutorture_oom_nb); mutex_lock(&rcu_fwd_mutex); + rfp = rcu_fwds; rcu_fwds = NULL; mutex_unlock(&rcu_fwd_mutex); - unregister_oom_notifier(&rcutorture_oom_nb); kfree(rfp); + kfree(fwd_prog_tasks); + fwd_prog_tasks = NULL; }
/* Callback function for RCU barrier testing. */ @@@ -2855,7 -2738,7 +2852,7 @@@ static int rcu_torture_read_exit(void * &trs, "%s", "rcu_torture_read_exit_child"); if (IS_ERR(tsp)) { - VERBOSE_TOROUT_ERRSTRING("out of memory"); + TOROUT_ERRSTRING("out of memory"); errexit = true; tsp = NULL; break; @@@ -3182,7 -3065,7 +3179,7 @@@ rcu_torture_init(void sizeof(fakewriter_tasks[0]), GFP_KERNEL); if (fakewriter_tasks == NULL) { - VERBOSE_TOROUT_ERRSTRING("out of memory"); + TOROUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } @@@ -3198,7 -3081,7 +3195,7 @@@ rcu_torture_reader_mbchk = kcalloc(nrealreaders, sizeof(*rcu_torture_reader_mbchk), GFP_KERNEL); if (!reader_tasks || !rcu_torture_reader_mbchk) { - VERBOSE_TOROUT_ERRSTRING("out of memory"); + TOROUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } @@@ -3217,7 -3100,7 +3214,7 @@@ if (nrealnocbers > 0) { nocb_tasks = kcalloc(nrealnocbers, sizeof(nocb_tasks[0]), GFP_KERNEL); if (nocb_tasks == NULL) { - VERBOSE_TOROUT_ERRSTRING("out of memory"); + TOROUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } diff --combined kernel/sysctl.c index d7ed1dffa426,2ab4edb6e450..ef77be575d87 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@@ -33,7 -33,6 +33,7 @@@ #include <linux/security.h> #include <linux/ctype.h> #include <linux/kmemleak.h> +#include <linux/filter.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> @@@ -123,6 -122,7 +123,7 @@@ static unsigned long long_max = LONG_MA static int one_hundred = 100; static int two_hundred = 200; static int one_thousand = 1000; + static int three_thousand = 3000; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; #endif @@@ -2960,7 -2960,7 +2961,7 @@@ static struct ctl_table vm_table[] = .mode = 0644, .proc_handler = watermark_scale_factor_sysctl_handler, .extra1 = SYSCTL_ONE, - .extra2 = &one_thousand, + .extra2 = &three_thousand, }, { .procname = "percpu_pagelist_high_fraction", diff --combined mm/Makefile index 7919cd7f13f2,5c5a3a480fa6..588d3113f3b0 --- a/mm/Makefile +++ b/mm/Makefile @@@ -15,8 -15,6 +15,8 @@@ KCSAN_SANITIZE_slab_common.o := KCSAN_SANITIZE_slab.o := n KCSAN_SANITIZE_slub.o := n KCSAN_SANITIZE_page_alloc.o := n +# But enable explicit instrumentation for memory barriers. +KCSAN_INSTRUMENT_BARRIERS := y
# These files are disabled because they produce non-interesting and/or # flaky coverage that is not a function of syscall inputs. E.g. slab is out of @@@ -114,6 -112,7 +114,7 @@@ obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o + obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o obj-$(CONFIG_SECRETMEM) += secretmem.o obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o diff --combined mm/huge_memory.c index f58524394dc1,6ed86a8f6a5b..406a3c28c026 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@@ -1322,7 -1322,7 +1322,7 @@@ vm_fault_t do_huge_pmd_wp_page(struct v * We can only reuse the page if nobody else maps the huge page or it's * part. */ - if (reuse_swap_page(page, NULL)) { + if (reuse_swap_page(page)) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); @@@ -2542,38 -2542,28 +2542,28 @@@ int total_mapcount(struct page *page * need full accuracy to avoid breaking page pinning, because * page_trans_huge_mapcount() is slower than page_mapcount(). */ - int page_trans_huge_mapcount(struct page *page, int *total_mapcount) + int page_trans_huge_mapcount(struct page *page) { - int i, ret, _total_mapcount, mapcount; + int i, ret;
/* hugetlbfs shouldn't call it */ VM_BUG_ON_PAGE(PageHuge(page), page);
- if (likely(!PageTransCompound(page))) { - mapcount = atomic_read(&page->_mapcount) + 1; - if (total_mapcount) - *total_mapcount = mapcount; - return mapcount; - } + if (likely(!PageTransCompound(page))) + return atomic_read(&page->_mapcount) + 1;
page = compound_head(page);
- _total_mapcount = ret = 0; + ret = 0; for (i = 0; i < thp_nr_pages(page); i++) { - mapcount = atomic_read(&page[i]._mapcount) + 1; + int mapcount = atomic_read(&page[i]._mapcount) + 1; ret = max(ret, mapcount); - _total_mapcount += mapcount; } - if (PageDoubleMap(page)) { + + if (PageDoubleMap(page)) ret -= 1; - _total_mapcount -= thp_nr_pages(page); - } - mapcount = compound_mapcount(page); - ret += mapcount; - _total_mapcount += mapcount; - if (total_mapcount) - *total_mapcount = _total_mapcount; - return ret; + + return ret + compound_mapcount(page); }
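The simplified page_trans_huge_mapcount() above reduces to: the largest per-subpage mapcount, minus one if the THP is double-mapped, plus the compound mapcount. A stand-alone arithmetic model, taking the already-adjusted mapcounts as plain inputs:

/* Stand-alone model of the simplified mapcount arithmetic above. */
#include <assert.h>
#include <stdbool.h>

static int thp_mapcount(const int *sub_mapcount, int nr_pages,
                        bool double_map, int compound_mapcount)
{
        int i, ret = 0;

        for (i = 0; i < nr_pages; i++)
                if (sub_mapcount[i] > ret)
                        ret = sub_mapcount[i];
        if (double_map)
                ret -= 1;
        return ret + compound_mapcount;
}

int main(void)
{
        int sub[4] = { 1, 2, 1, 1 };

        assert(thp_mapcount(sub, 4, false, 0) == 2);
        assert(thp_mapcount(sub, 4, true, 1) == 2);   /* 2 - 1 + 1 */
        return 0;
}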
/* Racy check whether the huge page can be split */ @@@ -2614,7 -2604,6 +2604,7 @@@ int split_huge_page_to_list(struct pag { struct page *head = compound_head(page); struct deferred_split *ds_queue = get_deferred_split_queue(head); + XA_STATE(xas, &head->mapping->i_pages, head->index); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; int extra_pins, ret; @@@ -2653,13 -2642,6 +2643,13 @@@ goto out; }
+ xas_split_alloc(&xas, head, compound_order(head), + mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK); + if (xas_error(&xas)) { + ret = xas_error(&xas); + goto out; + } + anon_vma = NULL; i_mmap_lock_read(mapping);
@@@ -2689,12 -2671,13 +2679,12 @@@ /* block interrupt reentry in xa_lock and spinlock */ local_irq_disable(); if (mapping) { - XA_STATE(xas, &mapping->i_pages, page_index(head)); - /* * Check if the head page is present in page cache. * We assume all tail are present too, if head is there. */ - xa_lock(&mapping->i_pages); + xas_lock(&xas); + xas_reset(&xas); if (xas_load(&xas) != head) goto fail; } @@@ -2710,7 -2693,6 +2700,7 @@@ if (mapping) { int nr = thp_nr_pages(head);
+ xas_split(&xas, head, thp_order(head)); if (PageSwapBacked(head)) { __mod_lruvec_page_state(head, NR_SHMEM_THPS, -nr); @@@ -2727,7 -2709,7 +2717,7 @@@ spin_unlock(&ds_queue->split_queue_lock); fail: if (mapping) - xa_unlock(&mapping->i_pages); + xas_unlock(&xas); local_irq_enable(); remap_page(head, thp_nr_pages(head)); ret = -EBUSY; @@@ -2741,8 -2723,6 +2731,8 @@@ out_unlock if (mapping) i_mmap_unlock_read(mapping); out: + /* Free any memory we didn't use */ + xas_nomem(&xas, 0); count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); return ret; } diff --combined mm/internal.h index 26af8a5a5be3,c5834cc28a44..d80300392a19 --- a/mm/internal.h +++ b/mm/internal.h @@@ -12,8 -12,6 +12,8 @@@ #include <linux/pagemap.h> #include <linux/tracepoint-defs.h>
+struct folio_batch; + /* * The set of flags that only affect watermark checking and reclaim * behaviour. This is used by the MM to obey the caller constraints @@@ -23,7 -21,7 +23,7 @@@ #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ - __GFP_ATOMIC) + __GFP_ATOMIC|__GFP_NOLOCKDEP)
/* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) @@@ -76,7 -74,6 +76,7 @@@ static inline bool can_madv_lru_vma(str return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); }
+struct zap_details; void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, @@@ -93,13 -90,7 +93,13 @@@ static inline void force_page_cache_rea }
unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, - pgoff_t end, struct pagevec *pvec, pgoff_t *indices); + pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); +unsigned find_get_entries(struct address_space *mapping, pgoff_t start, + pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); +void filemap_free_folio(struct address_space *mapping, struct folio *folio); +int truncate_inode_folio(struct address_space *mapping, struct folio *folio); +bool truncate_inode_partial_folio(struct folio *folio, loff_t start, + loff_t end);
/** * folio_evictable - Test whether a folio is evictable. @@@ -166,11 -157,6 +166,6 @@@ extern void reclaim_throttle(pg_data_t */ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
- /* - * in mm/memcontrol.c: - */ - extern bool cgroup_memory_nokmem; - /* * in mm/page_alloc.c */ @@@ -397,7 -383,6 +392,7 @@@ void __vma_link_list(struct mm_struct * void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma);
#ifdef CONFIG_MMU +void unmap_mapping_folio(struct folio *folio); extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked); extern long faultin_vma_page_range(struct vm_area_struct *vma, @@@ -501,8 -486,8 +496,8 @@@ static inline struct file *maybe_unlock } return fpin; } - #else /* !CONFIG_MMU */ +static inline void unmap_mapping_folio(struct folio *folio) { } static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page) { } static inline void vunmap_range_noflush(unsigned long start, unsigned long end) diff --combined mm/kasan/quarantine.c index 587da8995f2d,47ed4fc33a29..08291ed33e93 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@@ -117,7 -117,7 +117,7 @@@ static unsigned long quarantine_batch_s
static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink) { - return virt_to_head_page(qlink)->slab_cache; + return virt_to_slab(qlink)->slab_cache; }
static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache) @@@ -132,11 -132,22 +132,22 @@@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache) { void *object = qlink_to_object(qlink, cache); + struct kasan_free_meta *meta = kasan_get_free_meta(cache, object); unsigned long flags;
if (IS_ENABLED(CONFIG_SLAB)) local_irq_save(flags);
+ /* + * If init_on_free is enabled and KASAN's free metadata is stored in + * the object, zero the metadata. Otherwise, the object's memory will + * not be properly zeroed, as KASAN saves the metadata after the slab + * allocator zeroes the object. + */ + if (slab_want_init_on_free(cache) && + cache->kasan_info.free_meta_offset == 0) + memzero_explicit(meta, sizeof(*meta)); + /* * As the object now gets freed from the quarantine, assume that its * free track is no longer valid. diff --combined mm/khugepaged.c index 2e1911cc3466,7af84bac6fc2..35f14d0a00a6 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@@ -618,6 -618,7 +618,7 @@@ static int __collapse_huge_page_isolate continue; } else { result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); goto out; } } @@@ -636,6 -637,7 +637,7 @@@ if (page_mapcount(page) > 1 && ++shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out; }
@@@ -681,7 -683,7 +683,7 @@@ goto out; } if (!pte_write(pteval) && PageSwapCache(page) && - !reuse_swap_page(page, NULL)) { + !reuse_swap_page(page)) { /* * Page is in the swap cache and cannot be re-used. * It cannot be collapsed into a THP. @@@ -756,11 -758,7 +758,7 @@@ static void __collapse_huge_page_copy(p * ptl mostly unnecessary. */ spin_lock(ptl); - /* - * paravirt calls inside pte_clear here are - * superfluous. - */ - pte_clear(vma->vm_mm, address, _pte); + ptep_clear(vma->vm_mm, address, _pte); spin_unlock(ptl); } } else { @@@ -774,11 -772,7 +772,7 @@@ * inside page_remove_rmap(). */ spin_lock(ptl); - /* - * paravirt calls inside pte_clear here are - * superfluous. - */ - pte_clear(vma->vm_mm, address, _pte); + ptep_clear(vma->vm_mm, address, _pte); page_remove_rmap(src_page, false); spin_unlock(ptl); free_page_and_swap_cache(src_page); @@@ -1261,6 -1255,7 +1255,7 @@@ static int khugepaged_scan_pmd(struct m continue; } else { result = SCAN_EXCEED_SWAP_PTE; + count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); goto out_unmap; } } @@@ -1270,6 -1265,7 +1265,7 @@@ continue; } else { result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); goto out_unmap; } } @@@ -1298,6 -1294,7 +1294,7 @@@ if (page_mapcount(page) > 1 && ++shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out_unmap; }
@@@ -1306,7 -1303,7 +1303,7 @@@ /* * Record which node the original page is from and save this * information to khugepaged_node_load[]. - * Khupaged will allocate hugepage from the node has the max + * Khugepaged will allocate hugepage from the node that has the max * hit record. */ node = page_to_nid(page); @@@ -1667,10 -1664,7 +1664,10 @@@ static void collapse_file(struct mm_str } count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
- /* This will be less messy when we use multi-index entries */ + /* + * Ensure we have slots for all the pages in the range. This is + * almost certainly a no-op because most of the pages must be present + */ do { xas_lock_irq(&xas); xas_create_range(&xas); @@@ -1895,9 -1889,6 +1892,9 @@@ out_unlock __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none); }
+ /* Join all the small entries into a single multi-index entry */ + xas_set_order(&xas, start, HPAGE_PMD_ORDER); + xas_store(&xas, new_page); xa_locked: xas_unlock_irq(&xas); xa_unlocked: @@@ -2014,15 -2005,12 +2011,16 @@@ static void khugepaged_scan_file(struc if (xa_is_value(page)) { if (++swap > khugepaged_max_ptes_swap) { result = SCAN_EXCEED_SWAP_PTE; + count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); break; } continue; }
+ /* + * XXX: khugepaged should compact smaller compound pages + * into a PMD sized page + */ if (PageTransCompound(page)) { result = SCAN_PAGE_COMPOUND; break; @@@ -2064,6 -2052,7 +2062,7 @@@ if (result == SCAN_SUCCEED) { if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { node = khugepaged_find_target_node(); collapse_file(mm, file, start, hpage, node); diff --combined mm/memcontrol.c index 4a7b3ebf8e48,c9ddd02dc5de..09d342c7cbd0 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@@ -84,7 -84,7 +84,7 @@@ EXPORT_PER_CPU_SYMBOL_GPL(int_active_me static bool cgroup_memory_nosocket __ro_after_init;
/* Kernel memory accounting disabled? */ - bool cgroup_memory_nokmem __ro_after_init; + static bool cgroup_memory_nokmem __ro_after_init;
/* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP @@@ -629,11 -629,17 +629,17 @@@ static DEFINE_SPINLOCK(stats_flush_lock static DEFINE_PER_CPU(unsigned int, stats_updates); static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
- static inline void memcg_rstat_updated(struct mem_cgroup *memcg) + static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) { + unsigned int x; + cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); - if (!(__this_cpu_inc_return(stats_updates) % MEMCG_CHARGE_BATCH)) - atomic_inc(&stats_flush_threshold); + + x = __this_cpu_add_return(stats_updates, abs(val)); + if (x > MEMCG_CHARGE_BATCH) { + atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold); + __this_cpu_write(stats_updates, 0); + } }
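The reworked memcg_rstat_updated() above accumulates per-CPU update magnitudes and converts them into flush-threshold units of MEMCG_CHARGE_BATCH. A stand-alone model of that accounting; the batch size of 64 is an assumption for the sketch, and the two globals stand in for the per-CPU and atomic counters.

/* Stand-alone model of the batched flush-threshold accounting above. */
#include <stdio.h>
#include <stdlib.h>

#define CHARGE_BATCH 64          /* assumed value of MEMCG_CHARGE_BATCH */

static unsigned int stats_updates;   /* stands in for the per-CPU counter */
static long flush_threshold;         /* stands in for the atomic threshold */

static void rstat_updated(int val)
{
        stats_updates += (unsigned int)abs(val);
        if (stats_updates > CHARGE_BATCH) {
                flush_threshold += stats_updates / CHARGE_BATCH;
                stats_updates = 0;
        }
}

int main(void)
{
        rstat_updated(40);
        rstat_updated(-40);            /* magnitude counts, not the sign */
        printf("threshold contributions: %ld\n", flush_threshold);  /* 1 */
        return 0;
}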
static void __mem_cgroup_flush_stats(void) @@@ -656,7 -662,7 +662,7 @@@ void mem_cgroup_flush_stats(void
static void flush_memcg_stats_dwork(struct work_struct *w) { - mem_cgroup_flush_stats(); + __mem_cgroup_flush_stats(); queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); }
@@@ -672,7 -678,7 +678,7 @@@ void __mod_memcg_state(struct mem_cgrou return;
__this_cpu_add(memcg->vmstats_percpu->state[idx], val); - memcg_rstat_updated(memcg); + memcg_rstat_updated(memcg, val); }
/* idx can be of type enum memcg_stat_item or node_stat_item. */ @@@ -705,7 -711,7 +711,7 @@@ void __mod_memcg_lruvec_state(struct lr /* Update lruvec */ __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
- memcg_rstat_updated(memcg); + memcg_rstat_updated(memcg, val); }
/** @@@ -789,7 -795,7 +795,7 @@@ void __count_memcg_events(struct mem_cg return;
__this_cpu_add(memcg->vmstats_percpu->events[idx], count); - memcg_rstat_updated(memcg); + memcg_rstat_updated(memcg, count); }
static unsigned long memcg_events(struct mem_cgroup *memcg, int event) @@@ -1369,6 -1375,7 +1375,7 @@@ static const struct memory_stat memory_ { "pagetables", NR_PAGETABLE }, { "percpu", MEMCG_PERCPU_B }, { "sock", MEMCG_SOCK }, + { "vmalloc", MEMCG_VMALLOC }, { "shmem", NR_SHMEM }, { "file_mapped", NR_FILE_MAPPED }, { "file_dirty", NR_FILE_DIRTY }, @@@ -2816,31 -2823,31 +2823,31 @@@ static inline void mod_objcg_mlstate(st rcu_read_unlock(); }
-int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, - gfp_t gfp, bool new_page) +int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, + gfp_t gfp, bool new_slab) { - unsigned int objects = objs_per_slab_page(s, page); + unsigned int objects = objs_per_slab(s, slab); unsigned long memcg_data; void *vec;
gfp &= ~OBJCGS_CLEAR_MASK; vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp, - page_to_nid(page)); + slab_nid(slab)); if (!vec) return -ENOMEM;
memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS; - if (new_page) { + if (new_slab) { /* - * If the slab page is brand new and nobody can yet access - * it's memcg_data, no synchronization is required and - * memcg_data can be simply assigned. + * If the slab is brand new and nobody can yet access its + * memcg_data, no synchronization is required and memcg_data can + * be simply assigned. */ - page->memcg_data = memcg_data; - } else if (cmpxchg(&page->memcg_data, 0, memcg_data)) { + slab->memcg_data = memcg_data; + } else if (cmpxchg(&slab->memcg_data, 0, memcg_data)) { /* - * If the slab page is already in use, somebody can allocate - * and assign obj_cgroups in parallel. In this case the existing + * If the slab is already in use, somebody can allocate and + * assign obj_cgroups in parallel. In this case the existing * objcg vector should be reused. */ kfree(vec); @@@ -2865,43 -2872,38 +2872,43 @@@ */ struct mem_cgroup *mem_cgroup_from_obj(void *p) { - struct page *page; + struct folio *folio;
if (mem_cgroup_disabled()) return NULL;
- page = virt_to_head_page(p); + folio = virt_to_folio(p);
/* * Slab objects are accounted individually, not per-page. * Memcg membership data for each individual object is saved in - * the page->obj_cgroups. + * slab->memcg_data. */ - if (page_objcgs_check(page)) { - struct obj_cgroup *objcg; + if (folio_test_slab(folio)) { + struct obj_cgroup **objcgs; + struct slab *slab; unsigned int off;
- off = obj_to_index(page->slab_cache, page, p); - objcg = page_objcgs(page)[off]; - if (objcg) - return obj_cgroup_memcg(objcg); + slab = folio_slab(folio); + objcgs = slab_objcgs(slab); + if (!objcgs) + return NULL; + + off = obj_to_index(slab->slab_cache, slab, p); + if (objcgs[off]) + return obj_cgroup_memcg(objcgs[off]);
return NULL; }
/* - * page_memcg_check() is used here, because page_has_obj_cgroups() - * check above could fail because the object cgroups vector wasn't set - * at that moment, but it can be set concurrently. + * page_memcg_check() is used here, because in theory we can encounter + * a folio where the slab flag has been cleared already, but + * slab->memcg_data has not been freed yet * page_memcg_check(page) will guarantee that a proper memory * cgroup pointer or NULL will be returned. */ - return page_memcg_check(page); + return page_memcg_check(folio_page(folio, 0)); }
__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) @@@ -4850,6 -4852,17 +4857,17 @@@ out_kfree return ret; }
+ #if defined(CONFIG_MEMCG_KMEM) && (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) + static int mem_cgroup_slab_show(struct seq_file *m, void *p) + { + /* + * Deprecated. + * Please, take a look at tools/cgroup/slabinfo.py . + */ + return 0; + } + #endif + static struct cftype mem_cgroup_legacy_files[] = { { .name = "usage_in_bytes", @@@ -4950,7 -4963,7 +4968,7 @@@ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) { .name = "kmem.slabinfo", - .seq_show = memcg_slab_show, + .seq_show = mem_cgroup_slab_show, }, #endif { @@@ -5110,15 -5123,11 +5128,11 @@@ static void mem_cgroup_free(struct mem_ static struct mem_cgroup *mem_cgroup_alloc(void) { struct mem_cgroup *memcg; - unsigned int size; int node; int __maybe_unused i; long error = -ENOMEM;
- size = sizeof(struct mem_cgroup); - size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); - - memcg = kzalloc(size, GFP_KERNEL); + memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL); if (!memcg) return ERR_PTR(error);
@@@ -6312,6 -6321,8 +6326,8 @@@ static void __memory_events_show(struc seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM])); seq_printf(m, "oom_kill %lu\n", atomic_long_read(&events[MEMCG_OOM_KILL])); + seq_printf(m, "oom_group_kill %lu\n", + atomic_long_read(&events[MEMCG_OOM_GROUP_KILL])); }
static int memory_events_show(struct seq_file *m, void *v) diff --combined mm/memory-failure.c index f1c389f7e669,373837bb94cb..14ae5c18e776 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@@ -58,6 -58,7 +58,7 @@@ #include <linux/ratelimit.h> #include <linux/page-isolation.h> #include <linux/pagewalk.h> + #include <linux/shmem_fs.h> #include "internal.h" #include "ras/ras_event.h"
@@@ -722,7 -723,6 +723,6 @@@ static const char * const action_page_t [MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page", [MF_MSG_SLAB] = "kernel slab page", [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking", - [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned", [MF_MSG_HUGE] = "huge page", [MF_MSG_FREE_HUGE] = "free huge page", [MF_MSG_NON_PMD_HUGE] = "non-pmd-sized huge page", @@@ -737,7 -737,6 +737,6 @@@ [MF_MSG_CLEAN_LRU] = "clean LRU page", [MF_MSG_TRUNCATED_LRU] = "already truncated LRU page", [MF_MSG_BUDDY] = "free buddy page", - [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)", [MF_MSG_DAX] = "dax page", [MF_MSG_UNSPLIT_THP] = "unsplit thp", [MF_MSG_UNKNOWN] = "unknown page", @@@ -867,6 -866,7 +866,7 @@@ static int me_pagecache_clean(struct pa { int ret; struct address_space *mapping; + bool extra_pins;
delete_from_lru_cache(p);
@@@ -895,18 -895,24 +895,24 @@@ goto out; }
+ /* + * The shmem page is kept in page cache instead of truncating + * so is expected to have an extra refcount after error-handling. + */ + extra_pins = shmem_mapping(mapping); + /* * Truncation is a bit tricky. Enable it per file system for now. * * Open: to take i_rwsem or not for this? Right now we don't. */ ret = truncate_error_page(p, page_to_pfn(p), mapping); + if (has_extra_refcount(ps, p, extra_pins)) + ret = MF_FAILED; + out: unlock_page(p);
- if (has_extra_refcount(ps, p, false)) - ret = MF_FAILED; - return ret; }
@@@ -1154,6 -1160,22 +1160,22 @@@ static int page_action(struct page_stat return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY; }
+ static inline bool PageHWPoisonTakenOff(struct page *page) + { + return PageHWPoison(page) && page_private(page) == MAGIC_HWPOISON; + } + + void SetPageHWPoisonTakenOff(struct page *page) + { + set_page_private(page, MAGIC_HWPOISON); + } + + void ClearPageHWPoisonTakenOff(struct page *page) + { + if (PageHWPoison(page)) + set_page_private(page, 0); + } + /* * Return true if a page type of a given page is supported by hwpoison * mechanism (while handling could fail), otherwise false. This function @@@ -1256,6 -1278,27 +1278,27 @@@ out return ret; }
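The MAGIC_HWPOISON value used by the taken-off helpers above is simply the ASCII string "HWPS" packed into a word, as the /* HWPS */ comment hints. A stand-alone check:

/* Stand-alone check of the MAGIC_HWPOISON encoding above. */
#include <stdio.h>

int main(void)
{
        unsigned int magic = 0x48575053U;   /* MAGIC_HWPOISON */
        char s[5] = { (char)(magic >> 24), (char)(magic >> 16),
                      (char)(magic >> 8), (char)magic, '\0' };

        puts(s);   /* prints HWPS */
        return 0;
}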
+ static int __get_unpoison_page(struct page *page) + { + struct page *head = compound_head(page); + int ret = 0; + bool hugetlb = false; + + ret = get_hwpoison_huge_page(head, &hugetlb); + if (hugetlb) + return ret; + + /* + * PageHWPoisonTakenOff pages are not only marked as PG_hwpoison, + * but also isolated from buddy freelist, so need to identify the + * state and have to cancel both operations to unpoison. + */ + if (PageHWPoisonTakenOff(page)) + return -EHWPOISON; + + return get_page_unless_zero(page) ? 1 : 0; + } + /** * get_hwpoison_page() - Get refcount for memory error handling * @p: Raw error page (hit by memory error) @@@ -1263,7 -1306,7 +1306,7 @@@ * * get_hwpoison_page() takes a page refcount of an error page to handle memory * error on it, after checking that the error page is in a well-defined state - * (defined as a page-type we can successfully handle the memor error on it, + * (defined as a page-type we can successfully handle the memory error on it, * such as LRU page and hugetlb page). * * Memory error handling could be triggered at any time on any type of page, @@@ -1272,18 -1315,26 +1315,26 @@@ * extra care for the error page's state (as done in __get_hwpoison_page()), * and has some retry logic in get_any_page(). * + * When called from unpoison_memory(), the caller should already ensure that + * the given page has PG_hwpoison. So it's never reused for other page + * allocations, and __get_unpoison_page() never races with them. + * * Return: 0 on failure, * 1 on success for in-use pages in a well-defined state, * -EIO for pages on which we can not handle memory errors, * -EBUSY when get_hwpoison_page() has raced with page lifecycle - * operations like allocation and free. + * operations like allocation and free, + * -EHWPOISON when the page is hwpoisoned and taken off from buddy. */ static int get_hwpoison_page(struct page *p, unsigned long flags) { int ret;
zone_pcp_disable(page_zone(p)); - ret = get_any_page(p, flags); + if (flags & MF_UNPOISON) + ret = __get_unpoison_page(p); + else + ret = get_any_page(p, flags); zone_pcp_enable(page_zone(p));
return ret; @@@ -1494,14 -1545,6 +1545,6 @@@ static int memory_failure_hugetlb(unsig lock_page(head); page_flags = head->flags;
- if (!PageHWPoison(head)) { - pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); - num_poisoned_pages_dec(); - unlock_page(head); - put_page(head); - return 0; - } - /* * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so * simply disable it. In order to make it work properly, we need @@@ -1615,6 -1658,8 +1658,8 @@@ out return rc; }
+ static DEFINE_MUTEX(mf_mutex); + /** * memory_failure - Handle memory failure of a page. * @pfn: Page Number of the corrupted page @@@ -1641,33 -1686,25 +1686,32 @@@ int memory_failure(unsigned long pfn, i int res = 0; unsigned long page_flags; bool retry = true; - static DEFINE_MUTEX(mf_mutex);
if (!sysctl_memory_failure_recovery) panic("Memory failure on page %lx", pfn);
+ mutex_lock(&mf_mutex); + p = pfn_to_online_page(pfn); if (!p) { + res = arch_memory_failure(pfn, flags); + if (res == 0) + goto unlock_mutex; + if (pfn_valid(pfn)) { pgmap = get_dev_pagemap(pfn, NULL); - if (pgmap) - return memory_failure_dev_pagemap(pfn, flags, - pgmap); + if (pgmap) { + res = memory_failure_dev_pagemap(pfn, flags, + pgmap); + goto unlock_mutex; + } } pr_err("Memory failure: %#lx: memory outside kernel control\n", pfn); - return -ENXIO; + res = -ENXIO; + goto unlock_mutex; }
- mutex_lock(&mf_mutex); - try_again: if (PageHuge(p)) { res = memory_failure_hugetlb(pfn, flags); @@@ -1782,16 -1819,6 +1826,6 @@@ */ page_flags = p->flags;
- /* - * unpoison always clear PG_hwpoison inside page lock - */ - if (!PageHWPoison(p)) { - pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); - num_poisoned_pages_dec(); - unlock_page(p); - put_page(p); - goto unlock_mutex; - } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); @@@ -1955,6 -1982,28 +1989,28 @@@ core_initcall(memory_failure_init) pr_info(fmt, pfn); \ })
+ static inline int clear_page_hwpoison(struct ratelimit_state *rs, struct page *p) + { + if (TestClearPageHWPoison(p)) { + unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", + page_to_pfn(p), rs); + num_poisoned_pages_dec(); + return 1; + } + return 0; + } + + static inline int unpoison_taken_off_page(struct ratelimit_state *rs, + struct page *p) + { + if (put_page_back_buddy(p)) { + unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", + page_to_pfn(p), rs); + return 0; + } + return -EBUSY; + } + /** * unpoison_memory - Unpoison a previously poisoned page * @pfn: Page number of the to be unpoisoned page @@@ -1971,8 -2020,7 +2027,7 @@@ int unpoison_memory(unsigned long pfn { struct page *page; struct page *p; - int freeit = 0; - unsigned long flags = 0; + int ret = -EBUSY; static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
@@@ -1982,69 -2030,60 +2037,60 @@@ p = pfn_to_page(pfn); page = compound_head(p);
+ mutex_lock(&mf_mutex); + if (!PageHWPoison(p)) { unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; }
if (page_count(page) > 1) { unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; }
if (page_mapped(page)) { unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; }
if (page_mapping(page)) { unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n", pfn, &unpoison_rs); - return 0; - } - - /* - * unpoison_memory() can encounter thp only when the thp is being - * worked by memory_failure() and the page lock is not held yet. - * In such case, we yield to memory_failure() and make unpoison fail. - */ - if (!PageHuge(page) && PageTransHuge(page)) { - unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n", - pfn, &unpoison_rs); - return 0; + goto unlock_mutex; }
- if (!get_hwpoison_page(p, flags)) { - if (TestClearPageHWPoison(p)) - num_poisoned_pages_dec(); - unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n", - pfn, &unpoison_rs); - return 0; - } + if (PageSlab(page) || PageTable(page)) + goto unlock_mutex;
- lock_page(page); - /* - * This test is racy because PG_hwpoison is set outside of page lock. - * That's acceptable because that won't trigger kernel panic. Instead, - * the PG_hwpoison page will be caught and isolated on the entrance to - * the free buddy page pool. - */ - if (TestClearPageHWPoison(page)) { - unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", - pfn, &unpoison_rs); - num_poisoned_pages_dec(); - freeit = 1; - } - unlock_page(page); + ret = get_hwpoison_page(p, MF_UNPOISON); + if (!ret) { + if (clear_page_hwpoison(&unpoison_rs, page)) + ret = 0; + else + ret = -EBUSY; + } else if (ret < 0) { + if (ret == -EHWPOISON) { + ret = unpoison_taken_off_page(&unpoison_rs, p); + } else + unpoison_pr_info("Unpoison: failed to grab page %#lx\n", + pfn, &unpoison_rs); + } else { + int freeit = clear_page_hwpoison(&unpoison_rs, p);
- put_page(page); - if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) put_page(page); + if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) { + put_page(page); + ret = 0; + } + }
- return 0; + unlock_mutex: + mutex_unlock(&mf_mutex); + return ret; } EXPORT_SYMBOL(unpoison_memory);
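A stand-alone sketch of how the rewritten unpoison_memory() above reacts to the return values of get_hwpoison_page(..., MF_UNPOISON); the enum and the dispatch() helper are invented for illustration, and only the errno value of EHWPOISON is assumed.

/* Stand-alone sketch of the return-value handling above (illustrative only). */
#include <assert.h>

#define EHWPOISON 133   /* Linux errno value, assumed here for the sketch */

enum unpoison_action {
        CLEAR_FLAG_ONLY,     /* ret == 0: no reference taken, page is free      */
        PUT_BACK_TO_BUDDY,   /* ret == -EHWPOISON: page was taken off the buddy */
        REPORT_FAILURE,      /* other ret < 0: could not grab the page          */
        CLEAR_AND_DROP_REF,  /* ret > 0: in-use page, drop the reference taken  */
};

static enum unpoison_action dispatch(int ret)
{
        if (ret == 0)
                return CLEAR_FLAG_ONLY;
        if (ret == -EHWPOISON)
                return PUT_BACK_TO_BUDDY;
        if (ret < 0)
                return REPORT_FAILURE;
        return CLEAR_AND_DROP_REF;
}

int main(void)
{
        assert(dispatch(0) == CLEAR_FLAG_ONLY);
        assert(dispatch(-EHWPOISON) == PUT_BACK_TO_BUDDY);
        assert(dispatch(1) == CLEAR_AND_DROP_REF);
        return 0;
}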
@@@ -2225,9 -2264,12 +2271,12 @@@ int soft_offline_page(unsigned long pfn return -EIO; }
+ mutex_lock(&mf_mutex); + if (PageHWPoison(page)) { pr_info("%s: %#lx page already poisoned\n", __func__, pfn); put_ref_page(ref_page); + mutex_unlock(&mf_mutex); return 0; }
@@@ -2246,5 -2288,7 +2295,7 @@@ retry } }
+ mutex_unlock(&mf_mutex); + return ret; } diff --combined mm/memory.c index 23f2f1300d42,571d02f419ba..f306e698a1e3 --- a/mm/memory.c +++ b/mm/memory.c @@@ -41,6 -41,7 +41,7 @@@
#include <linux/kernel_stat.h> #include <linux/mm.h> + #include <linux/mm_inline.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/numa_balancing.h> @@@ -719,8 -720,6 +720,6 @@@ static void restore_exclusive_pte(struc else if (is_writable_device_exclusive_entry(entry)) pte = maybe_mkwrite(pte_mkdirty(pte), vma);
- set_pte_at(vma->vm_mm, address, ptep, pte); - /* * No need to take a page reference as one was already * created when the swap entry was made. @@@ -734,6 -733,8 +733,8 @@@ */ WARN_ON_ONCE(!PageAnon(page));
+ set_pte_at(vma->vm_mm, address, ptep, pte); + if (vma->vm_flags & VM_LOCKED) mlock_vma_page(page);
@@@ -1304,28 -1305,6 +1305,28 @@@ copy_page_range(struct vm_area_struct * return ret; }
+/* + * Parameter block passed down to zap_pte_range in exceptional cases. + */ +struct zap_details { + struct address_space *zap_mapping; /* Check page->mapping if set */ + struct folio *single_folio; /* Locked folio to be unmapped */ +}; + +/* + * We set details->zap_mapping when we want to unmap shared but keep private + * pages. Return true if skip zapping this page, false otherwise. + */ +static inline bool +zap_skip_check_mapping(struct zap_details *details, struct page *page) +{ + if (!details || !page) + return false; + + return details->zap_mapping && + (details->zap_mapping != page_rmapping(page)); +} + static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, @@@ -1465,8 -1444,8 +1466,8 @@@ static inline unsigned long zap_pmd_ran else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; /* fall through */ - } else if (details && details->single_page && - PageTransCompound(details->single_page) && + } else if (details && details->single_folio && + folio_test_pmd_mappable(details->single_folio) && next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) { spinlock_t *ptl = pmd_lock(tlb->mm, pmd); /* @@@ -3354,30 -3333,31 +3355,30 @@@ static inline void unmap_mapping_range_ }
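The zap_details/zap_skip_check_mapping() pair above means: when a target mapping is given, pages that belong to a different mapping (for example private COW copies) are skipped. A stand-alone model of that predicate with invented stand-in types:

/* Stand-alone model of the skip predicate above, with invented stand-ins. */
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

struct mapping { int id; };                        /* stand-in for address_space */
struct page_model { struct mapping *mapping; };
struct details_model { struct mapping *zap_mapping; };

static bool skip_page(const struct details_model *details,
                      const struct page_model *page)
{
        if (!details || !page)
                return false;
        return details->zap_mapping && details->zap_mapping != page->mapping;
}

int main(void)
{
        struct mapping f = { 1 }, g = { 2 };
        struct page_model shared = { &f }, other = { &g };
        struct details_model d = { &f };

        assert(!skip_page(&d, &shared));   /* same mapping: zap it */
        assert(skip_page(&d, &other));     /* different mapping: keep it */
        assert(!skip_page(NULL, &shared)); /* no details: zap everything */
        return 0;
}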
/** - * unmap_mapping_page() - Unmap single page from processes. - * @page: The locked page to be unmapped. + * unmap_mapping_folio() - Unmap single folio from processes. + * @folio: The locked folio to be unmapped. * - * Unmap this page from any userspace process which still has it mmaped. + * Unmap this folio from any userspace process which still has it mmaped. * Typically, for efficiency, the range of nearby pages has already been * unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once - * truncation or invalidation holds the lock on a page, it may find that - * the page has been remapped again: and then uses unmap_mapping_page() + * truncation or invalidation holds the lock on a folio, it may find that + * the page has been remapped again: and then uses unmap_mapping_folio() * to unmap it finally. */ -void unmap_mapping_page(struct page *page) +void unmap_mapping_folio(struct folio *folio) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; struct zap_details details = { }; pgoff_t first_index; pgoff_t last_index;
- VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(PageTail(page)); + VM_BUG_ON(!folio_test_locked(folio));
- first_index = page->index; - last_index = page->index + thp_nr_pages(page) - 1; + first_index = folio->index; + last_index = folio->index + folio_nr_pages(folio) - 1;
details.zap_mapping = mapping; - details.single_page = page; + details.single_folio = folio;
i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) @@@ -3647,7 -3627,7 +3648,7 @@@ vm_fault_t do_swap_page(struct vm_faul inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); - if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { + if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); vmf->flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; @@@ -3660,8 -3640,6 +3661,6 @@@ pte = pte_mkuffd_wp(pte); pte = pte_wrprotect(pte); } - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); - arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); vmf->orig_pte = pte;
/* ksm created a completely new copy */ @@@ -3672,6 -3650,9 +3671,9 @@@ do_page_add_anon_rmap(page, vma, vmf->address, exclusive); }
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); + arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); + swap_free(entry); if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) diff --combined mm/memremap.c index 643965da13a6,a2869d8519a2..6aa5f0c2d11f --- a/mm/memremap.c +++ b/mm/memremap.c @@@ -102,16 -102,47 +102,23 @@@ static unsigned long pfn_end(struct dev return (range->start + range_len(range)) >> PAGE_SHIFT; }
- static unsigned long pfn_next(unsigned long pfn) + static unsigned long pfn_next(struct dev_pagemap *pgmap, unsigned long pfn) { - if (pfn % 1024 == 0) + if (pfn % (1024 << pgmap->vmemmap_shift)) cond_resched(); - return pfn + 1; + return pfn + pgmap_vmemmap_nr(pgmap); + } + + static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id) + { + return (pfn_end(pgmap, range_id) - + pfn_first(pgmap, range_id)) >> pgmap->vmemmap_shift; }
#define for_each_device_pfn(pfn, map, i) \ - for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn)) + for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); \ + pfn = pfn_next(map, pfn))
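With a non-zero vmemmap_shift, the device-pfn walk above advances by one compound page per step and pfn_len() counts those steps rather than raw pfns. A stand-alone model of the arithmetic; the 2MiB/shift-9 figures are an x86-64 example, not taken from this hunk.

/* Stand-alone model of the compound-page pfn walk above (illustrative only). */
#include <assert.h>

static unsigned long pfn_step(unsigned long pfn, unsigned int vmemmap_shift)
{
        return pfn + (1UL << vmemmap_shift);      /* pgmap_vmemmap_nr() */
}

static unsigned long range_steps(unsigned long first, unsigned long end,
                                 unsigned int vmemmap_shift)
{
        return (end - first) >> vmemmap_shift;    /* pfn_len() */
}

int main(void)
{
        /* Example: 2MiB compound pages, i.e. vmemmap_shift == 9, 512 pfns each. */
        unsigned long first = 0x100000, end = first + 4 * 512;

        assert(pfn_step(first, 9) == first + 512);
        assert(range_steps(first, end, 9) == 4);
        assert(range_steps(first, end, 0) == 4 * 512);  /* shift 0: every pfn */
        return 0;
}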
-static void dev_pagemap_kill(struct dev_pagemap *pgmap) -{ - if (pgmap->ops && pgmap->ops->kill) - pgmap->ops->kill(pgmap); - else - percpu_ref_kill(pgmap->ref); -} - -static void dev_pagemap_cleanup(struct dev_pagemap *pgmap) -{ - if (pgmap->ops && pgmap->ops->cleanup) { - pgmap->ops->cleanup(pgmap); - } else { - wait_for_completion(&pgmap->done); - percpu_ref_exit(pgmap->ref); - } - /* - * Undo the pgmap ref assignment for the internal case as the - * caller may re-enable the same pgmap. - */ - if (pgmap->ref == &pgmap->internal_ref) - pgmap->ref = NULL; -} - static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) { struct range *range = &pgmap->ranges[range_id]; @@@ -143,12 -174,11 +150,12 @@@ void memunmap_pages(struct dev_pagemap unsigned long pfn; int i;
- dev_pagemap_kill(pgmap); + percpu_ref_kill(&pgmap->ref); for (i = 0; i < pgmap->nr_range; i++) for_each_device_pfn(pfn, pgmap, i) put_page(pfn_to_page(pfn)); - dev_pagemap_cleanup(pgmap); + wait_for_completion(&pgmap->done); + percpu_ref_exit(&pgmap->ref);
for (i = 0; i < pgmap->nr_range; i++) pageunmap_range(pgmap, i); @@@ -165,7 -195,8 +172,7 @@@ static void devm_memremap_pages_release
static void dev_pagemap_percpu_release(struct percpu_ref *ref) { - struct dev_pagemap *pgmap = - container_of(ref, struct dev_pagemap, internal_ref); + struct dev_pagemap *pgmap = container_of(ref, struct dev_pagemap, ref);
complete(&pgmap->done); } @@@ -271,8 -302,7 +278,7 @@@ static int pagemap_range(struct dev_pag memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], PHYS_PFN(range->start), PHYS_PFN(range_len(range)), pgmap); - percpu_ref_get_many(&pgmap->ref, - pfn_end(pgmap, range_id) - pfn_first(pgmap, range_id)); - percpu_ref_get_many(pgmap->ref, pfn_len(pgmap, range_id)); ++ percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id)); return 0;
err_add_memory: @@@ -338,11 -368,22 +344,11 @@@ void *memremap_pages(struct dev_pagema break; }
- if (!pgmap->ref) { - if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup)) - return ERR_PTR(-EINVAL); - - init_completion(&pgmap->done); - error = percpu_ref_init(&pgmap->internal_ref, - dev_pagemap_percpu_release, 0, GFP_KERNEL); - if (error) - return ERR_PTR(error); - pgmap->ref = &pgmap->internal_ref; - } else { - if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) { - WARN(1, "Missing reference count teardown definition\n"); - return ERR_PTR(-EINVAL); - } - } + init_completion(&pgmap->done); + error = percpu_ref_init(&pgmap->ref, dev_pagemap_percpu_release, 0, + GFP_KERNEL); + if (error) + return ERR_PTR(error);
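After this simplification the reference count is always the embedded pgmap->ref, initialized by memremap_pages() itself, and the ops->kill/ops->cleanup callbacks and the external-ref plumbing are gone. A hedged sketch of what a caller now provides (the range values are hypothetical; the point is only that the caller no longer touches the refcount):

    /* Sketch only: device memory registered with the simplified refcounting. */
    static struct dev_pagemap demo_pgmap = {
        .type = MEMORY_DEVICE_GENERIC,
        .nr_range = 1,
        .range = {
            .start = 0x100000000ULL,   /* hypothetical physical range */
            .end   = 0x13fffffffULL,
        },
    };

    static void *demo_map(struct device *dev)
    {
        /* memremap_pages()/devm_memremap_pages() now initialize and tear
         * down demo_pgmap.ref internally (see memunmap_pages() above). */
        return devm_memremap_pages(dev, &demo_pgmap);
    }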
devmap_managed_enable_get(pgmap);
@@@ -451,7 -492,7 +457,7 @@@ struct dev_pagemap *get_dev_pagemap(uns /* fall back to slow path lookup */ rcu_read_lock(); pgmap = xa_load(&pgmap_array, PHYS_PFN(phys)); - if (pgmap && !percpu_ref_tryget_live(pgmap->ref)) + if (pgmap && !percpu_ref_tryget_live(&pgmap->ref)) pgmap = NULL; rcu_read_unlock();
diff --combined mm/migrate.c index 7079e6b7dbe7,05af2b2336b9..18ce840914f0 --- a/mm/migrate.c +++ b/mm/migrate.c @@@ -50,6 -50,7 +50,7 @@@ #include <linux/ptrace.h> #include <linux/oom.h> #include <linux/memory.h> + #include <linux/random.h>
#include <asm/tlbflush.h>
@@@ -236,20 -237,19 +237,19 @@@ static bool remove_migration_pte(struc
pte = pte_mkhuge(pte); pte = arch_make_huge_pte(pte, shift, vma->vm_flags); - set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); if (PageAnon(new)) hugepage_add_anon_rmap(new, vma, pvmw.address); else page_dup_rmap(new, true); + set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } else #endif { - set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); - if (PageAnon(new)) page_add_anon_rmap(new, vma, pvmw.address, false); else page_add_file_rmap(new, false); + set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) mlock_vma_page(new); @@@ -291,7 -291,7 +291,7 @@@ void __migration_entry_wait(struct mm_s { pte_t pte; swp_entry_t entry; - struct page *page; + struct folio *folio;
spin_lock(ptl); pte = *ptep; @@@ -302,17 -302,18 +302,17 @@@ if (!is_migration_entry(entry)) goto out;
- page = pfn_swap_entry_to_page(entry); - page = compound_head(page); + folio = page_folio(pfn_swap_entry_to_page(entry));
/* * Once page cache replacement of page migration started, page_count - * is zero; but we must not call put_and_wait_on_page_locked() without - * a ref. Use get_page_unless_zero(), and just fault again if it fails. + * is zero; but we must not call folio_put_wait_locked() without + * a ref. Use folio_try_get(), and just fault again if it fails. */ - if (!get_page_unless_zero(page)) + if (!folio_try_get(folio)) goto out; pte_unmap_unlock(ptep, ptl); - put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); + folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE); return; out: pte_unmap_unlock(ptep, ptl); @@@ -337,16 -338,16 +337,16 @@@ void migration_entry_wait_huge(struct v void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) { spinlock_t *ptl; - struct page *page; + struct folio *folio;
ptl = pmd_lock(mm, pmd); if (!is_pmd_migration_entry(*pmd)) goto unlock; - page = pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd)); - if (!get_page_unless_zero(page)) + folio = page_folio(pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd))); + if (!folio_try_get(folio)) goto unlock; spin_unlock(ptl); - put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); + folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE); return; unlock: spin_unlock(ptl); @@@ -433,6 -434,14 +433,6 @@@ int folio_migrate_mapping(struct addres }
xas_store(&xas, newfolio); - if (nr > 1) { - int i; - - for (i = 1; i < nr; i++) { - xas_next(&xas); - xas_store(&xas, newfolio); - } - }
/* * Drop cache reference from old page by unfreezing @@@ -1084,80 -1093,6 +1084,6 @@@ out return rc; }
- - /* - * node_demotion[] example: - * - * Consider a system with two sockets. Each socket has - * three classes of memory attached: fast, medium and slow. - * Each memory class is placed in its own NUMA node. The - * CPUs are placed in the node with the "fast" memory. The - * 6 NUMA nodes (0-5) might be split among the sockets like - * this: - * - * Socket A: 0, 1, 2 - * Socket B: 3, 4, 5 - * - * When Node 0 fills up, its memory should be migrated to - * Node 1. When Node 1 fills up, it should be migrated to - * Node 2. The migration path start on the nodes with the - * processors (since allocations default to this node) and - * fast memory, progress through medium and end with the - * slow memory: - * - * 0 -> 1 -> 2 -> stop - * 3 -> 4 -> 5 -> stop - * - * This is represented in the node_demotion[] like this: - * - * { 1, // Node 0 migrates to 1 - * 2, // Node 1 migrates to 2 - * -1, // Node 2 does not migrate - * 4, // Node 3 migrates to 4 - * 5, // Node 4 migrates to 5 - * -1} // Node 5 does not migrate - */ - - /* - * Writes to this array occur without locking. Cycles are - * not allowed: Node X demotes to Y which demotes to X... - * - * If multiple reads are performed, a single rcu_read_lock() - * must be held over all reads to ensure that no cycles are - * observed. - */ - static int node_demotion[MAX_NUMNODES] __read_mostly = - {[0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE}; - - /** - * next_demotion_node() - Get the next node in the demotion path - * @node: The starting node to lookup the next node - * - * Return: node id for next memory node in the demotion path hierarchy - * from @node; NUMA_NO_NODE if @node is terminal. This does not keep - * @node online or guarantee that it *continues* to be the next demotion - * target. - */ - int next_demotion_node(int node) - { - int target; - - /* - * node_demotion[] is updated without excluding this - * function from running. RCU doesn't provide any - * compiler barriers, so the READ_ONCE() is required - * to avoid compiler reordering or read merging. - * - * Make sure to use RCU over entire code blocks if - * node_demotion[] reads need to be consistent. - */ - rcu_read_lock(); - target = READ_ONCE(node_demotion[node]); - rcu_read_unlock(); - - return target; - } - /* * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. @@@ -1413,7 -1348,7 +1339,7 @@@ static inline int try_split_thp(struct * @mode: The migration mode that specifies the constraints for * page migration, if any. * @reason: The reason for page migration. - * @ret_succeeded: Set to the number of pages migrated successfully if + * @ret_succeeded: Set to the number of normal pages migrated successfully if * the caller passes a non-NULL pointer. * * The function returns after 10 attempts or if no pages are movable any more @@@ -1421,7 -1356,9 +1347,9 @@@ * It is caller's responsibility to call putback_movable_pages() to return pages * to the LRU or free list only if ret != 0. * - * Returns the number of pages that were not migrated, or an error code. + * Returns the number of {normal page, THP, hugetlb} that were not migrated, or + * an error code. The number of THP splits will be considered as the number of + * non-migrated THP, no matter how many subpages of the THP are migrated successfully. 
*/ int migrate_pages(struct list_head *from, new_page_t get_new_page, free_page_t put_new_page, unsigned long private, @@@ -1430,6 -1367,7 +1358,7 @@@ int retry = 1; int thp_retry = 1; int nr_failed = 0; + int nr_failed_pages = 0; int nr_succeeded = 0; int nr_thp_succeeded = 0; int nr_thp_failed = 0; @@@ -1441,13 -1379,16 +1370,16 @@@ int swapwrite = current->flags & PF_SWAPWRITE; int rc, nr_subpages; LIST_HEAD(ret_pages); + LIST_HEAD(thp_split_pages); bool nosplit = (reason == MR_NUMA_MISPLACED); + bool no_subpage_counting = false;
trace_mm_migrate_pages_start(mode, reason);
if (!swapwrite) current->flags |= PF_SWAPWRITE;
+ thp_subpage_migration: for (pass = 0; pass < 10 && (retry || thp_retry); pass++) { retry = 0; thp_retry = 0; @@@ -1460,7 -1401,7 +1392,7 @@@ retry * during migration. */ is_thp = PageTransHuge(page) && !PageHuge(page); - nr_subpages = thp_nr_pages(page); + nr_subpages = compound_nr(page); cond_resched();
if (PageHuge(page)) @@@ -1496,18 -1437,20 +1428,20 @@@ case -ENOSYS: /* THP migration is unsupported */ if (is_thp) { - if (!try_split_thp(page, &page2, from)) { + nr_thp_failed++; + if (!try_split_thp(page, &page2, &thp_split_pages)) { nr_thp_split++; goto retry; }
- nr_thp_failed++; - nr_failed += nr_subpages; + nr_failed_pages += nr_subpages; break; }
/* Hugetlb migration is unsupported */ - nr_failed++; + if (!no_subpage_counting) + nr_failed++; + nr_failed_pages += nr_subpages; break; case -ENOMEM: /* @@@ -1516,16 -1459,19 +1450,19 @@@ * THP NUMA faulting doesn't split THP to retry. */ if (is_thp && !nosplit) { - if (!try_split_thp(page, &page2, from)) { + nr_thp_failed++; + if (!try_split_thp(page, &page2, &thp_split_pages)) { nr_thp_split++; goto retry; }
- nr_thp_failed++; - nr_failed += nr_subpages; + nr_failed_pages += nr_subpages; goto out; } - nr_failed++; + + if (!no_subpage_counting) + nr_failed++; + nr_failed_pages += nr_subpages; goto out; case -EAGAIN: if (is_thp) { @@@ -1535,12 -1481,11 +1472,11 @@@ retry++; break; case MIGRATEPAGE_SUCCESS: + nr_succeeded += nr_subpages; if (is_thp) { nr_thp_succeeded++; - nr_succeeded += nr_subpages; break; } - nr_succeeded++; break; default: /* @@@ -1551,17 -1496,37 +1487,37 @@@ */ if (is_thp) { nr_thp_failed++; - nr_failed += nr_subpages; + nr_failed_pages += nr_subpages; break; } - nr_failed++; + + if (!no_subpage_counting) + nr_failed++; + nr_failed_pages += nr_subpages; break; } } } - nr_failed += retry + thp_retry; + nr_failed += retry; nr_thp_failed += thp_retry; - rc = nr_failed; + /* + * Try to migrate subpages of fail-to-migrate THPs, no nr_failed + * counting in this round, since all subpages of a THP is counted + * as 1 failure in the first round. + */ + if (!list_empty(&thp_split_pages)) { + /* + * Move non-migrated pages (after 10 retries) to ret_pages + * to avoid migrating them again. + */ + list_splice_init(from, &ret_pages); + list_splice_init(&thp_split_pages, from); + no_subpage_counting = true; + retry = 1; + goto thp_subpage_migration; + } + + rc = nr_failed + nr_thp_failed; out: /* * Put the permanent failure page back to migration list, they @@@ -1570,11 -1535,11 +1526,11 @@@ list_splice(&ret_pages, from);
count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); - count_vm_events(PGMIGRATE_FAIL, nr_failed); + count_vm_events(PGMIGRATE_FAIL, nr_failed_pages); count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded); count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed); count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split); - trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded, + trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded, nr_thp_failed, nr_thp_split, mode, reason);
if (!swapwrite) @@@ -2516,8 -2481,7 +2472,7 @@@ static bool migrate_vma_check_page(stru static void migrate_vma_unmap(struct migrate_vma *migrate) { const unsigned long npages = migrate->npages; - const unsigned long start = migrate->start; - unsigned long addr, i, restore = 0; + unsigned long i, restore = 0; bool allow_drain = true;
lru_add_drain(); @@@ -2563,7 -2527,7 +2518,7 @@@ } }
- for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) { + for (i = 0; i < npages && restore; i++) { struct page *page = migrate_pfn_to_page(migrate->src[i]);
if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) @@@ -2961,14 -2925,152 +2916,152 @@@ void migrate_vma_finalize(struct migrat EXPORT_SYMBOL(migrate_vma_finalize); #endif /* CONFIG_DEVICE_PRIVATE */
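The migrate_pages() hunks above change what the counters mean: the return value is now the number of {normal page, THP, hugetlb} units that were not migrated (a split THP still counts as one failed THP), while *ret_succeeded and the PGMIGRATE_* vmstat events count individual base pages. A hedged caller-side sketch (the allocation callback is hypothetical; the call sequence follows the kernel-doc above):

    /* Sketch: interpreting migrate_pages() results after this series. */
    static struct page *demo_alloc_target(struct page *page, unsigned long private)
    {
        /* a real callback would use @page/@private to pick a target node */
        return alloc_page(GFP_HIGHUSER_MOVABLE);
    }

    static void demo_migrate(struct list_head *pages)
    {
        unsigned int nr_pages_ok = 0;
        int nr_units_failed;

        nr_units_failed = migrate_pages(pages, demo_alloc_target, NULL, 0,
                                        MIGRATE_SYNC, MR_NUMA_MISPLACED,
                                        &nr_pages_ok);
        if (nr_units_failed)        /* THPs and hugetlb count once each here */
            putback_movable_pages(pages);
        /* nr_pages_ok counts base pages, including subpages of migrated THPs */
    }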
+ /* + * node_demotion[] example: + * + * Consider a system with two sockets. Each socket has + * three classes of memory attached: fast, medium and slow. + * Each memory class is placed in its own NUMA node. The + * CPUs are placed in the node with the "fast" memory. The + * 6 NUMA nodes (0-5) might be split among the sockets like + * this: + * + * Socket A: 0, 1, 2 + * Socket B: 3, 4, 5 + * + * When Node 0 fills up, its memory should be migrated to + * Node 1. When Node 1 fills up, it should be migrated to + * Node 2. The migration path start on the nodes with the + * processors (since allocations default to this node) and + * fast memory, progress through medium and end with the + * slow memory: + * + * 0 -> 1 -> 2 -> stop + * 3 -> 4 -> 5 -> stop + * + * This is represented in the node_demotion[] like this: + * + * { nr=1, nodes[0]=1 }, // Node 0 migrates to 1 + * { nr=1, nodes[0]=2 }, // Node 1 migrates to 2 + * { nr=0, nodes[0]=-1 }, // Node 2 does not migrate + * { nr=1, nodes[0]=4 }, // Node 3 migrates to 4 + * { nr=1, nodes[0]=5 }, // Node 4 migrates to 5 + * { nr=0, nodes[0]=-1 }, // Node 5 does not migrate + * + * Moreover some systems may have multiple slow memory nodes. + * Suppose a system has one socket with 3 memory nodes, node 0 + * is fast memory type, and node 1/2 both are slow memory + * type, and the distance between fast memory node and slow + * memory node is same. So the migration path should be: + * + * 0 -> 1/2 -> stop + * + * This is represented in the node_demotion[] like this: + * { nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2 + * { nr=0, nodes[0]=-1, }, // Node 1 dose not migrate + * { nr=0, nodes[0]=-1, }, // Node 2 does not migrate + */ + + /* + * Writes to this array occur without locking. Cycles are + * not allowed: Node X demotes to Y which demotes to X... + * + * If multiple reads are performed, a single rcu_read_lock() + * must be held over all reads to ensure that no cycles are + * observed. + */ + #define DEFAULT_DEMOTION_TARGET_NODES 15 + + #if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES + #define DEMOTION_TARGET_NODES (MAX_NUMNODES - 1) + #else + #define DEMOTION_TARGET_NODES DEFAULT_DEMOTION_TARGET_NODES + #endif + + struct demotion_nodes { + unsigned short nr; + short nodes[DEMOTION_TARGET_NODES]; + }; + + static struct demotion_nodes *node_demotion __read_mostly; + + /** + * next_demotion_node() - Get the next node in the demotion path + * @node: The starting node to lookup the next node + * + * Return: node id for next memory node in the demotion path hierarchy + * from @node; NUMA_NO_NODE if @node is terminal. This does not keep + * @node online or guarantee that it *continues* to be the next demotion + * target. + */ + int next_demotion_node(int node) + { + struct demotion_nodes *nd; + unsigned short target_nr, index; + int target; + + if (!node_demotion) + return NUMA_NO_NODE; + + nd = &node_demotion[node]; + + /* + * node_demotion[] is updated without excluding this + * function from running. RCU doesn't provide any + * compiler barriers, so the READ_ONCE() is required + * to avoid compiler reordering or read merging. + * + * Make sure to use RCU over entire code blocks if + * node_demotion[] reads need to be consistent. + */ + rcu_read_lock(); + target_nr = READ_ONCE(nd->nr); + + switch (target_nr) { + case 0: + target = NUMA_NO_NODE; + goto out; + case 1: + index = 0; + break; + default: + /* + * If there are multiple target nodes, just select one + * target node randomly. 
+ * + * In addition, we can also use round-robin to select + * target node, but we should introduce another variable + * for node_demotion[] to record last selected target node, + * that may cause cache ping-pong due to the changing of + * last target node. Or introducing per-cpu data to avoid + * caching issue, which seems more complicated. So selecting + * target node randomly seems better until now. + */ + index = get_random_int() % target_nr; + break; + } + + target = READ_ONCE(nd->nodes[index]); + + out: + rcu_read_unlock(); + return target; + } + #if defined(CONFIG_HOTPLUG_CPU) /* Disable reclaim-based migration. */ static void __disable_all_migrate_targets(void) { - int node; + int node, i; + + if (!node_demotion) + return;
- for_each_online_node(node) - node_demotion[node] = NUMA_NO_NODE; + for_each_online_node(node) { + node_demotion[node].nr = 0; + for (i = 0; i < DEMOTION_TARGET_NODES; i++) + node_demotion[node].nodes[i] = NUMA_NO_NODE; + } }
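node_demotion[] now holds a struct demotion_nodes per source node, so a node can demote to several targets that sit at the same distance, and next_demotion_node() picks one of them at random under rcu_read_lock(). A small stand-alone C model of that selection policy (rand() stands in for get_random_int(); the node numbering is invented):

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    #define NUMA_NO_NODE (-1)
    #define DEMOTION_TARGET_NODES 15

    struct demotion_nodes {
        unsigned short nr;
        short nodes[DEMOTION_TARGET_NODES];
    };

    /* Mirror of the policy above: 0 targets -> terminal, 1 -> fixed, N -> random. */
    static int pick_demotion_target(const struct demotion_nodes *nd)
    {
        switch (nd->nr) {
        case 0:
            return NUMA_NO_NODE;
        case 1:
            return nd->nodes[0];
        default:
            return nd->nodes[rand() % nd->nr];
        }
    }

    int main(void)
    {
        /* Node 0 with two equally distant slow nodes, as in the comment above. */
        struct demotion_nodes nd = { .nr = 2, .nodes = { 1, 2 } };

        srand(time(NULL));
        for (int i = 0; i < 5; i++)
            printf("demote to node %d\n", pick_demotion_target(&nd));
        return 0;
    }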
static void disable_all_migrate_targets(void) @@@ -2995,26 -3097,40 +3088,40 @@@ * Failing here is OK. It might just indicate * being at the end of a chain. */ - static int establish_migrate_target(int node, nodemask_t *used) + static int establish_migrate_target(int node, nodemask_t *used, + int best_distance) { - int migration_target; + int migration_target, index, val; + struct demotion_nodes *nd;
- /* - * Can not set a migration target on a - * node with it already set. - * - * No need for READ_ONCE() here since this - * in the write path for node_demotion[]. - * This should be the only thread writing. - */ - if (node_demotion[node] != NUMA_NO_NODE) + if (!node_demotion) return NUMA_NO_NODE;
+ nd = &node_demotion[node]; + migration_target = find_next_best_node(node, used); if (migration_target == NUMA_NO_NODE) return NUMA_NO_NODE;
- node_demotion[node] = migration_target; + /* + * If the node has been set a migration target node before, + * which means it's the best distance between them. Still + * check if this node can be demoted to other target nodes + * if they have a same best distance. + */ + if (best_distance != -1) { + val = node_distance(node, migration_target); + if (val > best_distance) + return NUMA_NO_NODE; + } + + index = nd->nr; + if (WARN_ONCE(index >= DEMOTION_TARGET_NODES, + "Exceeds maximum demotion target nodes\n")) + return NUMA_NO_NODE; + + nd->nodes[index] = migration_target; + nd->nr++;
return migration_target; } @@@ -3030,7 -3146,9 +3137,9 @@@ * * The difference here is that cycles must be avoided. If * node0 migrates to node1, then neither node1, nor anything - * node1 migrates to can migrate to node0. + * node1 migrates to can migrate to node0. Also one node can + * be migrated to multiple nodes if the target nodes all have + * a same best-distance against the source node. * * This function can run simultaneously with readers of * node_demotion[]. However, it can not run simultaneously @@@ -3042,7 -3160,7 +3151,7 @@@ static void __set_migration_target_node nodemask_t next_pass = NODE_MASK_NONE; nodemask_t this_pass = NODE_MASK_NONE; nodemask_t used_targets = NODE_MASK_NONE; - int node; + int node, best_distance;
/* * Avoid any oddities like cycles that could occur @@@ -3071,18 -3189,33 +3180,33 @@@ again * multiple source nodes to share a destination. */ nodes_or(used_targets, used_targets, this_pass); - for_each_node_mask(node, this_pass) { - int target_node = establish_migrate_target(node, &used_targets);
- if (target_node == NUMA_NO_NODE) - continue; + for_each_node_mask(node, this_pass) { + best_distance = -1;
/* - * Visit targets from this pass in the next pass. - * Eventually, every node will have been part of - * a pass, and will become set in 'used_targets'. + * Try to set up the migration path for the node, and the target + * migration nodes can be multiple, so doing a loop to find all + * the target nodes if they all have a best node distance. */ - node_set(target_node, next_pass); + do { + int target_node = + establish_migrate_target(node, &used_targets, + best_distance); + + if (target_node == NUMA_NO_NODE) + break; + + if (best_distance == -1) + best_distance = node_distance(node, target_node); + + /* + * Visit targets from this pass in the next pass. + * Eventually, every node will have been part of + * a pass, and will become set in 'used_targets'. + */ + node_set(target_node, next_pass); + } while (1); } /* * 'next_pass' contains nodes which became migration @@@ -3183,6 -3316,11 +3307,11 @@@ static int __init migrate_on_reclaim_in { int ret;
+ node_demotion = kmalloc_array(nr_node_ids, + sizeof(struct demotion_nodes), + GFP_KERNEL); + WARN_ON(!node_demotion); + ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline", NULL, migration_offline_cpu); /* diff --combined mm/shmem.c index 28d627444a24,0700e9acf53b..66909efd0a1b --- a/mm/shmem.c +++ b/mm/shmem.c @@@ -554,7 -554,7 +554,7 @@@ static unsigned long shmem_unused_huge_ struct shmem_inode_info *info; struct page *page; unsigned long batch = sc ? sc->nr_to_scan : 128; - int removed = 0, split = 0; + int split = 0;
if (list_empty(&sbinfo->shrinklist)) return SHRINK_STOP; @@@ -569,7 -569,6 +569,6 @@@ /* inode is about to be evicted */ if (!inode) { list_del_init(&info->shrinklist); - removed++; goto next; }
@@@ -577,12 -576,12 +576,12 @@@ if (round_up(inode->i_size, PAGE_SIZE) == round_up(inode->i_size, HPAGE_PMD_SIZE)) { list_move(&info->shrinklist, &to_remove); - removed++; goto next; }
list_move(&info->shrinklist, &list); next: + sbinfo->shrinklist_len--; if (!--batch) break; } @@@ -602,7 -601,7 +601,7 @@@ inode = &info->vfs_inode;
if (nr_to_split && split >= nr_to_split) - goto leave; + goto move_back;
page = find_get_page(inode->i_mapping, (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT); @@@ -616,38 -615,44 +615,44 @@@ }
/* - * Leave the inode on the list if we failed to lock - * the page at this time. + * Move the inode on the list back to shrinklist if we failed + * to lock the page at this time. * * Waiting for the lock may lead to deadlock in the * reclaim path. */ if (!trylock_page(page)) { put_page(page); - goto leave; + goto move_back; }
ret = split_huge_page(page); unlock_page(page); put_page(page);
- /* If split failed leave the inode on the list */ + /* If split failed move the inode on the list back to shrinklist */ if (ret) - goto leave; + goto move_back;
split++; drop: list_del_init(&info->shrinklist); - removed++; - leave: + goto put; + move_back: + /* + * Make sure the inode is either on the global list or deleted + * from any local list before iput() since it could be deleted + * in another thread once we put the inode (then the local list + * is corrupted). + */ + spin_lock(&sbinfo->shrinklist_lock); + list_move(&info->shrinklist, &sbinfo->shrinklist); + sbinfo->shrinklist_len++; + spin_unlock(&sbinfo->shrinklist_lock); + put: iput(inode); }
- spin_lock(&sbinfo->shrinklist_lock); - list_splice_tail(&list, &sbinfo->shrinklist); - sbinfo->shrinklist_len -= removed; - spin_unlock(&sbinfo->shrinklist_lock); - return split; }
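The move_back change above is about object lifetime: an inode taken off the global shrinklist must be put back on it, under shrinklist_lock, before iput() drops the reference, otherwise a concurrent final iput() could free the inode while it is still linked into the function-local list. Condensed into a sketch of the ordering the hunk enforces (names taken from the code above, not a new interface):

    /* Sketch of the move_back ordering in shmem_unused_huge_shrink(). */
    spin_lock(&sbinfo->shrinklist_lock);
    list_move(&info->shrinklist, &sbinfo->shrinklist); /* back on the global list */
    sbinfo->shrinklist_len++;
    spin_unlock(&sbinfo->shrinklist_lock);
    iput(inode);    /* only now: the inode is no longer reachable solely
                     * through the local list, so eviction cannot corrupt it */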
@@@ -694,6 -699,7 +699,6 @@@ static int shmem_add_to_page_cache(stru struct mm_struct *charge_mm) { XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page)); - unsigned long i = 0; unsigned long nr = compound_nr(page); int error;
@@@ -720,18 -726,20 +725,18 @@@ cgroup_throttle_swaprate(page, gfp);
do { - void *entry; xas_lock_irq(&xas); - entry = xas_find_conflict(&xas); - if (entry != expected) + if (expected != xas_find_conflict(&xas)) { xas_set_err(&xas, -EEXIST); - xas_create_range(&xas); - if (xas_error(&xas)) goto unlock; -next: - xas_store(&xas, page); - if (++i < nr) { - xas_next(&xas); - goto next; } + if (expected && xas_find_conflict(&xas)) { + xas_set_err(&xas, -EEXIST); + goto unlock; + } + xas_store(&xas, page); + if (xas_error(&xas)) + goto unlock; if (PageTransHuge(page)) { count_vm_event(THP_FILE_ALLOC); __mod_lruvec_page_state(page, NR_SHMEM_THPS, nr); @@@ -877,26 -885,30 +882,26 @@@ void shmem_unlock_mapping(struct addres } }
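shmem_add_to_page_cache() can drop its per-subpage store loop because the XA_STATE_ORDER() above makes the xa_state cover compound_order(page) indices, and a single xas_store() then installs the entry over the whole range. A hedged sketch of that multi-index idiom (mapping, index, order, page and gfp are placeholders):

    /* Sketch: store one compound page as a single multi-index XArray entry. */
    XA_STATE_ORDER(xas, &mapping->i_pages, index, order);

    do {
        xas_lock_irq(&xas);
        xas_store(&xas, page);      /* covers all 1 << order indices at once */
        xas_unlock_irq(&xas);
    } while (xas_nomem(&xas, gfp)); /* retry if a node allocation was needed */

    if (xas_error(&xas))
        return xas_error(&xas);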
-/* - * Check whether a hole-punch or truncation needs to split a huge page, - * returning true if no split was required, or the split has been successful. - * - * Eviction (or truncation to 0 size) should never need to split a huge page; - * but in rare cases might do so, if shmem_undo_range() failed to trylock on - * head, and then succeeded to trylock on tail. - * - * A split can only succeed when there are no additional references on the - * huge page: so the split below relies upon find_get_entries() having stopped - * when it found a subpage of the huge page, without getting further references. - */ -static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end) +static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) { - if (!PageTransCompound(page)) - return true; - - /* Just proceed to delete a huge page wholly within the range punched */ - if (PageHead(page) && - page->index >= start && page->index + HPAGE_PMD_NR <= end) - return true; + struct folio *folio; + struct page *page;
- /* Try to split huge page, so we can truly punch the hole or truncate */ - return split_huge_page(page) >= 0; + /* + * At first avoid shmem_getpage(,,,SGP_READ): that fails + * beyond i_size, and reports fallocated pages as holes. + */ + folio = __filemap_get_folio(inode->i_mapping, index, + FGP_ENTRY | FGP_LOCK, 0); + if (!xa_is_value(folio)) + return folio; + /* + * But read a page back from swap if any of it is within i_size + * (although in some cases this is just a waste of time). + */ + page = NULL; + shmem_getpage(inode, index, &page, SGP_READ); + return page ? page_folio(page) : NULL; }
/* @@@ -910,10 -922,10 +915,10 @@@ static void shmem_undo_range(struct ino struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; pgoff_t end = (lend + 1) >> PAGE_SHIFT; - unsigned int partial_start = lstart & (PAGE_SIZE - 1); - unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1); - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t indices[PAGEVEC_SIZE]; + struct folio *folio; + bool same_folio; long nr_swaps_freed = 0; pgoff_t index; int i; @@@ -924,64 -936,67 +929,64 @@@ if (info->fallocend > start && info->fallocend <= end && !unfalloc) info->fallocend = start;
- pagevec_init(&pvec); + folio_batch_init(&fbatch); index = start; while (index < end && find_lock_entries(mapping, index, end - 1, - &pvec, indices)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + &fbatch, indices)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + folio = fbatch.folios[i];
index = indices[i];
- if (xa_is_value(page)) { + if (xa_is_value(folio)) { if (unfalloc) continue; nr_swaps_freed += !shmem_free_swap(mapping, - index, page); + index, folio); continue; } - index += thp_nr_pages(page) - 1; + index += folio_nr_pages(folio) - 1;
- if (!unfalloc || !PageUptodate(page)) - truncate_inode_page(mapping, page); - unlock_page(page); + if (!unfalloc || !folio_test_uptodate(folio)) + truncate_inode_folio(mapping, folio); + folio_unlock(folio); } - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); + folio_batch_remove_exceptionals(&fbatch); + folio_batch_release(&fbatch); cond_resched(); index++; }
- if (partial_start) { - struct page *page = NULL; - shmem_getpage(inode, start - 1, &page, SGP_READ); - if (page) { - unsigned int top = PAGE_SIZE; - if (start > end) { - top = partial_end; - partial_end = 0; - } - zero_user_segment(page, partial_start, top); - set_page_dirty(page); - unlock_page(page); - put_page(page); + same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); + folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT); + if (folio) { + same_folio = lend < folio_pos(folio) + folio_size(folio); + folio_mark_dirty(folio); + if (!truncate_inode_partial_folio(folio, lstart, lend)) { + start = folio->index + folio_nr_pages(folio); + if (same_folio) + end = folio->index; } + folio_unlock(folio); + folio_put(folio); + folio = NULL; } - if (partial_end) { - struct page *page = NULL; - shmem_getpage(inode, end, &page, SGP_READ); - if (page) { - zero_user_segment(page, 0, partial_end); - set_page_dirty(page); - unlock_page(page); - put_page(page); - } + + if (!same_folio) + folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT); + if (folio) { + folio_mark_dirty(folio); + if (!truncate_inode_partial_folio(folio, lstart, lend)) + end = folio->index; + folio_unlock(folio); + folio_put(folio); } - if (start >= end) - return;
index = start; while (index < end) { cond_resched();
- if (!find_get_entries(mapping, index, end - 1, &pvec, + if (!find_get_entries(mapping, index, end - 1, &fbatch, indices)) { /* If all gone or hole-punch or unfalloc, we're done */ if (index == start || end != -1) @@@ -990,14 -1005,14 +995,14 @@@ index = start; continue; } - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < folio_batch_count(&fbatch); i++) { + folio = fbatch.folios[i];
index = indices[i]; - if (xa_is_value(page)) { + if (xa_is_value(folio)) { if (unfalloc) continue; - if (shmem_free_swap(mapping, index, page)) { + if (shmem_free_swap(mapping, index, folio)) { /* Swap was replaced by page: retry */ index--; break; @@@ -1006,24 -1021,32 +1011,24 @@@ continue; }
- lock_page(page); + folio_lock(folio);
- if (!unfalloc || !PageUptodate(page)) { - if (page_mapping(page) != mapping) { + if (!unfalloc || !folio_test_uptodate(folio)) { + if (folio_mapping(folio) != mapping) { /* Page was replaced by swap: retry */ - unlock_page(page); + folio_unlock(folio); index--; break; } - VM_BUG_ON_PAGE(PageWriteback(page), page); - if (shmem_punch_compound(page, start, end)) - truncate_inode_page(mapping, page); - else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - /* Wipe the page and don't get stuck */ - clear_highpage(page); - flush_dcache_page(page); - set_page_dirty(page); - if (index < - round_up(start, HPAGE_PMD_NR)) - start = index + 1; - } + VM_BUG_ON_FOLIO(folio_test_writeback(folio), + folio); + truncate_inode_folio(mapping, folio); } - unlock_page(page); + index = folio->index + folio_nr_pages(folio) - 1; + folio_unlock(folio); } - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); + folio_batch_remove_exceptionals(&fbatch); + folio_batch_release(&fbatch); index++; }
@@@ -1541,8 -1564,7 +1546,7 @@@ static struct page *shmem_alloc_hugepag return NULL;
shmem_pseudo_vma_init(&pvma, info, hindex); - page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), - true); + page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, true); shmem_pseudo_vma_destroy(&pvma); if (page) prep_transhuge_page(page); @@@ -2439,6 -2461,7 +2443,7 @@@ shmem_write_begin(struct file *file, st struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t index = pos >> PAGE_SHIFT; + int ret = 0;
/* i_rwsem is held by caller */ if (unlikely(info->seals & (F_SEAL_GROW | @@@ -2449,7 -2472,19 +2454,19 @@@ return -EPERM; }
- return shmem_getpage(inode, index, pagep, SGP_WRITE); + ret = shmem_getpage(inode, index, pagep, SGP_WRITE); + + if (ret) + return ret; + + if (PageHWPoison(*pagep)) { + unlock_page(*pagep); + put_page(*pagep); + *pagep = NULL; + return -EIO; + } + + return 0; }
static int @@@ -2536,6 -2571,12 +2553,12 @@@ static ssize_t shmem_file_read_iter(str if (sgp == SGP_CACHE) set_page_dirty(page); unlock_page(page); + + if (PageHWPoison(page)) { + put_page(page); + error = -EIO; + break; + } }
/* @@@ -3075,7 -3116,8 +3098,8 @@@ static const char *shmem_get_link(struc page = find_get_page(inode->i_mapping, 0); if (!page) return ERR_PTR(-ECHILD); - if (!PageUptodate(page)) { + if (PageHWPoison(page) || + !PageUptodate(page)) { put_page(page); return ERR_PTR(-ECHILD); } @@@ -3083,6 -3125,13 +3107,13 @@@ error = shmem_getpage(inode, 0, &page, SGP_READ); if (error) return ERR_PTR(error); + if (!page) + return ERR_PTR(-ECHILD); + if (PageHWPoison(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-ECHILD); + } unlock_page(page); } set_delayed_call(done, shmem_put_link, page); @@@ -3733,6 -3782,13 +3764,13 @@@ static void shmem_destroy_inodecache(vo kmem_cache_destroy(shmem_inode_cachep); }
+ /* Keep the page in page cache instead of truncating it */ + static int shmem_error_remove_page(struct address_space *mapping, + struct page *page) + { + return 0; + } + const struct address_space_operations shmem_aops = { .writepage = shmem_writepage, .set_page_dirty = __set_page_dirty_no_writeback, @@@ -3743,7 -3799,7 +3781,7 @@@ #ifdef CONFIG_MIGRATION .migratepage = migrate_page, #endif - .error_remove_page = generic_error_remove_page, + .error_remove_page = shmem_error_remove_page, }; EXPORT_SYMBOL(shmem_aops);
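Taken together, the shmem hunks above stop handing out hardware-poisoned pages: lookups that used to return the page now fail with -EIO (or -ECHILD for symlinks), and shmem_error_remove_page() returns 0 so memory-failure handling keeps the poisoned page in the page cache instead of punching a silent hole. The recurring check reduces to this pattern (condensed from the hunks above, not a new API):

    /* Sketch of the post-lookup check the shmem paths above now apply. */
    ret = shmem_getpage(inode, index, &page, SGP_WRITE);
    if (ret)
        return ret;
    if (PageHWPoison(page)) {
        unlock_page(page);
        put_page(page);
        return -EIO;        /* report the poison rather than expose the page */
    }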
@@@ -4151,9 -4207,14 +4189,14 @@@ struct page *shmem_read_mapping_page_gf error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL, NULL, NULL); if (error) - page = ERR_PTR(error); - else - unlock_page(page); + return ERR_PTR(error); + + unlock_page(page); + if (PageHWPoison(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + return page; #else /* diff --combined mm/slab.h index 95b9a74a2d51,053eefaf6cbd..7edb7d23f141 --- a/mm/slab.h +++ b/mm/slab.h @@@ -5,197 -5,6 +5,197 @@@ * Internal slab definitions */
+/* Reuses the bits in struct page */ +struct slab { + unsigned long __page_flags; + +#if defined(CONFIG_SLAB) + + union { + struct list_head slab_list; + struct rcu_head rcu_head; + }; + struct kmem_cache *slab_cache; + void *freelist; /* array of free object indexes */ + void *s_mem; /* first object */ + unsigned int active; + +#elif defined(CONFIG_SLUB) + + union { + struct list_head slab_list; + struct rcu_head rcu_head; +#ifdef CONFIG_SLUB_CPU_PARTIAL + struct { + struct slab *next; + int slabs; /* Nr of slabs left */ + }; +#endif + }; + struct kmem_cache *slab_cache; + /* Double-word boundary */ + void *freelist; /* first free object */ + union { + unsigned long counters; + struct { + unsigned inuse:16; + unsigned objects:15; + unsigned frozen:1; + }; + }; + unsigned int __unused; + +#elif defined(CONFIG_SLOB) + + struct list_head slab_list; + void *__unused_1; + void *freelist; /* first free block */ + long units; + unsigned int __unused_2; + +#else +#error "Unexpected slab allocator configured" +#endif + + atomic_t __page_refcount; +#ifdef CONFIG_MEMCG + unsigned long memcg_data; +#endif +}; + +#define SLAB_MATCH(pg, sl) \ + static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl)) +SLAB_MATCH(flags, __page_flags); +SLAB_MATCH(compound_head, slab_list); /* Ensure bit 0 is clear */ +SLAB_MATCH(slab_list, slab_list); +#ifndef CONFIG_SLOB +SLAB_MATCH(rcu_head, rcu_head); +SLAB_MATCH(slab_cache, slab_cache); +#endif +#ifdef CONFIG_SLAB +SLAB_MATCH(s_mem, s_mem); +SLAB_MATCH(active, active); +#endif +SLAB_MATCH(_refcount, __page_refcount); +#ifdef CONFIG_MEMCG +SLAB_MATCH(memcg_data, memcg_data); +#endif +#undef SLAB_MATCH +static_assert(sizeof(struct slab) <= sizeof(struct page)); + +/** + * folio_slab - Converts from folio to slab. + * @folio: The folio. + * + * Currently struct slab is a different representation of a folio where + * folio_test_slab() is true. + * + * Return: The slab which contains this folio. + */ +#define folio_slab(folio) (_Generic((folio), \ + const struct folio *: (const struct slab *)(folio), \ + struct folio *: (struct slab *)(folio))) + +/** + * slab_folio - The folio allocated for a slab + * @slab: The slab. + * + * Slabs are allocated as folios that contain the individual objects and are + * using some fields in the first struct page of the folio - those fields are + * now accessed by struct slab. It is occasionally necessary to convert back to + * a folio in order to communicate with the rest of the mm. Please use this + * helper function instead of casting yourself, as the implementation may change + * in the future. + */ +#define slab_folio(s) (_Generic((s), \ + const struct slab *: (const struct folio *)s, \ + struct slab *: (struct folio *)s)) + +/** + * page_slab - Converts from first struct page to slab. + * @p: The first (either head of compound or single) page of slab. + * + * A temporary wrapper to convert struct page to struct slab in situations where + * we know the page is the compound head, or single order-0 page. + * + * Long-term ideally everything would work with struct slab directly or go + * through folio to struct slab. + * + * Return: The slab which contains this page + */ +#define page_slab(p) (_Generic((p), \ + const struct page *: (const struct slab *)(p), \ + struct page *: (struct slab *)(p))) + +/** + * slab_page - The first struct page allocated for a slab + * @slab: The slab. 
+ * + * A convenience wrapper for converting slab to the first struct page of the + * underlying folio, to communicate with code not yet converted to folio or + * struct slab. + */ +#define slab_page(s) folio_page(slab_folio(s), 0) + +/* + * If network-based swap is enabled, sl*b must keep track of whether pages + * were allocated from pfmemalloc reserves. + */ +static inline bool slab_test_pfmemalloc(const struct slab *slab) +{ + return folio_test_active((struct folio *)slab_folio(slab)); +} + +static inline void slab_set_pfmemalloc(struct slab *slab) +{ + folio_set_active(slab_folio(slab)); +} + +static inline void slab_clear_pfmemalloc(struct slab *slab) +{ + folio_clear_active(slab_folio(slab)); +} + +static inline void __slab_clear_pfmemalloc(struct slab *slab) +{ + __folio_clear_active(slab_folio(slab)); +} + +static inline void *slab_address(const struct slab *slab) +{ + return folio_address(slab_folio(slab)); +} + +static inline int slab_nid(const struct slab *slab) +{ + return folio_nid(slab_folio(slab)); +} + +static inline pg_data_t *slab_pgdat(const struct slab *slab) +{ + return folio_pgdat(slab_folio(slab)); +} + +static inline struct slab *virt_to_slab(const void *addr) +{ + struct folio *folio = virt_to_folio(addr); + + if (!folio_test_slab(folio)) + return NULL; + + return folio_slab(folio); +} + +static inline int slab_order(const struct slab *slab) +{ + return folio_order((struct folio *)slab_folio(slab)); +} + +static inline size_t slab_size(const struct slab *slab) +{ + return PAGE_SIZE << slab_order(slab); +} + #ifdef CONFIG_SLOB /* * Common fields provided in kmem_cache by all slab allocators @@@ -436,33 -245,15 +436,33 @@@ static inline bool kmem_cache_debug_fla }
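The struct slab definition above deliberately reuses the memory layout of struct page, and the SLAB_MATCH() static_asserts plus the trailing sizeof check are what keep the two views in lockstep as either structure changes. The same compile-time technique can be shown with a stand-alone C11 toy (the two structs here are invented for illustration, not kernel types):

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Toy "page": the canonical layout owned by someone else. */
    struct toy_page {
        unsigned long flags;
        void *lru_next, *lru_prev;
        unsigned long private_data;
    };

    /* Toy "slab": a different view that must overlay toy_page exactly. */
    struct toy_slab {
        unsigned long __page_flags;
        void *slab_list_next, *slab_list_prev;
        unsigned long freelist;
    };

    /* Same trick as SLAB_MATCH(): break the build if the overlay drifts. */
    #define TOY_MATCH(pg, sl) \
        static_assert(offsetof(struct toy_page, pg) == offsetof(struct toy_slab, sl), \
                      "layout mismatch: " #pg " vs " #sl)

    TOY_MATCH(flags, __page_flags);
    TOY_MATCH(lru_next, slab_list_next);
    TOY_MATCH(private_data, freelist);
    static_assert(sizeof(struct toy_slab) <= sizeof(struct toy_page),
                  "toy_slab must not outgrow toy_page");

    int main(void)
    {
        printf("offsets verified at compile time\n");
        return 0;
    }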
#ifdef CONFIG_MEMCG_KMEM -int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, - gfp_t gfp, bool new_page); +/* + * slab_objcgs - get the object cgroups vector associated with a slab + * @slab: a pointer to the slab struct + * + * Returns a pointer to the object cgroups vector associated with the slab, + * or NULL if no such vector has been associated yet. + */ +static inline struct obj_cgroup **slab_objcgs(struct slab *slab) +{ + unsigned long memcg_data = READ_ONCE(slab->memcg_data); + + VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), + slab_page(slab)); + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, slab_page(slab)); + + return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); +} + +int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, + gfp_t gfp, bool new_slab); void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr);
-static inline void memcg_free_page_obj_cgroups(struct page *page) +static inline void memcg_free_slab_cgroups(struct slab *slab) { - kfree(page_objcgs(page)); - page->memcg_data = 0; + kfree(slab_objcgs(slab)); + slab->memcg_data = 0; }
static inline size_t obj_full_size(struct kmem_cache *s) @@@ -507,7 -298,7 +507,7 @@@ static inline void memcg_slab_post_allo gfp_t flags, size_t size, void **p) { - struct page *page; + struct slab *slab; unsigned long off; size_t i;
@@@ -516,19 -307,19 +516,19 @@@
for (i = 0; i < size; i++) { if (likely(p[i])) { - page = virt_to_head_page(p[i]); + slab = virt_to_slab(p[i]);
- if (!page_objcgs(page) && - memcg_alloc_page_obj_cgroups(page, s, flags, + if (!slab_objcgs(slab) && + memcg_alloc_slab_cgroups(slab, s, flags, false)) { obj_cgroup_uncharge(objcg, obj_full_size(s)); continue; }
- off = obj_to_index(s, page, p[i]); + off = obj_to_index(s, slab, p[i]); obj_cgroup_get(objcg); - page_objcgs(page)[off] = objcg; - mod_objcg_state(objcg, page_pgdat(page), + slab_objcgs(slab)[off] = objcg; + mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), obj_full_size(s)); } else { obj_cgroup_uncharge(objcg, obj_full_size(s)); @@@ -543,7 -334,7 +543,7 @@@ static inline void memcg_slab_free_hook struct kmem_cache *s; struct obj_cgroup **objcgs; struct obj_cgroup *objcg; - struct page *page; + struct slab *slab; unsigned int off; int i;
@@@ -554,52 -345,43 +554,52 @@@ if (unlikely(!p[i])) continue;
- page = virt_to_head_page(p[i]); - objcgs = page_objcgs_check(page); + slab = virt_to_slab(p[i]); + /* we could be given a kmalloc_large() object, skip those */ + if (!slab) + continue; + + objcgs = slab_objcgs(slab); if (!objcgs) continue;
if (!s_orig) - s = page->slab_cache; + s = slab->slab_cache; else s = s_orig;
- off = obj_to_index(s, page, p[i]); + off = obj_to_index(s, slab, p[i]); objcg = objcgs[off]; if (!objcg) continue;
objcgs[off] = NULL; obj_cgroup_uncharge(objcg, obj_full_size(s)); - mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s), + mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), -obj_full_size(s)); obj_cgroup_put(objcg); } }
#else /* CONFIG_MEMCG_KMEM */ +static inline struct obj_cgroup **slab_objcgs(struct slab *slab) +{ + return NULL; +} + static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) { return NULL; }
-static inline int memcg_alloc_page_obj_cgroups(struct page *page, +static inline int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, gfp_t gfp, - bool new_page) + bool new_slab) { return 0; }
-static inline void memcg_free_page_obj_cgroups(struct page *page) +static inline void memcg_free_slab_cgroups(struct slab *slab) { }
@@@ -623,35 -405,35 +623,35 @@@ static inline void memcg_slab_free_hook } #endif /* CONFIG_MEMCG_KMEM */
+#ifndef CONFIG_SLOB static inline struct kmem_cache *virt_to_cache(const void *obj) { - struct page *page; + struct slab *slab;
- page = virt_to_head_page(obj); - if (WARN_ONCE(!PageSlab(page), "%s: Object is not a Slab page!\n", + slab = virt_to_slab(obj); + if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n", __func__)) return NULL; - return page->slab_cache; + return slab->slab_cache; }
-static __always_inline void account_slab_page(struct page *page, int order, - struct kmem_cache *s, - gfp_t gfp) +static __always_inline void account_slab(struct slab *slab, int order, + struct kmem_cache *s, gfp_t gfp) { if (memcg_kmem_enabled() && (s->flags & SLAB_ACCOUNT)) - memcg_alloc_page_obj_cgroups(page, s, gfp, true); + memcg_alloc_slab_cgroups(slab, s, gfp, true);
- mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), + mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), PAGE_SIZE << order); }
-static __always_inline void unaccount_slab_page(struct page *page, int order, - struct kmem_cache *s) +static __always_inline void unaccount_slab(struct slab *slab, int order, + struct kmem_cache *s) { if (memcg_kmem_enabled()) - memcg_free_page_obj_cgroups(page); + memcg_free_slab_cgroups(slab);
- mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), + mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), -(PAGE_SIZE << order)); }
@@@ -670,7 -452,6 +670,7 @@@ static inline struct kmem_cache *cache_ print_tracking(cachep, x); return cachep; } +#endif /* CONFIG_SLOB */
static inline size_t slab_ksize(const struct kmem_cache *s) { @@@ -794,11 -575,6 +794,6 @@@ static inline struct kmem_cache_node *g
#endif
- void *slab_start(struct seq_file *m, loff_t *pos); - void *slab_next(struct seq_file *m, void *p, loff_t *pos); - void slab_stop(struct seq_file *m, void *p); - int memcg_slab_show(struct seq_file *m, void *p); - #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) void dump_unreclaimable_slab(void); #else @@@ -854,7 -630,7 +849,7 @@@ static inline void debugfs_slab_release #define KS_ADDRS_COUNT 16 struct kmem_obj_info { void *kp_ptr; - struct page *kp_page; + struct slab *kp_slab; void *kp_objp; unsigned long kp_data_offset; struct kmem_cache *kp_slab_cache; @@@ -862,18 -638,7 +857,18 @@@ void *kp_stack[KS_ADDRS_COUNT]; void *kp_free_stack[KS_ADDRS_COUNT]; }; -void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page); +void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab); +#endif + +#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR +void __check_heap_object(const void *ptr, unsigned long n, + const struct slab *slab, bool to_user); +#else +static inline +void __check_heap_object(const void *ptr, unsigned long n, + const struct slab *slab, bool to_user) +{ +} #endif
#endif /* MM_SLAB_H */ diff --combined mm/slab_common.c index dc15566141d4,9513244457e6..23f2ab0713b7 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@@ -489,9 -489,7 +489,7 @@@ void slab_kmem_cache_release(struct kme
void kmem_cache_destroy(struct kmem_cache *s) { - int err; - - if (unlikely(!s)) + if (unlikely(!s) || !kasan_check_byte(s)) return;
cpus_read_lock(); @@@ -501,12 -499,9 +499,9 @@@ if (s->refcount) goto out_unlock;
- err = shutdown_cache(s); - if (err) { - pr_err("%s %s: Slab cache still has objects\n", - __func__, s->name); - dump_stack(); - } + WARN(shutdown_cache(s), + "%s %s: Slab cache still has objects when called from %pS", + __func__, s->name, (void *)_RET_IP_); out_unlock: mutex_unlock(&slab_mutex); cpus_read_unlock(); @@@ -550,13 -545,13 +545,13 @@@ bool slab_is_available(void */ bool kmem_valid_obj(void *object) { - struct page *page; + struct folio *folio;
/* Some arches consider ZERO_SIZE_PTR to be a valid address. */ if (object < (void *)PAGE_SIZE || !virt_addr_valid(object)) return false; - page = virt_to_head_page(object); - return PageSlab(page); + folio = virt_to_folio(object); + return folio_test_slab(folio); } EXPORT_SYMBOL_GPL(kmem_valid_obj);
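kmem_valid_obj() above (and kmem_dump_obj() just below) now resolve objects through virt_to_folio()/virt_to_slab() rather than struct page, but their external contract is unchanged; a debugging caller still looks like this (the pointer's origin is hypothetical):

    /* Sketch: report provenance of a suspicious pointer when debugging. */
    static void report_pointer(void *ptr)
    {
        if (kmem_valid_obj(ptr))
            kmem_dump_obj(ptr);   /* cache name, offset, alloc/free stacks */
        else
            pr_info("%px is not a slab-managed object\n", ptr);
    }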
@@@ -579,18 -574,18 +574,18 @@@ void kmem_dump_obj(void *object { char *cp = IS_ENABLED(CONFIG_MMU) ? "" : "/vmalloc"; int i; - struct page *page; + struct slab *slab; unsigned long ptroffset; struct kmem_obj_info kp = { };
if (WARN_ON_ONCE(!virt_addr_valid(object))) return; - page = virt_to_head_page(object); - if (WARN_ON_ONCE(!PageSlab(page))) { + slab = virt_to_slab(object); + if (WARN_ON_ONCE(!slab)) { pr_cont(" non-slab memory.\n"); return; } - kmem_obj_info(&kp, object, page); + kmem_obj_info(&kp, object, slab); if (kp.kp_slab_cache) pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name); else @@@ -824,7 -819,7 +819,7 @@@ void __init setup_kmalloc_cache_index_t
if (KMALLOC_MIN_SIZE >= 64) { /* - * The 96 byte size cache is not used if the alignment + * The 96 byte sized cache is not used if the alignment * is 64 byte. */ for (i = 64 + 8; i <= 96; i += 8) @@@ -849,7 -844,7 +844,7 @@@ new_kmalloc_cache(int idx, enum kmalloc if (type == KMALLOC_RECLAIM) { flags |= SLAB_RECLAIM_ACCOUNT; } else if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_CGROUP)) { - if (cgroup_memory_nokmem) { + if (mem_cgroup_kmem_disabled()) { kmalloc_caches[type][idx] = kmalloc_caches[KMALLOC_NORMAL][idx]; return; } @@@ -1044,18 -1039,18 +1039,18 @@@ static void print_slabinfo_header(struc seq_putc(m, '\n'); }
- void *slab_start(struct seq_file *m, loff_t *pos) + static void *slab_start(struct seq_file *m, loff_t *pos) { mutex_lock(&slab_mutex); return seq_list_start(&slab_caches, *pos); }
- void *slab_next(struct seq_file *m, void *p, loff_t *pos) + static void *slab_next(struct seq_file *m, void *p, loff_t *pos) { return seq_list_next(p, &slab_caches, pos); }
- void slab_stop(struct seq_file *m, void *p) + static void slab_stop(struct seq_file *m, void *p) { mutex_unlock(&slab_mutex); } @@@ -1123,17 -1118,6 +1118,6 @@@ void dump_unreclaimable_slab(void mutex_unlock(&slab_mutex); }
- #if defined(CONFIG_MEMCG_KMEM) - int memcg_slab_show(struct seq_file *m, void *p) - { - /* - * Deprecated. - * Please, take a look at tools/cgroup/slabinfo.py . - */ - return 0; - } - #endif - /* * slabinfo_op - iterator that generates /proc/slabinfo * diff --combined mm/swap.c index 74f6b311d7ee,b461814ce0cb..bcf3ac288b56 --- a/mm/swap.c +++ b/mm/swap.c @@@ -882,7 -882,7 +882,7 @@@ void lru_cache_disable(void * all online CPUs so any calls of lru_cache_disabled wrapped by * local_lock or preemption disabled would be ordered by that. * The atomic operation doesn't need to have stronger ordering - * requirements because that is enforeced by the scheduling + * requirements because that is enforced by the scheduling * guarantees. */ __lru_add_drain_all(true); @@@ -1077,24 -1077,24 +1077,24 @@@ void __pagevec_lru_add(struct pagevec * }
/** - * pagevec_remove_exceptionals - pagevec exceptionals pruning - * @pvec: The pagevec to prune + * folio_batch_remove_exceptionals() - Prune non-folios from a batch. + * @fbatch: The batch to prune * - * find_get_entries() fills both pages and XArray value entries (aka - * exceptional entries) into the pagevec. This function prunes all - * exceptionals from @pvec without leaving holes, so that it can be - * passed on to page-only pagevec operations. + * find_get_entries() fills a batch with both folios and shadow/swap/DAX + * entries. This function prunes all the non-folio entries from @fbatch + * without leaving holes, so that it can be passed on to folio-only batch + * operations. */ -void pagevec_remove_exceptionals(struct pagevec *pvec) +void folio_batch_remove_exceptionals(struct folio_batch *fbatch) { - int i, j; + unsigned int i, j;
- for (i = 0, j = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - if (!xa_is_value(page)) - pvec->pages[j++] = page; + for (i = 0, j = 0; i < folio_batch_count(fbatch); i++) { + struct folio *folio = fbatch->folios[i]; + if (!xa_is_value(folio)) + fbatch->folios[j++] = folio; } - pvec->nr = j; + fbatch->nr = j; }
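Callers that mix folios with shadow/swap/DAX entries follow the same shape throughout this series: fill a folio_batch from find_get_entries() or find_lock_entries(), handle the value entries, then prune and release. A hedged composite of that loop (mapping, start and end are placeholders; the calls and their argument order are the ones used in the truncate and shmem hunks):

    /* Sketch: the folio_batch walking pattern used throughout this series. */
    pgoff_t indices[PAGEVEC_SIZE];
    struct folio_batch fbatch;
    pgoff_t index = start;
    unsigned int i;

    folio_batch_init(&fbatch);
    while (find_get_entries(mapping, index, end, &fbatch, indices)) {
        for (i = 0; i < folio_batch_count(&fbatch); i++) {
            struct folio *folio = fbatch.folios[i];

            index = indices[i];
            if (xa_is_value(folio))
                continue;           /* shadow/swap/DAX entry, not a folio */
            /* ... operate on the folio ... */
        }
        folio_batch_remove_exceptionals(&fbatch);  /* drop the value entries */
        folio_batch_release(&fbatch);              /* put the folio references */
        cond_resched();
        index++;
    }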
/** diff --combined mm/truncate.c index 5c87cdc70e7b,41b8249b3b4a..5e243d7269c0 --- a/mm/truncate.c +++ b/mm/truncate.c @@@ -56,11 -56,11 +56,11 @@@ static void clear_shadow_entry(struct a
/* * Unconditionally remove exceptional entries. Usually called from truncate - * path. Note that the pagevec may be altered by this function by removing - * exceptional entries similar to what pagevec_remove_exceptionals does. + * path. Note that the folio_batch may be altered by this function by removing + * exceptional entries similar to what folio_batch_remove_exceptionals() does. */ -static void truncate_exceptional_pvec_entries(struct address_space *mapping, - struct pagevec *pvec, pgoff_t *indices) +static void truncate_folio_batch_exceptionals(struct address_space *mapping, + struct folio_batch *fbatch, pgoff_t *indices) { int i, j; bool dax; @@@ -69,11 -69,11 +69,11 @@@ if (shmem_mapping(mapping)) return;
- for (j = 0; j < pagevec_count(pvec); j++) - if (xa_is_value(pvec->pages[j])) + for (j = 0; j < folio_batch_count(fbatch); j++) + if (xa_is_value(fbatch->folios[j])) break;
- if (j == pagevec_count(pvec)) + if (j == folio_batch_count(fbatch)) return;
dax = dax_mapping(mapping); @@@ -82,12 -82,12 +82,12 @@@ xa_lock_irq(&mapping->i_pages); }
- for (i = j; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; + for (i = j; i < folio_batch_count(fbatch); i++) { + struct folio *folio = fbatch->folios[i]; pgoff_t index = indices[i];
- if (!xa_is_value(page)) { - pvec->pages[j++] = page; + if (!xa_is_value(folio)) { + fbatch->folios[j++] = folio; continue; }
@@@ -96,7 -96,7 +96,7 @@@ continue; }
- __clear_shadow_entry(mapping, index, page); + __clear_shadow_entry(mapping, index, folio); }
if (!dax) { @@@ -105,7 -105,7 +105,7 @@@ inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); } - pvec->nr = j; + fbatch->nr = j; }
/* @@@ -177,21 -177,21 +177,21 @@@ void do_invalidatepage(struct page *pag * its lock, b) when a concurrent invalidate_mapping_pages got there first and * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. */ -static void truncate_cleanup_page(struct page *page) +static void truncate_cleanup_folio(struct folio *folio) { - if (page_mapped(page)) - unmap_mapping_page(page); + if (folio_mapped(folio)) + unmap_mapping_folio(folio);
- if (page_has_private(page)) - do_invalidatepage(page, 0, thp_size(page)); + if (folio_has_private(folio)) + do_invalidatepage(&folio->page, 0, folio_size(folio));
/* * Some filesystems seem to re-dirty the page even after * the VM has canceled the dirty bit (eg ext3 journaling). * Hence dirty accounting check is placed after invalidation. */ - cancel_dirty_page(page); - ClearPageMappedToDisk(page); + folio_cancel_dirty(folio); + folio_clear_mappedtodisk(folio); }
/* @@@ -205,7 -205,6 +205,6 @@@ static int invalidate_complete_page(struct address_space *mapping, struct page *page) { - int ret;
if (page->mapping != mapping) return 0; @@@ -213,80 -212,26 +212,78 @@@ if (page_has_private(page) && !try_to_release_page(page, 0)) return 0;
- ret = remove_mapping(mapping, page); - - return ret; + return remove_mapping(mapping, page); }
-int truncate_inode_page(struct address_space *mapping, struct page *page) +int truncate_inode_folio(struct address_space *mapping, struct folio *folio) { - VM_BUG_ON_PAGE(PageTail(page), page); - - if (page->mapping != mapping) + if (folio->mapping != mapping) return -EIO;
- truncate_cleanup_page(page); - delete_from_page_cache(page); + truncate_cleanup_folio(folio); + filemap_remove_folio(folio); return 0; }
+/* + * Handle partial folios. The folio may be entirely within the + * range if a split has raced with us. If not, we zero the part of the + * folio that's within the [start, end] range, and then split the folio if + * it's large. split_page_range() will discard pages which now lie beyond + * i_size, and we rely on the caller to discard pages which lie within a + * newly created hole. + * + * Returns false if splitting failed so the caller can avoid + * discarding the entire folio which is stubbornly unsplit. + */ +bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) +{ + loff_t pos = folio_pos(folio); + unsigned int offset, length; + + if (pos < start) + offset = start - pos; + else + offset = 0; + length = folio_size(folio); + if (pos + length <= (u64)end) + length = length - offset; + else + length = end + 1 - pos - offset; + + folio_wait_writeback(folio); + if (length == folio_size(folio)) { + truncate_inode_folio(folio->mapping, folio); + return true; + } + + /* + * We may be zeroing pages we're about to discard, but it avoids + * doing a complex calculation here, and then doing the zeroing + * anyway if the page split fails. + */ + folio_zero_range(folio, offset, length); + + cleancache_invalidate_page(folio->mapping, &folio->page); + if (folio_has_private(folio)) + do_invalidatepage(&folio->page, offset, length); + if (!folio_test_large(folio)) + return true; + if (split_huge_page(&folio->page) == 0) + return true; + if (folio_test_dirty(folio)) + return false; + truncate_inode_folio(folio->mapping, folio); + return true; +} + /* * Used to get rid of pages on hardware memory corruption. */ int generic_error_remove_page(struct address_space *mapping, struct page *page) { + VM_BUG_ON_PAGE(PageTail(page), page); + if (!mapping) return -EINVAL; /* @@@ -295,7 -240,7 +292,7 @@@ */ if (!S_ISREG(mapping->host->i_mode)) return -EIO; - return truncate_inode_page(mapping, page); + return truncate_inode_folio(mapping, page_folio(page)); } EXPORT_SYMBOL(generic_error_remove_page);
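The offset/length arithmetic in truncate_inode_partial_folio() above decides how much of an edge folio a punch actually covers before zeroing and (possibly) splitting it. The calculation is easy to check in isolation; a stand-alone C model (byte positions are invented, folio size fixed at 2MiB for the example):

    #include <stdio.h>

    /* Mirror of the partial-folio arithmetic: pos is the folio's byte offset
     * in the file, [start, end] is the inclusive byte range being punched. */
    static void partial_range(long long pos, long long folio_size,
                              long long start, long long end)
    {
        long long offset, length;

        offset = pos < start ? start - pos : 0;
        length = folio_size;
        if (pos + length <= end)              /* folio ends before the punch end */
            length = length - offset;
        else
            length = end + 1 - pos - offset;  /* punch ends inside this folio */

        printf("zero %lld bytes at offset %lld; whole folio? %s\n",
               length, offset, length == folio_size ? "yes" : "no");
    }

    int main(void)
    {
        long long sz = 2 << 20;               /* pretend 2MiB folio at pos 0 */

        partial_range(0, sz, 4096, 8191);     /* hole inside the folio */
        partial_range(0, sz, 0, sz - 1);      /* exactly the whole folio */
        return 0;
    }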
@@@ -346,16 -291,20 +343,16 @@@ void truncate_inode_pages_range(struct { pgoff_t start; /* inclusive */ pgoff_t end; /* exclusive */ - unsigned int partial_start; /* inclusive */ - unsigned int partial_end; /* exclusive */ - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t indices[PAGEVEC_SIZE]; pgoff_t index; int i; + struct folio *folio; + bool same_folio;
if (mapping_empty(mapping)) goto out;
- /* Offsets within partial pages */ - partial_start = lstart & (PAGE_SIZE - 1); - partial_end = (lend + 1) & (PAGE_SIZE - 1); - /* * 'start' and 'end' always covers the range of pages to be fully * truncated. Partial pages are covered with 'partial_start' at the @@@ -373,49 -322,64 +370,49 @@@ else end = (lend + 1) >> PAGE_SHIFT;
- pagevec_init(&pvec); + folio_batch_init(&fbatch); index = start; while (index < end && find_lock_entries(mapping, index, end - 1, - &pvec, indices)) { - index = indices[pagevec_count(&pvec) - 1] + 1; - truncate_exceptional_pvec_entries(mapping, &pvec, indices); - for (i = 0; i < pagevec_count(&pvec); i++) - truncate_cleanup_page(pvec.pages[i]); - delete_from_page_cache_batch(mapping, &pvec); - for (i = 0; i < pagevec_count(&pvec); i++) - unlock_page(pvec.pages[i]); - pagevec_release(&pvec); + &fbatch, indices)) { + index = indices[folio_batch_count(&fbatch) - 1] + 1; + truncate_folio_batch_exceptionals(mapping, &fbatch, indices); + for (i = 0; i < folio_batch_count(&fbatch); i++) + truncate_cleanup_folio(fbatch.folios[i]); + delete_from_page_cache_batch(mapping, &fbatch); + for (i = 0; i < folio_batch_count(&fbatch); i++) + folio_unlock(fbatch.folios[i]); + folio_batch_release(&fbatch); cond_resched(); }
- if (partial_start) { - struct page *page = find_lock_page(mapping, start - 1); - if (page) { - unsigned int top = PAGE_SIZE; - if (start > end) { - /* Truncation within a single page */ - top = partial_end; - partial_end = 0; - } - wait_on_page_writeback(page); - zero_user_segment(page, partial_start, top); - cleancache_invalidate_page(mapping, page); - if (page_has_private(page)) - do_invalidatepage(page, partial_start, - top - partial_start); - unlock_page(page); - put_page(page); + same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); + folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0); + if (folio) { + same_folio = lend < folio_pos(folio) + folio_size(folio); + if (!truncate_inode_partial_folio(folio, lstart, lend)) { + start = folio->index + folio_nr_pages(folio); + if (same_folio) + end = folio->index; } + folio_unlock(folio); + folio_put(folio); + folio = NULL; } - if (partial_end) { - struct page *page = find_lock_page(mapping, end); - if (page) { - wait_on_page_writeback(page); - zero_user_segment(page, 0, partial_end); - cleancache_invalidate_page(mapping, page); - if (page_has_private(page)) - do_invalidatepage(page, 0, - partial_end); - unlock_page(page); - put_page(page); - } + + if (!same_folio) + folio = __filemap_get_folio(mapping, lend >> PAGE_SHIFT, + FGP_LOCK, 0); + if (folio) { + if (!truncate_inode_partial_folio(folio, lstart, lend)) + end = folio->index; + folio_unlock(folio); + folio_put(folio); } - /* - * If the truncation happened within a single page no pages - * will be released, just zeroed, so we can bail out now. - */ - if (start >= end) - goto out;
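[Editor's note] The hunk above replaces the old partial_start/partial_end bookkeeping with at most two folio lookups: a page-granularity guess of whether both ends share a folio, refined once the (possibly multi-page) folio covering lstart is found. A hedged userspace sketch of that decision, assuming 4 KiB base pages; same_folio_guess() and same_folio_exact() are made-up names for illustration only.

        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        #define PAGE_SHIFT 12   /* assumption: 4 KiB base pages */

        /* first guess: do lstart and lend land on the same base page? */
        static bool same_folio_guess(uint64_t lstart, uint64_t lend)
        {
                return (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
        }

        /* refinement: fpos/fsize describe the folio actually found at lstart */
        static bool same_folio_exact(uint64_t fpos, uint64_t fsize, uint64_t lend)
        {
                return lend < fpos + fsize;     /* lend (inclusive) still inside it? */
        }

        int main(void)
        {
                uint64_t lstart = 1000, lend = 69999;   /* truncating bytes [1000, 70000) */

                printf("guess: %d\n", same_folio_guess(lstart, lend));          /* 0 */
                /* a 128 KiB folio at byte 0 covers both boundaries after all */
                printf("exact: %d\n", same_folio_exact(0, 128 * 1024, lend));   /* 1 */
                return 0;
        }

When the refined check finds both ends inside one large folio, the second lookup at lend's page index is skipped entirely, which is what the same_folio flag in the hunk controls.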
index = start; - for ( ; ; ) { + while (index < end) { cond_resched(); - if (!find_get_entries(mapping, index, end - 1, &pvec, + if (!find_get_entries(mapping, index, end - 1, &fbatch, indices)) { /* If all gone from start onwards, we're done */ if (index == start) @@@ -425,24 -389,23 +422,24 @@@ continue; }
- for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i];
/* We rely upon deletion not changing page->index */ index = indices[i];
- if (xa_is_value(page)) + if (xa_is_value(folio)) continue;
- lock_page(page); - WARN_ON(page_to_index(page) != index); - wait_on_page_writeback(page); - truncate_inode_page(mapping, page); - unlock_page(page); + folio_lock(folio); + VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); + folio_wait_writeback(folio); + truncate_inode_folio(mapping, folio); + folio_unlock(folio); + index = folio_index(folio) + folio_nr_pages(folio) - 1; } - truncate_exceptional_pvec_entries(mapping, &pvec, indices); - pagevec_release(&pvec); + truncate_folio_batch_exceptionals(mapping, &fbatch, indices); + folio_batch_release(&fbatch); index++; }
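[Editor's note] One behavioural detail of the rewritten loop above is the index advance: after truncating a folio, the index jumps past all of its pages instead of stepping one page at a time. A small userspace model of that control flow; fake_folio and the sizes are illustrative, not kernel API.

        #include <stdio.h>

        struct fake_folio {
                unsigned long index;    /* first page index covered by this folio */
                unsigned long nr_pages; /* how many base pages it spans */
        };

        int main(void)
        {
                struct fake_folio folios[] = { {0, 1}, {1, 512}, {513, 1} };
                unsigned long index = 0, end = 1024;
                unsigned int i = 0;

                while (index < end && i < sizeof(folios) / sizeof(folios[0])) {
                        struct fake_folio *f = &folios[i++];

                        printf("truncate folio at index %lu spanning %lu pages\n",
                               f->index, f->nr_pages);
                        /* mirrors index = folio_index() + folio_nr_pages() - 1, then index++ */
                        index = f->index + f->nr_pages;
                }
                return 0;
        }

The 512-page folio in the middle is handled once and never revisited page by page.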
@@@ -513,16 -476,16 +510,16 @@@ static unsigned long __invalidate_mappi pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) { pgoff_t indices[PAGEVEC_SIZE]; - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t index = start; unsigned long ret; unsigned long count = 0; int i;
- pagevec_init(&pvec); - while (find_lock_entries(mapping, index, end, &pvec, indices)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + folio_batch_init(&fbatch); + while (find_lock_entries(mapping, index, end, &fbatch, indices)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct page *page = &fbatch.folios[i]->page;
/* We rely upon deletion not changing page->index */ index = indices[i]; @@@ -549,8 -512,8 +546,8 @@@ } count += ret; } - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); + folio_batch_remove_exceptionals(&fbatch); + folio_batch_release(&fbatch); cond_resched(); index++; } @@@ -602,29 -565,31 +599,29 @@@ void invalidate_mapping_pagevec(struct * shrink_page_list() has a temp ref on them, or because they're transiently * sitting in the lru_cache_add() pagevecs. */ -static int -invalidate_complete_page2(struct address_space *mapping, struct page *page) +static int invalidate_complete_folio2(struct address_space *mapping, + struct folio *folio) { - if (page->mapping != mapping) + if (folio->mapping != mapping) return 0;
- if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) + if (folio_has_private(folio) && + !filemap_release_folio(folio, GFP_KERNEL)) return 0;
spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); - if (PageDirty(page)) + if (folio_test_dirty(folio)) goto failed;
- BUG_ON(page_has_private(page)); - __delete_from_page_cache(page, NULL); + BUG_ON(folio_has_private(folio)); + __filemap_remove_folio(folio, NULL); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock);
- if (mapping->a_ops->freepage) - mapping->a_ops->freepage(page); - - put_page(page); /* pagecache ref */ + filemap_free_folio(mapping, folio); return 1; failed: xa_unlock_irq(&mapping->i_pages); @@@ -632,13 -597,13 +629,13 @@@ return 0; }
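[Editor's note] The rewritten invalidate_complete_folio2() keeps the same check-then-remove discipline as before, only on folio types: take the locks, back off if the folio turned dirty, otherwise unlink while still locked and free afterwards. A generic pthread-based userspace sketch of that pattern; the entry/cache names are invented and do not model the kernel's actual locking or data structures.

        #include <pthread.h>
        #include <stdbool.h>
        #include <stdio.h>
        #include <stdlib.h>

        struct entry {
                bool dirty;
                struct entry *next;
        };

        static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

        /* returns 1 on success, 0 if the entry could not be invalidated */
        static int invalidate_entry(struct entry **linkp, struct entry *e)
        {
                pthread_mutex_lock(&cache_lock);
                if (e->dirty) {                 /* raced with a writer: back off */
                        pthread_mutex_unlock(&cache_lock);
                        return 0;
                }
                *linkp = e->next;               /* unlink under the lock */
                pthread_mutex_unlock(&cache_lock);
                free(e);                        /* drop the cache's reference */
                return 1;
        }

        int main(void)
        {
                struct entry *head = calloc(1, sizeof(*head));

                printf("invalidated: %d\n", invalidate_entry(&head, head));
                return 0;
        }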
-static int do_launder_page(struct address_space *mapping, struct page *page) +static int do_launder_folio(struct address_space *mapping, struct folio *folio) { - if (!PageDirty(page)) + if (!folio_test_dirty(folio)) return 0; - if (page->mapping != mapping || mapping->a_ops->launder_page == NULL) + if (folio->mapping != mapping || mapping->a_ops->launder_page == NULL) return 0; - return mapping->a_ops->launder_page(page); + return mapping->a_ops->launder_page(&folio->page); }
/** @@@ -656,7 -621,7 +653,7 @@@ int invalidate_inode_pages2_range(struc pgoff_t start, pgoff_t end) { pgoff_t indices[PAGEVEC_SIZE]; - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t index; int i; int ret = 0; @@@ -666,25 -631,25 +663,25 @@@ if (mapping_empty(mapping)) goto out;
- pagevec_init(&pvec); + folio_batch_init(&fbatch); index = start; - while (find_get_entries(mapping, index, end, &pvec, indices)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + while (find_get_entries(mapping, index, end, &fbatch, indices)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i];
- /* We rely upon deletion not changing page->index */ + /* We rely upon deletion not changing folio->index */ index = indices[i];
- if (xa_is_value(page)) { + if (xa_is_value(folio)) { if (!invalidate_exceptional_entry2(mapping, - index, page)) + index, folio)) ret = -EBUSY; continue; }
- if (!did_range_unmap && page_mapped(page)) { + if (!did_range_unmap && folio_mapped(folio)) { /* - * If page is mapped, before taking its lock, + * If folio is mapped, before taking its lock, * zap the rest of the file in one hit. */ unmap_mapping_pages(mapping, index, @@@ -692,29 -657,29 +689,29 @@@ did_range_unmap = 1; }
- lock_page(page); - WARN_ON(page_to_index(page) != index); - if (page->mapping != mapping) { - unlock_page(page); + folio_lock(folio); + VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); + if (folio->mapping != mapping) { + folio_unlock(folio); continue; } - wait_on_page_writeback(page); + folio_wait_writeback(folio);
- if (page_mapped(page)) - unmap_mapping_page(page); - BUG_ON(page_mapped(page)); + if (folio_mapped(folio)) + unmap_mapping_folio(folio); + BUG_ON(folio_mapped(folio));
- ret2 = do_launder_page(mapping, page); + ret2 = do_launder_folio(mapping, folio); if (ret2 == 0) { - if (!invalidate_complete_page2(mapping, page)) + if (!invalidate_complete_folio2(mapping, folio)) ret2 = -EBUSY; } if (ret2 < 0) ret = ret2; - unlock_page(page); + folio_unlock(folio); } - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); + folio_batch_remove_exceptionals(&fbatch); + folio_batch_release(&fbatch); cond_resched(); index++; }
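[Editor's note] The did_range_unmap logic carried through the hunk above is a one-shot optimisation: the first mapped folio encountered triggers an unmap of everything from that index to the end of the range, so later mapped folios do not pay for it again. A tiny userspace model of that flag; expensive_range_op() and the indices are illustrative only.

        #include <stdbool.h>
        #include <stdio.h>

        static void expensive_range_op(unsigned long from, unsigned long to)
        {
                printf("unmap range [%lu, %lu]\n", from, to);
        }

        int main(void)
        {
                bool did_range_unmap = false;
                unsigned long end = 100;
                unsigned long mapped[] = { 7, 20, 55 };  /* indices found mapped */

                for (unsigned int i = 0; i < 3; i++) {
                        if (!did_range_unmap) {
                                /* zap everything from here to the end, once */
                                expensive_range_op(mapped[i], end);
                                did_range_unmap = true;
                        }
                }
                return 0;
        }

Only the first mapped index (7) causes a range operation; the later ones rely on it having already run.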