The following commit has been merged in the master branch:

commit f3105fd38ff332967eaf92abe7edcdd0cdd9ff42
Merge: a7a85ec518094c4504c5a1c99a1ee37cb53c3359 0ffdd21bf6505df7f27c2a44a7ba78269d133d01
Author: Stephen Rothwell <sfr@canb.auug.org.au>
Date:   Wed Jul 9 17:37:22 2014 +1000

    Merge branch 'akpm-current/current'

    Conflicts:
    	mm/shmem.c
diff --combined Documentation/kernel-parameters.txt
index a9ccb53,80746b2..ad7e59a
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@@ -566,11 -566,6 +566,11 @@@ bytes respectively. Such letter suffixe
 			possible to determine what the correct size should be.
 			This option provides an override for these situations.
+	ca_keys=	[KEYS] This parameter identifies a specific key(s) on
+			the system trusted keyring to be used for certificate
+			trust validation.
+			format: { id:<keyid> | builtin }
+
 	ccw_timeout_log	[S390] See Documentation/s390/CommonIO for details.
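For illustration, the two accepted forms look like this on a kernel command
line (the hexadecimal key id below is hypothetical; "builtin" is a literal
keyword per the format above):

	ca_keys=builtin
	ca_keys=id:5f1a2b3c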
@@@ -1421,6 -1416,10 +1421,6 @@@ ip= [IP_PNP] See Documentation/filesystems/nfs/nfsroot.txt.
- ip2= [HW] Set IO/IRQ pairs for up to 4 IntelliPort boards - See comment before ip2_setup() in - drivers/char/ip2/ip2base.c. - irqfixup [HW] When an interrupt is not handled search all handlers for it. Intended to get systems with badly broken @@@ -1693,8 -1692,12 +1693,12 @@@ 7 (KERN_DEBUG) debug-level messages
 	log_buf_len=n[KMG]	Sets the size of the printk ring buffer,
-			in bytes.  n must be a power of two.  The default
-			size is set in the kernel config file.
+			in bytes.  n must be a power of two and greater
+			than the minimal size.  The minimal size is defined
+			by the LOG_BUF_SHIFT kernel config parameter.  There
+			is also the CONFIG_LOG_CPU_MAX_BUF_SHIFT config
+			parameter that allows increasing the default size
+			depending on the number of CPUs.  See init/Kconfig
+			for more details.
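Roughly, the resulting sizing behaves like this simplified C sketch
(illustrative only — the function name is hypothetical and the real code in
kernel/printk/printk.c applies additional clamping):

	#define LOG_BUF_SHIFT		17	/* example minimal size: 128 KiB */
	#define LOG_CPU_MAX_BUF_SHIFT	12	/* example per-CPU bump: 4 KiB */

	static unsigned int log_buf_size(unsigned int ncpus,
					 unsigned int cmdline_len)
	{
		unsigned int len = 1U << LOG_BUF_SHIFT;	/* minimal size */

		/* log_buf_len= wins if it is a power of two >= the minimum */
		if (cmdline_len >= len && !(cmdline_len & (cmdline_len - 1)))
			return cmdline_len;

		/* otherwise scale the default with the number of CPUs */
		len += (ncpus - 1) * (1U << LOG_CPU_MAX_BUF_SHIFT);
		return len;
	}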
logo.nologo [FB] Disables display of the built-in Linux logo. This may be used to provide more screen space for @@@ -2167,21 -2170,6 +2171,21 @@@ and restore using xsave. The kernel will fallback to enabling legacy floating-point and sse state.
+ noxsaveopt [X86] Disables xsaveopt used in saving x86 extended + register states. The kernel will fall back to use + xsave to save the states. By using this parameter, + performance of saving the states is degraded because + xsave doesn't support modified optimization while + xsaveopt supports it on xsaveopt enabled systems. + + noxsaves [X86] Disables xsaves and xrstors used in saving and + restoring x86 extended register state in compacted + form of xsave area. The kernel will fall back to use + xsaveopt and xrstor to save and restore the states + in standard form of xsave area. By using this + parameter, xsave area per process might occupy more + memory on xsaves enabled systems. + eagerfpu= [X86] on enable eager fpu restore off disable eager fpu restore @@@ -2806,12 -2794,6 +2810,12 @@@ leaf rcu_node structure. Useful for very large systems.
+ rcutree.jiffies_till_sched_qs= [KNL] + Set required age in jiffies for a + given grace period before RCU starts + soliciting quiescent-state help from + rcu_note_context_switch(). + rcutree.jiffies_till_first_fqs= [KNL] Set delay from grace-period initialization to first attempt to force quiescent states. @@@ -2823,13 -2805,6 +2827,13 @@@ quiescent states. Units are jiffies, minimum value is one, and maximum value is HZ.
+	rcutree.rcu_nocb_leader_stride= [KNL]
+			Set the number of NOCB kthread groups, which
+			defaults to the square root of the number of
+			CPUs.  Larger numbers reduce the wakeup overhead
+			on the per-CPU grace-period kthreads, but increase
+			that same overhead on each group's leader.
+
 	rcutree.qhimark= [KNL]
 			Set threshold of queued RCU callbacks beyond which
 			batch limiting is disabled.

diff --combined MAINTAINERS
index 2a7aaa9,dc5ed06..eb8d728
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@@ -70,8 -70,6 +70,8 @@@ Descriptions of section entries
P: Person (obsolete) M: Mail patches to: FullName address@domain + R: Designated reviewer: FullName address@domain + These reviewers should be CCed on patches. L: Mailing list that is relevant to this area W: Web-page with status/info Q: Patchwork web based patch tracking system site @@@ -589,7 -587,7 +589,7 @@@ W: http://www.amd.com/us-en/Connectivit S: Supported F: drivers/char/hw_random/geode-rng.c F: drivers/crypto/geode* -F: drivers/video/geode/ +F: drivers/video/fbdev/geode/ F: arch/x86/include/asm/geode.h
AMD IOMMU (AMD-VI) @@@ -718,8 -716,8 +718,8 @@@ F: drivers/ata/pata_arasan_cf. ARC FRAMEBUFFER DRIVER M: Jaya Kumar jayalk@intworks.biz S: Maintained -F: drivers/video/arcfb.c -F: drivers/video/fb_defio.c +F: drivers/video/fbdev/arcfb.c +F: drivers/video/fbdev/core/fb_defio.c
ARM MFM AND FLOPPY DRIVERS M: Ian Molton spyro@f2s.com @@@ -758,7 -756,7 +758,7 @@@ F: sound/arm/aaci. ARM PRIMECELL CLCD PL110 DRIVER M: Russell King linux@arm.linux.org.uk S: Maintained -F: drivers/video/amba-clcd.* +F: drivers/video/fbdev/amba-clcd.*
ARM PRIMECELL KMI PL050 DRIVER M: Russell King linux@arm.linux.org.uk @@@ -945,10 -943,16 +945,10 @@@ L: linux-arm-kernel@lists.infradead.or S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/shawnguo/linux.git F: arch/arm/mach-imx/ +F: arch/arm/mach-mxs/ F: arch/arm/boot/dts/imx* F: arch/arm/configs/imx*_defconfig
-ARM/FREESCALE MXS ARM ARCHITECTURE -M: Shawn Guo shawn.guo@linaro.org -L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -S: Maintained -T: git git://git.linaro.org/people/shawnguo/linux-2.6.git -F: arch/arm/mach-mxs/ - ARM/GLOMATION GESBC9312SX MACHINE SUPPORT M: Lennert Buytenhek kernel@wantstofly.org L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) @@@ -1048,33 -1052,9 +1048,33 @@@ M: Santosh Shilimkar <santosh.shilimkar L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/mach-keystone/ -F: drivers/clk/keystone/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/ssantosh/linux-keystone.git
+ARM/TEXAS INSTRUMENT KEYSTONE CLOCK FRAMEWORK
+M:	Santosh Shilimkar <santosh.shilimkar@ti.com>
+L:	linux-kernel@vger.kernel.org
+S:	Maintained
+F:	drivers/clk/keystone/
+
+ARM/TEXAS INSTRUMENT KEYSTONE CLOCKSOURCE
+M:	Santosh Shilimkar <santosh.shilimkar@ti.com>
+L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
+L:	linux-kernel@vger.kernel.org
+S:	Maintained
+F:	drivers/clocksource/timer-keystone.c
+
+ARM/TEXAS INSTRUMENT KEYSTONE RESET DRIVER
+M:	Santosh Shilimkar <santosh.shilimkar@ti.com>
+L:	linux-kernel@vger.kernel.org
+S:	Maintained
+F:	drivers/power/reset/keystone-reset.c
+
+ARM/TEXAS INSTRUMENT AEMIF/EMIF DRIVERS
+M:	Santosh Shilimkar <santosh.shilimkar@ti.com>
+L:	linux-kernel@vger.kernel.org
+S:	Maintained
+F:	drivers/memory/*emif*
+
 ARM/LOGICPD PXA270 MACHINE SUPPORT
 M:	Lennert Buytenhek <kernel@wantstofly.org>
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
@@@ -1155,7 -1135,7 +1155,7 @@@ M: Daniel Walker <dwalker@fifo99.com
 M:	Bryan Huntsman <bryanh@codeaurora.org>
 L:	linux-arm-msm@vger.kernel.org
 F:	arch/arm/mach-msm/
-F:	drivers/video/msm/
+F:	drivers/video/fbdev/msm/
 F:	drivers/mmc/host/msm_sdcc.c
 F:	drivers/mmc/host/msm_sdcc.h
 F:	drivers/tty/serial/msm_serial.h
@@@ -1316,20 -1296,6 +1316,20 @@@ W: http://oss.renesas.co
 Q:	http://patchwork.kernel.org/project/linux-sh/list/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/horms/renesas.git next
 S:	Supported
+F:	arch/arm/boot/dts/emev2*
+F:	arch/arm/boot/dts/r7s*
+F:	arch/arm/boot/dts/r8a*
+F:	arch/arm/boot/dts/sh*
+F:	arch/arm/configs/ape6evm_defconfig
+F:	arch/arm/configs/armadillo800eva_defconfig
+F:	arch/arm/configs/bockw_defconfig
+F:	arch/arm/configs/genmai_defconfig
+F:	arch/arm/configs/koelsch_defconfig
+F:	arch/arm/configs/kzm9g_defconfig
+F:	arch/arm/configs/lager_defconfig
+F:	arch/arm/configs/mackerel_defconfig
+F:	arch/arm/configs/marzen_defconfig
+F:	arch/arm/configs/shmobile_defconfig
 F:	arch/arm/mach-shmobile/
 F:	drivers/sh/
@@@ -1389,7 -1355,7 +1389,7 @@@ F: drivers/mtd/nand/nuc900_nand. F: drivers/rtc/rtc-nuc900.c F: drivers/spi/spi-nuc900.c F: drivers/usb/host/ehci-w90x900.c -F: drivers/video/nuc900fb.c +F: drivers/video/fbdev/nuc900fb.c
ARM/U300 MACHINE SUPPORT M: Linus Walleij linus.walleij@linaro.org @@@ -1458,9 -1424,9 +1458,9 @@@ F: drivers/rtc/rtc-vt8500. F: drivers/tty/serial/vt8500_serial.c F: drivers/usb/host/ehci-platform.c F: drivers/usb/host/uhci-platform.c -F: drivers/video/vt8500lcdfb.* -F: drivers/video/wm8505fb* -F: drivers/video/wmt_ge_rops.* +F: drivers/video/fbdev/vt8500lcdfb.* +F: drivers/video/fbdev/wm8505fb* +F: drivers/video/fbdev/wmt_ge_rops.*
ARM/ZIPIT Z2 SUPPORT M: Marek Vasut marek.vasut@gmail.com @@@ -1650,7 -1616,7 +1650,7 @@@ ATMEL LCDFB DRIVE M: Nicolas Ferre nicolas.ferre@atmel.com L: linux-fbdev@vger.kernel.org S: Maintained -F: drivers/video/atmel_lcdfb.c +F: drivers/video/fbdev/atmel_lcdfb.c F: include/video/atmel_lcdc.h
ATMEL MACB ETHERNET DRIVER
@@@ -1797,13 -1763,6 +1797,13 @@@ W: http://bcache.evilpiepirate.or
 S:	Maintained
 F:	drivers/md/bcache/
+BECEEM BCS200/BCS220-3/BCSM250 WIMAX SUPPORT +M: Kevin McKinney klmckinney1@gmail.com +M: Matthias Beyer mail@beyermatthias.de +L: devel@driverdev.osuosl.org +S: Maintained +F: drivers/staging/bcm* + BEFS FILE SYSTEM S: Orphan F: Documentation/filesystems/befs.txt @@@ -1935,8 -1894,7 +1935,8 @@@ S: Supporte F: drivers/net/ethernet/broadcom/genet/
BROADCOM BNX2 GIGABIT ETHERNET DRIVER -M: Michael Chan mchan@broadcom.com +M: Sony Chacko sony.chacko@qlogic.com +M: Dept-HSGLinuxNICDev@qlogic.com L: netdev@vger.kernel.org S: Supported F: drivers/net/ethernet/broadcom/bnx2.* @@@ -1981,7 -1939,7 +1981,7 @@@ F: arch/arm/boot/dts/bcm5301x.dts F: arch/arm/boot/dts/bcm470*
BROADCOM TG3 GIGABIT ETHERNET DRIVER -M: Nithin Nayak Sujir nsujir@broadcom.com +M: Prashant Sreedharan prashant@broadcom.com M: Michael Chan mchan@broadcom.com L: netdev@vger.kernel.org S: Supported @@@ -2668,7 -2626,7 +2668,7 @@@ M: Russell King <linux@arm.linux.org.uk L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.arm.linux.org.uk/ S: Maintained -F: drivers/video/cyber2000fb.* +F: drivers/video/fbdev/cyber2000fb.*
CYCLADES ASYNC MUX DRIVER W: http://www.cyclades.com/ @@@ -2905,7 -2863,7 +2905,7 @@@ M: Bernie Thompson <bernie@plugable.com L: linux-fbdev@vger.kernel.org S: Maintained W: http://plugable.com/category/projects/udlfb/ -F: drivers/video/udlfb.c +F: drivers/video/fbdev/udlfb.c F: include/video/udlfb.h F: Documentation/fb/udlfb.txt
@@@ -2924,8 -2882,8 +2924,8 @@@ S: Maintaine L: linux-media@vger.kernel.org L: dri-devel@lists.freedesktop.org L: linaro-mm-sig@lists.linaro.org -F: drivers/base/dma-buf* -F: include/linux/dma-buf* +F: drivers/dma-buf/ +F: include/linux/dma-buf* include/linux/reservation.h include/linux/*fence.h F: Documentation/dma-buf-sharing.txt T: git git://git.linaro.org/people/sumitsemwal/linux-dma-buf.git
@@@ -3214,26 -3172,12 +3214,12 @@@ T: git git://linuxtv.org/anttip/media_t S: Maintained F: drivers/media/tuners/e4000*
- EATA-DMA SCSI DRIVER - M: Michael Neuffer mike@i-Connect.Net - L: linux-eata@i-connect.net - L: linux-scsi@vger.kernel.org - S: Maintained - F: drivers/scsi/eata* - EATA ISA/EISA/PCI SCSI DRIVER M: Dario Ballabio ballabio_dario@emc.com L: linux-scsi@vger.kernel.org S: Maintained F: drivers/scsi/eata.c
- EATA-PIO SCSI DRIVER - M: Michael Neuffer mike@i-Connect.Net - L: linux-eata@i-connect.net - L: linux-scsi@vger.kernel.org - S: Maintained - F: drivers/scsi/eata_pio.* - EC100 MEDIA DRIVER M: Antti Palosaari crope@iki.fi L: linux-media@vger.kernel.org @@@ -3422,7 -3366,7 +3408,7 @@@ EFIFB FRAMEBUFFER DRIVE L: linux-fbdev@vger.kernel.org M: Peter Jones pjones@redhat.com S: Maintained -F: drivers/video/efifb.c +F: drivers/video/fbdev/efifb.c
EFS FILESYSTEM W: http://aeschi.ch.eu.org/efs/ @@@ -3487,7 -3431,7 +3473,7 @@@ EPSON S1D13XXX FRAMEBUFFER DRIVE M: Kristoffer Ericson kristoffer.ericson@gmail.com S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/kristoffer/linux-hpc.git -F: drivers/video/s1d13xxxfb.c +F: drivers/video/fbdev/s1d13xxxfb.c F: include/video/s1d13xxxfb.h
ETHERNET BRIDGE @@@ -3565,7 -3509,7 +3551,7 @@@ M: Donghwa Lee <dh09.lee@samsung.com M: Kyungmin Park kyungmin.park@samsung.com L: linux-fbdev@vger.kernel.org S: Maintained -F: drivers/video/exynos/exynos_mipi* +F: drivers/video/fbdev/exynos/exynos_mipi* F: include/video/exynos_mipi*
F71805F HARDWARE MONITORING DRIVER @@@ -3744,7 -3688,7 +3730,7 @@@ FREESCALE DIU FRAMEBUFFER DRIVE M: Timur Tabi timur@tabi.org L: linux-fbdev@vger.kernel.org S: Maintained -F: drivers/video/fsl-diu-fb.* +F: drivers/video/fbdev/fsl-diu-fb.*
FREESCALE DMA DRIVER M: Li Yang leoli@freescale.com @@@ -3766,7 -3710,7 +3752,7 @@@ L: linux-fbdev@vger.kernel.or L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: include/linux/platform_data/video-imxfb.h -F: drivers/video/imxfb.c +F: drivers/video/fbdev/imxfb.c
FREESCALE SOC FS_ENET DRIVER M: Pantelis Antoniou pantelis.antoniou@gmail.com @@@ -4185,7 -4129,7 +4171,7 @@@ M: Ferenc Bakonyi <fero@drama.obuda.kan L: linux-nvidia@lists.surfsouth.com W: http://drama.obuda.kando.hu/~fero/cgi-bin/hgafb.shtml S: Maintained -F: drivers/video/hgafb.c +F: drivers/video/fbdev/hgafb.c
HIBERNATION (aka Software Suspend, aka swsusp) M: "Rafael J. Wysocki" rjw@rjwysocki.net @@@ -4215,7 -4159,7 +4201,7 @@@ L: linux-kernel@vger.kernel.or T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core S: Maintained F: Documentation/timers/ -F: kernel/hrtimer.c +F: kernel/time/hrtimer.c F: kernel/time/clockevents.c F: kernel/time/tick*.* F: kernel/time/timer_*.c @@@ -4327,7 -4271,7 +4313,7 @@@ F: drivers/hv F: drivers/input/serio/hyperv-keyboard.c F: drivers/net/hyperv/ F: drivers/scsi/storvsc_drv.c -F: drivers/video/hyperv_fb.c +F: drivers/video/fbdev/hyperv_fb.c F: include/linux/hyperv.h F: tools/hv/
@@@ -4471,10 -4415,7 +4457,7 @@@ S: Supporte F: drivers/scsi/ibmvscsi/ibmvfc*
IBM ServeRAID RAID DRIVER - P: Jack Hammer - M: Dave Jeffery ipslinux@adaptec.com - W: http://www.developer.ibm.com/welcome/netfinity/serveraid.html - S: Supported + S: Orphan F: drivers/scsi/ips.*
ICH LPC AND GPIO DRIVER @@@ -4521,7 -4462,8 +4504,7 @@@ S: Supporte F: drivers/idle/i7300_idle.c
IEEE 802.15.4 SUBSYSTEM -M: Alexander Smirnov alex.bluesman.smirnov@gmail.com -M: Dmitry Eremin-Solenikov dbaryshkov@gmail.com +M: Alexander Aring alex.aring@gmail.com L: linux-zigbee-devel@lists.sourceforge.net (moderated for non-subscribers) W: http://apps.sourceforge.net/trac/linux-zigbee T: git git://git.kernel.org/pub/scm/linux/kernel/git/lowpan/lowpan.git @@@ -4586,7 -4528,7 +4569,7 @@@ F: security/integrity/ima IMS TWINTURBO FRAMEBUFFER DRIVER L: linux-fbdev@vger.kernel.org S: Orphan -F: drivers/video/imsttfb.c +F: drivers/video/fbdev/imsttfb.c
INFINIBAND SUBSYSTEM M: Roland Dreier roland@kernel.org @@@ -4653,13 -4595,13 +4636,13 @@@ M: Maik Broemme <mbroemme@plusserver.de L: linux-fbdev@vger.kernel.org S: Maintained F: Documentation/fb/intelfb.txt -F: drivers/video/intelfb/ +F: drivers/video/fbdev/intelfb/
INTEL 810/815 FRAMEBUFFER DRIVER M: Antonino Daplas adaplas@gmail.com L: linux-fbdev@vger.kernel.org S: Maintained -F: drivers/video/i810/ +F: drivers/video/fbdev/i810/
INTEL MENLOW THERMAL DRIVER M: Sujith Thomas sujith.thomas@intel.com @@@ -5434,17 -5376,16 +5417,17 @@@ F: arch/powerpc/*/*/*virtex
LINUX FOR POWERPC EMBEDDED PPC8XX M: Vitaly Bordug vitb@kernel.crashing.org -M: Marcelo Tosatti marcelo@kvack.org W: http://www.penguinppc.org/ L: linuxppc-dev@lists.ozlabs.org S: Maintained F: arch/powerpc/platforms/8xx/
LINUX FOR POWERPC EMBEDDED PPC83XX AND PPC85XX +M: Scott Wood scottwood@freescale.com M: Kumar Gala galak@kernel.crashing.org W: http://www.penguinppc.org/ L: linuxppc-dev@lists.ozlabs.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/scottwood/linux.git S: Maintained F: arch/powerpc/platforms/83xx/ F: arch/powerpc/platforms/85xx/ @@@ -5554,11 -5495,10 +5537,11 @@@ S: Maintaine F: arch/arm/mach-lpc32xx/
LSILOGIC MPT FUSION DRIVERS (FC/SAS/SPI) -M: Nagalakshmi Nandigama Nagalakshmi.Nandigama@lsi.com -M: Sreekanth Reddy Sreekanth.Reddy@lsi.com -M: support@lsi.com -L: DL-MPTFusionLinux@lsi.com +M: Nagalakshmi Nandigama nagalakshmi.nandigama@avagotech.com +M: Praveen Krishnamoorthy praveen.krishnamoorthy@avagotech.com +M: Sreekanth Reddy sreekanth.reddy@avagotech.com +M: Abhijit Mahajan abhijit.mahajan@avagotech.com +L: MPT-FusionLinux.pdl@avagotech.com L: linux-scsi@vger.kernel.org W: http://www.lsilogic.com/support S: Supported @@@ -5667,6 -5607,16 +5650,6 @@@ F: Documentation/networking/mac80211-in F: include/net/mac80211.h F: net/mac80211/
-MAC80211 PID RATE CONTROL -M: Stefano Brivio stefano.brivio@polimi.it -M: Mattias Nissler mattias.nissler@gmx.de -L: linux-wireless@vger.kernel.org -W: http://wireless.kernel.org/en/developers/Documentation/mac80211/RateControl/... -T: git git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211.git -T: git git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211-next.git -S: Maintained -F: net/mac80211/rc80211_pid* - MACVLAN DRIVER M: Patrick McHardy kaber@trash.net L: netdev@vger.kernel.org @@@ -5730,7 -5680,7 +5713,7 @@@ F: drivers/mmc/host/mvsdio. MATROX FRAMEBUFFER DRIVER L: linux-fbdev@vger.kernel.org S: Orphan -F: drivers/video/matrox/matroxfb_* +F: drivers/video/fbdev/matrox/matroxfb_* F: include/uapi/linux/matroxfb.h
MAX16065 HARDWARE MONITOR DRIVER @@@ -6372,8 -6322,8 +6355,8 @@@ NVIDIA (rivafb and nvidiafb) FRAMEBUFFE M: Antonino Daplas adaplas@gmail.com L: linux-fbdev@vger.kernel.org S: Maintained -F: drivers/video/riva/ -F: drivers/video/nvidia/ +F: drivers/video/fbdev/riva/ +F: drivers/video/fbdev/nvidia/
NVM EXPRESS DRIVER M: Matthew Wilcox willy@linux.intel.com @@@ -6443,14 -6393,14 +6426,14 @@@ M: Tomi Valkeinen <tomi.valkeinen@ti.co L: linux-fbdev@vger.kernel.org L: linux-omap@vger.kernel.org S: Maintained -F: drivers/video/omap/ +F: drivers/video/fbdev/omap/
OMAP DISPLAY SUBSYSTEM and FRAMEBUFFER SUPPORT (DSS2) M: Tomi Valkeinen tomi.valkeinen@ti.com L: linux-omap@vger.kernel.org L: linux-fbdev@vger.kernel.org S: Maintained -F: drivers/video/omap2/ +F: drivers/video/fbdev/omap2/ F: Documentation/arm/OMAP/DSS
OMAP HARDWARE SPINLOCK SUPPORT @@@ -6741,7 -6691,7 +6724,7 @@@ F: drivers/char/agp/parisc-agp. F: drivers/input/serio/gscps2.c F: drivers/parport/parport_gsc.* F: drivers/tty/serial/8250/8250_gsc.c -F: drivers/video/sti* +F: drivers/video/fbdev/sti* F: drivers/video/console/sti* F: drivers/video/logo/logo_parisc*
@@@ -6801,7 -6751,7 +6784,7 @@@ F: arch/x86/kernel/quirks.
PCI DRIVER FOR IMX6 M: Richard Zhu r65037@freescale.com -M: Shawn Guo shawn.guo@linaro.org +M: Shawn Guo shawn.guo@freescale.com L: linux-pci@vger.kernel.org L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained @@@ -6990,7 -6940,7 +6973,7 @@@ S: Maintaine T: git git://github.com/gxt/linux.git F: drivers/input/serio/i8042-unicore32io.h F: drivers/i2c/busses/i2c-puv3.c -F: drivers/video/fb-puv3.c +F: drivers/video/fbdev/fb-puv3.c F: drivers/rtc/rtc-puv3.c
PMBUS HARDWARE MONITORING DRIVERS @@@ -7022,10 -6972,10 +7005,10 @@@ POSIX CLOCKS and TIMER M: Thomas Gleixner tglx@linutronix.de L: linux-kernel@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core -S: Supported +S: Maintained F: fs/timerfd.c F: include/linux/timer* -F: kernel/*timer* +F: kernel/time/*timer*
POWER SUPPLY CLASS/SUBSYSTEM and DRIVERS M: Dmitry Eremin-Solenikov dbaryshkov@gmail.com @@@ -7239,12 -7189,6 +7222,12 @@@ M: Robert Jarzmik <robert.jarzmik@free. L: rtc-linux@googlegroups.com S: Maintained
+QAT DRIVER +M: Tadeusz Struk tadeusz.struk@intel.com +L: qat-linux@intel.com +S: Supported +F: drivers/crypto/qat/ + QIB DRIVER M: Mike Marciniszyn infinipath@intel.com L: linux-rdma@vger.kernel.org @@@ -7368,7 -7312,7 +7351,7 @@@ RADEON FRAMEBUFFER DISPLAY DRIVE M: Benjamin Herrenschmidt benh@kernel.crashing.org L: linux-fbdev@vger.kernel.org S: Maintained -F: drivers/video/aty/radeon* +F: drivers/video/fbdev/aty/radeon* F: include/uapi/linux/radeonfb.h
RADIOSHARK RADIO DRIVER @@@ -7390,7 -7334,7 +7373,7 @@@ RAGE128 FRAMEBUFFER DISPLAY DRIVE M: Paul Mackerras paulus@samba.org L: linux-fbdev@vger.kernel.org S: Maintained -F: drivers/video/aty/aty128fb.c +F: drivers/video/fbdev/aty/aty128fb.c
RALINK RT2X00 WIRELESS LAN DRIVER P: rt2x00 project @@@ -7432,14 -7376,10 +7415,14 @@@ L: linux-kernel@vger.kernel.or S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git F: Documentation/RCU/torture.txt -F: kernel/rcu/torture.c +F: kernel/rcu/rcutorture.c
RCUTORTURE TEST FRAMEWORK M: "Paul E. McKenney" paulmck@linux.vnet.ibm.com +M: Josh Triplett josh@joshtriplett.org +R: Steven Rostedt rostedt@goodmis.org +R: Mathieu Desnoyers mathieu.desnoyers@efficios.com +R: Lai Jiangshan laijs@cn.fujitsu.com L: linux-kernel@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git @@@ -7462,11 -7402,8 +7445,11 @@@ S: Supporte F: net/rds/
READ-COPY UPDATE (RCU) -M: Dipankar Sarma dipankar@in.ibm.com M: "Paul E. McKenney" paulmck@linux.vnet.ibm.com +M: Josh Triplett josh@joshtriplett.org +R: Steven Rostedt rostedt@goodmis.org +R: Mathieu Desnoyers mathieu.desnoyers@efficios.com +R: Lai Jiangshan laijs@cn.fujitsu.com L: linux-kernel@vger.kernel.org W: http://www.rdrop.com/users/paulmck/RCU/ S: Supported @@@ -7476,7 -7413,7 +7459,7 @@@ X: Documentation/RCU/torture.tx F: include/linux/rcu* X: include/linux/srcu.h F: kernel/rcu/ -X: kernel/rcu/torture.c +X: kernel/torture.c
REAL TIME CLOCK (RTC) SUBSYSTEM M: Alessandro Zummo a.zummo@towertech.it @@@ -7631,7 -7568,7 +7614,7 @@@ S3 SAVAGE FRAMEBUFFER DRIVE M: Antonino Daplas adaplas@gmail.com L: linux-fbdev@vger.kernel.org S: Maintained -F: drivers/video/savage/ +F: drivers/video/fbdev/savage/
S390 M: Martin Schwidefsky schwidefsky@de.ibm.com @@@ -7754,7 -7691,7 +7737,7 @@@ SAMSUNG FRAMEBUFFER DRIVE M: Jingoo Han jg1.han@samsung.com L: linux-fbdev@vger.kernel.org S: Maintained -F: drivers/video/s3c-fb.c +F: drivers/video/fbdev/s3c-fb.c
SAMSUNG MULTIFUNCTION DEVICE DRIVERS M: Sangbeom Kim sbkim73@samsung.com @@@ -8226,7 -8163,7 +8209,7 @@@ M: Thomas Winischhofer <thomas@winischh W: http://www.winischhofer.net/linuxsisvga.shtml S: Maintained F: Documentation/fb/sisfb.txt -F: drivers/video/sis/ +F: drivers/video/fbdev/sis/ F: include/video/sisfb.h
SIS USB2VGA DRIVER @@@ -8249,9 -8186,6 +8232,9 @@@ F: mm/sl?b SLEEPABLE READ-COPY UPDATE (SRCU) M: Lai Jiangshan laijs@cn.fujitsu.com M: "Paul E. McKenney" paulmck@linux.vnet.ibm.com +M: Josh Triplett josh@joshtriplett.org +R: Steven Rostedt rostedt@goodmis.org +R: Mathieu Desnoyers mathieu.desnoyers@efficios.com L: linux-kernel@vger.kernel.org W: http://www.rdrop.com/users/paulmck/RCU/ S: Supported @@@ -8335,7 -8269,7 +8318,7 @@@ SMSC UFX6000 and UFX7000 USB to VGA DRI M: Steve Glendinning steve.glendinning@shawell.net L: linux-fbdev@vger.kernel.org S: Maintained -F: drivers/video/smscufx.c +F: drivers/video/fbdev/smscufx.c
SOC-CAMERA V4L2 SUBSYSTEM M: Guennadi Liakhovetski g.liakhovetski@gmx.de @@@ -8541,6 -8475,12 +8524,6 @@@ L: devel@driverdev.osuosl.or S: Supported F: drivers/staging/
-STAGING - AGERE HERMES II and II.5 WIRELESS DRIVERS -M: Henk de Groot pe1dnn@amsat.org -S: Odd Fixes -F: drivers/staging/wlags49_h2/ -F: drivers/staging/wlags49_h25/ - STAGING - ASUS OLED M: Jakub Schmidtke sjakub@gmail.com S: Odd Fixes @@@ -8552,6 -8492,14 +8535,6 @@@ M: H Hartley Sweeten <hsweeten@visionen S: Odd Fixes F: drivers/staging/comedi/
-STAGING - CRYSTAL HD VIDEO DECODER -M: Naren Sankar nsankar@broadcom.com -M: Jarod Wilson jarod@wilsonet.com -M: Scott Davilla davilla@4pi.com -M: Manu Abraham abraham.manu@gmail.com -S: Odd Fixes -F: drivers/staging/crystalhd/ - STAGING - ECHO CANCELLER M: Steve Underwood steveu@coppice.org M: David Rowe david@rowetel.com @@@ -8670,6 -8618,11 +8653,6 @@@ M: Forest Bond <forest@alittletooquiet. S: Odd Fixes F: drivers/staging/vt665?/
-STAGING - WINBOND IS89C35 WLAN USB DRIVER -M: Pavel Machek pavel@ucw.cz -S: Odd Fixes -F: drivers/staging/winbond/ - STAGING - XGI Z7,Z9,Z11 PCI DISPLAY DRIVER M: Arnaud Patard arnaud.patard@rtp-net.org S: Odd Fixes @@@ -8995,7 -8948,7 +8978,7 @@@ F: drivers/media/radio/radio-raremono.
THERMAL M: Zhang Rui rui.zhang@intel.com -M: Eduardo Valentin eduardo.valentin@ti.com +M: Eduardo Valentin edubezval@gmail.com L: linux-pm@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/rzhang/linux.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/evalenti/linux-soc-thermal.git @@@ -9022,7 -8975,7 +9005,7 @@@ S: Maintaine F: drivers/platform/x86/thinkpad_acpi.c
TI BANDGAP AND THERMAL DRIVER -M: Eduardo Valentin eduardo.valentin@ti.com +M: Eduardo Valentin edubezval@gmail.com L: linux-pm@vger.kernel.org S: Supported F: drivers/thermal/ti-soc-thermal/ @@@ -9436,6 -9389,12 +9419,6 @@@ S: Maintaine F: drivers/usb/host/isp116x* F: include/linux/usb/isp116x.h
-USB KAWASAKI LSI DRIVER -M: Oliver Neukum oliver@neukum.org -L: linux-usb@vger.kernel.org -S: Maintained -F: drivers/usb/serial/kl5kusb105.* - USB MASS STORAGE DRIVER M: Matthew Dharm mdharm-usb@one-eyed-alien.net L: linux-usb@vger.kernel.org @@@ -9463,6 -9422,12 +9446,6 @@@ S: Maintaine F: Documentation/usb/ohci.txt F: drivers/usb/host/ohci*
-USB OPTION-CARD DRIVER -M: Matthias Urlichs smurf@smurf.noris.de -L: linux-usb@vger.kernel.org -S: Maintained -F: drivers/usb/serial/option.c - USB PEGASUS DRIVER M: Petko Manolov petkan@nucleusys.com L: linux-usb@vger.kernel.org @@@ -9495,7 -9460,7 +9478,7 @@@ S: Maintaine F: drivers/net/usb/rtl8150.c
USB SERIAL SUBSYSTEM -M: Johan Hovold jhovold@gmail.com +M: Johan Hovold johan@kernel.org L: linux-usb@vger.kernel.org S: Maintained F: Documentation/usb/usb-serial.txt @@@ -9636,7 -9601,7 +9619,7 @@@ L: linux-fbdev@vger.kernel.or W: http://dev.gentoo.org/~spock/projects/uvesafb/ S: Maintained F: Documentation/fb/uvesafb.txt -F: drivers/video/uvesafb.* +F: drivers/video/fbdev/uvesafb.*
VFAT/FAT/MSDOS FILESYSTEM M: OGAWA Hirofumi hirofumi@mail.parknet.co.jp @@@ -9709,7 -9674,7 +9692,7 @@@ S: Maintaine F: include/linux/via-core.h F: include/linux/via-gpio.h F: include/linux/via_i2c.h -F: drivers/video/via/ +F: drivers/video/fbdev/via/
VIA VELOCITY NETWORK DRIVER
M:	Francois Romieu <romieu@fr.zoreil.com>

diff --combined Makefile
index 39022dc,512c82f..86eb013
--- a/Makefile
+++ b/Makefile
@@@ -1,7 -1,7 +1,7 @@@
  VERSION = 3
  PATCHLEVEL = 16
  SUBLEVEL = 0
 -EXTRAVERSION = -rc3
 +EXTRAVERSION = -rc4
  NAME = Shuffling Zombie Juror
# *DOCUMENTATION* @@@ -41,29 -41,6 +41,29 @@@ unexport GREP_OPTION # descending is started. They are now explicitly listed as the # prepare rule.
+# Beautify output +# --------------------------------------------------------------------------- +# +# Normally, we echo the whole command before executing it. By making +# that echo $($(quiet)$(cmd)), we now have the possibility to set +# $(quiet) to choose other forms of output instead, e.g. +# +# quiet_cmd_cc_o_c = Compiling $(RELDIR)/$@ +# cmd_cc_o_c = $(CC) $(c_flags) -c -o $@ $< +# +# If $(quiet) is empty, the whole command will be printed. +# If it is set to "quiet_", only the short version will be printed. +# If it is set to "silent_", nothing will be printed at all, since +# the variable $(silent_cmd_cc_o_c) doesn't exist. +# +# A simple variant is to prefix commands with $(Q) - that's useful +# for commands that shall be hidden in non-verbose mode. +# +# $(Q)ln $@ :< +# +# If KBUILD_VERBOSE equals 0 then the above command will be hidden. +# If KBUILD_VERBOSE equals 1 then the above command is displayed. +# # To put more focus on warnings, be less verbose as default # Use 'make V=1' to see the full commands
@@@ -74,29 -51,6 +74,29 @@@ ifndef KBUILD_VERBOS KBUILD_VERBOSE = 0 endif
+ifeq ($(KBUILD_VERBOSE),1)
+  quiet =
+  Q =
+else
+  quiet=quiet_
+  Q = @
+endif
+
+# If the user is running make -s (silent mode), suppress echoing of
+# commands
+
+ifneq ($(filter 4.%,$(MAKE_VERSION)),)	# make-4
+ifneq ($(filter %s ,$(firstword x$(MAKEFLAGS))),)
+  quiet=silent_
+endif
+else					# make-3.8x
+ifneq ($(filter s% -s%,$(MAKEFLAGS)),)
+  quiet=silent_
+endif
+endif
+
+export quiet Q KBUILD_VERBOSE
+
 # Call a source code checker (by default, "sparse") as part of the
 # C compilation.
 #
@@@ -172,13 -126,7 +172,13 @@@ PHONY += $(MAKECMDGOALS) sub-mak
 	$(filter-out _all sub-make $(CURDIR)/Makefile, $(MAKECMDGOALS))
 _all: sub-make
 	@:
+# Fake the "Entering directory" message once, so that IDEs/editors are +# able to understand relative filenames. + echodir := @echo + quiet_echodir := @echo +silent_echodir := @: sub-make: FORCE + $($(quiet)echodir) "make[1]: Entering directory `$(KBUILD_OUTPUT)'" $(if $(KBUILD_VERBOSE:1=),@)$(MAKE) -C $(KBUILD_OUTPUT) \ KBUILD_SRC=$(CURDIR) \ KBUILD_EXTMOD="$(KBUILD_EXTMOD)" -f $(CURDIR)/Makefile \ @@@ -341,6 -289,52 +341,6 @@@ endi export KBUILD_MODULES KBUILD_BUILTIN export KBUILD_CHECKSRC KBUILD_SRC KBUILD_EXTMOD
-# Beautify output -# --------------------------------------------------------------------------- -# -# Normally, we echo the whole command before executing it. By making -# that echo $($(quiet)$(cmd)), we now have the possibility to set -# $(quiet) to choose other forms of output instead, e.g. -# -# quiet_cmd_cc_o_c = Compiling $(RELDIR)/$@ -# cmd_cc_o_c = $(CC) $(c_flags) -c -o $@ $< -# -# If $(quiet) is empty, the whole command will be printed. -# If it is set to "quiet_", only the short version will be printed. -# If it is set to "silent_", nothing will be printed at all, since -# the variable $(silent_cmd_cc_o_c) doesn't exist. -# -# A simple variant is to prefix commands with $(Q) - that's useful -# for commands that shall be hidden in non-verbose mode. -# -# $(Q)ln $@ :< -# -# If KBUILD_VERBOSE equals 0 then the above command will be hidden. -# If KBUILD_VERBOSE equals 1 then the above command is displayed. - -ifeq ($(KBUILD_VERBOSE),1) - quiet = - Q = -else - quiet=quiet_ - Q = @ -endif - -# If the user is running make -s (silent mode), suppress echoing of -# commands - -ifneq ($(filter 4.%,$(MAKE_VERSION)),) # make-4 -ifneq ($(filter %s ,$(firstword x$(MAKEFLAGS))),) - quiet=silent_ -endif -else # make-3.8x -ifneq ($(filter s% -s%,$(MAKEFLAGS)),) - quiet=silent_ -endif -endif - -export quiet Q KBUILD_VERBOSE - ifneq ($(CC),) ifeq ($(shell $(CC) -v 2>&1 | grep -c "clang version"), 1) COMPILER := clang @@@ -360,14 -354,9 +360,14 @@@ include $(srctree)/scripts/Kbuild.inclu # Make variables (CC, etc...) AS = $(CROSS_COMPILE)as LD = $(CROSS_COMPILE)ld +LDFINAL = $(LD) CC = $(CROSS_COMPILE)gcc CPP = $(CC) -E +ifdef CONFIG_LTO +AR = $(CROSS_COMPILE)gcc-ar +else AR = $(CROSS_COMPILE)ar +endif NM = $(CROSS_COMPILE)nm STRIP = $(CROSS_COMPILE)strip OBJCOPY = $(CROSS_COMPILE)objcopy @@@ -426,7 -415,7 +426,7 @@@ KERNELVERSION = $(VERSION)$(if $(PATCHL
export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION export ARCH SRCARCH CONFIG_SHELL HOSTCC HOSTCFLAGS CROSS_COMPILE AS LD CC -export CPP AR NM STRIP OBJCOPY OBJDUMP +export CPP AR NM STRIP OBJCOPY OBJDUMP LDFINAL export MAKE AWK GENKSYMS INSTALLKERNEL PERL UTS_MACHINE export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS
@@@ -437,17 -426,6 +437,17 @@@ export KBUILD_AFLAGS_MODULE KBUILD_CFLA export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL export KBUILD_ARFLAGS
+ifdef CONFIG_LTO +# LTO gcc creates a lot of files in TMPDIR, and with /tmp as tmpfs +# it's easy to drive the machine OOM. Use the object directory +# instead. +ifndef TMPDIR +TMPDIR ?= $(objtree) +export TMPDIR +$(info setting TMPDIR=$(objtree) for LTO build) +endif +endif + # When compiling out-of-tree modules, put MODVERDIR in the module # tree rather than in the kernel tree. The kernel tree might # even be read-only. @@@ -637,6 -615,9 +637,9 @@@ els KBUILD_CFLAGS += -O2 endif
+ # Tell gcc to never replace conditional load with a non-conditional one
+ KBUILD_CFLAGS	+= $(call cc-option,--param=allow-store-data-races=0)
+
 ifdef CONFIG_READABLE_ASM
 # Disable optimizations that make assembler listings hard to read.
 # reorder blocks reorders the control in the function
@@@ -767,7 -748,6 +770,7 @@@ ifeq ($(shell $(CONFIG_SHELL) $(srctree
 endif
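What --param=allow-store-data-races=0 forbids can be seen in a small C
sketch (illustrative only; whether a given gcc version actually performs
this transformation depends on its optimization settings):

	int flag;	/* shared with other threads */

	/* Source as written: the store happens only when cond is true. */
	void maybe_set(int cond)
	{
		if (cond)
			flag = 1;
	}

	/* A transformation permitted when store data races are allowed:
	 * the compiler writes to 'flag' unconditionally, re-storing the
	 * old value when cond is false.  That unconditional store races
	 * with, and can clobber, a concurrent update by another thread. */
	void maybe_set_racy(int cond)
	{
		int tmp = flag;
		flag = cond ? 1 : tmp;
	}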
include $(srctree)/scripts/Makefile.extrawarn +include ${srctree}/scripts/Makefile.lto
# Add user supplied CPPFLAGS, AFLAGS and CFLAGS as the last assignments KBUILD_CPPFLAGS += $(KCPPFLAGS) @@@ -1193,7 -1173,7 +1196,7 @@@ distclean: mrprope # Packaging of the kernel to various formats # --------------------------------------------------------------------------- # rpm target kept for backward compatibility -package-dir := $(srctree)/scripts/package +package-dir := scripts/package
%src-pkg: FORCE $(Q)$(MAKE) $(build)=$(package-dir) $@ diff --combined arch/arm/Kconfig index ea73acc,b9679c8..4fc4744 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@@ -83,6 -83,7 +83,7 @@@ config AR http://www.arm.linux.org.uk/.
config ARM_HAS_SG_CHAIN + select ARCH_HAS_SG_CHAIN bool
config NEED_SG_DMA_LENGTH @@@ -320,6 -321,7 +321,6 @@@ config ARCH_INTEGRATO select HAVE_TCM select ICST select MULTI_IRQ_HANDLER - select NEED_MACH_MEMORY_H select PLAT_VERSATILE select SPARSE_IRQ select USE_OF @@@ -757,6 -759,42 +758,6 @@@ config ARCH_S3C64X help Samsung S3C64XX series based systems
-config ARCH_S5P64X0 - bool "Samsung S5P6440 S5P6450" - select ATAGS - select CLKDEV_LOOKUP - select CLKSRC_SAMSUNG_PWM - select CPU_V6 - select GENERIC_CLOCKEVENTS - select GPIO_SAMSUNG - select HAVE_S3C2410_I2C if I2C - select HAVE_S3C2410_WATCHDOG if WATCHDOG - select HAVE_S3C_RTC if RTC_CLASS - select NEED_MACH_GPIO_H - select SAMSUNG_ATAGS - select SAMSUNG_WDT_RESET - help - Samsung S5P64X0 CPU based systems, such as the Samsung SMDK6440, - SMDK6450. - -config ARCH_S5PC100 - bool "Samsung S5PC100" - select ARCH_REQUIRE_GPIOLIB - select ATAGS - select CLKDEV_LOOKUP - select CLKSRC_SAMSUNG_PWM - select CPU_V7 - select GENERIC_CLOCKEVENTS - select GPIO_SAMSUNG - select HAVE_S3C2410_I2C if I2C - select HAVE_S3C2410_WATCHDOG if WATCHDOG - select HAVE_S3C_RTC if RTC_CLASS - select NEED_MACH_GPIO_H - select SAMSUNG_ATAGS - select SAMSUNG_WDT_RESET - help - Samsung S5PC100 series based systems - config ARCH_S5PV210 bool "Samsung S5PV210/S5PC110" select ARCH_HAS_HOLES_MEMORYMODEL @@@ -967,6 -1005,10 +968,6 @@@ source "arch/arm/mach-s3c24xx/Kconfig
source "arch/arm/mach-s3c64xx/Kconfig"
-source "arch/arm/mach-s5p64x0/Kconfig" - -source "arch/arm/mach-s5pc100/Kconfig" - source "arch/arm/mach-s5pv210/Kconfig"
source "arch/arm/mach-exynos/Kconfig" @@@ -1528,7 -1570,7 +1529,7 @@@ source kernel/Kconfig.preemp
config HZ_FIXED int - default 200 if ARCH_EBSA110 || ARCH_S3C24XX || ARCH_S5P64X0 || \ + default 200 if ARCH_EBSA110 || ARCH_S3C24XX || \ ARCH_S5PV210 || ARCH_EXYNOS4 default AT91_TIMER_HZ if ARCH_AT91 default SHMOBILE_TIMER_HZ if ARCH_SHMOBILE_LEGACY @@@ -2153,6 -2195,7 +2154,6 @@@ menu "Power management options source "kernel/power/Kconfig"
config ARCH_SUSPEND_POSSIBLE - depends on !ARCH_S5PC100 depends on CPU_ARM920T || CPU_ARM926T || CPU_FEROCEON || CPU_SA1100 || \ CPU_V6 || CPU_V6K || CPU_V7 || CPU_V7M || CPU_XSC3 || CPU_XSCALE || CPU_MOHAWK def_bool y diff --combined arch/arm64/Kconfig index 7fc6e2e,65317fb..301aefd --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@@ -2,6 -2,7 +2,7 @@@ config ARM6 def_bool y select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_OPP + select ARCH_HAS_SG_CHAIN select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_USE_CMPXCHG_LOCKREF select ARCH_WANT_OPTIONAL_GPIOLIB @@@ -10,7 -11,6 +11,7 @@@ select ARM_AMBA select ARM_ARCH_TIMER select ARM_GIC + select ARM_GIC_V3 select BUILDTIME_EXTABLE_SORT select CLONE_BACKWARDS select COMMON_CLK diff --combined arch/powerpc/Kconfig index fefe7c8,e5b9170..d5ecc91 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@@ -111,6 -111,7 +111,7 @@@ config PP select HAVE_DMA_API_DEBUG select HAVE_OPROFILE select HAVE_DEBUG_KMEMLEAK + select ARCH_HAS_SG_CHAIN select GENERIC_ATOMIC64 if PPC32 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select HAVE_PERF_EVENTS @@@ -414,7 -415,7 +415,7 @@@ config KEXE config CRASH_DUMP bool "Build a kdump crash kernel" depends on PPC64 || 6xx || FSL_BOOKE || (44x && !SMP) - select RELOCATABLE if PPC64 || 44x || FSL_BOOKE + select RELOCATABLE if (PPC64 && !COMPILE_TEST) || 44x || FSL_BOOKE help Build a kernel suitable for use as a kdump capture kernel. The same kernel binary can be used as production kernel and dump @@@ -1017,7 -1018,6 +1018,7 @@@ endmen if PPC64 config RELOCATABLE bool "Build a relocatable kernel" + depends on !COMPILE_TEST select NONSTATIC_KERNEL help This builds a kernel image that is capable of running anywhere diff --combined arch/powerpc/kvm/book3s_64_mmu_hv.c index 2d154d9,a41e625..93218ae --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@@ -37,8 -37,6 +37,6 @@@ #include <asm/ppc-opcode.h> #include <asm/cputable.h>
- #include "book3s_hv_cma.h" - /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ #define MAX_LPID_970 63
@@@ -64,10 -62,10 +62,10 @@@ long kvmppc_alloc_hpt(struct kvm *kvm, }
kvm->arch.hpt_cma_alloc = 0; page = kvm_alloc_hpt(1 << (order - PAGE_SHIFT)); if (page) { hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); + memset((void *)hpt, 0, (1 << order)); kvm->arch.hpt_cma_alloc = 1; }
@@@ -450,7 -448,7 +448,7 @@@ static int kvmppc_mmu_book3s_64_hv_xlat unsigned long slb_v; unsigned long pp, key; unsigned long v, gr; - unsigned long *hptep; + __be64 *hptep; int index; int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);
@@@ -473,13 -471,13 +471,13 @@@ preempt_enable(); return -ENOENT; } - hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); - v = hptep[0] & ~HPTE_V_HVLOCK; + hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); + v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; gr = kvm->arch.revmap[index].guest_rpte;
/* Unlock the HPTE */ asm volatile("lwsync" : : : "memory"); - hptep[0] = v; + hptep[0] = cpu_to_be64(v); preempt_enable();
gpte->eaddr = eaddr; @@@ -583,8 -581,7 +581,8 @@@ int kvmppc_book3s_hv_page_fault(struct unsigned long ea, unsigned long dsisr) { struct kvm *kvm = vcpu->kvm; - unsigned long *hptep, hpte[3], r; + unsigned long hpte[3], r; + __be64 *hptep; unsigned long mmu_seq, psize, pte_size; unsigned long gpa_base, gfn_base; unsigned long gpa, gfn, hva, pfn; @@@ -607,16 -604,16 +605,16 @@@ if (ea != vcpu->arch.pgfault_addr) return RESUME_GUEST; index = vcpu->arch.pgfault_index; - hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); + hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); rev = &kvm->arch.revmap[index]; preempt_disable(); while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) cpu_relax(); - hpte[0] = hptep[0] & ~HPTE_V_HVLOCK; - hpte[1] = hptep[1]; + hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; + hpte[1] = be64_to_cpu(hptep[1]); hpte[2] = r = rev->guest_rpte; asm volatile("lwsync" : : : "memory"); - hptep[0] = hpte[0]; + hptep[0] = cpu_to_be64(hpte[0]); preempt_enable();
if (hpte[0] != vcpu->arch.pgfault_hpte[0] || @@@ -732,9 -729,8 +730,9 @@@ preempt_disable(); while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) cpu_relax(); - if ((hptep[0] & ~HPTE_V_HVLOCK) != hpte[0] || hptep[1] != hpte[1] || - rev->guest_rpte != hpte[2]) + if ((be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK) != hpte[0] || + be64_to_cpu(hptep[1]) != hpte[1] || + rev->guest_rpte != hpte[2]) /* HPTE has been changed under us; let the guest retry */ goto out_unlock; hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; @@@ -754,20 -750,20 +752,20 @@@ rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT; r &= rcbits | ~(HPTE_R_R | HPTE_R_C);
- if (hptep[0] & HPTE_V_VALID) { + if (be64_to_cpu(hptep[0]) & HPTE_V_VALID) { /* HPTE was previously valid, so we need to invalidate it */ unlock_rmap(rmap); - hptep[0] |= HPTE_V_ABSENT; + hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); kvmppc_invalidate_hpte(kvm, hptep, index); /* don't lose previous R and C bits */ - r |= hptep[1] & (HPTE_R_R | HPTE_R_C); + r |= be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); } else { kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0); }
- hptep[1] = r; + hptep[1] = cpu_to_be64(r); eieio(); - hptep[0] = hpte[0]; + hptep[0] = cpu_to_be64(hpte[0]); asm volatile("ptesync" : : : "memory"); preempt_enable(); if (page && hpte_is_writable(r)) @@@ -786,7 -782,7 +784,7 @@@ return ret;
out_unlock: - hptep[0] &= ~HPTE_V_HVLOCK; + hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); preempt_enable(); goto out_put; } @@@ -862,7 -858,7 +860,7 @@@ static int kvm_unmap_rmapp(struct kvm * { struct revmap_entry *rev = kvm->arch.revmap; unsigned long h, i, j; - unsigned long *hptep; + __be64 *hptep; unsigned long ptel, psize, rcbits;
for (;;) { @@@ -878,11 -874,11 +876,11 @@@ * rmap chain lock. */ i = *rmapp & KVMPPC_RMAP_INDEX; - hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); + hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { /* unlock rmap before spinning on the HPTE lock */ unlock_rmap(rmapp); - while (hptep[0] & HPTE_V_HVLOCK) + while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK) cpu_relax(); continue; } @@@ -901,14 -897,14 +899,14 @@@
/* Now check and modify the HPTE */ ptel = rev[i].guest_rpte; - psize = hpte_page_size(hptep[0], ptel); - if ((hptep[0] & HPTE_V_VALID) && + psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel); + if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && hpte_rpn(ptel, psize) == gfn) { if (kvm->arch.using_mmu_notifiers) - hptep[0] |= HPTE_V_ABSENT; + hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); kvmppc_invalidate_hpte(kvm, hptep, i); /* Harvest R and C */ - rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C); + rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; if (rcbits & ~rev[i].guest_rpte) { rev[i].guest_rpte = ptel | rcbits; @@@ -916,7 -912,7 +914,7 @@@ } } unlock_rmap(rmapp); - hptep[0] &= ~HPTE_V_HVLOCK; + hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); } return 0; } @@@ -963,7 -959,7 +961,7 @@@ static int kvm_age_rmapp(struct kvm *kv { struct revmap_entry *rev = kvm->arch.revmap; unsigned long head, i, j; - unsigned long *hptep; + __be64 *hptep; int ret = 0;
retry: @@@ -979,24 -975,23 +977,24 @@@
i = head = *rmapp & KVMPPC_RMAP_INDEX; do { - hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); + hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); j = rev[i].forw;
/* If this HPTE isn't referenced, ignore it */ - if (!(hptep[1] & HPTE_R_R)) + if (!(be64_to_cpu(hptep[1]) & HPTE_R_R)) continue;
if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { /* unlock rmap before spinning on the HPTE lock */ unlock_rmap(rmapp); - while (hptep[0] & HPTE_V_HVLOCK) + while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK) cpu_relax(); goto retry; }
/* Now check and modify the HPTE */ - if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_R)) { + if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && + (be64_to_cpu(hptep[1]) & HPTE_R_R)) { kvmppc_clear_ref_hpte(kvm, hptep, i); if (!(rev[i].guest_rpte & HPTE_R_R)) { rev[i].guest_rpte |= HPTE_R_R; @@@ -1004,7 -999,7 +1002,7 @@@ } ret = 1; } - hptep[0] &= ~HPTE_V_HVLOCK; + hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); } while ((i = j) != head);
unlock_rmap(rmapp); @@@ -1038,7 -1033,7 +1036,7 @@@ static int kvm_test_age_rmapp(struct kv do { hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4)); j = rev[i].forw; - if (hp[1] & HPTE_R_R) + if (be64_to_cpu(hp[1]) & HPTE_R_R) goto out; } while ((i = j) != head); } @@@ -1078,7 -1073,7 +1076,7 @@@ static int kvm_test_clear_dirty_npages( unsigned long head, i, j; unsigned long n; unsigned long v, r; - unsigned long *hptep; + __be64 *hptep; int npages_dirty = 0;
retry: @@@ -1094,8 -1089,7 +1092,8 @@@
i = head = *rmapp & KVMPPC_RMAP_INDEX; do { - hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); + unsigned long hptep1; + hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); j = rev[i].forw;
/* @@@ -1112,30 -1106,29 +1110,30 @@@ * Otherwise we need to do the tlbie even if C==0 in * order to pick up any delayed writeback of C. */ - if (!(hptep[1] & HPTE_R_C) && - (!hpte_is_writable(hptep[1]) || vcpus_running(kvm))) + hptep1 = be64_to_cpu(hptep[1]); + if (!(hptep1 & HPTE_R_C) && + (!hpte_is_writable(hptep1) || vcpus_running(kvm))) continue;
if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { /* unlock rmap before spinning on the HPTE lock */ unlock_rmap(rmapp); - while (hptep[0] & HPTE_V_HVLOCK) + while (hptep[0] & cpu_to_be64(HPTE_V_HVLOCK)) cpu_relax(); goto retry; }
/* Now check and modify the HPTE */ - if (!(hptep[0] & HPTE_V_VALID)) + if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) continue;
/* need to make it temporarily absent so C is stable */ - hptep[0] |= HPTE_V_ABSENT; + hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); kvmppc_invalidate_hpte(kvm, hptep, i); - v = hptep[0]; - r = hptep[1]; + v = be64_to_cpu(hptep[0]); + r = be64_to_cpu(hptep[1]); if (r & HPTE_R_C) { - hptep[1] = r & ~HPTE_R_C; + hptep[1] = cpu_to_be64(r & ~HPTE_R_C); if (!(rev[i].guest_rpte & HPTE_R_C)) { rev[i].guest_rpte |= HPTE_R_C; note_hpte_modification(kvm, &rev[i]); @@@ -1148,7 -1141,7 +1146,7 @@@ } v &= ~(HPTE_V_ABSENT | HPTE_V_HVLOCK); v |= HPTE_V_VALID; - hptep[0] = v; + hptep[0] = cpu_to_be64(v); } while ((i = j) != head);
unlock_rmap(rmapp); @@@ -1312,7 -1305,7 +1310,7 @@@ struct kvm_htab_ctx * Returns 1 if this HPT entry has been modified or has pending * R/C bit changes. */ -static int hpte_dirty(struct revmap_entry *revp, unsigned long *hptp) +static int hpte_dirty(struct revmap_entry *revp, __be64 *hptp) { unsigned long rcbits_unset;
@@@ -1321,14 -1314,13 +1319,14 @@@
/* Also need to consider changes in reference and changed bits */ rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); - if ((hptp[0] & HPTE_V_VALID) && (hptp[1] & rcbits_unset)) + if ((be64_to_cpu(hptp[0]) & HPTE_V_VALID) && + (be64_to_cpu(hptp[1]) & rcbits_unset)) return 1;
return 0; }
-static long record_hpte(unsigned long flags, unsigned long *hptp, +static long record_hpte(unsigned long flags, __be64 *hptp, unsigned long *hpte, struct revmap_entry *revp, int want_valid, int first_pass) { @@@ -1343,10 -1335,10 +1341,10 @@@ return 0;
valid = 0; - if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) { + if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) { valid = 1; if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && - !(hptp[0] & HPTE_V_BOLTED)) + !(be64_to_cpu(hptp[0]) & HPTE_V_BOLTED)) valid = 0; } if (valid != want_valid) @@@ -1358,7 -1350,7 +1356,7 @@@ preempt_disable(); while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) cpu_relax(); - v = hptp[0]; + v = be64_to_cpu(hptp[0]);
/* re-evaluate valid and dirty from synchronized HPTE value */ valid = !!(v & HPTE_V_VALID); @@@ -1366,9 -1358,9 +1364,9 @@@
/* Harvest R and C into guest view if necessary */ rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); - if (valid && (rcbits_unset & hptp[1])) { - revp->guest_rpte |= (hptp[1] & (HPTE_R_R | HPTE_R_C)) | - HPTE_GR_MODIFIED; + if (valid && (rcbits_unset & be64_to_cpu(hptp[1]))) { + revp->guest_rpte |= (be64_to_cpu(hptp[1]) & + (HPTE_R_R | HPTE_R_C)) | HPTE_GR_MODIFIED; dirty = 1; }
@@@ -1387,13 -1379,13 +1385,13 @@@ revp->guest_rpte = r; } asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); - hptp[0] &= ~HPTE_V_HVLOCK; + hptp[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); preempt_enable(); if (!(valid == want_valid && (first_pass || dirty))) ok = 0; } - hpte[0] = v; - hpte[1] = r; + hpte[0] = cpu_to_be64(v); + hpte[1] = cpu_to_be64(r); return ok; }
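The conversion running through these hunks is mechanical: HPT entries now
live in memory as big-endian __be64, so every load is wrapped in
be64_to_cpu() and every store in cpu_to_be64().  A minimal sketch of the
idiom (the helper names are hypothetical, not from the patch; __be64 and
the byte-order helpers come from the usual kernel headers, and
HPTE_V_HVLOCK is the lock bit used above):

	static inline unsigned long hpte_read_v(__be64 *hptep)
	{
		/* convert the big-endian word to CPU byte order */
		return be64_to_cpu(hptep[0]);
	}

	static inline void hpte_write_v(__be64 *hptep, unsigned long v)
	{
		/* convert back to big-endian before storing */
		hptep[0] = cpu_to_be64(v);
	}

	static inline void hpte_unlock(__be64 *hptep)
	{
		/* for pure flag masking, converting the constant once is
		 * enough — the same trick the patch uses when dropping
		 * HPTE_V_HVLOCK without touching the other bits */
		hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
	}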
@@@ -1403,7 -1395,7 +1401,7 @@@ static ssize_t kvm_htab_read(struct fil struct kvm_htab_ctx *ctx = file->private_data; struct kvm *kvm = ctx->kvm; struct kvm_get_htab_header hdr; - unsigned long *hptp; + __be64 *hptp; struct revmap_entry *revp; unsigned long i, nb, nw; unsigned long __user *lbuf; @@@ -1419,7 -1411,7 +1417,7 @@@ flags = ctx->flags;
i = ctx->index; - hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); + hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); revp = kvm->arch.revmap + i; lbuf = (unsigned long __user *)buf;
@@@ -1503,7 -1495,7 +1501,7 @@@ static ssize_t kvm_htab_write(struct fi unsigned long i, j; unsigned long v, r; unsigned long __user *lbuf; - unsigned long *hptp; + __be64 *hptp; unsigned long tmp[2]; ssize_t nb; long int err, ret; @@@ -1545,7 -1537,7 +1543,7 @@@ i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte) break;
- hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); + hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); lbuf = (unsigned long __user *)buf; for (j = 0; j < hdr.n_valid; ++j) { err = -EFAULT; @@@ -1557,7 -1549,7 +1555,7 @@@ lbuf += 2; nb += HPTE_SIZE;
- if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) + if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) kvmppc_do_h_remove(kvm, 0, i, 0, tmp); err = -EIO; ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r, @@@ -1583,7 -1575,7 +1581,7 @@@ }
for (j = 0; j < hdr.n_invalid; ++j) { - if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) + if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) kvmppc_do_h_remove(kvm, 0, i, 0, tmp); ++i; hptp += 2; diff --combined arch/powerpc/kvm/book3s_hv_builtin.c index 3b41447,6cf498a..329d7fd --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@@ -16,12 -16,14 +16,14 @@@ #include <linux/init.h> #include <linux/memblock.h> #include <linux/sizes.h> + #include <linux/cma.h>
#include <asm/cputable.h> #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h>
- #include "book3s_hv_cma.h" + #define KVM_CMA_CHUNK_ORDER 18 + /* * Hash page table alignment on newer cpus(CPU_FTR_ARCH_206) * should be power of 2. @@@ -43,6 -45,8 +45,8 @@@ static unsigned long kvm_cma_resv_rati unsigned long kvm_rma_pages = (1 << 27) >> PAGE_SHIFT; /* 128MB */ EXPORT_SYMBOL_GPL(kvm_rma_pages);
+ static struct cma *kvm_cma; + /* Work out RMLS (real mode limit selector) field value for a given RMA size. Assumes POWER7 or PPC970. */ static inline int lpcr_rmls(unsigned long rma_size) @@@ -97,7 -101,7 +101,7 @@@ struct kvm_rma_info *kvm_alloc_rma( ri = kmalloc(sizeof(struct kvm_rma_info), GFP_KERNEL); if (!ri) return NULL; - page = kvm_alloc_cma(kvm_rma_pages, kvm_rma_pages); + page = cma_alloc(kvm_cma, kvm_rma_pages, get_order(kvm_rma_pages)); if (!page) goto err_out; atomic_set(&ri->use_count, 1); @@@ -112,7 -116,7 +116,7 @@@ EXPORT_SYMBOL_GPL(kvm_alloc_rma) void kvm_release_rma(struct kvm_rma_info *ri) { if (atomic_dec_and_test(&ri->use_count)) { - kvm_release_cma(pfn_to_page(ri->base_pfn), kvm_rma_pages); + cma_release(kvm_cma, pfn_to_page(ri->base_pfn), kvm_rma_pages); kfree(ri); } } @@@ -131,16 -135,18 +135,18 @@@ struct page *kvm_alloc_hpt(unsigned lon { unsigned long align_pages = HPT_ALIGN_PAGES;
+ VM_BUG_ON(get_order(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); + /* Old CPUs require HPT aligned on a multiple of its size */ if (!cpu_has_feature(CPU_FTR_ARCH_206)) align_pages = nr_pages; - return kvm_alloc_cma(nr_pages, align_pages); + return cma_alloc(kvm_cma, nr_pages, get_order(align_pages)); } EXPORT_SYMBOL_GPL(kvm_alloc_hpt);
void kvm_release_hpt(struct page *page, unsigned long nr_pages) { - kvm_release_cma(page, nr_pages); + cma_release(kvm_cma, page, nr_pages); } EXPORT_SYMBOL_GPL(kvm_release_hpt);
@@@ -179,7 -185,8 +185,8 @@@ void __init kvm_cma_reserve(void align_size = HPT_ALIGN_PAGES << PAGE_SHIFT;
align_size = max(kvm_rma_pages << PAGE_SHIFT, align_size); - kvm_cma_declare_contiguous(selected_size, align_size); + cma_declare_contiguous(0, selected_size, 0, align_size, + KVM_CMA_CHUNK_ORDER - PAGE_SHIFT, false, &kvm_cma); } }
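This hunk replaces KVM's private CMA allocator with the generic CMA API
from linux/cma.h.  Its basic shape, reduced from the code above (a sketch;
the my_* names are hypothetical):

	#include <linux/cma.h>

	static struct cma *my_cma;

	/* boot time: reserve a contiguous region (base/limit 0 = anywhere) */
	void __init my_reserve(phys_addr_t size, phys_addr_t align)
	{
		cma_declare_contiguous(0, size, 0, align,
				       KVM_CMA_CHUNK_ORDER - PAGE_SHIFT,
				       false, &my_cma);
	}

	/* runtime: carve page ranges out of the reserved region */
	struct page *my_alloc(unsigned long nr_pages, unsigned long align_pages)
	{
		return cma_alloc(my_cma, nr_pages, get_order(align_pages));
	}

	void my_release(struct page *page, unsigned long nr_pages)
	{
		cma_release(my_cma, page, nr_pages);
	}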
@@@ -212,16 -219,3 +219,16 @@@ bool kvm_hv_mode_active(void
 {
 	return atomic_read(&hv_vm_count) != 0;
 }
+
+extern int hcall_real_table[], hcall_real_table_end[];
+
+int kvmppc_hcall_impl_hv_realmode(unsigned long cmd)
+{
+	cmd /= 4;
+	if (cmd < hcall_real_table_end - hcall_real_table &&
+	    hcall_real_table[cmd])
+		return 1;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvmppc_hcall_impl_hv_realmode);

diff --combined arch/x86/Kconfig
index be0d37e,77e7790..8657c06
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@@ -96,6 -96,7 +96,7 @@@ config X8
 	select IRQ_FORCED_THREADING
 	select HAVE_BPF_JIT if X86_64
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+ 	select ARCH_HAS_SG_CHAIN
 	select CLKEVT_I8253
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select GENERIC_IOMAP
@@@ -536,7 -537,7 +537,7 @@@ config X86_32_IRI
config SCHED_OMIT_FRAME_POINTER def_bool y - prompt "Single-depth WCHAN output" + prompt "Single-depth WCHAN output" if !LTO && !FRAME_POINTER depends on X86 ---help--- Calculate simpler /proc/<PID>/wchan values. If this option @@@ -1576,6 -1577,9 +1577,9 @@@ source kernel/Kconfig.h
config KEXEC bool "kexec system call" + select BUILD_BIN2C + select CRYPTO + select CRYPTO_SHA256 ---help--- kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --combined arch/x86/Makefile index c65fd96,dc302a7..c1aa368 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@@ -15,9 -15,12 +15,9 @@@ endi # that way we can complain to the user if the CPU is insufficient. # # The -m16 option is supported by GCC >= 4.9 and clang >= 3.5. For -# older versions of GCC, we need to play evil and unreliable tricks to -# attempt to ensure that our asm(".code16gcc") is first in the asm -# output. -CODE16GCC_CFLAGS := -m32 -include $(srctree)/arch/x86/boot/code16gcc.h \ - $(call cc-option, -fno-toplevel-reorder,\ - $(call cc-option, -fno-unit-at-a-time)) +# older versions of GCC, include an *assembly* header to make sure that +# gcc doesn't play any games behind our back. +CODE16GCC_CFLAGS := -m32 -Wa,$(srctree)/arch/x86/boot/code16gcc.h M16_CFLAGS := $(call cc-option, -m16, $(CODE16GCC_CFLAGS))
REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \ @@@ -183,6 -186,14 +183,14 @@@ archscripts: scripts_basi archheaders: $(Q)$(MAKE) $(build)=arch/x86/syscalls all
+ archprepare: + ifeq ($(CONFIG_KEXEC),y) + # Build only for 64bit. No loaders for 32bit yet. + ifeq ($(CONFIG_X86_64),y) + $(Q)$(MAKE) $(build)=arch/x86/purgatory arch/x86/purgatory/kexec-purgatory.c + endif + endif + ### # Kernel objects
diff --combined arch/x86/mm/fault.c index 1dbade8,0193a32..939cb8c --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@@ -350,7 -350,7 +350,7 @@@ out
void vmalloc_sync_all(void) { - sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); + sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END, 0); }
/* @@@ -577,8 -577,6 +577,8 @@@ static int is_f00f_bug(struct pt_regs *
static const char nx_warning[] = KERN_CRIT "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; +static const char smep_warning[] = KERN_CRIT +"unable to execute userspace code (SMEP?) (uid: %d)\n";
static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, @@@ -599,10 -597,6 +599,10 @@@
if (pte && pte_present(*pte) && !pte_exec(*pte)) printk(nx_warning, from_kuid(&init_user_ns, current_uid())); + if (pte && pte_present(*pte) && pte_exec(*pte) && + (pgd_flags(*pgd) & _PAGE_USER) && + (read_cr4() & X86_CR4_SMEP)) + printk(smep_warning, from_kuid(&init_user_ns, current_uid())); }
printk(KERN_ALERT "BUG: unable to handle kernel "); diff --combined arch/x86/purgatory/Makefile index 0000000,e5829dd..7fde9ee mode 000000,100644..100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@@ -1,0 -1,30 +1,30 @@@ + purgatory-y := purgatory.o stack.o setup-x86_$(BITS).o sha256.o entry64.o string.o + + targets += $(purgatory-y) + PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) + + LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib + targets += purgatory.ro + + # Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That + # in turn leaves some undefined symbols like __fentry__ in purgatory and not + # sure how to relocate those. Like kexec-tools, use custom flags. + + KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fno-builtin -ffreestanding -c -MD -Os -mcmodel=large + + $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE + $(call if_changed,ld) + + targets += kexec-purgatory.c + + quiet_cmd_bin2c = BIN2C $@ - cmd_bin2c = cat $(obj)/purgatory.ro | $(srctree)/scripts/basic/bin2c kexec_purgatory > $(obj)/kexec-purgatory.c ++ cmd_bin2c = cat $(obj)/purgatory.ro | $(objtree)/scripts/basic/bin2c kexec_purgatory > $(obj)/kexec-purgatory.c + + $(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE + $(call if_changed,bin2c) + + + # No loaders for 32bits yet. + ifeq ($(CONFIG_X86_64),y) + obj-$(CONFIG_KEXEC) += kexec-purgatory.o + endif diff --combined drivers/base/Kconfig index 88500fe,9d5fed1..4e7f0ff --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@@ -149,21 -149,15 +149,21 @@@ config EXTRA_FIRMWARE_DI some other directory containing the firmware files.
config FW_LOADER_USER_HELPER + bool + +config FW_LOADER_USER_HELPER_FALLBACK bool "Fallback user-helper invocation for firmware loading" depends on FW_LOADER - default y + select FW_LOADER_USER_HELPER help This option enables / disables the invocation of user-helper (e.g. udev) for loading firmware files as a fallback after the direct file loading in kernel fails. The user-mode helper is no longer required unless you have a special firmware file that - resides in a non-standard path. + resides in a non-standard path. Moreover, the udev support has + been deprecated upstream. + + If you are unsure about this, say N here.
config DEBUG_DRIVER bool "Driver Core verbose debug messages" @@@ -214,15 -208,6 +214,15 @@@ config DMA_SHARED_BUFFE APIs extension; the file's descriptor can then be passed on to other driver.
+config FENCE_TRACE + bool "Enable verbose FENCE_TRACE messages" + depends on DMA_SHARED_BUFFER + help + Enable the FENCE_TRACE printks. This will add extra + spam to the console log, but will make it easier to diagnose + lockup related problems for dma-buffers shared across multiple + devices. + config DMA_CMA bool "DMA Contiguous Memory Allocator" depends on HAVE_DMA_CONTIGUOUS && CMA @@@ -289,16 -274,6 +289,6 @@@ config CMA_ALIGNMEN
If unsure, leave the default value "8".
- config CMA_AREAS - int "Maximum count of the CMA device-private areas" - default 7 - help - CMA allows to create CMA areas for particular devices. This parameter - sets the maximum number of such device private CMA areas in the - system. - - If unsure, leave the default value "7". - endif
endmenu diff --combined drivers/net/ethernet/intel/i40e/i40e_ethtool.c index df89b6c,c57b085..6237727 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@@ -1027,10 -1027,9 +1027,10 @@@ static int i40e_nway_reset(struct net_d struct i40e_netdev_priv *np = netdev_priv(netdev); struct i40e_pf *pf = np->vsi->back; struct i40e_hw *hw = &pf->hw; + bool link_up = hw->phy.link_info.link_info & I40E_AQ_LINK_UP; i40e_status ret = 0;
- ret = i40e_aq_set_link_restart_an(hw, NULL); + ret = i40e_aq_set_link_restart_an(hw, link_up, NULL); if (ret) { netdev_info(netdev, "link restart failed, aq_err=%d\n", pf->hw.aq.asq_last_status); @@@ -1106,36 -1105,17 +1106,36 @@@ static int i40e_set_coalesce(struct net if (ec->tx_max_coalesced_frames_irq || ec->rx_max_coalesced_frames_irq) vsi->work_limit = ec->tx_max_coalesced_frames_irq;
+ vector = vsi->base_vector; if ((ec->rx_coalesce_usecs >= (I40E_MIN_ITR << 1)) && - (ec->rx_coalesce_usecs <= (I40E_MAX_ITR << 1))) + (ec->rx_coalesce_usecs <= (I40E_MAX_ITR << 1))) { vsi->rx_itr_setting = ec->rx_coalesce_usecs; - else + } else if (ec->rx_coalesce_usecs == 0) { + vsi->rx_itr_setting = ec->rx_coalesce_usecs; + i40e_irq_dynamic_disable(vsi, vector); + if (ec->use_adaptive_rx_coalesce) + netif_info(pf, drv, netdev, + "Rx-usecs=0, need to disable adaptive-Rx for a complete disable\n"); + } else { + netif_info(pf, drv, netdev, + "Invalid value, Rx-usecs range is 0, 8-8160\n"); return -EINVAL; + }
if ((ec->tx_coalesce_usecs >= (I40E_MIN_ITR << 1)) && - (ec->tx_coalesce_usecs <= (I40E_MAX_ITR << 1))) + (ec->tx_coalesce_usecs <= (I40E_MAX_ITR << 1))) { vsi->tx_itr_setting = ec->tx_coalesce_usecs; - else + } else if (ec->tx_coalesce_usecs == 0) { + vsi->tx_itr_setting = ec->tx_coalesce_usecs; + i40e_irq_dynamic_disable(vsi, vector); + if (ec->use_adaptive_tx_coalesce) + netif_info(pf, drv, netdev, + "Tx-usecs=0, need to disable adaptive-Tx for a complete disable\n"); + } else { + netif_info(pf, drv, netdev, + "Invalid value, Tx-usecs range is 0, 8-8160\n"); return -EINVAL; + }
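Both the Rx and Tx branches above implement one policy: 0 disables interrupt moderation for the ring (with a note if adaptive coalescing is still on), values inside the ITR window are taken verbatim, and anything else is rejected with -EINVAL. A standalone sketch of that validation, assuming the 8-8160 bounds quoted in the driver's own message (I40E_MIN_ITR << 1 and I40E_MAX_ITR << 1 in the real code):

#include <stdio.h>
#include <stdbool.h>

#define ITR_MIN 8	/* assumed from the "range is 0, 8-8160" message */
#define ITR_MAX 8160

/* Returns 0 on success, -1 for invalid input (the -EINVAL path). */
static int set_coalesce_usecs(unsigned int usecs, bool adaptive,
			      unsigned int *setting)
{
	if (usecs == 0) {
		*setting = 0;	/* moderation fully disabled */
		if (adaptive)
			fprintf(stderr, "usecs=0: also disable adaptive mode for a complete disable\n");
		return 0;
	}
	if (usecs >= ITR_MIN && usecs <= ITR_MAX) {
		*setting = usecs;
		return 0;
	}
	fprintf(stderr, "invalid value, range is 0, 8-8160\n");
	return -1;
}

int main(void)
{
	unsigned int itr;

	printf("%d\n", set_coalesce_usecs(0, true, &itr));	/* ok + nag */
	printf("%d\n", set_coalesce_usecs(100, false, &itr));	/* ok */
	printf("%d\n", set_coalesce_usecs(4, false, &itr));	/* rejected */
	return 0;
}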
if (ec->use_adaptive_rx_coalesce) vsi->rx_itr_setting |= I40E_ITR_DYNAMIC; @@@ -1147,6 -1127,7 +1147,6 @@@ else vsi->tx_itr_setting &= ~I40E_ITR_DYNAMIC;
- vector = vsi->base_vector; for (i = 0; i < vsi->num_q_vectors; i++, vector++) { q_vector = vsi->q_vectors[i]; q_vector->rx.itr = ITR_TO_REG(vsi->rx_itr_setting); @@@ -1517,7 -1498,7 +1517,7 @@@ static int i40e_update_ethtool_fdir_ent
/* add filter to the list */ if (parent) - hlist_add_after(&parent->fdir_node, &input->fdir_node); + hlist_add_behind(&input->fdir_node, &parent->fdir_node); else hlist_add_head(&input->fdir_node, &pf->fdir_filter_list); diff --combined fs/cifs/cifssmb.c index b7e5b65,c3dc52e..0e706ab --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@@ -1734,7 -1734,10 +1734,7 @@@ CIFSSMBRead(const unsigned int xid, str
/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ if (*buf) { - if (resp_buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(iov[0].iov_base); - else if (resp_buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base); + free_rsp_buf(resp_buf_type, iov[0].iov_base); } else if (resp_buf_type != CIFS_NO_BUFFER) { /* return buffer to caller to free */ *buf = iov[0].iov_base; @@@ -2200,7 -2203,10 +2200,7 @@@ CIFSSMBWrite2(const unsigned int xid, s }
/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ - if (resp_buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(iov[0].iov_base); - else if (resp_buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base); + free_rsp_buf(resp_buf_type, iov[0].iov_base);
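This hunk, and several more below, collapse the repeated small-buffer/large-buffer release chain into one free_rsp_buf() call; the helper's body is visible later in this same diff, where a now-redundant static copy is deleted from fs/cifs/smb2pdu.c. A standalone sketch of the same tag-dispatched release pattern (the allocator and release functions are stand-ins, not the CIFS code):

#include <stdio.h>
#include <stdlib.h>

enum { NO_BUFFER, SMALL_BUFFER, LARGE_BUFFER };	/* stand-ins for CIFS_* */

static void small_buf_release(void *p) { puts("small buf released"); free(p); }
static void large_buf_release(void *p) { puts("large buf released"); free(p); }

/* One helper replaces the if/else chain previously repeated per call site. */
static void free_rsp_buf(int buftype, void *rsp)
{
	if (buftype == SMALL_BUFFER)
		small_buf_release(rsp);
	else if (buftype == LARGE_BUFFER)
		large_buf_release(rsp);
	/* NO_BUFFER: nothing was allocated, or the caller kept it */
}

int main(void)
{
	free_rsp_buf(SMALL_BUFFER, malloc(64));
	free_rsp_buf(LARGE_BUFFER, malloc(4096));
	free_rsp_buf(NO_BUFFER, NULL);
	return 0;
}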
/* Note: On -EAGAIN error only caller can retry on handle based calls since file handle passed in no longer valid */ @@@ -2424,14 -2430,14 +2424,14 @@@ CIFSSMBPosixLock(const unsigned int xid } parm_data = (struct cifs_posix_lock *) ((char *)&pSMBr->hdr.Protocol + data_offset); - if (parm_data->lock_type == __constant_cpu_to_le16(CIFS_UNLCK)) + if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK)) pLockData->fl_type = F_UNLCK; else { if (parm_data->lock_type == - __constant_cpu_to_le16(CIFS_RDLCK)) + cpu_to_le16(CIFS_RDLCK)) pLockData->fl_type = F_RDLCK; else if (parm_data->lock_type == - __constant_cpu_to_le16(CIFS_WRLCK)) + cpu_to_le16(CIFS_WRLCK)) pLockData->fl_type = F_WRLCK;
pLockData->fl_start = le64_to_cpu(parm_data->start); @@@ -2445,7 -2451,10 +2445,7 @@@ plk_err_exit if (pSMB) cifs_small_buf_release(pSMB);
- if (resp_buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(iov[0].iov_base); - else if (resp_buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base); + free_rsp_buf(resp_buf_type, iov[0].iov_base);
/* Note: On -EAGAIN error only caller can retry on handle based calls since file handle passed in no longer valid */ @@@ -3223,25 -3232,25 +3223,25 @@@ CIFSSMB_set_compression(const unsigned pSMB->compression_state = cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
pSMB->TotalParameterCount = 0; - pSMB->TotalDataCount = __constant_cpu_to_le32(2); + pSMB->TotalDataCount = cpu_to_le32(2); pSMB->MaxParameterCount = 0; pSMB->MaxDataCount = 0; pSMB->MaxSetupCount = 4; pSMB->Reserved = 0; pSMB->ParameterOffset = 0; - pSMB->DataCount = __constant_cpu_to_le32(2); + pSMB->DataCount = cpu_to_le32(2); pSMB->DataOffset = cpu_to_le32(offsetof(struct smb_com_transaction_compr_ioctl_req, compression_state) - 4); /* 84 */ pSMB->SetupCount = 4; - pSMB->SubCommand = __constant_cpu_to_le16(NT_TRANSACT_IOCTL); + pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL); pSMB->ParameterCount = 0; - pSMB->FunctionCode = __constant_cpu_to_le32(FSCTL_SET_COMPRESSION); + pSMB->FunctionCode = cpu_to_le32(FSCTL_SET_COMPRESSION); pSMB->IsFsctl = 1; /* FSCTL */ pSMB->IsRootFlag = 0; pSMB->Fid = fid; /* file handle always le */ /* 3 byte pad, followed by 2 byte compress state */ - pSMB->ByteCount = __constant_cpu_to_le16(5); + pSMB->ByteCount = cpu_to_le16(5); inc_rfc1001_len(pSMB, 5);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@@ -3377,10 -3386,10 +3377,10 @@@ static __u16 ACL_to_cifs_posix(char *pa cifs_acl->version = cpu_to_le16(1); if (acl_type == ACL_TYPE_ACCESS) { cifs_acl->access_entry_count = cpu_to_le16(count); - cifs_acl->default_entry_count = __constant_cpu_to_le16(0xFFFF); + cifs_acl->default_entry_count = cpu_to_le16(0xFFFF); } else if (acl_type == ACL_TYPE_DEFAULT) { cifs_acl->default_entry_count = cpu_to_le16(count); - cifs_acl->access_entry_count = __constant_cpu_to_le16(0xFFFF); + cifs_acl->access_entry_count = cpu_to_le16(0xFFFF); } else { cifs_dbg(FYI, "unknown ACL type %d\n", acl_type); return 0; @@@ -3829,7 -3838,10 +3829,7 @@@ CIFSSMBGetCIFSACL(const unsigned int xi } } qsec_out: - if (buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(iov[0].iov_base); - else if (buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base); + free_rsp_buf(buf_type, iov[0].iov_base); /* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ return rc; } diff --combined fs/cifs/sess.c index 36d0b90,27e6175..db6ecdf --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@@ -46,7 -46,7 +46,7 @@@ static __u32 cifs_ssetup_hdr(struct cif CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4, USHRT_MAX)); pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); - pSMB->req.VcNumber = __constant_cpu_to_le16(1); + pSMB->req.VcNumber = cpu_to_le16(1);
/* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
@@@ -520,240 -520,6 +520,240 @@@ select_sectype(struct TCP_Server_Info * } }
+struct sess_data { + unsigned int xid; + struct cifs_ses *ses; + struct nls_table *nls_cp; + void (*func)(struct sess_data *); + int result; + + /* we will send the SMB in three pieces: + * a fixed length beginning part, an optional + * SPNEGO blob (which can be zero length), and a + * last part which will include the strings + * and rest of bcc area. This allows us to avoid + * a large buffer 17K allocation + */ + int buf0_type; + struct kvec iov[3]; +}; + +static int +sess_alloc_buffer(struct sess_data *sess_data, int wct) +{ + int rc; + struct cifs_ses *ses = sess_data->ses; + struct smb_hdr *smb_buf; + + rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses, + (void **)&smb_buf); + + if (rc) + return rc; + + sess_data->iov[0].iov_base = (char *)smb_buf; + sess_data->iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4; + /* + * This variable will be used to clear the buffer + * allocated above in case of any error in the calling function. + */ + sess_data->buf0_type = CIFS_SMALL_BUFFER; + + /* 2000 big enough to fit max user, domain, NOS name etc. */ + sess_data->iov[2].iov_base = kmalloc(2000, GFP_KERNEL); + if (!sess_data->iov[2].iov_base) { + rc = -ENOMEM; + goto out_free_smb_buf; + } + + return 0; + +out_free_smb_buf: + kfree(smb_buf); + sess_data->iov[0].iov_base = NULL; + sess_data->iov[0].iov_len = 0; + sess_data->buf0_type = CIFS_NO_BUFFER; + return rc; +} + +static void +sess_free_buffer(struct sess_data *sess_data) +{ + + free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base); + sess_data->buf0_type = CIFS_NO_BUFFER; + kfree(sess_data->iov[2].iov_base); +} + +static int +sess_establish_session(struct sess_data *sess_data) +{ + struct cifs_ses *ses = sess_data->ses; + + mutex_lock(&ses->server->srv_mutex); + if (!ses->server->session_estab) { + if (ses->server->sign) { + ses->server->session_key.response = + kmemdup(ses->auth_key.response, + ses->auth_key.len, GFP_KERNEL); + if (!ses->server->session_key.response) { + mutex_unlock(&ses->server->srv_mutex); + return -ENOMEM; + } + ses->server->session_key.len = + ses->auth_key.len; + } + ses->server->sequence_number = 0x2; + ses->server->session_estab = true; + } + mutex_unlock(&ses->server->srv_mutex); + + cifs_dbg(FYI, "CIFS session established successfully\n"); + spin_lock(&GlobalMid_Lock); + ses->status = CifsGood; + ses->need_reconnect = false; + spin_unlock(&GlobalMid_Lock); + + return 0; +} + +static int +sess_sendreceive(struct sess_data *sess_data) +{ + int rc; + struct smb_hdr *smb_buf = (struct smb_hdr *) sess_data->iov[0].iov_base; + __u16 count; + + count = sess_data->iov[1].iov_len + sess_data->iov[2].iov_len; + smb_buf->smb_buf_length = + cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count); + put_bcc(count, smb_buf); + + rc = SendReceive2(sess_data->xid, sess_data->ses, + sess_data->iov, 3 /* num_iovecs */, + &sess_data->buf0_type, + CIFS_LOG_ERROR); + + return rc; +} + +/* + * LANMAN and plaintext are less secure and off by default. + * So we make this explicitly be turned on in kconfig (in the + * build) and turned on at runtime (changed from the default) + * in proc/fs/cifs or via mount parm. Unfortunately this is + * needed for old Win (e.g. 
Win95), some obscure NAS and OS/2 + */ +#ifdef CONFIG_CIFS_WEAK_PW_HASH +static void +sess_auth_lanman(struct sess_data *sess_data) +{ + int rc = 0; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + char *bcc_ptr; + struct cifs_ses *ses = sess_data->ses; + char lnm_session_key[CIFS_AUTH_RESP_SIZE]; + __u32 capabilities; + __u16 bytes_remaining; + + /* lanman 2 style sessionsetup */ + /* wct = 10 */ + rc = sess_alloc_buffer(sess_data, 10); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + bcc_ptr = sess_data->iov[2].iov_base; + capabilities = cifs_ssetup_hdr(ses, pSMB); + + pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; + + /* no capabilities flags in old lanman negotiation */ + pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); + + /* Calculate hash with password and copy into bcc_ptr. + * Encryption Key (stored as in cryptkey) gets used if the + * security mode bit in Negottiate Protocol response states + * to use challenge/response method (i.e. Password bit is 1). + */ + rc = calc_lanman_hash(ses->password, ses->server->cryptkey, + ses->server->sec_mode & SECMODE_PW_ENCRYPT ? + true : false, lnm_session_key); + + memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + + /* + * can not sign if LANMAN negotiated so no need + * to calculate signing key? but what if server + * changed to do higher than lanman dialect and + * we reconnected would we ever calc signing_key? + */ + + cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n"); + /* Unicode not allowed for LANMAN dialects */ + ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); + + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base; + + rc = sess_sendreceive(sess_data); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; + + /* lanman response has a word count of 3 */ + if (smb_buf->WordCount != 3) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out; + } + + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) + cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cifs_dbg(FYI, "UID = %llu\n", ses->Suid); + + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf); + + /* BB check if Unicode and decode strings */ + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; + } + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } else { + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } + + rc = sess_establish_session(sess_data); +out: + sess_data->result = rc; + sess_data->func = NULL; + sess_free_buffer(sess_data); +} + +#else + +static void +sess_auth_lanman(struct sess_data *sess_data) +{ + sess_data->result = -EOPNOTSUPP; + sess_data->func = NULL; +} +#endif + int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, const struct nls_table *nls_cp) @@@ -774,21 -540,12 +774,21 @@@ __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ u16 blob_len; char *ntlmsspblob = NULL; + struct sess_data *sess_data;
if (ses == NULL) { WARN(1, "%s: ses == NULL!", __func__); return -EINVAL; }
+ sess_data = kzalloc(sizeof(struct sess_data), GFP_KERNEL); + if (!sess_data) + return -ENOMEM; + sess_data->xid = xid; + sess_data->ses = ses; + sess_data->buf0_type = CIFS_NO_BUFFER; + sess_data->nls_cp = (struct nls_table *) nls_cp; + type = select_sectype(ses->server, ses->sectype); cifs_dbg(FYI, "sess setup type %d\n", type); if (type == Unspecified) { @@@ -797,14 -554,6 +797,14 @@@ return -EINVAL; }
+ switch (type) { + case LANMAN: + sess_auth_lanman(sess_data); + goto out; + default: + cifs_dbg(FYI, "Continuing with CIFS_SessSetup\n"); + } + if (type == RawNTLMSSP) { /* if memory allocation is successful, caller of this function * frees it. @@@ -820,7 -569,17 +820,7 @@@ ssetup_ntlmssp_authenticate if (phase == NtLmChallenge) phase = NtLmAuthenticate; /* if ntlmssp, now final phase */
- if (type == LANMAN) { -#ifndef CONFIG_CIFS_WEAK_PW_HASH - /* LANMAN and plaintext are less secure and off by default. - So we make this explicitly be turned on in kconfig (in the - build) and turned on at runtime (changed from the default) - in proc/fs/cifs or via mount parm. Unfortunately this is - needed for old Win (e.g. Win95), some obscure NAS and OS/2 */ - return -EOPNOTSUPP; -#endif - wct = 10; /* lanman 2 style sessionsetup */ - } else if ((type == NTLM) || (type == NTLMv2)) { + if ((type == NTLM) || (type == NTLMv2)) { /* For NTLMv2 failures eventually may need to retry NTLM */ wct = 13; /* old style NTLM sessionsetup */ } else /* same size: negotiate or auth, NTLMSSP or extended security */ @@@ -859,7 -618,39 +859,7 @@@ iov[1].iov_base = NULL; iov[1].iov_len = 0;
- if (type == LANMAN) { -#ifdef CONFIG_CIFS_WEAK_PW_HASH - char lnm_session_key[CIFS_AUTH_RESP_SIZE]; - - pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; - - /* no capabilities flags in old lanman negotiation */ - - pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); - - /* Calculate hash with password and copy into bcc_ptr. - * Encryption Key (stored as in cryptkey) gets used if the - * security mode bit in Negottiate Protocol response states - * to use challenge/response method (i.e. Password bit is 1). - */ - - rc = calc_lanman_hash(ses->password, ses->server->cryptkey, - ses->server->sec_mode & SECMODE_PW_ENCRYPT ? - true : false, lnm_session_key); - - memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE); - bcc_ptr += CIFS_AUTH_RESP_SIZE; - - /* can not sign if LANMAN negotiated so no need - to calculate signing key? but what if server - changed to do higher than lanman dialect and - we reconnected would we ever calc signing_key? */ - - cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n"); - /* Unicode not allowed for LANMAN dialects */ - ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); -#endif - } else if (type == NTLM) { + if (type == NTLM) { pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); pSMB->req_no_secext.CaseInsensitivePasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); @@@ -1098,6 -889,7 +1098,6 @@@ } if (phase == NtLmChallenge) { rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses); - /* now goto beginning for ntlmssp authenticate phase */ if (rc) goto ssetup_exit; } @@@ -1170,9 -962,4 +1170,9 @@@ keycp_exit kfree(ses->ntlmssp);
return rc; + +out: + rc = sess_data->result; + kfree(sess_data); + return rc; } diff --combined fs/cifs/smb2pdu.c index 0158104,a9b03c2..b3f05b4 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@@ -309,6 -309,16 +309,6 @@@ small_smb2_init(__le16 smb2_command, st return rc; }
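The new sess_data structure carries a func pointer plus a result field: each handler (sess_auth_lanman above; presumably the other authentication types in follow-up patches) either chains to the next phase or clears func and stores its result, so a driver loop only has to keep calling func until it goes NULL. A minimal standalone sketch of that dispatch style (the two phases are invented for illustration):

#include <stdio.h>

struct sess_data {
	void (*func)(struct sess_data *);	/* next phase, NULL when done */
	int result;
	int step;				/* illustrative state */
};

static void phase_done(struct sess_data *sd)
{
	sd->result = 0;
	sd->func = NULL;	/* handlers clear func to stop the machine */
}

static void phase_negotiate(struct sess_data *sd)
{
	printf("negotiate, step %d\n", sd->step++);
	sd->func = phase_done;	/* chain to the next phase */
}

int main(void)
{
	struct sess_data sd = { .func = phase_negotiate };

	while (sd.func)		/* the whole driver loop */
		sd.func(&sd);
	printf("result=%d\n", sd.result);
	return 0;
}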
-static void -free_rsp_buf(int resp_buftype, void *rsp) -{ - if (resp_buftype == CIFS_SMALL_BUFFER) - cifs_small_buf_release(rsp); - else if (resp_buftype == CIFS_LARGE_BUFFER) - cifs_buf_release(rsp); -} - - /* * * SMB2 Worker functions follow: @@@ -1359,7 -1369,7 +1359,7 @@@ SMB2_set_compression(const unsigned in char *ret_data = NULL;
fsctl_input.CompressionState = - __constant_cpu_to_le16(COMPRESSION_FORMAT_DEFAULT); + cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, FSCTL_SET_COMPRESSION, true /* is_fsctl */, diff --combined fs/namespace.c index b10db3d,2a1447c..019ff81 --- a/fs/namespace.c +++ b/fs/namespace.c @@@ -225,7 -225,6 +225,7 @@@ static struct mount *alloc_vfsmnt(cons INIT_LIST_HEAD(&mnt->mnt_share); INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); + INIT_LIST_HEAD(&mnt->mnt_mp_list); #ifdef CONFIG_FSNOTIFY INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); #endif @@@ -668,45 -667,11 +668,45 @@@ struct vfsmount *lookup_mnt(struct pat return m; }
-static struct mountpoint *new_mountpoint(struct dentry *dentry) +/* + * __is_local_mountpoint - Test to see if dentry is a mountpoint in the + * current mount namespace. + * + * The common case is that dentries are not mountpoints at all, and that + * test is handled inline. For the slow case when we are actually + * dealing with a mountpoint of some kind, walk through all of the + * mounts in the current mount namespace and test to see if the dentry + * is a mountpoint. + * + * The mount_hashtable is not usable in this context because we + * need to identify all mounts that may be in the current mount + * namespace, not just a mount that happens to have some specified + * parent mount. + */ +bool __is_local_mountpoint(struct dentry *dentry) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + struct mount *mnt; + bool is_covered = false; + + if (!d_mountpoint(dentry)) + goto out; + + down_read(&namespace_sem); + list_for_each_entry(mnt, &ns->list, mnt_list) { + is_covered = (mnt->mnt_mountpoint == dentry); + if (is_covered) + break; + } + up_read(&namespace_sem); +out: + return is_covered; +} + +static struct mountpoint *lookup_mountpoint(struct dentry *dentry) { struct hlist_head *chain = mp_hash(dentry); struct mountpoint *mp; - int ret;
hlist_for_each_entry(mp, chain, m_hash) { if (mp->m_dentry == dentry) { @@@ -717,14 -682,6 +717,14 @@@ return mp; } } + return NULL; +} + +static struct mountpoint *new_mountpoint(struct dentry *dentry) +{ + struct hlist_head *chain = mp_hash(dentry); + struct mountpoint *mp; + int ret;
mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); if (!mp) @@@ -739,7 -696,6 +739,7 @@@ mp->m_dentry = dentry; mp->m_count = 1; hlist_add_head(&mp->m_hash, chain); + INIT_LIST_HEAD(&mp->m_list); return mp; }
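Splitting the old new_mountpoint() into lookup_mountpoint() (which never allocates) and a pure allocator lets callers spell out "find the existing mountpoint, else create one", which is exactly what the lock_mount() hunk near the end of this file now does. A standalone sketch of that lookup-or-create split over a toy list (names are illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct entry { char key[32]; struct entry *next; };

static struct entry *table;	/* one illustrative hash chain */

static struct entry *lookup_entry(const char *key)
{
	struct entry *e;

	for (e = table; e; e = e->next)
		if (strcmp(e->key, key) == 0)
			return e;
	return NULL;		/* lookup never allocates */
}

static struct entry *new_entry(const char *key)
{
	struct entry *e = calloc(1, sizeof(*e));

	if (!e)
		return NULL;
	snprintf(e->key, sizeof(e->key), "%s", key);
	e->next = table;
	table = e;
	return e;
}

int main(void)
{
	/* The caller-side shape from the lock_mount() hunk below. */
	struct entry *e = lookup_entry("/mnt");

	if (!e)
		e = new_entry("/mnt");
	if (!e)
		return 1;
	printf("%s -> %p\n", e->key, (void *)e);
	return 0;
}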
@@@ -747,7 -703,6 +747,7 @@@ static void put_mountpoint(struct mount { if (!--mp->m_count) { struct dentry *dentry = mp->m_dentry; + BUG_ON(!list_empty(&mp->m_list)); spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_MOUNTED; spin_unlock(&dentry->d_lock); @@@ -794,7 -749,6 +794,7 @@@ static void detach_mnt(struct mount *mn mnt->mnt_mountpoint = mnt->mnt.mnt_root; list_del_init(&mnt->mnt_child); hlist_del_init_rcu(&mnt->mnt_hash); + list_del_init(&mnt->mnt_mp_list); put_mountpoint(mnt->mnt_mp); mnt->mnt_mp = NULL; } @@@ -811,7 -765,6 +811,7 @@@ void mnt_set_mountpoint(struct mount *m child_mnt->mnt_mountpoint = dget(mp->m_dentry); child_mnt->mnt_parent = mnt; child_mnt->mnt_mp = mp; + list_add_tail(&child_mnt->mnt_mp_list, &mp->m_list); }
/* @@@ -845,7 -798,7 +845,7 @@@ static void commit_tree(struct mount *m list_splice(&head, n->list.prev);
if (shadows) - hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash); + hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash); else hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mnt->mnt_mountpoint)); @@@ -983,25 -936,9 +983,25 @@@ static struct mount *clone_mnt(struct m return ERR_PTR(err); }
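Here and in the i40e fdir hunk above, hlist_add_after()/hlist_add_after_rcu() become hlist_add_behind()/hlist_add_behind_rcu(), and the argument order flips: the node being inserted now comes first, followed by the existing node it goes behind. A toy standalone version with the new convention (simplified to a singly linked node, without the real hlist's pprev back-pointers):

#include <stdio.h>

struct hnode { int val; struct hnode *next; };

/* New calling convention: node being added first, existing anchor second. */
static void hlist_add_behind(struct hnode *n, struct hnode *prev)
{
	n->next = prev->next;
	prev->next = n;
}

int main(void)
{
	struct hnode a = { 1, NULL }, b = { 2, NULL }, c = { 3, NULL };
	struct hnode *p;

	a.next = &c;			/* list: 1 -> 3 */
	hlist_add_behind(&b, &a);	/* list: 1 -> 2 -> 3 */

	for (p = &a; p; p = p->next)
		printf("%d ", p->val);
	printf("\n");
	return 0;
}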
+static void cleanup_mnt(struct mount *mnt) +{ + fsnotify_vfsmount_delete(&mnt->mnt); + dput(mnt->mnt.mnt_root); + deactivate_super(mnt->mnt.mnt_sb); + mnt_free_id(mnt); + complete(mnt->mnt_undone); + call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); +} + +static void cleanup_mnt_work(struct work_struct *work) +{ + cleanup_mnt(container_of(work, struct mount, mnt_cleanup_work)); +} + static void mntput_no_expire(struct mount *mnt) { -put_again: + struct completion undone; + rcu_read_lock(); mnt_add_count(mnt, -1); if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */ @@@ -1015,15 -952,12 +1015,15 @@@ return; } if (unlikely(mnt->mnt_pinned)) { - mnt_add_count(mnt, mnt->mnt_pinned + 1); + init_completion(&undone); + mnt->mnt_undone = &undone; + mnt_add_count(mnt, mnt->mnt_pinned); mnt->mnt_pinned = 0; rcu_read_unlock(); unlock_mount_hash(); acct_auto_close_mnt(&mnt->mnt); - goto put_again; + wait_for_completion(&undone); + return; } if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { rcu_read_unlock(); @@@ -1047,19 -981,11 +1047,19 @@@ * so mnt_get_writers() below is safe. */ WARN_ON(mnt_get_writers(mnt)); - fsnotify_vfsmount_delete(&mnt->mnt); - dput(mnt->mnt.mnt_root); - deactivate_super(mnt->mnt.mnt_sb); - mnt_free_id(mnt); - call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); + /* The stack may be deep here, cleanup the mount on a work + * queue where the stack is guaranteed to be shallow. + */ + init_completion(&undone); + if (!mnt->mnt_undone) + mnt->mnt_undone = &undone; + else + complete(&undone); + + INIT_WORK(&mnt->mnt_cleanup_work, cleanup_mnt_work); + schedule_work(&mnt->mnt_cleanup_work); + + wait_for_completion(&undone); }
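The rewritten mntput_no_expire() pushes the heavy teardown into a work item and blocks on a completion until cleanup_mnt() signals it, trading stack depth for a context switch, per the comment about deep stacks. A userspace analog of that shape, with a thread standing in for the workqueue and a mutex/condvar pair standing in for the kernel completion (build with -pthread; this is not kernel API):

#include <pthread.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's struct completion. */
struct completion {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int done;
};

static void complete(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = 1;
	pthread_cond_signal(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

static void *cleanup_work(void *arg)
{
	printf("deep teardown runs on the worker's shallow stack\n");
	complete(arg);		/* let the original caller proceed */
	return NULL;
}

int main(void)
{
	struct completion undone = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0
	};
	pthread_t worker;

	pthread_create(&worker, NULL, cleanup_work, &undone);
	wait_for_completion(&undone);	/* mirrors mntput_no_expire() */
	pthread_join(worker, NULL);
	return 0;
}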
void mntput(struct vfsmount *mnt) @@@ -1335,7 -1261,6 +1335,7 @@@ void umount_tree(struct mount *mnt, in p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; list_del_init(&p->mnt_child); if (mnt_has_parent(p)) { + list_del_init(&p->mnt_mp_list); put_mountpoint(p->mnt_mp); /* move the reference to mountpoint into ->mnt_ex_mountpoint */ p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint; @@@ -1448,37 -1373,6 +1448,37 @@@ static int do_umount(struct mount *mnt return retval; }
+/* + * __detach_mounts - lazily unmount all mounts on the specified dentry + * + * During unlink, rmdir, and d_drop it is possible to lose the path + * to an existing mountpoint, and wind up leaking the mount. + * detach_mounts allows lazily unmounting those mounts instead of + * leaking them. + * + * The caller may hold dentry->d_inode->i_mutex. + */ +void __detach_mounts(struct dentry *dentry) +{ + struct mountpoint *mp; + struct mount *mnt; + + namespace_lock(); + mp = lookup_mountpoint(dentry); + if (!mp) + goto out_unlock; + + lock_mount_hash(); + while (!list_empty(&mp->m_list)) { + mnt = list_first_entry(&mp->m_list, struct mount, mnt_mp_list); + umount_tree(mnt, 2); + } + unlock_mount_hash(); + put_mountpoint(mp); +out_unlock: + namespace_unlock(); +} + /* * Is the caller allowed to modify his namespace? */ @@@ -1828,9 -1722,7 +1828,9 @@@ retry namespace_lock(); mnt = lookup_mnt(path); if (likely(!mnt)) { - struct mountpoint *mp = new_mountpoint(dentry); + struct mountpoint *mp = lookup_mountpoint(dentry); + if (!mp) + mp = new_mountpoint(dentry); if (IS_ERR(mp)) { namespace_unlock(); mutex_unlock(&dentry->d_inode->i_mutex); diff --combined fs/proc/base.c index e442784,79df9ff..feb01e0 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@@ -1625,6 -1625,7 +1625,6 @@@ int pid_revalidate(struct dentry *dentr put_task_struct(task); return 1; } - d_drop(dentry); return 0; }
@@@ -1761,6 -1762,9 +1761,6 @@@ out put_task_struct(task);
out_notask: - if (status <= 0) - d_drop(dentry); - return status; }
@@@ -2445,7 -2449,7 +2445,7 @@@ static int proc_tgid_io_accounting(stru
#ifdef CONFIG_USER_NS static int proc_id_map_open(struct inode *inode, struct file *file, - struct seq_operations *seq_ops) + const struct seq_operations *seq_ops) { struct user_namespace *ns = NULL; struct task_struct *task; @@@ -2672,7 -2676,8 +2672,7 @@@ static void proc_flush_task_mnt(struct /* no ->d_hash() rejects on procfs */ dentry = d_hash_and_lookup(mnt->mnt_root, &name); if (dentry) { - shrink_dcache_parent(dentry); - d_drop(dentry); + d_invalidate(dentry); dput(dentry); }
@@@ -2692,7 -2697,8 +2692,7 @@@ name.len = snprintf(buf, sizeof(buf), "%d", pid); dentry = d_hash_and_lookup(dir, &name); if (dentry) { - shrink_dcache_parent(dentry); - d_drop(dentry); + d_invalidate(dentry); dput(dentry); }
diff --combined include/linux/kernel.h index a9e2268,44a498d..e989204 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@@ -470,6 -470,7 +470,7 @@@ extern enum system_states #define TAINT_FIRMWARE_WORKAROUND 11 #define TAINT_OOT_MODULE 12 #define TAINT_UNSIGNED_MODULE 13 + #define TAINT_SOFTLOCKUP 14
extern const char hex_asc[]; #define hex_asc_lo(x) hex_asc[((x) & 0x0f)] @@@ -493,15 -494,10 +494,10 @@@ static inline char *hex_byte_pack_upper return buf; }
extern int hex_to_bin(char ch); extern int __must_check hex2bin(u8 *dst, const char *src, size_t count);
-int mac_pton(const char *s, u8 *mac); +bool mac_pton(const char *s, u8 *mac);
/* * General tracing related utility functions - trace_printk(), @@@ -719,23 -715,8 +715,8 @@@ static inline void ftrace_dump(enum ftr (void) (&_max1 == &_max2); \ _max1 > _max2 ? _max1 : _max2; })
- #define min3(x, y, z) ({ \ - typeof(x) _min1 = (x); \ - typeof(y) _min2 = (y); \ - typeof(z) _min3 = (z); \ - (void) (&_min1 == &_min2); \ - (void) (&_min1 == &_min3); \ - _min1 < _min2 ? (_min1 < _min3 ? _min1 : _min3) : \ - (_min2 < _min3 ? _min2 : _min3); }) - - #define max3(x, y, z) ({ \ - typeof(x) _max1 = (x); \ - typeof(y) _max2 = (y); \ - typeof(z) _max3 = (z); \ - (void) (&_max1 == &_max2); \ - (void) (&_max1 == &_max3); \ - _max1 > _max2 ? (_max1 > _max3 ? _max1 : _max3) : \ - (_max2 > _max3 ? _max2 : _max3); }) + #define min3(x, y, z) min((typeof(x))min(x, y), z) + #define max3(x, y, z) max((typeof(x))max(x, y), z)
/** * min_not_zero - return the minimum that is _not_ zero, unless both are zero @@@ -750,20 -731,13 +731,13 @@@ /** * clamp - return a value clamped to a given range with strict typechecking * @val: current value - * @min: minimum allowable value - * @max: maximum allowable value + * @lo: lowest allowable value + * @hi: highest allowable value * * This macro does strict typechecking of min/max to make sure they are of the * same type as val. See the unnecessary pointer comparisons. */ - #define clamp(val, min, max) ({ \ - typeof(val) __val = (val); \ - typeof(min) __min = (min); \ - typeof(max) __max = (max); \ - (void) (&__val == &__min); \ - (void) (&__val == &__max); \ - __val = __val < __min ? __min: __val; \ - __val > __max ? __max: __val; }) + #define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)
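The rewritten macros compose the two-argument forms instead of open-coding three-way comparisons: min3(x, y, z) is min(min(x, y), z) and clamp(val, lo, hi) is min(max(val, lo), hi), with the typeof cast keeping the intermediate result in the first argument's type. A standalone demo of the composed forms, using simplified GNU C min()/max() helpers in place of the kernel's (which additionally compare pointer types to enforce matching argument types):

#include <stdio.h>

/* Simplified GNU C min()/max(); the kernel versions also do the
 * unnecessary-pointer-comparison type check. */
#define min(x, y) ({ typeof(x) _a = (x); typeof(y) _b = (y); \
		     _a < _b ? _a : _b; })
#define max(x, y) ({ typeof(x) _c = (x); typeof(y) _d = (y); \
		     _c > _d ? _c : _d; })

/* The rewritten three-way forms, as in the hunks above. */
#define min3(x, y, z) min((typeof(x))min(x, y), z)
#define max3(x, y, z) max((typeof(x))max(x, y), z)
#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)

int main(void)
{
	printf("min3(7, 3, 5)    = %d\n", min3(7, 3, 5));	/* 3 */
	printf("max3(7, 3, 5)    = %d\n", max3(7, 3, 5));	/* 7 */
	printf("clamp(12, 0, 10) = %d\n", clamp(12, 0, 10));	/* 10 */
	printf("clamp(-4, 0, 10) = %d\n", clamp(-4, 0, 10));	/* 0 */
	return 0;
}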
/* * ..and if you can't take the strict diff --combined include/linux/sched.h index b39a671,1d169c8..ec89295 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@@ -33,6 -33,7 +33,7 @@@ struct sched_param
#include <linux/smp.h> #include <linux/sem.h> + #include <linux/shm.h> #include <linux/signal.h> #include <linux/compiler.h> #include <linux/completion.h> @@@ -872,21 -873,21 +873,21 @@@ enum cpu_idle_type #define SD_NUMA 0x4000 /* cross-node balancing */
#ifdef CONFIG_SCHED_SMT -static inline const int cpu_smt_flags(void) +static inline int cpu_smt_flags(void) { return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES; } #endif
#ifdef CONFIG_SCHED_MC -static inline const int cpu_core_flags(void) +static inline int cpu_core_flags(void) { return SD_SHARE_PKG_RESOURCES; } #endif
#ifdef CONFIG_NUMA -static inline const int cpu_numa_flags(void) +static inline int cpu_numa_flags(void) { return SD_NUMA; } @@@ -999,7 -1000,7 +1000,7 @@@ void free_sched_domains(cpumask_var_t d bool cpus_share_cache(int this_cpu, int that_cpu);
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); -typedef const int (*sched_domain_flags_f)(void); +typedef int (*sched_domain_flags_f)(void);
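The sched.h hunks drop a const qualifier from function return types: const on a scalar return value is ignored by the language (gcc flags it under -Wignored-qualifiers, part of -Wextra), and it kept the flag callbacks from matching the plain sched_domain_flags_f typedef. A small standalone demonstration:

/* const on a scalar return type is ignored by the language; gcc warns
 * about the first form with -Wignored-qualifiers. */
static const int flags_const(void) { return 42; }
static int flags_plain(void) { return 42; }

typedef int (*flags_fn)(void);	/* the fixed typedef shape */

int main(void)
{
	flags_fn fn = flags_plain;	/* matches the typedef exactly */

	return fn() - flags_const();	/* 0 */
}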
#define SDTL_OVERLAP 0x01
@@@ -1270,6 -1271,9 +1271,6 @@@ struct task_struct #ifdef CONFIG_TREE_PREEMPT_RCU struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -#ifdef CONFIG_RCU_BOOST - struct rt_mutex *rcu_boost_mutex; -#endif /* #ifdef CONFIG_RCU_BOOST */
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; @@@ -1386,6 -1390,7 +1387,7 @@@ #ifdef CONFIG_SYSVIPC /* ipc stuff */ struct sysv_sem sysvsem; + struct sysv_shm sysvshm; #endif #ifdef CONFIG_DETECT_HUNG_TASK /* hung task detection */ @@@ -2006,6 -2011,9 +2008,6 @@@ static inline void rcu_copy_process(str #ifdef CONFIG_TREE_PREEMPT_RCU p->rcu_blocked_node = NULL; #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -#ifdef CONFIG_RCU_BOOST - p->rcu_boost_mutex = NULL; -#endif /* #ifdef CONFIG_RCU_BOOST */ INIT_LIST_HEAD(&p->rcu_node_entry); }
diff --combined include/scsi/scsi.h index 91e2e42,d34cf2d..4b69139 --- a/include/scsi/scsi.h +++ b/include/scsi/scsi.h @@@ -31,7 -31,7 +31,7 @@@ enum scsi_timeouts * Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit * is totally arbitrary, a setting of 2048 will get you at least 8mb ios. */ - #ifdef ARCH_HAS_SG_CHAIN + #ifdef CONFIG_ARCH_HAS_SG_CHAIN #define SCSI_MAX_SG_CHAIN_SEGMENTS 2048 #else #define SCSI_MAX_SG_CHAIN_SEGMENTS SCSI_MAX_SG_SEGMENTS @@@ -385,7 -385,7 +385,7 @@@ struct scsi_lun #define SCSI_W_LUN_ACCESS_CONTROL (SCSI_W_LUN_BASE + 2) #define SCSI_W_LUN_TARGET_LOG_PAGE (SCSI_W_LUN_BASE + 3)
-static inline int scsi_is_wlun(unsigned int lun) +static inline int scsi_is_wlun(u64 lun) { return (lun & 0xff00) == SCSI_W_LUN_BASE; } diff --combined init/Kconfig index 85fb985,b66f859..1dfdd81 --- a/init/Kconfig +++ b/init/Kconfig @@@ -505,7 -505,7 +505,7 @@@ config PREEMPT_RC def_bool TREE_PREEMPT_RCU help This option enables preemptible-RCU code that is common between - the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations. + TREE_PREEMPT_RCU and, in the old days, TINY_PREEMPT_RCU.
config RCU_STALL_COMMON def_bool ( TREE_RCU || TREE_PREEMPT_RCU || RCU_TRACE ) @@@ -737,7 -737,7 +737,7 @@@ choic
config RCU_NOCB_CPU_NONE bool "No build_forced no-CBs CPUs" - depends on RCU_NOCB_CPU && !NO_HZ_FULL + depends on RCU_NOCB_CPU && !NO_HZ_FULL_ALL help This option does not force any of the CPUs to be no-CBs CPUs. Only CPUs designated by the rcu_nocbs= boot parameter will be @@@ -751,7 -751,7 +751,7 @@@
config RCU_NOCB_CPU_ZERO bool "CPU 0 is a build_forced no-CBs CPU" - depends on RCU_NOCB_CPU && !NO_HZ_FULL + depends on RCU_NOCB_CPU && !NO_HZ_FULL_ALL help This option forces CPU 0 to be a no-CBs CPU, so that its RCU callbacks are invoked by a per-CPU kthread whose name begins @@@ -783,8 -783,13 +783,13 @@@ endchoic
endmenu # "RCU Subsystem"
+ config BUILD_BIN2C + bool + default n + config IKCONFIG tristate "Kernel .config support" + select BUILD_BIN2C ---help--- This option enables the complete Linux kernel ".config" file contents to be saved in the kernel. It provides documentation @@@ -807,15 -812,53 +812,53 @@@ config LOG_BUF_SHIF range 12 21 default 17 help - Select kernel log buffer size as a power of 2. + Select the minimal kernel log buffer size as a power of 2. + The final size is affected by LOG_CPU_MAX_BUF_SHIFT config + parameter, see below. Any higher size also might be forced + by "log_buf_len" boot parameter. + Examples: - 17 => 128 KB + 17 => 128 KB 16 => 64 KB - 15 => 32 KB - 14 => 16 KB + 15 => 32 KB + 14 => 16 KB 13 => 8 KB 12 => 4 KB
+ config LOG_CPU_MAX_BUF_SHIFT + int "CPU kernel log buffer size contribution (13 => 8 KB, 17 => 128KB)" + range 0 21 + default 12 if !BASE_SMALL + default 0 if BASE_SMALL + help + This option allows increasing the default ring buffer size + according to the number of CPUs. The value defines the contribution + of each CPU as a power of 2. The used space is typically only a few + lines; however, it might be much more when problems are reported, + e.g. backtraces. + + The increased size means that a new buffer has to be allocated and + the original static one is unused. It makes sense only on systems + with more CPUs. Therefore this value is used only when the sum of + contributions is greater than half of the default kernel ring + buffer as defined by LOG_BUF_SHIFT. The default values are set + so that more than 64 CPUs are needed to trigger the allocation. + + This option is also ignored when the "log_buf_len" kernel parameter + is used, as it forces an exact (power of two) size of the ring buffer. + + The number of possible CPUs is used for this computation, ignoring + hotplugging, making the computation optimal for the worst-case + scenario while allowing a simple algorithm to be used from bootup. + + Example shift values and their meaning: + 17 => 128 KB for each CPU + 16 => 64 KB for each CPU + 15 => 32 KB for each CPU + 14 => 16 KB for each CPU + 13 => 8 KB for each CPU + 12 => 4 KB for each CPU + # # Architectures with an unreliable sched_clock() should select this: # @@@ -1264,77 -1307,6 +1307,77 @@@ config CC_OPTIMIZE_FOR_SIZ
If unsure, say N.
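Returning to the LOG_CPU_MAX_BUF_SHIFT entry above: the help text describes a simple sizing rule, where each possible CPU contributes 2^shift bytes and a bigger buffer replaces the static LOG_BUF_SHIFT one only when that sum exceeds half the default size ("log_buf_len" on the command line overrides the whole computation). A sketch of the rule as the help text states it, not the exact kernel code:

#include <stdio.h>

/* Sizing rule as stated in the LOG_CPU_MAX_BUF_SHIFT help text. */
static long ring_buffer_len(long possible_cpus, int log_buf_shift,
			    int cpu_max_buf_shift)
{
	long def_len = 1L << log_buf_shift;
	long cpu_extra = possible_cpus * (1L << cpu_max_buf_shift);

	/* Allocate a bigger buffer only when CPUs contribute enough. */
	if (cpu_extra > def_len / 2)
		return def_len + cpu_extra;
	return def_len;		/* keep the static buffer */
}

int main(void)
{
	/* Kconfig defaults: LOG_BUF_SHIFT=17 (128 KB), 4 KB per CPU. */
	printf("8 CPUs:   %ld bytes\n", ring_buffer_len(8, 17, 12));
	printf("128 CPUs: %ld bytes\n", ring_buffer_len(128, 17, 12));
	return 0;
}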
+config LTO_MENU + bool "Enable gcc link time optimization (LTO)" + # Only tested on X86 for now. For other architectures you likely + # have to fix some things first, like adding asmlinkages etc. + depends on X86 + # lto does not support excluding flags for specific files + # right now. Can be removed if that is fixed. + depends on !FUNCTION_TRACER + help + With this option gcc will do whole program optimizations for + the whole kernel and all modules. This increases compile time, but can + lead to better code. It allows gcc to inline functions between + different files and do other optimizations. It might also trigger + bugs due to more aggressive optimization. It allows gcc to drop unused + code. On smaller monolithic kernel configurations + it usually leads to smaller kernels, especially when modules + are disabled. + + With this option gcc will also do some global checking over + different source files. It also disables a number of kernel + features. + + This option is recommended for release builds. With LTO + the kernel always has to be re-optimized (but not re-parsed) + on each build. + + This requires a gcc 4.8 or later compiler and + Linux binutils 2.21.51.0.3 or later. gcc 4.9 builds significantly + faster than 4.8. It does not currently work with an FSF release of + binutils or with the gold linker. + + On larger configurations this may need more than 4GB of RAM. + It will likely not work on those with a 32-bit compiler. + + When the toolchain support is not available this will (hopefully) + be automatically disabled. + + For more information see Documentation/lto-build. + +config LTO_DISABLE + bool "Disable LTO again" + depends on LTO_MENU + default n + help + This option is merely here so that allyesconfig or allmodconfig do + not enable LTO. If you actually want to use LTO, do not enable this. + +config LTO + bool + default y + depends on LTO_MENU && !LTO_DISABLE + +config LTO_DEBUG + bool "Enable LTO compile time debugging" + depends on LTO + help + Enable LTO debugging in the compiler. The compiler dumps + some log files that make it easier to figure out LTO + behavior. The log files also allow reconstructing + the global inlining and a global call graph. + However, they add some (single-threaded) cost to the + compilation. When in doubt, do not enable. + +config LTO_CP_CLONE + bool "Allow aggressive cloning for function specialization" + depends on LTO + help + Allow the compiler to clone and specialize functions for specific + arguments when it determines the functions are very commonly + called with these arguments. Experimental. This will increase text size. + config SYSCTL bool
@@@ -1834,8 -1806,6 +1877,8 @@@ config MODULE_FORCE_UNLOA
config MODVERSIONS bool "Module versioning support" + # LTO should work with gcc 4.9 + depends on !LTO help Usually, you have to use modules compiled with your kernel. Saying Y here makes it sometimes possible to use modules diff --combined kernel/Makefile index 973a40c,9b07bb7..de62ac0 --- a/kernel/Makefile +++ b/kernel/Makefile @@@ -3,11 -3,12 +3,11 @@@ #
obj-y = fork.o exec_domain.o panic.o \ - cpu.o exit.o itimer.o time.o softirq.o resource.o \ - sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ + cpu.o exit.o softirq.o resource.o \ + sysctl.o sysctl_binary.o capability.o ptrace.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ - extable.o params.o posix-timers.o \ - kthread.o sys_ni.o posix-cpu-timers.o \ - hrtimer.o nsproxy.o \ + extable.o params.o \ + kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o groups.o smpboot.o
@@@ -104,11 -105,27 +104,11 @@@ targets += config_data.g $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip)
- filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") + filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;") targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call filechk,ikconfiggz)
-$(obj)/time.o: $(obj)/timeconst.h - -quiet_cmd_hzfile = HZFILE $@ - cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@ - -targets += hz.bc -$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE - $(call if_changed,hzfile) - -quiet_cmd_bc = BC $@ - cmd_bc = bc -q $(filter-out FORCE,$^) > $@ - -targets += timeconst.h -$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE - $(call if_changed,bc) - ############################################################################### # # Roll all the X.509 certificates that we can find together and pull them into diff --combined kernel/events/uprobes.c index 6f3254e,46b7c31..1d0af8a --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@@ -167,6 -167,11 +167,11 @@@ static int __replace_page(struct vm_are /* For mmu_notifiers */ const unsigned long mmun_start = addr; const unsigned long mmun_end = addr + PAGE_SIZE; + struct mem_cgroup *memcg; + + err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg); + if (err) + return err;
/* For try_to_free_swap() and munlock_vma_page() below */ lock_page(page); @@@ -179,6 -184,8 +184,8 @@@
get_page(kpage); page_add_new_anon_rmap(kpage, vma, addr); + mem_cgroup_commit_charge(kpage, memcg, false); + lru_cache_add_active_or_unevictable(kpage, vma);
if (!PageAnon(page)) { dec_mm_counter(mm, MM_FILEPAGES); @@@ -200,6 -207,7 +207,7 @@@
err = 0; unlock: + mem_cgroup_cancel_charge(kpage, memcg); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); unlock_page(page); return err; @@@ -315,18 -323,11 +323,11 @@@ retry if (!new_page) goto put_old;
- if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) - goto put_new; - __SetPageUptodate(new_page); copy_highpage(new_page, old_page); copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
ret = __replace_page(vma, vaddr, old_page, new_page); - if (ret) - mem_cgroup_uncharge_page(new_page); - - put_new: page_cache_release(new_page); put_old: put_page(old_page); @@@ -846,7 -847,7 +847,7 @@@ static void __uprobe_unregister(struct { int err;
- if (!consumer_del(uprobe, uc)) /* WARN? */ + if (WARN_ON(!consumer_del(uprobe, uc))) return;
err = register_for_each_vma(uprobe, NULL); @@@ -927,7 -928,7 +928,7 @@@ int uprobe_apply(struct inode *inode, l int ret = -ENOENT;
uprobe = find_uprobe(inode, offset); - if (!uprobe) + if (WARN_ON(!uprobe)) return ret;
down_write(&uprobe->register_rwsem); @@@ -952,7 -953,7 +953,7 @@@ void uprobe_unregister(struct inode *in struct uprobe *uprobe;
uprobe = find_uprobe(inode, offset); - if (!uprobe) + if (WARN_ON(!uprobe)) return;
down_write(&uprobe->register_rwsem); diff --combined kernel/fork.c index 8f54193,dd8864f..41c9890 --- a/kernel/fork.c +++ b/kernel/fork.c @@@ -365,12 -365,11 +365,11 @@@ static int dup_mmap(struct mm_struct *m */ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
- mm->locked_vm = 0; - mm->mmap = NULL; - mm->vmacache_seqnum = 0; - mm->map_count = 0; - cpumask_clear(mm_cpumask(mm)); - mm->mm_rb = RB_ROOT; + mm->total_vm = oldmm->total_vm; + mm->shared_vm = oldmm->shared_vm; + mm->exec_vm = oldmm->exec_vm; + mm->stack_vm = oldmm->stack_vm; + rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; @@@ -529,17 -528,28 +528,28 @@@ static void mm_init_aio(struct mm_struc
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) { + mm->mmap = NULL; + mm->mm_rb = RB_ROOT; + mm->vmacache_seqnum = 0; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); + mm->map_count = 0; + mm->locked_vm = 0; + mm->pinned_vm = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); + mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); + mmu_notifier_mm_init(mm); clear_tlb_flush_pending(mm); + #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS + mm->pmd_huge_pte = NULL; + #endif
if (current->mm) { mm->flags = current->mm->flags & MMF_INIT_MASK; @@@ -549,11 -559,17 +559,17 @@@ mm->def_flags = 0; }
- if (likely(!mm_alloc_pgd(mm))) { - mmu_notifier_mm_init(mm); - return mm; - } + if (mm_alloc_pgd(mm)) + goto fail_nopgd; + + if (init_new_context(p, mm)) + goto fail_nocontext;
+ return mm; + + fail_nocontext: + mm_free_pgd(mm); + fail_nopgd: free_mm(mm); return NULL; } @@@ -587,7 -603,6 +603,6 @@@ struct mm_struct *mm_alloc(void return NULL;
memset(mm, 0, sizeof(*mm)); - mm_init_cpumask(mm); return mm_init(mm, current); }
@@@ -819,17 -834,10 +834,10 @@@ static struct mm_struct *dup_mm(struct goto fail_nomem;
memcpy(mm, oldmm, sizeof(*mm)); - mm_init_cpumask(mm);
if (!mm_init(mm, tsk)) goto fail_nomem;
- if (init_new_context(tsk, mm)) - goto fail_nocontext; - dup_mm_exe_file(oldmm, mm);
err = dup_mmap(mm, oldmm); @@@ -851,15 -859,6 +859,6 @@@ free_pt
fail_nomem: return NULL; - - fail_nocontext: - /* - * If init_new_context() failed, we cannot use mmput() to free the mm - * because it calls destroy_context() - */ - mm_free_pgd(mm); - free_mm(mm); - return NULL; }
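The mm_init() rework above is the canonical goto-unwind shape for initialization: each step that can fail jumps to a label that tears down exactly the steps that already succeeded, in reverse order, which is what lets dup_mm() drop its special fail_nocontext path here. A standalone sketch of the pattern (the two resources are illustrative):

#include <stdio.h>
#include <stdlib.h>

struct ctx { void *pgd; void *aux; };

static struct ctx *ctx_init(struct ctx *c)
{
	c->pgd = malloc(64);
	if (!c->pgd)
		goto fail_nopgd;

	c->aux = malloc(64);
	if (!c->aux)
		goto fail_noaux;

	return c;		/* fully initialized */

fail_noaux:			/* unwind in reverse order of setup */
	free(c->pgd);
fail_nopgd:
	return NULL;
}

int main(void)
{
	struct ctx c;

	if (!ctx_init(&c)) {
		printf("init failed\n");
		return 1;
	}
	printf("init ok\n");
	free(c.aux);
	free(c.pgd);
	return 0;
}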
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) @@@ -1262,7 -1261,7 +1261,7 @@@ static struct task_struct *copy_process
posix_cpu_timers_init(p);
- do_posix_clock_monotonic_gettime(&p->start_time); + ktime_get_ts(&p->start_time); p->real_start_time = p->start_time; monotonic_to_bootbased(&p->real_start_time); p->io_context = NULL; @@@ -1328,6 -1327,7 +1327,7 @@@ if (retval) goto bad_fork_cleanup_policy; /* copy all the process information */ + shm_init_task(p); retval = copy_semundo(clone_flags, p); if (retval) goto bad_fork_cleanup_audit; @@@ -1873,6 -1873,11 +1873,11 @@@ SYSCALL_DEFINE1(unshare, unsigned long */ exit_sem(current); } + if (unshare_flags & CLONE_NEWIPC) { + /* Orphan segments in old ns (see sem above). */ + exit_shm(current); + shm_init_task(current); + }
if (new_nsproxy) switch_task_namespaces(current, new_nsproxy); diff --combined kernel/time/posix-timers.c index 424c2d4,86535c0..86535c0 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@@ -71,7 -71,7 +71,7 @@@ static DEFINE_SPINLOCK(hash_lock) * SIGEV values. Here we put out an error if this assumption fails. */ #if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \ - ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) + ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" #endif
@@@ -252,7 -252,8 +252,8 @@@ static int posix_get_monotonic_coarse(c return 0; }
- static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) + static int posix_get_coarse_res(const clockid_t which_clock, + struct timespec *tp) { *tp = ktime_to_timespec(KTIME_LOW_RES); return 0; @@@ -333,14 -334,16 +334,16 @@@ static __init int init_posix_timers(voi posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime); posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic); posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); - posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); - posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); + posix_timers_register_clock(CLOCK_REALTIME_COARSE, + &clock_realtime_coarse); + posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, + &clock_monotonic_coarse); posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); posix_timers_register_clock(CLOCK_TAI, &clock_tai);
posix_timers_cache = kmem_cache_create("posix_timers_cache", - sizeof (struct k_itimer), 0, SLAB_PANIC, - NULL); + sizeof(struct k_itimer), 0, + SLAB_PANIC, NULL); return 0; }
@@@ -494,11 -497,11 +497,11 @@@ static enum hrtimer_restart posix_timer return ret; }
- static struct pid *good_sigevent(sigevent_t * event) + static struct pid *good_sigevent(sigevent_t *event) { struct task_struct *rtn = current->group_leader;
- if ((event->sigev_notify & SIGEV_THREAD_ID ) && + if ((event->sigev_notify & SIGEV_THREAD_ID) && (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) || !same_thread_group(rtn, current) || (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) @@@ -515,18 -518,18 +518,18 @@@ void posix_timers_register_clock(const struct k_clock *new_clock) { if ((unsigned) clock_id >= MAX_CLOCKS) { - printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n", + pr_warn("POSIX clock register failed for clock_id %d\n", clock_id); return; }
if (!new_clock->clock_get) { - printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n", + pr_warn("POSIX clock id %d lacks clock_get()\n", clock_id); return; } if (!new_clock->clock_getres) { - printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n", + pr_warn("POSIX clock id %d lacks clock_getres()\n", clock_id); return; } @@@ -535,7 -538,7 +538,7 @@@ } EXPORT_SYMBOL_GPL(posix_timers_register_clock);
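The conversions above replace printk(KERN_WARNING ...) with pr_warn(), which in the kernel is a thin wrapper that prepends the log level and a per-file pr_fmt() prefix to the format string. A userspace analog of the macro shape, with fprintf standing in for printk and an assumed "posix-timers: " prefix:

#include <stdio.h>

/* Per-file prefix; in kernel code pr_fmt() is defined before any #include. */
#define pr_fmt(fmt) "posix-timers: " fmt

/* Userspace stand-in for the kernel's pr_warn() wrapper. */
#define pr_warn(fmt, ...) \
	fprintf(stderr, "warning: " pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
	pr_warn("POSIX clock register failed for clock_id %d\n", 14);
	return 0;
}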
- static struct k_itimer * alloc_posix_timer(void) + static struct k_itimer *alloc_posix_timer(void) { struct k_itimer *tmr; tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); @@@ -622,7 -625,7 +625,7 @@@ SYSCALL_DEFINE3(timer_create, const clo new_timer->it_overrun = -1;
if (timer_event_spec) { - if (copy_from_user(&event, timer_event_spec, sizeof (event))) { + if (copy_from_user(&event, timer_event_spec, sizeof(event))) { error = -EFAULT; goto out; } @@@ -647,7 -650,7 +650,7 @@@ new_timer->sigq->info.si_code = SI_TIMER;
if (copy_to_user(created_timer_id, - &new_timer_id, sizeof (new_timer_id))) { + &new_timer_id, sizeof(new_timer_id))) { error = -EFAULT; goto out; } @@@ -748,7 -751,8 +751,8 @@@ common_timer_get(struct k_itimer *timr */ if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING || (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) - timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); + timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, + iv);
remaining = ktime_sub(hrtimer_get_expires(timer), now); /* Return 0 only, when the timer is expired and not pending */ @@@ -785,7 -789,7 +789,7 @@@ SYSCALL_DEFINE2(timer_gettime, timer_t
unlock_timer(timr, flags);
- if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) + if (!ret && copy_to_user(setting, &cur_setting, sizeof(cur_setting))) return -EFAULT;
return ret; @@@ -837,7 -841,7 +841,7 @@@ common_timer_set(struct k_itimer *timr if (hrtimer_try_to_cancel(timer) < 0) return TIMER_RETRY;
- timr->it_requeue_pending = (timr->it_requeue_pending + 2) & + timr->it_requeue_pending = (timr->it_requeue_pending + 2) & ~REQUEUE_PENDING; timr->it_overrun_last = 0;
@@@ -857,9 -861,8 +861,8 @@@ /* SIGEV_NONE timers are not queued ! See common_timer_get */ if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { /* Setup correct expiry time for relative timers */ - if (mode == HRTIMER_MODE_REL) { + if (mode == HRTIMER_MODE_REL) hrtimer_add_expires(timer, timer->base->get_time()); - } return 0; }
@@@ -882,7 -885,7 +885,7 @@@ SYSCALL_DEFINE4(timer_settime, timer_t if (!new_setting) return -EINVAL;
- if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) + if (copy_from_user(&new_spec, new_setting, sizeof(new_spec))) return -EFAULT;
if (!timespec_valid(&new_spec.it_interval) || @@@ -901,12 -904,12 +904,12 @@@ retry
unlock_timer(timr, flag); if (error == TIMER_RETRY) { - rtn = NULL; // We already got the old time... + rtn = NULL; /* We already got the old time... */ goto retry; }
if (old_setting && !error && - copy_to_user(old_setting, &old_spec, sizeof (old_spec))) + copy_to_user(old_setting, &old_spec, sizeof(old_spec))) error = -EFAULT;
return error; @@@ -1008,14 -1011,14 +1011,14 @@@ SYSCALL_DEFINE2(clock_settime, const cl if (!kc || !kc->clock_set) return -EINVAL;
- if (copy_from_user(&new_tp, tp, sizeof (*tp))) + if (copy_from_user(&new_tp, tp, sizeof(*tp))) return -EFAULT;
return kc->clock_set(which_clock, &new_tp); }
SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, - struct timespec __user *,tp) + struct timespec __user *, tp) { struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec kernel_tp; @@@ -1026,7 -1029,7 +1029,7 @@@
error = kc->clock_get(which_clock, &kernel_tp);
- if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) + if (!error && copy_to_user(tp, &kernel_tp, sizeof(kernel_tp))) error = -EFAULT;
return error; @@@ -1067,7 -1070,7 +1070,7 @@@ SYSCALL_DEFINE2(clock_getres, const clo
error = kc->clock_getres(which_clock, &rtn_tp);
- if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) + if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof(rtn_tp))) error = -EFAULT;
return error; @@@ -1096,7 -1099,7 +1099,7 @@@ SYSCALL_DEFINE4(clock_nanosleep, const if (!kc->nsleep) return -ENANOSLEEP_NOTSUP;
- if (copy_from_user(&t, rqtp, sizeof (struct timespec))) + if (copy_from_user(&t, rqtp, sizeof(struct timespec))) return -EFAULT;
if (!timespec_valid(&t)) diff --combined mm/memcontrol.c index db536e9,6c3ffb0..df394e0 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@@ -882,13 -882,6 +882,6 @@@ static long mem_cgroup_read_stat(struc return val; }
- static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, - bool charge) - { - int val = (charge) ? 1 : -1; - this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); - } - static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, enum mem_cgroup_events_index idx) { @@@ -909,13 -902,15 +902,15 @@@
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, struct page *page, - bool anon, int nr_pages) + int nr_pages) { + preempt_disable(); + /* * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is * counted as CACHE even if it's on ANON LRU. */ - if (anon) + if (PageAnon(page)) __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); else @@@ -935,6 -930,7 +930,7 @@@ }
__this_cpu_add(memcg->stat->nr_page_events, nr_pages); + preempt_enable(); }
unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) @@@ -1347,20 -1343,6 +1343,6 @@@ out return lruvec; }
- /* - * Following LRU functions are allowed to be used without PCG_LOCK. - * Operations are called by routine of global LRU independently from memcg. - * What we have to take care of here is validness of pc->mem_cgroup. - * - * Changes to pc->mem_cgroup happens when - * 1. charge - * 2. moving account - * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. - * It is added to LRU before charge. - * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. - * When moving account, the page is not on LRU. It's isolated. - */ - /** * mem_cgroup_page_lruvec - return lruvec for adding an lru page * @page: the page @@@ -2261,22 -2243,14 +2243,14 @@@ cleanup * * Notes: Race condition * - * We usually use lock_page_cgroup() for accessing page_cgroup member but - * it tends to be costly. But considering some conditions, we doesn't need - * to do so _always_. - * - * Considering "charge", lock_page_cgroup() is not required because all - * file-stat operations happen after a page is attached to radix-tree. There - * are no race with "charge". + * Charging occurs during page instantiation, while the page is + * unmapped and locked in page migration, or while the page table is + * locked in THP migration. No race is possible. * - * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup - * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even - * if there are race with "uncharge". Statistics itself is properly handled - * by flags. + * Uncharge happens to pages with zero references, no race possible. * - * Considering "move", this is an only case we see a race. To make the race - * small, we check memcg->moving_account and detect there are possibility - * of race or not. If there is, we take a lock. + * Charge moving between groups is protected by checking mm->moving + * account and taking the move_lock in the slowpath. */
void __mem_cgroup_begin_update_page_stat(struct page *page, @@@ -2551,55 -2525,63 +2525,63 @@@ static int memcg_cpu_hotplug_callback(s return NOTIFY_OK; }
- - /* See mem_cgroup_try_charge() for details */ - enum { - CHARGE_OK, /* success */ - CHARGE_RETRY, /* need to retry but retry is not bad */ - CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ - CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ - }; - - static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, - unsigned int nr_pages, unsigned int min_pages, - bool invoke_oom) + static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, + unsigned int nr_pages) { - unsigned long csize = nr_pages * PAGE_SIZE; + unsigned int batch = max(CHARGE_BATCH, nr_pages); + int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem_over_limit; struct res_counter *fail_res; + unsigned long nr_reclaimed; unsigned long flags = 0; - int ret; + unsigned long long size; + int ret = 0;
- ret = res_counter_charge(&memcg->res, csize, &fail_res); + retry: + if (consume_stock(memcg, nr_pages)) + goto done;
- if (likely(!ret)) { + size = batch * PAGE_SIZE; + if (!res_counter_charge(&memcg->res, size, &fail_res)) { if (!do_swap_account) - return CHARGE_OK; - ret = res_counter_charge(&memcg->memsw, csize, &fail_res); - if (likely(!ret)) - return CHARGE_OK; - - res_counter_uncharge(&memcg->res, csize); + goto done_restock; + if (!res_counter_charge(&memcg->memsw, size, &fail_res)) + goto done_restock; + res_counter_uncharge(&memcg->res, size); mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); flags |= MEM_CGROUP_RECLAIM_NOSWAP; } else mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); + + if (batch > nr_pages) { + batch = nr_pages; + goto retry; + } + /* - * Never reclaim on behalf of optional batching, retry with a - * single page instead. + * Unlike in global OOM situations, memcg is not in a physical + * memory shortage. Allow dying and OOM-killed tasks to + * bypass the last charges so that they can exit quickly and + * free their memory. */ - if (nr_pages > min_pages) - return CHARGE_RETRY; + if (unlikely(test_thread_flag(TIF_MEMDIE) || + fatal_signal_pending(current) || + current->flags & PF_EXITING)) + goto bypass; + + if (unlikely(task_in_memcg_oom(current))) + goto nomem;
if (!(gfp_mask & __GFP_WAIT)) - return CHARGE_WOULDBLOCK; + goto nomem;
- if (gfp_mask & __GFP_NORETRY) - return CHARGE_NOMEM; + nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
- ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); - if (mem_cgroup_margin(mem_over_limit) >= nr_pages) - return CHARGE_RETRY; + if (mem_cgroup_margin(mem_over_limit) >= batch) + goto retry; + + if (gfp_mask & __GFP_NORETRY) + goto nomem; /* * Even though the limit is exceeded at this point, reclaim * may have been able to free some pages. Retry the charge @@@ -2609,142 -2591,47 +2591,47 @@@ * unlikely to succeed so close to the limit, and we fall back * to regular pages anyway in case of failure. */ - if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) - return CHARGE_RETRY; - + if (nr_reclaimed && batch <= (1 << PAGE_ALLOC_COSTLY_ORDER)) + goto retry; /* * At task move, charge accounts can be doubly counted. So, it's * better to wait until the end of task_move if something is going on. */ if (mem_cgroup_wait_acct_move(mem_over_limit)) - return CHARGE_RETRY; - - if (invoke_oom) - mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize)); - - return CHARGE_NOMEM; - } - - /** - * mem_cgroup_try_charge - try charging a memcg - * @memcg: memcg to charge - * @nr_pages: number of pages to charge - * @oom: trigger OOM if reclaim fails - * - * Returns 0 if @memcg was charged successfully, -EINTR if the charge - * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed. - */ - static int mem_cgroup_try_charge(struct mem_cgroup *memcg, - gfp_t gfp_mask, - unsigned int nr_pages, - bool oom) - { - unsigned int batch = max(CHARGE_BATCH, nr_pages); - int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; - int ret; - - if (mem_cgroup_is_root(memcg)) - goto done; - /* - * Unlike in global OOM situations, memcg is not in a physical - * memory shortage. Allow dying and OOM-killed tasks to - * bypass the last charges so that they can exit quickly and - * free their memory. - */ - if (unlikely(test_thread_flag(TIF_MEMDIE) || - fatal_signal_pending(current) || - current->flags & PF_EXITING)) - goto bypass; + goto retry;
- if (unlikely(task_in_memcg_oom(current))) - goto nomem; + if (nr_retries--) + goto retry;
if (gfp_mask & __GFP_NOFAIL) - oom = false; - again: - if (consume_stock(memcg, nr_pages)) - goto done; - - do { - bool invoke_oom = oom && !nr_oom_retries; - - /* If killed, bypass charge */ - if (fatal_signal_pending(current)) - goto bypass; + goto bypass;
- ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, - nr_pages, invoke_oom); - switch (ret) { - case CHARGE_OK: - break; - case CHARGE_RETRY: /* not in OOM situation but retry */ - batch = nr_pages; - goto again; - case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ - goto nomem; - case CHARGE_NOMEM: /* OOM routine works */ - if (!oom || invoke_oom) - goto nomem; - nr_oom_retries--; - break; - } - } while (ret != CHARGE_OK); + if (fatal_signal_pending(current)) + goto bypass;
- if (batch > nr_pages) - refill_stock(memcg, batch - nr_pages); - done: - return 0; + mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(batch)); nomem: if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; bypass: - return -EINTR; - } - - /** - * mem_cgroup_try_charge_mm - try charging a mm - * @mm: mm_struct to charge - * @nr_pages: number of pages to charge - * @oom: trigger OOM if reclaim fails - * - * Returns the charged mem_cgroup associated with the given mm_struct or - * NULL the charge failed. - */ - static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, - gfp_t gfp_mask, - unsigned int nr_pages, - bool oom) - - { - struct mem_cgroup *memcg; - int ret; - - memcg = get_mem_cgroup_from_mm(mm); - ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom); - css_put(&memcg->css); - if (ret == -EINTR) - memcg = root_mem_cgroup; - else if (ret) - memcg = NULL; + memcg = root_mem_cgroup; + ret = -EINTR; + goto retry;
- return memcg; + done_restock: + if (batch > nr_pages) + refill_stock(memcg, batch - nr_pages); + done: + return ret; }
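The rewritten try_charge() folds the old CHARGE_* state machine into straight-line gotos: consume per-CPU stock, charge the res_counter a batch at a time, reclaim and retry, and only then OOM or bypass. The stock it consumes and refills is a simple overcharge-and-cache scheme; modeled in user space (per-CPU becomes per-thread here, the batch size is illustrative, and the real stock is additionally keyed to a single memcg at a time):

    #include <stdbool.h>

    #define CHARGE_BATCH 32UL       /* pages charged ahead of actual need */

    static __thread unsigned long stock_nr_pages;   /* per-"CPU" cache */

    /* Fast path: satisfy a charge from pre-charged stock, no counters. */
    static bool consume_stock(unsigned long nr_pages)
    {
            if (stock_nr_pages >= nr_pages) {
                    stock_nr_pages -= nr_pages;
                    return true;
            }
            return false;
    }

    /* Park the surplus of a batched charge for later fast-path hits. */
    static void refill_stock(unsigned long nr_pages)
    {
            stock_nr_pages += nr_pages;
    }

So a successful batched charge pays one res_counter transaction and leaves batch - nr_pages in the stock, letting most single-page charges skip the shared counters entirely.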
- /* - * Somemtimes we have to undo a charge we got by try_charge(). - * This function is for that and do uncharge, put css's refcnt. - * gotten by try_charge(). - */ - static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, - unsigned int nr_pages) + static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { - if (!mem_cgroup_is_root(memcg)) { - unsigned long bytes = nr_pages * PAGE_SIZE; + unsigned long bytes = nr_pages * PAGE_SIZE;
- res_counter_uncharge(&memcg->res, bytes); - if (do_swap_account) - res_counter_uncharge(&memcg->memsw, bytes); - } + res_counter_uncharge(&memcg->res, bytes); + if (do_swap_account) + res_counter_uncharge(&memcg->memsw, bytes); }
/* @@@ -2756,9 -2643,6 +2643,6 @@@ static void __mem_cgroup_cancel_local_c { unsigned long bytes = nr_pages * PAGE_SIZE;
- if (mem_cgroup_is_root(memcg)) - return; - res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); if (do_swap_account) res_counter_uncharge_until(&memcg->memsw, @@@ -2779,6 -2663,16 +2663,16 @@@ static struct mem_cgroup *mem_cgroup_lo return mem_cgroup_from_id(id); }
+ /* + * try_get_mem_cgroup_from_page - look up page's memcg association + * @page: the page + * + * Look up, get a css reference, and return the memcg that owns @page. + * + * The page must be locked to prevent racing with swap-in and page + * cache charges. If coming from an unlocked page table, the caller + * must ensure the page is on the LRU or this can race with charging. + */ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) { struct mem_cgroup *memcg = NULL; @@@ -2789,7 -2683,6 +2683,6 @@@ VM_BUG_ON_PAGE(!PageLocked(page), page);
pc = lookup_page_cgroup(page); - lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { memcg = pc->mem_cgroup; if (memcg && !css_tryget_online(&memcg->css)) @@@ -2803,23 -2696,17 +2696,17 @@@ memcg = NULL; rcu_read_unlock(); } return memcg; }
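With lock_page_cgroup() gone, the new kernel-doc above carries the whole serialization story. A caller that cannot rely on a locked page table would follow roughly this shape (sketch of a hypothetical call site, not code from this patch):

    struct mem_cgroup *memcg;

    lock_page(page);        /* serializes against swap-in and cache charges */
    memcg = try_get_mem_cgroup_from_page(page);
    unlock_page(page);
    if (memcg) {
            /* ... inspect or account against memcg ... */
            css_put(&memcg->css);   /* drop the reference the lookup took */
    }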
- static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, - struct page *page, - unsigned int nr_pages, - enum charge_type ctype, - bool lrucare) + static void commit_charge(struct page *page, struct mem_cgroup *memcg, + unsigned int nr_pages, bool lrucare) { struct page_cgroup *pc = lookup_page_cgroup(page); struct zone *uninitialized_var(zone); struct lruvec *lruvec; bool was_on_lru = false; - bool anon;
- lock_page_cgroup(pc); VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); /* * we don't need page_cgroup_lock for tail pages, because they are not @@@ -2841,16 -2728,22 +2728,22 @@@ } }
- pc->mem_cgroup = memcg; /* - * We access a page_cgroup asynchronously without lock_page_cgroup(). - * Especially when a page_cgroup is taken from a page, pc->mem_cgroup - * is accessed after testing USED bit. To make pc->mem_cgroup visible - * before USED bit, we need memory barrier here. - * See mem_cgroup_add_lru_list(), etc. + * Nobody should be changing or seriously looking at + * pc->mem_cgroup and pc->flags at this point: + * + * - the page is uncharged + * + * - the page is off-LRU + * + * - an anonymous fault has exclusive page access, except for + * a locked page table + * + * - a page cache insertion, a swapin fault, or a migration + * have the page locked */ - smp_wmb(); - SetPageCgroupUsed(pc); + pc->mem_cgroup = memcg; + pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0);
if (lrucare) { if (was_on_lru) { @@@ -2862,14 -2755,7 +2755,7 @@@ spin_unlock_irq(&zone->lru_lock); }
- if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON) - anon = true; - else - anon = false; - - mem_cgroup_charge_statistics(memcg, page, anon, nr_pages); - unlock_page_cgroup(pc); - + mem_cgroup_charge_statistics(memcg, page, nr_pages); /* * "charge_statistics" updated event counter. Then, check it. * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. @@@ -2937,22 -2823,21 +2823,21 @@@ static int memcg_charge_kmem(struct mem if (ret) return ret;
- ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT, - oom_gfp_allowed(gfp)); + ret = try_charge(memcg, gfp, size >> PAGE_SHIFT); if (ret == -EINTR) { /* - * mem_cgroup_try_charge() chosed to bypass to root due to - * OOM kill or fatal signal. Since our only options are to - * either fail the allocation or charge it to this cgroup, do - * it as a temporary condition. But we can't fail. From a - * kmem/slab perspective, the cache has already been selected, - * by mem_cgroup_kmem_get_cache(), so it is too late to change + * try_charge() chose to bypass to root due to OOM kill or + * fatal signal. Since our only options are to either fail + * the allocation or charge it to this cgroup, do it as a + * temporary condition. But we can't fail. From a kmem/slab + * perspective, the cache has already been selected, by + * mem_cgroup_kmem_get_cache(), so it is too late to change * our minds. * * This condition will only trigger if the task entered - * memcg_charge_kmem in a sane state, but was OOM-killed during - * mem_cgroup_try_charge() above. Tasks that were already - * dying when the allocation triggers should have been already + * memcg_charge_kmem in a sane state, but was OOM-killed + * during try_charge() above. Tasks that were already dying + * when the allocation triggers should have been already * directed to the root cgroup in memcontrol.h */ res_counter_charge_nofail(&memcg->res, size, &fail_res); @@@ -3076,6 -2961,8 +2961,8 @@@ int memcg_update_cache_size(struct kmem return 0; }
+ static void memcg_unregister_cache_func(struct work_struct *work); + int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, struct kmem_cache *root_cache) { @@@ -3097,6 -2984,9 +2984,9 @@@ if (memcg) { s->memcg_params->memcg = memcg; s->memcg_params->root_cache = root_cache; + atomic_long_set(&s->memcg_params->refcnt, 1); + INIT_WORK(&s->memcg_params->unregister_work, + memcg_unregister_cache_func); css_get(&memcg->css); } else s->memcg_params->is_root_cache = true; @@@ -3178,6 -3068,25 +3068,25 @@@ static void memcg_unregister_cache(stru kmem_cache_destroy(cachep); }
+ static void memcg_unregister_cache_func(struct work_struct *work) + { + struct memcg_cache_params *params = + container_of(work, struct memcg_cache_params, unregister_work); + struct kmem_cache *cachep = memcg_params_to_cache(params); + + mutex_lock(&memcg_slab_mutex); + memcg_unregister_cache(cachep); + mutex_unlock(&memcg_slab_mutex); + } + + static void memcg_unregister_cache_rcu_func(struct rcu_head *rcu) + { + struct memcg_cache_params *params = + container_of(rcu, struct memcg_cache_params, rcu_head); + + schedule_work(&params->unregister_work); + } + /* * During the creation of a new cache, we need to disable our accounting mechanism * altogether. This is true even if we are not creating, but rather just @@@ -3233,6 -3142,7 +3142,7 @@@ static void memcg_unregister_all_caches { struct kmem_cache *cachep; struct memcg_cache_params *params, *tmp; + LIST_HEAD(empty_caches);
if (!memcg_kmem_is_active(memcg)) return; @@@ -3240,9 -3150,31 +3150,31 @@@ mutex_lock(&memcg_slab_mutex); list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { cachep = memcg_params_to_cache(params); + + memcg_cache_mark_dead(cachep); kmem_cache_shrink(cachep); - if (atomic_read(&cachep->memcg_params->nr_pages) == 0) - memcg_unregister_cache(cachep); + + if (atomic_long_dec_and_test(&cachep->memcg_params->refcnt)) + list_move(&cachep->memcg_params->list, &empty_caches); + } + + /* + * kmem_cache_free doesn't expect that the cache can be destroyed as + * soon as the object is freed, e.g. SLUB's implementation may want to + * update cache stats after putting the object on the free list. + * + * Therefore we should wait for all kmem_cache_free's to finish before + * proceeding to cache destruction. Since both SLAB and SLUB versions + * of kmem_cache_free are non-preemptable, we wait for an rcu-sched + * grace period to elapse. + */ + synchronize_sched(); + + while (!list_empty(&empty_caches)) { + params = list_first_entry(&empty_caches, + struct memcg_cache_params, list); + cachep = memcg_params_to_cache(params); + memcg_unregister_cache(cachep); } mutex_unlock(&memcg_slab_mutex); } @@@ -3315,14 -3247,18 +3247,18 @@@ int __memcg_charge_slab(struct kmem_cac res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, PAGE_SIZE << order); if (!res) - atomic_add(1 << order, &cachep->memcg_params->nr_pages); + atomic_long_inc(&cachep->memcg_params->refcnt); return res; }
void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) { memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order); - atomic_sub(1 << order, &cachep->memcg_params->nr_pages); + + if (unlikely(atomic_long_dec_and_test(&cachep->memcg_params->refcnt))) + /* see memcg_unregister_all_caches */ + call_rcu_sched(&cachep->memcg_params->rcu_head, + memcg_unregister_cache_rcu_func); }
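Note the context ladder behind this refcounting: the final put can happen inside kmem_cache_free(), the rcu-sched callback runs in softirq context where memcg_slab_mutex cannot be taken, so it merely schedules the work item, and the work item finally sleeps on the mutex and destroys the cache. Stripped of the memcg specifics, the pattern looks like this (generic sketch; struct obj, subsys_mutex, and destroy() are placeholders):

    static void obj_release_work(struct work_struct *work)
    {
            struct obj *o = container_of(work, struct obj, work);

            mutex_lock(&subsys_mutex);      /* process context: may sleep */
            destroy(o);
            mutex_unlock(&subsys_mutex);
    }

    static void obj_release_rcu(struct rcu_head *head)
    {
            struct obj *o = container_of(head, struct obj, rcu);

            schedule_work(&o->work);        /* softirq: cannot sleep, defer */
    }

    static void obj_put(struct obj *o)
    {
            /* the grace period waits out non-preemptible readers/freers */
            if (atomic_long_dec_and_test(&o->refcnt))
                    call_rcu_sched(&o->rcu, obj_release_rcu);
    }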
/* @@@ -3463,12 -3399,13 +3399,13 @@@ void __memcg_kmem_commit_charge(struct memcg_uncharge_kmem(memcg, PAGE_SIZE << order); return; } - + /* + * The page is freshly allocated and not visible to any + * outside callers yet. Set up pc non-atomically. + */ pc = lookup_page_cgroup(page); - lock_page_cgroup(pc); pc->mem_cgroup = memcg; - SetPageCgroupUsed(pc); - unlock_page_cgroup(pc); + pc->flags = PCG_USED; }
void __memcg_kmem_uncharge_pages(struct page *page, int order) @@@ -3478,19 -3415,11 +3415,11 @@@
pc = lookup_page_cgroup(page); if (!PageCgroupUsed(pc)) return;
- lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - ClearPageCgroupUsed(pc); - } - unlock_page_cgroup(pc); + memcg = pc->mem_cgroup; + pc->flags = 0;
/* * We trust that only if there is a memcg associated with the page, it @@@ -3510,7 -3439,6 +3439,6 @@@ static inline void memcg_unregister_all
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) /* * Because tail pages are not marked as "used", set it. We're under * zone->lru_lock, 'splitting on pmd' and compound_lock. @@@ -3531,8 -3459,7 +3459,7 @@@ void mem_cgroup_split_huge_fixup(struc for (i = 1; i < HPAGE_PMD_NR; i++) { pc = head_pc + i; pc->mem_cgroup = memcg; - smp_wmb();/* see __commit_charge() */ - pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; + pc->flags = head_pc->flags; } __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], HPAGE_PMD_NR); @@@ -3562,7 -3489,6 +3489,6 @@@ static int mem_cgroup_move_account(stru { unsigned long flags; int ret; - bool anon = PageAnon(page);
VM_BUG_ON(from == to); VM_BUG_ON_PAGE(PageLRU(page), page); @@@ -3576,15 -3502,13 +3502,13 @@@ if (nr_pages > 1 && !PageTransHuge(page)) goto out;
- lock_page_cgroup(pc); - ret = -EINVAL; if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) - goto unlock; + goto out;
move_lock_mem_cgroup(from, &flags);
- if (!anon && page_mapped(page)) { + if (!PageAnon(page) && page_mapped(page)) { __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], nr_pages); __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], @@@ -3598,15 -3522,19 +3522,19 @@@ nr_pages); }
- mem_cgroup_charge_statistics(from, page, anon, -nr_pages); + mem_cgroup_charge_statistics(from, page, -nr_pages); + + /* + * It is safe to change pc->mem_cgroup here because the page + * is referenced, charged, and isolated - we can't race with + * uncharging, charging, migration, or LRU putback. + */
/* caller should have done css_get */ pc->mem_cgroup = to; - mem_cgroup_charge_statistics(to, page, anon, nr_pages); + mem_cgroup_charge_statistics(to, page, nr_pages); move_unlock_mem_cgroup(from, &flags); ret = 0; - unlock: - unlock_page_cgroup(pc); /* * check events */ @@@ -3682,357 -3610,6 +3610,6 @@@ out return ret; }
- int mem_cgroup_charge_anon(struct page *page, - struct mm_struct *mm, gfp_t gfp_mask) - { - unsigned int nr_pages = 1; - struct mem_cgroup *memcg; - bool oom = true; - - if (mem_cgroup_disabled()) - return 0; - - VM_BUG_ON_PAGE(page_mapped(page), page); - VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); - VM_BUG_ON(!mm); - - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - /* - * Never OOM-kill a process for a huge page. The - * fault handler will fall back to regular pages. - */ - oom = false; - } - - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom); - if (!memcg) - return -ENOMEM; - __mem_cgroup_commit_charge(memcg, page, nr_pages, - MEM_CGROUP_CHARGE_TYPE_ANON, false); - return 0; - } - - /* - * While swap-in, try_charge -> commit or cancel, the page is locked. - * And when try_charge() successfully returns, one refcnt to memcg without - * struct page_cgroup is acquired. This refcnt will be consumed by - * "commit()" or removed by "cancel()" - */ - static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, - struct page *page, - gfp_t mask, - struct mem_cgroup **memcgp) - { - struct mem_cgroup *memcg = NULL; - struct page_cgroup *pc; - int ret; - - pc = lookup_page_cgroup(page); - /* - * Every swap fault against a single page tries to charge the - * page, bail as early as possible. shmem_unuse() encounters - * already charged pages, too. The USED bit is protected by - * the page lock, which serializes swap cache removal, which - * in turn serializes uncharging. - */ - if (PageCgroupUsed(pc)) - goto out; - if (do_swap_account) - memcg = try_get_mem_cgroup_from_page(page); - if (!memcg) - memcg = get_mem_cgroup_from_mm(mm); - ret = mem_cgroup_try_charge(memcg, mask, 1, true); - css_put(&memcg->css); - if (ret == -EINTR) - memcg = root_mem_cgroup; - else if (ret) - return ret; - out: - *memcgp = memcg; - return 0; - } - - int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, - gfp_t gfp_mask, struct mem_cgroup **memcgp) - { - if (mem_cgroup_disabled()) { - *memcgp = NULL; - return 0; - } - /* - * A racing thread's fault, or swapoff, may have already - * updated the pte, and even removed page from swap cache: in - * those cases unuse_pte()'s pte_same() test will fail; but - * there's also a KSM case which does need to charge the page. - */ - if (!PageSwapCache(page)) { - struct mem_cgroup *memcg; - - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); - if (!memcg) - return -ENOMEM; - *memcgp = memcg; - return 0; - } - return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); - } - - void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) - { - if (mem_cgroup_disabled()) - return; - if (!memcg) - return; - __mem_cgroup_cancel_charge(memcg, 1); - } - - static void - __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, - enum charge_type ctype) - { - if (mem_cgroup_disabled()) - return; - if (!memcg) - return; - - __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); - /* - * Now swap is on-memory. This means this page may be - * counted both as mem and swap....double count. - * Fix it by uncharging from memsw. Basically, this SwapCache is stable - * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() - * may call delete_from_swap_cache() before reach here. 
- */ - if (do_swap_account && PageSwapCache(page)) { - swp_entry_t ent = {.val = page_private(page)}; - mem_cgroup_uncharge_swap(ent); - } - } - - void mem_cgroup_commit_charge_swapin(struct page *page, - struct mem_cgroup *memcg) - { - __mem_cgroup_commit_charge_swapin(page, memcg, - MEM_CGROUP_CHARGE_TYPE_ANON); - } - - int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask) - { - enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; - struct mem_cgroup *memcg; - int ret; - - if (mem_cgroup_disabled()) - return 0; - if (PageCompound(page)) - return 0; - - if (PageSwapCache(page)) { /* shmem */ - ret = __mem_cgroup_try_charge_swapin(mm, page, - gfp_mask, &memcg); - if (ret) - return ret; - __mem_cgroup_commit_charge_swapin(page, memcg, type); - return 0; - } - - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); - if (!memcg) - return -ENOMEM; - __mem_cgroup_commit_charge(memcg, page, 1, type, false); - return 0; - } - - static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, - unsigned int nr_pages, - const enum charge_type ctype) - { - struct memcg_batch_info *batch = NULL; - bool uncharge_memsw = true; - - /* If swapout, usage of swap doesn't decrease */ - if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) - uncharge_memsw = false; - - batch = ¤t->memcg_batch; - /* - * In usual, we do css_get() when we remember memcg pointer. - * But in this case, we keep res->usage until end of a series of - * uncharges. Then, it's ok to ignore memcg's refcnt. - */ - if (!batch->memcg) - batch->memcg = memcg; - /* - * do_batch > 0 when unmapping pages or inode invalidate/truncate. - * In those cases, all pages freed continuously can be expected to be in - * the same cgroup and we have chance to coalesce uncharges. - * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) - * because we want to do uncharge as soon as possible. - */ - - if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) - goto direct_uncharge; - - if (nr_pages > 1) - goto direct_uncharge; - - /* - * In typical case, batch->memcg == mem. This means we can - * merge a series of uncharges to an uncharge of res_counter. - * If not, we uncharge res_counter ony by one. - */ - if (batch->memcg != memcg) - goto direct_uncharge; - /* remember freed charge and uncharge it later */ - batch->nr_pages++; - if (uncharge_memsw) - batch->memsw_nr_pages++; - return; - direct_uncharge: - res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE); - if (uncharge_memsw) - res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); - if (unlikely(batch->memcg != memcg)) - memcg_oom_recover(memcg); - } - - /* - * uncharge if !page_mapped(page) - */ - static struct mem_cgroup * - __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, - bool end_migration) - { - struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = 1; - struct page_cgroup *pc; - bool anon; - - if (mem_cgroup_disabled()) - return NULL; - - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - } - /* - * Check if our page_cgroup is valid - */ - pc = lookup_page_cgroup(page); - if (unlikely(!PageCgroupUsed(pc))) - return NULL; - - lock_page_cgroup(pc); - - memcg = pc->mem_cgroup; - - if (!PageCgroupUsed(pc)) - goto unlock_out; - - anon = PageAnon(page); - - switch (ctype) { - case MEM_CGROUP_CHARGE_TYPE_ANON: - /* - * Generally PageAnon tells if it's the anon statistics to be - * updated; but sometimes e.g. 
mem_cgroup_uncharge_page() is - * used before page reached the stage of being marked PageAnon. - */ - anon = true; - /* fallthrough */ - case MEM_CGROUP_CHARGE_TYPE_DROP: - /* See mem_cgroup_prepare_migration() */ - if (page_mapped(page)) - goto unlock_out; - /* - * Pages under migration may not be uncharged. But - * end_migration() /must/ be the one uncharging the - * unused post-migration page and so it has to call - * here with the migration bit still set. See the - * res_counter handling below. - */ - if (!end_migration && PageCgroupMigration(pc)) - goto unlock_out; - break; - case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: - if (!PageAnon(page)) { /* Shared memory */ - if (page->mapping && !page_is_file_cache(page)) - goto unlock_out; - } else if (page_mapped(page)) /* Anon */ - goto unlock_out; - break; - default: - break; - } - - mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages); - - ClearPageCgroupUsed(pc); - /* - * pc->mem_cgroup is not cleared here. It will be accessed when it's - * freed from LRU. This is safe because uncharged page is expected not - * to be reused (freed soon). Exception is SwapCache, it's handled by - * special functions. - */ - - unlock_page_cgroup(pc); - /* - * even after unlock, we have memcg->res.usage here and this memcg - * will never be freed, so it's safe to call css_get(). - */ - memcg_check_events(memcg, page); - if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { - mem_cgroup_swap_statistics(memcg, true); - css_get(&memcg->css); - } - /* - * Migration does not charge the res_counter for the - * replacement page, so leave it alone when phasing out the - * page that is unused after the migration. - */ - if (!end_migration && !mem_cgroup_is_root(memcg)) - mem_cgroup_do_uncharge(memcg, nr_pages, ctype); - - return memcg; - - unlock_out: - unlock_page_cgroup(pc); - return NULL; - } - - void mem_cgroup_uncharge_page(struct page *page) - { - /* early check. */ - if (page_mapped(page)) - return; - VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); - /* - * If the page is in swap cache, uncharge should be deferred - * to the swap path, which also properly accounts swap usage - * and handles memcg lifetime. - * - * Note that this check is not stable and reclaim may add the - * page to swap cache at any time after this. However, if the - * page is not in swap cache by the time page->mapcount hits - * 0, there won't be any page table references to the swap - * slot, and reclaim will free it and not actually write the - * page to disk. - */ - if (PageSwapCache(page)) - return; - __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false); - } - - void mem_cgroup_uncharge_cache_page(struct page *page) - { - VM_BUG_ON_PAGE(page_mapped(page), page); - VM_BUG_ON_PAGE(page->mapping, page); - __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); - } - /* * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate. * In that cases, pages are freed continuously and we can expect pages @@@ -4080,58 -3657,12 +3657,12 @@@ void mem_cgroup_uncharge_end(void batch->memcg = NULL; }
- #ifdef CONFIG_SWAP - /* - * called after __delete_from_swap_cache() and drop "page" account. - * memcg information is recorded to swap_cgroup of "ent" - */ - void - mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) - { - struct mem_cgroup *memcg; - int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; - - if (!swapout) /* this was a swap cache but the swap is unused ! */ - ctype = MEM_CGROUP_CHARGE_TYPE_DROP; - - memcg = __mem_cgroup_uncharge_common(page, ctype, false); - - /* - * record memcg information, if swapout && memcg != NULL, - * css_get() was called in uncharge(). - */ - if (do_swap_account && swapout && memcg) - swap_cgroup_record(ent, mem_cgroup_id(memcg)); - } - #endif - #ifdef CONFIG_MEMCG_SWAP - /* - * called from swap_entry_free(). remove record in swap_cgroup and - * uncharge "memsw" account. - */ - void mem_cgroup_uncharge_swap(swp_entry_t ent) + static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, + bool charge) { - struct mem_cgroup *memcg; - unsigned short id; - - if (!do_swap_account) - return; - - id = swap_cgroup_record(ent, 0); - rcu_read_lock(); - memcg = mem_cgroup_lookup(id); - if (memcg) { - /* - * We uncharge this because swap is freed. This memcg can - * be obsolete one. We avoid calling css_tryget_online(). - */ - if (!mem_cgroup_is_root(memcg)) - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); - mem_cgroup_swap_statistics(memcg, false); - css_put(&memcg->css); - } - rcu_read_unlock(); + int val = (charge) ? 1 : -1; + this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); }
/** @@@ -4177,180 -3708,11 +3708,11 @@@ static int mem_cgroup_move_swap_account } #else static inline int mem_cgroup_move_swap_account(swp_entry_t entry, - struct mem_cgroup *from, struct mem_cgroup *to) - { - return -EINVAL; - } - #endif - - /* - * Before starting migration, account PAGE_SIZE to mem_cgroup that the old - * page belongs to. - */ - void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, - struct mem_cgroup **memcgp) - { - struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = 1; - struct page_cgroup *pc; - enum charge_type ctype; - - *memcgp = NULL; - - if (mem_cgroup_disabled()) - return; - - if (PageTransHuge(page)) - nr_pages <<= compound_order(page); - - pc = lookup_page_cgroup(page); - lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - css_get(&memcg->css); - /* - * At migrating an anonymous page, its mapcount goes down - * to 0 and uncharge() will be called. But, even if it's fully - * unmapped, migration may fail and this page has to be - * charged again. We set MIGRATION flag here and delay uncharge - * until end_migration() is called - * - * Corner Case Thinking - * A) - * When the old page was mapped as Anon and it's unmap-and-freed - * while migration was ongoing. - * If unmap finds the old page, uncharge() of it will be delayed - * until end_migration(). If unmap finds a new page, it's - * uncharged when it make mapcount to be 1->0. If unmap code - * finds swap_migration_entry, the new page will not be mapped - * and end_migration() will find it(mapcount==0). - * - * B) - * When the old page was mapped but migraion fails, the kernel - * remaps it. A charge for it is kept by MIGRATION flag even - * if mapcount goes down to 0. We can do remap successfully - * without charging it again. - * - * C) - * The "old" page is under lock_page() until the end of - * migration, so, the old page itself will not be swapped-out. - * If the new page is swapped out before end_migraton, our - * hook to usual swap-out path will catch the event. - */ - if (PageAnon(page)) - SetPageCgroupMigration(pc); - } - unlock_page_cgroup(pc); - /* - * If the page is not charged at this point, - * we return here. - */ - if (!memcg) - return; - - *memcgp = memcg; - /* - * We charge new page before it's used/mapped. So, even if unlock_page() - * is called before end_migration, we can catch all events on this new - * page. In the case new page is migrated but not remapped, new page's - * mapcount will be finally 0 and we call uncharge in end_migration(). - */ - if (PageAnon(page)) - ctype = MEM_CGROUP_CHARGE_TYPE_ANON; - else - ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; - /* - * The page is committed to the memcg, but it's not actually - * charged to the res_counter since we plan on replacing the - * old one and only one page is going to be left afterwards. - */ - __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false); - } - - /* remove redundant charge if migration failed*/ - void mem_cgroup_end_migration(struct mem_cgroup *memcg, - struct page *oldpage, struct page *newpage, bool migration_ok) - { - struct page *used, *unused; - struct page_cgroup *pc; - bool anon; - - if (!memcg) - return; - - if (!migration_ok) { - used = oldpage; - unused = newpage; - } else { - used = newpage; - unused = oldpage; - } - anon = PageAnon(used); - __mem_cgroup_uncharge_common(unused, - anon ? 
MEM_CGROUP_CHARGE_TYPE_ANON - : MEM_CGROUP_CHARGE_TYPE_CACHE, - true); - css_put(&memcg->css); - /* - * We disallowed uncharge of pages under migration because mapcount - * of the page goes down to zero, temporarly. - * Clear the flag and check the page should be charged. - */ - pc = lookup_page_cgroup(oldpage); - lock_page_cgroup(pc); - ClearPageCgroupMigration(pc); - unlock_page_cgroup(pc); - - /* - * If a page is a file cache, radix-tree replacement is very atomic - * and we can skip this check. When it was an Anon page, its mapcount - * goes down to 0. But because we added MIGRATION flage, it's not - * uncharged yet. There are several case but page->mapcount check - * and USED bit check in mem_cgroup_uncharge_page() will do enough - * check. (see prepare_charge() also) - */ - if (anon) - mem_cgroup_uncharge_page(used); - } - - /* - * At replace page cache, newpage is not under any memcg but it's on - * LRU. So, this function doesn't touch res_counter but handles LRU - * in correct way. Both pages are locked so we cannot race with uncharge. - */ - void mem_cgroup_replace_page_cache(struct page *oldpage, - struct page *newpage) - { - struct mem_cgroup *memcg = NULL; - struct page_cgroup *pc; - enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; - - if (mem_cgroup_disabled()) - return; - - pc = lookup_page_cgroup(oldpage); - /* fix accounting on old pages */ - lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - mem_cgroup_charge_statistics(memcg, oldpage, false, -1); - ClearPageCgroupUsed(pc); - } - unlock_page_cgroup(pc); - - /* - * When called from shmem_replace_page(), in some cases the - * oldpage has already been charged, and in some cases not. - */ - if (!memcg) - return; - /* - * Even if newpage->mapping was NULL before starting replacement, - * the newpage may be on LRU(or pagevec for LRU) already. We lock - * LRU while we overwrite pc->mem_cgroup. - */ - __mem_cgroup_commit_charge(memcg, newpage, 1, type, true); + struct mem_cgroup *from, struct mem_cgroup *to) + { + return -EINVAL; } + #endif
#ifdef CONFIG_DEBUG_VM static struct page_cgroup *lookup_page_cgroup_used(struct page *page) @@@ -4817,78 -4179,24 +4179,24 @@@ out return retval; }
- - static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx) - { - struct mem_cgroup *iter; - long val = 0; - - /* Per-cpu values can be negative, use a signed accumulator */ - for_each_mem_cgroup_tree(iter, memcg) - val += mem_cgroup_read_stat(iter, idx); - - if (val < 0) /* race ? */ - val = 0; - return val; - } - - static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) - { - u64 val; - - if (!mem_cgroup_is_root(memcg)) { - if (!swap) - return res_counter_read_u64(&memcg->res, RES_USAGE); - else - return res_counter_read_u64(&memcg->memsw, RES_USAGE); - } - - /* - * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS - * as well as in MEM_CGROUP_STAT_RSS_HUGE. - */ - val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); - - if (swap) - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); - - return val << PAGE_SHIFT; - } - static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) + struct cftype *cft) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - u64 val; - int name; - enum res_type type; - - type = MEMFILE_TYPE(cft->private); - name = MEMFILE_ATTR(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); + int name = MEMFILE_ATTR(cft->private);
switch (type) { case _MEM: - if (name == RES_USAGE) - val = mem_cgroup_usage(memcg, false); - else - val = res_counter_read_u64(&memcg->res, name); - break; + return res_counter_read_u64(&memcg->res, name); case _MEMSWAP: - if (name == RES_USAGE) - val = mem_cgroup_usage(memcg, true); - else - val = res_counter_read_u64(&memcg->memsw, name); - break; + return res_counter_read_u64(&memcg->memsw, name); case _KMEM: - val = res_counter_read_u64(&memcg->kmem, name); + return res_counter_read_u64(&memcg->kmem, name); break; default: BUG(); } - - return val; }
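The MEMFILE_TYPE()/MEMFILE_ATTR() pair unpacks cftype->private, which packs the counter type into the high bits and the RES_* attribute into the low bits. The macros are defined earlier in memcontrol.c, outside these hunks, essentially as:

    #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
    #define MEMFILE_TYPE(val)       ((val) >> 16 & 0xffff)
    #define MEMFILE_ATTR(val)       ((val) & 0xffff)

so a cftype entry such as .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE) routes straight to res_counter_read_u64(&memcg->memsw, RES_USAGE) above.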
#ifdef CONFIG_MEMCG_KMEM @@@ -5350,7 -4658,10 +4658,10 @@@ static void __mem_cgroup_threshold(stru if (!t) goto unlock;
- usage = mem_cgroup_usage(memcg, swap); + if (!swap) + usage = res_counter_read_u64(&memcg->res, RES_USAGE); + else + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
/* * current_threshold points to threshold just below or equal to usage. @@@ -5442,15 -4753,15 +4753,15 @@@ static int __mem_cgroup_usage_register_
mutex_lock(&memcg->thresholds_lock);
- if (type == _MEM) + if (type == _MEM) { thresholds = &memcg->thresholds; - else if (type == _MEMSWAP) + usage = res_counter_read_u64(&memcg->res, RES_USAGE); + } else if (type == _MEMSWAP) { thresholds = &memcg->memsw_thresholds; - else + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + } else BUG();
- usage = mem_cgroup_usage(memcg, type == _MEMSWAP); - /* Check if a threshold crossed before adding a new one */ if (thresholds->primary) __mem_cgroup_threshold(memcg, type == _MEMSWAP); @@@ -5530,18 -4841,19 +4841,19 @@@ static void __mem_cgroup_usage_unregist int i, j, size;
mutex_lock(&memcg->thresholds_lock); - if (type == _MEM) + + if (type == _MEM) { thresholds = &memcg->thresholds; - else if (type == _MEMSWAP) + usage = res_counter_read_u64(&memcg->res, RES_USAGE); + } else if (type == _MEMSWAP) { thresholds = &memcg->memsw_thresholds; - else + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + } else BUG();
if (!thresholds->primary) goto unlock;
- usage = mem_cgroup_usage(memcg, type == _MEMSWAP); - /* Check if a threshold crossed before removing */ __mem_cgroup_threshold(memcg, type == _MEMSWAP);
@@@ -6296,9 -5608,9 +5608,9 @@@ mem_cgroup_css_online(struct cgroup_sub * core guarantees its existence. */ } else { - res_counter_init(&memcg->res, NULL); - res_counter_init(&memcg->memsw, NULL); - res_counter_init(&memcg->kmem, NULL); + res_counter_init(&memcg->res, &root_mem_cgroup->res); + res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw); + res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem); /* * Deeper hierarchy with use_hierarchy == false doesn't make * much sense, so let the cgroup subsystem know about this @@@ -6407,80 -5719,40 +5719,63 @@@ static void mem_cgroup_css_free(struct __mem_cgroup_free(memcg); }
+/** + * mem_cgroup_css_reset - reset the states of a mem_cgroup + * @css: the target css + * + * Reset the states of the mem_cgroup associated with @css. This is + * invoked when the userland requests disabling on the default hierarchy + * but the memcg is pinned through dependency. The memcg should stop + * applying policies and should revert to the vanilla state as it may be + * made visible again. + * + * The current implementation only resets the essential configurations. + * This needs to be expanded to cover all the visible parts. + */ +static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + mem_cgroup_resize_limit(memcg, ULLONG_MAX); + mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX); + memcg_update_kmem_limit(memcg, ULLONG_MAX); + res_counter_set_soft_limit(&memcg->res, ULLONG_MAX); +} + #ifdef CONFIG_MMU /* Handlers for move charge at task migration. */ - #define PRECHARGE_COUNT_AT_ONCE 256 static int mem_cgroup_do_precharge(unsigned long count) { - int ret = 0; - int batch_count = PRECHARGE_COUNT_AT_ONCE; - struct mem_cgroup *memcg = mc.to; + int ret;
- if (mem_cgroup_is_root(memcg)) { + /* Try a single bulk charge without reclaim first */ + ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); + if (!ret) { mc.precharge += count; - /* we don't need css_get for root */ return ret; } - /* try to charge at once */ - if (count > 1) { - struct res_counter *dummy; - /* - * "memcg" cannot be under rmdir() because we've already checked - * by cgroup_lock_live_cgroup() that it is not removed and we - * are still under the same cgroup_mutex. So we can postpone - * css_get(). - */ - if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy)) - goto one_by_one; - if (do_swap_account && res_counter_charge(&memcg->memsw, - PAGE_SIZE * count, &dummy)) { - res_counter_uncharge(&memcg->res, PAGE_SIZE * count); - goto one_by_one; - } - mc.precharge += count; + if (ret == -EINTR) { + cancel_charge(root_mem_cgroup, count); return ret; } - one_by_one: - /* fall back to one by one charge */ + + /* Try charges one by one with reclaim */ while (count--) { - if (signal_pending(current)) { - ret = -EINTR; - break; - } - if (!batch_count--) { - batch_count = PRECHARGE_COUNT_AT_ONCE; - cond_resched(); - } - ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false); + ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); + /* + * In case of failure, any residual charges against + * mc.to will be dropped by mem_cgroup_clear_mc() + * later on. However, cancel any charges that are + * bypassed to root right away or they'll be lost. + */ + if (ret == -EINTR) + cancel_charge(root_mem_cgroup, 1); if (ret) - /* mem_cgroup_clear_mc() will do uncharge later */ return ret; mc.precharge++; + cond_resched(); } - return ret; + return 0; }
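The rewritten precharge keeps the old two-speed strategy but expresses it through try_charge()'s gfp mask: one opportunistic bulk charge with reclaim disabled, then, if that fails, single-page charges that are allowed to reclaim. As a self-contained model of the control flow (try_reserve_bulk() and try_reserve_one() are hypothetical stand-ins for the two try_charge() variants):

    #include <errno.h>
    #include <stdbool.h>

    static bool try_reserve_bulk(long count) { return count <= 8; } /* stub */
    static bool try_reserve_one(void) { return true; }              /* stub */

    static int precharge(long count)
    {
            /* cheap attempt first: all-or-nothing, no expensive work */
            if (try_reserve_bulk(count))
                    return 0;

            /* slow path: one at a time, expensive work (reclaim) allowed */
            while (count-- > 0)
                    if (!try_reserve_one())
                            return -ENOMEM; /* caller unwinds partial state */
            return 0;
    }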
/** @@@ -6616,9 -5888,9 +5911,9 @@@ static enum mc_target_type get_mctgt_ty if (page) { pc = lookup_page_cgroup(page); /* - * Do only loose check w/o page_cgroup lock. - * mem_cgroup_move_account() checks the pc is valid or not under - * the lock. + * Do only a loose check w/o serialization; + * mem_cgroup_move_account() checks whether the pc is + * valid under LRU exclusion. */ if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { ret = MC_TARGET_PAGE; @@@ -6743,7 -6015,7 +6038,7 @@@ static void __mem_cgroup_clear_mc(void
/* we must uncharge all the leftover precharges from mc.to */ if (mc.precharge) { - __mem_cgroup_cancel_charge(mc.to, mc.precharge); + cancel_charge(mc.to, mc.precharge); mc.precharge = 0; } /* @@@ -6751,27 -6023,24 +6046,24 @@@ * we must uncharge here. */ if (mc.moved_charge) { - __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); + cancel_charge(mc.from, mc.moved_charge); mc.moved_charge = 0; } /* we must fixup refcnts and charges */ if (mc.moved_swap) { /* uncharge swap account from the old cgroup */ - if (!mem_cgroup_is_root(mc.from)) - res_counter_uncharge(&mc.from->memsw, - PAGE_SIZE * mc.moved_swap); + res_counter_uncharge(&mc.from->memsw, + PAGE_SIZE * mc.moved_swap);
for (i = 0; i < mc.moved_swap; i++) css_put(&mc.from->css);
- if (!mem_cgroup_is_root(mc.to)) { - /* - * we charged both to->res and to->memsw, so we should - * uncharge to->res. - */ - res_counter_uncharge(&mc.to->res, - PAGE_SIZE * mc.moved_swap); - } + /* + * we charged both to->res and to->memsw, so we should + * uncharge to->res. + */ + res_counter_uncharge(&mc.to->res, + PAGE_SIZE * mc.moved_swap); /* we've already done css_get(mc.to) */ mc.moved_swap = 0; } @@@ -7042,7 -6311,6 +6334,7 @@@ struct cgroup_subsys memory_cgrp_subsy .css_online = mem_cgroup_css_online, .css_offline = mem_cgroup_css_offline, .css_free = mem_cgroup_css_free, + .css_reset = mem_cgroup_css_reset, .can_attach = mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, .attach = mem_cgroup_move_task, @@@ -7081,6 -6349,321 +6373,321 @@@ static void __init enable_swap_cgroup(v } #endif
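The memcontrol.c hunks that follow replace the old type-specific charge entry points (anon, file, swapin) with a single try/commit/cancel transaction. Going by the kernel-doc below, every new call site takes the same shape regardless of page type (hypothetical caller; install_page() stands in for whatever sets up page->mapping):

    struct mem_cgroup *memcg;
    int err;

    err = mem_cgroup_try_charge(page, mm, gfp_mask, &memcg);
    if (err)
            return err;                     /* reclaim failed: -ENOMEM */
    err = install_page(page);               /* hypothetical: set up mapping */
    if (err) {
            mem_cgroup_cancel_charge(page, memcg);
            return err;
    }
    mem_cgroup_commit_charge(page, memcg, false);   /* page not on LRU yet */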
+ #ifdef CONFIG_MEMCG_SWAP + /** + * mem_cgroup_swapout - transfer a memsw charge to swap + * @page: page whose memsw charge to transfer + * @entry: swap entry to move the charge to + * + * Transfer the memsw charge of @page to @entry. + */ + void mem_cgroup_swapout(struct page *page, swp_entry_t entry) + { + struct page_cgroup *pc; + unsigned short oldid; + + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(page_count(page), page); + + if (!do_swap_account) + return; + + pc = lookup_page_cgroup(page); + + /* Readahead page, never charged */ + if (!PageCgroupUsed(pc)) + return; + + VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page); + + oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup)); + VM_BUG_ON_PAGE(oldid, page); + + pc->flags &= ~PCG_MEMSW; + css_get(&pc->mem_cgroup->css); + mem_cgroup_swap_statistics(pc->mem_cgroup, true); + } + + /** + * mem_cgroup_uncharge_swap - uncharge a swap entry + * @entry: swap entry to uncharge + * + * Drop the memsw charge associated with @entry. + */ + void mem_cgroup_uncharge_swap(swp_entry_t entry) + { + struct mem_cgroup *memcg; + unsigned short id; + + if (!do_swap_account) + return; + + id = swap_cgroup_record(entry, 0); + rcu_read_lock(); + memcg = mem_cgroup_lookup(id); + if (memcg) { + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + mem_cgroup_swap_statistics(memcg, false); + css_put(&memcg->css); + } + rcu_read_unlock(); + } + #endif + + /** + * mem_cgroup_try_charge - try charging a page + * @page: page to charge + * @mm: mm context of the victim + * @gfp_mask: reclaim mode + * @memcgp: charged memcg return + * + * Try to charge @page to the memcg that @mm belongs to, reclaiming + * pages according to @gfp_mask if necessary. + * + * Returns 0 on success, with *@memcgp pointing to the charged memcg. + * Otherwise, an error code is returned. + * + * After page->mapping has been set up, the caller must finalize the + * charge with mem_cgroup_commit_charge(). Or abort the transaction + * with mem_cgroup_cancel_charge() in case page instantiation fails. + */ + int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask, struct mem_cgroup **memcgp) + { + struct mem_cgroup *memcg = NULL; + unsigned int nr_pages = 1; + int ret = 0; + + if (mem_cgroup_disabled()) + goto out; + + if (PageSwapCache(page)) { + struct page_cgroup *pc = lookup_page_cgroup(page); + /* + * Every swap fault against a single page tries to charge the + * page, bail as early as possible. shmem_unuse() encounters + * already charged pages, too. The USED bit is protected by + * the page lock, which serializes swap cache removal, which + * in turn serializes uncharging. + */ + if (PageCgroupUsed(pc)) + goto out; + } + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + } + + if (do_swap_account && PageSwapCache(page)) + memcg = try_get_mem_cgroup_from_page(page); + if (!memcg) + memcg = get_mem_cgroup_from_mm(mm); + + ret = try_charge(memcg, gfp_mask, nr_pages); + + css_put(&memcg->css); + + if (ret == -EINTR) { + memcg = root_mem_cgroup; + ret = 0; + } + out: + *memcgp = memcg; + return ret; + } + + /** + * mem_cgroup_commit_charge - commit a page charge + * @page: page to charge + * @memcg: memcg to charge the page to + * @lrucare: page might be on LRU already + * + * Finalize a charge transaction started by mem_cgroup_try_charge(), + * after page->mapping has been set up. This must happen atomically + * as part of the page instantiation, i.e. 
under the page table lock + * for anonymous pages, under the page lock for page and swap cache. + * + * In addition, the page must not be on the LRU during the commit, to + * prevent racing with task migration. If it might be, use @lrucare. + * + * Use mem_cgroup_cancel_charge() to cancel the transaction instead. + */ + void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, + bool lrucare) + { + unsigned int nr_pages = 1; + + VM_BUG_ON_PAGE(!page->mapping, page); + VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); + + if (mem_cgroup_disabled()) + return; + /* + * Swap faults will attempt to charge the same page multiple + * times. But reuse_swap_page() might have removed the page + * from swapcache already, so we can't check PageSwapCache(). + */ + if (!memcg) + return; + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + } + + commit_charge(page, memcg, nr_pages, lrucare); + + if (do_swap_account && PageSwapCache(page)) { + swp_entry_t entry = { .val = page_private(page) }; + /* + * The swap entry might not get freed for a long time, + * let's not wait for it. The page already received a + * memory+swap charge, drop the swap entry duplicate. + */ + mem_cgroup_uncharge_swap(entry); + } + } + + /** + * mem_cgroup_cancel_charge - cancel a page charge + * @page: page to charge + * @memcg: memcg to charge the page to + * + * Cancel a charge transaction started by mem_cgroup_try_charge(). + */ + void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) + { + unsigned int nr_pages = 1; + + if (mem_cgroup_disabled()) + return; + /* + * Swap faults will attempt to charge the same page multiple + * times. But reuse_swap_page() might have removed the page + * from swapcache already, so we can't check PageSwapCache(). + */ + if (!memcg) + return; + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + } + + cancel_charge(memcg, nr_pages); + } + + /** + * mem_cgroup_uncharge - uncharge a page + * @page: page to uncharge + * + * Uncharge a page previously charged with mem_cgroup_try_charge() and + * mem_cgroup_commit_charge(). + */ + void mem_cgroup_uncharge(struct page *page) + { + struct memcg_batch_info *batch; + unsigned int nr_pages = 1; + struct mem_cgroup *memcg; + struct page_cgroup *pc; + unsigned long flags; + + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(page_count(page), page); + + if (mem_cgroup_disabled()) + return; + + pc = lookup_page_cgroup(page); + + /* Every final put_page() ends up here */ + if (!PageCgroupUsed(pc)) + return; + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + } + /* + * Nobody should be changing or seriously looking at + * pc->mem_cgroup and pc->flags at this point, we have fully + * exclusive access to the page. 
+ */ + memcg = pc->mem_cgroup; + flags = pc->flags; + pc->flags = 0; + + mem_cgroup_charge_statistics(memcg, page, -nr_pages); + memcg_check_events(memcg, page); + + batch = ¤t->memcg_batch; + if (!batch->memcg) + batch->memcg = memcg; + else if (batch->memcg != memcg) + goto uncharge; + if (nr_pages > 1) + goto uncharge; + if (!batch->do_batch) + goto uncharge; + if (test_thread_flag(TIF_MEMDIE)) + goto uncharge; + if (flags & PCG_MEM) + batch->nr_pages++; + if (flags & PCG_MEMSW) + batch->memsw_nr_pages++; + return; + uncharge: + if (flags & PCG_MEM) + res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE); + if (flags & PCG_MEMSW) + res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); + if (batch->memcg != memcg) + memcg_oom_recover(memcg); + } + + /** + * mem_cgroup_migrate - migrate a charge to another page + * @oldpage: currently charged page + * @newpage: page to transfer the charge to + * @lrucare: page might be on LRU already + * + * Migrate the charge from @oldpage to @newpage. + * + * Both pages must be locked, @newpage->mapping must be set up. + */ + void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, + bool lrucare) + { + unsigned int nr_pages = 1; + struct page_cgroup *pc; + + VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); + VM_BUG_ON_PAGE(PageLRU(oldpage), oldpage); + VM_BUG_ON_PAGE(PageLRU(newpage), newpage); + VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); + + if (mem_cgroup_disabled()) + return; + + pc = lookup_page_cgroup(oldpage); + if (!PageCgroupUsed(pc)) + return; + + VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage); + VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage); + pc->flags &= ~(PCG_MEM | PCG_MEMSW); + + if (PageTransHuge(oldpage)) { + nr_pages <<= compound_order(oldpage); + VM_BUG_ON_PAGE(!PageTransHuge(oldpage), oldpage); + VM_BUG_ON_PAGE(!PageTransHuge(newpage), newpage); + } + + commit_charge(newpage, pc->mem_cgroup, nr_pages, lrucare); + } + /* * subsys_initcall() for memory controller. * diff --combined mm/slub.c index 8c24a23,9efabba..6641a8f --- a/mm/slub.c +++ b/mm/slub.c @@@ -233,11 -233,6 +233,6 @@@ static inline void stat(const struct km * Core slab cache functions *******************************************************************/
- static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) - { - return s->node[node]; - } - /* Verify that a pointer has an address that is valid within a slab page */ static inline int check_valid_pointer(struct kmem_cache *s, struct page *page, const void *object) @@@ -382,9 -377,9 +377,9 @@@ static inline bool __cmpxchg_double_sla defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, - freelist_old, counters_old, - freelist_new, counters_new)) - return 1; + freelist_old, counters_old, + freelist_new, counters_new)) + return 1; } else #endif { @@@ -418,9 -413,9 +413,9 @@@ static inline bool cmpxchg_double_slab( defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, - freelist_old, counters_old, - freelist_new, counters_new)) - return 1; + freelist_old, counters_old, + freelist_new, counters_new)) + return 1; } else #endif { @@@ -945,60 -940,6 +940,6 @@@ static void trace(struct kmem_cache *s }
/* - * Hooks for other subsystems that check memory allocations. In a typical - * production configuration these hooks all should produce no code at all. - */ - static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) - { - kmemleak_alloc(ptr, size, 1, flags); - } - - static inline void kfree_hook(const void *x) - { - kmemleak_free(x); - } - - static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) - { - flags &= gfp_allowed_mask; - lockdep_trace_alloc(flags); - might_sleep_if(flags & __GFP_WAIT); - - return should_failslab(s->object_size, flags, s->flags); - } - - static inline void slab_post_alloc_hook(struct kmem_cache *s, - gfp_t flags, void *object) - { - flags &= gfp_allowed_mask; - kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); - kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); - } - - static inline void slab_free_hook(struct kmem_cache *s, void *x) - { - kmemleak_free_recursive(x, s->flags); - - /* - * Trouble is that we may no longer disable interrupts in the fast path - * So in order to make the debug calls that expect irqs to be - * disabled we need to disable interrupts temporarily. - */ - #if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) - { - unsigned long flags; - - local_irq_save(flags); - kmemcheck_slab_free(s, x, s->object_size); - debug_check_no_locks_freed(x, s->object_size); - local_irq_restore(flags); - } - #endif - if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(x, s->object_size); - } - - /* * Tracking of fully allocated slabs for debugging purposes. */ static void add_full(struct kmem_cache *s, @@@ -1282,6 -1223,12 +1223,12 @@@ static inline void inc_slabs_node(struc static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {}
+ #endif /* CONFIG_SLUB_DEBUG */ + + /* + * Hooks for other subsystems that check memory allocations. In a typical + * production configuration these hooks all should produce no code at all. + */ static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) { kmemleak_alloc(ptr, size, 1, flags); @@@ -1293,21 -1240,44 +1240,44 @@@ static inline void kfree_hook(const voi }
static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) - { return 0; } + { + flags &= gfp_allowed_mask; + lockdep_trace_alloc(flags); + might_sleep_if(flags & __GFP_WAIT); + + return should_failslab(s->object_size, flags, s->flags); + }
- static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, - void *object) + static inline void slab_post_alloc_hook(struct kmem_cache *s, + gfp_t flags, void *object) { - kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, - flags & gfp_allowed_mask); + flags &= gfp_allowed_mask; + kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); + kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); }
static inline void slab_free_hook(struct kmem_cache *s, void *x) { kmemleak_free_recursive(x, s->flags); - }
- #endif /* CONFIG_SLUB_DEBUG */ + /* + * Trouble is that we may no longer disable interrupts in the fast path + * So in order to make the debug calls that expect irqs to be + * disabled we need to disable interrupts temporarily. + */ + #if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) + { + unsigned long flags; + + local_irq_save(flags); + kmemcheck_slab_free(s, x, s->object_size); + debug_check_no_locks_freed(x, s->object_size); + local_irq_restore(flags); + } + #endif + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(x, s->object_size); + }
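With the hooks hoisted out of CONFIG_SLUB_DEBUG, every allocation and free funnels through the same three choke points, and each helper collapses to a no-op when the checkers it wraps (failslab, kmemcheck, kmemleak, lockdep) are configured out. Where they sit, schematically (a sketch of the call order in slub.c, not a literal excerpt):

    /* allocation side, in slab_alloc_node(): */
    if (slab_pre_alloc_hook(s, gfpflags))   /* lockdep, might_sleep, failslab */
            return NULL;
    /* ... fast/slow path allocates object ... */
    slab_post_alloc_hook(s, gfpflags, object);      /* kmemcheck, kmemleak */

    /* free side, in slab_free(): */
    slab_free_hook(s, x);                   /* kmemleak, debug_check_no_*() */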
/* * Slab allocation and freeing @@@ -1433,7 -1403,7 +1403,7 @@@ static struct page *new_slab(struct kme memset(start, POISON_INUSE, PAGE_SIZE << order);
last = start; - for_each_object(p, s, start, page->objects) { + for_each_object(p, s, start + s->size, page->objects - 1) { setup_object(s, page, last); set_freepointer(s, last, p); last = p; @@@ -2064,6 -2034,14 +2034,14 @@@ static void put_cpu_partial(struct kmem
} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); + + if (memcg_cache_dead(s)) { + unsigned long flags; + + local_irq_save(flags); + unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); + local_irq_restore(flags); + } #endif }
@@@ -2162,6 -2140,7 +2140,7 @@@ slab_out_of_memory(struct kmem_cache *s static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); int node; + struct kmem_cache_node *n;
if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) return; @@@ -2176,15 -2155,11 +2155,11 @@@ pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n", s->name);
- for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); + for_each_kmem_cache_node(s, node, n) { unsigned long nr_slabs; unsigned long nr_objs; unsigned long nr_free;
- if (!n) - continue; - nr_free = count_partial(n, count_free); nr_slabs = node_nr_slabs(n); nr_objs = node_nr_objs(n); @@@ -2673,18 -2648,17 +2648,17 @@@ static __always_inline void slab_free(s
@@@ -2673,18 -2648,17 +2648,17 @@@ static __always_inline void slab_free(s
	slab_free_hook(s, x);

- redo:
	/*
- 	 * Determine the currently cpus per cpu slab.
- 	 * The cpu may change afterward. However that does not matter since
- 	 * data is retrieved via this pointer. If we are on the same cpu
- 	 * during the cmpxchg then the free will succedd.
+ 	 * We could make this function fully preemptable, but then we wouldn't
+ 	 * have a method to wait for all currently executing kfree's to finish,
+ 	 * which is necessary to avoid use-after-free on per memcg cache
+ 	 * destruction.
	 */
	preempt_disable();
+ redo:
	c = this_cpu_ptr(s->cpu_slab);

	tid = c->tid;
- 	preempt_enable();

	if (likely(page == c->page)) {
		set_freepointer(s, object, c->freelist);
@@@ -2701,6 -2675,7 +2675,7 @@@
	} else
		__slab_free(s, page, x, addr);

+ 	preempt_enable();
}
void kmem_cache_free(struct kmem_cache *s, void *x)
@@@ -2928,13 -2903,10 +2903,10 @@@ static void early_kmem_cache_node_alloc
static void free_kmem_cache_nodes(struct kmem_cache *s)
{
	int node;
+ 	struct kmem_cache_node *n;

- 	for_each_node_state(node, N_NORMAL_MEMORY) {
- 		struct kmem_cache_node *n = s->node[node];
-
- 		if (n)
- 			kmem_cache_free(kmem_cache_node, n);
-
+ 	for_each_kmem_cache_node(s, node, n) {
+ 		kmem_cache_free(kmem_cache_node, n);
		s->node[node] = NULL;
	}
}
@@@ -3199,13 -3171,12 +3171,13 @@@ static void list_slab_objects(struct km
/*
 * Attempt to free all partial slabs on a node.
 * This is called from kmem_cache_close(). We must be the last thread
- * using the cache and therefore we do not need to lock anymore.
+ * using the cache, but we still have to lock for lockdep's sake.
 */
static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
{
	struct page *page, *h;

+ 	spin_lock_irq(&n->list_lock);
	list_for_each_entry_safe(page, h, &n->partial, lru) {
		if (!page->inuse) {
			__remove_partial(n, page);
@@@ -3215,7 -3186,6 +3187,7 @@@
				"Objects remaining in %s on kmem_cache_close()");
		}
	}
+ 	spin_unlock_irq(&n->list_lock);
}

/*
@@@ -3224,12 -3194,11 +3196,11 @@@
static inline int kmem_cache_close(struct kmem_cache *s)
{
	int node;
+ 	struct kmem_cache_node *n;

	flush_all(s);
	/* Attempt to free all objects */
- 	for_each_node_state(node, N_NORMAL_MEMORY) {
- 		struct kmem_cache_node *n = get_node(s, node);
-
+ 	for_each_kmem_cache_node(s, node, n) {
		free_partial(s, n);
		if (n->nr_partial || slabs_node(s, node))
			return 1;
@@@ -3406,20 -3375,26 +3377,26 @@@ int __kmem_cache_shrink(struct kmem_cac
	struct page *page;
	struct page *t;
	int objects = oo_objects(s->max);
+ 	struct list_head empty_slabs;
	struct list_head *slabs_by_inuse =
		kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
	unsigned long flags;
- 	if (!slabs_by_inuse)
- 		return -ENOMEM;
+ 	if (memcg_cache_dead(s))
+ 		s->min_partial = 0;

- 	flush_all(s);
- 	for_each_node_state(node, N_NORMAL_MEMORY) {
- 		n = get_node(s, node);
-
- 		if (!n->nr_partial)
- 			continue;
+ 	if (!slabs_by_inuse) {
+ 		/*
+ 		 * Do not fail shrinking empty slabs if allocation of the
+ 		 * temporary array failed. Just skip the slab placement
+ 		 * optimization then.
+ 		 */
+ 		slabs_by_inuse = &empty_slabs;
+ 		objects = 1;
+ 	}

+ 	flush_all(s);
+ 	for_each_kmem_cache_node(s, node, n) {
		for (i = 0; i < objects; i++)
			INIT_LIST_HEAD(slabs_by_inuse + i);
@@@ -3432,7 -3407,9 +3409,9 @@@
		 * list_lock. page->inuse here is the upper limit.
		 */
		list_for_each_entry_safe(page, t, &n->partial, lru) {
- 			list_move(&page->lru, slabs_by_inuse + page->inuse);
+ 			if (page->inuse < objects)
+ 				list_move(&page->lru,
+ 					  slabs_by_inuse + page->inuse);
			if (!page->inuse)
				n->nr_partial--;
		}
@@@ -3451,7 -3428,8 +3430,8 @@@
			discard_slab(s, page);
	}

- 	kfree(slabs_by_inuse);
+ 	if (slabs_by_inuse != &empty_slabs)
+ 		kfree(slabs_by_inuse);
	return 0;
}
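To make the shrink changes easier to follow: the function bins partial slabs by `page->inuse`, then puts the bins back so that nearly full slabs sit toward the front of the partial list while bin 0 (completely empty slabs) becomes a candidate for `discard_slab()`. The new fallback path handles `kmalloc()` failure by keeping only a single on-stack bin (`objects = 1`), so empty slabs are still reclaimed even when the placement optimization must be skipped. A stand-alone model of the binning step, with invented data:

    #include <stdio.h>

    #define MAX_OBJECTS 4	/* objects per slab in this toy model */
    #define NR_SLABS    6

    int main(void)
    {
    	/* in-use counts of six partial "slabs" */
    	int inuse[NR_SLABS] = { 2, 0, 3, 1, 0, 2 };
    	int bins[MAX_OBJECTS][NR_SLABS];
    	int nbin[MAX_OBJECTS] = { 0 };
    	int i, b;

    	/* Bin slabs by in-use count, as the list_move() above does. */
    	for (i = 0; i < NR_SLABS; i++)
    		bins[inuse[i]][nbin[inuse[i]]++] = i;

    	/* Keep busier slabs first; bin 0 holds the empty, discardable ones. */
    	for (b = MAX_OBJECTS - 1; b >= 1; b--)
    		for (i = 0; i < nbin[b]; i++)
    			printf("keep slab %d (inuse=%d)\n", bins[b][i], b);
    	for (i = 0; i < nbin[0]; i++)
    		printf("discard slab %d (empty)\n", bins[0][i]);
    	return 0;
    }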
@@@ -3588,6 -3566,7 +3568,7 @@@ static struct kmem_cache * __init boots
{
	int node;
	struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
+ 	struct kmem_cache_node *n;

	memcpy(s, static_cache, kmem_cache->object_size);

@@@ -3597,19 -3576,16 +3578,16 @@@
	 * IPIs around.
	 */
	__flush_cpu_slab(s, smp_processor_id());
- 	for_each_node_state(node, N_NORMAL_MEMORY) {
- 		struct kmem_cache_node *n = get_node(s, node);
+ 	for_each_kmem_cache_node(s, node, n) {
		struct page *p;

- 		if (n) {
- 			list_for_each_entry(p, &n->partial, lru)
- 				p->slab_cache = s;
+ 		list_for_each_entry(p, &n->partial, lru)
+ 			p->slab_cache = s;

#ifdef CONFIG_SLUB_DEBUG
- 			list_for_each_entry(p, &n->full, lru)
- 				p->slab_cache = s;
+ 		list_for_each_entry(p, &n->full, lru)
+ 			p->slab_cache = s;
#endif
- 		}
	}
	list_add(&s->list, &slab_caches);
	return s;
@@@ -3962,16 -3938,14 +3940,14 @@@ static long validate_slab_cache(struct
	unsigned long count = 0;
	unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
				     sizeof(unsigned long), GFP_KERNEL);
+ 	struct kmem_cache_node *n;
	if (!map)
		return -ENOMEM;

	flush_all(s);
- 	for_each_node_state(node, N_NORMAL_MEMORY) {
- 		struct kmem_cache_node *n = get_node(s, node);
-
+ 	for_each_kmem_cache_node(s, node, n)
		count += validate_slab_node(s, n, map);
- 	}
	kfree(map);
	return count;
}
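Both `validate_slab_cache()` here and `list_locations()` below size their scratch bitmap the same way: one bit per object slot in the largest possible slab, rounded up to whole `unsigned long`s by `BITS_TO_LONGS()`. A stand-alone equivalent of that sizing calculation (a sketch, not the kernel's headers):

    #include <stdio.h>
    #include <stdlib.h>
    #include <limits.h>

    #define BITS_PER_LONG	(sizeof(long) * CHAR_BIT)
    #define BITS_TO_LONGS(nr)	(((nr) + BITS_PER_LONG - 1) / BITS_PER_LONG)

    int main(void)
    {
    	unsigned long nr_objects = 100;	/* hypothetical objects per slab */
    	unsigned long *map = calloc(BITS_TO_LONGS(nr_objects),
    				    sizeof(unsigned long));

    	if (!map)
    		return 1;
    	printf("%lu bits -> %zu longs\n", nr_objects,
    	       (size_t)BITS_TO_LONGS(nr_objects));
    	free(map);
    	return 0;
    }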
@@@ -4125,6 -4099,7 +4101,7 @@@ static int list_locations(struct kmem_c
	int node;
	unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
				     sizeof(unsigned long), GFP_KERNEL);
+ 	struct kmem_cache_node *n;

	if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
				     GFP_TEMPORARY)) {
@@@ -4134,8 -4109,7 +4111,7 @@@
	/* Push back cpu slabs */
	flush_all(s);

- 	for_each_node_state(node, N_NORMAL_MEMORY) {
- 		struct kmem_cache_node *n = get_node(s, node);
+ 	for_each_kmem_cache_node(s, node, n) {
		unsigned long flags;
		struct page *page;
@@@ -4207,7 -4181,7 +4183,7 @@@
#endif

#ifdef SLUB_RESILIENCY_TEST
- static void resiliency_test(void)
+ static void __init resiliency_test(void)
{
	u8 *p;
@@@ -4334,8 -4308,9 +4310,9 @@@ static ssize_t show_slab_objects(struc
	get_online_mems();
#ifdef CONFIG_SLUB_DEBUG
	if (flags & SO_ALL) {
- 		for_each_node_state(node, N_NORMAL_MEMORY) {
- 			struct kmem_cache_node *n = get_node(s, node);
+ 		struct kmem_cache_node *n;
+
+ 		for_each_kmem_cache_node(s, node, n) {

			if (flags & SO_TOTAL)
				x = atomic_long_read(&n->total_objects);
@@@ -4351,9 -4326,9 +4328,9 @@@
	} else
#endif
	if (flags & SO_PARTIAL) {
- 		for_each_node_state(node, N_NORMAL_MEMORY) {
- 			struct kmem_cache_node *n = get_node(s, node);
+ 		struct kmem_cache_node *n;

+ 		for_each_kmem_cache_node(s, node, n) {
			if (flags & SO_TOTAL)
				x = count_partial(n, count_total);
			else if (flags & SO_OBJECTS)
@@@ -4366,7 -4341,7 +4343,7 @@@
	}
	x = sprintf(buf, "%lu", total);
#ifdef CONFIG_NUMA
- 	for_each_node_state(node, N_NORMAL_MEMORY)
+ 	for (node = 0; node < nr_node_ids; node++)
		if (nodes[node])
			x += sprintf(buf + x, " N%d=%lu", node, nodes[node]);
@@@ -4380,16 -4355,12 +4357,12 @@@
static int any_slab_objects(struct kmem_cache *s)
{
	int node;
+ 	struct kmem_cache_node *n;

- 	for_each_online_node(node) {
- 		struct kmem_cache_node *n = get_node(s, node);
-
- 		if (!n)
- 			continue;
-
+ 	for_each_kmem_cache_node(s, node, n)
		if (atomic_long_read(&n->total_objects))
			return 1;
- 	}
+
	return 0;
}
#endif
@@@ -5344,13 -5315,9 +5317,9 @@@ void get_slabinfo(struct kmem_cache *s
	unsigned long nr_objs = 0;
	unsigned long nr_free = 0;
	int node;
+ 	struct kmem_cache_node *n;
- 	for_each_online_node(node) {
- 		struct kmem_cache_node *n = get_node(s, node);
-
- 		if (!n)
- 			continue;
-
+ 	for_each_kmem_cache_node(s, node, n) {
		nr_slabs += node_nr_slabs(n);
		nr_objs += node_nr_objs(n);
		nr_free += count_partial(n, count_free);
diff --combined net/xfrm/xfrm_policy.c
index 0525d78,92cb08d..beeed60
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@@ -389,7 -389,7 +389,7 @@@ redo
			if (h != h0)
				continue;
			hlist_del(&pol->bydst);
- 			hlist_add_after(entry0, &pol->bydst);
+ 			hlist_add_behind(&pol->bydst, entry0);
		}
		entry0 = &pol->bydst;
	}
@@@ -654,7 -654,7 +654,7 @@@ int xfrm_policy_insert(int dir, struct
			break;
	}
	if (newpos)
- 		hlist_add_after(newpos, &policy->bydst);
+ 		hlist_add_behind(&policy->bydst, newpos);
	else
		hlist_add_head(&policy->bydst, chain);
	xfrm_pol_hold(policy);
@@@ -2097,8 -2097,6 +2097,8 @@@ struct dst_entry *xfrm_lookup(struct ne
		goto no_transform;
	}

+ 		dst_hold(&xdst->u.dst);
+ 		xdst->u.dst.flags |= DST_NOCACHE;
		route = xdst->route;
	}
}
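The two xfrm hunks pick up the tree-wide rename of `hlist_add_after()` to `hlist_add_behind()`, which also swaps the argument order so the node being inserted comes first, matching `hlist_add_head()` and `hlist_add_before()`. A minimal user-space hlist demonstrating the new calling convention (the list implementation below is a simplified re-creation for illustration, not the kernel's <linux/list.h>):

    #include <stdio.h>
    #include <stddef.h>

    struct hlist_node { struct hlist_node *next, **pprev; };
    struct hlist_head { struct hlist_node *first; };

    static void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
    {
    	n->next = h->first;
    	if (h->first)
    		h->first->pprev = &n->next;
    	h->first = n;
    	n->pprev = &h->first;
    }

    /* New node first, anchor second: insert n directly after prev. */
    static void hlist_add_behind(struct hlist_node *n, struct hlist_node *prev)
    {
    	n->next = prev->next;
    	prev->next = n;
    	n->pprev = &prev->next;
    	if (n->next)
    		n->next->pprev = &n->next;
    }

    struct item { int val; struct hlist_node node; };

    int main(void)
    {
    	struct hlist_head head = { NULL };
    	struct item a = { 1 }, b = { 2 }, c = { 3 };
    	struct hlist_node *pos;

    	hlist_add_head(&a.node, &head);
    	hlist_add_behind(&c.node, &a.node);	/* list: 1 3 */
    	hlist_add_behind(&b.node, &a.node);	/* list: 1 2 3 */

    	for (pos = head.first; pos; pos = pos->next) {
    		struct item *it = (struct item *)((char *)pos -
    				offsetof(struct item, node));
    		printf("%d\n", it->val);
    	}
    	return 0;
    }

Putting the inserted node in the first argument makes all the hlist insertion helpers consistent, which is presumably why the rename touched every caller, as seen here.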