The following commit has been merged in the master branch: commit ca42c2c857c69f8e2263d10435b350f0f011fe21 Merge: 68829a32531de509a4c267953b758abbb577003e 7407621ec9e00da2298c4b5fff4c73de44f99e40 Author: Stephen Rothwell sfr@canb.auug.org.au Date: Wed Jul 16 17:43:22 2014 +1000
Merge branch 'akpm-current/current'
diff --combined Documentation/kernel-parameters.txt index f805ef6,80746b2..1eaa1d2 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@@ -566,11 -566,6 +566,11 @@@ bytes respectively. Such letter suffixe possible to determine what the correct size should be. This option provides an override for these situations.
+ ca_keys= [KEYS] This parameter identifies a specific key(s) on + the system trusted keyring to be used for certificate + trust validation. + format: { id:<keyid> | builtin } + ccw_timeout_log [S390] See Documentation/s390/CommonIO for details.
@@@ -1102,12 -1097,6 +1102,12 @@@ that can be changed at run time by the set_graph_function file in the debugfs tracing directory.
+ ftrace_graph_notrace=[function-list] + [FTRACE] Do not trace from the functions specified in + function-list. This list is a comma separated list of + functions that can be changed at run time by the + set_graph_notrace file in the debugfs tracing directory. + gamecon.map[2|3]= [HW,JOY] Multisystem joystick and NES/SNES/PSX pad support via parallel port (up to 5 devices per port) @@@ -1427,6 -1416,10 +1427,6 @@@ ip= [IP_PNP] See Documentation/filesystems/nfs/nfsroot.txt.
- ip2= [HW] Set IO/IRQ pairs for up to 4 IntelliPort boards - See comment before ip2_setup() in - drivers/char/ip2/ip2base.c. - irqfixup [HW] When an interrupt is not handled search all handlers for it. Intended to get systems with badly broken @@@ -1699,8 -1692,12 +1699,12 @@@ 7 (KERN_DEBUG) debug-level messages
log_buf_len=n[KMG] Sets the size of the printk ring buffer, - in bytes. n must be a power of two. The default - size is set in the kernel config file. + in bytes. n must be a power of two and greater + than the minimal size. The minimal size is defined + by LOG_BUF_SHIFT kernel config parameter. There is + also CONFIG_LOG_CPU_MAX_BUF_SHIFT config parameter + that allows to increase the default size depending on + the number of CPUs. See init/Kconfig for more details.
logo.nologo [FB] Disables display of the built-in Linux logo. This may be used to provide more screen space for @@@ -2173,21 -2170,6 +2177,21 @@@ and restore using xsave. The kernel will fallback to enabling legacy floating-point and sse state.
+ noxsaveopt [X86] Disables xsaveopt used in saving x86 extended + register states. The kernel will fall back to use + xsave to save the states. By using this parameter, + performance of saving the states is degraded because + xsave doesn't support modified optimization while + xsaveopt supports it on xsaveopt enabled systems. + + noxsaves [X86] Disables xsaves and xrstors used in saving and + restoring x86 extended register state in compacted + form of xsave area. The kernel will fall back to use + xsaveopt and xrstor to save and restore the states + in standard form of xsave area. By using this + parameter, xsave area per process might occupy more + memory on xsaves enabled systems. + eagerfpu= [X86] on enable eager fpu restore off disable eager fpu restore @@@ -2812,12 -2794,6 +2816,12 @@@ leaf rcu_node structure. Useful for very large systems.
+ rcutree.jiffies_till_sched_qs= [KNL] + Set required age in jiffies for a + given grace period before RCU starts + soliciting quiescent-state help from + rcu_note_context_switch(). + rcutree.jiffies_till_first_fqs= [KNL] Set delay from grace-period initialization to first attempt to force quiescent states. @@@ -2829,13 -2805,6 +2833,13 @@@ quiescent states. Units are jiffies, minimum value is one, and maximum value is HZ.
+ rcutree.rcu_nocb_leader_stride= [KNL] + Set the number of NOCB kthread groups, which + defaults to the square root of the number of + CPUs. Larger numbers reduces the wakeup overhead + on the per-CPU grace-period kthreads, but increases + that same overhead on each group's leader. + rcutree.qhimark= [KNL] Set threshold of queued RCU callbacks beyond which batch limiting is disabled. @@@ -3561,7 -3530,7 +3565,7 @@@ the allocated input device; If set to 0, video driver will only send out the event without touching backlight brightness level. - default: 0 + default: 1
virtio_mmio.device= [VMMIO] Memory mapped virtio (platform) device. @@@ -3726,10 -3695,6 +3730,10 @@@ Disables the ticketlock slowpath using Xen PV optimizations.
+ xen_nopv [X86] + Disables the PV optimizations forcing the HVM guest to + run as generic HVM guest with no PV drivers. + xirc2ps_cs= [NET,PCMCIA] Format: <irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]] diff --combined MAINTAINERS index 354d0e8,bedd5d0..7654231 --- a/MAINTAINERS +++ b/MAINTAINERS @@@ -70,8 -70,6 +70,8 @@@ Descriptions of section entries
P: Person (obsolete) M: Mail patches to: FullName address@domain + R: Designated reviewer: FullName address@domain + These reviewers should be CCed on patches. L: Mailing list that is relevant to this area W: Web-page with status/info Q: Patchwork web based patch tracking system site @@@ -150,13 -148,6 +150,13 @@@ L: linux-scsi@vger.kernel.or S: Maintained F: drivers/scsi/53c700*
+6LOWPAN GENERIC (BTLE/IEEE 802.15.4) +M: Alexander Aring alex.aring@gmail.com +L: linux-zigbee-devel@lists.sourceforge.net (moderated for non-subscribers) +L: linux-bluetooth@vger.kernel.org +S: Maintained +F: net/6lowpan/ + 6PACK NETWORK DRIVER FOR AX.25 M: Andreas Koensgen ajk@comnets.uni-bremen.de L: linux-hams@vger.kernel.org @@@ -165,6 -156,7 +165,6 @@@ F: drivers/net/hamradio/6pack.
8169 10/100/1000 GIGABIT ETHERNET DRIVER M: Realtek linux nic maintainers nic_swsd@realtek.com -M: Francois Romieu romieu@fr.zoreil.com L: netdev@vger.kernel.org S: Maintained F: drivers/net/ethernet/realtek/r8169.c @@@ -595,7 -587,7 +595,7 @@@ W: http://www.amd.com/us-en/Connectivit S: Supported F: drivers/char/hw_random/geode-rng.c F: drivers/crypto/geode* -F: drivers/video/geode/ +F: drivers/video/fbdev/geode/ F: arch/x86/include/asm/geode.h
AMD IOMMU (AMD-VI) @@@ -724,8 -716,8 +724,8 @@@ F: drivers/ata/pata_arasan_cf. ARC FRAMEBUFFER DRIVER M: Jaya Kumar jayalk@intworks.biz S: Maintained -F: drivers/video/arcfb.c -F: drivers/video/fb_defio.c +F: drivers/video/fbdev/arcfb.c +F: drivers/video/fbdev/core/fb_defio.c
ARM MFM AND FLOPPY DRIVERS M: Ian Molton spyro@f2s.com @@@ -764,7 -756,7 +764,7 @@@ F: sound/arm/aaci. ARM PRIMECELL CLCD PL110 DRIVER M: Russell King linux@arm.linux.org.uk S: Maintained -F: drivers/video/amba-clcd.* +F: drivers/video/fbdev/amba-clcd.*
ARM PRIMECELL KMI PL050 DRIVER M: Russell King linux@arm.linux.org.uk @@@ -1161,7 -1153,7 +1161,7 @@@ M: Daniel Walker <dwalker@fifo99.com M: Bryan Huntsman bryanh@codeaurora.org L: linux-arm-msm@vger.kernel.org F: arch/arm/mach-msm/ -F: drivers/video/msm/ +F: drivers/video/fbdev/msm/ F: drivers/mmc/host/msm_sdcc.c F: drivers/mmc/host/msm_sdcc.h F: drivers/tty/serial/msm_serial.h @@@ -1322,20 -1314,6 +1322,20 @@@ W: http://oss.renesas.co Q: http://patchwork.kernel.org/project/linux-sh/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/horms/renesas.git next S: Supported +F: arch/arm/boot/dts/emev2* +F: arch/arm/boot/dts/r7s* +F: arch/arm/boot/dts/r8a* +F: arch/arm/boot/dts/sh* +F: arch/arm/configs/ape6evm_defconfig +F: arch/arm/configs/armadillo800eva_defconfig +F: arch/arm/configs/bockw_defconfig +F: arch/arm/configs/genmai_defconfig +F: arch/arm/configs/koelsch_defconfig +F: arch/arm/configs/kzm9g_defconfig +F: arch/arm/configs/lager_defconfig +F: arch/arm/configs/mackerel_defconfig +F: arch/arm/configs/marzen_defconfig +F: arch/arm/configs/shmobile_defconfig F: arch/arm/mach-shmobile/ F: drivers/sh/
@@@ -1395,7 -1373,7 +1395,7 @@@ F: drivers/mtd/nand/nuc900_nand. F: drivers/rtc/rtc-nuc900.c F: drivers/spi/spi-nuc900.c F: drivers/usb/host/ehci-w90x900.c -F: drivers/video/nuc900fb.c +F: drivers/video/fbdev/nuc900fb.c
ARM/U300 MACHINE SUPPORT M: Linus Walleij linus.walleij@linaro.org @@@ -1464,9 -1442,9 +1464,9 @@@ F: drivers/rtc/rtc-vt8500. F: drivers/tty/serial/vt8500_serial.c F: drivers/usb/host/ehci-platform.c F: drivers/usb/host/uhci-platform.c -F: drivers/video/vt8500lcdfb.* -F: drivers/video/wm8505fb* -F: drivers/video/wmt_ge_rops.* +F: drivers/video/fbdev/vt8500lcdfb.* +F: drivers/video/fbdev/wm8505fb* +F: drivers/video/fbdev/wmt_ge_rops.*
ARM/ZIPIT Z2 SUPPORT M: Marek Vasut marek.vasut@gmail.com @@@ -1656,7 -1634,7 +1656,7 @@@ ATMEL LCDFB DRIVE M: Nicolas Ferre nicolas.ferre@atmel.com L: linux-fbdev@vger.kernel.org S: Maintained -F: drivers/video/atmel_lcdfb.c +F: drivers/video/fbdev/atmel_lcdfb.c F: include/video/atmel_lcdc.h
ATMEL MACB ETHERNET DRIVER @@@ -1803,13 -1781,6 +1803,13 @@@ W: http://bcache.evilpiepirate.or S: Maintained: F: drivers/md/bcache/
+BECEEM BCS200/BCS220-3/BCSM250 WIMAX SUPPORT +M: Kevin McKinney klmckinney1@gmail.com +M: Matthias Beyer mail@beyermatthias.de +L: devel@driverdev.osuosl.org +S: Maintained +F: drivers/staging/bcm* + BEFS FILE SYSTEM S: Orphan F: Documentation/filesystems/befs.txt @@@ -1941,8 -1912,7 +1941,8 @@@ S: Supporte F: drivers/net/ethernet/broadcom/genet/
BROADCOM BNX2 GIGABIT ETHERNET DRIVER -M: Michael Chan mchan@broadcom.com +M: Sony Chacko sony.chacko@qlogic.com +M: Dept-HSGLinuxNICDev@qlogic.com L: netdev@vger.kernel.org S: Supported F: drivers/net/ethernet/broadcom/bnx2.* @@@ -1987,7 -1957,7 +1987,7 @@@ F: arch/arm/boot/dts/bcm5301x.dts F: arch/arm/boot/dts/bcm470*
BROADCOM TG3 GIGABIT ETHERNET DRIVER -M: Nithin Nayak Sujir nsujir@broadcom.com +M: Prashant Sreedharan prashant@broadcom.com M: Michael Chan mchan@broadcom.com L: netdev@vger.kernel.org S: Supported @@@ -2004,13 -1974,13 +2004,13 @@@ S: Supporte F: drivers/net/wireless/brcm80211/
BROADCOM BNX2FC 10 GIGABIT FCOE DRIVER -M: Eddie Wai eddie.wai@broadcom.com +M: QLogic-Storage-Upstream@qlogic.com L: linux-scsi@vger.kernel.org S: Supported F: drivers/scsi/bnx2fc/
BROADCOM BNX2I 1/10 GIGABIT iSCSI DRIVER -M: Eddie Wai eddie.wai@broadcom.com +M: QLogic-Storage-Upstream@qlogic.com L: linux-scsi@vger.kernel.org S: Supported F: drivers/scsi/bnx2i/ @@@ -2674,7 -2644,7 +2674,7 @@@ M: Russell King <linux@arm.linux.org.uk L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.arm.linux.org.uk/ S: Maintained -F: drivers/video/cyber2000fb.* +F: drivers/video/fbdev/cyber2000fb.*
CYCLADES ASYNC MUX DRIVER W: http://www.cyclades.com/ @@@ -2874,7 -2844,6 +2874,7 @@@ F: drivers/staging/dgnc DIGI EPCA PCI PRODUCTS M: Lidza Louina lidza.louina@gmail.com M: Mark Hounschell markh@compro.net +M: Daeseok Youn daeseok.youn@gmail.com L: driverdev-devel@linuxdriverproject.org S: Maintained F: drivers/staging/dgap/ @@@ -2912,7 -2881,7 +2912,7 @@@ M: Bernie Thompson <bernie@plugable.com L: linux-fbdev@vger.kernel.org S: Maintained W: http://plugable.com/category/projects/udlfb/ -F: drivers/video/udlfb.c +F: drivers/video/fbdev/udlfb.c F: include/video/udlfb.h F: Documentation/fb/udlfb.txt
@@@ -2931,8 -2900,8 +2931,8 @@@ S: Maintaine L: linux-media@vger.kernel.org L: dri-devel@lists.freedesktop.org L: linaro-mm-sig@lists.linaro.org -F: drivers/base/dma-buf* -F: include/linux/dma-buf* +F: drivers/dma-buf/ +F: include/linux/dma-buf* include/linux/reservation.h include/linux/*fence.h F: Documentation/dma-buf-sharing.txt T: git git://git.linaro.org/people/sumitsemwal/linux-dma-buf.git
@@@ -3221,26 -3190,12 +3221,12 @@@ T: git git://linuxtv.org/anttip/media_t S: Maintained F: drivers/media/tuners/e4000*
- EATA-DMA SCSI DRIVER - M: Michael Neuffer mike@i-Connect.Net - L: linux-eata@i-connect.net - L: linux-scsi@vger.kernel.org - S: Maintained - F: drivers/scsi/eata* - EATA ISA/EISA/PCI SCSI DRIVER M: Dario Ballabio ballabio_dario@emc.com L: linux-scsi@vger.kernel.org S: Maintained F: drivers/scsi/eata.c
- EATA-PIO SCSI DRIVER - M: Michael Neuffer mike@i-Connect.Net - L: linux-eata@i-connect.net - L: linux-scsi@vger.kernel.org - S: Maintained - F: drivers/scsi/eata_pio.* - EC100 MEDIA DRIVER M: Antti Palosaari crope@iki.fi L: linux-media@vger.kernel.org @@@ -3368,13 -3323,6 +3354,13 @@@ W: bluesmoke.sourceforge.ne S: Maintained F: drivers/edac/i82975x_edac.c
+EDAC-IE31200 +M: Jason Baron jbaron@akamai.com +L: linux-edac@vger.kernel.org +W: bluesmoke.sourceforge.net +S: Maintained +F: drivers/edac/ie31200_edac.c + EDAC-MPC85XX M: Johannes Thumshirn johannes.thumshirn@men.de L: linux-edac@vger.kernel.org @@@ -3436,7 -3384,7 +3422,7 @@@ EFIFB FRAMEBUFFER DRIVE L: linux-fbdev@vger.kernel.org M: Peter Jones pjones@redhat.com S: Maintained -F: drivers/video/efifb.c +F: drivers/video/fbdev/efifb.c
EFS FILESYSTEM W: http://aeschi.ch.eu.org/efs/ @@@ -3501,7 -3449,7 +3487,7 @@@ EPSON S1D13XXX FRAMEBUFFER DRIVE M: Kristoffer Ericson kristoffer.ericson@gmail.com S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/kristoffer/linux-hpc.git -F: drivers/video/s1d13xxxfb.c +F: drivers/video/fbdev/s1d13xxxfb.c F: include/video/s1d13xxxfb.h
ETHERNET BRIDGE @@@ -3579,7 -3527,7 +3565,7 @@@ M: Donghwa Lee <dh09.lee@samsung.com M: Kyungmin Park kyungmin.park@samsung.com L: linux-fbdev@vger.kernel.org S: Maintained -F: drivers/video/exynos/exynos_mipi* +F: drivers/video/fbdev/exynos/exynos_mipi* F: include/video/exynos_mipi*
F71805F HARDWARE MONITORING DRIVER @@@ -3758,7 -3706,7 +3744,7 @@@ FREESCALE DIU FRAMEBUFFER DRIVE M: Timur Tabi timur@tabi.org L: linux-fbdev@vger.kernel.org S: Maintained -F: drivers/video/fsl-diu-fb.* +F: drivers/video/fbdev/fsl-diu-fb.*
FREESCALE DMA DRIVER M: Li Yang leoli@freescale.com @@@ -3780,7 -3728,7 +3766,7 @@@ L: linux-fbdev@vger.kernel.or L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: include/linux/platform_data/video-imxfb.h -F: drivers/video/imxfb.c +F: drivers/video/fbdev/imxfb.c
FREESCALE SOC FS_ENET DRIVER M: Pantelis Antoniou pantelis.antoniou@gmail.com @@@ -4199,7 -4147,7 +4185,7 @@@ M: Ferenc Bakonyi <fero@drama.obuda.kan L: linux-nvidia@lists.surfsouth.com W: http://drama.obuda.kando.hu/~fero/cgi-bin/hgafb.shtml S: Maintained -F: drivers/video/hgafb.c +F: drivers/video/fbdev/hgafb.c
HIBERNATION (aka Software Suspend, aka swsusp) M: "Rafael J. Wysocki" rjw@rjwysocki.net @@@ -4229,7 -4177,7 +4215,7 @@@ L: linux-kernel@vger.kernel.or T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core S: Maintained F: Documentation/timers/ -F: kernel/hrtimer.c +F: kernel/time/hrtimer.c F: kernel/time/clockevents.c F: kernel/time/tick*.* F: kernel/time/timer_*.c @@@ -4341,7 -4289,7 +4327,7 @@@ F: drivers/hv F: drivers/input/serio/hyperv-keyboard.c F: drivers/net/hyperv/ F: drivers/scsi/storvsc_drv.c -F: drivers/video/hyperv_fb.c +F: drivers/video/fbdev/hyperv_fb.c F: include/linux/hyperv.h F: tools/hv/
@@@ -4485,10 -4433,7 +4471,7 @@@ S: Supporte F: drivers/scsi/ibmvscsi/ibmvfc*
IBM ServeRAID RAID DRIVER - P: Jack Hammer - M: Dave Jeffery ipslinux@adaptec.com - W: http://www.developer.ibm.com/welcome/netfinity/serveraid.html - S: Supported + S: Orphan F: drivers/scsi/ips.*
ICH LPC AND GPIO DRIVER @@@ -4535,7 -4480,8 +4518,7 @@@ S: Supporte F: drivers/idle/i7300_idle.c
IEEE 802.15.4 SUBSYSTEM -M: Alexander Smirnov alex.bluesman.smirnov@gmail.com -M: Dmitry Eremin-Solenikov dbaryshkov@gmail.com +M: Alexander Aring alex.aring@gmail.com L: linux-zigbee-devel@lists.sourceforge.net (moderated for non-subscribers) W: http://apps.sourceforge.net/trac/linux-zigbee T: git git://git.kernel.org/pub/scm/linux/kernel/git/lowpan/lowpan.git @@@ -4600,7 -4546,7 +4583,7 @@@ F: security/integrity/ima IMS TWINTURBO FRAMEBUFFER DRIVER L: linux-fbdev@vger.kernel.org S: Orphan -F: drivers/video/imsttfb.c +F: drivers/video/fbdev/imsttfb.c
INFINIBAND SUBSYSTEM M: Roland Dreier roland@kernel.org @@@ -4667,13 -4613,13 +4650,13 @@@ M: Maik Broemme <mbroemme@plusserver.de L: linux-fbdev@vger.kernel.org S: Maintained F: Documentation/fb/intelfb.txt -F: drivers/video/intelfb/ +F: drivers/video/fbdev/intelfb/
INTEL 810/815 FRAMEBUFFER DRIVER M: Antonino Daplas adaplas@gmail.com L: linux-fbdev@vger.kernel.org S: Maintained -F: drivers/video/i810/ +F: drivers/video/fbdev/i810/
INTEL MENLOW THERMAL DRIVER M: Sujith Thomas sujith.thomas@intel.com @@@ -5091,6 -5037,13 +5074,6 @@@ S: Maintaine F: Documentation/hwmon/k8temp F: drivers/hwmon/k8temp.c
-KTAP -M: Jovi Zhangwei jovi.zhangwei@gmail.com -W: http://www.ktap.org -L: ktap@freelists.org -S: Maintained -F: drivers/staging/ktap/ - KCONFIG M: "Yann E. MORIN" yann.morin.1998@free.fr L: linux-kbuild@vger.kernel.org @@@ -5441,17 -5394,16 +5424,17 @@@ F: arch/powerpc/*/*/*virtex
LINUX FOR POWERPC EMBEDDED PPC8XX M: Vitaly Bordug vitb@kernel.crashing.org -M: Marcelo Tosatti marcelo@kvack.org W: http://www.penguinppc.org/ L: linuxppc-dev@lists.ozlabs.org S: Maintained F: arch/powerpc/platforms/8xx/
LINUX FOR POWERPC EMBEDDED PPC83XX AND PPC85XX +M: Scott Wood scottwood@freescale.com M: Kumar Gala galak@kernel.crashing.org W: http://www.penguinppc.org/ L: linuxppc-dev@lists.ozlabs.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/scottwood/linux.git S: Maintained F: arch/powerpc/platforms/83xx/ F: arch/powerpc/platforms/85xx/ @@@ -5674,6 -5626,16 +5657,6 @@@ F: Documentation/networking/mac80211-in F: include/net/mac80211.h F: net/mac80211/
-MAC80211 PID RATE CONTROL -M: Stefano Brivio stefano.brivio@polimi.it -M: Mattias Nissler mattias.nissler@gmx.de -L: linux-wireless@vger.kernel.org -W: http://wireless.kernel.org/en/developers/Documentation/mac80211/RateControl/... -T: git git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211.git -T: git git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211-next.git -S: Maintained -F: net/mac80211/rc80211_pid* - MACVLAN DRIVER M: Patrick McHardy kaber@trash.net L: netdev@vger.kernel.org @@@ -5737,7 -5699,7 +5720,7 @@@ F: drivers/mmc/host/mvsdio. MATROX FRAMEBUFFER DRIVER L: linux-fbdev@vger.kernel.org S: Orphan -F: drivers/video/matrox/matroxfb_* +F: drivers/video/fbdev/matrox/matroxfb_* F: include/uapi/linux/matroxfb.h
MAX16065 HARDWARE MONITOR DRIVER @@@ -6379,8 -6341,8 +6362,8 @@@ NVIDIA (rivafb and nvidiafb) FRAMEBUFFE M: Antonino Daplas adaplas@gmail.com L: linux-fbdev@vger.kernel.org S: Maintained -F: drivers/video/riva/ -F: drivers/video/nvidia/ +F: drivers/video/fbdev/riva/ +F: drivers/video/fbdev/nvidia/
NVM EXPRESS DRIVER M: Matthew Wilcox willy@linux.intel.com @@@ -6450,14 -6412,14 +6433,14 @@@ M: Tomi Valkeinen <tomi.valkeinen@ti.co L: linux-fbdev@vger.kernel.org L: linux-omap@vger.kernel.org S: Maintained -F: drivers/video/omap/ +F: drivers/video/fbdev/omap/
OMAP DISPLAY SUBSYSTEM and FRAMEBUFFER SUPPORT (DSS2) M: Tomi Valkeinen tomi.valkeinen@ti.com L: linux-omap@vger.kernel.org L: linux-fbdev@vger.kernel.org S: Maintained -F: drivers/video/omap2/ +F: drivers/video/fbdev/omap2/ F: Documentation/arm/OMAP/DSS
OMAP HARDWARE SPINLOCK SUPPORT @@@ -6748,7 -6710,7 +6731,7 @@@ F: drivers/char/agp/parisc-agp. F: drivers/input/serio/gscps2.c F: drivers/parport/parport_gsc.* F: drivers/tty/serial/8250/8250_gsc.c -F: drivers/video/sti* +F: drivers/video/fbdev/sti* F: drivers/video/console/sti* F: drivers/video/logo/logo_parisc*
@@@ -6808,7 -6770,7 +6791,7 @@@ F: arch/x86/kernel/quirks.
PCI DRIVER FOR IMX6 M: Richard Zhu r65037@freescale.com -M: Shawn Guo shawn.guo@linaro.org +M: Shawn Guo shawn.guo@freescale.com L: linux-pci@vger.kernel.org L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained @@@ -6965,12 -6927,6 +6948,12 @@@ L: linux-arm-kernel@lists.infradead.or S: Maintained F: drivers/pinctrl/pinctrl-at91.c
+PIN CONTROLLER - RENESAS +M: Laurent Pinchart laurent.pinchart@ideasonboard.com +L: linux-sh@vger.kernel.org +S: Maintained +F: drivers/pinctrl/sh-pfc/ + PIN CONTROLLER - SAMSUNG M: Tomasz Figa t.figa@samsung.com M: Thomas Abraham thomas.abraham@linaro.org @@@ -7003,7 -6959,7 +6986,7 @@@ S: Maintaine T: git git://github.com/gxt/linux.git F: drivers/input/serio/i8042-unicore32io.h F: drivers/i2c/busses/i2c-puv3.c -F: drivers/video/fb-puv3.c +F: drivers/video/fbdev/fb-puv3.c F: drivers/rtc/rtc-puv3.c
PMBUS HARDWARE MONITORING DRIVERS @@@ -7035,10 -6991,10 +7018,10 @@@ POSIX CLOCKS and TIMER M: Thomas Gleixner tglx@linutronix.de L: linux-kernel@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core -S: Supported +S: Maintained F: fs/timerfd.c F: include/linux/timer* -F: kernel/*timer* +F: kernel/time/*timer*
POWER SUPPLY CLASS/SUBSYSTEM and DRIVERS M: Dmitry Eremin-Solenikov dbaryshkov@gmail.com @@@ -7252,12 -7208,6 +7235,12 @@@ M: Robert Jarzmik <robert.jarzmik@free. L: rtc-linux@googlegroups.com S: Maintained
+QAT DRIVER +M: Tadeusz Struk tadeusz.struk@intel.com +L: qat-linux@intel.com +S: Supported +F: drivers/crypto/qat/ + QIB DRIVER M: Mike Marciniszyn infinipath@intel.com L: linux-rdma@vger.kernel.org @@@ -7381,7 -7331,7 +7364,7 @@@ RADEON FRAMEBUFFER DISPLAY DRIVE M: Benjamin Herrenschmidt benh@kernel.crashing.org L: linux-fbdev@vger.kernel.org S: Maintained -F: drivers/video/aty/radeon* +F: drivers/video/fbdev/aty/radeon* F: include/uapi/linux/radeonfb.h
RADIOSHARK RADIO DRIVER @@@ -7403,7 -7353,7 +7386,7 @@@ RAGE128 FRAMEBUFFER DISPLAY DRIVE M: Paul Mackerras paulus@samba.org L: linux-fbdev@vger.kernel.org S: Maintained -F: drivers/video/aty/aty128fb.c +F: drivers/video/fbdev/aty/aty128fb.c
RALINK RT2X00 WIRELESS LAN DRIVER P: rt2x00 project @@@ -7445,14 -7395,10 +7428,14 @@@ L: linux-kernel@vger.kernel.or S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git F: Documentation/RCU/torture.txt -F: kernel/rcu/torture.c +F: kernel/rcu/rcutorture.c
RCUTORTURE TEST FRAMEWORK M: "Paul E. McKenney" paulmck@linux.vnet.ibm.com +M: Josh Triplett josh@joshtriplett.org +R: Steven Rostedt rostedt@goodmis.org +R: Mathieu Desnoyers mathieu.desnoyers@efficios.com +R: Lai Jiangshan laijs@cn.fujitsu.com L: linux-kernel@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git @@@ -7475,11 -7421,8 +7458,11 @@@ S: Supporte F: net/rds/
READ-COPY UPDATE (RCU) -M: Dipankar Sarma dipankar@in.ibm.com M: "Paul E. McKenney" paulmck@linux.vnet.ibm.com +M: Josh Triplett josh@joshtriplett.org +R: Steven Rostedt rostedt@goodmis.org +R: Mathieu Desnoyers mathieu.desnoyers@efficios.com +R: Lai Jiangshan laijs@cn.fujitsu.com L: linux-kernel@vger.kernel.org W: http://www.rdrop.com/users/paulmck/RCU/ S: Supported @@@ -7489,7 -7432,7 +7472,7 @@@ X: Documentation/RCU/torture.tx F: include/linux/rcu* X: include/linux/srcu.h F: kernel/rcu/ -X: kernel/rcu/torture.c +X: kernel/torture.c
REAL TIME CLOCK (RTC) SUBSYSTEM M: Alessandro Zummo a.zummo@towertech.it @@@ -7644,7 -7587,7 +7627,7 @@@ S3 SAVAGE FRAMEBUFFER DRIVE M: Antonino Daplas adaplas@gmail.com L: linux-fbdev@vger.kernel.org S: Maintained -F: drivers/video/savage/ +F: drivers/video/fbdev/savage/
S390 M: Martin Schwidefsky schwidefsky@de.ibm.com @@@ -7767,7 -7710,7 +7750,7 @@@ SAMSUNG FRAMEBUFFER DRIVE M: Jingoo Han jg1.han@samsung.com L: linux-fbdev@vger.kernel.org S: Maintained -F: drivers/video/s3c-fb.c +F: drivers/video/fbdev/s3c-fb.c
SAMSUNG MULTIFUNCTION DEVICE DRIVERS M: Sangbeom Kim sbkim73@samsung.com @@@ -7844,11 -7787,6 +7827,11 @@@ S: Maintaine F: include/linux/mmc/dw_mmc.h F: drivers/mmc/host/dw_mmc*
+THUNDERBOLT DRIVER +M: Andreas Noever andreas.noever@gmail.com +S: Maintained +F: drivers/thunderbolt/ + TIMEKEEPING, CLOCKSOURCE CORE, NTP M: John Stultz john.stultz@linaro.org M: Thomas Gleixner tglx@linutronix.de @@@ -8052,16 -7990,6 +8035,16 @@@ F: drivers/ata F: include/linux/ata.h F: include/linux/libata.h
+SERIAL ATA AHCI PLATFORM devices support +M: Hans de Goede hdegoede@redhat.com +M: Tejun Heo tj@kernel.org +L: linux-ide@vger.kernel.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/libata.git +S: Supported +F: drivers/ata/ahci_platform.c +F: drivers/ata/libahci_platform.c +F: include/linux/ahci_platform.h + SERVER ENGINES 10Gbps iSCSI - BladeEngine 2 DRIVER M: Jayamohan Kallickal jayamohan.kallickal@emulex.com L: linux-scsi@vger.kernel.org @@@ -8254,7 -8182,7 +8237,7 @@@ M: Thomas Winischhofer <thomas@winischh W: http://www.winischhofer.net/linuxsisvga.shtml S: Maintained F: Documentation/fb/sisfb.txt -F: drivers/video/sis/ +F: drivers/video/fbdev/sis/ F: include/video/sisfb.h
SIS USB2VGA DRIVER @@@ -8277,9 -8205,6 +8260,9 @@@ F: mm/sl?b SLEEPABLE READ-COPY UPDATE (SRCU) M: Lai Jiangshan laijs@cn.fujitsu.com M: "Paul E. McKenney" paulmck@linux.vnet.ibm.com +M: Josh Triplett josh@joshtriplett.org +R: Steven Rostedt rostedt@goodmis.org +R: Mathieu Desnoyers mathieu.desnoyers@efficios.com L: linux-kernel@vger.kernel.org W: http://www.rdrop.com/users/paulmck/RCU/ S: Supported @@@ -8363,7 -8288,7 +8346,7 @@@ SMSC UFX6000 and UFX7000 USB to VGA DRI M: Steve Glendinning steve.glendinning@shawell.net L: linux-fbdev@vger.kernel.org S: Maintained -F: drivers/video/smscufx.c +F: drivers/video/fbdev/smscufx.c
SOC-CAMERA V4L2 SUBSYSTEM M: Guennadi Liakhovetski g.liakhovetski@gmx.de @@@ -8569,12 -8494,37 +8552,12 @@@ L: devel@driverdev.osuosl.or S: Supported F: drivers/staging/
-STAGING - AGERE HERMES II and II.5 WIRELESS DRIVERS -M: Henk de Groot pe1dnn@amsat.org -S: Odd Fixes -F: drivers/staging/wlags49_h2/ -F: drivers/staging/wlags49_h25/ - -STAGING - ASUS OLED -M: Jakub Schmidtke sjakub@gmail.com -S: Odd Fixes -F: drivers/staging/asus_oled/ - STAGING - COMEDI M: Ian Abbott abbotti@mev.co.uk M: H Hartley Sweeten hsweeten@visionengravers.com S: Odd Fixes F: drivers/staging/comedi/
-STAGING - CRYSTAL HD VIDEO DECODER -M: Naren Sankar nsankar@broadcom.com -M: Jarod Wilson jarod@wilsonet.com -M: Scott Davilla davilla@4pi.com -M: Manu Abraham abraham.manu@gmail.com -S: Odd Fixes -F: drivers/staging/crystalhd/ - -STAGING - ECHO CANCELLER -M: Steve Underwood steveu@coppice.org -M: David Rowe david@rowetel.com -S: Odd Fixes -F: drivers/staging/echo/ - STAGING - ET131X NETWORK DRIVER M: Mark Einon mark.einon@gmail.com S: Odd Fixes @@@ -8646,6 -8596,11 +8629,6 @@@ L: linux-wireless@vger.kernel.or S: Maintained F: drivers/staging/rtl8723au/
-STAGING - SILICON MOTION SM7XX FRAME BUFFER DRIVER -M: Teddy Wang teddy.wang@siliconmotion.com.cn -S: Odd Fixes -F: drivers/staging/sm7xxfb/ - STAGING - SLICOSS M: Lior Dotan liodot@gmail.com M: Christopher Harrer charrer@alacritech.com @@@ -8682,6 -8637,11 +8665,6 @@@ M: Forest Bond <forest@alittletooquiet. S: Odd Fixes F: drivers/staging/vt665?/
-STAGING - WINBOND IS89C35 WLAN USB DRIVER -M: Pavel Machek pavel@ucw.cz -S: Odd Fixes -F: drivers/staging/winbond/ - STAGING - XGI Z7,Z9,Z11 PCI DISPLAY DRIVER M: Arnaud Patard arnaud.patard@rtp-net.org S: Odd Fixes @@@ -9526,6 -9486,15 +9509,6 @@@ L: netdev@vger.kernel.or S: Maintained F: drivers/net/usb/smsc95xx.*
-USB SN9C1xx DRIVER -M: Luca Risolia luca.risolia@studio.unibo.it -L: linux-usb@vger.kernel.org -L: linux-media@vger.kernel.org -T: git git://linuxtv.org/media_tree.git -W: http://www.linux-projects.org -S: Maintained -F: drivers/staging/media/sn9c102/ - USB SUBSYSTEM M: Greg Kroah-Hartman gregkh@linuxfoundation.org L: linux-usb@vger.kernel.org @@@ -9639,7 -9608,7 +9622,7 @@@ L: linux-fbdev@vger.kernel.or W: http://dev.gentoo.org/~spock/projects/uvesafb/ S: Maintained F: Documentation/fb/uvesafb.txt -F: drivers/video/uvesafb.* +F: drivers/video/fbdev/uvesafb.*
VFAT/FAT/MSDOS FILESYSTEM M: OGAWA Hirofumi hirofumi@mail.parknet.co.jp @@@ -9712,7 -9681,7 +9695,7 @@@ S: Maintaine F: include/linux/via-core.h F: include/linux/via-gpio.h F: include/linux/via_i2c.h -F: drivers/video/via/ +F: drivers/video/fbdev/via/
VIA VELOCITY NETWORK DRIVER M: Francois Romieu romieu@fr.zoreil.com @@@ -10032,13 -10001,6 +10015,13 @@@ S: Supporte F: arch/x86/pci/*xen* F: drivers/pci/*xen*
+XEN BLOCK SUBSYSTEM +M: Konrad Rzeszutek Wilk konrad.wilk@oracle.com +L: xen-devel@lists.xenproject.org (moderated for non-subscribers) +S: Supported +F: drivers/block/xen-blkback/* +F: drivers/block/xen* + XEN SWIOTLB SUBSYSTEM M: Konrad Rzeszutek Wilk konrad.wilk@oracle.com L: xen-devel@lists.xenproject.org (moderated for non-subscribers) diff --combined Makefile index 2343264,7916d57..c66a010 --- a/Makefile +++ b/Makefile @@@ -1,7 -1,7 +1,7 @@@ VERSION = 3 PATCHLEVEL = 16 SUBLEVEL = 0 -EXTRAVERSION = -rc4 +EXTRAVERSION = -rc5 NAME = Shuffling Zombie Juror
# *DOCUMENTATION* @@@ -41,29 -41,6 +41,29 @@@ unexport GREP_OPTION # descending is started. They are now explicitly listed as the # prepare rule.
+# Beautify output +# --------------------------------------------------------------------------- +# +# Normally, we echo the whole command before executing it. By making +# that echo $($(quiet)$(cmd)), we now have the possibility to set +# $(quiet) to choose other forms of output instead, e.g. +# +# quiet_cmd_cc_o_c = Compiling $(RELDIR)/$@ +# cmd_cc_o_c = $(CC) $(c_flags) -c -o $@ $< +# +# If $(quiet) is empty, the whole command will be printed. +# If it is set to "quiet_", only the short version will be printed. +# If it is set to "silent_", nothing will be printed at all, since +# the variable $(silent_cmd_cc_o_c) doesn't exist. +# +# A simple variant is to prefix commands with $(Q) - that's useful +# for commands that shall be hidden in non-verbose mode. +# +# $(Q)ln $@ :< +# +# If KBUILD_VERBOSE equals 0 then the above command will be hidden. +# If KBUILD_VERBOSE equals 1 then the above command is displayed. +# # To put more focus on warnings, be less verbose as default # Use 'make V=1' to see the full commands
@@@ -74,29 -51,6 +74,29 @@@ ifndef KBUILD_VERBOS KBUILD_VERBOSE = 0 endif
+ifeq ($(KBUILD_VERBOSE),1) + quiet = + Q = +else + quiet=quiet_ + Q = @ +endif + +# If the user is running make -s (silent mode), suppress echoing of +# commands + +ifneq ($(filter 4.%,$(MAKE_VERSION)),) # make-4 +ifneq ($(filter %s ,$(firstword x$(MAKEFLAGS))),) + quiet=silent_ +endif +else # make-3.8x +ifneq ($(filter s% -s%,$(MAKEFLAGS)),) + quiet=silent_ +endif +endif + +export quiet Q KBUILD_VERBOSE + # Call a source code checker (by default, "sparse") as part of the # C compilation. # @@@ -174,11 -128,8 +174,11 @@@ $(filter-out _all sub-make $(CURDIR)/Ma
# Fake the "Entering directory" message once, so that IDEs/editors are # able to understand relative filenames. + echodir := @echo + quiet_echodir := @echo +silent_echodir := @: sub-make: FORCE - @echo "make[1]: Entering directory `$(KBUILD_OUTPUT)'" + $($(quiet)echodir) "make[1]: Entering directory `$(KBUILD_OUTPUT)'" $(if $(KBUILD_VERBOSE:1=),@)$(MAKE) -C $(KBUILD_OUTPUT) \ KBUILD_SRC=$(CURDIR) \ KBUILD_EXTMOD="$(KBUILD_EXTMOD)" -f $(CURDIR)/Makefile \ @@@ -341,6 -292,52 +341,6 @@@ endi export KBUILD_MODULES KBUILD_BUILTIN export KBUILD_CHECKSRC KBUILD_SRC KBUILD_EXTMOD
-# Beautify output -# --------------------------------------------------------------------------- -# -# Normally, we echo the whole command before executing it. By making -# that echo $($(quiet)$(cmd)), we now have the possibility to set -# $(quiet) to choose other forms of output instead, e.g. -# -# quiet_cmd_cc_o_c = Compiling $(RELDIR)/$@ -# cmd_cc_o_c = $(CC) $(c_flags) -c -o $@ $< -# -# If $(quiet) is empty, the whole command will be printed. -# If it is set to "quiet_", only the short version will be printed. -# If it is set to "silent_", nothing will be printed at all, since -# the variable $(silent_cmd_cc_o_c) doesn't exist. -# -# A simple variant is to prefix commands with $(Q) - that's useful -# for commands that shall be hidden in non-verbose mode. -# -# $(Q)ln $@ :< -# -# If KBUILD_VERBOSE equals 0 then the above command will be hidden. -# If KBUILD_VERBOSE equals 1 then the above command is displayed. - -ifeq ($(KBUILD_VERBOSE),1) - quiet = - Q = -else - quiet=quiet_ - Q = @ -endif - -# If the user is running make -s (silent mode), suppress echoing of -# commands - -ifneq ($(filter 4.%,$(MAKE_VERSION)),) # make-4 -ifneq ($(filter %s ,$(firstword x$(MAKEFLAGS))),) - quiet=silent_ -endif -else # make-3.8x -ifneq ($(filter s% -s%,$(MAKEFLAGS)),) - quiet=silent_ -endif -endif - -export quiet Q KBUILD_VERBOSE - ifneq ($(CC),) ifeq ($(shell $(CC) -v 2>&1 | grep -c "clang version"), 1) COMPILER := clang @@@ -360,14 -357,9 +360,14 @@@ include $(srctree)/scripts/Kbuild.inclu # Make variables (CC, etc...) AS = $(CROSS_COMPILE)as LD = $(CROSS_COMPILE)ld +LDFINAL = $(LD) CC = $(CROSS_COMPILE)gcc CPP = $(CC) -E +ifdef CONFIG_LTO +AR = $(CROSS_COMPILE)gcc-ar +else AR = $(CROSS_COMPILE)ar +endif NM = $(CROSS_COMPILE)nm STRIP = $(CROSS_COMPILE)strip OBJCOPY = $(CROSS_COMPILE)objcopy @@@ -426,7 -418,7 +426,7 @@@ KERNELVERSION = $(VERSION)$(if $(PATCHL
export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION export ARCH SRCARCH CONFIG_SHELL HOSTCC HOSTCFLAGS CROSS_COMPILE AS LD CC -export CPP AR NM STRIP OBJCOPY OBJDUMP +export CPP AR NM STRIP OBJCOPY OBJDUMP LDFINAL export MAKE AWK GENKSYMS INSTALLKERNEL PERL UTS_MACHINE export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS
@@@ -437,17 -429,6 +437,17 @@@ export KBUILD_AFLAGS_MODULE KBUILD_CFLA export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL export KBUILD_ARFLAGS
+ifdef CONFIG_LTO +# LTO gcc creates a lot of files in TMPDIR, and with /tmp as tmpfs +# it's easy to drive the machine OOM. Use the object directory +# instead. +ifndef TMPDIR +TMPDIR ?= $(objtree) +export TMPDIR +$(info setting TMPDIR=$(objtree) for LTO build) +endif +endif + # When compiling out-of-tree modules, put MODVERDIR in the module # tree rather than in the kernel tree. The kernel tree might # even be read-only. @@@ -637,6 -618,9 +637,9 @@@ els KBUILD_CFLAGS += -O2 endif
+ # Tell gcc to never replace conditional load with a non-conditional one + KBUILD_CFLAGS += $(call cc-option,--param=allow-store-data-races=0) + ifdef CONFIG_READABLE_ASM # Disable optimizations that make assembler listings hard to read. # reorder blocks reorders the control in the function @@@ -652,6 -636,22 +655,22 @@@ KBUILD_CFLAGS += $(call cc-option,-Wfra endif
# Handle stack protector mode. + # + # Since kbuild can potentially perform two passes (first with the old + # .config values and then with updated .config values), we cannot error out + # if a desired compiler option is unsupported. If we were to error, kbuild + # could never get to the second pass and actually notice that we changed + # the option to something that was supported. + # + # Additionally, we don't want to fallback and/or silently change which compiler + # flags will be used, since that leads to producing kernels with different + # security feature characteristics depending on the compiler used. ("But I + # selected CC_STACKPROTECTOR_STRONG! Why did it build with _REGULAR?!") + # + # The middle ground is to warn here so that the failed option is obvious, but + # to let the build fail with bad compiler flags so that we can't produce a + # kernel when there is a CONFIG and compiler mismatch. + # ifdef CONFIG_CC_STACKPROTECTOR_REGULAR stackp-flag := -fstack-protector ifeq ($(call cc-option, $(stackp-flag)),) @@@ -767,7 -767,6 +786,7 @@@ ifeq ($(shell $(CONFIG_SHELL) $(srctree endif
include $(srctree)/scripts/Makefile.extrawarn +include ${srctree}/scripts/Makefile.lto
# Add user supplied CPPFLAGS, AFLAGS and CFLAGS as the last assignments KBUILD_CPPFLAGS += $(KCPPFLAGS) @@@ -1193,7 -1192,7 +1212,7 @@@ distclean: mrprope # Packaging of the kernel to various formats # --------------------------------------------------------------------------- # rpm target kept for backward compatibility -package-dir := $(srctree)/scripts/package +package-dir := scripts/package
%src-pkg: FORCE $(Q)$(MAKE) $(build)=$(package-dir) $@ diff --combined arch/arm/Kconfig index 8d05666,6fe446b..8290325 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@@ -83,6 -83,7 +83,7 @@@ config AR http://www.arm.linux.org.uk/.
config ARM_HAS_SG_CHAIN + select ARCH_HAS_SG_CHAIN bool
config NEED_SG_DMA_LENGTH @@@ -320,6 -321,7 +321,6 @@@ config ARCH_INTEGRATO select HAVE_TCM select ICST select MULTI_IRQ_HANDLER - select NEED_MACH_MEMORY_H select PLAT_VERSATILE select SPARSE_IRQ select USE_OF @@@ -527,6 -529,21 +528,6 @@@ config ARCH_DOV help Support for the Marvell Dove SoC 88AP510
-config ARCH_KIRKWOOD - bool "Marvell Kirkwood" - select ARCH_REQUIRE_GPIOLIB - select CPU_FEROCEON - select GENERIC_CLOCKEVENTS - select MVEBU_MBUS - select PCI - select PCI_QUIRKS - select PINCTRL - select PINCTRL_KIRKWOOD - select PLAT_ORION_LEGACY - help - Support for the following Marvell Kirkwood series SoCs: - 88F6180, 88F6192 and 88F6281. - config ARCH_MV78XX0 bool "Marvell MV78xx0" select ARCH_REQUIRE_GPIOLIB @@@ -742,6 -759,61 +743,6 @@@ config ARCH_S3C64X help Samsung S3C64XX series based systems
-config ARCH_S5P64X0 - bool "Samsung S5P6440 S5P6450" - select ATAGS - select CLKDEV_LOOKUP - select CLKSRC_SAMSUNG_PWM - select CPU_V6 - select GENERIC_CLOCKEVENTS - select GPIO_SAMSUNG - select HAVE_S3C2410_I2C if I2C - select HAVE_S3C2410_WATCHDOG if WATCHDOG - select HAVE_S3C_RTC if RTC_CLASS - select NEED_MACH_GPIO_H - select SAMSUNG_ATAGS - select SAMSUNG_WDT_RESET - help - Samsung S5P64X0 CPU based systems, such as the Samsung SMDK6440, - SMDK6450. - -config ARCH_S5PC100 - bool "Samsung S5PC100" - select ARCH_REQUIRE_GPIOLIB - select ATAGS - select CLKDEV_LOOKUP - select CLKSRC_SAMSUNG_PWM - select CPU_V7 - select GENERIC_CLOCKEVENTS - select GPIO_SAMSUNG - select HAVE_S3C2410_I2C if I2C - select HAVE_S3C2410_WATCHDOG if WATCHDOG - select HAVE_S3C_RTC if RTC_CLASS - select NEED_MACH_GPIO_H - select SAMSUNG_ATAGS - select SAMSUNG_WDT_RESET - help - Samsung S5PC100 series based systems - -config ARCH_S5PV210 - bool "Samsung S5PV210/S5PC110" - select ARCH_HAS_HOLES_MEMORYMODEL - select ARCH_SPARSEMEM_ENABLE - select ATAGS - select CLKDEV_LOOKUP - select CLKSRC_SAMSUNG_PWM - select CPU_V7 - select GENERIC_CLOCKEVENTS - select GPIO_SAMSUNG - select HAVE_S3C2410_I2C if I2C - select HAVE_S3C2410_WATCHDOG if WATCHDOG - select HAVE_S3C_RTC if RTC_CLASS - select NEED_MACH_GPIO_H - select NEED_MACH_MEMORY_H - select SAMSUNG_ATAGS - help - Samsung S5PV210/S5PC110 series based systems - config ARCH_DAVINCI bool "TI DaVinci" select ARCH_HAS_HOLES_MEMORYMODEL @@@ -880,6 -952,8 +881,6 @@@ source "arch/arm/mach-ixp4xx/Kconfig
source "arch/arm/mach-keystone/Kconfig"
-source "arch/arm/mach-kirkwood/Kconfig" - source "arch/arm/mach-ks8695/Kconfig"
source "arch/arm/mach-msm/Kconfig" @@@ -931,6 -1005,10 +932,6 @@@ source "arch/arm/mach-s3c24xx/Kconfig
source "arch/arm/mach-s3c64xx/Kconfig"
-source "arch/arm/mach-s5p64x0/Kconfig" - -source "arch/arm/mach-s5pc100/Kconfig" - source "arch/arm/mach-s5pv210/Kconfig"
source "arch/arm/mach-exynos/Kconfig" @@@ -1477,8 -1555,7 +1478,8 @@@ config ARM_PSC config ARCH_NR_GPIO int default 1024 if ARCH_SHMOBILE || ARCH_TEGRA - default 512 if ARCH_EXYNOS || ARCH_KEYSTONE || SOC_OMAP5 || SOC_DRA7XX || ARCH_S3C24XX || ARCH_S3C64XX + default 512 if ARCH_EXYNOS || ARCH_KEYSTONE || SOC_OMAP5 || \ + SOC_DRA7XX || ARCH_S3C24XX || ARCH_S3C64XX || ARCH_S5PV210 default 416 if ARCH_SUNXI default 392 if ARCH_U8500 default 352 if ARCH_VT8500 @@@ -1493,7 -1570,7 +1494,7 @@@ source kernel/Kconfig.preemp
config HZ_FIXED int - default 200 if ARCH_EBSA110 || ARCH_S3C24XX || ARCH_S5P64X0 || \ + default 200 if ARCH_EBSA110 || ARCH_S3C24XX || \ ARCH_S5PV210 || ARCH_EXYNOS4 default AT91_TIMER_HZ if ARCH_AT91 default SHMOBILE_TIMER_HZ if ARCH_SHMOBILE_LEGACY @@@ -1974,6 -2051,8 +1975,8 @@@ config XIP_PHYS_ADD config KEXEC bool "Kexec system call (EXPERIMENTAL)" depends on (!SMP || PM_SLEEP_SMP) + select CRYPTO + select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot @@@ -2118,6 -2197,7 +2121,6 @@@ menu "Power management options source "kernel/power/Kconfig"
config ARCH_SUSPEND_POSSIBLE - depends on !ARCH_S5PC100 depends on CPU_ARM920T || CPU_ARM926T || CPU_FEROCEON || CPU_SA1100 || \ CPU_V6 || CPU_V6K || CPU_V7 || CPU_V7M || CPU_XSC3 || CPU_XSCALE || CPU_MOHAWK def_bool y diff --combined arch/arm64/Kconfig index 9c6f2da,65317fb..44deb53 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@@ -2,6 -2,7 +2,7 @@@ config ARM6 def_bool y select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_OPP + select ARCH_HAS_SG_CHAIN select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_USE_CMPXCHG_LOCKREF select ARCH_WANT_OPTIONAL_GPIOLIB @@@ -10,8 -11,6 +11,8 @@@ select ARM_AMBA select ARM_ARCH_TIMER select ARM_GIC + select ARM_GIC_V3 + select AUDIT_ARCH_COMPAT_GENERIC select BUILDTIME_EXTABLE_SORT select CLONE_BACKWARDS select COMMON_CLK @@@ -30,12 -29,10 +31,12 @@@ select GENERIC_STRNLEN_USER select GENERIC_TIME_VSYSCALL select HARDIRQS_SW_RESEND + select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK select HAVE_C_RECORDMCOUNT + select HAVE_CC_STACKPROTECTOR select HAVE_DEBUG_BUGVERBOSE select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_API_DEBUG @@@ -66,7 -63,6 +67,7 @@@ select RTC_LIB select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE + select HAVE_CONTEXT_TRACKING help ARM 64-bit (AArch64) Linux support.
diff --combined arch/powerpc/Kconfig index fefe7c8,bd6b599..1adebfc --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@@ -111,6 -111,7 +111,7 @@@ config PP select HAVE_DMA_API_DEBUG select HAVE_OPROFILE select HAVE_DEBUG_KMEMLEAK + select ARCH_HAS_SG_CHAIN select GENERIC_ATOMIC64 if PPC32 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select HAVE_PERF_EVENTS @@@ -397,6 -398,8 +398,8 @@@ config PPC64_SUPPORTS_MEMORY_FAILUR config KEXEC bool "kexec system call" depends on (PPC_BOOK3S || FSL_BOOKE || (44x && !SMP)) + select CRYPTO + select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot @@@ -414,7 -417,7 +417,7 @@@ config CRASH_DUMP bool "Build a kdump crash kernel" depends on PPC64 || 6xx || FSL_BOOKE || (44x && !SMP) - select RELOCATABLE if PPC64 || 44x || FSL_BOOKE + select RELOCATABLE if (PPC64 && !COMPILE_TEST) || 44x || FSL_BOOKE help Build a kernel suitable for use as a kdump capture kernel. The same kernel binary can be used as production kernel and dump @@@ -1017,7 -1020,6 +1020,7 @@@ endmen if PPC64 config RELOCATABLE bool "Build a relocatable kernel" + depends on !COMPILE_TEST select NONSTATIC_KERNEL help This builds a kernel image that is capable of running anywhere diff --combined arch/powerpc/kvm/book3s_64_mmu_hv.c index 2d154d9,a41e625..93218ae --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@@ -37,8 -37,6 +37,6 @@@ #include <asm/ppc-opcode.h> #include <asm/cputable.h>
- #include "book3s_hv_cma.h" - /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ #define MAX_LPID_970 63
@@@ -64,10 -62,10 +62,10 @@@ long kvmppc_alloc_hpt(struct kvm *kvm, }
kvm->arch.hpt_cma_alloc = 0; page = kvm_alloc_hpt(1 << (order - PAGE_SHIFT)); if (page) { hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); + memset((void *)hpt, 0, (1 << order)); kvm->arch.hpt_cma_alloc = 1; }
@@@ -450,7 -448,7 +448,7 @@@ static int kvmppc_mmu_book3s_64_hv_xlat unsigned long slb_v; unsigned long pp, key; unsigned long v, gr; - unsigned long *hptep; + __be64 *hptep; int index; int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);
@@@ -473,13 -471,13 +471,13 @@@ preempt_enable(); return -ENOENT; } - hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); - v = hptep[0] & ~HPTE_V_HVLOCK; + hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); + v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; gr = kvm->arch.revmap[index].guest_rpte;
/* Unlock the HPTE */ asm volatile("lwsync" : : : "memory"); - hptep[0] = v; + hptep[0] = cpu_to_be64(v); preempt_enable();
gpte->eaddr = eaddr; @@@ -583,8 -581,7 +581,8 @@@ int kvmppc_book3s_hv_page_fault(struct unsigned long ea, unsigned long dsisr) { struct kvm *kvm = vcpu->kvm; - unsigned long *hptep, hpte[3], r; + unsigned long hpte[3], r; + __be64 *hptep; unsigned long mmu_seq, psize, pte_size; unsigned long gpa_base, gfn_base; unsigned long gpa, gfn, hva, pfn; @@@ -607,16 -604,16 +605,16 @@@ if (ea != vcpu->arch.pgfault_addr) return RESUME_GUEST; index = vcpu->arch.pgfault_index; - hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); + hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); rev = &kvm->arch.revmap[index]; preempt_disable(); while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) cpu_relax(); - hpte[0] = hptep[0] & ~HPTE_V_HVLOCK; - hpte[1] = hptep[1]; + hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; + hpte[1] = be64_to_cpu(hptep[1]); hpte[2] = r = rev->guest_rpte; asm volatile("lwsync" : : : "memory"); - hptep[0] = hpte[0]; + hptep[0] = cpu_to_be64(hpte[0]); preempt_enable();
if (hpte[0] != vcpu->arch.pgfault_hpte[0] || @@@ -732,9 -729,8 +730,9 @@@ preempt_disable(); while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) cpu_relax(); - if ((hptep[0] & ~HPTE_V_HVLOCK) != hpte[0] || hptep[1] != hpte[1] || - rev->guest_rpte != hpte[2]) + if ((be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK) != hpte[0] || + be64_to_cpu(hptep[1]) != hpte[1] || + rev->guest_rpte != hpte[2]) /* HPTE has been changed under us; let the guest retry */ goto out_unlock; hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; @@@ -754,20 -750,20 +752,20 @@@ rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT; r &= rcbits | ~(HPTE_R_R | HPTE_R_C);
- if (hptep[0] & HPTE_V_VALID) { + if (be64_to_cpu(hptep[0]) & HPTE_V_VALID) { /* HPTE was previously valid, so we need to invalidate it */ unlock_rmap(rmap); - hptep[0] |= HPTE_V_ABSENT; + hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); kvmppc_invalidate_hpte(kvm, hptep, index); /* don't lose previous R and C bits */ - r |= hptep[1] & (HPTE_R_R | HPTE_R_C); + r |= be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); } else { kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0); }
- hptep[1] = r; + hptep[1] = cpu_to_be64(r); eieio(); - hptep[0] = hpte[0]; + hptep[0] = cpu_to_be64(hpte[0]); asm volatile("ptesync" : : : "memory"); preempt_enable(); if (page && hpte_is_writable(r)) @@@ -786,7 -782,7 +784,7 @@@ return ret;
out_unlock: - hptep[0] &= ~HPTE_V_HVLOCK; + hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); preempt_enable(); goto out_put; } @@@ -862,7 -858,7 +860,7 @@@ static int kvm_unmap_rmapp(struct kvm * { struct revmap_entry *rev = kvm->arch.revmap; unsigned long h, i, j; - unsigned long *hptep; + __be64 *hptep; unsigned long ptel, psize, rcbits;
for (;;) { @@@ -878,11 -874,11 +876,11 @@@ * rmap chain lock. */ i = *rmapp & KVMPPC_RMAP_INDEX; - hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); + hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { /* unlock rmap before spinning on the HPTE lock */ unlock_rmap(rmapp); - while (hptep[0] & HPTE_V_HVLOCK) + while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK) cpu_relax(); continue; } @@@ -901,14 -897,14 +899,14 @@@
/* Now check and modify the HPTE */ ptel = rev[i].guest_rpte; - psize = hpte_page_size(hptep[0], ptel); - if ((hptep[0] & HPTE_V_VALID) && + psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel); + if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && hpte_rpn(ptel, psize) == gfn) { if (kvm->arch.using_mmu_notifiers) - hptep[0] |= HPTE_V_ABSENT; + hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); kvmppc_invalidate_hpte(kvm, hptep, i); /* Harvest R and C */ - rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C); + rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; if (rcbits & ~rev[i].guest_rpte) { rev[i].guest_rpte = ptel | rcbits; @@@ -916,7 -912,7 +914,7 @@@ } } unlock_rmap(rmapp); - hptep[0] &= ~HPTE_V_HVLOCK; + hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); } return 0; } @@@ -963,7 -959,7 +961,7 @@@ static int kvm_age_rmapp(struct kvm *kv { struct revmap_entry *rev = kvm->arch.revmap; unsigned long head, i, j; - unsigned long *hptep; + __be64 *hptep; int ret = 0;
retry: @@@ -979,24 -975,23 +977,24 @@@
i = head = *rmapp & KVMPPC_RMAP_INDEX; do { - hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); + hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); j = rev[i].forw;
/* If this HPTE isn't referenced, ignore it */ - if (!(hptep[1] & HPTE_R_R)) + if (!(be64_to_cpu(hptep[1]) & HPTE_R_R)) continue;
if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { /* unlock rmap before spinning on the HPTE lock */ unlock_rmap(rmapp); - while (hptep[0] & HPTE_V_HVLOCK) + while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK) cpu_relax(); goto retry; }
/* Now check and modify the HPTE */ - if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_R)) { + if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && + (be64_to_cpu(hptep[1]) & HPTE_R_R)) { kvmppc_clear_ref_hpte(kvm, hptep, i); if (!(rev[i].guest_rpte & HPTE_R_R)) { rev[i].guest_rpte |= HPTE_R_R; @@@ -1004,7 -999,7 +1002,7 @@@ } ret = 1; } - hptep[0] &= ~HPTE_V_HVLOCK; + hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); } while ((i = j) != head);
unlock_rmap(rmapp); @@@ -1038,7 -1033,7 +1036,7 @@@ static int kvm_test_age_rmapp(struct kv do { hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4)); j = rev[i].forw; - if (hp[1] & HPTE_R_R) + if (be64_to_cpu(hp[1]) & HPTE_R_R) goto out; } while ((i = j) != head); } @@@ -1078,7 -1073,7 +1076,7 @@@ static int kvm_test_clear_dirty_npages( unsigned long head, i, j; unsigned long n; unsigned long v, r; - unsigned long *hptep; + __be64 *hptep; int npages_dirty = 0;
retry: @@@ -1094,8 -1089,7 +1092,8 @@@
i = head = *rmapp & KVMPPC_RMAP_INDEX; do { - hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); + unsigned long hptep1; + hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); j = rev[i].forw;
/* @@@ -1112,30 -1106,29 +1110,30 @@@ * Otherwise we need to do the tlbie even if C==0 in * order to pick up any delayed writeback of C. */ - if (!(hptep[1] & HPTE_R_C) && - (!hpte_is_writable(hptep[1]) || vcpus_running(kvm))) + hptep1 = be64_to_cpu(hptep[1]); + if (!(hptep1 & HPTE_R_C) && + (!hpte_is_writable(hptep1) || vcpus_running(kvm))) continue;
if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { /* unlock rmap before spinning on the HPTE lock */ unlock_rmap(rmapp); - while (hptep[0] & HPTE_V_HVLOCK) + while (hptep[0] & cpu_to_be64(HPTE_V_HVLOCK)) cpu_relax(); goto retry; }
/* Now check and modify the HPTE */ - if (!(hptep[0] & HPTE_V_VALID)) + if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) continue;
/* need to make it temporarily absent so C is stable */ - hptep[0] |= HPTE_V_ABSENT; + hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); kvmppc_invalidate_hpte(kvm, hptep, i); - v = hptep[0]; - r = hptep[1]; + v = be64_to_cpu(hptep[0]); + r = be64_to_cpu(hptep[1]); if (r & HPTE_R_C) { - hptep[1] = r & ~HPTE_R_C; + hptep[1] = cpu_to_be64(r & ~HPTE_R_C); if (!(rev[i].guest_rpte & HPTE_R_C)) { rev[i].guest_rpte |= HPTE_R_C; note_hpte_modification(kvm, &rev[i]); @@@ -1148,7 -1141,7 +1146,7 @@@ } v &= ~(HPTE_V_ABSENT | HPTE_V_HVLOCK); v |= HPTE_V_VALID; - hptep[0] = v; + hptep[0] = cpu_to_be64(v); } while ((i = j) != head);
unlock_rmap(rmapp); @@@ -1312,7 -1305,7 +1310,7 @@@ struct kvm_htab_ctx * Returns 1 if this HPT entry has been modified or has pending * R/C bit changes. */ -static int hpte_dirty(struct revmap_entry *revp, unsigned long *hptp) +static int hpte_dirty(struct revmap_entry *revp, __be64 *hptp) { unsigned long rcbits_unset;
@@@ -1321,14 -1314,13 +1319,14 @@@
/* Also need to consider changes in reference and changed bits */ rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); - if ((hptp[0] & HPTE_V_VALID) && (hptp[1] & rcbits_unset)) + if ((be64_to_cpu(hptp[0]) & HPTE_V_VALID) && + (be64_to_cpu(hptp[1]) & rcbits_unset)) return 1;
return 0; }
-static long record_hpte(unsigned long flags, unsigned long *hptp, +static long record_hpte(unsigned long flags, __be64 *hptp, unsigned long *hpte, struct revmap_entry *revp, int want_valid, int first_pass) { @@@ -1343,10 -1335,10 +1341,10 @@@ return 0;
valid = 0; - if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) { + if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) { valid = 1; if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && - !(hptp[0] & HPTE_V_BOLTED)) + !(be64_to_cpu(hptp[0]) & HPTE_V_BOLTED)) valid = 0; } if (valid != want_valid) @@@ -1358,7 -1350,7 +1356,7 @@@ preempt_disable(); while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) cpu_relax(); - v = hptp[0]; + v = be64_to_cpu(hptp[0]);
/* re-evaluate valid and dirty from synchronized HPTE value */ valid = !!(v & HPTE_V_VALID); @@@ -1366,9 -1358,9 +1364,9 @@@
/* Harvest R and C into guest view if necessary */ rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); - if (valid && (rcbits_unset & hptp[1])) { - revp->guest_rpte |= (hptp[1] & (HPTE_R_R | HPTE_R_C)) | - HPTE_GR_MODIFIED; + if (valid && (rcbits_unset & be64_to_cpu(hptp[1]))) { + revp->guest_rpte |= (be64_to_cpu(hptp[1]) & + (HPTE_R_R | HPTE_R_C)) | HPTE_GR_MODIFIED; dirty = 1; }
@@@ -1387,13 -1379,13 +1385,13 @@@ revp->guest_rpte = r; } asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); - hptp[0] &= ~HPTE_V_HVLOCK; + hptp[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); preempt_enable(); if (!(valid == want_valid && (first_pass || dirty))) ok = 0; } - hpte[0] = v; - hpte[1] = r; + hpte[0] = cpu_to_be64(v); + hpte[1] = cpu_to_be64(r); return ok; }
@@@ -1403,7 -1395,7 +1401,7 @@@ static ssize_t kvm_htab_read(struct fil struct kvm_htab_ctx *ctx = file->private_data; struct kvm *kvm = ctx->kvm; struct kvm_get_htab_header hdr; - unsigned long *hptp; + __be64 *hptp; struct revmap_entry *revp; unsigned long i, nb, nw; unsigned long __user *lbuf; @@@ -1419,7 -1411,7 +1417,7 @@@ flags = ctx->flags;
i = ctx->index; - hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); + hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); revp = kvm->arch.revmap + i; lbuf = (unsigned long __user *)buf;
@@@ -1503,7 -1495,7 +1501,7 @@@ static ssize_t kvm_htab_write(struct fi unsigned long i, j; unsigned long v, r; unsigned long __user *lbuf; - unsigned long *hptp; + __be64 *hptp; unsigned long tmp[2]; ssize_t nb; long int err, ret; @@@ -1545,7 -1537,7 +1543,7 @@@ i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte) break;
- hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); + hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); lbuf = (unsigned long __user *)buf; for (j = 0; j < hdr.n_valid; ++j) { err = -EFAULT; @@@ -1557,7 -1549,7 +1555,7 @@@ lbuf += 2; nb += HPTE_SIZE;
- if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) + if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) kvmppc_do_h_remove(kvm, 0, i, 0, tmp); err = -EIO; ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r, @@@ -1583,7 -1575,7 +1581,7 @@@ }
for (j = 0; j < hdr.n_invalid; ++j) { - if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) + if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) kvmppc_do_h_remove(kvm, 0, i, 0, tmp); ++i; hptp += 2; diff --combined arch/powerpc/kvm/book3s_hv_builtin.c index 3b41447,6cf498a..329d7fd --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@@ -16,12 -16,14 +16,14 @@@ #include <linux/init.h> #include <linux/memblock.h> #include <linux/sizes.h> + #include <linux/cma.h>
#include <asm/cputable.h> #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h>
- #include "book3s_hv_cma.h" + #define KVM_CMA_CHUNK_ORDER 18 + /* * Hash page table alignment on newer cpus(CPU_FTR_ARCH_206) * should be power of 2. @@@ -43,6 -45,8 +45,8 @@@ static unsigned long kvm_cma_resv_rati unsigned long kvm_rma_pages = (1 << 27) >> PAGE_SHIFT; /* 128MB */ EXPORT_SYMBOL_GPL(kvm_rma_pages);
+ static struct cma *kvm_cma; + /* Work out RMLS (real mode limit selector) field value for a given RMA size. Assumes POWER7 or PPC970. */ static inline int lpcr_rmls(unsigned long rma_size) @@@ -97,7 -101,7 +101,7 @@@ struct kvm_rma_info *kvm_alloc_rma( ri = kmalloc(sizeof(struct kvm_rma_info), GFP_KERNEL); if (!ri) return NULL; - page = kvm_alloc_cma(kvm_rma_pages, kvm_rma_pages); + page = cma_alloc(kvm_cma, kvm_rma_pages, get_order(kvm_rma_pages)); if (!page) goto err_out; atomic_set(&ri->use_count, 1); @@@ -112,7 -116,7 +116,7 @@@ EXPORT_SYMBOL_GPL(kvm_alloc_rma) void kvm_release_rma(struct kvm_rma_info *ri) { if (atomic_dec_and_test(&ri->use_count)) { - kvm_release_cma(pfn_to_page(ri->base_pfn), kvm_rma_pages); + cma_release(kvm_cma, pfn_to_page(ri->base_pfn), kvm_rma_pages); kfree(ri); } } @@@ -131,16 -135,18 +135,18 @@@ struct page *kvm_alloc_hpt(unsigned lon { unsigned long align_pages = HPT_ALIGN_PAGES;
+ VM_BUG_ON(get_order(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); + /* Old CPUs require HPT aligned on a multiple of its size */ if (!cpu_has_feature(CPU_FTR_ARCH_206)) align_pages = nr_pages; - return kvm_alloc_cma(nr_pages, align_pages); + return cma_alloc(kvm_cma, nr_pages, get_order(align_pages)); } EXPORT_SYMBOL_GPL(kvm_alloc_hpt);
void kvm_release_hpt(struct page *page, unsigned long nr_pages) { - kvm_release_cma(page, nr_pages); + cma_release(kvm_cma, page, nr_pages); } EXPORT_SYMBOL_GPL(kvm_release_hpt);
@@@ -179,7 -185,8 +185,8 @@@ void __init kvm_cma_reserve(void align_size = HPT_ALIGN_PAGES << PAGE_SHIFT;
align_size = max(kvm_rma_pages << PAGE_SHIFT, align_size); - kvm_cma_declare_contiguous(selected_size, align_size); + cma_declare_contiguous(0, selected_size, 0, align_size, + KVM_CMA_CHUNK_ORDER - PAGE_SHIFT, false, &kvm_cma); } }
@@@ -212,16 -219,3 +219,16 @@@ bool kvm_hv_mode_active(void { return atomic_read(&hv_vm_count) != 0; } + +extern int hcall_real_table[], hcall_real_table_end[]; + +int kvmppc_hcall_impl_hv_realmode(unsigned long cmd) +{ + cmd /= 4; + if (cmd < hcall_real_table_end - hcall_real_table && + hcall_real_table[cmd]) + return 1; + + return 0; +} +EXPORT_SYMBOL_GPL(kvmppc_hcall_impl_hv_realmode); diff --combined arch/x86/Kconfig index be0d37e,77e7790..8657c06 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@@ -96,6 -96,7 +96,7 @@@ config X8 select IRQ_FORCED_THREADING select HAVE_BPF_JIT if X86_64 select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select ARCH_HAS_SG_CHAIN select CLKEVT_I8253 select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_IOMAP @@@ -536,7 -537,7 +537,7 @@@ config X86_32_IRI
config SCHED_OMIT_FRAME_POINTER def_bool y - prompt "Single-depth WCHAN output" + prompt "Single-depth WCHAN output" if !LTO && !FRAME_POINTER depends on X86 ---help--- Calculate simpler /proc/<PID>/wchan values. If this option @@@ -1576,6 -1577,9 +1577,9 @@@ source kernel/Kconfig.h
config KEXEC bool "kexec system call" + select BUILD_BIN2C + select CRYPTO + select CRYPTO_SHA256 ---help--- kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --combined arch/x86/Makefile index c65fd96,dc302a7..c1aa368 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@@ -15,9 -15,12 +15,9 @@@ endi # that way we can complain to the user if the CPU is insufficient. # # The -m16 option is supported by GCC >= 4.9 and clang >= 3.5. For -# older versions of GCC, we need to play evil and unreliable tricks to -# attempt to ensure that our asm(".code16gcc") is first in the asm -# output. -CODE16GCC_CFLAGS := -m32 -include $(srctree)/arch/x86/boot/code16gcc.h \ - $(call cc-option, -fno-toplevel-reorder,\ - $(call cc-option, -fno-unit-at-a-time)) +# older versions of GCC, include an *assembly* header to make sure that +# gcc doesn't play any games behind our back. +CODE16GCC_CFLAGS := -m32 -Wa,$(srctree)/arch/x86/boot/code16gcc.h M16_CFLAGS := $(call cc-option, -m16, $(CODE16GCC_CFLAGS))
REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \ @@@ -183,6 -186,14 +183,14 @@@ archscripts: scripts_basi archheaders: $(Q)$(MAKE) $(build)=arch/x86/syscalls all
+ archprepare: + ifeq ($(CONFIG_KEXEC),y) + # Build only for 64bit. No loaders for 32bit yet. + ifeq ($(CONFIG_X86_64),y) + $(Q)$(MAKE) $(build)=arch/x86/purgatory arch/x86/purgatory/kexec-purgatory.c + endif + endif + ### # Kernel objects
diff --combined arch/x86/mm/fault.c index 1dbade8,0193a32..939cb8c --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@@ -350,7 -350,7 +350,7 @@@ out
void vmalloc_sync_all(void) { - sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); + sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END, 0); }
/* @@@ -577,8 -577,6 +577,8 @@@ static int is_f00f_bug(struct pt_regs *
static const char nx_warning[] = KERN_CRIT "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; +static const char smep_warning[] = KERN_CRIT +"unable to execute userspace code (SMEP?) (uid: %d)\n";
static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, @@@ -599,10 -597,6 +599,10 @@@
if (pte && pte_present(*pte) && !pte_exec(*pte)) printk(nx_warning, from_kuid(&init_user_ns, current_uid())); + if (pte && pte_present(*pte) && pte_exec(*pte) && + (pgd_flags(*pgd) & _PAGE_USER) && + (read_cr4() & X86_CR4_SMEP)) + printk(smep_warning, from_kuid(&init_user_ns, current_uid())); }
printk(KERN_ALERT "BUG: unable to handle kernel "); diff --combined block/bio-integrity.c index bc423f7b,56754c4..38c8ac2 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@@ -70,10 -70,8 +70,10 @@@ struct bio_integrity_payload *bio_integ bs->bvec_integrity_pool); if (!bip->bip_vec) goto err; + bip->bip_max_vcnt = bvec_nr_vecs(idx); } else { bip->bip_vec = bip->bip_inline_vecs; + bip->bip_max_vcnt = inline_vecs; }
bip->bip_slab = idx; @@@ -116,6 -114,14 +116,6 @@@ void bio_integrity_free(struct bio *bio } EXPORT_SYMBOL(bio_integrity_free);
-static inline unsigned int bip_integrity_vecs(struct bio_integrity_payload *bip) -{ - if (bip->bip_slab == BIO_POOL_NONE) - return BIP_INLINE_VECS; - - return bvec_nr_vecs(bip->bip_slab); -} - /** * bio_integrity_add_page - Attach integrity metadata * @bio: bio to update @@@ -131,7 -137,7 +131,7 @@@ int bio_integrity_add_page(struct bio * struct bio_integrity_payload *bip = bio->bi_integrity; struct bio_vec *iv;
- if (bip->bip_vcnt >= bip_integrity_vecs(bip)) { + if (bip->bip_vcnt >= bip->bip_max_vcnt) { printk(KERN_ERR "%s: bip_vec full\n", __func__); return 0; } @@@ -646,6 -652,4 +646,4 @@@ void __init bio_integrity_init(void sizeof(struct bio_integrity_payload) + sizeof(struct bio_vec) * BIP_INLINE_VECS, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - if (!bip_slab) - panic("Failed to create slab\n"); } diff --combined drivers/ata/libata-core.c index d19c37a7,8f30431..259d879 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@@ -59,6 -59,7 +59,7 @@@ #include <linux/async.h> #include <linux/log2.h> #include <linux/slab.h> + #include <linux/glob.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_host.h> @@@ -4250,73 -4251,6 +4251,6 @@@ static const struct ata_blacklist_entr { } };
- /** - * glob_match - match a text string against a glob-style pattern - * @text: the string to be examined - * @pattern: the glob-style pattern to be matched against - * - * Either/both of text and pattern can be empty strings. - * - * Match text against a glob-style pattern, with wildcards and simple sets: - * - * ? matches any single character. - * * matches any run of characters. - * [xyz] matches a single character from the set: x, y, or z. - * [a-d] matches a single character from the range: a, b, c, or d. - * [a-d0-9] matches a single character from either range. - * - * The special characters ?, [, -, or *, can be matched using a set, eg. [*] - * Behaviour with malformed patterns is undefined, though generally reasonable. - * - * Sample patterns: "SD1?", "SD1[0-5]", "*R0", "SD*1?[012]*xx" - * - * This function uses one level of recursion per '*' in pattern. - * Since it calls _nothing_ else, and has _no_ explicit local variables, - * this will not cause stack problems for any reasonable use here. - * - * RETURNS: - * 0 on match, 1 otherwise. - */ - static int glob_match (const char *text, const char *pattern) - { - do { - /* Match single character or a '?' wildcard */ - if (*text == *pattern || *pattern == '?') { - if (!*pattern++) - return 0; /* End of both strings: match */ - } else { - /* Match single char against a '[' bracketed ']' pattern set */ - if (!*text || *pattern != '[') - break; /* Not a pattern set */ - while (*++pattern && *pattern != ']' && *text != *pattern) { - if (*pattern == '-' && *(pattern - 1) != '[') - if (*text > *(pattern - 1) && *text < *(pattern + 1)) { - ++pattern; - break; - } - } - if (!*pattern || *pattern == ']') - return 1; /* No match */ - while (*pattern && *pattern++ != ']'); - } - } while (*++text && *pattern); - - /* Match any run of chars against a '*' wildcard */ - if (*pattern == '*') { - if (!*++pattern) - return 0; /* Match: avoid recursion at end of pattern */ - /* Loop to handle additional pattern chars after the wildcard */ - while (*text) { - if (glob_match(text, pattern) == 0) - return 0; /* Remainder matched */ - ++text; /* Absorb (match) this char and try again */ - } - } - if (!*text && !*pattern) - return 0; /* End of both strings: match */ - return 1; /* No match */ - } - static unsigned long ata_dev_blacklisted(const struct ata_device *dev) { unsigned char model_num[ATA_ID_PROD_LEN + 1]; @@@ -4327,10 -4261,10 +4261,10 @@@ ata_id_c_string(dev->id, model_rev, ATA_ID_FW_REV, sizeof(model_rev));
while (ad->model_num) { - if (!glob_match(model_num, ad->model_num)) { + if (glob_match(model_num, ad->model_num)) { if (ad->model_rev == NULL) return ad->horkage; - if (!glob_match(model_rev, ad->model_rev)) + if (glob_match(model_rev, ad->model_rev)) return ad->horkage; } ad++; @@@ -4787,10 -4721,6 +4721,10 @@@ void swap_buf_le16(u16 *buf, unsigned i * ata_qc_new - Request an available ATA command, for queueing * @ap: target port * + * Some ATA host controllers may implement a queue depth which is less + * than ATA_MAX_QUEUE. So we shouldn't allocate a tag which is beyond + * the hardware limitation. + * * LOCKING: * None. */ @@@ -4798,16 -4728,14 +4732,16 @@@ static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap) { struct ata_queued_cmd *qc = NULL; - unsigned int i, tag; + unsigned int i, tag, max_queue; + + max_queue = ap->scsi_host->can_queue;
/* no command while frozen */ if (unlikely(ap->pflags & ATA_PFLAG_FROZEN)) return NULL;
- for (i = 0; i < ATA_MAX_QUEUE; i++) { - tag = (i + ap->last_tag + 1) % ATA_MAX_QUEUE; + for (i = 0, tag = ap->last_tag + 1; i < max_queue; i++, tag++) { + tag = tag < max_queue ? tag : 0;
/* the last tag is reserved for internal command. */ if (tag == ATA_TAG_INTERNAL) @@@ -6175,16 -6103,6 +6109,16 @@@ int ata_host_register(struct ata_host * { int i, rc;
+ /* + * The max queue supported by hardware must not be greater than + * ATA_MAX_QUEUE. + */ + if (sht->can_queue > ATA_MAX_QUEUE) { + dev_err(host->dev, "BUG: the hardware max queue is too large\n"); + WARN_ON(1); + return -EINVAL; + } + /* host must have been started */ if (!(host->flags & ATA_HOST_STARTED)) { dev_err(host->dev, "BUG: trying to register unstarted host\n"); diff --combined drivers/base/Kconfig index 88500fe,9d5fed1..4e7f0ff --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@@ -149,21 -149,15 +149,21 @@@ config EXTRA_FIRMWARE_DI some other directory containing the firmware files.
config FW_LOADER_USER_HELPER + bool + +config FW_LOADER_USER_HELPER_FALLBACK bool "Fallback user-helper invocation for firmware loading" depends on FW_LOADER - default y + select FW_LOADER_USER_HELPER help This option enables / disables the invocation of user-helper (e.g. udev) for loading firmware files as a fallback after the direct file loading in kernel fails. The user-mode helper is no longer required unless you have a special firmware file that - resides in a non-standard path. + resides in a non-standard path. Moreover, the udev support has + been deprecated upstream. + + If you are unsure about this, say N here.
config DEBUG_DRIVER bool "Driver Core verbose debug messages" @@@ -214,15 -208,6 +214,15 @@@ config DMA_SHARED_BUFFE APIs extension; the file's descriptor can then be passed on to other driver.
+config FENCE_TRACE + bool "Enable verbose FENCE_TRACE messages" + depends on DMA_SHARED_BUFFER + help + Enable the FENCE_TRACE printks. This will add extra + spam to the console log, but will make it easier to diagnose + lockup related problems for dma-buffers shared across multiple + devices. + config DMA_CMA bool "DMA Contiguous Memory Allocator" depends on HAVE_DMA_CONTIGUOUS && CMA @@@ -289,16 -274,6 +289,6 @@@ config CMA_ALIGNMEN
If unsure, leave the default value "8".
- config CMA_AREAS - int "Maximum count of the CMA device-private areas" - default 7 - help - CMA allows to create CMA areas for particular devices. This parameter - sets the maximum number of such device private CMA areas in the - system. - - If unsure, leave the default value "7". - endif
endmenu diff --combined drivers/leds/Kconfig index 8736f69,6784c17..27cf0cd --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@@ -11,9 -11,6 +11,6 @@@ menuconfig NEW_LED Say Y to enable Linux LED support. This allows control of supported LEDs from both userspace and optionally, by kernel events (triggers).
- This is not related to standard keyboard LEDs which are controlled - via the input system. - if NEW_LEDS
config LEDS_CLASS @@@ -32,6 -29,14 +29,6 @@@ config LEDS_88PM860 This option enables support for on-chip LED drivers found on Marvell Semiconductor 88PM8606 PMIC.
-config LEDS_ATMEL_PWM - tristate "LED Support using Atmel PWM outputs" - depends on LEDS_CLASS - depends on ATMEL_PWM - help - This option enables support for LEDs driven using outputs - of the dedicated PWM controller found on newer Atmel SOCs. - config LEDS_LM3530 tristate "LCD Backlight driver for LM3530" depends on LEDS_CLASS diff --combined drivers/net/ethernet/intel/i40e/i40e_ethtool.c index 3abd3cb,c57b085..053e850 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@@ -215,135 -215,52 +215,135 @@@ static int i40e_get_settings(struct net /* hardware is either in 40G mode or 10G mode * NOTE: this section initializes supported and advertising */ + if (!link_up) { + /* link is down and the driver needs to fall back on + * device ID to determine what kinds of info to display, + * it's mostly a guess that may change when link is up + */ + switch (hw->device_id) { + case I40E_DEV_ID_QSFP_A: + case I40E_DEV_ID_QSFP_B: + case I40E_DEV_ID_QSFP_C: + /* pluggable QSFP */ + ecmd->supported = SUPPORTED_40000baseSR4_Full | + SUPPORTED_40000baseCR4_Full | + SUPPORTED_40000baseLR4_Full; + ecmd->advertising = ADVERTISED_40000baseSR4_Full | + ADVERTISED_40000baseCR4_Full | + ADVERTISED_40000baseLR4_Full; + break; + case I40E_DEV_ID_KX_B: + /* backplane 40G */ + ecmd->supported = SUPPORTED_40000baseKR4_Full; + ecmd->advertising = ADVERTISED_40000baseKR4_Full; + break; + case I40E_DEV_ID_KX_C: + /* backplane 10G */ + ecmd->supported = SUPPORTED_10000baseKR_Full; + ecmd->advertising = ADVERTISED_10000baseKR_Full; + break; + default: + /* all the rest are 10G/1G */ + ecmd->supported = SUPPORTED_10000baseT_Full | + SUPPORTED_1000baseT_Full; + ecmd->advertising = ADVERTISED_10000baseT_Full | + ADVERTISED_1000baseT_Full; + break; + } + + /* skip phy_type use as it is zero when link is down */ + goto no_valid_phy_type; + } + switch (hw_link_info->phy_type) { case I40E_PHY_TYPE_40GBASE_CR4: case I40E_PHY_TYPE_40GBASE_CR4_CU: - ecmd->supported = SUPPORTED_40000baseCR4_Full; - ecmd->advertising = ADVERTISED_40000baseCR4_Full; + ecmd->supported = SUPPORTED_Autoneg | + SUPPORTED_40000baseCR4_Full; + ecmd->advertising = ADVERTISED_Autoneg | + ADVERTISED_40000baseCR4_Full; break; case I40E_PHY_TYPE_40GBASE_KR4: - ecmd->supported = SUPPORTED_40000baseKR4_Full; - ecmd->advertising = ADVERTISED_40000baseKR4_Full; + ecmd->supported = SUPPORTED_Autoneg | + SUPPORTED_40000baseKR4_Full; + ecmd->advertising = ADVERTISED_Autoneg | + ADVERTISED_40000baseKR4_Full; break; case I40E_PHY_TYPE_40GBASE_SR4: + case I40E_PHY_TYPE_XLPPI: + case I40E_PHY_TYPE_XLAUI: ecmd->supported = SUPPORTED_40000baseSR4_Full; break; case I40E_PHY_TYPE_40GBASE_LR4: ecmd->supported = SUPPORTED_40000baseLR4_Full; - ecmd->advertising = ADVERTISED_40000baseLR4_Full; break; case I40E_PHY_TYPE_10GBASE_KX4: - ecmd->supported = SUPPORTED_10000baseKX4_Full; - ecmd->advertising = ADVERTISED_10000baseKX4_Full; + ecmd->supported = SUPPORTED_Autoneg | + SUPPORTED_10000baseKX4_Full; + ecmd->advertising = ADVERTISED_Autoneg | + ADVERTISED_10000baseKX4_Full; break; case I40E_PHY_TYPE_10GBASE_KR: - ecmd->supported = SUPPORTED_10000baseKR_Full; - ecmd->advertising = ADVERTISED_10000baseKR_Full; + ecmd->supported = SUPPORTED_Autoneg | + SUPPORTED_10000baseKR_Full; + ecmd->advertising = ADVERTISED_Autoneg | + ADVERTISED_10000baseKR_Full; break; - default: - if (i40e_is_40G_device(hw->device_id)) { - ecmd->supported = SUPPORTED_40000baseSR4_Full; - ecmd->advertising = ADVERTISED_40000baseSR4_Full; - } else { - ecmd->supported = SUPPORTED_10000baseT_Full; - ecmd->advertising = ADVERTISED_10000baseT_Full; - } + case I40E_PHY_TYPE_10GBASE_SR: + case I40E_PHY_TYPE_10GBASE_LR: + ecmd->supported = SUPPORTED_10000baseT_Full; + break; + case I40E_PHY_TYPE_10GBASE_CR1_CU: + case I40E_PHY_TYPE_10GBASE_CR1: + case I40E_PHY_TYPE_10GBASE_T: + ecmd->supported = SUPPORTED_Autoneg | + SUPPORTED_10000baseT_Full; + ecmd->advertising = ADVERTISED_Autoneg | + ADVERTISED_10000baseT_Full; + break; + case I40E_PHY_TYPE_XAUI: + case I40E_PHY_TYPE_XFI: + case I40E_PHY_TYPE_SFI: + case I40E_PHY_TYPE_10GBASE_SFPP_CU: + ecmd->supported = SUPPORTED_10000baseT_Full; break; + case I40E_PHY_TYPE_1000BASE_KX: + case I40E_PHY_TYPE_1000BASE_T: + ecmd->supported = SUPPORTED_Autoneg | + SUPPORTED_1000baseT_Full; + ecmd->advertising = ADVERTISED_Autoneg | + ADVERTISED_1000baseT_Full; + break; + case I40E_PHY_TYPE_100BASE_TX: + ecmd->supported = SUPPORTED_Autoneg | + SUPPORTED_100baseT_Full; + ecmd->advertising = ADVERTISED_Autoneg | + ADVERTISED_100baseT_Full; + break; + case I40E_PHY_TYPE_SGMII: + ecmd->supported = SUPPORTED_Autoneg | + SUPPORTED_1000baseT_Full | + SUPPORTED_100baseT_Full; + ecmd->advertising = ADVERTISED_Autoneg | + ADVERTISED_1000baseT_Full | + ADVERTISED_100baseT_Full; + break; + default: + /* if we got here and link is up something bad is afoot */ + WARN_ON(link_up); }
- ecmd->supported |= SUPPORTED_Autoneg; - ecmd->advertising |= ADVERTISED_Autoneg; +no_valid_phy_type: + /* this is if autoneg is enabled or disabled */ ecmd->autoneg = ((hw_link_info->an_info & I40E_AQ_AN_COMPLETED) ? AUTONEG_ENABLE : AUTONEG_DISABLE);
switch (hw->phy.media_type) { case I40E_MEDIA_TYPE_BACKPLANE: - ecmd->supported |= SUPPORTED_Backplane; - ecmd->advertising |= ADVERTISED_Backplane; + ecmd->supported |= SUPPORTED_Autoneg | + SUPPORTED_Backplane; + ecmd->advertising |= ADVERTISED_Autoneg | + ADVERTISED_Backplane; ecmd->port = PORT_NONE; break; case I40E_MEDIA_TYPE_BASET: @@@ -359,6 -276,7 +359,6 @@@ break; case I40E_MEDIA_TYPE_FIBER: ecmd->supported |= SUPPORTED_FIBRE; - ecmd->advertising |= ADVERTISED_FIBRE; ecmd->port = PORT_FIBRE; break; case I40E_MEDIA_TYPE_UNKNOWN: @@@ -369,25 -287,6 +369,25 @@@
ecmd->transceiver = XCVR_EXTERNAL;
+ ecmd->supported |= SUPPORTED_Pause; + + switch (hw->fc.current_mode) { + case I40E_FC_FULL: + ecmd->advertising |= ADVERTISED_Pause; + break; + case I40E_FC_TX_PAUSE: + ecmd->advertising |= ADVERTISED_Asym_Pause; + break; + case I40E_FC_RX_PAUSE: + ecmd->advertising |= (ADVERTISED_Pause | + ADVERTISED_Asym_Pause); + break; + default: + ecmd->advertising &= ~(ADVERTISED_Pause | + ADVERTISED_Asym_Pause); + break; + } + if (link_up) { switch (link_speed) { case I40E_LINK_SPEED_40GB: @@@ -397,9 -296,6 +397,9 @@@ case I40E_LINK_SPEED_10GB: ethtool_cmd_speed_set(ecmd, SPEED_10000); break; + case I40E_LINK_SPEED_1GB: + ethtool_cmd_speed_set(ecmd, SPEED_1000); + break; default: break; } @@@ -413,182 -309,6 +413,182 @@@ }
/** + * i40e_set_settings - Set Speed and Duplex + * @netdev: network interface device structure + * @ecmd: ethtool command + * + * Set speed/duplex per media_types advertised/forced + **/ +static int i40e_set_settings(struct net_device *netdev, + struct ethtool_cmd *ecmd) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_aq_get_phy_abilities_resp abilities; + struct i40e_aq_set_phy_config config; + struct i40e_pf *pf = np->vsi->back; + struct i40e_vsi *vsi = np->vsi; + struct i40e_hw *hw = &pf->hw; + struct ethtool_cmd safe_ecmd; + i40e_status status = 0; + bool change = false; + int err = 0; + u8 autoneg; + u32 advertise; + + if (vsi != pf->vsi[pf->lan_vsi]) + return -EOPNOTSUPP; + + if (hw->phy.media_type != I40E_MEDIA_TYPE_BASET && + hw->phy.media_type != I40E_MEDIA_TYPE_FIBER && + hw->phy.media_type != I40E_MEDIA_TYPE_BACKPLANE) + return -EOPNOTSUPP; + + /* get our own copy of the bits to check against */ + memset(&safe_ecmd, 0, sizeof(struct ethtool_cmd)); + i40e_get_settings(netdev, &safe_ecmd); + + /* save autoneg and speed out of ecmd */ + autoneg = ecmd->autoneg; + advertise = ecmd->advertising; + + /* set autoneg and speed back to what they currently are */ + ecmd->autoneg = safe_ecmd.autoneg; + ecmd->advertising = safe_ecmd.advertising; + + ecmd->cmd = safe_ecmd.cmd; + /* If ecmd and safe_ecmd are not the same now, then they are + * trying to set something that we do not support + */ + if (memcmp(ecmd, &safe_ecmd, sizeof(struct ethtool_cmd))) + return -EOPNOTSUPP; + + while (test_bit(__I40E_CONFIG_BUSY, &vsi->state)) + usleep_range(1000, 2000); + + /* Get the current phy config */ + status = i40e_aq_get_phy_capabilities(hw, false, false, &abilities, + NULL); + if (status) + return -EAGAIN; + + /* Copy link_speed and abilities to config in case they are not + * set below + */ + memset(&config, 0, sizeof(struct i40e_aq_set_phy_config)); + config.link_speed = abilities.link_speed; + config.abilities = abilities.abilities; + + /* Check autoneg */ + if (autoneg == AUTONEG_ENABLE) { + /* If autoneg is not supported, return error */ + if (!(safe_ecmd.supported & SUPPORTED_Autoneg)) { + netdev_info(netdev, "Autoneg not supported on this phy\n"); + return -EINVAL; + } + /* If autoneg was not already enabled */ + if (!(hw->phy.link_info.an_info & I40E_AQ_AN_COMPLETED)) { + config.abilities = abilities.abilities | + I40E_AQ_PHY_ENABLE_AN; + change = true; + } + } else { + /* If autoneg is supported 10GBASE_T is the only phy that + * can disable it, so otherwise return error + */ + if (safe_ecmd.supported & SUPPORTED_Autoneg && + hw->phy.link_info.phy_type != I40E_PHY_TYPE_10GBASE_T) { + netdev_info(netdev, "Autoneg cannot be disabled on this phy\n"); + return -EINVAL; + } + /* If autoneg is currently enabled */ + if (hw->phy.link_info.an_info & I40E_AQ_AN_COMPLETED) { + config.abilities = abilities.abilities | + ~I40E_AQ_PHY_ENABLE_AN; + change = true; + } + } + + if (advertise & ~safe_ecmd.supported) + return -EINVAL; + + if (advertise & ADVERTISED_100baseT_Full) + if (!(abilities.link_speed & I40E_LINK_SPEED_100MB)) { + config.link_speed |= I40E_LINK_SPEED_100MB; + change = true; + } + if (advertise & ADVERTISED_1000baseT_Full || + advertise & ADVERTISED_1000baseKX_Full) + if (!(abilities.link_speed & I40E_LINK_SPEED_1GB)) { + config.link_speed |= I40E_LINK_SPEED_1GB; + change = true; + } + if (advertise & ADVERTISED_10000baseT_Full || + advertise & ADVERTISED_10000baseKX4_Full || + advertise & ADVERTISED_10000baseKR_Full) + if (!(abilities.link_speed & I40E_LINK_SPEED_10GB)) { + config.link_speed |= I40E_LINK_SPEED_10GB; + change = true; + } + if (advertise & ADVERTISED_40000baseKR4_Full || + advertise & ADVERTISED_40000baseCR4_Full || + advertise & ADVERTISED_40000baseSR4_Full || + advertise & ADVERTISED_40000baseLR4_Full) + if (!(abilities.link_speed & I40E_LINK_SPEED_40GB)) { + config.link_speed |= I40E_LINK_SPEED_40GB; + change = true; + } + + if (change) { + /* copy over the rest of the abilities */ + config.phy_type = abilities.phy_type; + config.eee_capability = abilities.eee_capability; + config.eeer = abilities.eeer_val; + config.low_power_ctrl = abilities.d3_lpan; + + /* If link is up set link and an so changes take effect */ + if (hw->phy.link_info.link_info & I40E_AQ_LINK_UP) + config.abilities |= I40E_AQ_PHY_ENABLE_ATOMIC_LINK; + + /* make the aq call */ + status = i40e_aq_set_phy_config(hw, &config, NULL); + if (status) { + netdev_info(netdev, "Set phy config failed with error %d.\n", + status); + return -EAGAIN; + } + + status = i40e_update_link_info(hw, true); + if (status) + netdev_info(netdev, "Updating link info failed with error %d\n", + status); + + } else { + netdev_info(netdev, "Nothing changed, exiting without setting anything.\n"); + } + + return err; +} + +static int i40e_nway_reset(struct net_device *netdev) +{ + /* restart autonegotiation */ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_pf *pf = np->vsi->back; + struct i40e_hw *hw = &pf->hw; + bool link_up = hw->phy.link_info.link_info & I40E_AQ_LINK_UP; + i40e_status ret = 0; + + ret = i40e_aq_set_link_restart_an(hw, link_up, NULL); + if (ret) { + netdev_info(netdev, "link restart failed, aq_err=%d\n", + pf->hw.aq.asq_last_status); + return -EIO; + } + + return 0; +} + +/** * i40e_get_pauseparam - Get Flow Control status * Return tx/rx-pause status **/ @@@ -614,81 -334,6 +614,81 @@@ static void i40e_get_pauseparam(struct } }
+/** + * i40e_set_pauseparam - Set Flow Control parameter + * @netdev: network interface device structure + * @pause: return tx/rx flow control status + **/ +static int i40e_set_pauseparam(struct net_device *netdev, + struct ethtool_pauseparam *pause) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_pf *pf = np->vsi->back; + struct i40e_vsi *vsi = np->vsi; + struct i40e_hw *hw = &pf->hw; + struct i40e_link_status *hw_link_info = &hw->phy.link_info; + bool link_up = hw_link_info->link_info & I40E_AQ_LINK_UP; + i40e_status status; + u8 aq_failures; + int err; + + if (vsi != pf->vsi[pf->lan_vsi]) + return -EOPNOTSUPP; + + if (pause->autoneg != ((hw_link_info->an_info & I40E_AQ_AN_COMPLETED) ? + AUTONEG_ENABLE : AUTONEG_DISABLE)) { + netdev_info(netdev, "To change autoneg please use: ethtool -s <dev> autoneg <on|off>\n"); + return -EOPNOTSUPP; + } + + /* If we have link and don't have autoneg */ + if (!test_bit(__I40E_DOWN, &pf->state) && + !(hw_link_info->an_info & I40E_AQ_AN_COMPLETED)) { + /* Send message that it might not necessarily work*/ + netdev_info(netdev, "Autoneg did not complete so changing settings may not result in an actual change.\n"); + } + + if (hw->fc.current_mode == I40E_FC_PFC) { + netdev_info(netdev, "Priority flow control enabled. Cannot set link flow control.\n"); + return -EOPNOTSUPP; + } + + if (pause->rx_pause && pause->tx_pause) + hw->fc.requested_mode = I40E_FC_FULL; + else if (pause->rx_pause && !pause->tx_pause) + hw->fc.requested_mode = I40E_FC_RX_PAUSE; + else if (!pause->rx_pause && pause->tx_pause) + hw->fc.requested_mode = I40E_FC_TX_PAUSE; + else if (!pause->rx_pause && !pause->tx_pause) + hw->fc.requested_mode = I40E_FC_NONE; + else + return -EINVAL; + + /* Set the fc mode and only restart an if link is up*/ + status = i40e_set_fc(hw, &aq_failures, link_up); + + if (aq_failures & I40E_SET_FC_AQ_FAIL_GET) { + netdev_info(netdev, "Set fc failed on the get_phy_capabilities call with error %d and status %d\n", + status, hw->aq.asq_last_status); + err = -EAGAIN; + } + if (aq_failures & I40E_SET_FC_AQ_FAIL_SET) { + netdev_info(netdev, "Set fc failed on the set_phy_config call with error %d and status %d\n", + status, hw->aq.asq_last_status); + err = -EAGAIN; + } + if (aq_failures & I40E_SET_FC_AQ_FAIL_UPDATE) { + netdev_info(netdev, "Set fc failed on the update_link_info call with error %d and status %d\n", + status, hw->aq.asq_last_status); + err = -EAGAIN; + } + + if (!test_bit(__I40E_DOWN, &pf->state)) + return i40e_nway_reset(netdev); + + return err; +} + static u32 i40e_get_msglevel(struct net_device *netdev) { struct i40e_netdev_priv *np = netdev_priv(netdev); @@@ -1376,6 -1021,24 +1376,6 @@@ static int i40e_set_wol(struct net_devi return 0; }
-static int i40e_nway_reset(struct net_device *netdev) -{ - /* restart autonegotiation */ - struct i40e_netdev_priv *np = netdev_priv(netdev); - struct i40e_pf *pf = np->vsi->back; - struct i40e_hw *hw = &pf->hw; - i40e_status ret = 0; - - ret = i40e_aq_set_link_restart_an(hw, NULL); - if (ret) { - netdev_info(netdev, "link restart failed, aq_err=%d\n", - pf->hw.aq.asq_last_status); - return -EIO; - } - - return 0; -} - static int i40e_set_phys_id(struct net_device *netdev, enum ethtool_phys_id_state state) { @@@ -1442,36 -1105,17 +1442,36 @@@ static int i40e_set_coalesce(struct net if (ec->tx_max_coalesced_frames_irq || ec->rx_max_coalesced_frames_irq) vsi->work_limit = ec->tx_max_coalesced_frames_irq;
+ vector = vsi->base_vector; if ((ec->rx_coalesce_usecs >= (I40E_MIN_ITR << 1)) && - (ec->rx_coalesce_usecs <= (I40E_MAX_ITR << 1))) + (ec->rx_coalesce_usecs <= (I40E_MAX_ITR << 1))) { vsi->rx_itr_setting = ec->rx_coalesce_usecs; - else + } else if (ec->rx_coalesce_usecs == 0) { + vsi->rx_itr_setting = ec->rx_coalesce_usecs; + i40e_irq_dynamic_disable(vsi, vector); + if (ec->use_adaptive_rx_coalesce) + netif_info(pf, drv, netdev, + "Rx-secs=0, need to disable adaptive-Rx for a complete disable\n"); + } else { + netif_info(pf, drv, netdev, + "Invalid value, Rx-usecs range is 0, 8-8160\n"); return -EINVAL; + }
if ((ec->tx_coalesce_usecs >= (I40E_MIN_ITR << 1)) && - (ec->tx_coalesce_usecs <= (I40E_MAX_ITR << 1))) + (ec->tx_coalesce_usecs <= (I40E_MAX_ITR << 1))) { vsi->tx_itr_setting = ec->tx_coalesce_usecs; - else + } else if (ec->tx_coalesce_usecs == 0) { + vsi->tx_itr_setting = ec->tx_coalesce_usecs; + i40e_irq_dynamic_disable(vsi, vector); + if (ec->use_adaptive_tx_coalesce) + netif_info(pf, drv, netdev, + "Tx-secs=0, need to disable adaptive-Tx for a complete disable\n"); + } else { + netif_info(pf, drv, netdev, + "Invalid value, Tx-usecs range is 0, 8-8160\n"); return -EINVAL; + }
if (ec->use_adaptive_rx_coalesce) vsi->rx_itr_setting |= I40E_ITR_DYNAMIC; @@@ -1483,6 -1127,7 +1483,6 @@@ else vsi->tx_itr_setting &= ~I40E_ITR_DYNAMIC;
- vector = vsi->base_vector; for (i = 0; i < vsi->num_q_vectors; i++, vector++) { q_vector = vsi->q_vectors[i]; q_vector->rx.itr = ITR_TO_REG(vsi->rx_itr_setting); @@@ -1853,7 -1498,7 +1853,7 @@@ static int i40e_update_ethtool_fdir_ent
/* add filter to the list */ if (parent) - hlist_add_after(&parent->fdir_node, &input->fdir_node); + hlist_add_behind(&input->fdir_node, &parent->fdir_node); else hlist_add_head(&input->fdir_node, &pf->fdir_filter_list); @@@ -2086,7 -1731,6 +2086,7 @@@ static int i40e_set_channels(struct net
static const struct ethtool_ops i40e_ethtool_ops = { .get_settings = i40e_get_settings, + .set_settings = i40e_set_settings, .get_drvinfo = i40e_get_drvinfo, .get_regs_len = i40e_get_regs_len, .get_regs = i40e_get_regs, @@@ -2099,7 -1743,6 +2099,7 @@@ .get_ringparam = i40e_get_ringparam, .set_ringparam = i40e_set_ringparam, .get_pauseparam = i40e_get_pauseparam, + .set_pauseparam = i40e_set_pauseparam, .get_msglevel = i40e_get_msglevel, .set_msglevel = i40e_set_msglevel, .get_rxnfc = i40e_get_rxnfc, diff --combined drivers/staging/android/binder.c index 02b0379,0ca9785..4f34dc0 --- a/drivers/staging/android/binder.c +++ b/drivers/staging/android/binder.c @@@ -454,8 -454,9 +454,8 @@@ static size_t binder_buffer_size(struc { if (list_is_last(&buffer->entry, &proc->buffers)) return proc->buffer + proc->buffer_size - (void *)buffer->data; - else - return (size_t)list_entry(buffer->entry.next, - struct binder_buffer, entry) - (size_t)buffer->data; + return (size_t)list_entry(buffer->entry.next, + struct binder_buffer, entry) - (size_t)buffer->data; }
static void binder_insert_free_buffer(struct binder_proc *proc, @@@ -585,7 -586,6 +585,6 @@@ static int binder_update_page_range(str
for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) { int ret; - struct page **page_array_ptr;
page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE];
@@@ -598,8 -598,7 +597,7 @@@ } tmp_area.addr = page_addr; tmp_area.size = PAGE_SIZE + PAGE_SIZE /* guard page? */; - page_array_ptr = page; - ret = map_vm_area(&tmp_area, PAGE_KERNEL, &page_array_ptr); + ret = map_vm_area(&tmp_area, PAGE_KERNEL, page); if (ret) { pr_err("%d: binder_alloc_buf failed to map page at %p in kernel\n", proc->pid, page_addr); @@@ -1185,7 -1184,6 +1183,7 @@@ static void binder_send_failed_reply(st uint32_t error_code) { struct binder_thread *target_thread; + struct binder_transaction *next;
BUG_ON(t->flags & TF_ONE_WAY); while (1) { @@@ -1213,23 -1211,24 +1211,23 @@@ target_thread->return_error); } return; - } else { - struct binder_transaction *next = t->from_parent; + } + next = t->from_parent;
- binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, - "send failed reply for transaction %d, target dead\n", - t->debug_id); + binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, + "send failed reply for transaction %d, target dead\n", + t->debug_id);
- binder_pop_transaction(target_thread, t); - if (next == NULL) { - binder_debug(BINDER_DEBUG_DEAD_BINDER, - "reply failed, no target thread at root\n"); - return; - } - t = next; + binder_pop_transaction(target_thread, t); + if (next == NULL) { binder_debug(BINDER_DEBUG_DEAD_BINDER, - "reply failed, no target thread -- retry %d\n", - t->debug_id); + "reply failed, no target thread at root\n"); + return; } + t = next; + binder_debug(BINDER_DEBUG_DEAD_BINDER, + "reply failed, no target thread -- retry %d\n", + t->debug_id); } }
@@@ -2593,106 -2592,6 +2591,106 @@@ static unsigned int binder_poll(struct return 0; }
+static int binder_ioctl_write_read(struct file *filp, + unsigned int cmd, unsigned long arg, + struct binder_thread *thread) +{ + int ret = 0; + struct binder_proc *proc = filp->private_data; + unsigned int size = _IOC_SIZE(cmd); + void __user *ubuf = (void __user *)arg; + struct binder_write_read bwr; + + if (size != sizeof(struct binder_write_read)) { + ret = -EINVAL; + goto out; + } + if (copy_from_user(&bwr, ubuf, sizeof(bwr))) { + ret = -EFAULT; + goto out; + } + binder_debug(BINDER_DEBUG_READ_WRITE, + "%d:%d write %lld at %016llx, read %lld at %016llx\n", + proc->pid, thread->pid, + (u64)bwr.write_size, (u64)bwr.write_buffer, + (u64)bwr.read_size, (u64)bwr.read_buffer); + + if (bwr.write_size > 0) { + ret = binder_thread_write(proc, thread, + bwr.write_buffer, + bwr.write_size, + &bwr.write_consumed); + trace_binder_write_done(ret); + if (ret < 0) { + bwr.read_consumed = 0; + if (copy_to_user(ubuf, &bwr, sizeof(bwr))) + ret = -EFAULT; + goto out; + } + } + if (bwr.read_size > 0) { + ret = binder_thread_read(proc, thread, bwr.read_buffer, + bwr.read_size, + &bwr.read_consumed, + filp->f_flags & O_NONBLOCK); + trace_binder_read_done(ret); + if (!list_empty(&proc->todo)) + wake_up_interruptible(&proc->wait); + if (ret < 0) { + if (copy_to_user(ubuf, &bwr, sizeof(bwr))) + ret = -EFAULT; + goto out; + } + } + binder_debug(BINDER_DEBUG_READ_WRITE, + "%d:%d wrote %lld of %lld, read return %lld of %lld\n", + proc->pid, thread->pid, + (u64)bwr.write_consumed, (u64)bwr.write_size, + (u64)bwr.read_consumed, (u64)bwr.read_size); + if (copy_to_user(ubuf, &bwr, sizeof(bwr))) { + ret = -EFAULT; + goto out; + } +out: + return ret; +} + +static int binder_ioctl_set_ctx_mgr(struct file *filp) +{ + int ret = 0; + struct binder_proc *proc = filp->private_data; + kuid_t curr_euid = current_euid(); + + if (binder_context_mgr_node != NULL) { + pr_err("BINDER_SET_CONTEXT_MGR already set\n"); + ret = -EBUSY; + goto out; + } + if (uid_valid(binder_context_mgr_uid)) { + if (!uid_eq(binder_context_mgr_uid, curr_euid)) { + pr_err("BINDER_SET_CONTEXT_MGR bad uid %d != %d\n", + from_kuid(&init_user_ns, curr_euid), + from_kuid(&init_user_ns, + binder_context_mgr_uid)); + ret = -EPERM; + goto out; + } + } else { + binder_context_mgr_uid = curr_euid; + } + binder_context_mgr_node = binder_new_node(proc, 0, 0); + if (binder_context_mgr_node == NULL) { + ret = -ENOMEM; + goto out; + } + binder_context_mgr_node->local_weak_refs++; + binder_context_mgr_node->local_strong_refs++; + binder_context_mgr_node->has_strong_ref = 1; + binder_context_mgr_node->has_weak_ref = 1; +out: + return ret; +} + static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int ret; @@@ -2700,9 -2599,9 +2698,9 @@@ struct binder_thread *thread; unsigned int size = _IOC_SIZE(cmd); void __user *ubuf = (void __user *)arg; - kuid_t curr_euid = current_euid();
- /*pr_info("binder_ioctl: %d:%d %x %lx\n", proc->pid, current->pid, cmd, arg);*/ + /*pr_info("binder_ioctl: %d:%d %x %lx\n", + proc->pid, current->pid, cmd, arg);*/
trace_binder_ioctl(cmd, arg);
@@@ -2718,11 -2617,61 +2716,11 @@@ }
switch (cmd) { - case BINDER_WRITE_READ: { - struct binder_write_read bwr; - - if (size != sizeof(struct binder_write_read)) { - ret = -EINVAL; + case BINDER_WRITE_READ: + ret = binder_ioctl_write_read(filp, cmd, arg, thread); + if (ret) goto err; - } - if (copy_from_user(&bwr, ubuf, sizeof(bwr))) { - ret = -EFAULT; - goto err; - } - binder_debug(BINDER_DEBUG_READ_WRITE, - "%d:%d write %lld at %016llx, read %lld at %016llx\n", - proc->pid, thread->pid, - (u64)bwr.write_size, (u64)bwr.write_buffer, - (u64)bwr.read_size, (u64)bwr.read_buffer); - - if (bwr.write_size > 0) { - ret = binder_thread_write(proc, thread, - bwr.write_buffer, - bwr.write_size, - &bwr.write_consumed); - trace_binder_write_done(ret); - if (ret < 0) { - bwr.read_consumed = 0; - if (copy_to_user(ubuf, &bwr, sizeof(bwr))) - ret = -EFAULT; - goto err; - } - } - if (bwr.read_size > 0) { - ret = binder_thread_read(proc, thread, bwr.read_buffer, - bwr.read_size, - &bwr.read_consumed, - filp->f_flags & O_NONBLOCK); - trace_binder_read_done(ret); - if (!list_empty(&proc->todo)) - wake_up_interruptible(&proc->wait); - if (ret < 0) { - if (copy_to_user(ubuf, &bwr, sizeof(bwr))) - ret = -EFAULT; - goto err; - } - } - binder_debug(BINDER_DEBUG_READ_WRITE, - "%d:%d wrote %lld of %lld, read return %lld of %lld\n", - proc->pid, thread->pid, - (u64)bwr.write_consumed, (u64)bwr.write_size, - (u64)bwr.read_consumed, (u64)bwr.read_size); - if (copy_to_user(ubuf, &bwr, sizeof(bwr))) { - ret = -EFAULT; - goto err; - } break; - } case BINDER_SET_MAX_THREADS: if (copy_from_user(&proc->max_threads, ubuf, sizeof(proc->max_threads))) { ret = -EINVAL; @@@ -2730,9 -2679,31 +2728,9 @@@ } break; case BINDER_SET_CONTEXT_MGR: - if (binder_context_mgr_node != NULL) { - pr_err("BINDER_SET_CONTEXT_MGR already set\n"); - ret = -EBUSY; - goto err; - } - if (uid_valid(binder_context_mgr_uid)) { - if (!uid_eq(binder_context_mgr_uid, curr_euid)) { - pr_err("BINDER_SET_CONTEXT_MGR bad uid %d != %d\n", - from_kuid(&init_user_ns, curr_euid), - from_kuid(&init_user_ns, binder_context_mgr_uid)); - ret = -EPERM; - goto err; - } - } else { - binder_context_mgr_uid = curr_euid; - } - binder_context_mgr_node = binder_new_node(proc, 0, 0); - if (binder_context_mgr_node == NULL) { - ret = -ENOMEM; + ret = binder_ioctl_set_ctx_mgr(filp); + if (ret) goto err; - } - binder_context_mgr_node->local_weak_refs++; - binder_context_mgr_node->local_strong_refs++; - binder_context_mgr_node->has_strong_ref = 1; - binder_context_mgr_node->has_weak_ref = 1; break; case BINDER_THREAD_EXIT: binder_debug(BINDER_DEBUG_THREADS, "%d:%d exit\n", @@@ -2796,15 -2767,9 +2794,15 @@@ static void binder_vma_close(struct vm_ binder_defer_work(proc, BINDER_DEFERRED_PUT_FILES); }
+static int binder_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + static struct vm_operations_struct binder_vm_ops = { .open = binder_vma_open, .close = binder_vma_close, + .fault = binder_vm_fault, };
static int binder_mmap(struct file *filp, struct vm_area_struct *vma) diff --combined drivers/staging/lustre/lustre/libcfs/hash.c index 5dde794,6db7391..8ef1deb --- a/drivers/staging/lustre/lustre/libcfs/hash.c +++ b/drivers/staging/lustre/lustre/libcfs/hash.c @@@ -107,7 -107,7 +107,7 @@@ * table. Also, user can break the iteration by return 1 in callback. */
-#include <linux/libcfs/libcfs.h> +#include "../../include/linux/libcfs/libcfs.h" #include <linux/seq_file.h>
#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 @@@ -351,7 -351,7 +351,7 @@@ cfs_hash_dh_hnode_add(struct cfs_hash * cfs_hash_dhead_t, dh_head);
if (dh->dh_tail != NULL) /* not empty */ - hlist_add_after(dh->dh_tail, hnode); + hlist_add_behind(hnode, dh->dh_tail); else /* empty list */ hlist_add_head(hnode, &dh->dh_head); dh->dh_tail = hnode; @@@ -406,7 -406,7 +406,7 @@@ cfs_hash_dd_hnode_add(struct cfs_hash * cfs_hash_dhead_dep_t, dd_head);
if (dh->dd_tail != NULL) /* not empty */ - hlist_add_after(dh->dd_tail, hnode); + hlist_add_behind(hnode, dh->dd_tail); else /* empty list */ hlist_add_head(hnode, &dh->dd_head); dh->dd_tail = hnode; diff --combined fs/cifs/cifssmb.c index 7d4361f,c3dc52e..692d79f --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@@ -196,6 -196,10 +196,6 @@@ cifs_reconnect_tcon(struct cifs_tcon *t if (rc) goto out;
- /* - * FIXME: check if wsize needs updated due to negotiated smb buffer - * size shrinking - */ atomic_inc(&tconInfoReconnectCount);
/* tell server Unix caps we support */ @@@ -1513,6 -1517,7 +1513,6 @@@ cifs_readv_receive(struct TCP_Server_In return length;
server->total_read += length; - rdata->bytes = length;
cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n", server->total_read, buflen, data_len); @@@ -1555,18 -1560,12 +1555,18 @@@ cifs_readv_callback(struct mid_q_entry rc); } /* FIXME: should this be counted toward the initiating task? */ - task_io_account_read(rdata->bytes); - cifs_stats_bytes_read(tcon, rdata->bytes); + task_io_account_read(rdata->got_bytes); + cifs_stats_bytes_read(tcon, rdata->got_bytes); break; case MID_REQUEST_SUBMITTED: case MID_RETRY_NEEDED: rdata->result = -EAGAIN; + if (server->sign && rdata->got_bytes) + /* reset bytes number since we can not check a sign */ + rdata->got_bytes = 0; + /* FIXME: should this be counted toward the initiating task? */ + task_io_account_read(rdata->got_bytes); + cifs_stats_bytes_read(tcon, rdata->got_bytes); break; default: rdata->result = -EIO; @@@ -1735,7 -1734,10 +1735,7 @@@ CIFSSMBRead(const unsigned int xid, str
/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ if (*buf) { - if (resp_buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(iov[0].iov_base); - else if (resp_buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base); + free_rsp_buf(resp_buf_type, iov[0].iov_base); } else if (resp_buf_type != CIFS_NO_BUFFER) { /* return buffer to caller to free */ *buf = iov[0].iov_base; @@@ -1900,79 -1902,27 +1900,79 @@@ cifs_writev_requeue(struct cifs_writeda int i, rc; struct inode *inode = wdata->cfile->dentry->d_inode; struct TCP_Server_Info *server; + unsigned int rest_len;
- for (i = 0; i < wdata->nr_pages; i++) { - lock_page(wdata->pages[i]); - clear_page_dirty_for_io(wdata->pages[i]); - } - + server = tlink_tcon(wdata->cfile->tlink)->ses->server; + i = 0; + rest_len = wdata->bytes; do { - server = tlink_tcon(wdata->cfile->tlink)->ses->server; - rc = server->ops->async_writev(wdata, cifs_writedata_release); - } while (rc == -EAGAIN); + struct cifs_writedata *wdata2; + unsigned int j, nr_pages, wsize, tailsz, cur_len; + + wsize = server->ops->wp_retry_size(inode); + if (wsize < rest_len) { + nr_pages = wsize / PAGE_CACHE_SIZE; + if (!nr_pages) { + rc = -ENOTSUPP; + break; + } + cur_len = nr_pages * PAGE_CACHE_SIZE; + tailsz = PAGE_CACHE_SIZE; + } else { + nr_pages = DIV_ROUND_UP(rest_len, PAGE_CACHE_SIZE); + cur_len = rest_len; + tailsz = rest_len - (nr_pages - 1) * PAGE_CACHE_SIZE; + }
- for (i = 0; i < wdata->nr_pages; i++) { - unlock_page(wdata->pages[i]); - if (rc != 0) { - SetPageError(wdata->pages[i]); - end_page_writeback(wdata->pages[i]); - page_cache_release(wdata->pages[i]); + wdata2 = cifs_writedata_alloc(nr_pages, cifs_writev_complete); + if (!wdata2) { + rc = -ENOMEM; + break; } - }
- mapping_set_error(inode->i_mapping, rc); + for (j = 0; j < nr_pages; j++) { + wdata2->pages[j] = wdata->pages[i + j]; + lock_page(wdata2->pages[j]); + clear_page_dirty_for_io(wdata2->pages[j]); + } + + wdata2->sync_mode = wdata->sync_mode; + wdata2->nr_pages = nr_pages; + wdata2->offset = page_offset(wdata2->pages[0]); + wdata2->pagesz = PAGE_CACHE_SIZE; + wdata2->tailsz = tailsz; + wdata2->bytes = cur_len; + + wdata2->cfile = find_writable_file(CIFS_I(inode), false); + if (!wdata2->cfile) { + cifs_dbg(VFS, "No writable handles for inode\n"); + rc = -EBADF; + break; + } + wdata2->pid = wdata2->cfile->pid; + rc = server->ops->async_writev(wdata2, cifs_writedata_release); + + for (j = 0; j < nr_pages; j++) { + unlock_page(wdata2->pages[j]); + if (rc != 0 && rc != -EAGAIN) { + SetPageError(wdata2->pages[j]); + end_page_writeback(wdata2->pages[j]); + page_cache_release(wdata2->pages[j]); + } + } + + if (rc) { + kref_put(&wdata2->refcount, cifs_writedata_release); + if (rc == -EAGAIN) + continue; + mapping_set_error(inode->i_mapping, rc); + break; + } + + rest_len -= cur_len; + i += nr_pages; + } while (i < wdata->nr_pages); + kref_put(&wdata->refcount, cifs_writedata_release); }
@@@ -2253,7 -2203,10 +2253,7 @@@ CIFSSMBWrite2(const unsigned int xid, s }
/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ - if (resp_buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(iov[0].iov_base); - else if (resp_buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base); + free_rsp_buf(resp_buf_type, iov[0].iov_base);
/* Note: On -EAGAIN error only caller can retry on handle based calls since file handle passed in no longer valid */ @@@ -2477,14 -2430,14 +2477,14 @@@ CIFSSMBPosixLock(const unsigned int xid } parm_data = (struct cifs_posix_lock *) ((char *)&pSMBr->hdr.Protocol + data_offset); - if (parm_data->lock_type == __constant_cpu_to_le16(CIFS_UNLCK)) + if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK)) pLockData->fl_type = F_UNLCK; else { if (parm_data->lock_type == - __constant_cpu_to_le16(CIFS_RDLCK)) + cpu_to_le16(CIFS_RDLCK)) pLockData->fl_type = F_RDLCK; else if (parm_data->lock_type == - __constant_cpu_to_le16(CIFS_WRLCK)) + cpu_to_le16(CIFS_WRLCK)) pLockData->fl_type = F_WRLCK;
pLockData->fl_start = le64_to_cpu(parm_data->start); @@@ -2498,7 -2451,10 +2498,7 @@@ plk_err_exit if (pSMB) cifs_small_buf_release(pSMB);
- if (resp_buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(iov[0].iov_base); - else if (resp_buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base); + free_rsp_buf(resp_buf_type, iov[0].iov_base);
/* Note: On -EAGAIN error only caller can retry on handle based calls since file handle passed in no longer valid */ @@@ -3276,25 -3232,25 +3276,25 @@@ CIFSSMB_set_compression(const unsigned pSMB->compression_state = cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
pSMB->TotalParameterCount = 0; - pSMB->TotalDataCount = __constant_cpu_to_le32(2); + pSMB->TotalDataCount = cpu_to_le32(2); pSMB->MaxParameterCount = 0; pSMB->MaxDataCount = 0; pSMB->MaxSetupCount = 4; pSMB->Reserved = 0; pSMB->ParameterOffset = 0; - pSMB->DataCount = __constant_cpu_to_le32(2); + pSMB->DataCount = cpu_to_le32(2); pSMB->DataOffset = cpu_to_le32(offsetof(struct smb_com_transaction_compr_ioctl_req, compression_state) - 4); /* 84 */ pSMB->SetupCount = 4; - pSMB->SubCommand = __constant_cpu_to_le16(NT_TRANSACT_IOCTL); + pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL); pSMB->ParameterCount = 0; - pSMB->FunctionCode = __constant_cpu_to_le32(FSCTL_SET_COMPRESSION); + pSMB->FunctionCode = cpu_to_le32(FSCTL_SET_COMPRESSION); pSMB->IsFsctl = 1; /* FSCTL */ pSMB->IsRootFlag = 0; pSMB->Fid = fid; /* file handle always le */ /* 3 byte pad, followed by 2 byte compress state */ - pSMB->ByteCount = __constant_cpu_to_le16(5); + pSMB->ByteCount = cpu_to_le16(5); inc_rfc1001_len(pSMB, 5);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@@ -3430,10 -3386,10 +3430,10 @@@ static __u16 ACL_to_cifs_posix(char *pa cifs_acl->version = cpu_to_le16(1); if (acl_type == ACL_TYPE_ACCESS) { cifs_acl->access_entry_count = cpu_to_le16(count); - cifs_acl->default_entry_count = __constant_cpu_to_le16(0xFFFF); + cifs_acl->default_entry_count = cpu_to_le16(0xFFFF); } else if (acl_type == ACL_TYPE_DEFAULT) { cifs_acl->default_entry_count = cpu_to_le16(count); - cifs_acl->access_entry_count = __constant_cpu_to_le16(0xFFFF); + cifs_acl->access_entry_count = cpu_to_le16(0xFFFF); } else { cifs_dbg(FYI, "unknown ACL type %d\n", acl_type); return 0; @@@ -3882,7 -3838,10 +3882,7 @@@ CIFSSMBGetCIFSACL(const unsigned int xi } } qsec_out: - if (buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(iov[0].iov_base); - else if (buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base); + free_rsp_buf(buf_type, iov[0].iov_base); /* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ return rc; } diff --combined fs/cifs/file.c index 4b83d72,3c1967c..ba18d5a --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@@ -1058,7 -1058,7 +1058,7 @@@ cifs_push_mandatory_locks(struct cifsFi
max_num = (max_buf - sizeof(struct smb_hdr)) / sizeof(LOCKING_ANDX_RANGE); - buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); + buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); if (!buf) { free_xid(xid); return -ENOMEM; @@@ -1393,7 -1393,7 +1393,7 @@@ cifs_unlock_range(struct cifsFileInfo *
max_num = (max_buf - sizeof(struct smb_hdr)) / sizeof(LOCKING_ANDX_RANGE); - buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); + buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); if (!buf) return -ENOMEM;
@@@ -1670,8 -1670,8 +1670,8 @@@ cifs_write(struct cifsFileInfo *open_fi break; }
- len = min((size_t)cifs_sb->wsize, - write_size - total_written); + len = min(server->ops->wp_retry_size(dentry->d_inode), + (unsigned int)write_size - total_written); /* iov[0] is reserved for smb header */ iov[1].iov_base = (char *)write_data + total_written; iov[1].iov_len = len; @@@ -1878,178 -1878,15 +1878,178 @@@ static int cifs_partialpagewrite(struc return rc; }
+static struct cifs_writedata * +wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping, + pgoff_t end, pgoff_t *index, + unsigned int *found_pages) +{ + unsigned int nr_pages; + struct page **pages; + struct cifs_writedata *wdata; + + wdata = cifs_writedata_alloc((unsigned int)tofind, + cifs_writev_complete); + if (!wdata) + return NULL; + + /* + * find_get_pages_tag seems to return a max of 256 on each + * iteration, so we must call it several times in order to + * fill the array or the wsize is effectively limited to + * 256 * PAGE_CACHE_SIZE. + */ + *found_pages = 0; + pages = wdata->pages; + do { + nr_pages = find_get_pages_tag(mapping, index, + PAGECACHE_TAG_DIRTY, tofind, + pages); + *found_pages += nr_pages; + tofind -= nr_pages; + pages += nr_pages; + } while (nr_pages && tofind && *index <= end); + + return wdata; +} + +static unsigned int +wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages, + struct address_space *mapping, + struct writeback_control *wbc, + pgoff_t end, pgoff_t *index, pgoff_t *next, bool *done) +{ + unsigned int nr_pages = 0, i; + struct page *page; + + for (i = 0; i < found_pages; i++) { + page = wdata->pages[i]; + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or + * invalidated (changing page->mapping to NULL), or even + * swizzled back from swapper_space to tmpfs file + * mapping + */ + + if (nr_pages == 0) + lock_page(page); + else if (!trylock_page(page)) + break; + + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + break; + } + + if (!wbc->range_cyclic && page->index > end) { + *done = true; + unlock_page(page); + break; + } + + if (*next && (page->index != *next)) { + /* Not next consecutive page */ + unlock_page(page); + break; + } + + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + + if (PageWriteback(page) || + !clear_page_dirty_for_io(page)) { + unlock_page(page); + break; + } + + /* + * This actually clears the dirty bit in the radix tree. + * See cifs_writepage() for more commentary. + */ + set_page_writeback(page); + if (page_offset(page) >= i_size_read(mapping->host)) { + *done = true; + unlock_page(page); + end_page_writeback(page); + break; + } + + wdata->pages[i] = page; + *next = page->index + 1; + ++nr_pages; + } + + /* reset index to refind any pages skipped */ + if (nr_pages == 0) + *index = wdata->pages[0]->index + 1; + + /* put any pages we aren't going to use */ + for (i = nr_pages; i < found_pages; i++) { + page_cache_release(wdata->pages[i]); + wdata->pages[i] = NULL; + } + + return nr_pages; +} + +static int +wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages, + struct address_space *mapping, struct writeback_control *wbc) +{ + int rc = 0; + struct TCP_Server_Info *server; + unsigned int i; + + wdata->sync_mode = wbc->sync_mode; + wdata->nr_pages = nr_pages; + wdata->offset = page_offset(wdata->pages[0]); + wdata->pagesz = PAGE_CACHE_SIZE; + wdata->tailsz = min(i_size_read(mapping->host) - + page_offset(wdata->pages[nr_pages - 1]), + (loff_t)PAGE_CACHE_SIZE); + wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + wdata->tailsz; + + if (wdata->cfile != NULL) + cifsFileInfo_put(wdata->cfile); + wdata->cfile = find_writable_file(CIFS_I(mapping->host), false); + if (!wdata->cfile) { + cifs_dbg(VFS, "No writable handles for inode\n"); + rc = -EBADF; + } else { + wdata->pid = wdata->cfile->pid; + server = tlink_tcon(wdata->cfile->tlink)->ses->server; + rc = server->ops->async_writev(wdata, cifs_writedata_release); + } + + for (i = 0; i < nr_pages; ++i) + unlock_page(wdata->pages[i]); + + if (!rc) + return rc; + + /* send failure -- clean up the mess */ + for (i = 0; i < nr_pages; ++i) { + if (rc == -EAGAIN) + redirty_page_for_writepage(wbc, wdata->pages[i]); + else + SetPageError(wdata->pages[i]); + end_page_writeback(wdata->pages[i]); + page_cache_release(wdata->pages[i]); + } + if (rc != -EAGAIN) + mapping_set_error(mapping, rc); + + return rc; +} + static int cifs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct cifs_sb_info *cifs_sb = CIFS_SB(mapping->host->i_sb); + struct TCP_Server_Info *server; bool done = false, scanned = false, range_whole = false; pgoff_t end, index; struct cifs_writedata *wdata; - struct TCP_Server_Info *server; - struct page *page; int rc = 0;
/* @@@ -2069,55 -1906,165 +2069,55 @@@ range_whole = true; scanned = true; } + server = cifs_sb_master_tcon(cifs_sb)->ses->server; retry: while (!done && index <= end) { - unsigned int i, nr_pages, found_pages; - pgoff_t next = 0, tofind; - struct page **pages; + unsigned int nr_pages, found_pages, wsize, credits; + pgoff_t next = 0, tofind, saved_index = index;
- tofind = min((cifs_sb->wsize / PAGE_CACHE_SIZE) - 1, - end - index) + 1; + rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize, + &wsize, &credits); + if (rc) + break; + + tofind = min((wsize / PAGE_CACHE_SIZE) - 1, end - index) + 1;
- wdata = cifs_writedata_alloc((unsigned int)tofind, - cifs_writev_complete); + wdata = wdata_alloc_and_fillpages(tofind, mapping, end, &index, + &found_pages); if (!wdata) { rc = -ENOMEM; + add_credits_and_wake_if(server, credits, 0); break; }
- /* - * find_get_pages_tag seems to return a max of 256 on each - * iteration, so we must call it several times in order to - * fill the array or the wsize is effectively limited to - * 256 * PAGE_CACHE_SIZE. - */ - found_pages = 0; - pages = wdata->pages; - do { - nr_pages = find_get_pages_tag(mapping, &index, - PAGECACHE_TAG_DIRTY, - tofind, pages); - found_pages += nr_pages; - tofind -= nr_pages; - pages += nr_pages; - } while (nr_pages && tofind && index <= end); - if (found_pages == 0) { kref_put(&wdata->refcount, cifs_writedata_release); + add_credits_and_wake_if(server, credits, 0); break; }
- nr_pages = 0; - for (i = 0; i < found_pages; i++) { - page = wdata->pages[i]; - /* - * At this point we hold neither mapping->tree_lock nor - * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even - * swizzled back from swapper_space to tmpfs file - * mapping - */ - - if (nr_pages == 0) - lock_page(page); - else if (!trylock_page(page)) - break; - - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - break; - } - - if (!wbc->range_cyclic && page->index > end) { - done = true; - unlock_page(page); - break; - } - - if (next && (page->index != next)) { - /* Not next consecutive page */ - unlock_page(page); - break; - } - - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - - if (PageWriteback(page) || - !clear_page_dirty_for_io(page)) { - unlock_page(page); - break; - } - - /* - * This actually clears the dirty bit in the radix tree. - * See cifs_writepage() for more commentary. - */ - set_page_writeback(page); - - if (page_offset(page) >= i_size_read(mapping->host)) { - done = true; - unlock_page(page); - end_page_writeback(page); - break; - } - - wdata->pages[i] = page; - next = page->index + 1; - ++nr_pages; - } - - /* reset index to refind any pages skipped */ - if (nr_pages == 0) - index = wdata->pages[0]->index + 1; - - /* put any pages we aren't going to use */ - for (i = nr_pages; i < found_pages; i++) { - page_cache_release(wdata->pages[i]); - wdata->pages[i] = NULL; - } + nr_pages = wdata_prepare_pages(wdata, found_pages, mapping, wbc, + end, &index, &next, &done);
/* nothing to write? */ if (nr_pages == 0) { kref_put(&wdata->refcount, cifs_writedata_release); + add_credits_and_wake_if(server, credits, 0); continue; }
- wdata->sync_mode = wbc->sync_mode; - wdata->nr_pages = nr_pages; - wdata->offset = page_offset(wdata->pages[0]); - wdata->pagesz = PAGE_CACHE_SIZE; - wdata->tailsz = - min(i_size_read(mapping->host) - - page_offset(wdata->pages[nr_pages - 1]), - (loff_t)PAGE_CACHE_SIZE); - wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + - wdata->tailsz; + wdata->credits = credits;
- do { - if (wdata->cfile != NULL) - cifsFileInfo_put(wdata->cfile); - wdata->cfile = find_writable_file(CIFS_I(mapping->host), - false); - if (!wdata->cfile) { - cifs_dbg(VFS, "No writable handles for inode\n"); - rc = -EBADF; - break; - } - wdata->pid = wdata->cfile->pid; - server = tlink_tcon(wdata->cfile->tlink)->ses->server; - rc = server->ops->async_writev(wdata, - cifs_writedata_release); - } while (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN); + rc = wdata_send_pages(wdata, nr_pages, mapping, wbc); + if (rc) + add_credits_and_wake_if(server, wdata->credits, 0);
- for (i = 0; i < nr_pages; ++i) - unlock_page(wdata->pages[i]); + kref_put(&wdata->refcount, cifs_writedata_release);
- /* send failure -- clean up the mess */ - if (rc != 0) { - for (i = 0; i < nr_pages; ++i) { - if (rc == -EAGAIN) - redirty_page_for_writepage(wbc, - wdata->pages[i]); - else - SetPageError(wdata->pages[i]); - end_page_writeback(wdata->pages[i]); - page_cache_release(wdata->pages[i]); - } - if (rc != -EAGAIN) - mapping_set_error(mapping, rc); + if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN) { + index = saved_index; + continue; } - kref_put(&wdata->refcount, cifs_writedata_release);
wbc->nr_to_write -= nr_pages; if (wbc->nr_to_write <= 0) @@@ -2415,106 -2362,125 +2415,106 @@@ cifs_uncached_writev_complete(struct wo kref_put(&wdata->refcount, cifs_uncached_writedata_release); }
-/* attempt to send write to server, retry on any -EAGAIN errors */ static int -cifs_uncached_retry_writev(struct cifs_writedata *wdata) +wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from, + size_t *len, unsigned long nr_pages) { - int rc; - struct TCP_Server_Info *server; + int rc = 0; + size_t save_len, copied, bytes, cur_len = *len; + unsigned long i;
- server = tlink_tcon(wdata->cfile->tlink)->ses->server; + save_len = cur_len; + for (i = 0; i < nr_pages; i++) { + bytes = min_t(const size_t, cur_len, PAGE_SIZE); + copied = copy_page_from_iter(wdata->pages[i], 0, bytes, from); + cur_len -= copied; + /* + * If we didn't copy as much as we expected, then that + * may mean we trod into an unmapped area. Stop copying + * at that point. On the next pass through the big + * loop, we'll likely end up getting a zero-length + * write and bailing out of it. + */ + if (copied < bytes) + break; + } + cur_len = save_len - cur_len; + *len = cur_len;
- do { - if (wdata->cfile->invalidHandle) { - rc = cifs_reopen_file(wdata->cfile, false); - if (rc != 0) - continue; - } - rc = server->ops->async_writev(wdata, - cifs_uncached_writedata_release); - } while (rc == -EAGAIN); + /* + * If we have no data to send, then that probably means that + * the copy above failed altogether. That's most likely because + * the address in the iovec was bogus. Return -EFAULT and let + * the caller free anything we allocated and bail out. + */ + if (!cur_len) + return -EFAULT;
+ /* + * i + 1 now represents the number of pages we actually used in + * the copy phase above. Bring nr_pages down to that, and free + * any pages that we didn't use. + */ + for ( ; nr_pages > i + 1; nr_pages--) + put_page(wdata->pages[nr_pages - 1]); return rc; }
-static ssize_t -cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset) +static int +cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, + struct cifsFileInfo *open_file, + struct cifs_sb_info *cifs_sb, struct list_head *wdata_list) { + int rc = 0; + size_t cur_len; unsigned long nr_pages, i; - size_t bytes, copied, len, cur_len; - ssize_t total_written = 0; - loff_t offset; - struct cifsFileInfo *open_file; - struct cifs_tcon *tcon; - struct cifs_sb_info *cifs_sb; - struct cifs_writedata *wdata, *tmp; - struct list_head wdata_list; - int rc; + struct cifs_writedata *wdata; + struct iov_iter saved_from; + loff_t saved_offset = offset; pid_t pid; - - len = iov_iter_count(from); - rc = generic_write_checks(file, poffset, &len, 0); - if (rc) - return rc; - - if (!len) - return 0; - - iov_iter_truncate(from, len); - - INIT_LIST_HEAD(&wdata_list); - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - open_file = file->private_data; - tcon = tlink_tcon(open_file->tlink); - - if (!tcon->ses->server->ops->async_writev) - return -ENOSYS; - - offset = *poffset; + struct TCP_Server_Info *server;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; else pid = current->tgid;
+ server = tlink_tcon(open_file->tlink)->ses->server; + memcpy(&saved_from, from, sizeof(struct iov_iter)); + do { - size_t save_len; + unsigned int wsize, credits;
- nr_pages = get_numpages(cifs_sb->wsize, len, &cur_len); + rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize, + &wsize, &credits); + if (rc) + break; + + nr_pages = get_numpages(wsize, len, &cur_len); wdata = cifs_writedata_alloc(nr_pages, cifs_uncached_writev_complete); if (!wdata) { rc = -ENOMEM; + add_credits_and_wake_if(server, credits, 0); break; }
rc = cifs_write_allocate_pages(wdata->pages, nr_pages); if (rc) { kfree(wdata); + add_credits_and_wake_if(server, credits, 0); break; }
- save_len = cur_len; - for (i = 0; i < nr_pages; i++) { - bytes = min_t(size_t, cur_len, PAGE_SIZE); - copied = copy_page_from_iter(wdata->pages[i], 0, bytes, - from); - cur_len -= copied; - /* - * If we didn't copy as much as we expected, then that - * may mean we trod into an unmapped area. Stop copying - * at that point. On the next pass through the big - * loop, we'll likely end up getting a zero-length - * write and bailing out of it. - */ - if (copied < bytes) - break; - } - cur_len = save_len - cur_len; - - /* - * If we have no data to send, then that probably means that - * the copy above failed altogether. That's most likely because - * the address in the iovec was bogus. Set the rc to -EFAULT, - * free anything we allocated and bail out. - */ - if (!cur_len) { + rc = wdata_fill_from_iovec(wdata, from, &cur_len, nr_pages); + if (rc) { for (i = 0; i < nr_pages; i++) put_page(wdata->pages[i]); kfree(wdata); - rc = -EFAULT; + add_credits_and_wake_if(server, credits, 0); break; }
- /* - * i + 1 now represents the number of pages we actually used in - * the copy phase above. Bring nr_pages down to that, and free - * any pages that we didn't use. - */ - for ( ; nr_pages > i + 1; nr_pages--) - put_page(wdata->pages[nr_pages - 1]); - wdata->sync_mode = WB_SYNC_ALL; wdata->nr_pages = nr_pages; wdata->offset = (__u64)offset; @@@ -2523,71 -2489,18 +2523,71 @@@ wdata->bytes = cur_len; wdata->pagesz = PAGE_SIZE; wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE); - rc = cifs_uncached_retry_writev(wdata); + wdata->credits = credits; + + if (!wdata->cfile->invalidHandle || + !cifs_reopen_file(wdata->cfile, false)) + rc = server->ops->async_writev(wdata, + cifs_uncached_writedata_release); if (rc) { + add_credits_and_wake_if(server, wdata->credits, 0); kref_put(&wdata->refcount, cifs_uncached_writedata_release); + if (rc == -EAGAIN) { + memcpy(from, &saved_from, + sizeof(struct iov_iter)); + iov_iter_advance(from, offset - saved_offset); + continue; + } break; }
- list_add_tail(&wdata->list, &wdata_list); + list_add_tail(&wdata->list, wdata_list); offset += cur_len; len -= cur_len; } while (len > 0);
+ return rc; +} + +static ssize_t +cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset) +{ + size_t len; + ssize_t total_written = 0; + loff_t offset; + struct cifsFileInfo *open_file; + struct cifs_tcon *tcon; + struct cifs_sb_info *cifs_sb; + struct cifs_writedata *wdata, *tmp; + struct list_head wdata_list; + struct iov_iter saved_from; + int rc; + + len = iov_iter_count(from); + rc = generic_write_checks(file, poffset, &len, 0); + if (rc) + return rc; + + if (!len) + return 0; + + iov_iter_truncate(from, len); + + INIT_LIST_HEAD(&wdata_list); + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + open_file = file->private_data; + tcon = tlink_tcon(open_file->tlink); + + if (!tcon->ses->server->ops->async_writev) + return -ENOSYS; + + offset = *poffset; + memcpy(&saved_from, from, sizeof(struct iov_iter)); + + rc = cifs_write_from_iter(*poffset, len, from, open_file, cifs_sb, + &wdata_list); + /* * If at least one write was successfully sent, then discard any rc * value from the later writes. If the other write succeeds, then @@@ -2616,25 -2529,7 +2616,25 @@@ restart_loop
/* resend call if it's a retryable error */ if (rc == -EAGAIN) { - rc = cifs_uncached_retry_writev(wdata); + struct list_head tmp_list; + struct iov_iter tmp_from; + + INIT_LIST_HEAD(&tmp_list); + list_del_init(&wdata->list); + + memcpy(&tmp_from, &saved_from, + sizeof(struct iov_iter)); + iov_iter_advance(&tmp_from, + wdata->offset - *poffset); + + rc = cifs_write_from_iter(wdata->offset, + wdata->bytes, &tmp_from, + open_file, cifs_sb, &tmp_list); + + list_splice(&tmp_list, &wdata_list); + + kref_put(&wdata->refcount, + cifs_uncached_writedata_release); goto restart_loop; } } @@@ -2827,6 -2722,26 +2827,6 @@@ cifs_uncached_readdata_release(struct k cifs_readdata_release(refcount); }
-static int -cifs_retry_async_readv(struct cifs_readdata *rdata) -{ - int rc; - struct TCP_Server_Info *server; - - server = tlink_tcon(rdata->cfile->tlink)->ses->server; - - do { - if (rdata->cfile->invalidHandle) { - rc = cifs_reopen_file(rdata->cfile, true); - if (rc != 0) - continue; - } - rc = server->ops->async_readv(rdata); - } while (rc == -EAGAIN); - - return rc; -} - /** * cifs_readdata_to_iov - copy data from pages in response to an iovec * @rdata: the readdata response with list of pages holding data @@@ -2839,7 -2754,7 +2839,7 @@@ static int cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) { - size_t remaining = rdata->bytes; + size_t remaining = rdata->got_bytes; unsigned int i;
for (i = 0; i < rdata->nr_pages; i++) { @@@ -2867,12 -2782,11 +2867,12 @@@ static in cifs_uncached_read_into_pages(struct TCP_Server_Info *server, struct cifs_readdata *rdata, unsigned int len) { - int total_read = 0, result = 0; + int result = 0; unsigned int i; unsigned int nr_pages = rdata->nr_pages; struct kvec iov;
+ rdata->got_bytes = 0; rdata->tailsz = PAGE_SIZE; for (i = 0; i < nr_pages; i++) { struct page *page = rdata->pages[i]; @@@ -2906,45 -2820,55 +2906,45 @@@ if (result < 0) break;
- total_read += result; + rdata->got_bytes += result; }
- return total_read > 0 ? total_read : result; + return rdata->got_bytes > 0 && result != -ECONNABORTED ? + rdata->got_bytes : result; }
-ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) +static int +cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, + struct cifs_sb_info *cifs_sb, struct list_head *rdata_list) { - struct file *file = iocb->ki_filp; - ssize_t rc; - size_t len, cur_len; - ssize_t total_read = 0; - loff_t offset = iocb->ki_pos; - unsigned int npages; - struct cifs_sb_info *cifs_sb; - struct cifs_tcon *tcon; - struct cifsFileInfo *open_file; - struct cifs_readdata *rdata, *tmp; - struct list_head rdata_list; + struct cifs_readdata *rdata; + unsigned int npages, rsize, credits; + size_t cur_len; + int rc; pid_t pid; + struct TCP_Server_Info *server;
- len = iov_iter_count(to); - if (!len) - return 0; - - INIT_LIST_HEAD(&rdata_list); - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - open_file = file->private_data; - tcon = tlink_tcon(open_file->tlink); - - if (!tcon->ses->server->ops->async_readv) - return -ENOSYS; + server = tlink_tcon(open_file->tlink)->ses->server;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; else pid = current->tgid;
- if ((file->f_flags & O_ACCMODE) == O_WRONLY) - cifs_dbg(FYI, "attempting read on write only file instance\n"); - do { - cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize); + rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize, + &rsize, &credits); + if (rc) + break; + + cur_len = min_t(const size_t, len, rsize); npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
/* allocate a readdata struct */ rdata = cifs_readdata_alloc(npages, cifs_uncached_readv_complete); if (!rdata) { + add_credits_and_wake_if(server, credits, 0); rc = -ENOMEM; break; } @@@ -2960,113 -2884,44 +2960,113 @@@ rdata->pid = pid; rdata->pagesz = PAGE_SIZE; rdata->read_into_pages = cifs_uncached_read_into_pages; + rdata->credits = credits;
- rc = cifs_retry_async_readv(rdata); + if (!rdata->cfile->invalidHandle || + !cifs_reopen_file(rdata->cfile, true)) + rc = server->ops->async_readv(rdata); error: if (rc) { + add_credits_and_wake_if(server, rdata->credits, 0); kref_put(&rdata->refcount, cifs_uncached_readdata_release); + if (rc == -EAGAIN) + continue; break; }
- list_add_tail(&rdata->list, &rdata_list); + list_add_tail(&rdata->list, rdata_list); offset += cur_len; len -= cur_len; } while (len > 0);
+ return rc; +} + +ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + ssize_t rc; + size_t len; + ssize_t total_read = 0; + loff_t offset = iocb->ki_pos; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + struct cifsFileInfo *open_file; + struct cifs_readdata *rdata, *tmp; + struct list_head rdata_list; + + len = iov_iter_count(to); + if (!len) + return 0; + + INIT_LIST_HEAD(&rdata_list); + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + open_file = file->private_data; + tcon = tlink_tcon(open_file->tlink); + + if (!tcon->ses->server->ops->async_readv) + return -ENOSYS; + + if ((file->f_flags & O_ACCMODE) == O_WRONLY) + cifs_dbg(FYI, "attempting read on write only file instance\n"); + + rc = cifs_send_async_read(offset, len, open_file, cifs_sb, &rdata_list); + /* if at least one read request send succeeded, then reset rc */ if (!list_empty(&rdata_list)) rc = 0;
len = iov_iter_count(to); /* the loop below should proceed in the order of increasing offsets */ +again: list_for_each_entry_safe(rdata, tmp, &rdata_list, list) { - again: if (!rc) { /* FIXME: freezable sleep too? */ rc = wait_for_completion_killable(&rdata->done); if (rc) rc = -EINTR; - else if (rdata->result) { - rc = rdata->result; + else if (rdata->result == -EAGAIN) { /* resend call if it's a retryable error */ - if (rc == -EAGAIN) { - rc = cifs_retry_async_readv(rdata); - goto again; + struct list_head tmp_list; + unsigned int got_bytes = rdata->got_bytes; + + list_del_init(&rdata->list); + INIT_LIST_HEAD(&tmp_list); + + /* + * Got a part of data and then reconnect has + * happened -- fill the buffer and continue + * reading. + */ + if (got_bytes && got_bytes < rdata->bytes) { + rc = cifs_readdata_to_iov(rdata, to); + if (rc) { + kref_put(&rdata->refcount, + cifs_uncached_readdata_release); + continue; + } } - } else { + + rc = cifs_send_async_read( + rdata->offset + got_bytes, + rdata->bytes - got_bytes, + rdata->cfile, cifs_sb, + &tmp_list); + + list_splice(&tmp_list, &rdata_list); + + kref_put(&rdata->refcount, + cifs_uncached_readdata_release); + goto again; + } else if (rdata->result) + rc = rdata->result; + else rc = cifs_readdata_to_iov(rdata, to); - }
+ /* if there was a short read -- discard anything left */ + if (rdata->got_bytes && rdata->got_bytes < rdata->bytes) + rc = -ENODATA; } list_del_init(&rdata->list); kref_put(&rdata->refcount, cifs_uncached_readdata_release); @@@ -3175,19 -3030,18 +3175,19 @@@ cifs_read(struct file *file, char *read
for (total_read = 0, cur_offset = read_data; read_size > total_read; total_read += bytes_read, cur_offset += bytes_read) { - current_read_size = min_t(uint, read_size - total_read, rsize); - /* - * For windows me and 9x we do not want to request more than it - * negotiated since it will refuse the read then. - */ - if ((tcon->ses) && !(tcon->ses->capabilities & + do { + current_read_size = min_t(uint, read_size - total_read, + rsize); + /* + * For windows me and 9x we do not want to request more + * than it negotiated since it will refuse the read + * then. + */ + if ((tcon->ses) && !(tcon->ses->capabilities & tcon->ses->server->vals->cap_large_files)) { - current_read_size = min_t(uint, current_read_size, - CIFSMaxBufSize); - } - rc = -EAGAIN; - while (rc == -EAGAIN) { + current_read_size = min_t(uint, + current_read_size, CIFSMaxBufSize); + } if (open_file->invalidHandle) { rc = cifs_reopen_file(open_file, true); if (rc != 0) @@@ -3200,8 -3054,7 +3200,8 @@@ rc = server->ops->sync_read(xid, open_file, &io_parms, &bytes_read, &cur_offset, &buf_type); - } + } while (rc == -EAGAIN); + if (rc || (bytes_read == 0)) { if (total_read) { break; @@@ -3280,30 -3133,25 +3280,30 @@@ int cifs_file_mmap(struct file *file, s static void cifs_readv_complete(struct work_struct *work) { - unsigned int i; + unsigned int i, got_bytes; struct cifs_readdata *rdata = container_of(work, struct cifs_readdata, work);
+ got_bytes = rdata->got_bytes; for (i = 0; i < rdata->nr_pages; i++) { struct page *page = rdata->pages[i];
lru_cache_add_file(page);
- if (rdata->result == 0) { + if (rdata->result == 0 || + (rdata->result == -EAGAIN && got_bytes)) { flush_dcache_page(page); SetPageUptodate(page); }
unlock_page(page);
- if (rdata->result == 0) + if (rdata->result == 0 || + (rdata->result == -EAGAIN && got_bytes)) cifs_readpage_to_fscache(rdata->mapping->host, page);
+ got_bytes -= min_t(unsigned int, PAGE_CACHE_SIZE, got_bytes); + page_cache_release(page); rdata->pages[i] = NULL; } @@@ -3314,7 -3162,7 +3314,7 @@@ static in cifs_readpages_read_into_pages(struct TCP_Server_Info *server, struct cifs_readdata *rdata, unsigned int len) { - int total_read = 0, result = 0; + int result = 0; unsigned int i; u64 eof; pgoff_t eof_index; @@@ -3326,7 -3174,6 +3326,7 @@@ eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0; cifs_dbg(FYI, "eof=%llu eof_index=%lu\n", eof, eof_index);
+ rdata->got_bytes = 0; rdata->tailsz = PAGE_CACHE_SIZE; for (i = 0; i < nr_pages; i++) { struct page *page = rdata->pages[i]; @@@ -3381,70 -3228,10 +3381,70 @@@ if (result < 0) break;
- total_read += result; + rdata->got_bytes += result; }
- return total_read > 0 ? total_read : result; + return rdata->got_bytes > 0 && result != -ECONNABORTED ? + rdata->got_bytes : result; +} + +static int +readpages_get_pages(struct address_space *mapping, struct list_head *page_list, + unsigned int rsize, struct list_head *tmplist, + unsigned int *nr_pages, loff_t *offset, unsigned int *bytes) +{ + struct page *page, *tpage; + unsigned int expected_index; + int rc; + + INIT_LIST_HEAD(tmplist); + + page = list_entry(page_list->prev, struct page, lru); + + /* + * Lock the page and put it in the cache. Since no one else + * should have access to this page, we're safe to simply set + * PG_locked without checking it first. + */ + __set_page_locked(page); + rc = add_to_page_cache_locked(page, mapping, + page->index, GFP_KERNEL); + + /* give up if we can't stick it in the cache */ + if (rc) { + __clear_page_locked(page); + return rc; + } + + /* move first page to the tmplist */ + *offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + *bytes = PAGE_CACHE_SIZE; + *nr_pages = 1; + list_move_tail(&page->lru, tmplist); + + /* now try and add more pages onto the request */ + expected_index = page->index + 1; + list_for_each_entry_safe_reverse(page, tpage, page_list, lru) { + /* discontinuity ? */ + if (page->index != expected_index) + break; + + /* would this page push the read over the rsize? */ + if (*bytes + PAGE_CACHE_SIZE > rsize) + break; + + __set_page_locked(page); + if (add_to_page_cache_locked(page, mapping, page->index, + GFP_KERNEL)) { + __clear_page_locked(page); + break; + } + list_move_tail(&page->lru, tmplist); + (*bytes) += PAGE_CACHE_SIZE; + expected_index++; + (*nr_pages)++; + } + return rc; }
static int cifs_readpages(struct file *file, struct address_space *mapping, @@@ -3454,10 -3241,19 +3454,10 @@@ struct list_head tmplist; struct cifsFileInfo *open_file = file->private_data; struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - unsigned int rsize = cifs_sb->rsize; + struct TCP_Server_Info *server; pid_t pid;
/* - * Give up immediately if rsize is too small to read an entire page. - * The VFS will fall back to readpage. We should never reach this - * point however since we set ra_pages to 0 when the rsize is smaller - * than a cache page. - */ - if (unlikely(rsize < PAGE_CACHE_SIZE)) - return 0; - - /* * Reads as many pages as possible from fscache. Returns -ENOBUFS * immediately if the cookie is negative * @@@ -3475,7 -3271,7 +3475,7 @@@ pid = current->tgid;
rc = 0; - INIT_LIST_HEAD(&tmplist); + server = tlink_tcon(open_file->tlink)->ses->server;
cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n", __func__, file, mapping, num_pages); @@@ -3492,35 -3288,58 +3492,35 @@@ * the rdata->pages, then we want them in increasing order. */ while (!list_empty(page_list)) { - unsigned int i; - unsigned int bytes = PAGE_CACHE_SIZE; - unsigned int expected_index; - unsigned int nr_pages = 1; + unsigned int i, nr_pages, bytes, rsize; loff_t offset; struct page *page, *tpage; struct cifs_readdata *rdata; + unsigned credits;
- page = list_entry(page_list->prev, struct page, lru); + rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize, + &rsize, &credits); + if (rc) + break;
/* - * Lock the page and put it in the cache. Since no one else - * should have access to this page, we're safe to simply set - * PG_locked without checking it first. + * Give up immediately if rsize is too small to read an entire + * page. The VFS will fall back to readpage. We should never + * reach this point however since we set ra_pages to 0 when the + * rsize is smaller than a cache page. */ - __set_page_locked(page); - rc = add_to_page_cache_locked(page, mapping, - page->index, GFP_KERNEL); + if (unlikely(rsize < PAGE_CACHE_SIZE)) { + add_credits_and_wake_if(server, credits, 0); + return 0; + }
- /* give up if we can't stick it in the cache */ + rc = readpages_get_pages(mapping, page_list, rsize, &tmplist, + &nr_pages, &offset, &bytes); if (rc) { - __clear_page_locked(page); + add_credits_and_wake_if(server, credits, 0); break; }
- /* move first page to the tmplist */ - offset = (loff_t)page->index << PAGE_CACHE_SHIFT; - list_move_tail(&page->lru, &tmplist); - - /* now try and add more pages onto the request */ - expected_index = page->index + 1; - list_for_each_entry_safe_reverse(page, tpage, page_list, lru) { - /* discontinuity ? */ - if (page->index != expected_index) - break; - - /* would this page push the read over the rsize? */ - if (bytes + PAGE_CACHE_SIZE > rsize) - break; - - __set_page_locked(page); - if (add_to_page_cache_locked(page, mapping, - page->index, GFP_KERNEL)) { - __clear_page_locked(page); - break; - } - list_move_tail(&page->lru, &tmplist); - bytes += PAGE_CACHE_SIZE; - expected_index++; - nr_pages++; - } - rdata = cifs_readdata_alloc(nr_pages, cifs_readv_complete); if (!rdata) { /* best to give up if we're out of mem */ @@@ -3531,7 -3350,6 +3531,7 @@@ page_cache_release(page); } rc = -ENOMEM; + add_credits_and_wake_if(server, credits, 0); break; }
@@@ -3542,32 -3360,21 +3542,32 @@@ rdata->pid = pid; rdata->pagesz = PAGE_CACHE_SIZE; rdata->read_into_pages = cifs_readpages_read_into_pages; + rdata->credits = credits;
list_for_each_entry_safe(page, tpage, &tmplist, lru) { list_del(&page->lru); rdata->pages[rdata->nr_pages++] = page; }
- rc = cifs_retry_async_readv(rdata); - if (rc != 0) { + if (!rdata->cfile->invalidHandle || + !cifs_reopen_file(rdata->cfile, true)) + rc = server->ops->async_readv(rdata); + if (rc) { + add_credits_and_wake_if(server, rdata->credits, 0); for (i = 0; i < rdata->nr_pages; i++) { page = rdata->pages[i]; lru_cache_add_file(page); unlock_page(page); page_cache_release(page); + if (rc == -EAGAIN) + list_add_tail(&page->lru, &tmplist); } kref_put(&rdata->refcount, cifs_readdata_release); + if (rc == -EAGAIN) { + /* Re-add pages to the page_list and retry */ + list_splice(&tmplist, page_list); + continue; + } break; }
diff --combined fs/cifs/sess.c index 39ee326,27e6175..39b8507 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@@ -46,7 -46,7 +46,7 @@@ static __u32 cifs_ssetup_hdr(struct cif CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4, USHRT_MAX)); pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); - pSMB->req.VcNumber = __constant_cpu_to_le16(1); + pSMB->req.VcNumber = cpu_to_le16(1);
/* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
@@@ -520,559 -520,382 +520,559 @@@ select_sectype(struct TCP_Server_Info * } }
-int -CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, - const struct nls_table *nls_cp) +struct sess_data { + unsigned int xid; + struct cifs_ses *ses; + struct nls_table *nls_cp; + void (*func)(struct sess_data *); + int result; + + /* we will send the SMB in three pieces: + * a fixed length beginning part, an optional + * SPNEGO blob (which can be zero length), and a + * last part which will include the strings + * and rest of bcc area. This allows us to avoid + * a large buffer 17K allocation + */ + int buf0_type; + struct kvec iov[3]; +}; + +static int +sess_alloc_buffer(struct sess_data *sess_data, int wct) { - int rc = 0; - int wct; + int rc; + struct cifs_ses *ses = sess_data->ses; struct smb_hdr *smb_buf; - char *bcc_ptr; - char *str_area; - SESSION_SETUP_ANDX *pSMB; - __u32 capabilities; - __u16 count; - int resp_buf_type; - struct kvec iov[3]; - enum securityEnum type; - __u16 action, bytes_remaining; - struct key *spnego_key = NULL; - __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ - u16 blob_len; - char *ntlmsspblob = NULL;
- if (ses == NULL) { - WARN(1, "%s: ses == NULL!", __func__); - return -EINVAL; - } + rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses, + (void **)&smb_buf);
- type = select_sectype(ses->server, ses->sectype); - cifs_dbg(FYI, "sess setup type %d\n", type); - if (type == Unspecified) { - cifs_dbg(VFS, - "Unable to select appropriate authentication method!"); - return -EINVAL; + if (rc) + return rc; + + sess_data->iov[0].iov_base = (char *)smb_buf; + sess_data->iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4; + /* + * This variable will be used to clear the buffer + * allocated above in case of any error in the calling function. + */ + sess_data->buf0_type = CIFS_SMALL_BUFFER; + + /* 2000 big enough to fit max user, domain, NOS name etc. */ + sess_data->iov[2].iov_base = kmalloc(2000, GFP_KERNEL); + if (!sess_data->iov[2].iov_base) { + rc = -ENOMEM; + goto out_free_smb_buf; }
- if (type == RawNTLMSSP) { - /* if memory allocation is successful, caller of this function - * frees it. - */ - ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); - if (!ses->ntlmssp) - return -ENOMEM; - ses->ntlmssp->sesskey_per_smbsess = false; + return 0; + +out_free_smb_buf: + kfree(smb_buf); + sess_data->iov[0].iov_base = NULL; + sess_data->iov[0].iov_len = 0; + sess_data->buf0_type = CIFS_NO_BUFFER; + return rc; +} + +static void +sess_free_buffer(struct sess_data *sess_data) +{
+ free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base); + sess_data->buf0_type = CIFS_NO_BUFFER; + kfree(sess_data->iov[2].iov_base); +} + +static int +sess_establish_session(struct sess_data *sess_data) +{ + struct cifs_ses *ses = sess_data->ses; + + mutex_lock(&ses->server->srv_mutex); + if (!ses->server->session_estab) { + if (ses->server->sign) { + ses->server->session_key.response = + kmemdup(ses->auth_key.response, + ses->auth_key.len, GFP_KERNEL); + if (!ses->server->session_key.response) { + mutex_unlock(&ses->server->srv_mutex); + return -ENOMEM; + } + ses->server->session_key.len = + ses->auth_key.len; + } + ses->server->sequence_number = 0x2; + ses->server->session_estab = true; } + mutex_unlock(&ses->server->srv_mutex);
-ssetup_ntlmssp_authenticate: - if (phase == NtLmChallenge) - phase = NtLmAuthenticate; /* if ntlmssp, now final phase */ + cifs_dbg(FYI, "CIFS session established successfully\n"); + spin_lock(&GlobalMid_Lock); + ses->status = CifsGood; + ses->need_reconnect = false; + spin_unlock(&GlobalMid_Lock);
- if (type == LANMAN) { -#ifndef CONFIG_CIFS_WEAK_PW_HASH - /* LANMAN and plaintext are less secure and off by default. - So we make this explicitly be turned on in kconfig (in the - build) and turned on at runtime (changed from the default) - in proc/fs/cifs or via mount parm. Unfortunately this is - needed for old Win (e.g. Win95), some obscure NAS and OS/2 */ - return -EOPNOTSUPP; -#endif - wct = 10; /* lanman 2 style sessionsetup */ - } else if ((type == NTLM) || (type == NTLMv2)) { - /* For NTLMv2 failures eventually may need to retry NTLM */ - wct = 13; /* old style NTLM sessionsetup */ - } else /* same size: negotiate or auth, NTLMSSP or extended security */ - wct = 12; + return 0; +}
- rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses, - (void **)&smb_buf); - if (rc) - return rc; +static int +sess_sendreceive(struct sess_data *sess_data) +{ + int rc; + struct smb_hdr *smb_buf = (struct smb_hdr *) sess_data->iov[0].iov_base; + __u16 count;
- pSMB = (SESSION_SETUP_ANDX *)smb_buf; + count = sess_data->iov[1].iov_len + sess_data->iov[2].iov_len; + smb_buf->smb_buf_length = + cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count); + put_bcc(count, smb_buf); + + rc = SendReceive2(sess_data->xid, sess_data->ses, + sess_data->iov, 3 /* num_iovecs */, + &sess_data->buf0_type, + CIFS_LOG_ERROR); + + return rc; +}
+/* + * LANMAN and plaintext are less secure and off by default. + * So we make this explicitly be turned on in kconfig (in the + * build) and turned on at runtime (changed from the default) + * in proc/fs/cifs or via mount parm. Unfortunately this is + * needed for old Win (e.g. Win95), some obscure NAS and OS/2 + */ +#ifdef CONFIG_CIFS_WEAK_PW_HASH +static void +sess_auth_lanman(struct sess_data *sess_data) +{ + int rc = 0; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + char *bcc_ptr; + struct cifs_ses *ses = sess_data->ses; + char lnm_session_key[CIFS_AUTH_RESP_SIZE]; + __u32 capabilities; + __u16 bytes_remaining; + + /* lanman 2 style sessionsetup */ + /* wct = 10 */ + rc = sess_alloc_buffer(sess_data, 10); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + bcc_ptr = sess_data->iov[2].iov_base; capabilities = cifs_ssetup_hdr(ses, pSMB);
- /* we will send the SMB in three pieces: - a fixed length beginning part, an optional - SPNEGO blob (which can be zero length), and a - last part which will include the strings - and rest of bcc area. This allows us to avoid - a large buffer 17K allocation */ - iov[0].iov_base = (char *)pSMB; - iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4; - - /* setting this here allows the code at the end of the function - to free the request buffer if there's an error */ - resp_buf_type = CIFS_SMALL_BUFFER; + pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
- /* 2000 big enough to fit max user, domain, NOS name etc. */ - str_area = kmalloc(2000, GFP_KERNEL); - if (str_area == NULL) { - rc = -ENOMEM; - goto ssetup_exit; - } - bcc_ptr = str_area; + /* no capabilities flags in old lanman negotiation */ + pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
- iov[1].iov_base = NULL; - iov[1].iov_len = 0; + /* Calculate hash with password and copy into bcc_ptr. + * Encryption Key (stored as in cryptkey) gets used if the + * security mode bit in Negottiate Protocol response states + * to use challenge/response method (i.e. Password bit is 1). + */ + rc = calc_lanman_hash(ses->password, ses->server->cryptkey, + ses->server->sec_mode & SECMODE_PW_ENCRYPT ? + true : false, lnm_session_key);
- if (type == LANMAN) { -#ifdef CONFIG_CIFS_WEAK_PW_HASH - char lnm_session_key[CIFS_AUTH_RESP_SIZE]; + memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + + /* + * can not sign if LANMAN negotiated so no need + * to calculate signing key? but what if server + * changed to do higher than lanman dialect and + * we reconnected would we ever calc signing_key? + */
- pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; + cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n"); + /* Unicode not allowed for LANMAN dialects */ + ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
- /* no capabilities flags in old lanman negotiation */ + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base;
- pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); + rc = sess_sendreceive(sess_data); + if (rc) + goto out;
- /* Calculate hash with password and copy into bcc_ptr. - * Encryption Key (stored as in cryptkey) gets used if the - * security mode bit in Negottiate Protocol response states - * to use challenge/response method (i.e. Password bit is 1). - */ + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
- rc = calc_lanman_hash(ses->password, ses->server->cryptkey, - ses->server->sec_mode & SECMODE_PW_ENCRYPT ? - true : false, lnm_session_key); + /* lanman response has a word count of 3 */ + if (smb_buf->WordCount != 3) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out; + }
- memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE); - bcc_ptr += CIFS_AUTH_RESP_SIZE; + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) + cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
- /* can not sign if LANMAN negotiated so no need - to calculate signing key? but what if server - changed to do higher than lanman dialect and - we reconnected would we ever calc signing_key? */ + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf);
- cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n"); - /* Unicode not allowed for LANMAN dialects */ - ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); + /* BB check if Unicode and decode strings */ + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; + } + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } else { + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } + + rc = sess_establish_session(sess_data); +out: + sess_data->result = rc; + sess_data->func = NULL; + sess_free_buffer(sess_data); +} + +#else + +static void +sess_auth_lanman(struct sess_data *sess_data) +{ + sess_data->result = -EOPNOTSUPP; + sess_data->func = NULL; +} #endif - } else if (type == NTLM) { - pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); - pSMB->req_no_secext.CaseInsensitivePasswordLength = + +static void +sess_auth_ntlm(struct sess_data *sess_data) +{ + int rc = 0; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + char *bcc_ptr; + struct cifs_ses *ses = sess_data->ses; + __u32 capabilities; + __u16 bytes_remaining; + + /* old style NTLM sessionsetup */ + /* wct = 13 */ + rc = sess_alloc_buffer(sess_data, 13); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + bcc_ptr = sess_data->iov[2].iov_base; + capabilities = cifs_ssetup_hdr(ses, pSMB); + + pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); + pSMB->req_no_secext.CaseInsensitivePasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); - pSMB->req_no_secext.CaseSensitivePasswordLength = + pSMB->req_no_secext.CaseSensitivePasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
- /* calculate ntlm response and session key */ - rc = setup_ntlm_response(ses, nls_cp); - if (rc) { - cifs_dbg(VFS, "Error %d during NTLM authentication\n", + /* calculate ntlm response and session key */ + rc = setup_ntlm_response(ses, sess_data->nls_cp); + if (rc) { + cifs_dbg(VFS, "Error %d during NTLM authentication\n", rc); - goto ssetup_exit; - } + goto out; + }
- /* copy ntlm response */ - memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, - CIFS_AUTH_RESP_SIZE); - bcc_ptr += CIFS_AUTH_RESP_SIZE; - memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, - CIFS_AUTH_RESP_SIZE); - bcc_ptr += CIFS_AUTH_RESP_SIZE; - - if (ses->capabilities & CAP_UNICODE) { - /* unicode strings must be word aligned */ - if (iov[0].iov_len % 2) { - *bcc_ptr = 0; - bcc_ptr++; - } - unicode_ssetup_strings(&bcc_ptr, ses, nls_cp); - } else - ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); - } else if (type == NTLMv2) { - pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); - - /* LM2 password would be here if we supported it */ - pSMB->req_no_secext.CaseInsensitivePasswordLength = 0; - - /* calculate nlmv2 response and session key */ - rc = setup_ntlmv2_rsp(ses, nls_cp); - if (rc) { - cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n", - rc); - goto ssetup_exit; + /* copy ntlm response */ + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + + if (ses->capabilities & CAP_UNICODE) { + /* unicode strings must be word aligned */ + if (sess_data->iov[0].iov_len % 2) { + *bcc_ptr = 0; + bcc_ptr++; } - memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, - ses->auth_key.len - CIFS_SESS_KEY_SIZE); - bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE; - - /* set case sensitive password length after tilen may get - * assigned, tilen is 0 otherwise. - */ - pSMB->req_no_secext.CaseSensitivePasswordLength = - cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); + unicode_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); + } else { + ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); + }
- if (ses->capabilities & CAP_UNICODE) { - if (iov[0].iov_len % 2) { - *bcc_ptr = 0; - bcc_ptr++; - } - unicode_ssetup_strings(&bcc_ptr, ses, nls_cp); - } else - ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); - } else if (type == Kerberos) { -#ifdef CONFIG_CIFS_UPCALL - struct cifs_spnego_msg *msg;
- spnego_key = cifs_get_spnego_key(ses); - if (IS_ERR(spnego_key)) { - rc = PTR_ERR(spnego_key); - spnego_key = NULL; - goto ssetup_exit; - } + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base;
- msg = spnego_key->payload.data; - /* check version field to make sure that cifs.upcall is - sending us a response in an expected form */ - if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { - cifs_dbg(VFS, "incorrect version of cifs.upcall " - "expected %d but got %d)", - CIFS_SPNEGO_UPCALL_VERSION, msg->version); - rc = -EKEYREJECTED; - goto ssetup_exit; - } + rc = sess_sendreceive(sess_data); + if (rc) + goto out;
- ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, - GFP_KERNEL); - if (!ses->auth_key.response) { - cifs_dbg(VFS, - "Kerberos can't allocate (%u bytes) memory", - msg->sesskey_len); - rc = -ENOMEM; - goto ssetup_exit; - } - ses->auth_key.len = msg->sesskey_len; - - pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; - capabilities |= CAP_EXTENDED_SECURITY; - pSMB->req.Capabilities = cpu_to_le32(capabilities); - iov[1].iov_base = msg->data + msg->sesskey_len; - iov[1].iov_len = msg->secblob_len; - pSMB->req.SecurityBlobLength = cpu_to_le16(iov[1].iov_len); - - if (ses->capabilities & CAP_UNICODE) { - /* unicode strings must be word aligned */ - if ((iov[0].iov_len + iov[1].iov_len) % 2) { - *bcc_ptr = 0; - bcc_ptr++; - } - unicode_oslm_strings(&bcc_ptr, nls_cp); - unicode_domain_string(&bcc_ptr, ses, nls_cp); - } else - /* BB: is this right? */ - ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); -#else /* ! CONFIG_CIFS_UPCALL */ - cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n"); - rc = -ENOSYS; - goto ssetup_exit; -#endif /* CONFIG_CIFS_UPCALL */ - } else if (type == RawNTLMSSP) { - if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { - cifs_dbg(VFS, "NTLMSSP requires Unicode support\n"); - rc = -ENOSYS; - goto ssetup_exit; - } + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
- cifs_dbg(FYI, "ntlmssp session setup phase %d\n", phase); - pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; - capabilities |= CAP_EXTENDED_SECURITY; - pSMB->req.Capabilities |= cpu_to_le32(capabilities); - switch(phase) { - case NtLmNegotiate: - build_ntlmssp_negotiate_blob( - pSMB->req.SecurityBlob, ses); - iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE); - iov[1].iov_base = pSMB->req.SecurityBlob; - pSMB->req.SecurityBlobLength = - cpu_to_le16(sizeof(NEGOTIATE_MESSAGE)); - break; - case NtLmAuthenticate: - /* - * 5 is an empirical value, large enough to hold - * authenticate message plus max 10 of av paris, - * domain, user, workstation names, flags, etc. - */ - ntlmsspblob = kzalloc( - 5*sizeof(struct _AUTHENTICATE_MESSAGE), - GFP_KERNEL); - if (!ntlmsspblob) { - rc = -ENOMEM; - goto ssetup_exit; - } + if (smb_buf->WordCount != 3) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out; + }
- rc = build_ntlmssp_auth_blob(ntlmsspblob, - &blob_len, ses, nls_cp); - if (rc) - goto ssetup_exit; - iov[1].iov_len = blob_len; - iov[1].iov_base = ntlmsspblob; - pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len); - /* - * Make sure that we tell the server that we are using - * the uid that it just gave us back on the response - * (challenge) - */ - smb_buf->Uid = ses->Suid; - break; - default: - cifs_dbg(VFS, "invalid phase %d\n", phase); - rc = -ENOSYS; - goto ssetup_exit; + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) + cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cifs_dbg(FYI, "UID = %llu\n", ses->Suid); + + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf); + + /* BB check if Unicode and decode strings */ + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; } - /* unicode strings must be word aligned */ - if ((iov[0].iov_len + iov[1].iov_len) % 2) { + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } else { + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } + + rc = sess_establish_session(sess_data); +out: + sess_data->result = rc; + sess_data->func = NULL; + sess_free_buffer(sess_data); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; +} + +static void +sess_auth_ntlmv2(struct sess_data *sess_data) +{ + int rc = 0; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + char *bcc_ptr; + struct cifs_ses *ses = sess_data->ses; + __u32 capabilities; + __u16 bytes_remaining; + + /* old style NTLM sessionsetup */ + /* wct = 13 */ + rc = sess_alloc_buffer(sess_data, 13); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + bcc_ptr = sess_data->iov[2].iov_base; + capabilities = cifs_ssetup_hdr(ses, pSMB); + + pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); + + /* LM2 password would be here if we supported it */ + pSMB->req_no_secext.CaseInsensitivePasswordLength = 0; + + /* calculate nlmv2 response and session key */ + rc = setup_ntlmv2_rsp(ses, sess_data->nls_cp); + if (rc) { + cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n", rc); + goto out; + } + + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + ses->auth_key.len - CIFS_SESS_KEY_SIZE); + bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE; + + /* set case sensitive password length after tilen may get + * assigned, tilen is 0 otherwise. + */ + pSMB->req_no_secext.CaseSensitivePasswordLength = + cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); + + if (ses->capabilities & CAP_UNICODE) { + if (sess_data->iov[0].iov_len % 2) { *bcc_ptr = 0; bcc_ptr++; } - unicode_oslm_strings(&bcc_ptr, nls_cp); + unicode_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); } else { - cifs_dbg(VFS, "secType %d not supported!\n", type); - rc = -ENOSYS; - goto ssetup_exit; + ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); }
- iov[2].iov_base = str_area; - iov[2].iov_len = (long) bcc_ptr - (long) str_area;
- count = iov[1].iov_len + iov[2].iov_len; - smb_buf->smb_buf_length = - cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count); + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base;
- put_bcc(count, smb_buf); + rc = sess_sendreceive(sess_data); + if (rc) + goto out;
- rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type, - CIFS_LOG_ERROR); - /* SMB request buf freed in SendReceive2 */ + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; + + if (smb_buf->WordCount != 3) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out; + } + + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) + cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
- pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; - smb_buf = (struct smb_hdr *)iov[0].iov_base; + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf);
- if ((type == RawNTLMSSP) && (resp_buf_type != CIFS_NO_BUFFER) && - (smb_buf->Status.CifsError == - cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) { - if (phase != NtLmNegotiate) { - cifs_dbg(VFS, "Unexpected more processing error\n"); - goto ssetup_exit; + /* BB check if Unicode and decode strings */ + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; } - /* NTLMSSP Negotiate sent now processing challenge (response) */ - phase = NtLmChallenge; /* process ntlmssp challenge */ - rc = 0; /* MORE_PROC rc is not an error here, but expected */ + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } else { + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); } + + rc = sess_establish_session(sess_data); +out: + sess_data->result = rc; + sess_data->func = NULL; + sess_free_buffer(sess_data); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; +} + +#ifdef CONFIG_CIFS_UPCALL +static void +sess_auth_kerberos(struct sess_data *sess_data) +{ + int rc = 0; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + char *bcc_ptr; + struct cifs_ses *ses = sess_data->ses; + __u32 capabilities; + __u16 bytes_remaining; + struct key *spnego_key = NULL; + struct cifs_spnego_msg *msg; + u16 blob_len; + + /* extended security */ + /* wct = 12 */ + rc = sess_alloc_buffer(sess_data, 12); if (rc) - goto ssetup_exit; + goto out;
- if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) { + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + bcc_ptr = sess_data->iov[2].iov_base; + capabilities = cifs_ssetup_hdr(ses, pSMB); + + spnego_key = cifs_get_spnego_key(ses); + if (IS_ERR(spnego_key)) { + rc = PTR_ERR(spnego_key); + spnego_key = NULL; + goto out; + } + + msg = spnego_key->payload.data; + /* + * check version field to make sure that cifs.upcall is + * sending us a response in an expected form + */ + if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { + cifs_dbg(VFS, + "incorrect version of cifs.upcall (expected %d but got %d)", + CIFS_SPNEGO_UPCALL_VERSION, msg->version); + rc = -EKEYREJECTED; + goto out_put_spnego_key; + } + + ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, + GFP_KERNEL); + if (!ses->auth_key.response) { + cifs_dbg(VFS, "Kerberos can't allocate (%u bytes) memory", + msg->sesskey_len); + rc = -ENOMEM; + goto out_put_spnego_key; + } + ses->auth_key.len = msg->sesskey_len; + + pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; + capabilities |= CAP_EXTENDED_SECURITY; + pSMB->req.Capabilities = cpu_to_le32(capabilities); + sess_data->iov[1].iov_base = msg->data + msg->sesskey_len; + sess_data->iov[1].iov_len = msg->secblob_len; + pSMB->req.SecurityBlobLength = cpu_to_le16(sess_data->iov[1].iov_len); + + if (ses->capabilities & CAP_UNICODE) { + /* unicode strings must be word aligned */ + if ((sess_data->iov[0].iov_len + + sess_data->iov[1].iov_len) % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } + unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp); + unicode_domain_string(&bcc_ptr, ses, sess_data->nls_cp); + } else { + /* BB: is this right? */ + ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); + } + + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base; + + rc = sess_sendreceive(sess_data); + if (rc) + goto out_put_spnego_key; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; + + if (smb_buf->WordCount != 4) { rc = -EIO; cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); - goto ssetup_exit; + goto out_put_spnego_key; } - action = le16_to_cpu(pSMB->resp.Action); - if (action & GUEST_LOGIN) + + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ cifs_dbg(FYI, "UID = %llu\n", ses->Suid); - /* response can have either 3 or 4 word count - Samba sends 3 */ - /* and lanman response is 3 */ + bytes_remaining = get_bcc(smb_buf); bcc_ptr = pByteArea(smb_buf);
- if (smb_buf->WordCount == 4) { - blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); - if (blob_len > bytes_remaining) { - cifs_dbg(VFS, "bad security blob length %d\n", - blob_len); - rc = -EINVAL; - goto ssetup_exit; - } - if (phase == NtLmChallenge) { - rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses); - /* now goto beginning for ntlmssp authenticate phase */ - if (rc) - goto ssetup_exit; - } - bcc_ptr += blob_len; - bytes_remaining -= blob_len; + blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); + if (blob_len > bytes_remaining) { + cifs_dbg(VFS, "bad security blob length %d\n", + blob_len); + rc = -EINVAL; + goto out_put_spnego_key; } + bcc_ptr += blob_len; + bytes_remaining -= blob_len;
/* BB check if Unicode and decode strings */ if (bytes_remaining == 0) { @@@ -1083,371 -906,60 +1083,371 @@@ ++bcc_ptr; --bytes_remaining; } - decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp); + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); } else { - decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp); + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); }
-ssetup_exit: - if (spnego_key) { - key_invalidate(spnego_key); - key_put(spnego_key); + rc = sess_establish_session(sess_data); +out_put_spnego_key: + key_invalidate(spnego_key); + key_put(spnego_key); +out: + sess_data->result = rc; + sess_data->func = NULL; + sess_free_buffer(sess_data); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; +} + +#else + +static void +sess_auth_kerberos(struct sess_data *sess_data) +{ + cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n"); + sess_data->result = -ENOSYS; + sess_data->func = NULL; +} +#endif /* ! CONFIG_CIFS_UPCALL */ + +/* + * The required kvec buffers have to be allocated before calling this + * function. + */ +static int +_sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data) +{ + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + struct cifs_ses *ses = sess_data->ses; + __u32 capabilities; + char *bcc_ptr; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)pSMB; + + capabilities = cifs_ssetup_hdr(ses, pSMB); + if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { + cifs_dbg(VFS, "NTLMSSP requires Unicode support\n"); + return -ENOSYS; } - kfree(str_area); - kfree(ntlmsspblob); - ntlmsspblob = NULL; - if (resp_buf_type == CIFS_SMALL_BUFFER) { - cifs_dbg(FYI, "ssetup freeing small buf %p\n", iov[0].iov_base); - cifs_small_buf_release(iov[0].iov_base); - } else if (resp_buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base);
- /* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */ - if ((phase == NtLmChallenge) && (rc == 0)) - goto ssetup_ntlmssp_authenticate; + pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; + capabilities |= CAP_EXTENDED_SECURITY; + pSMB->req.Capabilities |= cpu_to_le32(capabilities); + + bcc_ptr = sess_data->iov[2].iov_base; + /* unicode strings must be word aligned */ + if ((sess_data->iov[0].iov_len + sess_data->iov[1].iov_len) % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } + unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp); + + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base; + + return 0; +} + +static void +sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data); + +static void +sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data) +{ + int rc; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + struct cifs_ses *ses = sess_data->ses; + __u16 bytes_remaining; + char *bcc_ptr; + u16 blob_len; + + cifs_dbg(FYI, "rawntlmssp session setup negotiate phase\n"); + + /* + * if memory allocation is successful, caller of this function + * frees it. + */ + ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); + if (!ses->ntlmssp) { + rc = -ENOMEM; + goto out; + } + ses->ntlmssp->sesskey_per_smbsess = false; + + /* wct = 12 */ + rc = sess_alloc_buffer(sess_data, 12); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + + /* Build security blob before we assemble the request */ + build_ntlmssp_negotiate_blob(pSMB->req.SecurityBlob, ses); + sess_data->iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE); + sess_data->iov[1].iov_base = pSMB->req.SecurityBlob; + pSMB->req.SecurityBlobLength = cpu_to_le16(sizeof(NEGOTIATE_MESSAGE)); + + rc = _sess_auth_rawntlmssp_assemble_req(sess_data); + if (rc) + goto out; + + rc = sess_sendreceive(sess_data); + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; + + /* If true, rc here is expected and not an error */ + if (sess_data->buf0_type != CIFS_NO_BUFFER && + smb_buf->Status.CifsError == + cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED)) + rc = 0; + + if (rc) + goto out; + + cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n"); + + if (smb_buf->WordCount != 4) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out; + } + + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cifs_dbg(FYI, "UID = %llu\n", ses->Suid); + + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf); + + blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); + if (blob_len > bytes_remaining) { + cifs_dbg(VFS, "bad security blob length %d\n", + blob_len); + rc = -EINVAL; + goto out; + } + + rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses); +out: + sess_free_buffer(sess_data);
if (!rc) { - mutex_lock(&ses->server->srv_mutex); - if (!ses->server->session_estab) { - if (ses->server->sign) { - ses->server->session_key.response = - kmemdup(ses->auth_key.response, - ses->auth_key.len, GFP_KERNEL); - if (!ses->server->session_key.response) { - rc = -ENOMEM; - mutex_unlock(&ses->server->srv_mutex); - goto keycp_exit; - } - ses->server->session_key.len = - ses->auth_key.len; - } - ses->server->sequence_number = 0x2; - ses->server->session_estab = true; - } - mutex_unlock(&ses->server->srv_mutex); + sess_data->func = sess_auth_rawntlmssp_authenticate; + return; + } + + /* Else error. Cleanup */ + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + kfree(ses->ntlmssp); + ses->ntlmssp = NULL; + + sess_data->func = NULL; + sess_data->result = rc; +}
- cifs_dbg(FYI, "CIFS session established successfully\n"); - spin_lock(&GlobalMid_Lock); - ses->status = CifsGood; - ses->need_reconnect = false; - spin_unlock(&GlobalMid_Lock); +static void +sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data) +{ + int rc; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + struct cifs_ses *ses = sess_data->ses; + __u16 bytes_remaining; + char *bcc_ptr; + char *ntlmsspblob = NULL; + u16 blob_len; + + cifs_dbg(FYI, "rawntlmssp session setup authenticate phase\n"); + + /* wct = 12 */ + rc = sess_alloc_buffer(sess_data, 12); + if (rc) + goto out; + + /* Build security blob before we assemble the request */ + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)pSMB; + /* + * 5 is an empirical value, large enough to hold + * authenticate message plus max 10 of av paris, + * domain, user, workstation names, flags, etc. + */ + ntlmsspblob = kzalloc(5*sizeof(struct _AUTHENTICATE_MESSAGE), + GFP_KERNEL); + if (!ntlmsspblob) { + rc = -ENOMEM; + goto out; }
-keycp_exit: + rc = build_ntlmssp_auth_blob(ntlmsspblob, + &blob_len, ses, sess_data->nls_cp); + if (rc) + goto out_free_ntlmsspblob; + sess_data->iov[1].iov_len = blob_len; + sess_data->iov[1].iov_base = ntlmsspblob; + pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len); + /* + * Make sure that we tell the server that we are using + * the uid that it just gave us back on the response + * (challenge) + */ + smb_buf->Uid = ses->Suid; + + rc = _sess_auth_rawntlmssp_assemble_req(sess_data); + if (rc) + goto out_free_ntlmsspblob; + + rc = sess_sendreceive(sess_data); + if (rc) + goto out_free_ntlmsspblob; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; + if (smb_buf->WordCount != 4) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out_free_ntlmsspblob; + } + + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) + cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf); + blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); + if (blob_len > bytes_remaining) { + cifs_dbg(VFS, "bad security blob length %d\n", + blob_len); + rc = -EINVAL; + goto out_free_ntlmsspblob; + } + bcc_ptr += blob_len; + bytes_remaining -= blob_len; + + + /* BB check if Unicode and decode strings */ + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; + } + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } else { + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } + +out_free_ntlmsspblob: + kfree(ntlmsspblob); +out: + sess_free_buffer(sess_data); + + if (!rc) + rc = sess_establish_session(sess_data); + + /* Cleanup */ kfree(ses->auth_key.response); ses->auth_key.response = NULL; kfree(ses->ntlmssp); + ses->ntlmssp = NULL; + + sess_data->func = NULL; + sess_data->result = rc; +} + +static int select_sec(struct cifs_ses *ses, struct sess_data *sess_data) +{ + int type; + + type = select_sectype(ses->server, ses->sectype); + cifs_dbg(FYI, "sess setup type %d\n", type); + if (type == Unspecified) { + cifs_dbg(VFS, + "Unable to select appropriate authentication method!"); + return -EINVAL; + } + + switch (type) { + case LANMAN: + /* LANMAN and plaintext are less secure and off by default. + * So we make this explicitly be turned on in kconfig (in the + * build) and turned on at runtime (changed from the default) + * in proc/fs/cifs or via mount parm. Unfortunately this is + * needed for old Win (e.g. Win95), some obscure NAS and OS/2 */ +#ifdef CONFIG_CIFS_WEAK_PW_HASH + sess_data->func = sess_auth_lanman; + break; +#else + return -EOPNOTSUPP; +#endif + case NTLM: + sess_data->func = sess_auth_ntlm; + break; + case NTLMv2: + sess_data->func = sess_auth_ntlmv2; + break; + case Kerberos: +#ifdef CONFIG_CIFS_UPCALL + sess_data->func = sess_auth_kerberos; + break; +#else + cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n"); + return -ENOSYS; + break; +#endif /* CONFIG_CIFS_UPCALL */ + case RawNTLMSSP: + sess_data->func = sess_auth_rawntlmssp_negotiate; + break; + default: + cifs_dbg(VFS, "secType %d not supported!\n", type); + return -ENOSYS; + } + + return 0; +} + +int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + int rc = 0; + struct sess_data *sess_data; + + if (ses == NULL) { + WARN(1, "%s: ses == NULL!", __func__); + return -EINVAL; + } + + sess_data = kzalloc(sizeof(struct sess_data), GFP_KERNEL); + if (!sess_data) + return -ENOMEM; + + rc = select_sec(ses, sess_data); + if (rc) + goto out; + + sess_data->xid = xid; + sess_data->ses = ses; + sess_data->buf0_type = CIFS_NO_BUFFER; + sess_data->nls_cp = (struct nls_table *) nls_cp; + + while (sess_data->func) + sess_data->func(sess_data); + + /* Store result before we free sess_data */ + rc = sess_data->result;
+out: + kfree(sess_data); return rc; } diff --combined fs/cifs/smb2ops.c index 081529f,7f99a0f..59437c5 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@@ -112,53 -112,6 +112,53 @@@ smb2_get_credits(struct mid_q_entry *mi return le16_to_cpu(((struct smb2_hdr *)mid->resp_buf)->CreditRequest); }
+static int +smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, + unsigned int *num, unsigned int *credits) +{ + int rc = 0; + unsigned int scredits; + + spin_lock(&server->req_lock); + while (1) { + if (server->credits <= 0) { + spin_unlock(&server->req_lock); + cifs_num_waiters_inc(server); + rc = wait_event_killable(server->request_q, + has_credits(server, &server->credits)); + cifs_num_waiters_dec(server); + if (rc) + return rc; + spin_lock(&server->req_lock); + } else { + if (server->tcpStatus == CifsExiting) { + spin_unlock(&server->req_lock); + return -ENOENT; + } + + scredits = server->credits; + /* can deadlock with reopen */ + if (scredits == 1) { + *num = SMB2_MAX_BUFFER_SIZE; + *credits = 0; + break; + } + + /* leave one credit for a possible reopen */ + scredits--; + *num = min_t(unsigned int, size, + scredits * SMB2_MAX_BUFFER_SIZE); + + *credits = DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE); + server->credits -= *credits; + server->in_flight++; + break; + } + } + spin_unlock(&server->req_lock); + return rc; +} + static __u64 smb2_get_next_mid(struct TCP_Server_Info *server) { @@@ -229,6 -182,8 +229,6 @@@ smb2_negotiate_wsize(struct cifs_tcon * /* start with specified wsize, or default */ wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE; wsize = min_t(unsigned int, wsize, server->max_write); - /* set it to the maximum buffer size value we can send with 1 credit */ - wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
return wsize; } @@@ -242,6 -197,8 +242,6 @@@ smb2_negotiate_rsize(struct cifs_tcon * /* start with specified rsize, or default */ rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE; rsize = min_t(unsigned int, rsize, server->max_read); - /* set it to the maximum buffer size value we can send with 1 credit */ - rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
return rsize; } @@@ -590,7 -547,7 +590,7 @@@ smb2_clone_range(const unsigned int xid goto cchunk_out;
/* For now array only one chunk long, will make more flexible later */ - pcchunk->ChunkCount = __constant_cpu_to_le32(1); + pcchunk->ChunkCount = cpu_to_le32(1); pcchunk->Reserved = 0; pcchunk->Reserved2 = 0;
@@@ -1147,13 -1104,6 +1147,13 @@@ smb3_parse_lease_buf(void *buf, unsigne return le32_to_cpu(lc->lcontext.LeaseState); }
+static unsigned int +smb2_wp_retry_size(struct inode *inode) +{ + return min_t(unsigned int, CIFS_SB(inode->i_sb)->wsize, + SMB2_MAX_BUFFER_SIZE); +} + struct smb_version_operations smb20_operations = { .compare_fids = smb2_compare_fids, .setup_request = smb2_setup_request, @@@ -1163,7 -1113,6 +1163,7 @@@ .set_credits = smb2_set_credits, .get_credits_field = smb2_get_credits_field, .get_credits = smb2_get_credits, + .wait_mtu_credits = cifs_wait_mtu_credits, .get_next_mid = smb2_get_next_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, @@@ -1228,7 -1177,6 +1228,7 @@@ .create_lease_buf = smb2_create_lease_buf, .parse_lease_buf = smb2_parse_lease_buf, .clone_range = smb2_clone_range, + .wp_retry_size = smb2_wp_retry_size, };
struct smb_version_operations smb21_operations = { @@@ -1240,7 -1188,6 +1240,7 @@@ .set_credits = smb2_set_credits, .get_credits_field = smb2_get_credits_field, .get_credits = smb2_get_credits, + .wait_mtu_credits = smb2_wait_mtu_credits, .get_next_mid = smb2_get_next_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, @@@ -1305,7 -1252,6 +1305,7 @@@ .create_lease_buf = smb2_create_lease_buf, .parse_lease_buf = smb2_parse_lease_buf, .clone_range = smb2_clone_range, + .wp_retry_size = smb2_wp_retry_size, };
struct smb_version_operations smb30_operations = { @@@ -1317,7 -1263,6 +1317,7 @@@ .set_credits = smb2_set_credits, .get_credits_field = smb2_get_credits_field, .get_credits = smb2_get_credits, + .wait_mtu_credits = smb2_wait_mtu_credits, .get_next_mid = smb2_get_next_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, @@@ -1385,7 -1330,6 +1385,7 @@@ .parse_lease_buf = smb3_parse_lease_buf, .clone_range = smb2_clone_range, .validate_negotiate = smb3_validate_negotiate, + .wp_retry_size = smb2_wp_retry_size, };
struct smb_version_values smb20_values = { diff --combined fs/cifs/smb2pdu.c index 768cddb,a9b03c2..2057250 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@@ -245,6 -245,10 +245,6 @@@ smb2_reconnect(__le16 smb2_command, str if (rc) goto out; atomic_inc(&tconInfoReconnectCount); - /* - * BB FIXME add code to check if wsize needs update due to negotiated - * smb buffer size shrinking. - */ out: /* * Check if handle based operation so we know whether we can continue @@@ -305,6 -309,16 +305,6 @@@ small_smb2_init(__le16 smb2_command, st return rc; }
-static void -free_rsp_buf(int resp_buftype, void *rsp) -{ - if (resp_buftype == CIFS_SMALL_BUFFER) - cifs_small_buf_release(rsp); - else if (resp_buftype == CIFS_LARGE_BUFFER) - cifs_buf_release(rsp); -} - - /* * * SMB2 Worker functions follow: @@@ -1355,7 -1369,7 +1355,7 @@@ SMB2_set_compression(const unsigned in char *ret_data = NULL;
fsctl_input.CompressionState = - __constant_cpu_to_le16(COMPRESSION_FORMAT_DEFAULT); + cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, FSCTL_SET_COMPRESSION, true /* is_fsctl */, @@@ -1724,18 -1738,12 +1724,18 @@@ smb2_readv_callback(struct mid_q_entry rc); } /* FIXME: should this be counted toward the initiating task? */ - task_io_account_read(rdata->bytes); - cifs_stats_bytes_read(tcon, rdata->bytes); + task_io_account_read(rdata->got_bytes); + cifs_stats_bytes_read(tcon, rdata->got_bytes); break; case MID_REQUEST_SUBMITTED: case MID_RETRY_NEEDED: rdata->result = -EAGAIN; + if (server->sign && rdata->got_bytes) + /* reset bytes number since we can not check a sign */ + rdata->got_bytes = 0; + /* FIXME: should this be counted toward the initiating task? */ + task_io_account_read(rdata->got_bytes); + cifs_stats_bytes_read(tcon, rdata->got_bytes); break; default: if (rdata->result != -ENODATA) @@@ -1754,12 -1762,11 +1754,12 @@@ int smb2_async_readv(struct cifs_readdata *rdata) { - int rc; + int rc, flags = 0; struct smb2_hdr *buf; struct cifs_io_parms io_parms; struct smb_rqst rqst = { .rq_iov = &rdata->iov, .rq_nvec = 1 }; + struct TCP_Server_Info *server;
cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n", __func__, rdata->offset, rdata->bytes); @@@ -1770,41 -1777,18 +1770,41 @@@ io_parms.persistent_fid = rdata->cfile->fid.persistent_fid; io_parms.volatile_fid = rdata->cfile->fid.volatile_fid; io_parms.pid = rdata->pid; + + server = io_parms.tcon->ses->server; + rc = smb2_new_read_req(&rdata->iov, &io_parms, 0, 0); - if (rc) + if (rc) { + if (rc == -EAGAIN && rdata->credits) { + /* credits was reseted by reconnect */ + rdata->credits = 0; + /* reduce in_flight value since we won't send the req */ + spin_lock(&server->req_lock); + server->in_flight--; + spin_unlock(&server->req_lock); + } return rc; + }
buf = (struct smb2_hdr *)rdata->iov.iov_base; /* 4 for rfc1002 length field */ rdata->iov.iov_len = get_rfc1002_length(rdata->iov.iov_base) + 4;
+ if (rdata->credits) { + buf->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes, + SMB2_MAX_BUFFER_SIZE)); + spin_lock(&server->req_lock); + server->credits += rdata->credits - + le16_to_cpu(buf->CreditCharge); + spin_unlock(&server->req_lock); + wake_up(&server->request_q); + flags = CIFS_HAS_CREDITS; + } + kref_get(&rdata->refcount); rc = cifs_call_async(io_parms.tcon->ses->server, &rqst, cifs_readv_receive, smb2_readv_callback, - rdata, 0); + rdata, flags); if (rc) { kref_put(&rdata->refcount, cifs_readdata_release); cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE); @@@ -1922,25 -1906,15 +1922,25 @@@ in smb2_async_writev(struct cifs_writedata *wdata, void (*release)(struct kref *kref)) { - int rc = -EACCES; + int rc = -EACCES, flags = 0; struct smb2_write_req *req = NULL; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; struct kvec iov; struct smb_rqst rqst;
rc = small_smb2_init(SMB2_WRITE, tcon, (void **) &req); - if (rc) + if (rc) { + if (rc == -EAGAIN && wdata->credits) { + /* credits was reseted by reconnect */ + wdata->credits = 0; + /* reduce in_flight value since we won't send the req */ + spin_lock(&server->req_lock); + server->in_flight--; + spin_unlock(&server->req_lock); + } goto async_writev_out; + }
req->hdr.ProcessId = cpu_to_le32(wdata->cfile->pid);
@@@ -1973,20 -1947,9 +1973,20 @@@
inc_rfc1001_len(&req->hdr, wdata->bytes - 1 /* Buffer */);
+ if (wdata->credits) { + req->hdr.CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes, + SMB2_MAX_BUFFER_SIZE)); + spin_lock(&server->req_lock); + server->credits += wdata->credits - + le16_to_cpu(req->hdr.CreditCharge); + spin_unlock(&server->req_lock); + wake_up(&server->request_q); + flags = CIFS_HAS_CREDITS; + } + kref_get(&wdata->refcount); - rc = cifs_call_async(tcon->ses->server, &rqst, NULL, - smb2_writev_callback, wdata, 0); + rc = cifs_call_async(server, &rqst, NULL, smb2_writev_callback, wdata, + flags);
if (rc) { kref_put(&wdata->refcount, release); diff --combined fs/namespace.c index b10db3d,2a1447c..019ff81 --- a/fs/namespace.c +++ b/fs/namespace.c @@@ -225,7 -225,6 +225,7 @@@ static struct mount *alloc_vfsmnt(cons INIT_LIST_HEAD(&mnt->mnt_share); INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); + INIT_LIST_HEAD(&mnt->mnt_mp_list); #ifdef CONFIG_FSNOTIFY INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); #endif @@@ -668,45 -667,11 +668,45 @@@ struct vfsmount *lookup_mnt(struct pat return m; }
-static struct mountpoint *new_mountpoint(struct dentry *dentry) +/* + * __is_local_mountpoint - Test to see if dentry is a mountpoint in the + * current mount namespace. + * + * The common case is dentries are not mountpoints at all and that + * test is handled inline. For the slow case when we are actually + * dealing with a mountpoint of some kind, walk through all of the + * mounts in the current mount namespace and test to see if the dentry + * is a mountpoint. + * + * The mount_hashtable is not usable in the context because we + * need to identify all mounts that may be in the current mount + * namespace not just a mount that happens to have some specified + * parent mount. + */ +bool __is_local_mountpoint(struct dentry *dentry) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + struct mount *mnt; + bool is_covered = false; + + if (!d_mountpoint(dentry)) + goto out; + + down_read(&namespace_sem); + list_for_each_entry(mnt, &ns->list, mnt_list) { + is_covered = (mnt->mnt_mountpoint == dentry); + if (is_covered) + break; + } + up_read(&namespace_sem); +out: + return is_covered; +} + +static struct mountpoint *lookup_mountpoint(struct dentry *dentry) { struct hlist_head *chain = mp_hash(dentry); struct mountpoint *mp; - int ret;
hlist_for_each_entry(mp, chain, m_hash) { if (mp->m_dentry == dentry) { @@@ -717,14 -682,6 +717,14 @@@ return mp; } } + return NULL; +} + +static struct mountpoint *new_mountpoint(struct dentry *dentry) +{ + struct hlist_head *chain = mp_hash(dentry); + struct mountpoint *mp; + int ret;
mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); if (!mp) @@@ -739,7 -696,6 +739,7 @@@ mp->m_dentry = dentry; mp->m_count = 1; hlist_add_head(&mp->m_hash, chain); + INIT_LIST_HEAD(&mp->m_list); return mp; }
@@@ -747,7 -703,6 +747,7 @@@ static void put_mountpoint(struct mount { if (!--mp->m_count) { struct dentry *dentry = mp->m_dentry; + BUG_ON(!list_empty(&mp->m_list)); spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_MOUNTED; spin_unlock(&dentry->d_lock); @@@ -794,7 -749,6 +794,7 @@@ static void detach_mnt(struct mount *mn mnt->mnt_mountpoint = mnt->mnt.mnt_root; list_del_init(&mnt->mnt_child); hlist_del_init_rcu(&mnt->mnt_hash); + list_del_init(&mnt->mnt_mp_list); put_mountpoint(mnt->mnt_mp); mnt->mnt_mp = NULL; } @@@ -811,7 -765,6 +811,7 @@@ void mnt_set_mountpoint(struct mount *m child_mnt->mnt_mountpoint = dget(mp->m_dentry); child_mnt->mnt_parent = mnt; child_mnt->mnt_mp = mp; + list_add_tail(&child_mnt->mnt_mp_list, &mp->m_list); }
/* @@@ -845,7 -798,7 +845,7 @@@ static void commit_tree(struct mount *m list_splice(&head, n->list.prev);
if (shadows) - hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash); + hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash); else hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mnt->mnt_mountpoint)); @@@ -983,25 -936,9 +983,25 @@@ static struct mount *clone_mnt(struct m return ERR_PTR(err); }
+static void cleanup_mnt(struct mount *mnt) +{ + fsnotify_vfsmount_delete(&mnt->mnt); + dput(mnt->mnt.mnt_root); + deactivate_super(mnt->mnt.mnt_sb); + mnt_free_id(mnt); + complete(mnt->mnt_undone); + call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); +} + +static void cleanup_mnt_work(struct work_struct *work) +{ + cleanup_mnt(container_of(work, struct mount, mnt_cleanup_work)); +} + static void mntput_no_expire(struct mount *mnt) { -put_again: + struct completion undone; + rcu_read_lock(); mnt_add_count(mnt, -1); if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */ @@@ -1015,15 -952,12 +1015,15 @@@ return; } if (unlikely(mnt->mnt_pinned)) { - mnt_add_count(mnt, mnt->mnt_pinned + 1); + init_completion(&undone); + mnt->mnt_undone = &undone; + mnt_add_count(mnt, mnt->mnt_pinned); mnt->mnt_pinned = 0; rcu_read_unlock(); unlock_mount_hash(); acct_auto_close_mnt(&mnt->mnt); - goto put_again; + wait_for_completion(&undone); + return; } if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { rcu_read_unlock(); @@@ -1047,19 -981,11 +1047,19 @@@ * so mnt_get_writers() below is safe. */ WARN_ON(mnt_get_writers(mnt)); - fsnotify_vfsmount_delete(&mnt->mnt); - dput(mnt->mnt.mnt_root); - deactivate_super(mnt->mnt.mnt_sb); - mnt_free_id(mnt); - call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); + /* The stack may be deep here, cleanup the mount on a work + * queue where the stack is guaranteed to be shallow. + */ + init_completion(&undone); + if (!mnt->mnt_undone) + mnt->mnt_undone = &undone; + else + complete(&undone); + + INIT_WORK(&mnt->mnt_cleanup_work, cleanup_mnt_work); + schedule_work(&mnt->mnt_cleanup_work); + + wait_for_completion(&undone); }
void mntput(struct vfsmount *mnt) @@@ -1335,7 -1261,6 +1335,7 @@@ void umount_tree(struct mount *mnt, in p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; list_del_init(&p->mnt_child); if (mnt_has_parent(p)) { + list_del_init(&p->mnt_mp_list); put_mountpoint(p->mnt_mp); /* move the reference to mountpoint into ->mnt_ex_mountpoint */ p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint; @@@ -1448,37 -1373,6 +1448,37 @@@ static int do_umount(struct mount *mnt return retval; }
+/* + * __detach_mounts - lazily unmount all mounts on the specified dentry + * + * During unlink, rmdir, and d_drop it is possible to loose the path + * to an existing mountpoint, and wind up leaking the mount. + * detach_mounts allows lazily unmounting those mounts instead of + * leaking them. + * + * The caller may hold dentry->d_inode->i_mutex. + */ +void __detach_mounts(struct dentry *dentry) +{ + struct mountpoint *mp; + struct mount *mnt; + + namespace_lock(); + mp = lookup_mountpoint(dentry); + if (!mp) + goto out_unlock; + + lock_mount_hash(); + while (!list_empty(&mp->m_list)) { + mnt = list_first_entry(&mp->m_list, struct mount, mnt_mp_list); + umount_tree(mnt, 2); + } + unlock_mount_hash(); + put_mountpoint(mp); +out_unlock: + namespace_unlock(); +} + /* * Is the caller allowed to modify his namespace? */ @@@ -1828,9 -1722,7 +1828,9 @@@ retry namespace_lock(); mnt = lookup_mnt(path); if (likely(!mnt)) { - struct mountpoint *mp = new_mountpoint(dentry); + struct mountpoint *mp = lookup_mountpoint(dentry); + if (!mp) + mp = new_mountpoint(dentry); if (IS_ERR(mp)) { namespace_unlock(); mutex_unlock(&dentry->d_inode->i_mutex); diff --combined fs/proc/base.c index e442784,35afb2e..f50d4be --- a/fs/proc/base.c +++ b/fs/proc/base.c @@@ -1625,6 -1625,7 +1625,6 @@@ int pid_revalidate(struct dentry *dentr put_task_struct(task); return 1; } - d_drop(dentry); return 0; }
@@@ -1761,6 -1762,9 +1761,6 @@@ out put_task_struct(task);
out_notask: - if (status <= 0) - d_drop(dentry); - return status; }
@@@ -2445,7 -2449,7 +2445,7 @@@ static int proc_tgid_io_accounting(stru
#ifdef CONFIG_USER_NS static int proc_id_map_open(struct inode *inode, struct file *file, - struct seq_operations *seq_ops) + const struct seq_operations *seq_ops) { struct user_namespace *ns = NULL; struct task_struct *task; @@@ -2672,7 -2676,8 +2672,7 @@@ static void proc_flush_task_mnt(struct /* no ->d_hash() rejects on procfs */ dentry = d_hash_and_lookup(mnt->mnt_root, &name); if (dentry) { - shrink_dcache_parent(dentry); - d_drop(dentry); + d_invalidate(dentry); dput(dentry); }
@@@ -2692,7 -2697,8 +2692,7 @@@ name.len = snprintf(buf, sizeof(buf), "%d", pid); dentry = d_hash_and_lookup(dir, &name); if (dentry) { - shrink_dcache_parent(dentry); - d_drop(dentry); + d_invalidate(dentry); dput(dentry); }
@@@ -2774,12 -2780,12 +2774,12 @@@ out
struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { - int result = 0; + int result = -ENOENT; struct task_struct *task; unsigned tgid; struct pid_namespace *ns;
- tgid = name_to_int(dentry); + tgid = name_to_int(&dentry->d_name); if (tgid == ~0U) goto out;
@@@ -3027,7 -3033,7 +3027,7 @@@ static struct dentry *proc_task_lookup( if (!leader) goto out_no_task;
- tid = name_to_int(dentry); + tid = name_to_int(&dentry->d_name); if (tid == ~0U) goto out;
diff --combined fs/proc/fd.c index eb82e9f,955bb55..e11d7c5 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@@ -129,6 -129,8 +129,6 @@@ static int tid_fd_revalidate(struct den } put_task_struct(task); } - - d_drop(dentry); return 0; }
@@@ -204,7 -206,7 +204,7 @@@ static struct dentry *proc_lookupfd_com { struct task_struct *task = get_proc_task(dir); int result = -ENOENT; - unsigned fd = name_to_int(dentry); + unsigned fd = name_to_int(&dentry->d_name);
if (!task) goto out_no_task; diff --combined include/linux/fs.h index 2daccaf,8b4a021..1ab6c69 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@@ -833,7 -833,7 +833,7 @@@ static inline struct file *get_file(str * * Lockd stuffs a "host" pointer into this. */ -typedef struct files_struct *fl_owner_t; +typedef void *fl_owner_t;
struct file_lock_operations { void (*fl_copy_lock)(struct file_lock *, struct file_lock *); @@@ -2688,7 -2688,7 +2688,7 @@@ static const struct file_operations __f .read = simple_attr_read, \ .write = simple_attr_write, \ .llseek = generic_file_llseek, \ - }; + }
static inline __printf(1, 2) void __simple_attr_check_format(const char *fmt, ...) diff --combined include/linux/kernel.h index a9e2268,44a498d..e989204 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@@ -470,6 -470,7 +470,7 @@@ extern enum system_states #define TAINT_FIRMWARE_WORKAROUND 11 #define TAINT_OOT_MODULE 12 #define TAINT_UNSIGNED_MODULE 13 + #define TAINT_SOFTLOCKUP 14
extern const char hex_asc[]; #define hex_asc_lo(x) hex_asc[((x) & 0x0f)] @@@ -493,15 -494,10 +494,10 @@@ static inline char *hex_byte_pack_upper return buf; }
extern int hex_to_bin(char ch); extern int __must_check hex2bin(u8 *dst, const char *src, size_t count);
-int mac_pton(const char *s, u8 *mac); +bool mac_pton(const char *s, u8 *mac);
/* * General tracing related utility functions - trace_printk(), @@@ -719,23 -715,8 +715,8 @@@ static inline void ftrace_dump(enum ftr (void) (&_max1 == &_max2); \ _max1 > _max2 ? _max1 : _max2; })
- #define min3(x, y, z) ({ \ - typeof(x) _min1 = (x); \ - typeof(y) _min2 = (y); \ - typeof(z) _min3 = (z); \ - (void) (&_min1 == &_min2); \ - (void) (&_min1 == &_min3); \ - _min1 < _min2 ? (_min1 < _min3 ? _min1 : _min3) : \ - (_min2 < _min3 ? _min2 : _min3); }) - - #define max3(x, y, z) ({ \ - typeof(x) _max1 = (x); \ - typeof(y) _max2 = (y); \ - typeof(z) _max3 = (z); \ - (void) (&_max1 == &_max2); \ - (void) (&_max1 == &_max3); \ - _max1 > _max2 ? (_max1 > _max3 ? _max1 : _max3) : \ - (_max2 > _max3 ? _max2 : _max3); }) + #define min3(x, y, z) min((typeof(x))min(x, y), z) + #define max3(x, y, z) max((typeof(x))max(x, y), z)
/** * min_not_zero - return the minimum that is _not_ zero, unless both are zero @@@ -750,20 -731,13 +731,13 @@@ /** * clamp - return a value clamped to a given range with strict typechecking * @val: current value - * @min: minimum allowable value - * @max: maximum allowable value + * @lo: lowest allowable value + * @hi: highest allowable value * * This macro does strict typechecking of min/max to make sure they are of the * same type as val. See the unnecessary pointer comparisons. */ - #define clamp(val, min, max) ({ \ - typeof(val) __val = (val); \ - typeof(min) __min = (min); \ - typeof(max) __max = (max); \ - (void) (&__val == &__min); \ - (void) (&__val == &__max); \ - __val = __val < __min ? __min: __val; \ - __val > __max ? __max: __val; }) + #define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)
/* * ..and if you can't take the strict diff --combined include/linux/sched.h index b39a671,e2f9b6c..2d231ae --- a/include/linux/sched.h +++ b/include/linux/sched.h @@@ -33,6 -33,7 +33,7 @@@ struct sched_param
#include <linux/smp.h> #include <linux/sem.h> + #include <linux/shm.h> #include <linux/signal.h> #include <linux/compiler.h> #include <linux/completion.h> @@@ -872,21 -873,21 +873,21 @@@ enum cpu_idle_type #define SD_NUMA 0x4000 /* cross-node balancing */
#ifdef CONFIG_SCHED_SMT -static inline const int cpu_smt_flags(void) +static inline int cpu_smt_flags(void) { return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES; } #endif
#ifdef CONFIG_SCHED_MC -static inline const int cpu_core_flags(void) +static inline int cpu_core_flags(void) { return SD_SHARE_PKG_RESOURCES; } #endif
#ifdef CONFIG_NUMA -static inline const int cpu_numa_flags(void) +static inline int cpu_numa_flags(void) { return SD_NUMA; } @@@ -999,7 -1000,7 +1000,7 @@@ void free_sched_domains(cpumask_var_t d bool cpus_share_cache(int this_cpu, int that_cpu);
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); -typedef const int (*sched_domain_flags_f)(void); +typedef int (*sched_domain_flags_f)(void);
#define SDTL_OVERLAP 0x01
@@@ -1270,6 -1271,9 +1271,6 @@@ struct task_struct #ifdef CONFIG_TREE_PREEMPT_RCU struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -#ifdef CONFIG_RCU_BOOST - struct rt_mutex *rcu_boost_mutex; -#endif /* #ifdef CONFIG_RCU_BOOST */
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; @@@ -1386,6 -1390,7 +1387,7 @@@ #ifdef CONFIG_SYSVIPC /* ipc stuff */ struct sysv_sem sysvsem; + struct sysv_shm sysvshm; #endif #ifdef CONFIG_DETECT_HUNG_TASK /* hung task detection */ @@@ -1631,12 -1636,6 +1633,6 @@@ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */ - struct memcg_batch_info { - int do_batch; /* incremented when batch uncharge started */ - struct mem_cgroup *memcg; /* target memcg of uncharge */ - unsigned long nr_pages; /* uncharged usage */ - unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ - } memcg_batch; unsigned int memcg_kmem_skip_account; struct memcg_oom_info { struct mem_cgroup *memcg; @@@ -2006,6 -2005,9 +2002,6 @@@ static inline void rcu_copy_process(str #ifdef CONFIG_TREE_PREEMPT_RCU p->rcu_blocked_node = NULL; #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -#ifdef CONFIG_RCU_BOOST - p->rcu_boost_mutex = NULL; -#endif /* #ifdef CONFIG_RCU_BOOST */ INIT_LIST_HEAD(&p->rcu_node_entry); }
@@@ -2957,15 -2959,10 +2953,10 @@@ static inline void inc_syscw(struct tas
#ifdef CONFIG_MEMCG extern void mm_update_next_owner(struct mm_struct *mm); - extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); #else static inline void mm_update_next_owner(struct mm_struct *mm) { } - - static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p) - { - } #endif /* CONFIG_MEMCG */
static inline unsigned long task_rlimit(const struct task_struct *tsk, diff --combined include/scsi/scsi.h index 91e2e42,d34cf2d..4b69139 --- a/include/scsi/scsi.h +++ b/include/scsi/scsi.h @@@ -31,7 -31,7 +31,7 @@@ enum scsi_timeouts * Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit * is totally arbitrary, a setting of 2048 will get you at least 8mb ios. */ - #ifdef ARCH_HAS_SG_CHAIN + #ifdef CONFIG_ARCH_HAS_SG_CHAIN #define SCSI_MAX_SG_CHAIN_SEGMENTS 2048 #else #define SCSI_MAX_SG_CHAIN_SEGMENTS SCSI_MAX_SG_SEGMENTS @@@ -385,7 -385,7 +385,7 @@@ struct scsi_lun #define SCSI_W_LUN_ACCESS_CONTROL (SCSI_W_LUN_BASE + 2) #define SCSI_W_LUN_TARGET_LOG_PAGE (SCSI_W_LUN_BASE + 3)
-static inline int scsi_is_wlun(unsigned int lun) +static inline int scsi_is_wlun(u64 lun) { return (lun & 0xff00) == SCSI_W_LUN_BASE; } diff --combined init/Kconfig index 85fb985,b66f859..1dfdd81 --- a/init/Kconfig +++ b/init/Kconfig @@@ -505,7 -505,7 +505,7 @@@ config PREEMPT_RC def_bool TREE_PREEMPT_RCU help This option enables preemptible-RCU code that is common between - the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations. + TREE_PREEMPT_RCU and, in the old days, TINY_PREEMPT_RCU.
config RCU_STALL_COMMON def_bool ( TREE_RCU || TREE_PREEMPT_RCU || RCU_TRACE ) @@@ -737,7 -737,7 +737,7 @@@ choic
config RCU_NOCB_CPU_NONE bool "No build_forced no-CBs CPUs" - depends on RCU_NOCB_CPU && !NO_HZ_FULL + depends on RCU_NOCB_CPU && !NO_HZ_FULL_ALL help This option does not force any of the CPUs to be no-CBs CPUs. Only CPUs designated by the rcu_nocbs= boot parameter will be @@@ -751,7 -751,7 +751,7 @@@
config RCU_NOCB_CPU_ZERO bool "CPU 0 is a build_forced no-CBs CPU" - depends on RCU_NOCB_CPU && !NO_HZ_FULL + depends on RCU_NOCB_CPU && !NO_HZ_FULL_ALL help This option forces CPU 0 to be a no-CBs CPU, so that its RCU callbacks are invoked by a per-CPU kthread whose name begins @@@ -783,8 -783,13 +783,13 @@@ endchoic
endmenu # "RCU Subsystem"
+ config BUILD_BIN2C + bool + default n + config IKCONFIG tristate "Kernel .config support" + select BUILD_BIN2C ---help--- This option enables the complete Linux kernel ".config" file contents to be saved in the kernel. It provides documentation @@@ -807,15 -812,53 +812,53 @@@ config LOG_BUF_SHIF range 12 21 default 17 help - Select kernel log buffer size as a power of 2. + Select the minimal kernel log buffer size as a power of 2. + The final size is affected by LOG_CPU_MAX_BUF_SHIFT config + parameter, see below. Any higher size also might be forced + by "log_buf_len" boot parameter. + Examples: - 17 => 128 KB + 17 => 128 KB 16 => 64 KB - 15 => 32 KB - 14 => 16 KB + 15 => 32 KB + 14 => 16 KB 13 => 8 KB 12 => 4 KB
+ config LOG_CPU_MAX_BUF_SHIFT + int "CPU kernel log buffer size contribution (13 => 8 KB, 17 => 128KB)" + range 0 21 + default 12 if !BASE_SMALL + default 0 if BASE_SMALL + help + This option allows to increase the default ring buffer size + according to the number of CPUs. The value defines the contribution + of each CPU as a power of 2. The used space is typically only few + lines however it might be much more when problems are reported, + e.g. backtraces. + + The increased size means that a new buffer has to be allocated and + the original static one is unused. It makes sense only on systems + with more CPUs. Therefore this value is used only when the sum of + contributions is greater than the half of the default kernel ring + buffer as defined by LOG_BUF_SHIFT. The default values are set + so that more than 64 CPUs are needed to trigger the allocation. + + Also this option is ignored when "log_buf_len" kernel parameter is + used as it forces an exact (power of two) size of the ring buffer. + + The number of possible CPUs is used for this computation ignoring + hotplugging making the compuation optimal for the the worst case + scenerio while allowing a simple algorithm to be used from bootup. + + Examples shift values and their meaning: + 17 => 128 KB for each CPU + 16 => 64 KB for each CPU + 15 => 32 KB for each CPU + 14 => 16 KB for each CPU + 13 => 8 KB for each CPU + 12 => 4 KB for each CPU + # # Architectures with an unreliable sched_clock() should select this: # @@@ -1264,77 -1307,6 +1307,77 @@@ config CC_OPTIMIZE_FOR_SIZ
If unsure, say N.
+config LTO_MENU + bool "Enable gcc link time optimization (LTO)" + # Only tested on X86 for now. For other architectures you likely + # have to fix some things first, like adding asmlinkages etc. + depends on X86 + # lto does not support excluding flags for specific files + # right now. Can be removed if that is fixed. + depends on !FUNCTION_TRACER + help + With this option gcc will do whole program optimizations for + the whole kernel and module. This increases compile time, but can + lead to better code. It allows gcc to inline functions between + different files and do other optimization. It might also trigger + bugs due to more aggressive optimization. It allows gcc to drop unused + code. On smaller monolithic kernel configurations + it usually leads to smaller kernels, especially when modules + are disabled. + + With this option gcc will also do some global checking over + different source files. It also disables a number of kernel + features. + + This option is recommended for release builds. With LTO + the kernel always has to be re-optimized (but not re-parsed) + on each build. + + This requires a gcc 4.8 or later compiler and + Linux binutils 2.21.51.0.3 or later. gcc 4.9 builds significantly + faster than 4.8 It does not currently work with a FSF release of + binutils or with the gold linker. + + On larger configurations this may need more than 4GB of RAM. + It will likely not work on those with a 32bit compiler. + + When the toolchain support is not available this will (hopefully) + be automatically disabled. + + For more information see Documentation/lto-build + +config LTO_DISABLE + bool "Disable LTO again" + depends on LTO_MENU + default n + help + This option is merely here so that allyesconfig or allmodconfig do + not enable LTO. If you want to actually use LTO do not enable. + +config LTO + bool + default y + depends on LTO_MENU && !LTO_DISABLE + +config LTO_DEBUG + bool "Enable LTO compile time debugging" + depends on LTO + help + Enable LTO debugging in the compiler. The compiler dumps + some log files that make it easier to figure out LTO + behavior. The log files also allow to reconstruct + the global inlining and a global callgraph. + They however add some (single threaded) cost to the + compilation. When in doubt do not enable. + +config LTO_CP_CLONE + bool "Allow aggressive cloning for function specialization" + depends on LTO + help + Allow the compiler to clone and specialize functions for specific + arguments when it determines these arguments are very commonly + called. Experimential. Will increase text size. + config SYSCTL bool
@@@ -1834,8 -1806,6 +1877,8 @@@ config MODULE_FORCE_UNLOA
config MODVERSIONS bool "Module versioning support" + # LTO should work with gcc 4.9 + depends on !LTO help Usually, you have to use modules compiled with your kernel. Saying Y here makes it sometimes possible to use modules diff --combined kernel/Makefile index 973a40c,9b07bb7..de62ac0 --- a/kernel/Makefile +++ b/kernel/Makefile @@@ -3,11 -3,12 +3,11 @@@ #
obj-y = fork.o exec_domain.o panic.o \ - cpu.o exit.o itimer.o time.o softirq.o resource.o \ - sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ + cpu.o exit.o softirq.o resource.o \ + sysctl.o sysctl_binary.o capability.o ptrace.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ - extable.o params.o posix-timers.o \ - kthread.o sys_ni.o posix-cpu-timers.o \ - hrtimer.o nsproxy.o \ + extable.o params.o \ + kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o groups.o smpboot.o
@@@ -104,11 -105,27 +104,11 @@@ targets += config_data.g $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip)
- filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") + filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;") targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call filechk,ikconfiggz)
-$(obj)/time.o: $(obj)/timeconst.h - -quiet_cmd_hzfile = HZFILE $@ - cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@ - -targets += hz.bc -$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE - $(call if_changed,hzfile) - -quiet_cmd_bc = BC $@ - cmd_bc = bc -q $(filter-out FORCE,$^) > $@ - -targets += timeconst.h -$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE - $(call if_changed,bc) - ############################################################################### # # Roll all the X.509 certificates that we can find together and pull them into diff --combined kernel/fork.c index 8f54193,735ea98..4e6a5c1 --- a/kernel/fork.c +++ b/kernel/fork.c @@@ -365,12 -365,11 +365,11 @@@ static int dup_mmap(struct mm_struct *m */ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
- mm->locked_vm = 0; - mm->mmap = NULL; - mm->vmacache_seqnum = 0; - mm->map_count = 0; - cpumask_clear(mm_cpumask(mm)); - mm->mm_rb = RB_ROOT; + mm->total_vm = oldmm->total_vm; + mm->shared_vm = oldmm->shared_vm; + mm->exec_vm = oldmm->exec_vm; + mm->stack_vm = oldmm->stack_vm; + rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; @@@ -527,19 -526,37 +526,37 @@@ static void mm_init_aio(struct mm_struc #endif }
+ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) + { + #ifdef CONFIG_MEMCG + mm->owner = p; + #endif + } + static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) { + mm->mmap = NULL; + mm->mm_rb = RB_ROOT; + mm->vmacache_seqnum = 0; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); + mm->map_count = 0; + mm->locked_vm = 0; + mm->pinned_vm = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); + mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); + mmu_notifier_mm_init(mm); clear_tlb_flush_pending(mm); + #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS + mm->pmd_huge_pte = NULL; + #endif
if (current->mm) { mm->flags = current->mm->flags & MMF_INIT_MASK; @@@ -549,11 -566,17 +566,17 @@@ mm->def_flags = 0; }
- if (likely(!mm_alloc_pgd(mm))) { - mmu_notifier_mm_init(mm); - return mm; - } + if (mm_alloc_pgd(mm)) + goto fail_nopgd; + + if (init_new_context(p, mm)) + goto fail_nocontext; + + return mm;
+ fail_nocontext: + mm_free_pgd(mm); + fail_nopgd: free_mm(mm); return NULL; } @@@ -587,7 -610,6 +610,6 @@@ struct mm_struct *mm_alloc(void return NULL;
memset(mm, 0, sizeof(*mm)); - mm_init_cpumask(mm); return mm_init(mm, current); }
@@@ -819,17 -841,10 +841,10 @@@ static struct mm_struct *dup_mm(struct goto fail_nomem;
memcpy(mm, oldmm, sizeof(*mm)); - mm_init_cpumask(mm);
if (!mm_init(mm, tsk)) goto fail_nomem;
- if (init_new_context(tsk, mm)) - goto fail_nocontext; - dup_mm_exe_file(oldmm, mm);
err = dup_mmap(mm, oldmm); @@@ -851,15 -866,6 +866,6 @@@ free_pt
fail_nomem: return NULL; - - fail_nocontext: - /* - * If init_new_context() failed, we cannot use mmput() to free the mm - * because it calls destroy_context() - */ - mm_free_pgd(mm); - free_mm(mm); - return NULL; }
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) @@@ -1099,13 -1105,6 +1105,6 @@@ static void rt_mutex_init_task(struct t #endif }
- #ifdef CONFIG_MEMCG - void mm_init_owner(struct mm_struct *mm, struct task_struct *p) - { - mm->owner = p; - } - #endif /* CONFIG_MEMCG */ - /* * Initialize POSIX timer handling for a single task. */ @@@ -1262,7 -1261,7 +1261,7 @@@ static struct task_struct *copy_process
posix_cpu_timers_init(p);
- do_posix_clock_monotonic_gettime(&p->start_time); + ktime_get_ts(&p->start_time); p->real_start_time = p->start_time; monotonic_to_bootbased(&p->real_start_time); p->io_context = NULL; @@@ -1307,10 -1306,6 +1306,6 @@@ #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif - #ifdef CONFIG_MEMCG - p->memcg_batch.do_batch = 0; - p->memcg_batch.memcg = NULL; - #endif #ifdef CONFIG_BCACHE p->sequential_io = 0; p->sequential_io_avg = 0; @@@ -1328,6 -1323,7 +1323,7 @@@ if (retval) goto bad_fork_cleanup_policy; /* copy all the process information */ + shm_init_task(p); retval = copy_semundo(clone_flags, p); if (retval) goto bad_fork_cleanup_audit; @@@ -1873,6 -1869,11 +1869,11 @@@ SYSCALL_DEFINE1(unshare, unsigned long */ exit_sem(current); } + if (unshare_flags & CLONE_NEWIPC) { + /* Orphan segments in old ns (see sem above). */ + exit_shm(current); + shm_init_task(current); + }
if (new_nsproxy) switch_task_namespaces(current, new_nsproxy); diff --combined mm/memcontrol.c index 45c10c6,0a6e035..3cb7b4c --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@@ -754,9 -754,11 +754,11 @@@ static void __mem_cgroup_remove_exceede static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, struct mem_cgroup_tree_per_zone *mctz) { - spin_lock(&mctz->lock); + unsigned long flags; + + spin_lock_irqsave(&mctz->lock, flags); __mem_cgroup_remove_exceeded(mz, mctz); - spin_unlock(&mctz->lock); + spin_unlock_irqrestore(&mctz->lock, flags); }
@@@ -779,7 -781,9 +781,9 @@@ static void mem_cgroup_update_tree(stru * mem is over its softlimit. */ if (excess || mz->on_tree) { - spin_lock(&mctz->lock); + unsigned long flags; + + spin_lock_irqsave(&mctz->lock, flags); /* if on-tree, remove it */ if (mz->on_tree) __mem_cgroup_remove_exceeded(mz, mctz); @@@ -788,7 -792,7 +792,7 @@@ * If excess is 0, no tree ops. */ __mem_cgroup_insert_exceeded(mz, mctz, excess); - spin_unlock(&mctz->lock); + spin_unlock_irqrestore(&mctz->lock, flags); } } } @@@ -839,9 -843,9 +843,9 @@@ mem_cgroup_largest_soft_limit_node(stru { struct mem_cgroup_per_zone *mz;
- spin_lock(&mctz->lock); + spin_lock_irq(&mctz->lock); mz = __mem_cgroup_largest_soft_limit_node(mctz); - spin_unlock(&mctz->lock); + spin_unlock_irq(&mctz->lock); return mz; }
@@@ -882,13 -886,6 +886,6 @@@ static long mem_cgroup_read_stat(struc return val; }
- static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, - bool charge) - { - int val = (charge) ? 1 : -1; - this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); - } - static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, enum mem_cgroup_events_index idx) { @@@ -909,13 -906,13 +906,13 @@@
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, struct page *page, - bool anon, int nr_pages) + int nr_pages) { /* * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is * counted as CACHE even if it's on ANON LRU. */ - if (anon) + if (PageAnon(page)) __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); else @@@ -1013,7 -1010,6 +1010,6 @@@ static bool mem_cgroup_event_ratelimit( */ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) { - preempt_disable(); /* threshold event is triggered in finer grain than soft limit */ if (unlikely(mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_THRESH))) { @@@ -1026,8 -1022,6 +1022,6 @@@ do_numainfo = mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_NUMAINFO); #endif - preempt_enable(); - mem_cgroup_threshold(memcg); if (unlikely(do_softlimit)) mem_cgroup_update_tree(memcg, page); @@@ -1035,8 -1029,7 +1029,7 @@@ if (unlikely(do_numainfo)) atomic_inc(&memcg->numainfo_events); #endif - } else - preempt_enable(); + } }
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) @@@ -1347,20 -1340,6 +1340,6 @@@ out return lruvec; }
- /* - * Following LRU functions are allowed to be used without PCG_LOCK. - * Operations are called by routine of global LRU independently from memcg. - * What we have to take care of here is validness of pc->mem_cgroup. - * - * Changes to pc->mem_cgroup happens when - * 1. charge - * 2. moving account - * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. - * It is added to LRU before charge. - * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. - * When moving account, the page is not on LRU. It's isolated. - */ - /** * mem_cgroup_page_lruvec - return lruvec for adding an lru page * @page: the page @@@ -2261,22 -2240,14 +2240,14 @@@ cleanup * * Notes: Race condition * - * We usually use lock_page_cgroup() for accessing page_cgroup member but - * it tends to be costly. But considering some conditions, we doesn't need - * to do so _always_. + * Charging occurs during page instantiation, while the page is + * unmapped and locked in page migration, or while the page table is + * locked in THP migration. No race is possible. * - * Considering "charge", lock_page_cgroup() is not required because all - * file-stat operations happen after a page is attached to radix-tree. There - * are no race with "charge". + * Uncharge happens to pages with zero references, no race possible. * - * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup - * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even - * if there are race with "uncharge". Statistics itself is properly handled - * by flags. - * - * Considering "move", this is an only case we see a race. To make the race - * small, we check memcg->moving_account and detect there are possibility - * of race or not. If there is, we take a lock. + * Charge moving between groups is protected by checking mm->moving + * account and taking the move_lock in the slowpath. */
void __mem_cgroup_begin_update_page_stat(struct page *page, @@@ -2551,55 -2522,63 +2522,63 @@@ static int memcg_cpu_hotplug_callback(s return NOTIFY_OK; }
- - /* See mem_cgroup_try_charge() for details */ - enum { - CHARGE_OK, /* success */ - CHARGE_RETRY, /* need to retry but retry is not bad */ - CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ - CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ - }; - - static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, - unsigned int nr_pages, unsigned int min_pages, - bool invoke_oom) + static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, + unsigned int nr_pages) { - unsigned long csize = nr_pages * PAGE_SIZE; + unsigned int batch = max(CHARGE_BATCH, nr_pages); + int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem_over_limit; struct res_counter *fail_res; + unsigned long nr_reclaimed; unsigned long flags = 0; - int ret; + unsigned long long size; + int ret = 0;
- ret = res_counter_charge(&memcg->res, csize, &fail_res); + retry: + if (consume_stock(memcg, nr_pages)) + goto done;
- if (likely(!ret)) { + size = batch * PAGE_SIZE; + if (!res_counter_charge(&memcg->res, size, &fail_res)) { if (!do_swap_account) - return CHARGE_OK; - ret = res_counter_charge(&memcg->memsw, csize, &fail_res); - if (likely(!ret)) - return CHARGE_OK; - - res_counter_uncharge(&memcg->res, csize); + goto done_restock; + if (!res_counter_charge(&memcg->memsw, size, &fail_res)) + goto done_restock; + res_counter_uncharge(&memcg->res, size); mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); flags |= MEM_CGROUP_RECLAIM_NOSWAP; } else mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); + + if (batch > nr_pages) { + batch = nr_pages; + goto retry; + } + /* - * Never reclaim on behalf of optional batching, retry with a - * single page instead. + * Unlike in global OOM situations, memcg is not in a physical + * memory shortage. Allow dying and OOM-killed tasks to + * bypass the last charges so that they can exit quickly and + * free their memory. */ - if (nr_pages > min_pages) - return CHARGE_RETRY; + if (unlikely(test_thread_flag(TIF_MEMDIE) || + fatal_signal_pending(current) || + current->flags & PF_EXITING)) + goto bypass; + + if (unlikely(task_in_memcg_oom(current))) + goto nomem;
if (!(gfp_mask & __GFP_WAIT)) - return CHARGE_WOULDBLOCK; + goto nomem;
- if (gfp_mask & __GFP_NORETRY) - return CHARGE_NOMEM; + nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
- ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); - if (mem_cgroup_margin(mem_over_limit) >= nr_pages) - return CHARGE_RETRY; + if (mem_cgroup_margin(mem_over_limit) >= batch) + goto retry; + + if (gfp_mask & __GFP_NORETRY) + goto nomem; /* * Even though the limit is exceeded at this point, reclaim * may have been able to free some pages. Retry the charge @@@ -2609,142 -2588,47 +2588,47 @@@ * unlikely to succeed so close to the limit, and we fall back * to regular pages anyway in case of failure. */ - if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) - return CHARGE_RETRY; - + if (nr_reclaimed && batch <= (1 << PAGE_ALLOC_COSTLY_ORDER)) + goto retry; /* * At task move, charge accounts can be doubly counted. So, it's * better to wait until the end of task_move if something is going on. */ if (mem_cgroup_wait_acct_move(mem_over_limit)) - return CHARGE_RETRY; - - if (invoke_oom) - mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize)); - - return CHARGE_NOMEM; - } - - /** - * mem_cgroup_try_charge - try charging a memcg - * @memcg: memcg to charge - * @nr_pages: number of pages to charge - * @oom: trigger OOM if reclaim fails - * - * Returns 0 if @memcg was charged successfully, -EINTR if the charge - * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed. - */ - static int mem_cgroup_try_charge(struct mem_cgroup *memcg, - gfp_t gfp_mask, - unsigned int nr_pages, - bool oom) - { - unsigned int batch = max(CHARGE_BATCH, nr_pages); - int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; - int ret; - - if (mem_cgroup_is_root(memcg)) - goto done; - /* - * Unlike in global OOM situations, memcg is not in a physical - * memory shortage. Allow dying and OOM-killed tasks to - * bypass the last charges so that they can exit quickly and - * free their memory. - */ - if (unlikely(test_thread_flag(TIF_MEMDIE) || - fatal_signal_pending(current) || - current->flags & PF_EXITING)) - goto bypass; + goto retry;
- if (unlikely(task_in_memcg_oom(current))) - goto nomem; + if (nr_retries--) + goto retry;
if (gfp_mask & __GFP_NOFAIL) - oom = false; - again: - if (consume_stock(memcg, nr_pages)) - goto done; - - do { - bool invoke_oom = oom && !nr_oom_retries; - - /* If killed, bypass charge */ - if (fatal_signal_pending(current)) - goto bypass; + goto bypass;
- ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, - nr_pages, invoke_oom); - switch (ret) { - case CHARGE_OK: - break; - case CHARGE_RETRY: /* not in OOM situation but retry */ - batch = nr_pages; - goto again; - case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ - goto nomem; - case CHARGE_NOMEM: /* OOM routine works */ - if (!oom || invoke_oom) - goto nomem; - nr_oom_retries--; - break; - } - } while (ret != CHARGE_OK); + if (fatal_signal_pending(current)) + goto bypass;
- if (batch > nr_pages) - refill_stock(memcg, batch - nr_pages); - done: - return 0; + mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(batch)); nomem: if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; bypass: - return -EINTR; - } - - /** - * mem_cgroup_try_charge_mm - try charging a mm - * @mm: mm_struct to charge - * @nr_pages: number of pages to charge - * @oom: trigger OOM if reclaim fails - * - * Returns the charged mem_cgroup associated with the given mm_struct or - * NULL the charge failed. - */ - static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, - gfp_t gfp_mask, - unsigned int nr_pages, - bool oom) - - { - struct mem_cgroup *memcg; - int ret; - - memcg = get_mem_cgroup_from_mm(mm); - ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom); - css_put(&memcg->css); - if (ret == -EINTR) - memcg = root_mem_cgroup; - else if (ret) - memcg = NULL; + memcg = root_mem_cgroup; + ret = -EINTR; + goto retry;
- return memcg; + done_restock: + if (batch > nr_pages) + refill_stock(memcg, batch - nr_pages); + done: + return ret; }
- /* - * Somemtimes we have to undo a charge we got by try_charge(). - * This function is for that and do uncharge, put css's refcnt. - * gotten by try_charge(). - */ - static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, - unsigned int nr_pages) + static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { - if (!mem_cgroup_is_root(memcg)) { - unsigned long bytes = nr_pages * PAGE_SIZE; + unsigned long bytes = nr_pages * PAGE_SIZE;
- res_counter_uncharge(&memcg->res, bytes); - if (do_swap_account) - res_counter_uncharge(&memcg->memsw, bytes); - } + res_counter_uncharge(&memcg->res, bytes); + if (do_swap_account) + res_counter_uncharge(&memcg->memsw, bytes); }
/* @@@ -2756,9 -2640,6 +2640,6 @@@ static void __mem_cgroup_cancel_local_c { unsigned long bytes = nr_pages * PAGE_SIZE;
- if (mem_cgroup_is_root(memcg)) - return; - res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); if (do_swap_account) res_counter_uncharge_until(&memcg->memsw, @@@ -2779,6 -2660,16 +2660,16 @@@ static struct mem_cgroup *mem_cgroup_lo return mem_cgroup_from_id(id); }
+ /* + * try_get_mem_cgroup_from_page - look up page's memcg association + * @page: the page + * + * Look up, get a css reference, and return the memcg that owns @page. + * + * The page must be locked to prevent racing with swap-in and page + * cache charges. If coming from an unlocked page table, the caller + * must ensure the page is on the LRU or this can race with charging. + */ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) { struct mem_cgroup *memcg = NULL; @@@ -2789,7 -2680,6 +2680,6 @@@ VM_BUG_ON_PAGE(!PageLocked(page), page);
pc = lookup_page_cgroup(page); - lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { memcg = pc->mem_cgroup; if (memcg && !css_tryget_online(&memcg->css)) @@@ -2803,23 -2693,17 +2693,17 @@@ memcg = NULL; rcu_read_unlock(); } return memcg; }
- static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, - struct page *page, - unsigned int nr_pages, - enum charge_type ctype, - bool lrucare) + static void commit_charge(struct page *page, struct mem_cgroup *memcg, + unsigned int nr_pages, bool lrucare) { struct page_cgroup *pc = lookup_page_cgroup(page); struct zone *uninitialized_var(zone); - struct lruvec *lruvec; bool was_on_lru = false; - bool anon; + struct lruvec *lruvec;
- lock_page_cgroup(pc); VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); /* * we don't need page_cgroup_lock about tail pages, becase they are not @@@ -2841,16 -2725,22 +2725,22 @@@ } }
- pc->mem_cgroup = memcg; /* - * We access a page_cgroup asynchronously without lock_page_cgroup(). - * Especially when a page_cgroup is taken from a page, pc->mem_cgroup - * is accessed after testing USED bit. To make pc->mem_cgroup visible - * before USED bit, we need memory barrier here. - * See mem_cgroup_add_lru_list(), etc. + * Nobody should be changing or seriously looking at + * pc->mem_cgroup and pc->flags at this point: + * + * - the page is uncharged + * + * - the page is off-LRU + * + * - an anonymous fault has exclusive page access, except for + * a locked page table + * + * - a page cache insertion, a swapin fault, or a migration + * have the page locked */ - smp_wmb(); - SetPageCgroupUsed(pc); + pc->mem_cgroup = memcg; + pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0);
if (lrucare) { if (was_on_lru) { @@@ -2862,20 -2752,15 +2752,15 @@@ spin_unlock_irq(&zone->lru_lock); }
- if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON) - anon = true; - else - anon = false; - - mem_cgroup_charge_statistics(memcg, page, anon, nr_pages); - unlock_page_cgroup(pc); - + local_irq_disable(); + mem_cgroup_charge_statistics(memcg, page, nr_pages); /* * "charge_statistics" updated event counter. Then, check it. * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. * if they exceeds softlimit. */ memcg_check_events(memcg, page); + local_irq_enable(); }
static DEFINE_MUTEX(set_limit_mutex); @@@ -2937,22 -2822,21 +2822,21 @@@ static int memcg_charge_kmem(struct mem if (ret) return ret;
- ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT, - oom_gfp_allowed(gfp)); + ret = try_charge(memcg, gfp, size >> PAGE_SHIFT); if (ret == -EINTR) { /* - * mem_cgroup_try_charge() chosed to bypass to root due to - * OOM kill or fatal signal. Since our only options are to - * either fail the allocation or charge it to this cgroup, do - * it as a temporary condition. But we can't fail. From a - * kmem/slab perspective, the cache has already been selected, - * by mem_cgroup_kmem_get_cache(), so it is too late to change + * try_charge() chose to bypass to root due to OOM kill or + * fatal signal. Since our only options are to either fail + * the allocation or charge it to this cgroup, do it as a + * temporary condition. But we can't fail. From a kmem/slab + * perspective, the cache has already been selected, by + * mem_cgroup_kmem_get_cache(), so it is too late to change * our minds. * * This condition will only trigger if the task entered - * memcg_charge_kmem in a sane state, but was OOM-killed during - * mem_cgroup_try_charge() above. Tasks that were already - * dying when the allocation triggers should have been already + * memcg_charge_kmem in a sane state, but was OOM-killed + * during try_charge() above. Tasks that were already dying + * when the allocation triggers should have been already * directed to the root cgroup in memcontrol.h */ res_counter_charge_nofail(&memcg->res, size, &fail_res); @@@ -3076,6 -2960,8 +2960,8 @@@ int memcg_update_cache_size(struct kmem return 0; }
+ static void memcg_unregister_cache_func(struct work_struct *work); + int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, struct kmem_cache *root_cache) { @@@ -3097,6 -2983,9 +2983,9 @@@ if (memcg) { s->memcg_params->memcg = memcg; s->memcg_params->root_cache = root_cache; + atomic_long_set(&s->memcg_params->refcnt, 1); + INIT_WORK(&s->memcg_params->unregister_work, + memcg_unregister_cache_func); css_get(&memcg->css); } else s->memcg_params->is_root_cache = true; @@@ -3178,6 -3067,25 +3067,25 @@@ static void memcg_unregister_cache(stru kmem_cache_destroy(cachep); }
+ static void memcg_unregister_cache_func(struct work_struct *work) + { + struct memcg_cache_params *params = + container_of(work, struct memcg_cache_params, unregister_work); + struct kmem_cache *cachep = memcg_params_to_cache(params); + + mutex_lock(&memcg_slab_mutex); + memcg_unregister_cache(cachep); + mutex_unlock(&memcg_slab_mutex); + } + + static void memcg_unregister_cache_rcu_func(struct rcu_head *rcu) + { + struct memcg_cache_params *params = + container_of(rcu, struct memcg_cache_params, rcu_head); + + schedule_work(¶ms->unregister_work); + } + /* * During the creation a new cache, we need to disable our accounting mechanism * altogether. This is true even if we are not creating, but rather just @@@ -3233,6 -3141,7 +3141,7 @@@ static void memcg_unregister_all_caches { struct kmem_cache *cachep; struct memcg_cache_params *params, *tmp; + LIST_HEAD(empty_caches);
if (!memcg_kmem_is_active(memcg)) return; @@@ -3240,9 -3149,31 +3149,31 @@@ mutex_lock(&memcg_slab_mutex); list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { cachep = memcg_params_to_cache(params); + + memcg_cache_mark_dead(cachep); kmem_cache_shrink(cachep); - if (atomic_read(&cachep->memcg_params->nr_pages) == 0) - memcg_unregister_cache(cachep); + + if (atomic_long_dec_and_test(&cachep->memcg_params->refcnt)) + list_move(&cachep->memcg_params->list, &empty_caches); + } + + /* + * kmem_cache_free doesn't expect that the cache can be destroyed as + * soon as the object is freed, e.g. SLUB's implementation may want to + * update cache stats after putting the object to the free list. + * + * Therefore we should wait for all kmem_cache_free's to finish before + * proceeding to cache destruction. Since both SLAB and SLUB versions + * of kmem_cache_free are non-preemptable, we wait for rcu-sched grace + * period to elapse. + */ + synchronize_sched(); + + while (!list_empty(&empty_caches)) { + params = list_first_entry(&empty_caches, + struct memcg_cache_params, list); + cachep = memcg_params_to_cache(params); + memcg_unregister_cache(cachep); } mutex_unlock(&memcg_slab_mutex); } @@@ -3315,14 -3246,18 +3246,18 @@@ int __memcg_charge_slab(struct kmem_cac res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, PAGE_SIZE << order); if (!res) - atomic_add(1 << order, &cachep->memcg_params->nr_pages); + atomic_long_inc(&cachep->memcg_params->refcnt); return res; }
void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) { memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order); - atomic_sub(1 << order, &cachep->memcg_params->nr_pages); + + if (unlikely(atomic_long_dec_and_test(&cachep->memcg_params->refcnt))) + /* see memcg_unregister_all_caches */ + call_rcu_sched(&cachep->memcg_params->rcu_head, + memcg_unregister_cache_rcu_func); }
/* @@@ -3463,12 -3398,13 +3398,13 @@@ void __memcg_kmem_commit_charge(struct memcg_uncharge_kmem(memcg, PAGE_SIZE << order); return; } - + /* + * The page is freshly allocated and not visible to any + * outside callers yet. Set up pc non-atomically. + */ pc = lookup_page_cgroup(page); - lock_page_cgroup(pc); pc->mem_cgroup = memcg; - SetPageCgroupUsed(pc); - unlock_page_cgroup(pc); + pc->flags = PCG_USED; }
void __memcg_kmem_uncharge_pages(struct page *page, int order) @@@ -3478,19 -3414,11 +3414,11 @@@
pc = lookup_page_cgroup(page); if (!PageCgroupUsed(pc)) return;
- lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - ClearPageCgroupUsed(pc); - } - unlock_page_cgroup(pc); + memcg = pc->mem_cgroup; + pc->flags = 0;
/* * We trust that only if there is a memcg associated with the page, it @@@ -3510,7 -3438,6 +3438,6 @@@ static inline void memcg_unregister_all
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) /* * Because tail pages are not marked as "used", set it. We're under * zone->lru_lock, 'splitting on pmd' and compound_lock. @@@ -3531,8 -3458,7 +3458,7 @@@ void mem_cgroup_split_huge_fixup(struc for (i = 1; i < HPAGE_PMD_NR; i++) { pc = head_pc + i; pc->mem_cgroup = memcg; - smp_wmb();/* see __commit_charge() */ - pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; + pc->flags = head_pc->flags; } __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], HPAGE_PMD_NR); @@@ -3562,7 -3488,6 +3488,6 @@@ static int mem_cgroup_move_account(stru { unsigned long flags; int ret; - bool anon = PageAnon(page);
VM_BUG_ON(from == to); VM_BUG_ON_PAGE(PageLRU(page), page); @@@ -3576,15 -3501,13 +3501,13 @@@ if (nr_pages > 1 && !PageTransHuge(page)) goto out;
- lock_page_cgroup(pc); - ret = -EINVAL; if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) - goto unlock; + goto out;
move_lock_mem_cgroup(from, &flags);
- if (!anon && page_mapped(page)) { + if (!PageAnon(page) && page_mapped(page)) { __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], nr_pages); __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], @@@ -3598,20 -3521,23 +3521,23 @@@ nr_pages); }
- mem_cgroup_charge_statistics(from, page, anon, -nr_pages); + /* + * It is safe to change pc->mem_cgroup here because the page + * is referenced, charged, and isolated - we can't race with + * uncharging, charging, migration, or LRU putback. + */
/* caller should have done css_get */ pc->mem_cgroup = to; - mem_cgroup_charge_statistics(to, page, anon, nr_pages); move_unlock_mem_cgroup(from, &flags); ret = 0; - unlock: - unlock_page_cgroup(pc); - /* - * check events - */ + + local_irq_disable(); + mem_cgroup_charge_statistics(to, page, nr_pages); memcg_check_events(to, page); + mem_cgroup_charge_statistics(from, page, -nr_pages); memcg_check_events(from, page); + local_irq_enable(); out: return ret; } @@@ -3682,686 -3608,73 +3608,73 @@@ out return ret; }
- int mem_cgroup_charge_anon(struct page *page, - struct mm_struct *mm, gfp_t gfp_mask) + #ifdef CONFIG_MEMCG_SWAP + static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, + bool charge) { - unsigned int nr_pages = 1; - struct mem_cgroup *memcg; - bool oom = true; + int val = (charge) ? 1 : -1; + this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); + }
- if (mem_cgroup_disabled()) - return 0; + /** + * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. + * @entry: swap entry to be moved + * @from: mem_cgroup which the entry is moved from + * @to: mem_cgroup which the entry is moved to + * + * It succeeds only when the swap_cgroup's record for this entry is the same + * as the mem_cgroup's id of @from. + * + * Returns 0 on success, -EINVAL on failure. + * + * The caller must have charged to @to, IOW, called res_counter_charge() about + * both res and memsw, and called css_get(). + */ + static int mem_cgroup_move_swap_account(swp_entry_t entry, + struct mem_cgroup *from, struct mem_cgroup *to) + { + unsigned short old_id, new_id;
- VM_BUG_ON_PAGE(page_mapped(page), page); - VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); - VM_BUG_ON(!mm); + old_id = mem_cgroup_id(from); + new_id = mem_cgroup_id(to);
- if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); + if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { + mem_cgroup_swap_statistics(from, false); + mem_cgroup_swap_statistics(to, true); /* - * Never OOM-kill a process for a huge page. The - * fault handler will fall back to regular pages. + * This function is only called from task migration context now. + * It postpones res_counter and refcount handling till the end + * of task migration(mem_cgroup_clear_mc()) for performance + * improvement. But we cannot postpone css_get(to) because if + * the process that has been moved to @to does swap-in, the + * refcount of @to might be decreased to 0. + * + * We are in attach() phase, so the cgroup is guaranteed to be + * alive, so we can just call css_get(). */ - oom = false; + css_get(&to->css); + return 0; } - - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom); - if (!memcg) - return -ENOMEM; - __mem_cgroup_commit_charge(memcg, page, nr_pages, - MEM_CGROUP_CHARGE_TYPE_ANON, false); - return 0; + return -EINVAL; + } + #else + static inline int mem_cgroup_move_swap_account(swp_entry_t entry, + struct mem_cgroup *from, struct mem_cgroup *to) + { + return -EINVAL; } + #endif
- /* - * While swap-in, try_charge -> commit or cancel, the page is locked. - * And when try_charge() successfully returns, one refcnt to memcg without - * struct page_cgroup is acquired. This refcnt will be consumed by - * "commit()" or removed by "cancel()" - */ - static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, - struct page *page, - gfp_t mask, - struct mem_cgroup **memcgp) + #ifdef CONFIG_DEBUG_VM + static struct page_cgroup *lookup_page_cgroup_used(struct page *page) { - struct mem_cgroup *memcg = NULL; struct page_cgroup *pc;
pc = lookup_page_cgroup(page); /* - * Every swap fault against a single page tries to charge the - * page, bail as early as possible. shmem_unuse() encounters - * already charged pages, too. The USED bit is protected by - * the page lock, which serializes swap cache removal, which - * in turn serializes uncharging. - */ - if (PageCgroupUsed(pc)) - goto out; - if (do_swap_account) - memcg = try_get_mem_cgroup_from_page(page); - if (!memcg) - memcg = get_mem_cgroup_from_mm(mm); - ret = mem_cgroup_try_charge(memcg, mask, 1, true); - css_put(&memcg->css); - if (ret == -EINTR) - memcg = root_mem_cgroup; - else if (ret) - return ret; - out: - *memcgp = memcg; - return 0; - } - - int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, - gfp_t gfp_mask, struct mem_cgroup **memcgp) - { - if (mem_cgroup_disabled()) { - *memcgp = NULL; - return 0; - } - /* - * A racing thread's fault, or swapoff, may have already - * updated the pte, and even removed page from swap cache: in - * those cases unuse_pte()'s pte_same() test will fail; but - * there's also a KSM case which does need to charge the page. - */ - if (!PageSwapCache(page)) { - struct mem_cgroup *memcg; - - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); - if (!memcg) - return -ENOMEM; - *memcgp = memcg; - return 0; - } - return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); - } - - void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) - { - if (mem_cgroup_disabled()) - return; - if (!memcg) - return; - __mem_cgroup_cancel_charge(memcg, 1); - } - - static void - __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, - enum charge_type ctype) - { - if (mem_cgroup_disabled()) - return; - if (!memcg) - return; - - __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); - /* - * Now swap is on-memory. This means this page may be - * counted both as mem and swap....double count. - * Fix it by uncharging from memsw. Basically, this SwapCache is stable - * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() - * may call delete_from_swap_cache() before reach here. - */ - if (do_swap_account && PageSwapCache(page)) { - swp_entry_t ent = {.val = page_private(page)}; - mem_cgroup_uncharge_swap(ent); - } - } - - void mem_cgroup_commit_charge_swapin(struct page *page, - struct mem_cgroup *memcg) - { - __mem_cgroup_commit_charge_swapin(page, memcg, - MEM_CGROUP_CHARGE_TYPE_ANON); - } - - int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask) - { - enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; - struct mem_cgroup *memcg; - int ret; - - if (mem_cgroup_disabled()) - return 0; - if (PageCompound(page)) - return 0; - - if (PageSwapCache(page)) { /* shmem */ - ret = __mem_cgroup_try_charge_swapin(mm, page, - gfp_mask, &memcg); - if (ret) - return ret; - __mem_cgroup_commit_charge_swapin(page, memcg, type); - return 0; - } - - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); - if (!memcg) - return -ENOMEM; - __mem_cgroup_commit_charge(memcg, page, 1, type, false); - return 0; - } - - static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, - unsigned int nr_pages, - const enum charge_type ctype) - { - struct memcg_batch_info *batch = NULL; - bool uncharge_memsw = true; - - /* If swapout, usage of swap doesn't decrease */ - if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) - uncharge_memsw = false; - - batch = ¤t->memcg_batch; - /* - * In usual, we do css_get() when we remember memcg pointer. - * But in this case, we keep res->usage until end of a series of - * uncharges. Then, it's ok to ignore memcg's refcnt. - */ - if (!batch->memcg) - batch->memcg = memcg; - /* - * do_batch > 0 when unmapping pages or inode invalidate/truncate. - * In those cases, all pages freed continuously can be expected to be in - * the same cgroup and we have chance to coalesce uncharges. - * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) - * because we want to do uncharge as soon as possible. - */ - - if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) - goto direct_uncharge; - - if (nr_pages > 1) - goto direct_uncharge; - - /* - * In typical case, batch->memcg == mem. This means we can - * merge a series of uncharges to an uncharge of res_counter. - * If not, we uncharge res_counter ony by one. - */ - if (batch->memcg != memcg) - goto direct_uncharge; - /* remember freed charge and uncharge it later */ - batch->nr_pages++; - if (uncharge_memsw) - batch->memsw_nr_pages++; - return; - direct_uncharge: - res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE); - if (uncharge_memsw) - res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); - if (unlikely(batch->memcg != memcg)) - memcg_oom_recover(memcg); - } - - /* - * uncharge if !page_mapped(page) - */ - static struct mem_cgroup * - __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, - bool end_migration) - { - struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = 1; - struct page_cgroup *pc; - bool anon; - - if (mem_cgroup_disabled()) - return NULL; - - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - } - /* - * Check if our page_cgroup is valid - */ - pc = lookup_page_cgroup(page); - if (unlikely(!PageCgroupUsed(pc))) - return NULL; - - lock_page_cgroup(pc); - - memcg = pc->mem_cgroup; - - if (!PageCgroupUsed(pc)) - goto unlock_out; - - anon = PageAnon(page); - - switch (ctype) { - case MEM_CGROUP_CHARGE_TYPE_ANON: - /* - * Generally PageAnon tells if it's the anon statistics to be - * updated; but sometimes e.g. mem_cgroup_uncharge_page() is - * used before page reached the stage of being marked PageAnon. - */ - anon = true; - /* fallthrough */ - case MEM_CGROUP_CHARGE_TYPE_DROP: - /* See mem_cgroup_prepare_migration() */ - if (page_mapped(page)) - goto unlock_out; - /* - * Pages under migration may not be uncharged. But - * end_migration() /must/ be the one uncharging the - * unused post-migration page and so it has to call - * here with the migration bit still set. See the - * res_counter handling below. - */ - if (!end_migration && PageCgroupMigration(pc)) - goto unlock_out; - break; - case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: - if (!PageAnon(page)) { /* Shared memory */ - if (page->mapping && !page_is_file_cache(page)) - goto unlock_out; - } else if (page_mapped(page)) /* Anon */ - goto unlock_out; - break; - default: - break; - } - - mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages); - - ClearPageCgroupUsed(pc); - /* - * pc->mem_cgroup is not cleared here. It will be accessed when it's - * freed from LRU. This is safe because uncharged page is expected not - * to be reused (freed soon). Exception is SwapCache, it's handled by - * special functions. - */ - - unlock_page_cgroup(pc); - /* - * even after unlock, we have memcg->res.usage here and this memcg - * will never be freed, so it's safe to call css_get(). - */ - memcg_check_events(memcg, page); - if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { - mem_cgroup_swap_statistics(memcg, true); - css_get(&memcg->css); - } - /* - * Migration does not charge the res_counter for the - * replacement page, so leave it alone when phasing out the - * page that is unused after the migration. - */ - if (!end_migration && !mem_cgroup_is_root(memcg)) - mem_cgroup_do_uncharge(memcg, nr_pages, ctype); - - return memcg; - - unlock_out: - unlock_page_cgroup(pc); - return NULL; - } - - void mem_cgroup_uncharge_page(struct page *page) - { - /* early check. */ - if (page_mapped(page)) - return; - VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); - /* - * If the page is in swap cache, uncharge should be deferred - * to the swap path, which also properly accounts swap usage - * and handles memcg lifetime. - * - * Note that this check is not stable and reclaim may add the - * page to swap cache at any time after this. However, if the - * page is not in swap cache by the time page->mapcount hits - * 0, there won't be any page table references to the swap - * slot, and reclaim will free it and not actually write the - * page to disk. - */ - if (PageSwapCache(page)) - return; - __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false); - } - - void mem_cgroup_uncharge_cache_page(struct page *page) - { - VM_BUG_ON_PAGE(page_mapped(page), page); - VM_BUG_ON_PAGE(page->mapping, page); - __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); - } - - /* - * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate. - * In that cases, pages are freed continuously and we can expect pages - * are in the same memcg. All these calls itself limits the number of - * pages freed at once, then uncharge_start/end() is called properly. - * This may be called prural(2) times in a context, - */ - - void mem_cgroup_uncharge_start(void) - { - current->memcg_batch.do_batch++; - /* We can do nest. */ - if (current->memcg_batch.do_batch == 1) { - current->memcg_batch.memcg = NULL; - current->memcg_batch.nr_pages = 0; - current->memcg_batch.memsw_nr_pages = 0; - } - } - - void mem_cgroup_uncharge_end(void) - { - struct memcg_batch_info *batch = ¤t->memcg_batch; - - if (!batch->do_batch) - return; - - batch->do_batch--; - if (batch->do_batch) /* If stacked, do nothing. */ - return; - - if (!batch->memcg) - return; - /* - * This "batch->memcg" is valid without any css_get/put etc... - * bacause we hide charges behind us. - */ - if (batch->nr_pages) - res_counter_uncharge(&batch->memcg->res, - batch->nr_pages * PAGE_SIZE); - if (batch->memsw_nr_pages) - res_counter_uncharge(&batch->memcg->memsw, - batch->memsw_nr_pages * PAGE_SIZE); - memcg_oom_recover(batch->memcg); - /* forget this pointer (for sanity check) */ - batch->memcg = NULL; - } - - #ifdef CONFIG_SWAP - /* - * called after __delete_from_swap_cache() and drop "page" account. - * memcg information is recorded to swap_cgroup of "ent" - */ - void - mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) - { - struct mem_cgroup *memcg; - int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; - - if (!swapout) /* this was a swap cache but the swap is unused ! */ - ctype = MEM_CGROUP_CHARGE_TYPE_DROP; - - memcg = __mem_cgroup_uncharge_common(page, ctype, false); - - /* - * record memcg information, if swapout && memcg != NULL, - * css_get() was called in uncharge(). - */ - if (do_swap_account && swapout && memcg) - swap_cgroup_record(ent, mem_cgroup_id(memcg)); - } - #endif - - #ifdef CONFIG_MEMCG_SWAP - /* - * called from swap_entry_free(). remove record in swap_cgroup and - * uncharge "memsw" account. - */ - void mem_cgroup_uncharge_swap(swp_entry_t ent) - { - struct mem_cgroup *memcg; - unsigned short id; - - if (!do_swap_account) - return; - - id = swap_cgroup_record(ent, 0); - rcu_read_lock(); - memcg = mem_cgroup_lookup(id); - if (memcg) { - /* - * We uncharge this because swap is freed. This memcg can - * be obsolete one. We avoid calling css_tryget_online(). - */ - if (!mem_cgroup_is_root(memcg)) - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); - mem_cgroup_swap_statistics(memcg, false); - css_put(&memcg->css); - } - rcu_read_unlock(); - } - - /** - * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. - * @entry: swap entry to be moved - * @from: mem_cgroup which the entry is moved from - * @to: mem_cgroup which the entry is moved to - * - * It succeeds only when the swap_cgroup's record for this entry is the same - * as the mem_cgroup's id of @from. - * - * Returns 0 on success, -EINVAL on failure. - * - * The caller must have charged to @to, IOW, called res_counter_charge() about - * both res and memsw, and called css_get(). - */ - static int mem_cgroup_move_swap_account(swp_entry_t entry, - struct mem_cgroup *from, struct mem_cgroup *to) - { - unsigned short old_id, new_id; - - old_id = mem_cgroup_id(from); - new_id = mem_cgroup_id(to); - - if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { - mem_cgroup_swap_statistics(from, false); - mem_cgroup_swap_statistics(to, true); - /* - * This function is only called from task migration context now. - * It postpones res_counter and refcount handling till the end - * of task migration(mem_cgroup_clear_mc()) for performance - * improvement. But we cannot postpone css_get(to) because if - * the process that has been moved to @to does swap-in, the - * refcount of @to might be decreased to 0. - * - * We are in attach() phase, so the cgroup is guaranteed to be - * alive, so we can just call css_get(). - */ - css_get(&to->css); - return 0; - } - return -EINVAL; - } - #else - static inline int mem_cgroup_move_swap_account(swp_entry_t entry, - struct mem_cgroup *from, struct mem_cgroup *to) - { - return -EINVAL; - } - #endif - - /* - * Before starting migration, account PAGE_SIZE to mem_cgroup that the old - * page belongs to. - */ - void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, - struct mem_cgroup **memcgp) - { - struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = 1; - struct page_cgroup *pc; - enum charge_type ctype; - - *memcgp = NULL; - - if (mem_cgroup_disabled()) - return; - - if (PageTransHuge(page)) - nr_pages <<= compound_order(page); - - pc = lookup_page_cgroup(page); - lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - css_get(&memcg->css); - /* - * At migrating an anonymous page, its mapcount goes down - * to 0 and uncharge() will be called. But, even if it's fully - * unmapped, migration may fail and this page has to be - * charged again. We set MIGRATION flag here and delay uncharge - * until end_migration() is called - * - * Corner Case Thinking - * A) - * When the old page was mapped as Anon and it's unmap-and-freed - * while migration was ongoing. - * If unmap finds the old page, uncharge() of it will be delayed - * until end_migration(). If unmap finds a new page, it's - * uncharged when it make mapcount to be 1->0. If unmap code - * finds swap_migration_entry, the new page will not be mapped - * and end_migration() will find it(mapcount==0). - * - * B) - * When the old page was mapped but migraion fails, the kernel - * remaps it. A charge for it is kept by MIGRATION flag even - * if mapcount goes down to 0. We can do remap successfully - * without charging it again. - * - * C) - * The "old" page is under lock_page() until the end of - * migration, so, the old page itself will not be swapped-out. - * If the new page is swapped out before end_migraton, our - * hook to usual swap-out path will catch the event. - */ - if (PageAnon(page)) - SetPageCgroupMigration(pc); - } - unlock_page_cgroup(pc); - /* - * If the page is not charged at this point, - * we return here. - */ - if (!memcg) - return; - - *memcgp = memcg; - /* - * We charge new page before it's used/mapped. So, even if unlock_page() - * is called before end_migration, we can catch all events on this new - * page. In the case new page is migrated but not remapped, new page's - * mapcount will be finally 0 and we call uncharge in end_migration(). - */ - if (PageAnon(page)) - ctype = MEM_CGROUP_CHARGE_TYPE_ANON; - else - ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; - /* - * The page is committed to the memcg, but it's not actually - * charged to the res_counter since we plan on replacing the - * old one and only one page is going to be left afterwards. - */ - __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false); - } - - /* remove redundant charge if migration failed*/ - void mem_cgroup_end_migration(struct mem_cgroup *memcg, - struct page *oldpage, struct page *newpage, bool migration_ok) - { - struct page *used, *unused; - struct page_cgroup *pc; - bool anon; - - if (!memcg) - return; - - if (!migration_ok) { - used = oldpage; - unused = newpage; - } else { - used = newpage; - unused = oldpage; - } - anon = PageAnon(used); - __mem_cgroup_uncharge_common(unused, - anon ? MEM_CGROUP_CHARGE_TYPE_ANON - : MEM_CGROUP_CHARGE_TYPE_CACHE, - true); - css_put(&memcg->css); - /* - * We disallowed uncharge of pages under migration because mapcount - * of the page goes down to zero, temporarly. - * Clear the flag and check the page should be charged. - */ - pc = lookup_page_cgroup(oldpage); - lock_page_cgroup(pc); - ClearPageCgroupMigration(pc); - unlock_page_cgroup(pc); - - /* - * If a page is a file cache, radix-tree replacement is very atomic - * and we can skip this check. When it was an Anon page, its mapcount - * goes down to 0. But because we added MIGRATION flage, it's not - * uncharged yet. There are several case but page->mapcount check - * and USED bit check in mem_cgroup_uncharge_page() will do enough - * check. (see prepare_charge() also) - */ - if (anon) - mem_cgroup_uncharge_page(used); - } - - /* - * At replace page cache, newpage is not under any memcg but it's on - * LRU. So, this function doesn't touch res_counter but handles LRU - * in correct way. Both pages are locked so we cannot race with uncharge. - */ - void mem_cgroup_replace_page_cache(struct page *oldpage, - struct page *newpage) - { - struct mem_cgroup *memcg = NULL; - struct page_cgroup *pc; - enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; - - if (mem_cgroup_disabled()) - return; - - pc = lookup_page_cgroup(oldpage); - /* fix accounting on old pages */ - lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - mem_cgroup_charge_statistics(memcg, oldpage, false, -1); - ClearPageCgroupUsed(pc); - } - unlock_page_cgroup(pc); - - /* - * When called from shmem_replace_page(), in some cases the - * oldpage has already been charged, and in some cases not. - */ - if (!memcg) - return; - /* - * Even if newpage->mapping was NULL before starting replacement, - * the newpage may be on LRU(or pagevec for LRU) already. We lock - * LRU while we overwrite pc->mem_cgroup. - */ - __mem_cgroup_commit_charge(memcg, newpage, 1, type, true); - } - - #ifdef CONFIG_DEBUG_VM - static struct page_cgroup *lookup_page_cgroup_used(struct page *page) - { - struct page_cgroup *pc; - - pc = lookup_page_cgroup(page); - /* - * Can be NULL while feeding pages into the page allocator for - * the first time, i.e. during boot or memory hotplug; - * or when mem_cgroup_disabled(). + * Can be NULL while feeding pages into the page allocator for + * the first time, i.e. during boot or memory hotplug; + * or when mem_cgroup_disabled(). */ if (likely(pc) && PageCgroupUsed(pc)) return pc; @@@ -4550,7 -3863,7 +3863,7 @@@ unsigned long mem_cgroup_soft_limit_rec gfp_mask, &nr_scanned); nr_reclaimed += reclaimed; *total_scanned += nr_scanned; - spin_lock(&mctz->lock); + spin_lock_irq(&mctz->lock);
/* * If we failed to reclaim anything from this memory cgroup @@@ -4590,7 -3903,7 +3903,7 @@@ */ /* If excess == 0, no tree ops */ __mem_cgroup_insert_exceeded(mz, mctz, excess); - spin_unlock(&mctz->lock); + spin_unlock_irq(&mctz->lock); css_put(&mz->memcg->css); loop++; /* @@@ -4800,95 -4113,41 +4113,41 @@@ static int mem_cgroup_hierarchy_write(s * occur, provided the current cgroup has no children. * * For the root cgroup, parent_mem is NULL, we allow value to be - * set if there are no children. - */ - if ((!parent_memcg || !parent_memcg->use_hierarchy) && - (val == 1 || val == 0)) { - if (!memcg_has_children(memcg)) - memcg->use_hierarchy = val; - else - retval = -EBUSY; - } else - retval = -EINVAL; - - out: - mutex_unlock(&memcg_create_mutex); - - return retval; - } - - - static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx) - { - struct mem_cgroup *iter; - long val = 0; - - /* Per-cpu values can be negative, use a signed accumulator */ - for_each_mem_cgroup_tree(iter, memcg) - val += mem_cgroup_read_stat(iter, idx); - - if (val < 0) /* race ? */ - val = 0; - return val; - } - - static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) - { - u64 val; - - if (!mem_cgroup_is_root(memcg)) { - if (!swap) - return res_counter_read_u64(&memcg->res, RES_USAGE); - else - return res_counter_read_u64(&memcg->memsw, RES_USAGE); - } - - /* - * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS - * as well as in MEM_CGROUP_STAT_RSS_HUGE. + * set if there are no children. */ - val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); + if ((!parent_memcg || !parent_memcg->use_hierarchy) && + (val == 1 || val == 0)) { + if (!memcg_has_children(memcg)) + memcg->use_hierarchy = val; + else + retval = -EBUSY; + } else + retval = -EINVAL;
- if (swap) - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); + out: + mutex_unlock(&memcg_create_mutex);
- return val << PAGE_SHIFT; + return retval; }
static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) + struct cftype *cft) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - u64 val; - int name; - enum res_type type; - - type = MEMFILE_TYPE(cft->private); - name = MEMFILE_ATTR(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); + int name = MEMFILE_ATTR(cft->private);
switch (type) { case _MEM: - if (name == RES_USAGE) - val = mem_cgroup_usage(memcg, false); - else - val = res_counter_read_u64(&memcg->res, name); - break; + return res_counter_read_u64(&memcg->res, name); case _MEMSWAP: - if (name == RES_USAGE) - val = mem_cgroup_usage(memcg, true); - else - val = res_counter_read_u64(&memcg->memsw, name); - break; + return res_counter_read_u64(&memcg->memsw, name); case _KMEM: - val = res_counter_read_u64(&memcg->kmem, name); + return res_counter_read_u64(&memcg->kmem, name); break; default: BUG(); } - - return val; }
#ifdef CONFIG_MEMCG_KMEM @@@ -5350,7 -4609,10 +4609,10 @@@ static void __mem_cgroup_threshold(stru if (!t) goto unlock;
- usage = mem_cgroup_usage(memcg, swap); + if (!swap) + usage = res_counter_read_u64(&memcg->res, RES_USAGE); + else + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
/* * current_threshold points to threshold just below or equal to usage. @@@ -5442,15 -4704,15 +4704,15 @@@ static int __mem_cgroup_usage_register_
mutex_lock(&memcg->thresholds_lock);
- if (type == _MEM) + if (type == _MEM) { thresholds = &memcg->thresholds; - else if (type == _MEMSWAP) + usage = res_counter_read_u64(&memcg->res, RES_USAGE); + } else if (type == _MEMSWAP) { thresholds = &memcg->memsw_thresholds; - else + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + } else BUG();
- usage = mem_cgroup_usage(memcg, type == _MEMSWAP); - /* Check if a threshold crossed before adding a new one */ if (thresholds->primary) __mem_cgroup_threshold(memcg, type == _MEMSWAP); @@@ -5530,18 -4792,19 +4792,19 @@@ static void __mem_cgroup_usage_unregist int i, j, size;
mutex_lock(&memcg->thresholds_lock); - if (type == _MEM) + + if (type == _MEM) { thresholds = &memcg->thresholds; - else if (type == _MEMSWAP) + usage = res_counter_read_u64(&memcg->res, RES_USAGE); + } else if (type == _MEMSWAP) { thresholds = &memcg->memsw_thresholds; - else + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + } else BUG();
if (!thresholds->primary) goto unlock;
- usage = mem_cgroup_usage(memcg, type == _MEMSWAP); - /* Check if a threshold crossed before removing */ __mem_cgroup_threshold(memcg, type == _MEMSWAP);
@@@ -6003,6 -5266,7 +5266,6 @@@ static struct cftype mem_cgroup_files[ }, { .name = "use_hierarchy", - .flags = CFTYPE_INSANE, .write_u64 = mem_cgroup_hierarchy_write, .read_u64 = mem_cgroup_hierarchy_read, }, @@@ -6295,9 -5559,9 +5558,9 @@@ mem_cgroup_css_online(struct cgroup_sub * core guarantees its existence. */ } else { - res_counter_init(&memcg->res, NULL); - res_counter_init(&memcg->memsw, NULL); - res_counter_init(&memcg->kmem, NULL); + res_counter_init(&memcg->res, &root_mem_cgroup->res); + res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw); + res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem); /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this @@@ -6406,80 -5670,40 +5669,63 @@@ static void mem_cgroup_css_free(struct __mem_cgroup_free(memcg); }
+/** + * mem_cgroup_css_reset - reset the states of a mem_cgroup + * @css: the target css + * + * Reset the states of the mem_cgroup associated with @css. This is + * invoked when the userland requests disabling on the default hierarchy + * but the memcg is pinned through dependency. The memcg should stop + * applying policies and should revert to the vanilla state as it may be + * made visible again. + * + * The current implementation only resets the essential configurations. + * This needs to be expanded to cover all the visible parts. + */ +static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + mem_cgroup_resize_limit(memcg, ULLONG_MAX); + mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX); + memcg_update_kmem_limit(memcg, ULLONG_MAX); + res_counter_set_soft_limit(&memcg->res, ULLONG_MAX); +} + #ifdef CONFIG_MMU /* Handlers for move charge at task migration. */ - #define PRECHARGE_COUNT_AT_ONCE 256 static int mem_cgroup_do_precharge(unsigned long count) { - int ret = 0; - int batch_count = PRECHARGE_COUNT_AT_ONCE; - struct mem_cgroup *memcg = mc.to; + int ret;
- if (mem_cgroup_is_root(memcg)) { + /* Try a single bulk charge without reclaim first */ + ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); + if (!ret) { mc.precharge += count; - /* we don't need css_get for root */ return ret; } - /* try to charge at once */ - if (count > 1) { - struct res_counter *dummy; - /* - * "memcg" cannot be under rmdir() because we've already checked - * by cgroup_lock_live_cgroup() that it is not removed and we - * are still under the same cgroup_mutex. So we can postpone - * css_get(). - */ - if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy)) - goto one_by_one; - if (do_swap_account && res_counter_charge(&memcg->memsw, - PAGE_SIZE * count, &dummy)) { - res_counter_uncharge(&memcg->res, PAGE_SIZE * count); - goto one_by_one; - } - mc.precharge += count; + if (ret == -EINTR) { + cancel_charge(root_mem_cgroup, count); return ret; } - one_by_one: - /* fall back to one by one charge */ + + /* Try charges one by one with reclaim */ while (count--) { - if (signal_pending(current)) { - ret = -EINTR; - break; - } - if (!batch_count--) { - batch_count = PRECHARGE_COUNT_AT_ONCE; - cond_resched(); - } - ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false); + ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); + /* + * In case of failure, any residual charges against + * mc.to will be dropped by mem_cgroup_clear_mc() + * later on. However, cancel any charges that are + * bypassed to root right away or they'll be lost. + */ + if (ret == -EINTR) + cancel_charge(root_mem_cgroup, 1); if (ret) - /* mem_cgroup_clear_mc() will do uncharge later */ return ret; mc.precharge++; + cond_resched(); } - return ret; + return 0; }
/** @@@ -6615,9 -5839,9 +5861,9 @@@ static enum mc_target_type get_mctgt_ty if (page) { pc = lookup_page_cgroup(page); /* - * Do only loose check w/o page_cgroup lock. - * mem_cgroup_move_account() checks the pc is valid or not under - * the lock. + * Do only loose check w/o serialization. + * mem_cgroup_move_account() checks the pc is valid or + * not under LRU exclusion. */ if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { ret = MC_TARGET_PAGE; @@@ -6742,7 -5966,7 +5988,7 @@@ static void __mem_cgroup_clear_mc(void
/* we must uncharge all the leftover precharges from mc.to */ if (mc.precharge) { - __mem_cgroup_cancel_charge(mc.to, mc.precharge); + cancel_charge(mc.to, mc.precharge); mc.precharge = 0; } /* @@@ -6750,27 -5974,24 +5996,24 @@@ * we must uncharge here. */ if (mc.moved_charge) { - __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); + cancel_charge(mc.from, mc.moved_charge); mc.moved_charge = 0; } /* we must fixup refcnts and charges */ if (mc.moved_swap) { /* uncharge swap account from the old cgroup */ - if (!mem_cgroup_is_root(mc.from)) - res_counter_uncharge(&mc.from->memsw, - PAGE_SIZE * mc.moved_swap); + res_counter_uncharge(&mc.from->memsw, + PAGE_SIZE * mc.moved_swap);
for (i = 0; i < mc.moved_swap; i++) css_put(&mc.from->css);
- if (!mem_cgroup_is_root(mc.to)) { - /* - * we charged both to->res and to->memsw, so we should - * uncharge to->res. - */ - res_counter_uncharge(&mc.to->res, - PAGE_SIZE * mc.moved_swap); - } + /* + * we charged both to->res and to->memsw, so we should + * uncharge to->res. + */ + res_counter_uncharge(&mc.to->res, + PAGE_SIZE * mc.moved_swap); /* we've already done css_get(mc.to) */ mc.moved_swap = 0; } @@@ -7023,17 -6244,16 +6266,17 @@@ static void mem_cgroup_move_task(struc
/* * Cgroup retains root cgroups across [un]mount cycles making it necessary - * to verify sane_behavior flag on each mount attempt. + * to verify whether we're attached to the default hierarchy on each mount + * attempt. */ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) { /* - * use_hierarchy is forced with sane_behavior. cgroup core + * use_hierarchy is forced on the default hierarchy. cgroup core * guarantees that @root doesn't have any children, so turning it * on for the root memcg is enough. */ - if (cgroup_sane_behavior(root_css->cgroup)) + if (cgroup_on_dfl(root_css->cgroup)) mem_cgroup_from_css(root_css)->use_hierarchy = true; }
@@@ -7042,12 -6262,11 +6285,12 @@@ struct cgroup_subsys memory_cgrp_subsy .css_online = mem_cgroup_css_online, .css_offline = mem_cgroup_css_offline, .css_free = mem_cgroup_css_free, + .css_reset = mem_cgroup_css_reset, .can_attach = mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, .attach = mem_cgroup_move_task, .bind = mem_cgroup_bind, - .base_cftypes = mem_cgroup_files, + .legacy_cftypes = mem_cgroup_files, .early_init = 0, };
@@@ -7064,8 -6283,7 +6307,8 @@@ __setup("swapaccount=", enable_swap_acc
static void __init memsw_file_init(void) { - WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files)); + WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, + memsw_cgroup_files)); }
static void __init enable_swap_cgroup(void) @@@ -7082,6 -6300,380 +6325,380 @@@ static void __init enable_swap_cgroup(v } #endif
+ #ifdef CONFIG_MEMCG_SWAP + /** + * mem_cgroup_swapout - transfer a memsw charge to swap + * @page: page whose memsw charge to transfer + * @entry: swap entry to move the charge to + * + * Transfer the memsw charge of @page to @entry. + */ + void mem_cgroup_swapout(struct page *page, swp_entry_t entry) + { + struct page_cgroup *pc; + unsigned short oldid; + + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(page_count(page), page); + + if (!do_swap_account) + return; + + pc = lookup_page_cgroup(page); + + /* Readahead page, never charged */ + if (!PageCgroupUsed(pc)) + return; + + VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page); + + oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup)); + VM_BUG_ON_PAGE(oldid, page); + + pc->flags &= ~PCG_MEMSW; + css_get(&pc->mem_cgroup->css); + mem_cgroup_swap_statistics(pc->mem_cgroup, true); + } + + /** + * mem_cgroup_uncharge_swap - uncharge a swap entry + * @entry: swap entry to uncharge + * + * Drop the memsw charge associated with @entry. + */ + void mem_cgroup_uncharge_swap(swp_entry_t entry) + { + struct mem_cgroup *memcg; + unsigned short id; + + if (!do_swap_account) + return; + + id = swap_cgroup_record(entry, 0); + rcu_read_lock(); + memcg = mem_cgroup_lookup(id); + if (memcg) { + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + mem_cgroup_swap_statistics(memcg, false); + css_put(&memcg->css); + } + rcu_read_unlock(); + } + #endif + + /** + * mem_cgroup_try_charge - try charging a page + * @page: page to charge + * @mm: mm context of the victim + * @gfp_mask: reclaim mode + * @memcgp: charged memcg return + * + * Try to charge @page to the memcg that @mm belongs to, reclaiming + * pages according to @gfp_mask if necessary. + * + * Returns 0 on success, with *@memcgp pointing to the charged memcg. + * Otherwise, an error code is returned. + * + * After page->mapping has been set up, the caller must finalize the + * charge with mem_cgroup_commit_charge(). Or abort the transaction + * with mem_cgroup_cancel_charge() in case page instantiation fails. + */ + int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask, struct mem_cgroup **memcgp) + { + struct mem_cgroup *memcg = NULL; + unsigned int nr_pages = 1; + int ret = 0; + + if (mem_cgroup_disabled()) + goto out; + + if (PageSwapCache(page)) { + struct page_cgroup *pc = lookup_page_cgroup(page); + /* + * Every swap fault against a single page tries to charge the + * page, bail as early as possible. shmem_unuse() encounters + * already charged pages, too. The USED bit is protected by + * the page lock, which serializes swap cache removal, which + * in turn serializes uncharging. + */ + if (PageCgroupUsed(pc)) + goto out; + } + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + } + + if (do_swap_account && PageSwapCache(page)) + memcg = try_get_mem_cgroup_from_page(page); + if (!memcg) + memcg = get_mem_cgroup_from_mm(mm); + + ret = try_charge(memcg, gfp_mask, nr_pages); + + css_put(&memcg->css); + + if (ret == -EINTR) { + memcg = root_mem_cgroup; + ret = 0; + } + out: + *memcgp = memcg; + return ret; + } + + /** + * mem_cgroup_commit_charge - commit a page charge + * @page: page to charge + * @memcg: memcg to charge the page to + * @lrucare: page might be on LRU already + * + * Finalize a charge transaction started by mem_cgroup_try_charge(), + * after page->mapping has been set up. This must happen atomically + * as part of the page instantiation, i.e. under the page table lock + * for anonymous pages, under the page lock for page and swap cache. + * + * In addition, the page must not be on the LRU during the commit, to + * prevent racing with task migration. If it might be, use @lrucare. + * + * Use mem_cgroup_cancel_charge() to cancel the transaction instead. + */ + void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, + bool lrucare) + { + unsigned int nr_pages = 1; + + VM_BUG_ON_PAGE(!page->mapping, page); + VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); + + if (mem_cgroup_disabled()) + return; + /* + * Swap faults will attempt to charge the same page multiple + * times. But reuse_swap_page() might have removed the page + * from swapcache already, so we can't check PageSwapCache(). + */ + if (!memcg) + return; + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + } + + commit_charge(page, memcg, nr_pages, lrucare); + + if (do_swap_account && PageSwapCache(page)) { + swp_entry_t entry = { .val = page_private(page) }; + /* + * The swap entry might not get freed for a long time, + * let's not wait for it. The page already received a + * memory+swap charge, drop the swap entry duplicate. + */ + mem_cgroup_uncharge_swap(entry); + } + } + + /** + * mem_cgroup_cancel_charge - cancel a page charge + * @page: page to charge + * @memcg: memcg to charge the page to + * + * Cancel a charge transaction started by mem_cgroup_try_charge(). + */ + void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) + { + unsigned int nr_pages = 1; + + if (mem_cgroup_disabled()) + return; + /* + * Swap faults will attempt to charge the same page multiple + * times. But reuse_swap_page() might have removed the page + * from swapcache already, so we can't check PageSwapCache(). + */ + if (!memcg) + return; + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + } + + cancel_charge(memcg, nr_pages); + } + + static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, + unsigned long nr_mem, unsigned long nr_memsw, + unsigned long nr_anon, unsigned long nr_file, + unsigned long nr_huge, struct page *dummy_page) + { + unsigned long flags; + + if (nr_mem) + res_counter_uncharge(&memcg->res, nr_mem * PAGE_SIZE); + if (nr_memsw) + res_counter_uncharge(&memcg->memsw, nr_memsw * PAGE_SIZE); + + memcg_oom_recover(memcg); + + local_irq_save(flags); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); + __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); + __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file); + memcg_check_events(memcg, dummy_page); + local_irq_restore(flags); + } + + static void uncharge_list(struct list_head *page_list) + { + struct mem_cgroup *memcg = NULL; + unsigned long nr_memsw = 0; + unsigned long nr_anon = 0; + unsigned long nr_file = 0; + unsigned long nr_huge = 0; + unsigned long pgpgout = 0; + unsigned long nr_mem = 0; + struct list_head *next; + struct page *page; + + next = page_list->next; + do { + unsigned int nr_pages = 1; + struct page_cgroup *pc; + + page = list_entry(next, struct page, lru); + next = page->lru.next; + + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(page_count(page), page); + + pc = lookup_page_cgroup(page); + if (!PageCgroupUsed(pc)) + continue; + + /* + * Nobody should be changing or seriously looking at + * pc->mem_cgroup and pc->flags at this point, we have + * fully exclusive access to the page. + */ + + if (memcg != pc->mem_cgroup) { + if (memcg) { + uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, + nr_anon, nr_file, nr_huge, page); + pgpgout = nr_mem = nr_memsw = 0; + nr_anon = nr_file = nr_huge = 0; + } + memcg = pc->mem_cgroup; + } + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + nr_huge += nr_pages; + } + + if (PageAnon(page)) + nr_anon += nr_pages; + else + nr_file += nr_pages; + + if (pc->flags & PCG_MEM) + nr_mem += nr_pages; + if (pc->flags & PCG_MEMSW) + nr_memsw += nr_pages; + pc->flags = 0; + + pgpgout++; + } while (next != page_list); + + if (memcg) + uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, + nr_anon, nr_file, nr_huge, page); + } + + /** + * mem_cgroup_uncharge - uncharge a page + * @page: page to uncharge + * + * Uncharge a page previously charged with mem_cgroup_try_charge() and + * mem_cgroup_commit_charge(). + */ + void mem_cgroup_uncharge(struct page *page) + { + if (mem_cgroup_disabled()) + return; + + INIT_LIST_HEAD(&page->lru); + uncharge_list(&page->lru); + } + + /** + * mem_cgroup_uncharge_list - uncharge a list of page + * @page_list: list of pages to uncharge + * + * Uncharge a list of pages previously charged with + * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). + */ + void mem_cgroup_uncharge_list(struct list_head *page_list) + { + if (mem_cgroup_disabled()) + return; + + if (!list_empty(page_list)) + uncharge_list(page_list); + } + + /** + * mem_cgroup_migrate - migrate a charge to another page + * @oldpage: currently charged page + * @newpage: page to transfer the charge to + * @lrucare: page might be on LRU already + * + * Migrate the charge from @oldpage to @newpage. + * + * Both pages must be locked, @newpage->mapping must be set up. + */ + void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, + bool lrucare) + { + unsigned int nr_pages = 1; + struct page_cgroup *pc; + + VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); + VM_BUG_ON_PAGE(PageLRU(oldpage), oldpage); + VM_BUG_ON_PAGE(PageLRU(newpage), newpage); + VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); + + if (mem_cgroup_disabled()) + return; + + pc = lookup_page_cgroup(oldpage); + if (!PageCgroupUsed(pc)) + return; + + /* Already migrated */ + if (!(pc->flags & PCG_MEM)) + return; + + VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage); + pc->flags &= ~(PCG_MEM | PCG_MEMSW); + + if (PageTransHuge(oldpage)) { + nr_pages <<= compound_order(oldpage); + VM_BUG_ON_PAGE(!PageTransHuge(oldpage), oldpage); + VM_BUG_ON_PAGE(!PageTransHuge(newpage), newpage); + } + + commit_charge(newpage, pc->mem_cgroup, nr_pages, lrucare); + } + /* * subsys_initcall() for memory controller. * diff --combined mm/slub.c index 8c24a23,b543612..3b2bae9 --- a/mm/slub.c +++ b/mm/slub.c @@@ -233,11 -233,6 +233,6 @@@ static inline void stat(const struct km * Core slab cache functions *******************************************************************/
- static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) - { - return s->node[node]; - } - /* Verify that a pointer has an address that is valid within a slab page */ static inline int check_valid_pointer(struct kmem_cache *s, struct page *page, const void *object) @@@ -288,6 -283,10 +283,10 @@@ static inline void set_freepointer(stru for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ __p += (__s)->size)
+ #define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ + for (__p = (__addr), __idx = 1; __idx <= __objects;\ + __p += (__s)->size, __idx++) + /* Determine object index from a given position */ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) { @@@ -382,9 -381,9 +381,9 @@@ static inline bool __cmpxchg_double_sla defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, - freelist_old, counters_old, - freelist_new, counters_new)) - return 1; + freelist_old, counters_old, + freelist_new, counters_new)) + return 1; } else #endif { @@@ -418,9 -417,9 +417,9 @@@ static inline bool cmpxchg_double_slab( defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, - freelist_old, counters_old, - freelist_new, counters_new)) - return 1; + freelist_old, counters_old, + freelist_new, counters_new)) + return 1; } else #endif { @@@ -945,60 -944,6 +944,6 @@@ static void trace(struct kmem_cache *s }
/* - * Hooks for other subsystems that check memory allocations. In a typical - * production configuration these hooks all should produce no code at all. - */ - static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) - { - kmemleak_alloc(ptr, size, 1, flags); - } - - static inline void kfree_hook(const void *x) - { - kmemleak_free(x); - } - - static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) - { - flags &= gfp_allowed_mask; - lockdep_trace_alloc(flags); - might_sleep_if(flags & __GFP_WAIT); - - return should_failslab(s->object_size, flags, s->flags); - } - - static inline void slab_post_alloc_hook(struct kmem_cache *s, - gfp_t flags, void *object) - { - flags &= gfp_allowed_mask; - kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); - kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); - } - - static inline void slab_free_hook(struct kmem_cache *s, void *x) - { - kmemleak_free_recursive(x, s->flags); - - /* - * Trouble is that we may no longer disable interrupts in the fast path - * So in order to make the debug calls that expect irqs to be - * disabled we need to disable interrupts temporarily. - */ - #if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) - { - unsigned long flags; - - local_irq_save(flags); - kmemcheck_slab_free(s, x, s->object_size); - debug_check_no_locks_freed(x, s->object_size); - local_irq_restore(flags); - } - #endif - if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(x, s->object_size); - } - - /* * Tracking of fully allocated slabs for debugging purposes. */ static void add_full(struct kmem_cache *s, @@@ -1282,6 -1227,12 +1227,12 @@@ static inline void inc_slabs_node(struc static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {}
+ #endif /* CONFIG_SLUB_DEBUG */ + + /* + * Hooks for other subsystems that check memory allocations. In a typical + * production configuration these hooks all should produce no code at all. + */ static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) { kmemleak_alloc(ptr, size, 1, flags); @@@ -1293,21 -1244,44 +1244,44 @@@ static inline void kfree_hook(const voi }
static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) - { return 0; } + { + flags &= gfp_allowed_mask; + lockdep_trace_alloc(flags); + might_sleep_if(flags & __GFP_WAIT); + + return should_failslab(s->object_size, flags, s->flags); + }
- static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, - void *object) + static inline void slab_post_alloc_hook(struct kmem_cache *s, + gfp_t flags, void *object) { - kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, - flags & gfp_allowed_mask); + flags &= gfp_allowed_mask; + kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); + kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); }
static inline void slab_free_hook(struct kmem_cache *s, void *x) { kmemleak_free_recursive(x, s->flags); - }
- #endif /* CONFIG_SLUB_DEBUG */ + /* + * Trouble is that we may no longer disable interrupts in the fast path + * So in order to make the debug calls that expect irqs to be + * disabled we need to disable interrupts temporarily. + */ + #if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) + { + unsigned long flags; + + local_irq_save(flags); + kmemcheck_slab_free(s, x, s->object_size); + debug_check_no_locks_freed(x, s->object_size); + local_irq_restore(flags); + } + #endif + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(x, s->object_size); + }
/* * Slab allocation and freeing @@@ -1409,9 -1383,9 +1383,9 @@@ static struct page *new_slab(struct kme { struct page *page; void *start; - void *last; void *p; int order; + int idx;
BUG_ON(flags & GFP_SLAB_BUG_MASK);
@@@ -1432,14 -1406,13 +1406,13 @@@ if (unlikely(s->flags & SLAB_POISON)) memset(start, POISON_INUSE, PAGE_SIZE << order);
- last = start; - for_each_object(p, s, start, page->objects) { - setup_object(s, page, last); - set_freepointer(s, last, p); - last = p; + for_each_object_idx(p, idx, s, start, page->objects) { + setup_object(s, page, p); + if (likely(idx < page->objects)) + set_freepointer(s, p, p + s->size); + else + set_freepointer(s, p, NULL); } - setup_object(s, page, last); - set_freepointer(s, last, NULL);
page->freelist = start; page->inuse = page->objects; @@@ -2064,6 -2037,14 +2037,14 @@@ static void put_cpu_partial(struct kmem
} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); + + if (memcg_cache_dead(s)) { + unsigned long flags; + + local_irq_save(flags); + unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); + local_irq_restore(flags); + } #endif }
@@@ -2162,6 -2143,7 +2143,7 @@@ slab_out_of_memory(struct kmem_cache *s static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); int node; + struct kmem_cache_node *n;
if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) return; @@@ -2176,15 -2158,11 +2158,11 @@@ pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n", s->name);
- for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); + for_each_kmem_cache_node(s, node, n) { unsigned long nr_slabs; unsigned long nr_objs; unsigned long nr_free;
- if (!n) - continue; - nr_free = count_partial(n, count_free); nr_slabs = node_nr_slabs(n); nr_objs = node_nr_objs(n); @@@ -2673,18 -2651,17 +2651,17 @@@ static __always_inline void slab_free(s
slab_free_hook(s, x);
- redo: /* - * Determine the currently cpus per cpu slab. - * The cpu may change afterward. However that does not matter since - * data is retrieved via this pointer. If we are on the same cpu - * during the cmpxchg then the free will succedd. + * We could make this function fully preemptable, but then we wouldn't + * have a method to wait for all currently executing kfree's to finish, + * which is necessary to avoid use-after-free on per memcg cache + * destruction. */ preempt_disable(); + redo: c = this_cpu_ptr(s->cpu_slab);
tid = c->tid; - preempt_enable();
if (likely(page == c->page)) { set_freepointer(s, object, c->freelist); @@@ -2701,6 -2678,7 +2678,7 @@@ } else __slab_free(s, page, x, addr);
+ preempt_enable(); }
void kmem_cache_free(struct kmem_cache *s, void *x) @@@ -2928,13 -2906,10 +2906,10 @@@ static void early_kmem_cache_node_alloc static void free_kmem_cache_nodes(struct kmem_cache *s) { int node; + struct kmem_cache_node *n;
- for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = s->node[node]; - - if (n) - kmem_cache_free(kmem_cache_node, n); - + for_each_kmem_cache_node(s, node, n) { + kmem_cache_free(kmem_cache_node, n); s->node[node] = NULL; } } @@@ -3199,13 -3174,12 +3174,13 @@@ static void list_slab_objects(struct km /* * Attempt to free all partial slabs on a node. * This is called from kmem_cache_close(). We must be the last thread - * using the cache and therefore we do not need to lock anymore. + * using the cache, but we still have to lock for lockdep's sake. */ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { struct page *page, *h;
+ spin_lock_irq(&n->list_lock); list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { __remove_partial(n, page); @@@ -3215,7 -3189,6 +3190,7 @@@ "Objects remaining in %s on kmem_cache_close()"); } } + spin_unlock_irq(&n->list_lock); }
/* @@@ -3224,12 -3197,11 +3199,11 @@@ static inline int kmem_cache_close(struct kmem_cache *s) { int node; + struct kmem_cache_node *n;
flush_all(s); /* Attempt to free all objects */ - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); - + for_each_kmem_cache_node(s, node, n) { free_partial(s, n); if (n->nr_partial || slabs_node(s, node)) return 1; @@@ -3406,20 -3378,26 +3380,26 @@@ int __kmem_cache_shrink(struct kmem_cac struct page *page; struct page *t; int objects = oo_objects(s->max); + struct list_head empty_slabs; struct list_head *slabs_by_inuse = kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL); unsigned long flags;
- if (!slabs_by_inuse) - return -ENOMEM; + if (memcg_cache_dead(s)) + s->min_partial = 0;
- flush_all(s); - for_each_node_state(node, N_NORMAL_MEMORY) { - n = get_node(s, node); - - if (!n->nr_partial) - continue; + if (!slabs_by_inuse) { + /* + * Do not fail shrinking empty slabs if allocation of the + * temporary array failed. Just skip the slab placement + * optimization then. + */ + slabs_by_inuse = &empty_slabs; + objects = 1; + }
+ flush_all(s); + for_each_kmem_cache_node(s, node, n) { for (i = 0; i < objects; i++) INIT_LIST_HEAD(slabs_by_inuse + i);
@@@ -3432,7 -3410,9 +3412,9 @@@ * list_lock. page->inuse here is the upper limit. */ list_for_each_entry_safe(page, t, &n->partial, lru) { - list_move(&page->lru, slabs_by_inuse + page->inuse); + if (page->inuse < objects) + list_move(&page->lru, + slabs_by_inuse + page->inuse); if (!page->inuse) n->nr_partial--; } @@@ -3451,7 -3431,8 +3433,8 @@@ discard_slab(s, page); }
- kfree(slabs_by_inuse); + if (slabs_by_inuse != &empty_slabs) + kfree(slabs_by_inuse); return 0; }
@@@ -3588,6 -3569,7 +3571,7 @@@ static struct kmem_cache * __init boots { int node; struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + struct kmem_cache_node *n;
memcpy(s, static_cache, kmem_cache->object_size);
@@@ -3597,19 -3579,16 +3581,16 @@@ * IPIs around. */ __flush_cpu_slab(s, smp_processor_id()); - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + for_each_kmem_cache_node(s, node, n) { struct page *p;
- if (n) { - list_for_each_entry(p, &n->partial, lru) - p->slab_cache = s; + list_for_each_entry(p, &n->partial, lru) + p->slab_cache = s;
#ifdef CONFIG_SLUB_DEBUG - list_for_each_entry(p, &n->full, lru) - p->slab_cache = s; + list_for_each_entry(p, &n->full, lru) + p->slab_cache = s; #endif - } } list_add(&s->list, &slab_caches); return s; @@@ -3962,16 -3941,14 +3943,14 @@@ static long validate_slab_cache(struct unsigned long count = 0; unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL); + struct kmem_cache_node *n;
if (!map) return -ENOMEM;
flush_all(s); - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); - + for_each_kmem_cache_node(s, node, n) count += validate_slab_node(s, n, map); - } kfree(map); return count; } @@@ -4125,6 -4102,7 +4104,7 @@@ static int list_locations(struct kmem_c int node; unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL); + struct kmem_cache_node *n;
if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), GFP_TEMPORARY)) { @@@ -4134,8 -4112,7 +4114,7 @@@ /* Push back cpu slabs */ flush_all(s);
- for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + for_each_kmem_cache_node(s, node, n) { unsigned long flags; struct page *page;
@@@ -4207,7 -4184,7 +4186,7 @@@ #endif
#ifdef SLUB_RESILIENCY_TEST - static void resiliency_test(void) + static void __init resiliency_test(void) { u8 *p;
@@@ -4334,8 -4311,9 +4313,9 @@@ static ssize_t show_slab_objects(struc get_online_mems(); #ifdef CONFIG_SLUB_DEBUG if (flags & SO_ALL) { - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + struct kmem_cache_node *n; + + for_each_kmem_cache_node(s, node, n) {
if (flags & SO_TOTAL) x = atomic_long_read(&n->total_objects); @@@ -4351,9 -4329,9 +4331,9 @@@ } else #endif if (flags & SO_PARTIAL) { - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + struct kmem_cache_node *n;
+ for_each_kmem_cache_node(s, node, n) { if (flags & SO_TOTAL) x = count_partial(n, count_total); else if (flags & SO_OBJECTS) @@@ -4366,7 -4344,7 +4346,7 @@@ } x = sprintf(buf, "%lu", total); #ifdef CONFIG_NUMA - for_each_node_state(node, N_NORMAL_MEMORY) + for (node = 0; node < nr_node_ids; node++) if (nodes[node]) x += sprintf(buf + x, " N%d=%lu", node, nodes[node]); @@@ -4380,16 -4358,12 +4360,12 @@@ static int any_slab_objects(struct kmem_cache *s) { int node; + struct kmem_cache_node *n;
- for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); - - if (!n) - continue; - + for_each_kmem_cache_node(s, node, n) if (atomic_long_read(&n->total_objects)) return 1; - } + return 0; } #endif @@@ -5344,13 -5318,9 +5320,9 @@@ void get_slabinfo(struct kmem_cache *s unsigned long nr_objs = 0; unsigned long nr_free = 0; int node; + struct kmem_cache_node *n;
- for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); - - if (!n) - continue; - + for_each_kmem_cache_node(s, node, n) { nr_slabs += node_nr_slabs(n); nr_objs += node_nr_objs(n); nr_free += count_partial(n, count_free); diff --combined net/bridge/br_multicast.c index b4845f4,d9c4f57..7751c92 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@@ -1174,7 -1174,7 +1174,7 @@@ static void br_multicast_add_router(str }
if (slot) - hlist_add_after_rcu(slot, &port->rlist); + hlist_add_behind_rcu(&port->rlist, slot); else hlist_add_head_rcu(&port->rlist, &br->router_list); } @@@ -2216,43 -2216,6 +2216,43 @@@ unlock EXPORT_SYMBOL_GPL(br_multicast_list_adjacent);
/** + * br_multicast_has_querier_anywhere - Checks for a querier on a bridge + * @dev: The bridge port providing the bridge on which to check for a querier + * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6 + * + * Checks whether the given interface has a bridge on top and if so returns + * true if a valid querier exists anywhere on the bridged link layer. + * Otherwise returns false. + */ +bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto) +{ + struct net_bridge *br; + struct net_bridge_port *port; + struct ethhdr eth; + bool ret = false; + + rcu_read_lock(); + if (!br_port_exists(dev)) + goto unlock; + + port = br_port_get_rcu(dev); + if (!port || !port->br) + goto unlock; + + br = port->br; + + memset(ð, 0, sizeof(eth)); + eth.h_proto = htons(proto); + + ret = br_multicast_querier_exists(br, ð); + +unlock: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(br_multicast_has_querier_anywhere); + +/** * br_multicast_has_querier_adjacent - Checks for a querier behind a bridge port * @dev: The bridge port adjacent to which to check for a querier * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6 diff --combined net/xfrm/xfrm_policy.c index 0525d78,92cb08d..beeed60 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@@ -389,7 -389,7 +389,7 @@@ redo if (h != h0) continue; hlist_del(&pol->bydst); - hlist_add_after(entry0, &pol->bydst); + hlist_add_behind(&pol->bydst, entry0); } entry0 = &pol->bydst; } @@@ -654,7 -654,7 +654,7 @@@ int xfrm_policy_insert(int dir, struct break; } if (newpos) - hlist_add_after(newpos, &policy->bydst); + hlist_add_behind(&policy->bydst, newpos); else hlist_add_head(&policy->bydst, chain); xfrm_pol_hold(policy); @@@ -2097,8 -2097,6 +2097,8 @@@ struct dst_entry *xfrm_lookup(struct ne goto no_transform; }
+ dst_hold(&xdst->u.dst); + xdst->u.dst.flags |= DST_NOCACHE; route = xdst->route; } }