[linux-next] LinuxNextTracking branch, master, updated. next-20140724

24 Jul 2014

The following commit has been merged in the master branch:
commit 626b9a42eb4f3cb5bd3772604f2e092740456bae
Merge: d1a83f28255a5dcac94e11555199d068e7dbff7a 76e1488440013a0d737fbb9d1f8efe226138f7f0
Author: Stephen Rothwell <sfr@canb.auug.org.au>
Date:   Thu Jul 24 18:47:14 2014 +1000
Merge branch 'akpm-current/current'
Conflicts:
    	arch/arm64/Kconfig
    	mm/shmem.c
diff --combined Documentation/devicetree/bindings/i2c/trivial-devices.txt
index 37803eb,c75046a..6af570e

--- a/Documentation/devicetree/bindings/i2c/trivial-devices.txt
+++ b/Documentation/devicetree/bindings/i2c/trivial-devices.txt
@@@ -50,7 -50,6 +50,7 @@@ epson,rx8581		I2C-BUS INTERFACE REAL TI
  fsl,mag3110		MAG3110: Xtrinsic High Accuracy, 3D Magnetometer
  fsl,mc13892		MC13892: Power Management Integrated Circuit (PMIC) for i.MX35/51
  fsl,mma8450		MMA8450Q: Xtrinsic Low-power, 3-axis Xtrinsic Accelerometer
 +fsl,mma8452		MMA8452Q: 3-axis 12-bit / 8-bit Digital Accelerometer
  fsl,mpr121		MPR121: Proximity Capacitive Touch Sensor Controller
  fsl,sgtl5000		SGTL5000: Ultra Low-Power Audio Codec
  gmt,g751		G751: Digital Temperature Sensor and Thermal Watchdog with Two-Wire Interface
@@@ -70,6 -69,7 +70,7 @@@ nuvoton,npct501		i2c trusted platform m
  nxp,pca9556		Octal SMBus and I2C registered interface
  nxp,pca9557		8-bit I2C-bus and SMBus I/O port with reset
  nxp,pcf8563		Real-time clock/calendar
+ nxp,pcf85063		Tiny Real-Time Clock
  ovti,ov5642		OV5642: Color CMOS QSXGA (5-megapixel) Image Sensor with OmniBSI and Embedded TrueFocus
  pericom,pt7c4338	Real-time Clock Module
  plx,pex8648		48-Lane, 12-Port PCI Express Gen 2 (5.0 GT/s) Switch
@@@ -84,6 -84,5 +85,6 @@@ stm,m41t80		M41T80 - SERIAL ACCESS RTC 
  taos,tsl2550		Ambient Light Sensor with SMBUS/Two Wire Serial Interface
  ti,tsc2003		I2C Touch-Screen Controller
  ti,tmp102		Low Power Digital Temperature Sensor with SMBUS/Two Wire Serial Interface
 +ti,tmp103		Low Power Digital Temperature Sensor with SMBUS/Two Wire Serial Interface
  ti,tmp275		Digital Temperature Sensor
  winbond,wpct301		i2c trusted platform module (TPM)
diff --combined Documentation/kernel-parameters.txt
index d2fc335,6824f37..f1d8047
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@@ -566,11 -566,6 +566,11 @@@ bytes respectively. Such letter suffixe
    		possible to determine what the correct size should be.
    		This option provides an override for these situations.
+	ca_keys=	[KEYS] This parameter identifies a specific key(s) on
 +			the system trusted keyring to be used for certificate
 +			trust validation.
 +			format: { id:<keyid> | builtin }
 +
    ccw_timeout_log [S390]
    		See Documentation/s390/CommonIO for details.
@@@ -1102,12 -1097,6 +1102,12 @@@
    		that can be changed at run time by the
    		set_graph_function file in the debugfs tracing directory.
+	ftrace_graph_notrace=[function-list]
 +			[FTRACE] Do not trace from the functions specified in
 +			function-list.  This list is a comma separated list of
 +			functions that can be changed at run time by the
 +			set_graph_notrace file in the debugfs tracing directory.
 +
    gamecon.map[2|3]=
    		[HW,JOY] Multisystem joystick and NES/SNES/PSX pad
    		support via parallel port (up to 5 devices per port)
@@@ -1324,23 -1313,6 +1324,23 @@@
    		Formats: { "ima" | "ima-ng" }
    		Default: "ima-ng"
+	ima.ahash_minsize= [IMA] Minimum file size for asynchronous hash usage
 +			Format: <min_file_size>
 +			Set the minimal file size for using asynchronous hash.
 +			If left unspecified, ahash usage is disabled.
 +
 +			ahash performance varies for different data sizes on
 +			different crypto accelerators. This option can be used
 +			to achieve the best performance for a particular HW.
 +
 +	ima.ahash_bufsize= [IMA] Asynchronous hash buffer size
 +			Format: <bufsize>
 +			Set hashing buffer size. Default: 4k.
 +
 +			ahash performance varies for different chunk sizes on
 +			different crypto accelerators. This option can be used
 +			to achieve the best performance for a particular HW.
 +
    init=		[KNL]
    		Format: <full_path>
    		Run specified binary instead of /sbin/init as init
@@@ -1444,6 -1416,10 +1444,6 @@@
    ip=		[IP_PNP]
    		See Documentation/filesystems/nfs/nfsroot.txt.
-	ip2=		[HW] Set IO/IRQ pairs for up to 4 IntelliPort boards
 -			See comment before ip2_setup() in
 -			drivers/char/ip2/ip2base.c.
 -
    irqfixup	[HW]
    		When an interrupt is not handled search all handlers
    		for it. Intended to get systems with badly broken
@@@ -1716,8 -1692,12 +1716,12 @@@
    		7 (KERN_DEBUG)		debug-level messages
log_buf_len=n[KMG]	Sets the size of the printk ring buffer,
- 			in bytes.  n must be a power of two.  The default
- 			size is set in the kernel config file.
+ 			in bytes.  n must be a power of two and greater
+ 			than the minimal size. The minimal size is defined
+ 			by LOG_BUF_SHIFT kernel config parameter. There is
+ 			also CONFIG_LOG_CPU_MAX_BUF_SHIFT config parameter
+ 			that allows increasing the default size depending on
+ 			the number of CPUs. See init/Kconfig for more details.
logo.nologo	[FB] Disables display of the built-in Linux logo.
    		This may be used to provide more screen space for
@@@ -2190,21 -2170,6 +2194,21 @@@
    		and restore using xsave. The kernel will fallback to
    		enabling legacy floating-point and sse state.
+	noxsaveopt	[X86] Disables xsaveopt used in saving x86 extended
 +			register states. The kernel will fall back to use
 +			xsave to save the states. By using this parameter,
 +			performance of saving the states is degraded because
 +			xsave doesn't support modified optimization while
 +			xsaveopt supports it on xsaveopt enabled systems.
 +
 +	noxsaves	[X86] Disables xsaves and xrstors used in saving and
 +			restoring x86 extended register state in compacted
 +			form of xsave area. The kernel will fall back to use
 +			xsaveopt and xrstor to save and restore the states
 +			in standard form of xsave area. By using this
 +			parameter, xsave area per process might occupy more
 +			memory on xsaves enabled systems.
 +
    eagerfpu=	[X86]
    		on	enable eager fpu restore
    		off	disable eager fpu restore
@@@ -2846,13 -2811,6 +2850,13 @@@
    		quiescent states.  Units are jiffies, minimum
    		value is one, and maximum value is HZ.
+	rcutree.rcu_nocb_leader_stride= [KNL]
 +			Set the number of NOCB kthread groups, which
 +			defaults to the square root of the number of
 +			CPUs.  Larger numbers reduce the wakeup overhead
 +			on the per-CPU grace-period kthreads, but increase
 +			that same overhead on each group's leader.
 +
    rcutree.qhimark= [KNL]
    		Set threshold of queued RCU callbacks beyond which
    		batch limiting is disabled.
@@@ -3069,13 -3027,6 +3073,13 @@@
S		[KNL] Run init in single mode
+	s390_iommu=	[HW,S390]
 +			Set s390 IOTLB flushing mode
 +		strict
 +			With strict flushing every unmap operation will result in
 +			an IOTLB flush. Default is lazy flushing before reuse,
 +			which is faster.
 +
    sa1100ir	[NET]
    		See drivers/net/irda/sa1100_ir.c.
@@@ -3750,10 -3701,6 +3754,10 @@@
    		Disables the ticketlock slowpath using Xen PV
    		optimizations.
+	xen_nopv	[X86]
 +			Disables the PV optimizations forcing the HVM guest to
 +			run as a generic HVM guest with no PV drivers.
 +
    xirc2ps_cs=	[NET,PCMCIA]
    		Format:
    		<irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
diff --combined Makefile
index c02f4ea,a4b34fe..5e4e225
--- a/Makefile
+++ b/Makefile
@@@ -360,14 -360,9 +360,14 @@@ include $(srctree)/scripts/Kbuild.inclu
  # Make variables (CC, etc...)
  AS		= $(CROSS_COMPILE)as
  LD		= $(CROSS_COMPILE)ld
 +LDFINAL	= $(LD)
  CC		= $(CROSS_COMPILE)gcc
  CPP		= $(CC) -E
 +ifdef CONFIG_LTO
 +AR		= $(CROSS_COMPILE)gcc-ar
 +else
  AR		= $(CROSS_COMPILE)ar
 +endif
  NM		= $(CROSS_COMPILE)nm
  STRIP		= $(CROSS_COMPILE)strip
  OBJCOPY		= $(CROSS_COMPILE)objcopy
@@@ -377,7 -372,6 +377,7 @@@ GENKSYMS	= scripts/genksyms/genksym
  INSTALLKERNEL  := installkernel
  DEPMOD		= /sbin/depmod
  PERL		= perl
 +PYTHON		= python
  CHECK		= sparse
CHECKFLAGS     := -D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ \
@@@ -427,8 -421,8 +427,8 @@@ KERNELVERSION = $(VERSION)$(if $(PATCHL
export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION
  export ARCH SRCARCH CONFIG_SHELL HOSTCC HOSTCFLAGS CROSS_COMPILE AS LD CC
 -export CPP AR NM STRIP OBJCOPY OBJDUMP
 -export MAKE AWK GENKSYMS INSTALLKERNEL PERL UTS_MACHINE
 +export CPP AR NM STRIP OBJCOPY OBJDUMP LDFINAL
 +export MAKE AWK GENKSYMS INSTALLKERNEL PERL PYTHON UTS_MACHINE
  export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS
export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS
@@@ -438,17 -432,6 +438,17 @@@ export KBUILD_AFLAGS_MODULE KBUILD_CFLA
  export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL
  export KBUILD_ARFLAGS
+ifdef CONFIG_LTO
 +# LTO gcc creates a lot of files in TMPDIR, and with /tmp as tmpfs
 +# it's easy to drive the machine OOM. Use the object directory
 +# instead.
 +ifndef TMPDIR
 +TMPDIR ?= $(objtree)
 +export TMPDIR
 +$(info setting TMPDIR=$(objtree) for LTO build)
 +endif
 +endif
 +
  # When compiling out-of-tree modules, put MODVERDIR in the module
  # tree rather than in the kernel tree. The kernel tree might
  # even be read-only.
@@@ -638,6 -621,9 +638,9 @@@ els
  KBUILD_CFLAGS	+= -O2
  endif
+ # Tell gcc to never replace conditional load with a non-conditional one
+ KBUILD_CFLAGS	+= $(call cc-option,--param=allow-store-data-races=0)
+ 
  ifdef CONFIG_READABLE_ASM
  # Disable optimizations that make assembler listings hard to read.
  # reorder blocks reorders the control in the function
@@@ -653,6 -639,22 +656,22 @@@ KBUILD_CFLAGS += $(call cc-option,-Wfra
  endif
# Handle stack protector mode.
+ #
+ # Since kbuild can potentially perform two passes (first with the old
+ # .config values and then with updated .config values), we cannot error out
+ # if a desired compiler option is unsupported. If we were to error, kbuild
+ # could never get to the second pass and actually notice that we changed
+ # the option to something that was supported.
+ #
+ # Additionally, we don't want to fallback and/or silently change which compiler
+ # flags will be used, since that leads to producing kernels with different
+ # security feature characteristics depending on the compiler used. ("But I
+ # selected CC_STACKPROTECTOR_STRONG! Why did it build with _REGULAR?!")
+ #
+ # The middle ground is to warn here so that the failed option is obvious, but
+ # to let the build fail with bad compiler flags so that we can't produce a
+ # kernel when there is a CONFIG and compiler mismatch.
+ #
  ifdef CONFIG_CC_STACKPROTECTOR_REGULAR
    stackp-flag := -fstack-protector
    ifeq ($(call cc-option, $(stackp-flag)),)
@@@ -768,7 -770,6 +787,7 @@@ ifeq ($(shell $(CONFIG_SHELL) $(srctree
  endif
include $(srctree)/scripts/Makefile.extrawarn
 +include ${srctree}/scripts/Makefile.lto
# Add user supplied CPPFLAGS, AFLAGS and CFLAGS as the last assignments
  KBUILD_CPPFLAGS += $(KCPPFLAGS)
@@@ -1238,9 -1239,9 +1257,9 @@@ help
    @echo  '  tags/TAGS	  - Generate tags file for editors'
    @echo  '  cscope	  - Generate cscope index'
    @echo  '  gtags           - Generate GNU GLOBAL index'
 -	@echo  '  kernelrelease	  - Output the release version string'
 -	@echo  '  kernelversion	  - Output the version stored in Makefile'
 -	@echo  '  image_name	  - Output the image name'
 +	@echo  '  kernelrelease	  - Output the release version string (use with make -s)'
 +	@echo  '  kernelversion	  - Output the version stored in Makefile (use with make -s)'
 +	@echo  '  image_name	  - Output the image name (use with make -s)'
    @echo  '  headers_install - Install sanitised kernel headers to INSTALL_HDR_PATH'; \
     echo  '                    (default: $(INSTALL_HDR_PATH))'; \
     echo  ''
diff --combined arch/arm/Kconfig
index f0ee653,551e526..5904ac5
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@@ -84,6 -84,7 +84,7 @@@ config AR
      http://www.arm.linux.org.uk/.
config ARM_HAS_SG_CHAIN
+ 	select ARCH_HAS_SG_CHAIN
    bool
config NEED_SG_DMA_LENGTH
@@@ -240,6 -241,13 +241,6 @@@ config ARM_PATCH_PHYS_VIR
      this feature (eg, building a kernel for a single machine) and
      you need to shrink the kernel to the minimal size.
-config NEED_MACH_GPIO_H
 -	bool
 -	help
 -	  Select this when mach/gpio.h is required to provide special
 -	  definitions for this platform. The need for mach/gpio.h should
 -	  be avoided when possible.
 -
  config NEED_MACH_IO_H
    bool
    help
@@@ -314,6 -322,7 +315,6 @@@ config ARCH_INTEGRATO
    select HAVE_TCM
    select ICST
    select MULTI_IRQ_HANDLER
 -	select NEED_MACH_MEMORY_H
    select PLAT_VERSATILE
    select SPARSE_IRQ
    select USE_OF
@@@ -333,6 -342,7 +334,6 @@@ config ARCH_REALVIE
    select ICST
    select NEED_MACH_MEMORY_H
    select PLAT_VERSATILE
 -	select PLAT_VERSATILE_CLCD
    help
      This enables support for ARM Ltd RealView boards.
@@@ -347,6 -357,7 +348,6 @@@ config ARCH_VERSATIL
    select HAVE_MACH_CLKDEV
    select ICST
    select PLAT_VERSATILE
 -	select PLAT_VERSATILE_CLCD
    select PLAT_VERSATILE_CLOCK
    select VERSATILE_FPGA_IRQ
    help
@@@ -519,6 -530,21 +520,6 @@@ config ARCH_DOV
    help
      Support for the Marvell Dove SoC 88AP510
-config ARCH_KIRKWOOD
 -	bool "Marvell Kirkwood"
 -	select ARCH_REQUIRE_GPIOLIB
 -	select CPU_FEROCEON
 -	select GENERIC_CLOCKEVENTS
 -	select MVEBU_MBUS
 -	select PCI
 -	select PCI_QUIRKS
 -	select PINCTRL
 -	select PINCTRL_KIRKWOOD
 -	select PLAT_ORION_LEGACY
 -	help
 -	  Support for the following Marvell Kirkwood series SoCs:
 -	  88F6180, 88F6192 and 88F6281.
 -
  config ARCH_MV78XX0
    bool "Marvell MV78xx0"
    select ARCH_REQUIRE_GPIOLIB
@@@ -610,7 -636,6 +611,7 @@@ config ARCH_PX
    select AUTO_ZRELADDR
    select CLKDEV_LOOKUP
    select CLKSRC_MMIO
 +	select CLKSRC_OF
    select GENERIC_CLOCKEVENTS
    select GPIO_PXA
    select HAVE_IDE
@@@ -735,6 -760,61 +736,6 @@@ config ARCH_S3C64X
    help
      Samsung S3C64XX series based systems
-config ARCH_S5P64X0
 -	bool "Samsung S5P6440 S5P6450"
 -	select ATAGS
 -	select CLKDEV_LOOKUP
 -	select CLKSRC_SAMSUNG_PWM
 -	select CPU_V6
 -	select GENERIC_CLOCKEVENTS
 -	select GPIO_SAMSUNG
 -	select HAVE_S3C2410_I2C if I2C
 -	select HAVE_S3C2410_WATCHDOG if WATCHDOG
 -	select HAVE_S3C_RTC if RTC_CLASS
 -	select NEED_MACH_GPIO_H
 -	select SAMSUNG_ATAGS
 -	select SAMSUNG_WDT_RESET
 -	help
 -	  Samsung S5P64X0 CPU based systems, such as the Samsung SMDK6440,
 -	  SMDK6450.
 -
 -config ARCH_S5PC100
 -	bool "Samsung S5PC100"
 -	select ARCH_REQUIRE_GPIOLIB
 -	select ATAGS
 -	select CLKDEV_LOOKUP
 -	select CLKSRC_SAMSUNG_PWM
 -	select CPU_V7
 -	select GENERIC_CLOCKEVENTS
 -	select GPIO_SAMSUNG
 -	select HAVE_S3C2410_I2C if I2C
 -	select HAVE_S3C2410_WATCHDOG if WATCHDOG
 -	select HAVE_S3C_RTC if RTC_CLASS
 -	select NEED_MACH_GPIO_H
 -	select SAMSUNG_ATAGS
 -	select SAMSUNG_WDT_RESET
 -	help
 -	  Samsung S5PC100 series based systems
 -
 -config ARCH_S5PV210
 -	bool "Samsung S5PV210/S5PC110"
 -	select ARCH_HAS_HOLES_MEMORYMODEL
 -	select ARCH_SPARSEMEM_ENABLE
 -	select ATAGS
 -	select CLKDEV_LOOKUP
 -	select CLKSRC_SAMSUNG_PWM
 -	select CPU_V7
 -	select GENERIC_CLOCKEVENTS
 -	select GPIO_SAMSUNG
 -	select HAVE_S3C2410_I2C if I2C
 -	select HAVE_S3C2410_WATCHDOG if WATCHDOG
 -	select HAVE_S3C_RTC if RTC_CLASS
 -	select NEED_MACH_GPIO_H
 -	select NEED_MACH_MEMORY_H
 -	select SAMSUNG_ATAGS
 -	help
 -	  Samsung S5PV210/S5PC110 series based systems
 -
  config ARCH_DAVINCI
    bool "TI DaVinci"
    select ARCH_HAS_HOLES_MEMORYMODEL
@@@ -873,6 -953,8 +874,6 @@@ source "arch/arm/mach-ixp4xx/Kconfig
source "arch/arm/mach-keystone/Kconfig"
-source "arch/arm/mach-kirkwood/Kconfig"
 -
  source "arch/arm/mach-ks8695/Kconfig"
source "arch/arm/mach-msm/Kconfig"
@@@ -883,8 -965,6 +884,8 @@@ source "arch/arm/mach-mv78xx0/Kconfig
source "arch/arm/mach-imx/Kconfig"
+source "arch/arm/mach-mediatek/Kconfig"
 +
  source "arch/arm/mach-mxs/Kconfig"
source "arch/arm/mach-netx/Kconfig"
@@@ -926,6 -1006,10 +927,6 @@@ source "arch/arm/mach-s3c24xx/Kconfig
source "arch/arm/mach-s3c64xx/Kconfig"
-source "arch/arm/mach-s5p64x0/Kconfig"
 -
 -source "arch/arm/mach-s5pc100/Kconfig"
 -
  source "arch/arm/mach-s5pv210/Kconfig"
source "arch/arm/mach-exynos/Kconfig"
@@@ -1472,8 -1556,7 +1473,8 @@@ config ARM_PSC
  config ARCH_NR_GPIO
    int
    default 1024 if ARCH_SHMOBILE || ARCH_TEGRA
 -	default 512 if ARCH_EXYNOS || ARCH_KEYSTONE || SOC_OMAP5 || SOC_DRA7XX || ARCH_S3C24XX || ARCH_S3C64XX
 +	default 512 if ARCH_EXYNOS || ARCH_KEYSTONE || SOC_OMAP5 || \
 +		SOC_DRA7XX || ARCH_S3C24XX || ARCH_S3C64XX || ARCH_S5PV210
    default 416 if ARCH_SUNXI
    default 392 if ARCH_U8500
    default 352 if ARCH_VT8500
@@@ -1488,7 -1571,7 +1489,7 @@@ source kernel/Kconfig.preemp
config HZ_FIXED
    int
 -	default 200 if ARCH_EBSA110 || ARCH_S3C24XX || ARCH_S5P64X0 || \
 +	default 200 if ARCH_EBSA110 || ARCH_S3C24XX || \
    	ARCH_S5PV210 || ARCH_EXYNOS4
    default AT91_TIMER_HZ if ARCH_AT91
    default SHMOBILE_TIMER_HZ if ARCH_SHMOBILE_LEGACY
@@@ -2113,6 -2196,7 +2114,6 @@@ menu "Power management options
  source "kernel/power/Kconfig"
config ARCH_SUSPEND_POSSIBLE
 -	depends on !ARCH_S5PC100
    depends on CPU_ARM920T || CPU_ARM926T || CPU_FEROCEON || CPU_SA1100 || \
    	CPU_V6 || CPU_V6K || CPU_V7 || CPU_V7M || CPU_XSC3 || CPU_XSCALE || CPU_MOHAWK
    def_bool y
diff --combined arch/arm/mm/dma-mapping.c
index 1f88db0,3116880..7a996aa
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@@ -26,6 -26,7 +26,7 @@@
  #include <linux/io.h>
  #include <linux/vmalloc.h>
  #include <linux/sizes.h>
+ #include <linux/cma.h>
#include <asm/memory.h>
  #include <asm/highmem.h>
@@@ -461,21 -462,12 +462,21 @@@ void __init dma_contiguous_remap(void
    	map.type = MT_MEMORY_DMA_READY;
/*
 -		 * Clear previous low-memory mapping
 +		 * Clear previous low-memory mapping to ensure that the
 +		 * TLB does not see any conflicting entries, then flush
 +		 * the TLB of the old entries before creating new mappings.
 +		 *
 +		 * This ensures that any speculatively loaded TLB entries
 +		 * (even though they may be rare) can not cause any problems,
 +		 * and ensures that this code is architecturally compliant.
    	 */
    	for (addr = __phys_to_virt(start); addr < __phys_to_virt(end);
    	     addr += PMD_SIZE)
    		pmd_clear(pmd_off_k(addr));
+		flush_tlb_kernel_range(__phys_to_virt(start),
 +				       __phys_to_virt(end));
 +
    	iotable_init(&map, 1);
    }
  }
diff --combined arch/arm64/Kconfig
index 555ad3c,7bc7b74..4e40949
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@@ -1,6 -1,8 +1,7 @@@
  config ARM64
    def_bool y
    select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
 -	select ARCH_HAS_OPP
+ 	select ARCH_HAS_SG_CHAIN
    select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
    select ARCH_USE_CMPXCHG_LOCKREF
    select ARCH_SUPPORTS_ATOMIC_RMW
@@@ -10,9 -12,6 +11,9 @@@
    select ARM_AMBA
    select ARM_ARCH_TIMER
    select ARM_GIC
 +	select ARM_GIC_V2M if (PCI && PCI_MSI)
 +	select ARM_GIC_V3
 +	select AUDIT_ARCH_COMPAT_GENERIC
    select BUILDTIME_EXTABLE_SORT
    select CLONE_BACKWARDS
    select COMMON_CLK
@@@ -31,12 -30,10 +32,12 @@@
    select GENERIC_STRNLEN_USER
    select GENERIC_TIME_VSYSCALL
    select HARDIRQS_SW_RESEND
 +	select HAVE_ARCH_AUDITSYSCALL
    select HAVE_ARCH_JUMP_LABEL
    select HAVE_ARCH_KGDB
    select HAVE_ARCH_TRACEHOOK
    select HAVE_C_RECORDMCOUNT
 +	select HAVE_CC_STACKPROTECTOR
    select HAVE_DEBUG_BUGVERBOSE
    select HAVE_DEBUG_KMEMLEAK
    select HAVE_DMA_API_DEBUG
@@@ -67,7 -64,6 +68,7 @@@
    select RTC_LIB
    select SPARSE_IRQ
    select SYSCTL_EXCEPTION_TRACE
 +	select HAVE_CONTEXT_TRACKING
    help
      ARM 64-bit (AArch64) Linux support.
@@@ -160,63 -156,14 +161,63 @@@ endmen
menu "Kernel Features"
+choice
 +	prompt "Page size"
 +	default ARM64_4K_PAGES
 +	help
 +	  Page size (translation granule) configuration.
 +
 +config ARM64_4K_PAGES
 +	bool "4KB"
 +	help
 +	  This feature enables 4KB pages support.
 +
  config ARM64_64K_PAGES
 -	bool "Enable 64KB pages support"
 +	bool "64KB"
    help
      This feature enables 64KB pages support (4KB by default)
      allowing only two levels of page tables and faster TLB
      look-up. AArch32 emulation is not available when this feature
      is enabled.
+endchoice
 +
 +choice
 +	prompt "Virtual address space size"
 +	default ARM64_VA_BITS_39 if ARM64_4K_PAGES
 +	default ARM64_VA_BITS_42 if ARM64_64K_PAGES
 +	help
 +	  Allows choosing one of multiple possible virtual address
 +	  space sizes. The level of translation table is determined by
 +	  a combination of page size and virtual address space size.
 +
 +config ARM64_VA_BITS_39
 +	bool "39-bit"
 +	depends on ARM64_4K_PAGES
 +
 +config ARM64_VA_BITS_42
 +	bool "42-bit"
 +	depends on ARM64_64K_PAGES
 +
 +config ARM64_VA_BITS_48
 +	bool "48-bit"
 +	depends on BROKEN
 +
 +endchoice
 +
 +config ARM64_VA_BITS
 +	int
 +	default 39 if ARM64_VA_BITS_39
 +	default 42 if ARM64_VA_BITS_42
 +	default 48 if ARM64_VA_BITS_48
 +
 +config ARM64_PGTABLE_LEVELS
 +	int
 +	default 2 if ARM64_64K_PAGES && ARM64_VA_BITS_42
 +	default 3 if ARM64_64K_PAGES && ARM64_VA_BITS_48
 +	default 3 if ARM64_4K_PAGES && ARM64_VA_BITS_39
 +	default 4 if ARM64_4K_PAGES && ARM64_VA_BITS_48
 +
  config CPU_BIG_ENDIAN
         bool "Build big-endian kernel"
         help
@@@ -362,17 -309,6 +363,17 @@@ config EF
      allow the kernel to be booted as an EFI application. This
      is only useful on systems that have UEFI firmware.
+config DMI
 +	bool "Enable support for SMBIOS (DMI) tables"
 +	depends on EFI
 +	default y
 +	help
 +	  This enables SMBIOS/DMI feature for systems.
 +
 +	  This option is only useful on systems that have UEFI firmware.
 +	  However, even with this option, the resultant kernel should
 +	  continue to boot on existing non-UEFI platforms.
 +
  endmenu
menu "Userspace binary formats"
diff --combined arch/ia64/Kconfig
index 44a6915,56986a0..c84c88b
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@@ -10,7 -10,6 +10,7 @@@ config IA6
    select ARCH_MIGHT_HAVE_PC_SERIO
    select PCI if (!IA64_HP_SIM)
    select ACPI if (!IA64_HP_SIM)
 +	select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
    select PM if (!IA64_HP_SIM)
    select HAVE_UNSTABLE_SCHED_CLOCK
    select HAVE_IDE
@@@ -28,6 -27,7 +28,7 @@@
    select HAVE_MEMBLOCK
    select HAVE_MEMBLOCK_NODE_MAP
    select HAVE_VIRT_CPU_ACCOUNTING
+ 	select ARCH_HAS_SG_CHAIN
    select VIRT_TO_BUS
    select ARCH_DISCARD_MEMBLOCK
    select GENERIC_IRQ_PROBE
diff --combined arch/powerpc/kvm/book3s_64_mmu_hv.c
index 09a47ae,a01744f..ad463f8
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@@ -37,8 -37,6 +37,6 @@@
  #include <asm/ppc-opcode.h>
  #include <asm/cputable.h>
- #include "book3s_hv_cma.h"
- 
  /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
  #define MAX_LPID_970	63
@@@ -64,10 -62,10 +62,10 @@@ long kvmppc_alloc_hpt(struct kvm *kvm, 
    }
kvm->arch.hpt_cma_alloc = 0;
    page = kvm_alloc_hpt(1 << (order - PAGE_SHIFT));
    if (page) {
    	hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
+ 		memset((void *)hpt, 0, (1 << order));
    	kvm->arch.hpt_cma_alloc = 1;
    }
@@@ -450,7 -448,7 +448,7 @@@ static int kvmppc_mmu_book3s_64_hv_xlat
    unsigned long slb_v;
    unsigned long pp, key;
    unsigned long v, gr;
 -	unsigned long *hptep;
 +	__be64 *hptep;
    int index;
    int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);
@@@ -473,13 -471,13 +471,13 @@@
    	preempt_enable();
    	return -ENOENT;
    }
 -	hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
 -	v = hptep[0] & ~HPTE_V_HVLOCK;
 +	hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
 +	v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
    gr = kvm->arch.revmap[index].guest_rpte;
/* Unlock the HPTE */
    asm volatile("lwsync" : : : "memory");
 -	hptep[0] = v;
 +	hptep[0] = cpu_to_be64(v);
    preempt_enable();
gpte->eaddr = eaddr;
@@@ -583,8 -581,7 +581,8 @@@ int kvmppc_book3s_hv_page_fault(struct 
    			unsigned long ea, unsigned long dsisr)
  {
    struct kvm *kvm = vcpu->kvm;
 -	unsigned long *hptep, hpte[3], r;
 +	unsigned long hpte[3], r;
 +	__be64 *hptep;
    unsigned long mmu_seq, psize, pte_size;
    unsigned long gpa_base, gfn_base;
    unsigned long gpa, gfn, hva, pfn;
@@@ -607,16 -604,16 +605,16 @@@
    if (ea != vcpu->arch.pgfault_addr)
    	return RESUME_GUEST;
    index = vcpu->arch.pgfault_index;
 -	hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
 +	hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
    rev = &kvm->arch.revmap[index];
    preempt_disable();
    while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
    	cpu_relax();
 -	hpte[0] = hptep[0] & ~HPTE_V_HVLOCK;
 -	hpte[1] = hptep[1];
 +	hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
 +	hpte[1] = be64_to_cpu(hptep[1]);
    hpte[2] = r = rev->guest_rpte;
    asm volatile("lwsync" : : : "memory");
 -	hptep[0] = hpte[0];
 +	hptep[0] = cpu_to_be64(hpte[0]);
    preempt_enable();
if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
@@@ -732,9 -729,8 +730,9 @@@
    preempt_disable();
    while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
    	cpu_relax();
 -	if ((hptep[0] & ~HPTE_V_HVLOCK) != hpte[0] || hptep[1] != hpte[1] ||
 -	    rev->guest_rpte != hpte[2])
 +	if ((be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK) != hpte[0] ||
 +		be64_to_cpu(hptep[1]) != hpte[1] ||
 +		rev->guest_rpte != hpte[2])
    	/* HPTE has been changed under us; let the guest retry */
    	goto out_unlock;
    hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
@@@ -754,20 -750,20 +752,20 @@@
    rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
    r &= rcbits | ~(HPTE_R_R | HPTE_R_C);
-	if (hptep[0] & HPTE_V_VALID) {
 +	if (be64_to_cpu(hptep[0]) & HPTE_V_VALID) {
    	/* HPTE was previously valid, so we need to invalidate it */
    	unlock_rmap(rmap);
 -		hptep[0] |= HPTE_V_ABSENT;
 +		hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
    	kvmppc_invalidate_hpte(kvm, hptep, index);
    	/* don't lose previous R and C bits */
 -		r |= hptep[1] & (HPTE_R_R | HPTE_R_C);
 +		r |= be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
    } else {
    	kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
    }
-	hptep[1] = r;
 +	hptep[1] = cpu_to_be64(r);
    eieio();
 -	hptep[0] = hpte[0];
 +	hptep[0] = cpu_to_be64(hpte[0]);
    asm volatile("ptesync" : : : "memory");
    preempt_enable();
    if (page && hpte_is_writable(r))
@@@ -786,7 -782,7 +784,7 @@@
    return ret;
out_unlock:
 -	hptep[0] &= ~HPTE_V_HVLOCK;
 +	hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
    preempt_enable();
    goto out_put;
  }
@@@ -862,7 -858,7 +860,7 @@@ static int kvm_unmap_rmapp(struct kvm *
  {
    struct revmap_entry *rev = kvm->arch.revmap;
    unsigned long h, i, j;
 -	unsigned long *hptep;
 +	__be64 *hptep;
    unsigned long ptel, psize, rcbits;
for (;;) {
@@@ -878,11 -874,11 +876,11 @@@
    	 * rmap chain lock.
    	 */
    	i = *rmapp & KVMPPC_RMAP_INDEX;
 -		hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
 +		hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4));
    	if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
    		/* unlock rmap before spinning on the HPTE lock */
    		unlock_rmap(rmapp);
 -			while (hptep[0] & HPTE_V_HVLOCK)
 +			while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK)
    			cpu_relax();
    		continue;
    	}
@@@ -901,14 -897,14 +899,14 @@@
/* Now check and modify the HPTE */
    	ptel = rev[i].guest_rpte;
 -		psize = hpte_page_size(hptep[0], ptel);
 -		if ((hptep[0] & HPTE_V_VALID) &&
 +		psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel);
 +		if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
    	    hpte_rpn(ptel, psize) == gfn) {
    		if (kvm->arch.using_mmu_notifiers)
 -				hptep[0] |= HPTE_V_ABSENT;
 +				hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
    		kvmppc_invalidate_hpte(kvm, hptep, i);
    		/* Harvest R and C */
 -			rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C);
 +			rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
    		*rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
    		if (rcbits & ~rev[i].guest_rpte) {
    			rev[i].guest_rpte = ptel | rcbits;
@@@ -916,7 -912,7 +914,7 @@@
    		}
    	}
    	unlock_rmap(rmapp);
 -		hptep[0] &= ~HPTE_V_HVLOCK;
 +		hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
    }
    return 0;
  }
@@@ -963,7 -959,7 +961,7 @@@ static int kvm_age_rmapp(struct kvm *kv
  {
    struct revmap_entry *rev = kvm->arch.revmap;
    unsigned long head, i, j;
 -	unsigned long *hptep;
 +	__be64 *hptep;
    int ret = 0;
retry:
@@@ -979,24 -975,23 +977,24 @@@
i = head = *rmapp & KVMPPC_RMAP_INDEX;
    do {
 -		hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
 +		hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4));
    	j = rev[i].forw;
/* If this HPTE isn't referenced, ignore it */
 -		if (!(hptep[1] & HPTE_R_R))
 +		if (!(be64_to_cpu(hptep[1]) & HPTE_R_R))
    		continue;
if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
    		/* unlock rmap before spinning on the HPTE lock */
    		unlock_rmap(rmapp);
 -			while (hptep[0] & HPTE_V_HVLOCK)
 +			while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK)
    			cpu_relax();
    		goto retry;
    	}
/* Now check and modify the HPTE */
 -		if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_R)) {
 +		if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
 +		    (be64_to_cpu(hptep[1]) & HPTE_R_R)) {
    		kvmppc_clear_ref_hpte(kvm, hptep, i);
    		if (!(rev[i].guest_rpte & HPTE_R_R)) {
    			rev[i].guest_rpte |= HPTE_R_R;
@@@ -1004,7 -999,7 +1002,7 @@@
    		}
    		ret = 1;
    	}
 -		hptep[0] &= ~HPTE_V_HVLOCK;
 +		hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
    } while ((i = j) != head);
unlock_rmap(rmapp);
@@@ -1038,7 -1033,7 +1036,7 @@@ static int kvm_test_age_rmapp(struct kv
    	do {
    		hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4));
    		j = rev[i].forw;
 -			if (hp[1] & HPTE_R_R)
 +			if (be64_to_cpu(hp[1]) & HPTE_R_R)
    			goto out;
    	} while ((i = j) != head);
    }
@@@ -1078,7 -1073,7 +1076,7 @@@ static int kvm_test_clear_dirty_npages(
    unsigned long head, i, j;
    unsigned long n;
    unsigned long v, r;
 -	unsigned long *hptep;
 +	__be64 *hptep;
    int npages_dirty = 0;
retry:
@@@ -1094,8 -1089,7 +1092,8 @@@
i = head = *rmapp & KVMPPC_RMAP_INDEX;
    do {
 -		hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
 +		unsigned long hptep1;
 +		hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4));
    	j = rev[i].forw;
/*
@@@ -1112,30 -1106,29 +1110,30 @@@
    	 * Otherwise we need to do the tlbie even if C==0 in
    	 * order to pick up any delayed writeback of C.
    	 */
 -		if (!(hptep[1] & HPTE_R_C) &&
 -		    (!hpte_is_writable(hptep[1]) || vcpus_running(kvm)))
 +		hptep1 = be64_to_cpu(hptep[1]);
 +		if (!(hptep1 & HPTE_R_C) &&
 +		    (!hpte_is_writable(hptep1) || vcpus_running(kvm)))
    		continue;
if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
    		/* unlock rmap before spinning on the HPTE lock */
    		unlock_rmap(rmapp);
 -			while (hptep[0] & HPTE_V_HVLOCK)
 +			while (hptep[0] & cpu_to_be64(HPTE_V_HVLOCK))
    			cpu_relax();
    		goto retry;
    	}
/* Now check and modify the HPTE */
 -		if (!(hptep[0] & HPTE_V_VALID))
 +		if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID)))
    		continue;
/* need to make it temporarily absent so C is stable */
 -		hptep[0] |= HPTE_V_ABSENT;
 +		hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
    	kvmppc_invalidate_hpte(kvm, hptep, i);
 -		v = hptep[0];
 -		r = hptep[1];
 +		v = be64_to_cpu(hptep[0]);
 +		r = be64_to_cpu(hptep[1]);
    	if (r & HPTE_R_C) {
 -			hptep[1] = r & ~HPTE_R_C;
 +			hptep[1] = cpu_to_be64(r & ~HPTE_R_C);
    		if (!(rev[i].guest_rpte & HPTE_R_C)) {
    			rev[i].guest_rpte |= HPTE_R_C;
    			note_hpte_modification(kvm, &rev[i]);
@@@ -1148,7 -1141,7 +1146,7 @@@
    	}
    	v &= ~(HPTE_V_ABSENT | HPTE_V_HVLOCK);
    	v |= HPTE_V_VALID;
 -		hptep[0] = v;
 +		hptep[0] = cpu_to_be64(v);
    } while ((i = j) != head);
unlock_rmap(rmapp);
@@@ -1312,7 -1305,7 +1310,7 @@@ struct kvm_htab_ctx 
   * Returns 1 if this HPT entry has been modified or has pending
   * R/C bit changes.
   */
 -static int hpte_dirty(struct revmap_entry *revp, unsigned long *hptp)
 +static int hpte_dirty(struct revmap_entry *revp, __be64 *hptp)
  {
    unsigned long rcbits_unset;
@@@ -1321,14 -1314,13 +1319,14 @@@
/* Also need to consider changes in reference and changed bits */
    rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
 -	if ((hptp[0] & HPTE_V_VALID) && (hptp[1] & rcbits_unset))
 +	if ((be64_to_cpu(hptp[0]) & HPTE_V_VALID) &&
 +	    (be64_to_cpu(hptp[1]) & rcbits_unset))
    	return 1;
return 0;
  }
-static long record_hpte(unsigned long flags, unsigned long *hptp,
 +static long record_hpte(unsigned long flags, __be64 *hptp,
    		unsigned long *hpte, struct revmap_entry *revp,
    		int want_valid, int first_pass)
  {
@@@ -1343,10 -1335,10 +1341,10 @@@
    	return 0;
valid = 0;
 -	if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) {
 +	if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) {
    	valid = 1;
    	if ((flags & KVM_GET_HTAB_BOLTED_ONLY) &&
 -		    !(hptp[0] & HPTE_V_BOLTED))
 +		    !(be64_to_cpu(hptp[0]) & HPTE_V_BOLTED))
    		valid = 0;
    }
    if (valid != want_valid)
@@@ -1358,7 -1350,7 +1356,7 @@@
    	preempt_disable();
    	while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
    		cpu_relax();
 -		v = hptp[0];
 +		v = be64_to_cpu(hptp[0]);
/* re-evaluate valid and dirty from synchronized HPTE value */
    	valid = !!(v & HPTE_V_VALID);
@@@ -1366,9 -1358,9 +1364,9 @@@
/* Harvest R and C into guest view if necessary */
    	rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
 -		if (valid && (rcbits_unset & hptp[1])) {
 -			revp->guest_rpte |= (hptp[1] & (HPTE_R_R | HPTE_R_C)) |
 -				HPTE_GR_MODIFIED;
 +		if (valid && (rcbits_unset & be64_to_cpu(hptp[1]))) {
 +			revp->guest_rpte |= (be64_to_cpu(hptp[1]) &
 +				(HPTE_R_R | HPTE_R_C)) | HPTE_GR_MODIFIED;
    		dirty = 1;
    	}
@@@ -1387,13 -1379,13 +1385,13 @@@
    		revp->guest_rpte = r;
    	}
    	asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
 -		hptp[0] &= ~HPTE_V_HVLOCK;
 +		hptp[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
    	preempt_enable();
    	if (!(valid == want_valid && (first_pass || dirty)))
    		ok = 0;
    }
 -	hpte[0] = v;
 -	hpte[1] = r;
 +	hpte[0] = cpu_to_be64(v);
 +	hpte[1] = cpu_to_be64(r);
    return ok;
  }
@@@ -1403,7 -1395,7 +1401,7 @@@ static ssize_t kvm_htab_read(struct fil
    struct kvm_htab_ctx *ctx = file->private_data;
    struct kvm *kvm = ctx->kvm;
    struct kvm_get_htab_header hdr;
 -	unsigned long *hptp;
 +	__be64 *hptp;
    struct revmap_entry *revp;
    unsigned long i, nb, nw;
    unsigned long __user *lbuf;
@@@ -1419,7 -1411,7 +1417,7 @@@
    flags = ctx->flags;
i = ctx->index;
 -	hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
 +	hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
    revp = kvm->arch.revmap + i;
    lbuf = (unsigned long __user *)buf;
@@@ -1503,7 -1495,7 +1501,7 @@@ static ssize_t kvm_htab_write(struct fi
    unsigned long i, j;
    unsigned long v, r;
    unsigned long __user *lbuf;
 -	unsigned long *hptp;
 +	__be64 *hptp;
    unsigned long tmp[2];
    ssize_t nb;
    long int err, ret;
@@@ -1545,7 -1537,7 +1543,7 @@@
    	    i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte)
    		break;
-		hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
 +		hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
    	lbuf = (unsigned long __user *)buf;
    	for (j = 0; j < hdr.n_valid; ++j) {
    		err = -EFAULT;
@@@ -1557,7 -1549,7 +1555,7 @@@
    		lbuf += 2;
    		nb += HPTE_SIZE;
-			if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT))
 +			if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))
    			kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
    		err = -EIO;
    		ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r,
@@@ -1583,7 -1575,7 +1581,7 @@@
    	}
for (j = 0; j < hdr.n_invalid; ++j) {
 -			if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT))
 +			if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))
    			kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
    		++i;
    		hptp += 2;
diff --combined arch/powerpc/kvm/book3s_hv_builtin.c
index 3b41447,6cf498a..329d7fd
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@@ -16,12 -16,14 +16,14 @@@
  #include <linux/init.h>
  #include <linux/memblock.h>
  #include <linux/sizes.h>
+ #include <linux/cma.h>
#include <asm/cputable.h>
  #include <asm/kvm_ppc.h>
  #include <asm/kvm_book3s.h>
- #include "book3s_hv_cma.h"
+ #define KVM_CMA_CHUNK_ORDER	18
+ 
  /*
   * Hash page table alignment on newer cpus(CPU_FTR_ARCH_206)
   * should be power of 2.
@@@ -43,6 -45,8 +45,8 @@@ static unsigned long kvm_cma_resv_rati
  unsigned long kvm_rma_pages = (1 << 27) >> PAGE_SHIFT;	/* 128MB */
  EXPORT_SYMBOL_GPL(kvm_rma_pages);
+ static struct cma *kvm_cma;
+ 
  /* Work out RMLS (real mode limit selector) field value for a given RMA size.
     Assumes POWER7 or PPC970. */
  static inline int lpcr_rmls(unsigned long rma_size)
@@@ -97,7 -101,7 +101,7 @@@ struct kvm_rma_info *kvm_alloc_rma(
    ri = kmalloc(sizeof(struct kvm_rma_info), GFP_KERNEL);
    if (!ri)
    	return NULL;
- 	page = kvm_alloc_cma(kvm_rma_pages, kvm_rma_pages);
+ 	page = cma_alloc(kvm_cma, kvm_rma_pages, get_order(kvm_rma_pages));
    if (!page)
    	goto err_out;
    atomic_set(&ri->use_count, 1);
@@@ -112,7 -116,7 +116,7 @@@ EXPORT_SYMBOL_GPL(kvm_alloc_rma)
  void kvm_release_rma(struct kvm_rma_info *ri)
  {
    if (atomic_dec_and_test(&ri->use_count)) {
- 		kvm_release_cma(pfn_to_page(ri->base_pfn), kvm_rma_pages);
+ 		cma_release(kvm_cma, pfn_to_page(ri->base_pfn), kvm_rma_pages);
    	kfree(ri);
    }
  }
@@@ -131,16 -135,18 +135,18 @@@ struct page *kvm_alloc_hpt(unsigned lon
  {
    unsigned long align_pages = HPT_ALIGN_PAGES;
+ 	VM_BUG_ON(get_order(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
+ 
    /* Old CPUs require HPT aligned on a multiple of its size */
    if (!cpu_has_feature(CPU_FTR_ARCH_206))
    	align_pages = nr_pages;
- 	return kvm_alloc_cma(nr_pages, align_pages);
+ 	return cma_alloc(kvm_cma, nr_pages, get_order(align_pages));
  }
  EXPORT_SYMBOL_GPL(kvm_alloc_hpt);
void kvm_release_hpt(struct page *page, unsigned long nr_pages)
  {
- 	kvm_release_cma(page, nr_pages);
+ 	cma_release(kvm_cma, page, nr_pages);
  }
  EXPORT_SYMBOL_GPL(kvm_release_hpt);
@@@ -179,7 -185,8 +185,8 @@@ void __init kvm_cma_reserve(void
    		align_size = HPT_ALIGN_PAGES << PAGE_SHIFT;
align_size = max(kvm_rma_pages << PAGE_SHIFT, align_size);
- 		kvm_cma_declare_contiguous(selected_size, align_size);
+ 		cma_declare_contiguous(0, selected_size, 0, align_size,
+ 			KVM_CMA_CHUNK_ORDER - PAGE_SHIFT, false, &kvm_cma);
    }
  }
@@@ -212,16 -219,3 +219,16 @@@ bool kvm_hv_mode_active(void
  {
    return atomic_read(&hv_vm_count) != 0;
  }
 +
 +extern int hcall_real_table[], hcall_real_table_end[];
 +
 +int kvmppc_hcall_impl_hv_realmode(unsigned long cmd)
 +{
 +	cmd /= 4;
 +	if (cmd < hcall_real_table_end - hcall_real_table &&
 +	    hcall_real_table[cmd])
 +		return 1;
 +
 +	return 0;
 +}
 +EXPORT_SYMBOL_GPL(kvmppc_hcall_impl_hv_realmode);
diff --combined arch/s390/Kconfig
index f5af5f6,d12d40e..3c94ef3
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@@ -116,6 -116,7 +116,6 @@@ config S39
    select HAVE_FTRACE_MCOUNT_RECORD
    select HAVE_FUNCTION_GRAPH_TRACER
    select HAVE_FUNCTION_TRACER
 -	select HAVE_FUNCTION_TRACE_MCOUNT_TEST
    select HAVE_FUTEX_CMPXCHG if FUTEX
    select HAVE_KERNEL_BZIP2
    select HAVE_KERNEL_GZIP
@@@ -145,6 -146,7 +145,7 @@@
    select TTY
    select VIRT_CPU_ACCOUNTING
    select VIRT_TO_BUS
+ 	select ARCH_HAS_SG_CHAIN
config SCHED_OMIT_FRAME_POINTER
    def_bool y
diff --combined arch/sparc/Kconfig
index 4692c90,bff3192..a537816
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@@ -42,6 -42,7 +42,7 @@@ config SPAR
    select MODULES_USE_ELF_RELA
    select ODD_RT_SIGACTION
    select OLD_SIGSUSPEND
+ 	select ARCH_HAS_SG_CHAIN
config SPARC32
    def_bool !64BIT
@@@ -55,6 -56,7 +56,6 @@@ config SPARC6
    select HAVE_FUNCTION_TRACER
    select HAVE_FUNCTION_GRAPH_TRACER
    select HAVE_FUNCTION_GRAPH_FP_TEST
 -	select HAVE_FUNCTION_TRACE_MCOUNT_TEST
    select HAVE_KRETPROBES
    select HAVE_KPROBES
    select HAVE_RCU_TABLE_FREE if SMP
diff --combined arch/x86/Kconfig
index 503f35c,2ae952c..273d20d
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@@ -21,7 -21,6 +21,7 @@@ config X86_6
  ### Arch settings
  config X86
    def_bool y
 +	select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
    select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
    select ARCH_MIGHT_HAVE_PC_PARPORT
    select ARCH_MIGHT_HAVE_PC_SERIO
@@@ -55,6 -54,7 +55,6 @@@
    select HAVE_FUNCTION_TRACER
    select HAVE_FUNCTION_GRAPH_TRACER
    select HAVE_FUNCTION_GRAPH_FP_TEST
 -	select HAVE_FUNCTION_TRACE_MCOUNT_TEST
    select HAVE_SYSCALL_TRACEPOINTS
    select SYSCTL_EXCEPTION_TRACE
    select HAVE_KVM
@@@ -96,6 -96,7 +96,7 @@@
    select IRQ_FORCED_THREADING
    select HAVE_BPF_JIT if X86_64
    select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+ 	select ARCH_HAS_SG_CHAIN
    select CLKEVT_I8253
    select ARCH_HAVE_NMI_SAFE_CMPXCHG
    select GENERIC_IOMAP
@@@ -132,7 -133,6 +133,7 @@@
    select GENERIC_CPU_AUTOPROBE
    select HAVE_ARCH_AUDITSYSCALL
    select ARCH_SUPPORTS_ATOMIC_RMW
 +	select ACPI_LEGACY_TABLES_LOOKUP if ACPI
config INSTRUCTION_DECODER
    def_bool y
@@@ -431,7 -431,6 +432,7 @@@ config X86_INTEL_C
    bool "CE4100 TV platform"
    depends on PCI
    depends on PCI_GODIRECT
 +	depends on X86_IO_APIC
    depends on X86_32
    depends on X86_EXTENDED_PLATFORM
    select X86_REBOOTFIXUPS
@@@ -539,7 -538,7 +540,7 @@@ config X86_32_IRI
config SCHED_OMIT_FRAME_POINTER
    def_bool y
 -	prompt "Single-depth WCHAN output"
 +	prompt "Single-depth WCHAN output" if !LTO && !FRAME_POINTER
    depends on X86
    ---help---
      Calculate simpler /proc/<PID>/wchan values. If this option
@@@ -838,7 -837,6 +839,7 @@@ config X86_IO_API
    def_bool y
    depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC || PCI_MSI
    select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
 +	select IRQ_DOMAIN
config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
    bool "Reroute for broken boot IRQs"
diff --combined arch/x86/mm/fault.c
index 1dbade8,d30b78b..d393ac6
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@@ -350,7 -350,7 +350,7 @@@ out
void vmalloc_sync_all(void)
  {
- 	sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
+ 	sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END, 0);
  }
/*
@@@ -577,8 -577,6 +577,8 @@@ static int is_f00f_bug(struct pt_regs *
static const char nx_warning[] = KERN_CRIT
  "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
 +static const char smep_warning[] = KERN_CRIT
 +"unable to execute userspace code (SMEP?) (uid: %d)\n";
static void
  show_fault_oops(struct pt_regs *regs, unsigned long error_code,
@@@ -599,10 -597,6 +599,10 @@@
if (pte && pte_present(*pte) && !pte_exec(*pte))
    		printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
 +		if (pte && pte_present(*pte) && pte_exec(*pte) &&
 +				(pgd_flags(*pgd) & _PAGE_USER) &&
 +				(read_cr4() & X86_CR4_SMEP))
 +			printk(smep_warning, from_kuid(&init_user_ns, current_uid()));
    }
printk(KERN_ALERT "BUG: unable to handle kernel ");
@@@ -1218,7 -1212,8 +1218,8 @@@ good_area
    /*
     * If for any reason at all we couldn't handle the fault,
     * make sure we exit gracefully rather than endlessly redo
- 	 * the fault:
+ 	 * the fault.  Since we never set FAULT_FLAG_RETRY_NOWAIT, if
+ 	 * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
     */
    fault = handle_mm_fault(mm, vma, address, flags);
diff --combined block/bio-integrity.c
index bc423f7b,56754c4..38c8ac2
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@@ -70,10 -70,8 +70,10 @@@ struct bio_integrity_payload *bio_integ
    				  bs->bvec_integrity_pool);
    	if (!bip->bip_vec)
    		goto err;
 +		bip->bip_max_vcnt = bvec_nr_vecs(idx);
    } else {
    	bip->bip_vec = bip->bip_inline_vecs;
 +		bip->bip_max_vcnt = inline_vecs;
    }
bip->bip_slab = idx;
@@@ -116,6 -114,14 +116,6 @@@ void bio_integrity_free(struct bio *bio
  }
  EXPORT_SYMBOL(bio_integrity_free);
-static inline unsigned int bip_integrity_vecs(struct bio_integrity_payload *bip)
 -{
 -	if (bip->bip_slab == BIO_POOL_NONE)
 -		return BIP_INLINE_VECS;
 -
 -	return bvec_nr_vecs(bip->bip_slab);
 -}
 -
  /**
   * bio_integrity_add_page - Attach integrity metadata
   * @bio:	bio to update
@@@ -131,7 -137,7 +131,7 @@@ int bio_integrity_add_page(struct bio *
    struct bio_integrity_payload *bip = bio->bi_integrity;
    struct bio_vec *iv;
-	if (bip->bip_vcnt >= bip_integrity_vecs(bip)) {
 +	if (bip->bip_vcnt >= bip->bip_max_vcnt) {
    	printk(KERN_ERR "%s: bip_vec full\n", __func__);
    	return 0;
    }
@@@ -646,6 -652,4 +646,4 @@@ void __init bio_integrity_init(void
    			     sizeof(struct bio_integrity_payload) +
    			     sizeof(struct bio_vec) * BIP_INLINE_VECS,
    			     0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
- 	if (!bip_slab)
- 		panic("Failed to create slab\n");
  }
diff --combined drivers/ata/Kconfig
index e65d400,b0d5b5a..e1b9278
--- a/drivers/ata/Kconfig
+++ b/drivers/ata/Kconfig
@@@ -16,6 -16,7 +16,7 @@@ menuconfig AT
    depends on BLOCK
    depends on !(M32R || M68K || S390) || BROKEN
    select SCSI
+ 	select GLOB
    ---help---
      If you want to use an ATA hard disk, ATA tape drive, ATA CD-ROM or
      any other ATA device under Linux, say Y and make sure that you know
@@@ -141,15 -142,6 +142,15 @@@ config AHCI_SUNX
If unsure, say N.
+config AHCI_TEGRA
 +	tristate "NVIDIA Tegra124 AHCI SATA support"
 +	depends on ARCH_TEGRA
 +	help
 +	  This option enables support for the NVIDIA Tegra124 SoC's
 +	  onboard AHCI SATA.
 +
 +	  If unsure, say N.
 +
  config AHCI_XGENE
    tristate "APM X-Gene 6.0Gbps AHCI SATA host controller support"
    depends on PHY_XGENE
diff --combined drivers/ata/libata-core.c
index 677c0c1,259d879..dbdc5d3
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@@ -59,6 -59,7 +59,7 @@@
  #include <linux/async.h>
  #include <linux/log2.h>
  #include <linux/slab.h>
+ #include <linux/glob.h>
  #include <scsi/scsi.h>
  #include <scsi/scsi_cmnd.h>
  #include <scsi/scsi_host.h>
@@@ -4250,73 -4251,6 +4251,6 @@@ static const struct ata_blacklist_entr
    { }
  };
- /**
-  *	glob_match - match a text string against a glob-style pattern
-  *	@text: the string to be examined
-  *	@pattern: the glob-style pattern to be matched against
-  *
-  *	Either/both of text and pattern can be empty strings.
-  *
-  *	Match text against a glob-style pattern, with wildcards and simple sets:
-  *
-  *		?	matches any single character.
-  *		*	matches any run of characters.
-  *		[xyz]	matches a single character from the set: x, y, or z.
-  *		[a-d]	matches a single character from the range: a, b, c, or d.
-  *		[a-d0-9] matches a single character from either range.
-  *
-  *	The special characters ?, [, -, or *, can be matched using a set, eg. [*]
-  *	Behaviour with malformed patterns is undefined, though generally reasonable.
-  *
-  *	Sample patterns:  "SD1?",  "SD1[0-5]",  "*R0",  "SD*1?[012]*xx"
-  *
-  *	This function uses one level of recursion per '*' in pattern.
-  *	Since it calls _nothing_ else, and has _no_ explicit local variables,
-  *	this will not cause stack problems for any reasonable use here.
-  *
-  *	RETURNS:
-  *	0 on match, 1 otherwise.
-  */
- static int glob_match (const char *text, const char *pattern)
- {
- 	do {
- 		/* Match single character or a '?' wildcard */
- 		if (*text == *pattern || *pattern == '?') {
- 			if (!*pattern++)
- 				return 0;  /* End of both strings: match */
- 		} else {
- 			/* Match single char against a '[' bracketed ']' pattern set */
- 			if (!*text || *pattern != '[')
- 				break;  /* Not a pattern set */
- 			while (*++pattern && *pattern != ']' && *text != *pattern) {
- 				if (*pattern == '-' && *(pattern - 1) != '[')
- 					if (*text > *(pattern - 1) && *text < *(pattern + 1)) {
- 						++pattern;
- 						break;
- 					}
- 			}
- 			if (!*pattern || *pattern == ']')
- 				return 1;  /* No match */
- 			while (*pattern && *pattern++ != ']');
- 		}
- 	} while (*++text && *pattern);
- 
- 	/* Match any run of chars against a '*' wildcard */
- 	if (*pattern == '*') {
- 		if (!*++pattern)
- 			return 0;  /* Match: avoid recursion at end of pattern */
- 		/* Loop to handle additional pattern chars after the wildcard */
- 		while (*text) {
- 			if (glob_match(text, pattern) == 0)
- 				return 0;  /* Remainder matched */
- 			++text;  /* Absorb (match) this char and try again */
- 		}
- 	}
- 	if (!*text && !*pattern)
- 		return 0;  /* End of both strings: match */
- 	return 1;  /* No match */
- }
- 
  static unsigned long ata_dev_blacklisted(const struct ata_device *dev)
  {
    unsigned char model_num[ATA_ID_PROD_LEN + 1];
@@@ -4327,10 -4261,10 +4261,10 @@@
    ata_id_c_string(dev->id, model_rev, ATA_ID_FW_REV, sizeof(model_rev));
while (ad->model_num) {
- 		if (!glob_match(model_num, ad->model_num)) {
+ 		if (glob_match(model_num, ad->model_num)) {
    		if (ad->model_rev == NULL)
    			return ad->horkage;
- 			if (!glob_match(model_rev, ad->model_rev))
+ 			if (glob_match(model_rev, ad->model_rev))
    			return ad->horkage;
    	}
    	ad++;
@@@ -4798,8 -4732,9 +4732,8 @@@ void swap_buf_le16(u16 *buf, unsigned i
  static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap)
  {
    struct ata_queued_cmd *qc = NULL;
 -	unsigned int i, tag, max_queue;
 -
 -	max_queue = ap->scsi_host->can_queue;
 +	unsigned int max_queue = ap->host->n_tags;
 +	unsigned int i, tag;
/* no command while frozen */
    if (unlikely(ap->pflags & ATA_PFLAG_FROZEN))
@@@ -6093,7 -6028,6 +6027,7 @@@ void ata_host_init(struct ata_host *hos
  {
    spin_lock_init(&host->lock);
    mutex_init(&host->eh_mutex);
 +	host->n_tags = ATA_MAX_QUEUE - 1;
    host->dev = dev;
    host->ops = ops;
  }
@@@ -6175,7 -6109,15 +6109,7 @@@ int ata_host_register(struct ata_host *
  {
    int i, rc;
-	/*
 -	 * The max queue supported by hardware must not be greater than
 -	 * ATA_MAX_QUEUE.
 -	 */
 -	if (sht->can_queue > ATA_MAX_QUEUE) {
 -		dev_err(host->dev, "BUG: the hardware max queue is too large\n");
 -		WARN_ON(1);
 -		return -EINVAL;
 -	}
 +	host->n_tags = clamp(sht->can_queue, 1, ATA_MAX_QUEUE - 1);
/* host must have been started */
    if (!(host->flags & ATA_HOST_STARTED)) {
diff --combined drivers/base/Kconfig
index 88500fe,9d5fed1..4e7f0ff
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@@ -149,21 -149,15 +149,21 @@@ config EXTRA_FIRMWARE_DI
      some other directory containing the firmware files.
config FW_LOADER_USER_HELPER
 +	bool
 +
 +config FW_LOADER_USER_HELPER_FALLBACK
    bool "Fallback user-helper invocation for firmware loading"
    depends on FW_LOADER
 -	default y
 +	select FW_LOADER_USER_HELPER
    help
      This option enables / disables the invocation of user-helper
      (e.g. udev) for loading firmware files as a fallback after the
      direct file loading in kernel fails.  The user-mode helper is
      no longer required unless you have a special firmware file that
 -	  resides in a non-standard path.
 +	  resides in a non-standard path. Moreover, the udev support has
 +	  been deprecated upstream.
 +
 +	  If you are unsure about this, say N here.
config DEBUG_DRIVER
    bool "Driver Core verbose debug messages"
@@@ -214,15 -208,6 +214,15 @@@ config DMA_SHARED_BUFFE
      APIs extension; the file's descriptor can then be passed on to other
      driver.
+config FENCE_TRACE
 +	bool "Enable verbose FENCE_TRACE messages"
 +	depends on DMA_SHARED_BUFFER
 +	help
 +	  Enable the FENCE_TRACE printks. This will add extra
 +	  spam to the console log, but will make it easier to diagnose
 +	  lockup related problems for dma-buffers shared across multiple
 +	  devices.
 +
  config DMA_CMA
    bool "DMA Contiguous Memory Allocator"
    depends on HAVE_DMA_CONTIGUOUS && CMA
@@@ -289,16 -274,6 +289,6 @@@ config CMA_ALIGNMEN
If unsure, leave the default value "8".
- config CMA_AREAS
- 	int "Maximum count of the CMA device-private areas"
- 	default 7
- 	help
- 	  CMA allows to create CMA areas for particular devices. This parameter
- 	  sets the maximum number of such device private CMA areas in the
- 	  system.
- 
- 	  If unsure, leave the default value "7".
- 
  endif
endmenu
diff --combined drivers/input/input.c
index 29ca0bb,3b9284b..236bc56
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@@ -257,10 -257,9 +257,10 @@@ static int input_handle_abs_event(struc
  }
static int input_get_disposition(struct input_dev *dev,
 -			  unsigned int type, unsigned int code, int value)
 +			  unsigned int type, unsigned int code, int *pval)
  {
    int disposition = INPUT_IGNORE_EVENT;
 +	int value = *pval;
switch (type) {
@@@ -358,7 -357,6 +358,7 @@@
    	break;
    }
+	*pval = value;
    return disposition;
  }
@@@ -367,7 -365,7 +367,7 @@@ static void input_handle_event(struct i
  {
    int disposition;
-	disposition = input_get_disposition(dev, type, code, value);
 +	disposition = input_get_disposition(dev, type, code, &value);
if ((disposition & INPUT_PASS_TO_DEVICE) && dev->event)
    	dev->event(dev, type, code, value);
@@@ -710,6 -708,9 +710,9 @@@ static void input_disconnect_device(str
    	handle->open = 0;
spin_unlock_irq(&dev->event_lock);
+ 
+ 	if (is_event_supported(EV_LED, dev->evbit, EV_MAX))
+ 		input_led_disconnect(dev);
  }
/**
@@@ -2136,6 -2137,9 +2139,9 @@@ int input_register_device(struct input_
list_add_tail(&dev->node, &input_dev_list);
+ 	if (is_event_supported(EV_LED, dev->evbit, EV_MAX))
+ 		input_led_connect(dev);
+ 
    list_for_each_entry(handler, &input_handler_list, node)
    	input_attach_handler(dev, handler);
diff --combined drivers/leds/Kconfig
index 8c96e2d,6784c17..f6e32ba
--- a/drivers/leds/Kconfig
+++ b/drivers/leds/Kconfig
@@@ -11,9 -11,6 +11,6 @@@ menuconfig NEW_LED
      Say Y to enable Linux LED support.  This allows control of supported
      LEDs from both userspace and optionally, by kernel events (triggers).
- 	  This is not related to standard keyboard LEDs which are controlled
- 	  via the input system.
- 
  if NEW_LEDS
config LEDS_CLASS
@@@ -32,6 -29,14 +29,6 @@@ config LEDS_88PM860
      This option enables support for on-chip LED drivers found on Marvell
      Semiconductor 88PM8606 PMIC.
-config LEDS_ATMEL_PWM
 -	tristate "LED Support using Atmel PWM outputs"
 -	depends on LEDS_CLASS
 -	depends on ATMEL_PWM
 -	help
 -	  This option enables support for LEDs driven using outputs
 -	  of the dedicated PWM controller found on newer Atmel SOCs.
 -
  config LEDS_LM3530
    tristate "LCD Backlight driver for LM3530"
    depends on LEDS_CLASS
@@@ -135,13 -140,6 +132,13 @@@ config LEDS_SUNFIR
      This option enables support for the Left, Middle, and Right
      LEDs on the I/O and CPU boards of SunFire UltraSPARC servers.
+config LEDS_IPAQ_MICRO
 +	tristate "LED Support for the Compaq iPAQ h3xxx"
 +	depends on MFD_IPAQ_MICRO
 +	help
 +	  Choose this option if you want to use the notification LED on
 +	  Compaq/HP iPAQ h3100 and h3600.
 +
  config LEDS_HP6XX
    tristate "LED Support for the HP Jornada 6xx"
    depends on LEDS_CLASS
diff --combined drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 3abd3cb,c57b085..053e850
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@@ -215,135 -215,52 +215,135 @@@ static int i40e_get_settings(struct net
    /* hardware is either in 40G mode or 10G mode
     * NOTE: this section initializes supported and advertising
     */
 +	if (!link_up) {
 +		/* link is down and the driver needs to fall back on
 +		 * device ID to determine what kinds of info to display,
 +		 * it's mostly a guess that may change when link is up
 +		 */
 +		switch (hw->device_id) {
 +		case I40E_DEV_ID_QSFP_A:
 +		case I40E_DEV_ID_QSFP_B:
 +		case I40E_DEV_ID_QSFP_C:
 +			/* pluggable QSFP */
 +			ecmd->supported = SUPPORTED_40000baseSR4_Full |
 +					  SUPPORTED_40000baseCR4_Full |
 +					  SUPPORTED_40000baseLR4_Full;
 +			ecmd->advertising = ADVERTISED_40000baseSR4_Full |
 +					    ADVERTISED_40000baseCR4_Full |
 +					    ADVERTISED_40000baseLR4_Full;
 +			break;
 +		case I40E_DEV_ID_KX_B:
 +			/* backplane 40G */
 +			ecmd->supported = SUPPORTED_40000baseKR4_Full;
 +			ecmd->advertising = ADVERTISED_40000baseKR4_Full;
 +			break;
 +		case I40E_DEV_ID_KX_C:
 +			/* backplane 10G */
 +			ecmd->supported = SUPPORTED_10000baseKR_Full;
 +			ecmd->advertising = ADVERTISED_10000baseKR_Full;
 +			break;
 +		default:
 +			/* all the rest are 10G/1G */
 +			ecmd->supported = SUPPORTED_10000baseT_Full |
 +					  SUPPORTED_1000baseT_Full;
 +			ecmd->advertising = ADVERTISED_10000baseT_Full |
 +					    ADVERTISED_1000baseT_Full;
 +			break;
 +		}
 +
 +		/* skip phy_type use as it is zero when link is down */
 +		goto no_valid_phy_type;
 +	}
 +
    switch (hw_link_info->phy_type) {
    case I40E_PHY_TYPE_40GBASE_CR4:
    case I40E_PHY_TYPE_40GBASE_CR4_CU:
 -		ecmd->supported = SUPPORTED_40000baseCR4_Full;
 -		ecmd->advertising = ADVERTISED_40000baseCR4_Full;
 +		ecmd->supported = SUPPORTED_Autoneg |
 +				  SUPPORTED_40000baseCR4_Full;
 +		ecmd->advertising = ADVERTISED_Autoneg |
 +				    ADVERTISED_40000baseCR4_Full;
    	break;
    case I40E_PHY_TYPE_40GBASE_KR4:
 -		ecmd->supported = SUPPORTED_40000baseKR4_Full;
 -		ecmd->advertising = ADVERTISED_40000baseKR4_Full;
 +		ecmd->supported = SUPPORTED_Autoneg |
 +				  SUPPORTED_40000baseKR4_Full;
 +		ecmd->advertising = ADVERTISED_Autoneg |
 +				    ADVERTISED_40000baseKR4_Full;
    	break;
    case I40E_PHY_TYPE_40GBASE_SR4:
 +	case I40E_PHY_TYPE_XLPPI:
 +	case I40E_PHY_TYPE_XLAUI:
    	ecmd->supported = SUPPORTED_40000baseSR4_Full;
    	break;
    case I40E_PHY_TYPE_40GBASE_LR4:
    	ecmd->supported = SUPPORTED_40000baseLR4_Full;
 -		ecmd->advertising = ADVERTISED_40000baseLR4_Full;
    	break;
    case I40E_PHY_TYPE_10GBASE_KX4:
 -		ecmd->supported = SUPPORTED_10000baseKX4_Full;
 -		ecmd->advertising = ADVERTISED_10000baseKX4_Full;
 +		ecmd->supported = SUPPORTED_Autoneg |
 +				  SUPPORTED_10000baseKX4_Full;
 +		ecmd->advertising = ADVERTISED_Autoneg |
 +				    ADVERTISED_10000baseKX4_Full;
    	break;
    case I40E_PHY_TYPE_10GBASE_KR:
 -		ecmd->supported = SUPPORTED_10000baseKR_Full;
 -		ecmd->advertising = ADVERTISED_10000baseKR_Full;
 +		ecmd->supported = SUPPORTED_Autoneg |
 +				  SUPPORTED_10000baseKR_Full;
 +		ecmd->advertising = ADVERTISED_Autoneg |
 +				    ADVERTISED_10000baseKR_Full;
    	break;
 -	default:
 -		if (i40e_is_40G_device(hw->device_id)) {
 -			ecmd->supported = SUPPORTED_40000baseSR4_Full;
 -			ecmd->advertising = ADVERTISED_40000baseSR4_Full;
 -		} else {
 -			ecmd->supported = SUPPORTED_10000baseT_Full;
 -			ecmd->advertising = ADVERTISED_10000baseT_Full;
 -		}
 +	case I40E_PHY_TYPE_10GBASE_SR:
 +	case I40E_PHY_TYPE_10GBASE_LR:
 +		ecmd->supported = SUPPORTED_10000baseT_Full;
 +		break;
 +	case I40E_PHY_TYPE_10GBASE_CR1_CU:
 +	case I40E_PHY_TYPE_10GBASE_CR1:
 +	case I40E_PHY_TYPE_10GBASE_T:
 +		ecmd->supported = SUPPORTED_Autoneg |
 +				  SUPPORTED_10000baseT_Full;
 +		ecmd->advertising = ADVERTISED_Autoneg |
 +				    ADVERTISED_10000baseT_Full;
 +		break;
 +	case I40E_PHY_TYPE_XAUI:
 +	case I40E_PHY_TYPE_XFI:
 +	case I40E_PHY_TYPE_SFI:
 +	case I40E_PHY_TYPE_10GBASE_SFPP_CU:
 +		ecmd->supported = SUPPORTED_10000baseT_Full;
    	break;
 +	case I40E_PHY_TYPE_1000BASE_KX:
 +	case I40E_PHY_TYPE_1000BASE_T:
 +		ecmd->supported = SUPPORTED_Autoneg |
 +				  SUPPORTED_1000baseT_Full;
 +		ecmd->advertising = ADVERTISED_Autoneg |
 +				    ADVERTISED_1000baseT_Full;
 +		break;
 +	case I40E_PHY_TYPE_100BASE_TX:
 +		ecmd->supported = SUPPORTED_Autoneg |
 +				  SUPPORTED_100baseT_Full;
 +		ecmd->advertising = ADVERTISED_Autoneg |
 +				    ADVERTISED_100baseT_Full;
 +		break;
 +	case I40E_PHY_TYPE_SGMII:
 +		ecmd->supported = SUPPORTED_Autoneg |
 +				  SUPPORTED_1000baseT_Full |
 +				  SUPPORTED_100baseT_Full;
 +		ecmd->advertising = ADVERTISED_Autoneg |
 +				    ADVERTISED_1000baseT_Full |
 +				    ADVERTISED_100baseT_Full;
 +		break;
 +	default:
 +		/* if we got here and link is up something bad is afoot */
 +		WARN_ON(link_up);
    }
-	ecmd->supported |= SUPPORTED_Autoneg;
 -	ecmd->advertising |= ADVERTISED_Autoneg;
 +no_valid_phy_type:
 +	/* this is if autoneg is enabled or disabled */
    ecmd->autoneg = ((hw_link_info->an_info & I40E_AQ_AN_COMPLETED) ?
    		  AUTONEG_ENABLE : AUTONEG_DISABLE);
switch (hw->phy.media_type) {
    case I40E_MEDIA_TYPE_BACKPLANE:
 -		ecmd->supported |= SUPPORTED_Backplane;
 -		ecmd->advertising |= ADVERTISED_Backplane;
 +		ecmd->supported |= SUPPORTED_Autoneg |
 +				   SUPPORTED_Backplane;
 +		ecmd->advertising |= ADVERTISED_Autoneg |
 +				     ADVERTISED_Backplane;
    	ecmd->port = PORT_NONE;
    	break;
    case I40E_MEDIA_TYPE_BASET:
@@@ -359,6 -276,7 +359,6 @@@
    	break;
    case I40E_MEDIA_TYPE_FIBER:
    	ecmd->supported |= SUPPORTED_FIBRE;
 -		ecmd->advertising |= ADVERTISED_FIBRE;
    	ecmd->port = PORT_FIBRE;
    	break;
    case I40E_MEDIA_TYPE_UNKNOWN:
@@@ -369,25 -287,6 +369,25 @@@
ecmd->transceiver = XCVR_EXTERNAL;
+	ecmd->supported |= SUPPORTED_Pause;
 +
 +	switch (hw->fc.current_mode) {
 +	case I40E_FC_FULL:
 +		ecmd->advertising |= ADVERTISED_Pause;
 +		break;
 +	case I40E_FC_TX_PAUSE:
 +		ecmd->advertising |= ADVERTISED_Asym_Pause;
 +		break;
 +	case I40E_FC_RX_PAUSE:
 +		ecmd->advertising |= (ADVERTISED_Pause |
 +				      ADVERTISED_Asym_Pause);
 +		break;
 +	default:
 +		ecmd->advertising &= ~(ADVERTISED_Pause |
 +				       ADVERTISED_Asym_Pause);
 +		break;
 +	}
 +
    if (link_up) {
    	switch (link_speed) {
    	case I40E_LINK_SPEED_40GB:
@@@ -397,9 -296,6 +397,9 @@@
    	case I40E_LINK_SPEED_10GB:
    		ethtool_cmd_speed_set(ecmd, SPEED_10000);
    		break;
 +		case I40E_LINK_SPEED_1GB:
 +			ethtool_cmd_speed_set(ecmd, SPEED_1000);
 +			break;
    	default:
    		break;
    	}
@@@ -413,182 -309,6 +413,182 @@@
  }
/**
 + * i40e_set_settings - Set Speed and Duplex
 + * @netdev: network interface device structure
 + * @ecmd: ethtool command
 + *
 + * Set speed/duplex per media_types advertised/forced
 + **/
 +static int i40e_set_settings(struct net_device *netdev,
 +			     struct ethtool_cmd *ecmd)
 +{
 +	struct i40e_netdev_priv *np = netdev_priv(netdev);
 +	struct i40e_aq_get_phy_abilities_resp abilities;
 +	struct i40e_aq_set_phy_config config;
 +	struct i40e_pf *pf = np->vsi->back;
 +	struct i40e_vsi *vsi = np->vsi;
 +	struct i40e_hw *hw = &pf->hw;
 +	struct ethtool_cmd safe_ecmd;
 +	i40e_status status = 0;
 +	bool change = false;
 +	int err = 0;
 +	u8 autoneg;
 +	u32 advertise;
 +
 +	/* Returns 0 when the request was applied or nothing had to change,
 +	 * -EOPNOTSUPP for a non-LAN VSI, an unhandled media type or a field
 +	 * other than autoneg/advertising being modified, -EINVAL for an
 +	 * autoneg/advertising request the phy cannot honour, and -EAGAIN
 +	 * when an admin queue get/set call fails.
 +	 */
 +
 +	/* only the VSI registered as pf->lan_vsi may change link settings */
 +	if (vsi != pf->vsi[pf->lan_vsi])
 +		return -EOPNOTSUPP;
 +
 +	if (hw->phy.media_type != I40E_MEDIA_TYPE_BASET &&
 +	    hw->phy.media_type != I40E_MEDIA_TYPE_FIBER &&
 +	    hw->phy.media_type != I40E_MEDIA_TYPE_BACKPLANE)
 +		return -EOPNOTSUPP;
 +
 +	/* get our own copy of the bits to check against */
 +	memset(&safe_ecmd, 0, sizeof(struct ethtool_cmd));
 +	i40e_get_settings(netdev, &safe_ecmd);
 +
 +	/* save autoneg and speed out of ecmd */
 +	autoneg = ecmd->autoneg;
 +	advertise = ecmd->advertising;
 +
 +	/* set autoneg and speed back to what they currently are */
 +	ecmd->autoneg = safe_ecmd.autoneg;
 +	ecmd->advertising = safe_ecmd.advertising;
 +
 +	ecmd->cmd = safe_ecmd.cmd;
 +	/* If ecmd and safe_ecmd are not the same now, then they are
 +	 * trying to set something that we do not support
 +	 */
 +	if (memcmp(ecmd, &safe_ecmd, sizeof(struct ethtool_cmd)))
 +		return -EOPNOTSUPP;
 +
 +	/* sleep 1-2 ms per pass for as long as __I40E_CONFIG_BUSY is set */
 +	while (test_bit(__I40E_CONFIG_BUSY, &vsi->state))
 +		usleep_range(1000, 2000);
 +
 +	/* Get the current phy config */
 +	status = i40e_aq_get_phy_capabilities(hw, false, false, &abilities,
 +					      NULL);
 +	if (status)
 +		return -EAGAIN;
 +
 +	/* Copy link_speed and abilities to config in case they are not
 +	 * set below
 +	 */
 +	memset(&config, 0, sizeof(struct i40e_aq_set_phy_config));
 +	config.link_speed = abilities.link_speed;
 +	config.abilities = abilities.abilities;
 +
 +	/* Check autoneg */
 +	if (autoneg == AUTONEG_ENABLE) {
 +		/* If autoneg is not supported, return error */
 +		if (!(safe_ecmd.supported & SUPPORTED_Autoneg)) {
 +			netdev_info(netdev, "Autoneg not supported on this phy\n");
 +			return -EINVAL;
 +		}
 +		/* If autoneg was not already enabled */
 +		if (!(hw->phy.link_info.an_info & I40E_AQ_AN_COMPLETED)) {
 +			config.abilities = abilities.abilities |
 +					   I40E_AQ_PHY_ENABLE_AN;
 +			change = true;
 +		}
 +	} else {
 +		/* If autoneg is supported 10GBASE_T is the only phy that
 +		 * can disable it, so otherwise return error
 +		 */
 +		if (safe_ecmd.supported & SUPPORTED_Autoneg &&
 +		    hw->phy.link_info.phy_type != I40E_PHY_TYPE_10GBASE_T) {
 +			netdev_info(netdev, "Autoneg cannot be disabled on this phy\n");
 +			return -EINVAL;
 +		}
 +		/* If autoneg is currently enabled */
 +		if (hw->phy.link_info.an_info & I40E_AQ_AN_COMPLETED) {
 +			/* Mask out only the AN-enable bit.  OR-ing with the
 +			 * complement would leave that bit untouched and set
 +			 * every other ability bit instead.
 +			 */
 +			config.abilities = abilities.abilities &
 +					   ~I40E_AQ_PHY_ENABLE_AN;
 +			change = true;
 +		}
 +	}
 +
 +	/* refuse any advertised mode that get_settings did not report */
 +	if (advertise & ~safe_ecmd.supported)
 +		return -EINVAL;
 +
 +	/* for each requested speed class, add it to config.link_speed only
 +	 * when the phy abilities do not already carry it
 +	 */
 +	if (advertise & ADVERTISED_100baseT_Full)
 +		if (!(abilities.link_speed & I40E_LINK_SPEED_100MB)) {
 +			config.link_speed |= I40E_LINK_SPEED_100MB;
 +			change = true;
 +		}
 +	if (advertise & ADVERTISED_1000baseT_Full ||
 +	    advertise & ADVERTISED_1000baseKX_Full)
 +		if (!(abilities.link_speed & I40E_LINK_SPEED_1GB)) {
 +			config.link_speed |= I40E_LINK_SPEED_1GB;
 +			change = true;
 +		}
 +	if (advertise & ADVERTISED_10000baseT_Full ||
 +	    advertise & ADVERTISED_10000baseKX4_Full ||
 +	    advertise & ADVERTISED_10000baseKR_Full)
 +		if (!(abilities.link_speed & I40E_LINK_SPEED_10GB)) {
 +			config.link_speed |= I40E_LINK_SPEED_10GB;
 +			change = true;
 +		}
 +	if (advertise & ADVERTISED_40000baseKR4_Full ||
 +	    advertise & ADVERTISED_40000baseCR4_Full ||
 +	    advertise & ADVERTISED_40000baseSR4_Full ||
 +	    advertise & ADVERTISED_40000baseLR4_Full)
 +		if (!(abilities.link_speed & I40E_LINK_SPEED_40GB)) {
 +			config.link_speed |= I40E_LINK_SPEED_40GB;
 +			change = true;
 +		}
 +
 +	if (change) {
 +		/* copy over the rest of the abilities */
 +		config.phy_type = abilities.phy_type;
 +		config.eee_capability = abilities.eee_capability;
 +		config.eeer = abilities.eeer_val;
 +		config.low_power_ctrl = abilities.d3_lpan;
 +
 +		/* If link is up set link and an so changes take effect */
 +		if (hw->phy.link_info.link_info & I40E_AQ_LINK_UP)
 +			config.abilities |= I40E_AQ_PHY_ENABLE_ATOMIC_LINK;
 +
 +		/* make the aq call */
 +		status = i40e_aq_set_phy_config(hw, &config, NULL);
 +		if (status) {
 +			netdev_info(netdev, "Set phy config failed with error %d.\n",
 +				    status);
 +			return -EAGAIN;
 +		}
 +
 +		/* a failed refresh is only logged; the config was accepted */
 +		status = i40e_update_link_info(hw, true);
 +		if (status)
 +			netdev_info(netdev, "Updating link info failed with error %d\n",
 +				    status);
 +
 +	} else {
 +		netdev_info(netdev, "Nothing changed, exiting without setting anything.\n");
 +	}
 +
 +	return err;
 +}
 +
 +static int i40e_nway_reset(struct net_device *netdev)
 +{
 +	struct i40e_netdev_priv *priv = netdev_priv(netdev);
 +	struct i40e_pf *pf = priv->vsi->back;
 +	struct i40e_hw *hw = &pf->hw;
 +	bool link_is_up = hw->phy.link_info.link_info & I40E_AQ_LINK_UP;
 +
 +	/* ask firmware to restart autonegotiation, passing the cached
 +	 * link-up state; a zero status means the request was accepted
 +	 */
 +	if (!i40e_aq_set_link_restart_an(hw, link_is_up, NULL))
 +		return 0;
 +
 +	netdev_info(netdev, "link restart failed, aq_err=%d\n",
 +		    hw->aq.asq_last_status);
 +	return -EIO;
 +}
 +
 +/**
   * i40e_get_pauseparam -  Get Flow Control status
   * Return tx/rx-pause status
   **/
@@@ -614,81 -334,6 +614,81 @@@ static void i40e_get_pauseparam(struct 
    }
  }
+/**
 + * i40e_set_pauseparam - Set Flow Control parameter
 + * @netdev: network interface device structure
 + * @pause: requested autoneg, rx-pause and tx-pause settings
 + *
 + * Returns 0 on success, -EOPNOTSUPP when called on a non-LAN VSI, when
 + * the request would change autoneg, or when priority flow control is
 + * active, and -EAGAIN when i40e_set_fc() reports an admin queue failure.
 + * When the PF is not down the result of i40e_nway_reset() is returned
 + * instead.
 + **/
 +static int i40e_set_pauseparam(struct net_device *netdev,
 +			       struct ethtool_pauseparam *pause)
 +{
 +	struct i40e_netdev_priv *np = netdev_priv(netdev);
 +	struct i40e_pf *pf = np->vsi->back;
 +	struct i40e_vsi *vsi = np->vsi;
 +	struct i40e_hw *hw = &pf->hw;
 +	struct i40e_link_status *hw_link_info = &hw->phy.link_info;
 +	bool link_up = hw_link_info->link_info & I40E_AQ_LINK_UP;
 +	i40e_status status;
 +	u8 aq_failures;
 +	/* must start at 0: it is returned untouched when no aq_failures bit
 +	 * is set and the PF is down
 +	 */
 +	int err = 0;
 +
 +	if (vsi != pf->vsi[pf->lan_vsi])
 +		return -EOPNOTSUPP;
 +
 +	/* autoneg itself cannot be changed through this call */
 +	if (pause->autoneg != ((hw_link_info->an_info & I40E_AQ_AN_COMPLETED) ?
 +	    AUTONEG_ENABLE : AUTONEG_DISABLE)) {
 +		netdev_info(netdev, "To change autoneg please use: ethtool -s <dev> autoneg <on|off>\n");
 +		return -EOPNOTSUPP;
 +	}
 +
 +	/* PF is not marked down but autoneg has not completed */
 +	if (!test_bit(__I40E_DOWN, &pf->state) &&
 +	    !(hw_link_info->an_info & I40E_AQ_AN_COMPLETED)) {
 +		/* Send message that it might not necessarily work*/
 +		netdev_info(netdev, "Autoneg did not complete so changing settings may not result in an actual change.\n");
 +	}
 +
 +	if (hw->fc.current_mode == I40E_FC_PFC) {
 +		netdev_info(netdev, "Priority flow control enabled. Cannot set link flow control.\n");
 +		return -EOPNOTSUPP;
 +	}
 +
 +	/* the four rx/tx combinations are exhaustive; the trailing else is
 +	 * kept only as a defensive fallback
 +	 */
 +	if (pause->rx_pause && pause->tx_pause)
 +		hw->fc.requested_mode = I40E_FC_FULL;
 +	else if (pause->rx_pause && !pause->tx_pause)
 +		hw->fc.requested_mode = I40E_FC_RX_PAUSE;
 +	else if (!pause->rx_pause && pause->tx_pause)
 +		hw->fc.requested_mode = I40E_FC_TX_PAUSE;
 +	else if (!pause->rx_pause && !pause->tx_pause)
 +		hw->fc.requested_mode = I40E_FC_NONE;
 +	else
 +		return -EINVAL;
 +
 +	/* Set the fc mode and only restart an if link is up*/
 +	status = i40e_set_fc(hw, &aq_failures, link_up);
 +
 +	if (aq_failures & I40E_SET_FC_AQ_FAIL_GET) {
 +		netdev_info(netdev, "Set fc failed on the get_phy_capabilities call with error %d and status %d\n",
 +			    status, hw->aq.asq_last_status);
 +		err = -EAGAIN;
 +	}
 +	if (aq_failures & I40E_SET_FC_AQ_FAIL_SET) {
 +		netdev_info(netdev, "Set fc failed on the set_phy_config call with error %d and status %d\n",
 +			    status, hw->aq.asq_last_status);
 +		err = -EAGAIN;
 +	}
 +	if (aq_failures & I40E_SET_FC_AQ_FAIL_UPDATE) {
 +		netdev_info(netdev, "Set fc failed on the update_link_info call with error %d and status %d\n",
 +			    status, hw->aq.asq_last_status);
 +		err = -EAGAIN;
 +	}
 +
 +	if (!test_bit(__I40E_DOWN, &pf->state))
 +		return i40e_nway_reset(netdev);
 +
 +	return err;
 +}
 +
  static u32 i40e_get_msglevel(struct net_device *netdev)
  {
    struct i40e_netdev_priv *np = netdev_priv(netdev);
@@@ -1376,6 -1021,24 +1376,6 @@@ static int i40e_set_wol(struct net_devi
    return 0;
  }
-static int i40e_nway_reset(struct net_device *netdev)
 -{
 -	/* restart autonegotiation */
 -	struct i40e_netdev_priv *np = netdev_priv(netdev);
 -	struct i40e_pf *pf = np->vsi->back;
 -	struct i40e_hw *hw = &pf->hw;
 -	i40e_status ret = 0;
 -
 -	ret = i40e_aq_set_link_restart_an(hw, NULL);
 -	if (ret) {
 -		netdev_info(netdev, "link restart failed, aq_err=%d\n",
 -			    pf->hw.aq.asq_last_status);
 -		return -EIO;
 -	}
 -
 -	return 0;
 -}
 -
  static int i40e_set_phys_id(struct net_device *netdev,
    		    enum ethtool_phys_id_state state)
  {
@@@ -1442,36 -1105,17 +1442,36 @@@ static int i40e_set_coalesce(struct net
    if (ec->tx_max_coalesced_frames_irq || ec->rx_max_coalesced_frames_irq)
    	vsi->work_limit = ec->tx_max_coalesced_frames_irq;
+	vector = vsi->base_vector;
    if ((ec->rx_coalesce_usecs >= (I40E_MIN_ITR << 1)) &&
 -	    (ec->rx_coalesce_usecs <= (I40E_MAX_ITR << 1)))
 +	    (ec->rx_coalesce_usecs <= (I40E_MAX_ITR << 1))) {
    	vsi->rx_itr_setting = ec->rx_coalesce_usecs;
 -	else
 +	} else if (ec->rx_coalesce_usecs == 0) {
 +		vsi->rx_itr_setting = ec->rx_coalesce_usecs;
 +		i40e_irq_dynamic_disable(vsi, vector);
 +		if (ec->use_adaptive_rx_coalesce)
 +			netif_info(pf, drv, netdev,
 +				   "Rx-secs=0, need to disable adaptive-Rx for a complete disable\n");
 +	} else {
 +		netif_info(pf, drv, netdev,
 +			   "Invalid value, Rx-usecs range is 0, 8-8160\n");
    	return -EINVAL;
 +	}
if ((ec->tx_coalesce_usecs >= (I40E_MIN_ITR << 1)) &&
 -	    (ec->tx_coalesce_usecs <= (I40E_MAX_ITR << 1)))
 +	    (ec->tx_coalesce_usecs <= (I40E_MAX_ITR << 1))) {
    	vsi->tx_itr_setting = ec->tx_coalesce_usecs;
 -	else
 +	} else if (ec->tx_coalesce_usecs == 0) {
 +		vsi->tx_itr_setting = ec->tx_coalesce_usecs;
 +		i40e_irq_dynamic_disable(vsi, vector);
 +		if (ec->use_adaptive_tx_coalesce)
 +			netif_info(pf, drv, netdev,
 +				   "Tx-secs=0, need to disable adaptive-Tx for a complete disable\n");
 +	} else {
 +		netif_info(pf, drv, netdev,
 +			   "Invalid value, Tx-usecs range is 0, 8-8160\n");
    	return -EINVAL;
 +	}
if (ec->use_adaptive_rx_coalesce)
    	vsi->rx_itr_setting |= I40E_ITR_DYNAMIC;
@@@ -1483,6 -1127,7 +1483,6 @@@
    else
    	vsi->tx_itr_setting &= ~I40E_ITR_DYNAMIC;
-	vector = vsi->base_vector;
    for (i = 0; i < vsi->num_q_vectors; i++, vector++) {
    	q_vector = vsi->q_vectors[i];
    	q_vector->rx.itr = ITR_TO_REG(vsi->rx_itr_setting);
@@@ -1853,7 -1498,7 +1853,7 @@@ static int i40e_update_ethtool_fdir_ent
/* add filter to the list */
    if (parent)
- 		hlist_add_after(&parent->fdir_node, &input->fdir_node);
+ 		hlist_add_behind(&input->fdir_node, &parent->fdir_node);
    else
    	hlist_add_head(&input->fdir_node,
    		       &pf->fdir_filter_list);
@@@ -2086,7 -1731,6 +2086,7 @@@ static int i40e_set_channels(struct net
static const struct ethtool_ops i40e_ethtool_ops = {
    .get_settings		= i40e_get_settings,
 +	.set_settings		= i40e_set_settings,
    .get_drvinfo		= i40e_get_drvinfo,
    .get_regs_len		= i40e_get_regs_len,
    .get_regs		= i40e_get_regs,
@@@ -2099,7 -1743,6 +2099,7 @@@
    .get_ringparam		= i40e_get_ringparam,
    .set_ringparam		= i40e_set_ringparam,
    .get_pauseparam		= i40e_get_pauseparam,
 +	.set_pauseparam		= i40e_set_pauseparam,
    .get_msglevel		= i40e_get_msglevel,
    .set_msglevel		= i40e_set_msglevel,
    .get_rxnfc		= i40e_get_rxnfc,
diff --combined drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index 94a1c07,a6e5bcc..e4100b5
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@@ -1408,6 -1408,7 +1408,6 @@@ static int ixgbe_reg_test(struct ixgbe_
    default:
    	*data = 1;
    	return 1;
 -		break;
    }
/*
@@@ -2517,7 -2518,7 +2517,7 @@@ static int ixgbe_update_ethtool_fdir_en
/* add filter to the list */
    if (parent)
- 		hlist_add_after(&parent->fdir_node, &input->fdir_node);
+ 		hlist_add_behind(&input->fdir_node, &parent->fdir_node);
    else
    	hlist_add_head(&input->fdir_node,
    		       &adapter->fdir_filter_list);
@@@ -2865,6 -2866,7 +2865,6 @@@ static int ixgbe_get_ts_info(struct net
    	break;
    default:
    	return ethtool_op_get_ts_info(dev, info);
 -		break;
    }
    return 0;
  }
diff --combined drivers/staging/android/binder.c
index 02b0379,0ca9785..4f34dc0
--- a/drivers/staging/android/binder.c
+++ b/drivers/staging/android/binder.c
@@@ -454,8 -454,9 +454,8 @@@ static size_t binder_buffer_size(struc
  {
    if (list_is_last(&buffer->entry, &proc->buffers))
    	return proc->buffer + proc->buffer_size - (void *)buffer->data;
 -	else
 -		return (size_t)list_entry(buffer->entry.next,
 -			struct binder_buffer, entry) - (size_t)buffer->data;
 +	return (size_t)list_entry(buffer->entry.next,
 +			  struct binder_buffer, entry) - (size_t)buffer->data;
  }
static void binder_insert_free_buffer(struct binder_proc *proc,
@@@ -585,7 -586,6 +585,6 @@@ static int binder_update_page_range(str
for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) {
    	int ret;
- 		struct page **page_array_ptr;
page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE];
@@@ -598,8 -598,7 +597,7 @@@
    	}
    	tmp_area.addr = page_addr;
    	tmp_area.size = PAGE_SIZE + PAGE_SIZE /* guard page? */;
- 		page_array_ptr = page;
- 		ret = map_vm_area(&tmp_area, PAGE_KERNEL, &page_array_ptr);
+ 		ret = map_vm_area(&tmp_area, PAGE_KERNEL, page);
    	if (ret) {
    		pr_err("%d: binder_alloc_buf failed to map page at %p in kernel\n",
    		       proc->pid, page_addr);
@@@ -1185,7 -1184,6 +1183,7 @@@ static void binder_send_failed_reply(st
    			     uint32_t error_code)
  {
    struct binder_thread *target_thread;
 +	struct binder_transaction *next;
BUG_ON(t->flags & TF_ONE_WAY);
    while (1) {
@@@ -1213,23 -1211,24 +1211,23 @@@
    				target_thread->return_error);
    		}
    		return;
 -		} else {
 -			struct binder_transaction *next = t->from_parent;
 +		}
 +		next = t->from_parent;
-			binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
 -				     "send failed reply for transaction %d, target dead\n",
 -				     t->debug_id);
 +		binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
 +			     "send failed reply for transaction %d, target dead\n",
 +			     t->debug_id);
-			binder_pop_transaction(target_thread, t);
 -			if (next == NULL) {
 -				binder_debug(BINDER_DEBUG_DEAD_BINDER,
 -					     "reply failed, no target thread at root\n");
 -				return;
 -			}
 -			t = next;
 +		binder_pop_transaction(target_thread, t);
 +		if (next == NULL) {
    		binder_debug(BINDER_DEBUG_DEAD_BINDER,
 -				     "reply failed, no target thread -- retry %d\n",
 -				      t->debug_id);
 +				     "reply failed, no target thread at root\n");
 +			return;
    	}
 +		t = next;
 +		binder_debug(BINDER_DEBUG_DEAD_BINDER,
 +			     "reply failed, no target thread -- retry %d\n",
 +			      t->debug_id);
    }
  }
@@@ -2593,106 -2592,6 +2591,106 @@@ static unsigned int binder_poll(struct 
    return 0;
  }
+static int binder_ioctl_write_read(struct file *filp,
 +				unsigned int cmd, unsigned long arg,
 +				struct binder_thread *thread)
 +{
 +	struct binder_proc *proc = filp->private_data;
 +	void __user *uptr = (void __user *)arg;
 +	struct binder_write_read bwr;
 +	int rc = 0;
 +
 +	/* the ioctl must carry exactly one struct binder_write_read */
 +	if (_IOC_SIZE(cmd) != sizeof(struct binder_write_read))
 +		return -EINVAL;
 +	if (copy_from_user(&bwr, uptr, sizeof(bwr)))
 +		return -EFAULT;
 +
 +	binder_debug(BINDER_DEBUG_READ_WRITE,
 +		     "%d:%d write %lld at %016llx, read %lld at %016llx\n",
 +		     proc->pid, thread->pid,
 +		     (u64)bwr.write_size, (u64)bwr.write_buffer,
 +		     (u64)bwr.read_size, (u64)bwr.read_buffer);
 +
 +	/* write half: on failure report zero bytes read and hand the
 +	 * descriptor back to userspace before bailing out
 +	 */
 +	if (bwr.write_size > 0) {
 +		rc = binder_thread_write(proc, thread, bwr.write_buffer,
 +					 bwr.write_size, &bwr.write_consumed);
 +		trace_binder_write_done(rc);
 +		if (rc < 0) {
 +			bwr.read_consumed = 0;
 +			return copy_to_user(uptr, &bwr, sizeof(bwr)) ?
 +			       -EFAULT : rc;
 +		}
 +	}
 +
 +	/* read half: waiters on proc->wait are woken whenever proc->todo
 +	 * is non-empty, whether or not the read itself succeeded
 +	 */
 +	if (bwr.read_size > 0) {
 +		rc = binder_thread_read(proc, thread, bwr.read_buffer,
 +					bwr.read_size, &bwr.read_consumed,
 +					filp->f_flags & O_NONBLOCK);
 +		trace_binder_read_done(rc);
 +		if (!list_empty(&proc->todo))
 +			wake_up_interruptible(&proc->wait);
 +		if (rc < 0)
 +			return copy_to_user(uptr, &bwr, sizeof(bwr)) ?
 +			       -EFAULT : rc;
 +	}
 +
 +	binder_debug(BINDER_DEBUG_READ_WRITE,
 +		     "%d:%d wrote %lld of %lld, read return %lld of %lld\n",
 +		     proc->pid, thread->pid,
 +		     (u64)bwr.write_consumed, (u64)bwr.write_size,
 +		     (u64)bwr.read_consumed, (u64)bwr.read_size);
 +
 +	if (copy_to_user(uptr, &bwr, sizeof(bwr)))
 +		return -EFAULT;
 +
 +	return rc;
 +}
 +
 +static int binder_ioctl_set_ctx_mgr(struct file *filp)
 +{
 +	struct binder_proc *proc = filp->private_data;
 +	kuid_t euid = current_euid();
 +
 +	/* only one context manager node may exist at a time */
 +	if (binder_context_mgr_node != NULL) {
 +		pr_err("BINDER_SET_CONTEXT_MGR already set\n");
 +		return -EBUSY;
 +	}
 +
 +	/* the first caller records its euid; once a valid uid is recorded
 +	 * the caller's euid must match it
 +	 */
 +	if (!uid_valid(binder_context_mgr_uid)) {
 +		binder_context_mgr_uid = euid;
 +	} else if (!uid_eq(binder_context_mgr_uid, euid)) {
 +		pr_err("BINDER_SET_CONTEXT_MGR bad uid %d != %d\n",
 +		       from_kuid(&init_user_ns, euid),
 +		       from_kuid(&init_user_ns, binder_context_mgr_uid));
 +		return -EPERM;
 +	}
 +
 +	binder_context_mgr_node = binder_new_node(proc, 0, 0);
 +	if (!binder_context_mgr_node)
 +		return -ENOMEM;
 +
 +	/* take one weak and one strong local reference on the new node */
 +	binder_context_mgr_node->local_weak_refs++;
 +	binder_context_mgr_node->local_strong_refs++;
 +	binder_context_mgr_node->has_strong_ref = 1;
 +	binder_context_mgr_node->has_weak_ref = 1;
 +
 +	return 0;
 +}
 +
  static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
  {
    int ret;
@@@ -2700,9 -2599,9 +2698,9 @@@
    struct binder_thread *thread;
    unsigned int size = _IOC_SIZE(cmd);
    void __user *ubuf = (void __user *)arg;
 -	kuid_t curr_euid = current_euid();
-	/*pr_info("binder_ioctl: %d:%d %x %lx\n", proc->pid, current->pid, cmd, arg);*/
 +	/*pr_info("binder_ioctl: %d:%d %x %lx\n",
 +			proc->pid, current->pid, cmd, arg);*/
trace_binder_ioctl(cmd, arg);
@@@ -2718,11 -2617,61 +2716,11 @@@
    }
switch (cmd) {
 -	case BINDER_WRITE_READ: {
 -		struct binder_write_read bwr;
 -
 -		if (size != sizeof(struct binder_write_read)) {
 -			ret = -EINVAL;
 +	case BINDER_WRITE_READ:
 +		ret = binder_ioctl_write_read(filp, cmd, arg, thread);
 +		if (ret)
    		goto err;
 -		}
 -		if (copy_from_user(&bwr, ubuf, sizeof(bwr))) {
 -			ret = -EFAULT;
 -			goto err;
 -		}
 -		binder_debug(BINDER_DEBUG_READ_WRITE,
 -			     "%d:%d write %lld at %016llx, read %lld at %016llx\n",
 -			     proc->pid, thread->pid,
 -			     (u64)bwr.write_size, (u64)bwr.write_buffer,
 -			     (u64)bwr.read_size, (u64)bwr.read_buffer);
 -
 -		if (bwr.write_size > 0) {
 -			ret = binder_thread_write(proc, thread,
 -						  bwr.write_buffer,
 -						  bwr.write_size,
 -						  &bwr.write_consumed);
 -			trace_binder_write_done(ret);
 -			if (ret < 0) {
 -				bwr.read_consumed = 0;
 -				if (copy_to_user(ubuf, &bwr, sizeof(bwr)))
 -					ret = -EFAULT;
 -				goto err;
 -			}
 -		}
 -		if (bwr.read_size > 0) {
 -			ret = binder_thread_read(proc, thread, bwr.read_buffer,
 -						 bwr.read_size,
 -						 &bwr.read_consumed,
 -						 filp->f_flags & O_NONBLOCK);
 -			trace_binder_read_done(ret);
 -			if (!list_empty(&proc->todo))
 -				wake_up_interruptible(&proc->wait);
 -			if (ret < 0) {
 -				if (copy_to_user(ubuf, &bwr, sizeof(bwr)))
 -					ret = -EFAULT;
 -				goto err;
 -			}
 -		}
 -		binder_debug(BINDER_DEBUG_READ_WRITE,
 -			     "%d:%d wrote %lld of %lld, read return %lld of %lld\n",
 -			     proc->pid, thread->pid,
 -			     (u64)bwr.write_consumed, (u64)bwr.write_size,
 -			     (u64)bwr.read_consumed, (u64)bwr.read_size);
 -		if (copy_to_user(ubuf, &bwr, sizeof(bwr))) {
 -			ret = -EFAULT;
 -			goto err;
 -		}
    	break;
 -	}
    case BINDER_SET_MAX_THREADS:
    	if (copy_from_user(&proc->max_threads, ubuf, sizeof(proc->max_threads))) {
    		ret = -EINVAL;
@@@ -2730,9 -2679,31 +2728,9 @@@
    	}
    	break;
    case BINDER_SET_CONTEXT_MGR:
 -		if (binder_context_mgr_node != NULL) {
 -			pr_err("BINDER_SET_CONTEXT_MGR already set\n");
 -			ret = -EBUSY;
 -			goto err;
 -		}
 -		if (uid_valid(binder_context_mgr_uid)) {
 -			if (!uid_eq(binder_context_mgr_uid, curr_euid)) {
 -				pr_err("BINDER_SET_CONTEXT_MGR bad uid %d != %d\n",
 -				       from_kuid(&init_user_ns, curr_euid),
 -				       from_kuid(&init_user_ns, binder_context_mgr_uid));
 -				ret = -EPERM;
 -				goto err;
 -			}
 -		} else {
 -			binder_context_mgr_uid = curr_euid;
 -		}
 -		binder_context_mgr_node = binder_new_node(proc, 0, 0);
 -		if (binder_context_mgr_node == NULL) {
 -			ret = -ENOMEM;
 +		ret = binder_ioctl_set_ctx_mgr(filp);
 +		if (ret)
    		goto err;
 -		}
 -		binder_context_mgr_node->local_weak_refs++;
 -		binder_context_mgr_node->local_strong_refs++;
 -		binder_context_mgr_node->has_strong_ref = 1;
 -		binder_context_mgr_node->has_weak_ref = 1;
    	break;
    case BINDER_THREAD_EXIT:
    	binder_debug(BINDER_DEBUG_THREADS, "%d:%d exit\n",
@@@ -2796,15 -2767,9 +2794,15 @@@ static void binder_vma_close(struct vm_
    binder_defer_work(proc, BINDER_DEFERRED_PUT_FILES);
  }
+static int binder_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 +{	/* fault handler installed as binder_vm_ops.fault; vma and vmf are not examined */
 +	return VM_FAULT_SIGBUS;	/* unconditional: every call reports VM_FAULT_SIGBUS */
 +}
 +
  static struct vm_operations_struct binder_vm_ops = {
    .open = binder_vma_open,
    .close = binder_vma_close,
 +	.fault = binder_vm_fault,
  };
static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
diff --combined drivers/staging/lustre/lustre/libcfs/hash.c
index 5dde794,6db7391..8ef1deb
--- a/drivers/staging/lustre/lustre/libcfs/hash.c
+++ b/drivers/staging/lustre/lustre/libcfs/hash.c
@@@ -107,7 -107,7 +107,7 @@@
   *   table. Also, user can break the iteration by return 1 in callback.
   */
-#include <linux/libcfs/libcfs.h>
 +#include "../../include/linux/libcfs/libcfs.h"
  #include <linux/seq_file.h>
#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1
@@@ -351,7 -351,7 +351,7 @@@ cfs_hash_dh_hnode_add(struct cfs_hash *
    				    cfs_hash_dhead_t, dh_head);
if (dh->dh_tail != NULL) /* not empty */
- 		hlist_add_after(dh->dh_tail, hnode);
+ 		hlist_add_behind(hnode, dh->dh_tail);
    else /* empty list */
    	hlist_add_head(hnode, &dh->dh_head);
    dh->dh_tail = hnode;
@@@ -406,7 -406,7 +406,7 @@@ cfs_hash_dd_hnode_add(struct cfs_hash *
    					cfs_hash_dhead_dep_t, dd_head);
if (dh->dd_tail != NULL) /* not empty */
- 		hlist_add_after(dh->dd_tail, hnode);
+ 		hlist_add_behind(hnode, dh->dd_tail);
    else /* empty list */
    	hlist_add_head(hnode, &dh->dd_head);
    dh->dd_tail = hnode;
diff --combined drivers/video/backlight/backlight.c
index bddc8b1,19b170d..0ce8823
--- a/drivers/video/backlight/backlight.c
+++ b/drivers/video/backlight/backlight.c
@@@ -190,8 -190,6 +190,6 @@@ static ssize_t brightness_store(struct 
    }
    mutex_unlock(&bd->ops_lock);
- 	backlight_generate_event(bd, BACKLIGHT_UPDATE_SYSFS);
- 
    return rc;
  }
  static DEVICE_ATTR_RW(brightness);
@@@ -223,8 -221,6 +221,8 @@@ static ssize_t actual_brightness_show(s
    mutex_lock(&bd->ops_lock);
    if (bd->ops && bd->ops->get_brightness)
    	rc = sprintf(buf, "%d\n", bd->ops->get_brightness(bd));
 +	else
 +		rc = sprintf(buf, "%d\n", bd->props.brightness);
    mutex_unlock(&bd->ops_lock);
return rc;
diff --combined fs/cifs/cifssmb.c
index 7d4361f,c3dc52e..692d79f
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@@ -196,6 -196,10 +196,6 @@@ cifs_reconnect_tcon(struct cifs_tcon *t
    if (rc)
    	goto out;
-	/*
 -	 * FIXME: check if wsize needs updated due to negotiated smb buffer
 -	 * 	  size shrinking
 -	 */
    atomic_inc(&tconInfoReconnectCount);
/* tell server Unix caps we support */
@@@ -1513,6 -1517,7 +1513,6 @@@ cifs_readv_receive(struct TCP_Server_In
    	return length;
server->total_read += length;
 -	rdata->bytes = length;
cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n",
    	 server->total_read, buflen, data_len);
@@@ -1555,18 -1560,12 +1555,18 @@@ cifs_readv_callback(struct mid_q_entry 
    				 rc);
    	}
    	/* FIXME: should this be counted toward the initiating task? */
 -		task_io_account_read(rdata->bytes);
 -		cifs_stats_bytes_read(tcon, rdata->bytes);
 +		task_io_account_read(rdata->got_bytes);
 +		cifs_stats_bytes_read(tcon, rdata->got_bytes);
    	break;
    case MID_REQUEST_SUBMITTED:
    case MID_RETRY_NEEDED:
    	rdata->result = -EAGAIN;
 +		if (server->sign && rdata->got_bytes)
 +			/* reset bytes number since we can not check a sign */
 +			rdata->got_bytes = 0;
 +		/* FIXME: should this be counted toward the initiating task? */
 +		task_io_account_read(rdata->got_bytes);
 +		cifs_stats_bytes_read(tcon, rdata->got_bytes);
    	break;
    default:
    	rdata->result = -EIO;
@@@ -1735,7 -1734,10 +1735,7 @@@ CIFSSMBRead(const unsigned int xid, str
/*	cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
    if (*buf) {
 -		if (resp_buf_type == CIFS_SMALL_BUFFER)
 -			cifs_small_buf_release(iov[0].iov_base);
 -		else if (resp_buf_type == CIFS_LARGE_BUFFER)
 -			cifs_buf_release(iov[0].iov_base);
 +		free_rsp_buf(resp_buf_type, iov[0].iov_base);
    } else if (resp_buf_type != CIFS_NO_BUFFER) {
    	/* return buffer to caller to free */
    	*buf = iov[0].iov_base;
@@@ -1900,79 -1902,27 +1900,79 @@@ cifs_writev_requeue(struct cifs_writeda
    int i, rc;
    struct inode *inode = wdata->cfile->dentry->d_inode;
    struct TCP_Server_Info *server;
 +	unsigned int rest_len;
-	for (i = 0; i < wdata->nr_pages; i++) {
 -		lock_page(wdata->pages[i]);
 -		clear_page_dirty_for_io(wdata->pages[i]);
 -	}
 -
 +	server = tlink_tcon(wdata->cfile->tlink)->ses->server;
 +	i = 0;
 +	rest_len = wdata->bytes;
    do {
 -		server = tlink_tcon(wdata->cfile->tlink)->ses->server;
 -		rc = server->ops->async_writev(wdata, cifs_writedata_release);
 -	} while (rc == -EAGAIN);
 +		struct cifs_writedata *wdata2;
 +		unsigned int j, nr_pages, wsize, tailsz, cur_len;
 +
 +		wsize = server->ops->wp_retry_size(inode);
 +		if (wsize < rest_len) {
 +			nr_pages = wsize / PAGE_CACHE_SIZE;
 +			if (!nr_pages) {
 +				rc = -ENOTSUPP;
 +				break;
 +			}
 +			cur_len = nr_pages * PAGE_CACHE_SIZE;
 +			tailsz = PAGE_CACHE_SIZE;
 +		} else {
 +			nr_pages = DIV_ROUND_UP(rest_len, PAGE_CACHE_SIZE);
 +			cur_len = rest_len;
 +			tailsz = rest_len - (nr_pages - 1) * PAGE_CACHE_SIZE;
 +		}
-	for (i = 0; i < wdata->nr_pages; i++) {
 -		unlock_page(wdata->pages[i]);
 -		if (rc != 0) {
 -			SetPageError(wdata->pages[i]);
 -			end_page_writeback(wdata->pages[i]);
 -			page_cache_release(wdata->pages[i]);
 +		wdata2 = cifs_writedata_alloc(nr_pages, cifs_writev_complete);
 +		if (!wdata2) {
 +			rc = -ENOMEM;
 +			break;
    	}
 -	}
-	mapping_set_error(inode->i_mapping, rc);
 +		for (j = 0; j < nr_pages; j++) {
 +			wdata2->pages[j] = wdata->pages[i + j];
 +			lock_page(wdata2->pages[j]);
 +			clear_page_dirty_for_io(wdata2->pages[j]);
 +		}
 +
 +		wdata2->sync_mode = wdata->sync_mode;
 +		wdata2->nr_pages = nr_pages;
 +		wdata2->offset = page_offset(wdata2->pages[0]);
 +		wdata2->pagesz = PAGE_CACHE_SIZE;
 +		wdata2->tailsz = tailsz;
 +		wdata2->bytes = cur_len;
 +
 +		wdata2->cfile = find_writable_file(CIFS_I(inode), false);
 +		if (!wdata2->cfile) {
 +			cifs_dbg(VFS, "No writable handles for inode\n");
 +			rc = -EBADF;
 +			break;
 +		}
 +		wdata2->pid = wdata2->cfile->pid;
 +		rc = server->ops->async_writev(wdata2, cifs_writedata_release);
 +
 +		for (j = 0; j < nr_pages; j++) {
 +			unlock_page(wdata2->pages[j]);
 +			if (rc != 0 && rc != -EAGAIN) {
 +				SetPageError(wdata2->pages[j]);
 +				end_page_writeback(wdata2->pages[j]);
 +				page_cache_release(wdata2->pages[j]);
 +			}
 +		}
 +
 +		if (rc) {
 +			kref_put(&wdata2->refcount, cifs_writedata_release);
 +			if (rc == -EAGAIN)
 +				continue;
 +			mapping_set_error(inode->i_mapping, rc);
 +			break;
 +		}
 +
 +		rest_len -= cur_len;
 +		i += nr_pages;
 +	} while (i < wdata->nr_pages);
 +
    kref_put(&wdata->refcount, cifs_writedata_release);
  }
@@@ -2253,7 -2203,10 +2253,7 @@@ CIFSSMBWrite2(const unsigned int xid, s
    }
/*	cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
 -	if (resp_buf_type == CIFS_SMALL_BUFFER)
 -		cifs_small_buf_release(iov[0].iov_base);
 -	else if (resp_buf_type == CIFS_LARGE_BUFFER)
 -		cifs_buf_release(iov[0].iov_base);
 +	free_rsp_buf(resp_buf_type, iov[0].iov_base);
/* Note: On -EAGAIN error only caller can retry on handle based calls
    	since file handle passed in no longer valid */
@@@ -2477,14 -2430,14 +2477,14 @@@ CIFSSMBPosixLock(const unsigned int xid
    	}
    	parm_data = (struct cifs_posix_lock *)
    		((char *)&pSMBr->hdr.Protocol + data_offset);
- 		if (parm_data->lock_type == __constant_cpu_to_le16(CIFS_UNLCK))
+ 		if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK))
    		pLockData->fl_type = F_UNLCK;
    	else {
    		if (parm_data->lock_type ==
- 					__constant_cpu_to_le16(CIFS_RDLCK))
+ 					cpu_to_le16(CIFS_RDLCK))
    			pLockData->fl_type = F_RDLCK;
    		else if (parm_data->lock_type ==
- 					__constant_cpu_to_le16(CIFS_WRLCK))
+ 					cpu_to_le16(CIFS_WRLCK))
    			pLockData->fl_type = F_WRLCK;
pLockData->fl_start = le64_to_cpu(parm_data->start);
@@@ -2498,7 -2451,10 +2498,7 @@@ plk_err_exit
    if (pSMB)
    	cifs_small_buf_release(pSMB);
-	if (resp_buf_type == CIFS_SMALL_BUFFER)
 -		cifs_small_buf_release(iov[0].iov_base);
 -	else if (resp_buf_type == CIFS_LARGE_BUFFER)
 -		cifs_buf_release(iov[0].iov_base);
 +	free_rsp_buf(resp_buf_type, iov[0].iov_base);
/* Note: On -EAGAIN error only caller can retry on handle based calls
       since file handle passed in no longer valid */
@@@ -3276,25 -3232,25 +3276,25 @@@ CIFSSMB_set_compression(const unsigned 
    pSMB->compression_state = cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
pSMB->TotalParameterCount = 0;
- 	pSMB->TotalDataCount = __constant_cpu_to_le32(2);
+ 	pSMB->TotalDataCount = cpu_to_le32(2);
    pSMB->MaxParameterCount = 0;
    pSMB->MaxDataCount = 0;
    pSMB->MaxSetupCount = 4;
    pSMB->Reserved = 0;
    pSMB->ParameterOffset = 0;
- 	pSMB->DataCount = __constant_cpu_to_le32(2);
+ 	pSMB->DataCount = cpu_to_le32(2);
    pSMB->DataOffset =
    	cpu_to_le32(offsetof(struct smb_com_transaction_compr_ioctl_req,
    			compression_state) - 4);  /* 84 */
    pSMB->SetupCount = 4;
- 	pSMB->SubCommand = __constant_cpu_to_le16(NT_TRANSACT_IOCTL);
+ 	pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL);
    pSMB->ParameterCount = 0;
- 	pSMB->FunctionCode = __constant_cpu_to_le32(FSCTL_SET_COMPRESSION);
+ 	pSMB->FunctionCode = cpu_to_le32(FSCTL_SET_COMPRESSION);
    pSMB->IsFsctl = 1; /* FSCTL */
    pSMB->IsRootFlag = 0;
    pSMB->Fid = fid; /* file handle always le */
    /* 3 byte pad, followed by 2 byte compress state */
- 	pSMB->ByteCount = __constant_cpu_to_le16(5);
+ 	pSMB->ByteCount = cpu_to_le16(5);
    inc_rfc1001_len(pSMB, 5);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@@ -3430,10 -3386,10 +3430,10 @@@ static __u16 ACL_to_cifs_posix(char *pa
    cifs_acl->version = cpu_to_le16(1);
    if (acl_type == ACL_TYPE_ACCESS) {
    	cifs_acl->access_entry_count = cpu_to_le16(count);
- 		cifs_acl->default_entry_count = __constant_cpu_to_le16(0xFFFF);
+ 		cifs_acl->default_entry_count = cpu_to_le16(0xFFFF);
    } else if (acl_type == ACL_TYPE_DEFAULT) {
    	cifs_acl->default_entry_count = cpu_to_le16(count);
- 		cifs_acl->access_entry_count = __constant_cpu_to_le16(0xFFFF);
+ 		cifs_acl->access_entry_count = cpu_to_le16(0xFFFF);
    } else {
    	cifs_dbg(FYI, "unknown ACL type %d\n", acl_type);
    	return 0;
@@@ -3882,7 -3838,10 +3882,7 @@@ CIFSSMBGetCIFSACL(const unsigned int xi
    	}
    }
  qsec_out:
 -	if (buf_type == CIFS_SMALL_BUFFER)
 -		cifs_small_buf_release(iov[0].iov_base);
 -	else if (buf_type == CIFS_LARGE_BUFFER)
 -		cifs_buf_release(iov[0].iov_base);
 +	free_rsp_buf(buf_type, iov[0].iov_base);
  /*	cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
    return rc;
  }
diff --combined fs/cifs/file.c
index 01a6339,3c1967c..03558d4
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@@ -1058,7 -1058,7 +1058,7 @@@ cifs_push_mandatory_locks(struct cifsFi
max_num = (max_buf - sizeof(struct smb_hdr)) /
    					sizeof(LOCKING_ANDX_RANGE);
- 	buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
+ 	buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
    if (!buf) {
    	free_xid(xid);
    	return -ENOMEM;
@@@ -1393,7 -1393,7 +1393,7 @@@ cifs_unlock_range(struct cifsFileInfo *
max_num = (max_buf - sizeof(struct smb_hdr)) /
    					sizeof(LOCKING_ANDX_RANGE);
- 	buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
+ 	buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
    if (!buf)
    	return -ENOMEM;
@@@ -1670,8 -1670,8 +1670,8 @@@ cifs_write(struct cifsFileInfo *open_fi
    				break;
    		}
-			len = min((size_t)cifs_sb->wsize,
 -				  write_size - total_written);
 +			len = min(server->ops->wp_retry_size(dentry->d_inode),
 +				  (unsigned int)write_size - total_written);
    		/* iov[0] is reserved for smb header */
    		iov[1].iov_base = (char *)write_data + total_written;
    		iov[1].iov_len = len;
@@@ -1878,178 -1878,15 +1878,178 @@@ static int cifs_partialpagewrite(struc
    return rc;
  }
+static struct cifs_writedata *
 +wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping,
 +			  pgoff_t end, pgoff_t *index,
 +			  unsigned int *found_pages)
 +{
 +	unsigned int nr_pages;
 +	struct page **pages;
 +	struct cifs_writedata *wdata;
 +
 +	wdata = cifs_writedata_alloc((unsigned int)tofind,
 +				     cifs_writev_complete);
 +	if (!wdata)
 +		return NULL;
 +
 +	/*
 +	 * find_get_pages_tag seems to return a max of 256 on each
 +	 * iteration, so we must call it several times in order to
 +	 * fill the array or the wsize is effectively limited to
 +	 * 256 * PAGE_CACHE_SIZE.
 +	 */
 +	*found_pages = 0;
 +	pages = wdata->pages;
 +	do {
 +		nr_pages = find_get_pages_tag(mapping, index,
 +					      PAGECACHE_TAG_DIRTY, tofind,
 +					      pages);
 +		*found_pages += nr_pages;
 +		tofind -= nr_pages;
 +		pages += nr_pages;
 +	} while (nr_pages && tofind && *index <= end);
 +
 +	return wdata;
 +}
 +
 +static unsigned int
 +wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages,
 +		    struct address_space *mapping,
 +		    struct writeback_control *wbc,
 +		    pgoff_t end, pgoff_t *index, pgoff_t *next, bool *done)
 +{
 +	unsigned int nr_pages = 0, i;
 +	struct page *page;
 +
 +	for (i = 0; i < found_pages; i++) {
 +		page = wdata->pages[i];
 +		/*
 +		 * At this point we hold neither mapping->tree_lock nor
 +		 * lock on the page itself: the page may be truncated or
 +		 * invalidated (changing page->mapping to NULL), or even
 +		 * swizzled back from swapper_space to tmpfs file
 +		 * mapping
 +		 */
 +
 +		if (nr_pages == 0)
 +			lock_page(page);
 +		else if (!trylock_page(page))
 +			break;
 +
 +		if (unlikely(page->mapping != mapping)) {
 +			unlock_page(page);
 +			break;
 +		}
 +
 +		if (!wbc->range_cyclic && page->index > end) {
 +			*done = true;
 +			unlock_page(page);
 +			break;
 +		}
 +
 +		if (*next && (page->index != *next)) {
 +			/* Not next consecutive page */
 +			unlock_page(page);
 +			break;
 +		}
 +
 +		if (wbc->sync_mode != WB_SYNC_NONE)
 +			wait_on_page_writeback(page);
 +
 +		if (PageWriteback(page) ||
 +				!clear_page_dirty_for_io(page)) {
 +			unlock_page(page);
 +			break;
 +		}
 +
 +		/*
 +		 * This actually clears the dirty bit in the radix tree.
 +		 * See cifs_writepage() for more commentary.
 +		 */
 +		set_page_writeback(page);
 +		if (page_offset(page) >= i_size_read(mapping->host)) {
 +			*done = true;
 +			unlock_page(page);
 +			end_page_writeback(page);
 +			break;
 +		}
 +
 +		wdata->pages[i] = page;
 +		*next = page->index + 1;
 +		++nr_pages;
 +	}
 +
 +	/* reset index to refind any pages skipped */
 +	if (nr_pages == 0)
 +		*index = wdata->pages[0]->index + 1;
 +
 +	/* put any pages we aren't going to use */
 +	for (i = nr_pages; i < found_pages; i++) {
 +		page_cache_release(wdata->pages[i]);
 +		wdata->pages[i] = NULL;
 +	}
 +
 +	return nr_pages;
 +}
 +
 +static int
 +wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages,
 +		 struct address_space *mapping, struct writeback_control *wbc)
 +{
 +	int rc = 0;
 +	struct TCP_Server_Info *server;
 +	unsigned int i;
 +
 +	wdata->sync_mode = wbc->sync_mode;
 +	wdata->nr_pages = nr_pages;
 +	wdata->offset = page_offset(wdata->pages[0]);
 +	wdata->pagesz = PAGE_CACHE_SIZE;
 +	wdata->tailsz = min(i_size_read(mapping->host) -
 +			page_offset(wdata->pages[nr_pages - 1]),
 +			(loff_t)PAGE_CACHE_SIZE);
 +	wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + wdata->tailsz;
 +
 +	if (wdata->cfile != NULL)
 +		cifsFileInfo_put(wdata->cfile);
 +	wdata->cfile = find_writable_file(CIFS_I(mapping->host), false);
 +	if (!wdata->cfile) {
 +		cifs_dbg(VFS, "No writable handles for inode\n");
 +		rc = -EBADF;
 +	} else {
 +		wdata->pid = wdata->cfile->pid;
 +		server = tlink_tcon(wdata->cfile->tlink)->ses->server;
 +		rc = server->ops->async_writev(wdata, cifs_writedata_release);
 +	}
 +
 +	for (i = 0; i < nr_pages; ++i)
 +		unlock_page(wdata->pages[i]);
 +
 +	if (!rc)
 +		return rc;
 +
 +	/* send failure -- clean up the mess */
 +	for (i = 0; i < nr_pages; ++i) {
 +		if (rc == -EAGAIN)
 +			redirty_page_for_writepage(wbc, wdata->pages[i]);
 +		else
 +			SetPageError(wdata->pages[i]);
 +		end_page_writeback(wdata->pages[i]);
 +		page_cache_release(wdata->pages[i]);
 +	}
 +	if (rc != -EAGAIN)
 +		mapping_set_error(mapping, rc);
 +
 +	return rc;
 +}
 +
  static int cifs_writepages(struct address_space *mapping,
    		   struct writeback_control *wbc)
  {
    struct cifs_sb_info *cifs_sb = CIFS_SB(mapping->host->i_sb);
 +	struct TCP_Server_Info *server;
    bool done = false, scanned = false, range_whole = false;
    pgoff_t end, index;
    struct cifs_writedata *wdata;
 -	struct TCP_Server_Info *server;
 -	struct page *page;
    int rc = 0;
/*
@@@ -2069,55 -1906,165 +2069,55 @@@
    		range_whole = true;
    	scanned = true;
    }
 +	server = cifs_sb_master_tcon(cifs_sb)->ses->server;
  retry:
    while (!done && index <= end) {
 -		unsigned int i, nr_pages, found_pages;
 -		pgoff_t next = 0, tofind;
 -		struct page **pages;
 +		unsigned int nr_pages, found_pages, wsize, credits;
 +		pgoff_t next = 0, tofind, saved_index = index;
-		tofind = min((cifs_sb->wsize / PAGE_CACHE_SIZE) - 1,
 -				end - index) + 1;
 +		rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize,
 +						   &wsize, &credits);
 +		if (rc)
 +			break;
-		wdata = cifs_writedata_alloc((unsigned int)tofind,
 -					     cifs_writev_complete);
 +		tofind = min((wsize / PAGE_CACHE_SIZE) - 1, end - index) + 1;
 +
 +		wdata = wdata_alloc_and_fillpages(tofind, mapping, end, &index,
 +						  &found_pages);
    	if (!wdata) {
    		rc = -ENOMEM;
 +			add_credits_and_wake_if(server, credits, 0);
    		break;
    	}
-		/*
 -		 * find_get_pages_tag seems to return a max of 256 on each
 -		 * iteration, so we must call it several times in order to
 -		 * fill the array or the wsize is effectively limited to
 -		 * 256 * PAGE_CACHE_SIZE.
 -		 */
 -		found_pages = 0;
 -		pages = wdata->pages;
 -		do {
 -			nr_pages = find_get_pages_tag(mapping, &index,
 -							PAGECACHE_TAG_DIRTY,
 -							tofind, pages);
 -			found_pages += nr_pages;
 -			tofind -= nr_pages;
 -			pages += nr_pages;
 -		} while (nr_pages && tofind && index <= end);
 -
    	if (found_pages == 0) {
    		kref_put(&wdata->refcount, cifs_writedata_release);
 +			add_credits_and_wake_if(server, credits, 0);
    		break;
    	}
-		nr_pages = 0;
 -		for (i = 0; i < found_pages; i++) {
 -			page = wdata->pages[i];
 -			/*
 -			 * At this point we hold neither mapping->tree_lock nor
 -			 * lock on the page itself: the page may be truncated or
 -			 * invalidated (changing page->mapping to NULL), or even
 -			 * swizzled back from swapper_space to tmpfs file
 -			 * mapping
 -			 */
 -
 -			if (nr_pages == 0)
 -				lock_page(page);
 -			else if (!trylock_page(page))
 -				break;
 -
 -			if (unlikely(page->mapping != mapping)) {
 -				unlock_page(page);
 -				break;
 -			}
 -
 -			if (!wbc->range_cyclic && page->index > end) {
 -				done = true;
 -				unlock_page(page);
 -				break;
 -			}
 -
 -			if (next && (page->index != next)) {
 -				/* Not next consecutive page */
 -				unlock_page(page);
 -				break;
 -			}
 -
 -			if (wbc->sync_mode != WB_SYNC_NONE)
 -				wait_on_page_writeback(page);
 -
 -			if (PageWriteback(page) ||
 -					!clear_page_dirty_for_io(page)) {
 -				unlock_page(page);
 -				break;
 -			}
 -
 -			/*
 -			 * This actually clears the dirty bit in the radix tree.
 -			 * See cifs_writepage() for more commentary.
 -			 */
 -			set_page_writeback(page);
 -
 -			if (page_offset(page) >= i_size_read(mapping->host)) {
 -				done = true;
 -				unlock_page(page);
 -				end_page_writeback(page);
 -				break;
 -			}
 -
 -			wdata->pages[i] = page;
 -			next = page->index + 1;
 -			++nr_pages;
 -		}
 -
 -		/* reset index to refind any pages skipped */
 -		if (nr_pages == 0)
 -			index = wdata->pages[0]->index + 1;
 -
 -		/* put any pages we aren't going to use */
 -		for (i = nr_pages; i < found_pages; i++) {
 -			page_cache_release(wdata->pages[i]);
 -			wdata->pages[i] = NULL;
 -		}
 +		nr_pages = wdata_prepare_pages(wdata, found_pages, mapping, wbc,
 +					       end, &index, &next, &done);
/* nothing to write? */
    	if (nr_pages == 0) {
    		kref_put(&wdata->refcount, cifs_writedata_release);
 +			add_credits_and_wake_if(server, credits, 0);
    		continue;
    	}
-		wdata->sync_mode = wbc->sync_mode;
 -		wdata->nr_pages = nr_pages;
 -		wdata->offset = page_offset(wdata->pages[0]);
 -		wdata->pagesz = PAGE_CACHE_SIZE;
 -		wdata->tailsz =
 -			min(i_size_read(mapping->host) -
 -			    page_offset(wdata->pages[nr_pages - 1]),
 -			    (loff_t)PAGE_CACHE_SIZE);
 -		wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) +
 -					wdata->tailsz;
 +		wdata->credits = credits;
-		do {
 -			if (wdata->cfile != NULL)
 -				cifsFileInfo_put(wdata->cfile);
 -			wdata->cfile = find_writable_file(CIFS_I(mapping->host),
 -							  false);
 -			if (!wdata->cfile) {
 -				cifs_dbg(VFS, "No writable handles for inode\n");
 -				rc = -EBADF;
 -				break;
 -			}
 -			wdata->pid = wdata->cfile->pid;
 -			server = tlink_tcon(wdata->cfile->tlink)->ses->server;
 -			rc = server->ops->async_writev(wdata,
 -							cifs_writedata_release);
 -		} while (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN);
 +		rc = wdata_send_pages(wdata, nr_pages, mapping, wbc);
 +		if (rc)
 +			add_credits_and_wake_if(server, wdata->credits, 0);
-		for (i = 0; i < nr_pages; ++i)
 -			unlock_page(wdata->pages[i]);
 +		kref_put(&wdata->refcount, cifs_writedata_release);
-		/* send failure -- clean up the mess */
 -		if (rc != 0) {
 -			for (i = 0; i < nr_pages; ++i) {
 -				if (rc == -EAGAIN)
 -					redirty_page_for_writepage(wbc,
 -							   wdata->pages[i]);
 -				else
 -					SetPageError(wdata->pages[i]);
 -				end_page_writeback(wdata->pages[i]);
 -				page_cache_release(wdata->pages[i]);
 -			}
 -			if (rc != -EAGAIN)
 -				mapping_set_error(mapping, rc);
 +		if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN) {
 +			index = saved_index;
 +			continue;
    	}
 -		kref_put(&wdata->refcount, cifs_writedata_release);
wbc->nr_to_write -= nr_pages;
    	if (wbc->nr_to_write <= 0)
@@@ -2415,106 -2362,125 +2415,106 @@@ cifs_uncached_writev_complete(struct wo
    kref_put(&wdata->refcount, cifs_uncached_writedata_release);
  }
-/* attempt to send write to server, retry on any -EAGAIN errors */
  static int
 -cifs_uncached_retry_writev(struct cifs_writedata *wdata)
 +wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from,
 +		      size_t *len, unsigned long nr_pages)
  {
 -	int rc;
 -	struct TCP_Server_Info *server;
 +	int rc = 0;
 +	size_t save_len, copied, bytes, cur_len = *len;
 +	unsigned long i;
-	server = tlink_tcon(wdata->cfile->tlink)->ses->server;
 +	save_len = cur_len;
 +	for (i = 0; i < nr_pages; i++) {
 +		bytes = min_t(const size_t, cur_len, PAGE_SIZE);
 +		copied = copy_page_from_iter(wdata->pages[i], 0, bytes, from);
 +		cur_len -= copied;
 +		/*
 +		 * If we didn't copy as much as we expected, then that
 +		 * may mean we trod into an unmapped area. Stop copying
 +		 * at that point. On the next pass through the big
 +		 * loop, we'll likely end up getting a zero-length
 +		 * write and bailing out of it.
 +		 */
 +		if (copied < bytes)
 +			break;
 +	}
 +	cur_len = save_len - cur_len;
 +	*len = cur_len;
-	do {
 -		if (wdata->cfile->invalidHandle) {
 -			rc = cifs_reopen_file(wdata->cfile, false);
 -			if (rc != 0)
 -				continue;
 -		}
 -		rc = server->ops->async_writev(wdata,
 -					       cifs_uncached_writedata_release);
 -	} while (rc == -EAGAIN);
 +	/*
 +	 * If we have no data to send, then that probably means that
 +	 * the copy above failed altogether. That's most likely because
 +	 * the address in the iovec was bogus. Return -EFAULT and let
 +	 * the caller free anything we allocated and bail out.
 +	 */
 +	if (!cur_len)
 +		return -EFAULT;
+	/*
 +	 * i + 1 now represents the number of pages we actually used in
 +	 * the copy phase above. Bring nr_pages down to that, and free
 +	 * any pages that we didn't use.
 +	 */
 +	for ( ; nr_pages > i + 1; nr_pages--)
 +		put_page(wdata->pages[nr_pages - 1]);
    return rc;
  }
-static ssize_t
 -cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset)
 +static int
 +cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
 +		     struct cifsFileInfo *open_file,
 +		     struct cifs_sb_info *cifs_sb, struct list_head *wdata_list)
  {
 +	int rc = 0;
 +	size_t cur_len;
    unsigned long nr_pages, i;
 -	size_t bytes, copied, len, cur_len;
 -	ssize_t total_written = 0;
 -	loff_t offset;
 -	struct cifsFileInfo *open_file;
 -	struct cifs_tcon *tcon;
 -	struct cifs_sb_info *cifs_sb;
 -	struct cifs_writedata *wdata, *tmp;
 -	struct list_head wdata_list;
 -	int rc;
 +	struct cifs_writedata *wdata;
 +	struct iov_iter saved_from;
 +	loff_t saved_offset = offset;
    pid_t pid;
 -
 -	len = iov_iter_count(from);
 -	rc = generic_write_checks(file, poffset, &len, 0);
 -	if (rc)
 -		return rc;
 -
 -	if (!len)
 -		return 0;
 -
 -	iov_iter_truncate(from, len);
 -
 -	INIT_LIST_HEAD(&wdata_list);
 -	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 -	open_file = file->private_data;
 -	tcon = tlink_tcon(open_file->tlink);
 -
 -	if (!tcon->ses->server->ops->async_writev)
 -		return -ENOSYS;
 -
 -	offset = *poffset;
 +	struct TCP_Server_Info *server;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
    	pid = open_file->pid;
    else
    	pid = current->tgid;
+	server = tlink_tcon(open_file->tlink)->ses->server;
 +	memcpy(&saved_from, from, sizeof(struct iov_iter));
 +
    do {
 -		size_t save_len;
 +		unsigned int wsize, credits;
 +
 +		rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize,
 +						   &wsize, &credits);
 +		if (rc)
 +			break;
-		nr_pages = get_numpages(cifs_sb->wsize, len, &cur_len);
 +		nr_pages = get_numpages(wsize, len, &cur_len);
    	wdata = cifs_writedata_alloc(nr_pages,
    				     cifs_uncached_writev_complete);
    	if (!wdata) {
    		rc = -ENOMEM;
 +			add_credits_and_wake_if(server, credits, 0);
    		break;
    	}
rc = cifs_write_allocate_pages(wdata->pages, nr_pages);
    	if (rc) {
    		kfree(wdata);
 +			add_credits_and_wake_if(server, credits, 0);
    		break;
    	}
-		save_len = cur_len;
 -		for (i = 0; i < nr_pages; i++) {
 -			bytes = min_t(size_t, cur_len, PAGE_SIZE);
 -			copied = copy_page_from_iter(wdata->pages[i], 0, bytes,
 -						     from);
 -			cur_len -= copied;
 -			/*
 -			 * If we didn't copy as much as we expected, then that
 -			 * may mean we trod into an unmapped area. Stop copying
 -			 * at that point. On the next pass through the big
 -			 * loop, we'll likely end up getting a zero-length
 -			 * write and bailing out of it.
 -			 */
 -			if (copied < bytes)
 -				break;
 -		}
 -		cur_len = save_len - cur_len;
 -
 -		/*
 -		 * If we have no data to send, then that probably means that
 -		 * the copy above failed altogether. That's most likely because
 -		 * the address in the iovec was bogus. Set the rc to -EFAULT,
 -		 * free anything we allocated and bail out.
 -		 */
 -		if (!cur_len) {
 +		rc = wdata_fill_from_iovec(wdata, from, &cur_len, nr_pages);
 +		if (rc) {
    		for (i = 0; i < nr_pages; i++)
    			put_page(wdata->pages[i]);
    		kfree(wdata);
 -			rc = -EFAULT;
 +			add_credits_and_wake_if(server, credits, 0);
    		break;
    	}
-		/*
 -		 * i + 1 now represents the number of pages we actually used in
 -		 * the copy phase above. Bring nr_pages down to that, and free
 -		 * any pages that we didn't use.
 -		 */
 -		for ( ; nr_pages > i + 1; nr_pages--)
 -			put_page(wdata->pages[nr_pages - 1]);
 -
    	wdata->sync_mode = WB_SYNC_ALL;
    	wdata->nr_pages = nr_pages;
    	wdata->offset = (__u64)offset;
@@@ -2523,71 -2489,18 +2523,71 @@@
    	wdata->bytes = cur_len;
    	wdata->pagesz = PAGE_SIZE;
    	wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
 -		rc = cifs_uncached_retry_writev(wdata);
 +		wdata->credits = credits;
 +
 +		if (!wdata->cfile->invalidHandle ||
 +		    !cifs_reopen_file(wdata->cfile, false))
 +			rc = server->ops->async_writev(wdata,
 +					cifs_uncached_writedata_release);
    	if (rc) {
 +			add_credits_and_wake_if(server, wdata->credits, 0);
    		kref_put(&wdata->refcount,
    			 cifs_uncached_writedata_release);
 +			if (rc == -EAGAIN) {
 +				memcpy(from, &saved_from,
 +				       sizeof(struct iov_iter));
 +				iov_iter_advance(from, offset - saved_offset);
 +				continue;
 +			}
    		break;
    	}
-		list_add_tail(&wdata->list, &wdata_list);
 +		list_add_tail(&wdata->list, wdata_list);
    	offset += cur_len;
    	len -= cur_len;
    } while (len > 0);
+	return rc;
 +}
 +
 +static ssize_t
 +cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset)
 +{
 +	size_t len;
 +	ssize_t total_written = 0;
 +	loff_t offset;
 +	struct cifsFileInfo *open_file;
 +	struct cifs_tcon *tcon;
 +	struct cifs_sb_info *cifs_sb;
 +	struct cifs_writedata *wdata, *tmp;
 +	struct list_head wdata_list;
 +	struct iov_iter saved_from;
 +	int rc;
 +
 +	len = iov_iter_count(from);
 +	rc = generic_write_checks(file, poffset, &len, 0);
 +	if (rc)
 +		return rc;
 +
 +	if (!len)
 +		return 0;
 +
 +	iov_iter_truncate(from, len);
 +
 +	INIT_LIST_HEAD(&wdata_list);
 +	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 +	open_file = file->private_data;
 +	tcon = tlink_tcon(open_file->tlink);
 +
 +	if (!tcon->ses->server->ops->async_writev)
 +		return -ENOSYS;
 +
 +	offset = *poffset;
 +	memcpy(&saved_from, from, sizeof(struct iov_iter));
 +
 +	rc = cifs_write_from_iter(*poffset, len, from, open_file, cifs_sb,
 +				  &wdata_list);
 +
    /*
     * If at least one write was successfully sent, then discard any rc
     * value from the later writes. If the other write succeeds, then
@@@ -2616,25 -2529,7 +2616,25 @@@ restart_loop
/* resend call if it's a retryable error */
    		if (rc == -EAGAIN) {
 -				rc = cifs_uncached_retry_writev(wdata);
 +				struct list_head tmp_list;
 +				struct iov_iter tmp_from;
 +
 +				INIT_LIST_HEAD(&tmp_list);
 +				list_del_init(&wdata->list);
 +
 +				memcpy(&tmp_from, &saved_from,
 +				       sizeof(struct iov_iter));
 +				iov_iter_advance(&tmp_from,
 +						 wdata->offset - *poffset);
 +
 +				rc = cifs_write_from_iter(wdata->offset,
 +						wdata->bytes, &tmp_from,
 +						open_file, cifs_sb, &tmp_list);
 +
 +				list_splice(&tmp_list, &wdata_list);
 +
 +				kref_put(&wdata->refcount,
 +					 cifs_uncached_writedata_release);
    			goto restart_loop;
    		}
    	}
@@@ -2827,6 -2722,26 +2827,6 @@@ cifs_uncached_readdata_release(struct k
    cifs_readdata_release(refcount);
  }
-static int
 -cifs_retry_async_readv(struct cifs_readdata *rdata)
 -{
 -	int rc;
 -	struct TCP_Server_Info *server;
 -
 -	server = tlink_tcon(rdata->cfile->tlink)->ses->server;
 -
 -	do {
 -		if (rdata->cfile->invalidHandle) {
 -			rc = cifs_reopen_file(rdata->cfile, true);
 -			if (rc != 0)
 -				continue;
 -		}
 -		rc = server->ops->async_readv(rdata);
 -	} while (rc == -EAGAIN);
 -
 -	return rc;
 -}
 -
  /**
   * cifs_readdata_to_iov - copy data from pages in response to an iovec
   * @rdata:	the readdata response with list of pages holding data
@@@ -2839,7 -2754,7 +2839,7 @@@
  static int
  cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter)
  {
 -	size_t remaining = rdata->bytes;
 +	size_t remaining = rdata->got_bytes;
    unsigned int i;
for (i = 0; i < rdata->nr_pages; i++) {
@@@ -2867,12 -2782,11 +2867,12 @@@ static in
  cifs_uncached_read_into_pages(struct TCP_Server_Info *server,
    		struct cifs_readdata *rdata, unsigned int len)
  {
 -	int total_read = 0, result = 0;
 +	int result = 0;
    unsigned int i;
    unsigned int nr_pages = rdata->nr_pages;
    struct kvec iov;
+	rdata->got_bytes = 0;
    rdata->tailsz = PAGE_SIZE;
    for (i = 0; i < nr_pages; i++) {
    	struct page *page = rdata->pages[i];
@@@ -2906,45 -2820,55 +2906,45 @@@
    	if (result < 0)
    		break;
-		total_read += result;
 +		rdata->got_bytes += result;
    }
-	return total_read > 0 ? total_read : result;
 +	return rdata->got_bytes > 0 && result != -ECONNABORTED ?
 +						rdata->got_bytes : result;
  }
-ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
 +static int
 +cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
 +		     struct cifs_sb_info *cifs_sb, struct list_head *rdata_list)
  {
 -	struct file *file = iocb->ki_filp;
 -	ssize_t rc;
 -	size_t len, cur_len;
 -	ssize_t total_read = 0;
 -	loff_t offset = iocb->ki_pos;
 -	unsigned int npages;
 -	struct cifs_sb_info *cifs_sb;
 -	struct cifs_tcon *tcon;
 -	struct cifsFileInfo *open_file;
 -	struct cifs_readdata *rdata, *tmp;
 -	struct list_head rdata_list;
 +	struct cifs_readdata *rdata;
 +	unsigned int npages, rsize, credits;
 +	size_t cur_len;
 +	int rc;
    pid_t pid;
 +	struct TCP_Server_Info *server;
-	len = iov_iter_count(to);
 -	if (!len)
 -		return 0;
 -
 -	INIT_LIST_HEAD(&rdata_list);
 -	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 -	open_file = file->private_data;
 -	tcon = tlink_tcon(open_file->tlink);
 -
 -	if (!tcon->ses->server->ops->async_readv)
 -		return -ENOSYS;
 +	server = tlink_tcon(open_file->tlink)->ses->server;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
    	pid = open_file->pid;
    else
    	pid = current->tgid;
-	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
 -		cifs_dbg(FYI, "attempting read on write only file instance\n");
 -
    do {
 -		cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize);
 +		rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
 +						   &rsize, &credits);
 +		if (rc)
 +			break;
 +
 +		cur_len = min_t(const size_t, len, rsize);
    	npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
/* allocate a readdata struct */
    	rdata = cifs_readdata_alloc(npages,
    				    cifs_uncached_readv_complete);
    	if (!rdata) {
 +			add_credits_and_wake_if(server, credits, 0);
    		rc = -ENOMEM;
    		break;
    	}
@@@ -2960,113 -2884,44 +2960,113 @@@
    	rdata->pid = pid;
    	rdata->pagesz = PAGE_SIZE;
    	rdata->read_into_pages = cifs_uncached_read_into_pages;
 +		rdata->credits = credits;
-		rc = cifs_retry_async_readv(rdata);
 +		if (!rdata->cfile->invalidHandle ||
 +		    !cifs_reopen_file(rdata->cfile, true))
 +			rc = server->ops->async_readv(rdata);
  error:
    	if (rc) {
 +			add_credits_and_wake_if(server, rdata->credits, 0);
    		kref_put(&rdata->refcount,
    			 cifs_uncached_readdata_release);
 +			if (rc == -EAGAIN)
 +				continue;
    		break;
    	}
-		list_add_tail(&rdata->list, &rdata_list);
 +		list_add_tail(&rdata->list, rdata_list);
    	offset += cur_len;
    	len -= cur_len;
    } while (len > 0);
+	return rc;
 +}
 +
 +ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
 +{
 +	struct file *file = iocb->ki_filp;
 +	ssize_t rc;
 +	size_t len;
 +	ssize_t total_read = 0;
 +	loff_t offset = iocb->ki_pos;
 +	struct cifs_sb_info *cifs_sb;
 +	struct cifs_tcon *tcon;
 +	struct cifsFileInfo *open_file;
 +	struct cifs_readdata *rdata, *tmp;
 +	struct list_head rdata_list;
 +
 +	len = iov_iter_count(to);
 +	if (!len)
 +		return 0;
 +
 +	INIT_LIST_HEAD(&rdata_list);
 +	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 +	open_file = file->private_data;
 +	tcon = tlink_tcon(open_file->tlink);
 +
 +	if (!tcon->ses->server->ops->async_readv)
 +		return -ENOSYS;
 +
 +	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
 +		cifs_dbg(FYI, "attempting read on write only file instance\n");
 +
 +	rc = cifs_send_async_read(offset, len, open_file, cifs_sb, &rdata_list);
 +
    /* if at least one read request send succeeded, then reset rc */
    if (!list_empty(&rdata_list))
    	rc = 0;
len = iov_iter_count(to);
    /* the loop below should proceed in the order of increasing offsets */
 +again:
    list_for_each_entry_safe(rdata, tmp, &rdata_list, list) {
 -	again:
    	if (!rc) {
    		/* FIXME: freezable sleep too? */
    		rc = wait_for_completion_killable(&rdata->done);
    		if (rc)
    			rc = -EINTR;
 -			else if (rdata->result) {
 -				rc = rdata->result;
 +			else if (rdata->result == -EAGAIN) {
    			/* resend call if it's a retryable error */
 -				if (rc == -EAGAIN) {
 -					rc = cifs_retry_async_readv(rdata);
 -					goto again;
 +				struct list_head tmp_list;
 +				unsigned int got_bytes = rdata->got_bytes;
 +
 +				list_del_init(&rdata->list);
 +				INIT_LIST_HEAD(&tmp_list);
 +
 +				/*
 +				 * Got a part of data and then reconnect has
 +				 * happened -- fill the buffer and continue
 +				 * reading.
 +				 */
 +				if (got_bytes && got_bytes < rdata->bytes) {
 +					rc = cifs_readdata_to_iov(rdata, to);
 +					if (rc) {
 +						kref_put(&rdata->refcount,
 +						cifs_uncached_readdata_release);
 +						continue;
 +					}
    			}
 -			} else {
 +
 +				rc = cifs_send_async_read(
 +						rdata->offset + got_bytes,
 +						rdata->bytes - got_bytes,
 +						rdata->cfile, cifs_sb,
 +						&tmp_list);
 +
 +				list_splice(&tmp_list, &rdata_list);
 +
 +				kref_put(&rdata->refcount,
 +					 cifs_uncached_readdata_release);
 +				goto again;
 +			} else if (rdata->result)
 +				rc = rdata->result;
 +			else
    			rc = cifs_readdata_to_iov(rdata, to);
 -			}
+			/* if there was a short read -- discard anything left */
 +			if (rdata->got_bytes && rdata->got_bytes < rdata->bytes)
 +				rc = -ENODATA;
    	}
    	list_del_init(&rdata->list);
    	kref_put(&rdata->refcount, cifs_uncached_readdata_release);
@@@ -3175,19 -3030,18 +3175,19 @@@ cifs_read(struct file *file, char *read
for (total_read = 0, cur_offset = read_data; read_size > total_read;
         total_read += bytes_read, cur_offset += bytes_read) {
 -		current_read_size = min_t(uint, read_size - total_read, rsize);
 -		/*
 -		 * For windows me and 9x we do not want to request more than it
 -		 * negotiated since it will refuse the read then.
 -		 */
 -		if ((tcon->ses) && !(tcon->ses->capabilities &
 +		do {
 +			current_read_size = min_t(uint, read_size - total_read,
 +						  rsize);
 +			/*
 +			 * For windows me and 9x we do not want to request more
 +			 * than it negotiated since it will refuse the read
 +			 * then.
 +			 */
 +			if ((tcon->ses) && !(tcon->ses->capabilities &
    			tcon->ses->server->vals->cap_large_files)) {
 -			current_read_size = min_t(uint, current_read_size,
 -					CIFSMaxBufSize);
 -		}
 -		rc = -EAGAIN;
 -		while (rc == -EAGAIN) {
 +				current_read_size = min_t(uint,
 +					current_read_size, CIFSMaxBufSize);
 +			}
    		if (open_file->invalidHandle) {
    			rc = cifs_reopen_file(open_file, true);
    			if (rc != 0)
@@@ -3200,8 -3054,7 +3200,8 @@@
    		rc = server->ops->sync_read(xid, open_file, &io_parms,
    					    &bytes_read, &cur_offset,
    					    &buf_type);
 -		}
 +		} while (rc == -EAGAIN);
 +
    	if (rc || (bytes_read == 0)) {
    		if (total_read) {
    			break;
@@@ -3280,30 -3133,25 +3280,30 @@@ int cifs_file_mmap(struct file *file, s
  static void
  cifs_readv_complete(struct work_struct *work)
  {
 -	unsigned int i;
 +	unsigned int i, got_bytes;
    struct cifs_readdata *rdata = container_of(work,
    					struct cifs_readdata, work);
+	got_bytes = rdata->got_bytes;
    for (i = 0; i < rdata->nr_pages; i++) {
    	struct page *page = rdata->pages[i];
lru_cache_add_file(page);
-		if (rdata->result == 0) {
 +		if (rdata->result == 0 ||
 +		    (rdata->result == -EAGAIN && got_bytes)) {
    		flush_dcache_page(page);
    		SetPageUptodate(page);
    	}
unlock_page(page);
-		if (rdata->result == 0)
 +		if (rdata->result == 0 ||
 +		    (rdata->result == -EAGAIN && got_bytes))
    		cifs_readpage_to_fscache(rdata->mapping->host, page);
+		got_bytes -= min_t(unsigned int, PAGE_CACHE_SIZE, got_bytes);
 +
    	page_cache_release(page);
    	rdata->pages[i] = NULL;
    }
@@@ -3314,7 -3162,7 +3314,7 @@@ static in
  cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
    		struct cifs_readdata *rdata, unsigned int len)
  {
 -	int total_read = 0, result = 0;
 +	int result = 0;
    unsigned int i;
    u64 eof;
    pgoff_t eof_index;
@@@ -3326,7 -3174,6 +3326,7 @@@
    eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0;
    cifs_dbg(FYI, "eof=%llu eof_index=%lu\n", eof, eof_index);
+	rdata->got_bytes = 0;
    rdata->tailsz = PAGE_CACHE_SIZE;
    for (i = 0; i < nr_pages; i++) {
    	struct page *page = rdata->pages[i];
@@@ -3381,70 -3228,10 +3381,70 @@@
    	if (result < 0)
    		break;
-		total_read += result;
 +		rdata->got_bytes += result;
 +	}
 +
 +	return rdata->got_bytes > 0 && result != -ECONNABORTED ?
 +						rdata->got_bytes : result;
 +}
 +
 +static int
 +readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
 +		    unsigned int rsize, struct list_head *tmplist,
 +		    unsigned int *nr_pages, loff_t *offset, unsigned int *bytes)
 +{
 +	struct page *page, *tpage;
 +	unsigned int expected_index;
 +	int rc;
 +
 +	INIT_LIST_HEAD(tmplist);
 +
 +	page = list_entry(page_list->prev, struct page, lru);
 +
 +	/*
 +	 * Lock the page and put it in the cache. Since no one else
 +	 * should have access to this page, we're safe to simply set
 +	 * PG_locked without checking it first.
 +	 */
 +	__set_page_locked(page);
 +	rc = add_to_page_cache_locked(page, mapping,
 +				      page->index, GFP_KERNEL);
 +
 +	/* give up if we can't stick it in the cache */
 +	if (rc) {
 +		__clear_page_locked(page);
 +		return rc;
    }
-	return total_read > 0 ? total_read : result;
 +	/* move first page to the tmplist */
 +	*offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
 +	*bytes = PAGE_CACHE_SIZE;
 +	*nr_pages = 1;
 +	list_move_tail(&page->lru, tmplist);
 +
 +	/* now try and add more pages onto the request */
 +	expected_index = page->index + 1;
 +	list_for_each_entry_safe_reverse(page, tpage, page_list, lru) {
 +		/* discontinuity ? */
 +		if (page->index != expected_index)
 +			break;
 +
 +		/* would this page push the read over the rsize? */
 +		if (*bytes + PAGE_CACHE_SIZE > rsize)
 +			break;
 +
 +		__set_page_locked(page);
 +		if (add_to_page_cache_locked(page, mapping, page->index,
 +								GFP_KERNEL)) {
 +			__clear_page_locked(page);
 +			break;
 +		}
 +		list_move_tail(&page->lru, tmplist);
 +		(*bytes) += PAGE_CACHE_SIZE;
 +		expected_index++;
 +		(*nr_pages)++;
 +	}
 +	return rc;
  }
static int cifs_readpages(struct file *file, struct address_space *mapping,
@@@ -3454,10 -3241,19 +3454,10 @@@
    struct list_head tmplist;
    struct cifsFileInfo *open_file = file->private_data;
    struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 -	unsigned int rsize = cifs_sb->rsize;
 +	struct TCP_Server_Info *server;
    pid_t pid;
/*
 -	 * Give up immediately if rsize is too small to read an entire page.
 -	 * The VFS will fall back to readpage. We should never reach this
 -	 * point however since we set ra_pages to 0 when the rsize is smaller
 -	 * than a cache page.
 -	 */
 -	if (unlikely(rsize < PAGE_CACHE_SIZE))
 -		return 0;
 -
 -	/*
     * Reads as many pages as possible from fscache. Returns -ENOBUFS
     * immediately if the cookie is negative
     *
@@@ -3475,7 -3271,7 +3475,7 @@@
    	pid = current->tgid;
rc = 0;
 -	INIT_LIST_HEAD(&tmplist);
 +	server = tlink_tcon(open_file->tlink)->ses->server;
cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n",
    	 __func__, file, mapping, num_pages);
@@@ -3492,35 -3288,58 +3492,35 @@@
     * the rdata->pages, then we want them in increasing order.
     */
    while (!list_empty(page_list)) {
 -		unsigned int i;
 -		unsigned int bytes = PAGE_CACHE_SIZE;
 -		unsigned int expected_index;
 -		unsigned int nr_pages = 1;
 +		unsigned int i, nr_pages, bytes, rsize;
    	loff_t offset;
    	struct page *page, *tpage;
    	struct cifs_readdata *rdata;
 +		unsigned credits;
-		page = list_entry(page_list->prev, struct page, lru);
 +		rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
 +						   &rsize, &credits);
 +		if (rc)
 +			break;
/*
 -		 * Lock the page and put it in the cache. Since no one else
 -		 * should have access to this page, we're safe to simply set
 -		 * PG_locked without checking it first.
 +		 * Give up immediately if rsize is too small to read an entire
 +		 * page. The VFS will fall back to readpage. We should never
 +		 * reach this point however since we set ra_pages to 0 when the
 +		 * rsize is smaller than a cache page.
    	 */
 -		__set_page_locked(page);
 -		rc = add_to_page_cache_locked(page, mapping,
 -					      page->index, GFP_KERNEL);
 +		if (unlikely(rsize < PAGE_CACHE_SIZE)) {
 +			add_credits_and_wake_if(server, credits, 0);
 +			return 0;
 +		}
-		/* give up if we can't stick it in the cache */
 +		rc = readpages_get_pages(mapping, page_list, rsize, &tmplist,
 +					 &nr_pages, &offset, &bytes);
    	if (rc) {
 -			__clear_page_locked(page);
 +			add_credits_and_wake_if(server, credits, 0);
    		break;
    	}
-		/* move first page to the tmplist */
 -		offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
 -		list_move_tail(&page->lru, &tmplist);
 -
 -		/* now try and add more pages onto the request */
 -		expected_index = page->index + 1;
 -		list_for_each_entry_safe_reverse(page, tpage, page_list, lru) {
 -			/* discontinuity ? */
 -			if (page->index != expected_index)
 -				break;
 -
 -			/* would this page push the read over the rsize? */
 -			if (bytes + PAGE_CACHE_SIZE > rsize)
 -				break;
 -
 -			__set_page_locked(page);
 -			if (add_to_page_cache_locked(page, mapping,
 -						page->index, GFP_KERNEL)) {
 -				__clear_page_locked(page);
 -				break;
 -			}
 -			list_move_tail(&page->lru, &tmplist);
 -			bytes += PAGE_CACHE_SIZE;
 -			expected_index++;
 -			nr_pages++;
 -		}
 -
    	rdata = cifs_readdata_alloc(nr_pages, cifs_readv_complete);
    	if (!rdata) {
    		/* best to give up if we're out of mem */
@@@ -3531,7 -3350,6 +3531,7 @@@
    			page_cache_release(page);
    		}
    		rc = -ENOMEM;
 +			add_credits_and_wake_if(server, credits, 0);
    		break;
    	}
@@@ -3542,32 -3360,21 +3542,32 @@@
    	rdata->pid = pid;
    	rdata->pagesz = PAGE_CACHE_SIZE;
    	rdata->read_into_pages = cifs_readpages_read_into_pages;
 +		rdata->credits = credits;
list_for_each_entry_safe(page, tpage, &tmplist, lru) {
    		list_del(&page->lru);
    		rdata->pages[rdata->nr_pages++] = page;
    	}
-		rc = cifs_retry_async_readv(rdata);
 -		if (rc != 0) {
 +		if (!rdata->cfile->invalidHandle ||
 +		    !cifs_reopen_file(rdata->cfile, true))
 +			rc = server->ops->async_readv(rdata);
 +		if (rc) {
 +			add_credits_and_wake_if(server, rdata->credits, 0);
    		for (i = 0; i < rdata->nr_pages; i++) {
    			page = rdata->pages[i];
    			lru_cache_add_file(page);
    			unlock_page(page);
    			page_cache_release(page);
 +				if (rc == -EAGAIN)
 +					list_add_tail(&page->lru, &tmplist);
    		}
    		kref_put(&rdata->refcount, cifs_readdata_release);
 +			if (rc == -EAGAIN) {
 +				/* Re-add pages to the page_list and retry */
 +				list_splice(&tmplist, page_list);
 +				continue;
 +			}
    		break;
    	}
@@@ -3811,6 -3618,13 +3811,6 @@@ static int cifs_launder_page(struct pag
    return rc;
  }
-static int
 -cifs_pending_writers_wait(void *unused)
 -{
 -	schedule();
 -	return 0;
 -}
 -
  void cifs_oplock_break(struct work_struct *work)
  {
    struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
@@@ -3822,7 -3636,7 +3822,7 @@@
    int rc = 0;
wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS,
 -			cifs_pending_writers_wait, TASK_UNINTERRUPTIBLE);
 +			TASK_UNINTERRUPTIBLE);
server->ops->downgrade_oplock(server, cinode,
    	test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags));
diff --combined fs/cifs/sess.c
index 39ee326,27e6175..39b8507
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@@ -46,7 -46,7 +46,7 @@@ static __u32 cifs_ssetup_hdr(struct cif
    				CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4,
    				USHRT_MAX));
    pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
- 	pSMB->req.VcNumber = __constant_cpu_to_le16(1);
+ 	pSMB->req.VcNumber = cpu_to_le16(1);
/* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
@@@ -520,559 -520,382 +520,559 @@@ select_sectype(struct TCP_Server_Info *
    }
  }
-int
 -CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
 -	       const struct nls_table *nls_cp)
 +struct sess_data {
 +	unsigned int xid;
 +	struct cifs_ses *ses;
 +	struct nls_table *nls_cp;
 +	void (*func)(struct sess_data *);
 +	int result;
 +
 +	/* we will send the SMB in three pieces:
 +	 * a fixed length beginning part, an optional
 +	 * SPNEGO blob (which can be zero length), and a
 +	 * last part which will include the strings
 +	 * and rest of bcc area. This allows us to avoid
 +	 * a large buffer 17K allocation
 +	 */
 +	int buf0_type;
 +	struct kvec iov[3];
 +};
 +
 +static int
 +sess_alloc_buffer(struct sess_data *sess_data, int wct)
  {
 -	int rc = 0;
 -	int wct;
 +	int rc;
 +	struct cifs_ses *ses = sess_data->ses;
    struct smb_hdr *smb_buf;
 -	char *bcc_ptr;
 -	char *str_area;
 -	SESSION_SETUP_ANDX *pSMB;
 -	__u32 capabilities;
 -	__u16 count;
 -	int resp_buf_type;
 -	struct kvec iov[3];
 -	enum securityEnum type;
 -	__u16 action, bytes_remaining;
 -	struct key *spnego_key = NULL;
 -	__le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
 -	u16 blob_len;
 -	char *ntlmsspblob = NULL;
-	if (ses == NULL) {
 -		WARN(1, "%s: ses == NULL!", __func__);
 -		return -EINVAL;
 -	}
 +	rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses,
 +				  (void **)&smb_buf);
-	type = select_sectype(ses->server, ses->sectype);
 -	cifs_dbg(FYI, "sess setup type %d\n", type);
 -	if (type == Unspecified) {
 -		cifs_dbg(VFS,
 -			"Unable to select appropriate authentication method!");
 -		return -EINVAL;
 +	if (rc)
 +		return rc;
 +
 +	sess_data->iov[0].iov_base = (char *)smb_buf;
 +	sess_data->iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4;
 +	/*
 +	 * This variable will be used to clear the buffer
 +	 * allocated above in case of any error in the calling function.
 +	 */
 +	sess_data->buf0_type = CIFS_SMALL_BUFFER;
 +
 +	/* 2000 big enough to fit max user, domain, NOS name etc. */
 +	sess_data->iov[2].iov_base = kmalloc(2000, GFP_KERNEL);
 +	if (!sess_data->iov[2].iov_base) {
 +		rc = -ENOMEM;
 +		goto out_free_smb_buf;
    }
-	if (type == RawNTLMSSP) {
 -		/* if memory allocation is successful, caller of this function
 -		 * frees it.
 -		 */
 -		ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL);
 -		if (!ses->ntlmssp)
 -			return -ENOMEM;
 -		ses->ntlmssp->sesskey_per_smbsess = false;
 +	return 0;
 +
 +out_free_smb_buf:
 +	kfree(smb_buf);
 +	sess_data->iov[0].iov_base = NULL;
 +	sess_data->iov[0].iov_len = 0;
 +	sess_data->buf0_type = CIFS_NO_BUFFER;
 +	return rc;
 +}
 +
 +static void
 +sess_free_buffer(struct sess_data *sess_data)
 +{
+	free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base);
 +	sess_data->buf0_type = CIFS_NO_BUFFER;
 +	kfree(sess_data->iov[2].iov_base);
 +}
 +
 +static int
 +sess_establish_session(struct sess_data *sess_data)
 +{
 +	struct cifs_ses *ses = sess_data->ses;
 +
 +	mutex_lock(&ses->server->srv_mutex);
 +	if (!ses->server->session_estab) {
 +		if (ses->server->sign) {
 +			ses->server->session_key.response =
 +				kmemdup(ses->auth_key.response,
 +				ses->auth_key.len, GFP_KERNEL);
 +			if (!ses->server->session_key.response) {
 +				mutex_unlock(&ses->server->srv_mutex);
 +				return -ENOMEM;
 +			}
 +			ses->server->session_key.len =
 +						ses->auth_key.len;
 +		}
 +		ses->server->sequence_number = 0x2;
 +		ses->server->session_estab = true;
    }
 +	mutex_unlock(&ses->server->srv_mutex);
-ssetup_ntlmssp_authenticate:
 -	if (phase == NtLmChallenge)
 -		phase = NtLmAuthenticate; /* if ntlmssp, now final phase */
 +	cifs_dbg(FYI, "CIFS session established successfully\n");
 +	spin_lock(&GlobalMid_Lock);
 +	ses->status = CifsGood;
 +	ses->need_reconnect = false;
 +	spin_unlock(&GlobalMid_Lock);
-	if (type == LANMAN) {
 -#ifndef CONFIG_CIFS_WEAK_PW_HASH
 -		/* LANMAN and plaintext are less secure and off by default.
 -		So we make this explicitly be turned on in kconfig (in the
 -		build) and turned on at runtime (changed from the default)
 -		in proc/fs/cifs or via mount parm.  Unfortunately this is
 -		needed for old Win (e.g. Win95), some obscure NAS and OS/2 */
 -		return -EOPNOTSUPP;
 -#endif
 -		wct = 10; /* lanman 2 style sessionsetup */
 -	} else if ((type == NTLM) || (type == NTLMv2)) {
 -		/* For NTLMv2 failures eventually may need to retry NTLM */
 -		wct = 13; /* old style NTLM sessionsetup */
 -	} else /* same size: negotiate or auth, NTLMSSP or extended security */
 -		wct = 12;
 +	return 0;
 +}
-	rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses,
 -			    (void **)&smb_buf);
 -	if (rc)
 -		return rc;
 +static int
 +sess_sendreceive(struct sess_data *sess_data)
 +{
 +	int rc;
 +	struct smb_hdr *smb_buf = (struct smb_hdr *) sess_data->iov[0].iov_base;
 +	__u16 count;
-	pSMB = (SESSION_SETUP_ANDX *)smb_buf;
 +	count = sess_data->iov[1].iov_len + sess_data->iov[2].iov_len;
 +	smb_buf->smb_buf_length =
 +		cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count);
 +	put_bcc(count, smb_buf);
 +
 +	rc = SendReceive2(sess_data->xid, sess_data->ses,
 +			  sess_data->iov, 3 /* num_iovecs */,
 +			  &sess_data->buf0_type,
 +			  CIFS_LOG_ERROR);
 +
 +	return rc;
 +}
+/*
 + * LANMAN and plaintext are less secure and off by default.
 + * So we make this explicitly be turned on in kconfig (in the
 + * build) and turned on at runtime (changed from the default)
 + * in proc/fs/cifs or via mount parm.  Unfortunately this is
 + * needed for old Win (e.g. Win95), some obscure NAS and OS/2
 + */
 +#ifdef CONFIG_CIFS_WEAK_PW_HASH
 +static void
 +sess_auth_lanman(struct sess_data *sess_data)
 +{
 +	int rc = 0;
 +	struct smb_hdr *smb_buf;
 +	SESSION_SETUP_ANDX *pSMB;
 +	char *bcc_ptr;
 +	struct cifs_ses *ses = sess_data->ses;
 +	char lnm_session_key[CIFS_AUTH_RESP_SIZE];
 +	__u32 capabilities;
 +	__u16 bytes_remaining;
 +
 +	/* lanman 2 style sessionsetup */
 +	/* wct = 10 */
 +	rc = sess_alloc_buffer(sess_data, 10);
 +	if (rc)
 +		goto out;
 +
 +	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
 +	bcc_ptr = sess_data->iov[2].iov_base;
    capabilities = cifs_ssetup_hdr(ses, pSMB);
-	/* we will send the SMB in three pieces:
 -	a fixed length beginning part, an optional
 -	SPNEGO blob (which can be zero length), and a
 -	last part which will include the strings
 -	and rest of bcc area. This allows us to avoid
 -	a large buffer 17K allocation */
 -	iov[0].iov_base = (char *)pSMB;
 -	iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4;
 -
 -	/* setting this here allows the code at the end of the function
 -	   to free the request buffer if there's an error */
 -	resp_buf_type = CIFS_SMALL_BUFFER;
 +	pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
-	/* 2000 big enough to fit max user, domain, NOS name etc. */
 -	str_area = kmalloc(2000, GFP_KERNEL);
 -	if (str_area == NULL) {
 -		rc = -ENOMEM;
 -		goto ssetup_exit;
 -	}
 -	bcc_ptr = str_area;
 +	/* no capabilities flags in old lanman negotiation */
 +	pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
-	iov[1].iov_base = NULL;
 -	iov[1].iov_len = 0;
 +	/* Calculate hash with password and copy into bcc_ptr.
 +	 * Encryption Key (stored as in cryptkey) gets used if the
 +	 * security mode bit in Negotiate Protocol response states
 +	 * to use challenge/response method (i.e. Password bit is 1).
 +	 */
 +	rc = calc_lanman_hash(ses->password, ses->server->cryptkey,
 +			      ses->server->sec_mode & SECMODE_PW_ENCRYPT ?
 +			      true : false, lnm_session_key);
-	if (type == LANMAN) {
 -#ifdef CONFIG_CIFS_WEAK_PW_HASH
 -		char lnm_session_key[CIFS_AUTH_RESP_SIZE];
 +	memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
 +	bcc_ptr += CIFS_AUTH_RESP_SIZE;
 +
 +	/*
 +	 * can not sign if LANMAN negotiated so no need
 +	 * to calculate signing key? but what if server
 +	 * changed to do higher than lanman dialect and
 +	 * we reconnected would we ever calc signing_key?
 +	 */
-		pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
 +	cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n");
 +	/* Unicode not allowed for LANMAN dialects */
 +	ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
-		/* no capabilities flags in old lanman negotiation */
 +	sess_data->iov[2].iov_len = (long) bcc_ptr -
 +			(long) sess_data->iov[2].iov_base;
-		pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
 +	rc = sess_sendreceive(sess_data);
 +	if (rc)
 +		goto out;
-		/* Calculate hash with password and copy into bcc_ptr.
 -		 * Encryption Key (stored as in cryptkey) gets used if the
 -		 * security mode bit in Negottiate Protocol response states
 -		 * to use challenge/response method (i.e. Password bit is 1).
 -		 */
 +	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
 +	smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
-		rc = calc_lanman_hash(ses->password, ses->server->cryptkey,
 -				 ses->server->sec_mode & SECMODE_PW_ENCRYPT ?
 -					true : false, lnm_session_key);
 +	/* lanman response has a word count of 3 */
 +	if (smb_buf->WordCount != 3) {
 +		rc = -EIO;
 +		cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
 +		goto out;
 +	}
-		memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
 -		bcc_ptr += CIFS_AUTH_RESP_SIZE;
 +	if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
 +		cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
 +
 +	ses->Suid = smb_buf->Uid;   /* UID left in wire format (le) */
 +	cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
-		/* can not sign if LANMAN negotiated so no need
 -		to calculate signing key? but what if server
 -		changed to do higher than lanman dialect and
 -		we reconnected would we ever calc signing_key? */
 +	bytes_remaining = get_bcc(smb_buf);
 +	bcc_ptr = pByteArea(smb_buf);
-		cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n");
 -		/* Unicode not allowed for LANMAN dialects */
 -		ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
 +	/* BB check if Unicode and decode strings */
 +	if (bytes_remaining == 0) {
 +		/* no string area to decode, do nothing */
 +	} else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
 +		/* unicode string area must be word-aligned */
 +		if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
 +			++bcc_ptr;
 +			--bytes_remaining;
 +		}
 +		decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
 +				      sess_data->nls_cp);
 +	} else {
 +		decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
 +				    sess_data->nls_cp);
 +	}
 +
 +	rc = sess_establish_session(sess_data);
 +out:
 +	sess_data->result = rc;
 +	sess_data->func = NULL;
 +	sess_free_buffer(sess_data);
 +}
 +
 +#else
 +
 +static void
 +sess_auth_lanman(struct sess_data *sess_data)
 +{
 +	sess_data->result = -EOPNOTSUPP;
 +	sess_data->func = NULL;
 +}
  #endif
 -	} else if (type == NTLM) {
 -		pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
 -		pSMB->req_no_secext.CaseInsensitivePasswordLength =
 +
 +static void
 +sess_auth_ntlm(struct sess_data *sess_data)
 +{
 +	int rc = 0;
 +	struct smb_hdr *smb_buf;
 +	SESSION_SETUP_ANDX *pSMB;
 +	char *bcc_ptr;
 +	struct cifs_ses *ses = sess_data->ses;
 +	__u32 capabilities;
 +	__u16 bytes_remaining;
 +
 +	/* old style NTLM sessionsetup */
 +	/* wct = 13 */
 +	rc = sess_alloc_buffer(sess_data, 13);
 +	if (rc)
 +		goto out;
 +
 +	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
 +	bcc_ptr = sess_data->iov[2].iov_base;
 +	capabilities = cifs_ssetup_hdr(ses, pSMB);
 +
 +	pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
 +	pSMB->req_no_secext.CaseInsensitivePasswordLength =
    		cpu_to_le16(CIFS_AUTH_RESP_SIZE);
 -		pSMB->req_no_secext.CaseSensitivePasswordLength =
 +	pSMB->req_no_secext.CaseSensitivePasswordLength =
    		cpu_to_le16(CIFS_AUTH_RESP_SIZE);
-		/* calculate ntlm response and session key */
 -		rc = setup_ntlm_response(ses, nls_cp);
 -		if (rc) {
 -			cifs_dbg(VFS, "Error %d during NTLM authentication\n",
 +	/* calculate ntlm response and session key */
 +	rc = setup_ntlm_response(ses, sess_data->nls_cp);
 +	if (rc) {
 +		cifs_dbg(VFS, "Error %d during NTLM authentication\n",
    			 rc);
 -			goto ssetup_exit;
 -		}
 +		goto out;
 +	}
-		/* copy ntlm response */
 -		memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
 -				CIFS_AUTH_RESP_SIZE);
 -		bcc_ptr += CIFS_AUTH_RESP_SIZE;
 -		memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
 -				CIFS_AUTH_RESP_SIZE);
 -		bcc_ptr += CIFS_AUTH_RESP_SIZE;
 -
 -		if (ses->capabilities & CAP_UNICODE) {
 -			/* unicode strings must be word aligned */
 -			if (iov[0].iov_len % 2) {
 -				*bcc_ptr = 0;
 -				bcc_ptr++;
 -			}
 -			unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
 -		} else
 -			ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
 -	} else if (type == NTLMv2) {
 -		pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
 -
 -		/* LM2 password would be here if we supported it */
 -		pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
 -
 -		/* calculate nlmv2 response and session key */
 -		rc = setup_ntlmv2_rsp(ses, nls_cp);
 -		if (rc) {
 -			cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n",
 -				 rc);
 -			goto ssetup_exit;
 +	/* copy ntlm response */
 +	memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
 +			CIFS_AUTH_RESP_SIZE);
 +	bcc_ptr += CIFS_AUTH_RESP_SIZE;
 +	memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
 +			CIFS_AUTH_RESP_SIZE);
 +	bcc_ptr += CIFS_AUTH_RESP_SIZE;
 +
 +	if (ses->capabilities & CAP_UNICODE) {
 +		/* unicode strings must be word aligned */
 +		if (sess_data->iov[0].iov_len % 2) {
 +			*bcc_ptr = 0;
 +			bcc_ptr++;
    	}
 -		memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
 -				ses->auth_key.len - CIFS_SESS_KEY_SIZE);
 -		bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
 -
 -		/* set case sensitive password length after tilen may get
 -		 * assigned, tilen is 0 otherwise.
 -		 */
 -		pSMB->req_no_secext.CaseSensitivePasswordLength =
 -			cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
 +		unicode_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
 +	} else {
 +		ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
 +	}
-		if (ses->capabilities & CAP_UNICODE) {
 -			if (iov[0].iov_len % 2) {
 -				*bcc_ptr = 0;
 -				bcc_ptr++;
 -			}
 -			unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
 -		} else
 -			ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
 -	} else if (type == Kerberos) {
 -#ifdef CONFIG_CIFS_UPCALL
 -		struct cifs_spnego_msg *msg;
-		spnego_key = cifs_get_spnego_key(ses);
 -		if (IS_ERR(spnego_key)) {
 -			rc = PTR_ERR(spnego_key);
 -			spnego_key = NULL;
 -			goto ssetup_exit;
 -		}
 +	sess_data->iov[2].iov_len = (long) bcc_ptr -
 +			(long) sess_data->iov[2].iov_base;
-		msg = spnego_key->payload.data;
 -		/* check version field to make sure that cifs.upcall is
 -		   sending us a response in an expected form */
 -		if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
 -			cifs_dbg(VFS, "incorrect version of cifs.upcall "
 -				   "expected %d but got %d)",
 -				   CIFS_SPNEGO_UPCALL_VERSION, msg->version);
 -			rc = -EKEYREJECTED;
 -			goto ssetup_exit;
 -		}
 +	rc = sess_sendreceive(sess_data);
 +	if (rc)
 +		goto out;
-		ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len,
 -						 GFP_KERNEL);
 -		if (!ses->auth_key.response) {
 -			cifs_dbg(VFS,
 -				"Kerberos can't allocate (%u bytes) memory",
 -				msg->sesskey_len);
 -			rc = -ENOMEM;
 -			goto ssetup_exit;
 -		}
 -		ses->auth_key.len = msg->sesskey_len;
 -
 -		pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
 -		capabilities |= CAP_EXTENDED_SECURITY;
 -		pSMB->req.Capabilities = cpu_to_le32(capabilities);
 -		iov[1].iov_base = msg->data + msg->sesskey_len;
 -		iov[1].iov_len = msg->secblob_len;
 -		pSMB->req.SecurityBlobLength = cpu_to_le16(iov[1].iov_len);
 -
 -		if (ses->capabilities & CAP_UNICODE) {
 -			/* unicode strings must be word aligned */
 -			if ((iov[0].iov_len + iov[1].iov_len) % 2) {
 -				*bcc_ptr = 0;
 -				bcc_ptr++;
 -			}
 -			unicode_oslm_strings(&bcc_ptr, nls_cp);
 -			unicode_domain_string(&bcc_ptr, ses, nls_cp);
 -		} else
 -		/* BB: is this right? */
 -			ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
 -#else /* ! CONFIG_CIFS_UPCALL */
 -		cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n");
 -		rc = -ENOSYS;
 -		goto ssetup_exit;
 -#endif /* CONFIG_CIFS_UPCALL */
 -	} else if (type == RawNTLMSSP) {
 -		if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
 -			cifs_dbg(VFS, "NTLMSSP requires Unicode support\n");
 -			rc = -ENOSYS;
 -			goto ssetup_exit;
 -		}
 +	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
 +	smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
-		cifs_dbg(FYI, "ntlmssp session setup phase %d\n", phase);
 -		pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
 -		capabilities |= CAP_EXTENDED_SECURITY;
 -		pSMB->req.Capabilities |= cpu_to_le32(capabilities);
 -		switch(phase) {
 -		case NtLmNegotiate:
 -			build_ntlmssp_negotiate_blob(
 -				pSMB->req.SecurityBlob, ses);
 -			iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
 -			iov[1].iov_base = pSMB->req.SecurityBlob;
 -			pSMB->req.SecurityBlobLength =
 -				cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
 -			break;
 -		case NtLmAuthenticate:
 -			/*
 -			 * 5 is an empirical value, large enough to hold
 -			 * authenticate message plus max 10 of av paris,
 -			 * domain, user, workstation names, flags, etc.
 -			 */
 -			ntlmsspblob = kzalloc(
 -				5*sizeof(struct _AUTHENTICATE_MESSAGE),
 -				GFP_KERNEL);
 -			if (!ntlmsspblob) {
 -				rc = -ENOMEM;
 -				goto ssetup_exit;
 -			}
 +	if (smb_buf->WordCount != 3) {
 +		rc = -EIO;
 +		cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
 +		goto out;
 +	}
-			rc = build_ntlmssp_auth_blob(ntlmsspblob,
 -						&blob_len, ses, nls_cp);
 -			if (rc)
 -				goto ssetup_exit;
 -			iov[1].iov_len = blob_len;
 -			iov[1].iov_base = ntlmsspblob;
 -			pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len);
 -			/*
 -			 * Make sure that we tell the server that we are using
 -			 * the uid that it just gave us back on the response
 -			 * (challenge)
 -			 */
 -			smb_buf->Uid = ses->Suid;
 -			break;
 -		default:
 -			cifs_dbg(VFS, "invalid phase %d\n", phase);
 -			rc = -ENOSYS;
 -			goto ssetup_exit;
 +	if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
 +		cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
 +
 +	ses->Suid = smb_buf->Uid;   /* UID left in wire format (le) */
 +	cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
 +
 +	bytes_remaining = get_bcc(smb_buf);
 +	bcc_ptr = pByteArea(smb_buf);
 +
 +	/* BB check if Unicode and decode strings */
 +	if (bytes_remaining == 0) {
 +		/* no string area to decode, do nothing */
 +	} else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
 +		/* unicode string area must be word-aligned */
 +		if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
 +			++bcc_ptr;
 +			--bytes_remaining;
    	}
 -		/* unicode strings must be word aligned */
 -		if ((iov[0].iov_len + iov[1].iov_len) % 2) {
 +		decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
 +				      sess_data->nls_cp);
 +	} else {
 +		decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
 +				    sess_data->nls_cp);
 +	}
 +
 +	rc = sess_establish_session(sess_data);
 +out:
 +	sess_data->result = rc;
 +	sess_data->func = NULL;
 +	sess_free_buffer(sess_data);
 +	kfree(ses->auth_key.response);
 +	ses->auth_key.response = NULL;
 +}
 +
 +static void
 +sess_auth_ntlmv2(struct sess_data *sess_data)
 +{
 +	int rc = 0;
 +	struct smb_hdr *smb_buf;
 +	SESSION_SETUP_ANDX *pSMB;
 +	char *bcc_ptr;
 +	struct cifs_ses *ses = sess_data->ses;
 +	__u32 capabilities;
 +	__u16 bytes_remaining;
 +
 +	/* old style NTLM sessionsetup */
 +	/* wct = 13 */
 +	rc = sess_alloc_buffer(sess_data, 13);
 +	if (rc)
 +		goto out;
 +
 +	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
 +	bcc_ptr = sess_data->iov[2].iov_base;
 +	capabilities = cifs_ssetup_hdr(ses, pSMB);
 +
 +	pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
 +
 +	/* LM2 password would be here if we supported it */
 +	pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
 +
 +	/* calculate NTLMv2 response and session key */
 +	rc = setup_ntlmv2_rsp(ses, sess_data->nls_cp);
 +	if (rc) {
 +		cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n", rc);
 +		goto out;
 +	}
 +
 +	memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
 +			ses->auth_key.len - CIFS_SESS_KEY_SIZE);
 +	bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
 +
 +	/* set case sensitive password length after tilen may get
 +	 * assigned, tilen is 0 otherwise.
 +	 */
 +	pSMB->req_no_secext.CaseSensitivePasswordLength =
 +		cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
 +
 +	if (ses->capabilities & CAP_UNICODE) {
 +		if (sess_data->iov[0].iov_len % 2) {
    		*bcc_ptr = 0;
    		bcc_ptr++;
    	}
 -		unicode_oslm_strings(&bcc_ptr, nls_cp);
 +		unicode_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
    } else {
 -		cifs_dbg(VFS, "secType %d not supported!\n", type);
 -		rc = -ENOSYS;
 -		goto ssetup_exit;
 +		ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
    }
-	iov[2].iov_base = str_area;
 -	iov[2].iov_len = (long) bcc_ptr - (long) str_area;
-	count = iov[1].iov_len + iov[2].iov_len;
 -	smb_buf->smb_buf_length =
 -		cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count);
 +	sess_data->iov[2].iov_len = (long) bcc_ptr -
 +			(long) sess_data->iov[2].iov_base;
-	put_bcc(count, smb_buf);
 +	rc = sess_sendreceive(sess_data);
 +	if (rc)
 +		goto out;
-	rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type,
 -			  CIFS_LOG_ERROR);
 -	/* SMB request buf freed in SendReceive2 */
 +	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
 +	smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
 +
 +	if (smb_buf->WordCount != 3) {
 +		rc = -EIO;
 +		cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
 +		goto out;
 +	}
 +
 +	if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
 +		cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
 +
 +	ses->Suid = smb_buf->Uid;   /* UID left in wire format (le) */
 +	cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
-	pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
 -	smb_buf = (struct smb_hdr *)iov[0].iov_base;
 +	bytes_remaining = get_bcc(smb_buf);
 +	bcc_ptr = pByteArea(smb_buf);
-	if ((type == RawNTLMSSP) && (resp_buf_type != CIFS_NO_BUFFER) &&
 -	    (smb_buf->Status.CifsError ==
 -			cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) {
 -		if (phase != NtLmNegotiate) {
 -			cifs_dbg(VFS, "Unexpected more processing error\n");
 -			goto ssetup_exit;
 +	/* BB check if Unicode and decode strings */
 +	if (bytes_remaining == 0) {
 +		/* no string area to decode, do nothing */
 +	} else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
 +		/* unicode string area must be word-aligned */
 +		if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
 +			++bcc_ptr;
 +			--bytes_remaining;
    	}
 -		/* NTLMSSP Negotiate sent now processing challenge (response) */
 -		phase = NtLmChallenge; /* process ntlmssp challenge */
 -		rc = 0; /* MORE_PROC rc is not an error here, but expected */
 +		decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
 +				      sess_data->nls_cp);
 +	} else {
 +		decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
 +				    sess_data->nls_cp);
    }
 +
 +	rc = sess_establish_session(sess_data);
 +out:
 +	sess_data->result = rc;
 +	sess_data->func = NULL;
 +	sess_free_buffer(sess_data);
 +	kfree(ses->auth_key.response);
 +	ses->auth_key.response = NULL;
 +}
 +
 +#ifdef CONFIG_CIFS_UPCALL
 +static void
 +sess_auth_kerberos(struct sess_data *sess_data)
 +{
 +	int rc = 0;
 +	struct smb_hdr *smb_buf;
 +	SESSION_SETUP_ANDX *pSMB;
 +	char *bcc_ptr;
 +	struct cifs_ses *ses = sess_data->ses;
 +	__u32 capabilities;
 +	__u16 bytes_remaining;
 +	struct key *spnego_key = NULL;
 +	struct cifs_spnego_msg *msg;
 +	u16 blob_len;
 +
 +	/* extended security */
 +	/* wct = 12 */
 +	rc = sess_alloc_buffer(sess_data, 12);
    if (rc)
 -		goto ssetup_exit;
 +		goto out;
-	if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) {
 +	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
 +	bcc_ptr = sess_data->iov[2].iov_base;
 +	capabilities = cifs_ssetup_hdr(ses, pSMB);
 +
 +	spnego_key = cifs_get_spnego_key(ses);
 +	if (IS_ERR(spnego_key)) {
 +		rc = PTR_ERR(spnego_key);
 +		spnego_key = NULL;
 +		goto out;
 +	}
 +
 +	msg = spnego_key->payload.data;
 +	/*
 +	 * check version field to make sure that cifs.upcall is
 +	 * sending us a response in an expected form
 +	 */
 +	if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
 +		cifs_dbg(VFS,
 +		  "incorrect version of cifs.upcall (expected %d but got %d)",
 +			      CIFS_SPNEGO_UPCALL_VERSION, msg->version);
 +		rc = -EKEYREJECTED;
 +		goto out_put_spnego_key;
 +	}
 +
 +	ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len,
 +					 GFP_KERNEL);
 +	if (!ses->auth_key.response) {
 +		cifs_dbg(VFS, "Kerberos can't allocate (%u bytes) memory",
 +				msg->sesskey_len);
 +		rc = -ENOMEM;
 +		goto out_put_spnego_key;
 +	}
 +	ses->auth_key.len = msg->sesskey_len;
 +
 +	pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
 +	capabilities |= CAP_EXTENDED_SECURITY;
 +	pSMB->req.Capabilities = cpu_to_le32(capabilities);
 +	sess_data->iov[1].iov_base = msg->data + msg->sesskey_len;
 +	sess_data->iov[1].iov_len = msg->secblob_len;
 +	pSMB->req.SecurityBlobLength = cpu_to_le16(sess_data->iov[1].iov_len);
 +
 +	if (ses->capabilities & CAP_UNICODE) {
 +		/* unicode strings must be word aligned */
 +		if ((sess_data->iov[0].iov_len
 +			+ sess_data->iov[1].iov_len) % 2) {
 +			*bcc_ptr = 0;
 +			bcc_ptr++;
 +		}
 +		unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp);
 +		unicode_domain_string(&bcc_ptr, ses, sess_data->nls_cp);
 +	} else {
 +		/* BB: is this right? */
 +		ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
 +	}
 +
 +	sess_data->iov[2].iov_len = (long) bcc_ptr -
 +			(long) sess_data->iov[2].iov_base;
 +
 +	rc = sess_sendreceive(sess_data);
 +	if (rc)
 +		goto out_put_spnego_key;
 +
 +	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
 +	smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
 +
 +	if (smb_buf->WordCount != 4) {
    	rc = -EIO;
    	cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
 -		goto ssetup_exit;
 +		goto out_put_spnego_key;
    }
 -	action = le16_to_cpu(pSMB->resp.Action);
 -	if (action & GUEST_LOGIN)
 +
 +	if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
    	cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
 +
    ses->Suid = smb_buf->Uid;   /* UID left in wire format (le) */
    cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
 -	/* response can have either 3 or 4 word count - Samba sends 3 */
 -	/* and lanman response is 3 */
 +
    bytes_remaining = get_bcc(smb_buf);
    bcc_ptr = pByteArea(smb_buf);
-	if (smb_buf->WordCount == 4) {
 -		blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
 -		if (blob_len > bytes_remaining) {
 -			cifs_dbg(VFS, "bad security blob length %d\n",
 -				 blob_len);
 -			rc = -EINVAL;
 -			goto ssetup_exit;
 -		}
 -		if (phase == NtLmChallenge) {
 -			rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses);
 -			/* now goto beginning for ntlmssp authenticate phase */
 -			if (rc)
 -				goto ssetup_exit;
 -		}
 -		bcc_ptr += blob_len;
 -		bytes_remaining -= blob_len;
 +	blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
 +	if (blob_len > bytes_remaining) {
 +		cifs_dbg(VFS, "bad security blob length %d\n",
 +				blob_len);
 +		rc = -EINVAL;
 +		goto out_put_spnego_key;
    }
 +	bcc_ptr += blob_len;
 +	bytes_remaining -= blob_len;
/* BB check if Unicode and decode strings */
    if (bytes_remaining == 0) {
@@@ -1083,371 -906,60 +1083,371 @@@
    		++bcc_ptr;
    		--bytes_remaining;
    	}
 -		decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp);
 +		decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
 +				      sess_data->nls_cp);
    } else {
 -		decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp);
 +		decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
 +				    sess_data->nls_cp);
    }
-ssetup_exit:
 -	if (spnego_key) {
 -		key_invalidate(spnego_key);
 -		key_put(spnego_key);
 +	rc = sess_establish_session(sess_data);
 +out_put_spnego_key:
 +	key_invalidate(spnego_key);
 +	key_put(spnego_key);
 +out:
 +	sess_data->result = rc;
 +	sess_data->func = NULL;
 +	sess_free_buffer(sess_data);
 +	kfree(ses->auth_key.response);
 +	ses->auth_key.response = NULL;
 +}
 +
 +#else
 +
 +static void
 +sess_auth_kerberos(struct sess_data *sess_data)
 +{
 +	cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n");
 +	sess_data->result = -ENOSYS;
 +	sess_data->func = NULL;
 +}
 +#endif /* ! CONFIG_CIFS_UPCALL */
 +
 +/*
 + * The required kvec buffers have to be allocated before calling this
 + * function.
 + */
 +static int
 +_sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data)
 +{
 +	struct smb_hdr *smb_buf;
 +	SESSION_SETUP_ANDX *pSMB;
 +	struct cifs_ses *ses = sess_data->ses;
 +	__u32 capabilities;
 +	char *bcc_ptr;
 +
 +	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
 +	smb_buf = (struct smb_hdr *)pSMB;
 +
 +	capabilities = cifs_ssetup_hdr(ses, pSMB);
 +	if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
 +		cifs_dbg(VFS, "NTLMSSP requires Unicode support\n");
 +		return -ENOSYS;
    }
 -	kfree(str_area);
 -	kfree(ntlmsspblob);
 -	ntlmsspblob = NULL;
 -	if (resp_buf_type == CIFS_SMALL_BUFFER) {
 -		cifs_dbg(FYI, "ssetup freeing small buf %p\n", iov[0].iov_base);
 -		cifs_small_buf_release(iov[0].iov_base);
 -	} else if (resp_buf_type == CIFS_LARGE_BUFFER)
 -		cifs_buf_release(iov[0].iov_base);
-	/* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */
 -	if ((phase == NtLmChallenge) && (rc == 0))
 -		goto ssetup_ntlmssp_authenticate;
 +	pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
 +	capabilities |= CAP_EXTENDED_SECURITY;
 +	pSMB->req.Capabilities |= cpu_to_le32(capabilities);
 +
 +	bcc_ptr = sess_data->iov[2].iov_base;
 +	/* unicode strings must be word aligned */
 +	if ((sess_data->iov[0].iov_len + sess_data->iov[1].iov_len) % 2) {
 +		*bcc_ptr = 0;
 +		bcc_ptr++;
 +	}
 +	unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp);
 +
 +	sess_data->iov[2].iov_len = (long) bcc_ptr -
 +					(long) sess_data->iov[2].iov_base;
 +
 +	return 0;
 +}
 +
 +static void
 +sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data);
 +
 +static void
 +sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data)
 +{
 +	int rc;
 +	struct smb_hdr *smb_buf;
 +	SESSION_SETUP_ANDX *pSMB;
 +	struct cifs_ses *ses = sess_data->ses;
 +	__u16 bytes_remaining;
 +	char *bcc_ptr;
 +	u16 blob_len;
 +
 +	cifs_dbg(FYI, "rawntlmssp session setup negotiate phase\n");
 +
 +	/*
 +	 * if memory allocation is successful, caller of this function
 +	 * frees it.
 +	 */
 +	ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL);
 +	if (!ses->ntlmssp) {
 +		rc = -ENOMEM;
 +		goto out;
 +	}
 +	ses->ntlmssp->sesskey_per_smbsess = false;
 +
 +	/* wct = 12 */
 +	rc = sess_alloc_buffer(sess_data, 12);
 +	if (rc)
 +		goto out;
 +
 +	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
 +
 +	/* Build security blob before we assemble the request */
 +	build_ntlmssp_negotiate_blob(pSMB->req.SecurityBlob, ses);
 +	sess_data->iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
 +	sess_data->iov[1].iov_base = pSMB->req.SecurityBlob;
 +	pSMB->req.SecurityBlobLength = cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
 +
 +	rc = _sess_auth_rawntlmssp_assemble_req(sess_data);
 +	if (rc)
 +		goto out;
 +
 +	rc = sess_sendreceive(sess_data);
 +
 +	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
 +	smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
 +
 +	/* If true, rc here is expected and not an error */
 +	if (sess_data->buf0_type != CIFS_NO_BUFFER &&
 +	    smb_buf->Status.CifsError ==
 +			cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))
 +		rc = 0;
 +
 +	if (rc)
 +		goto out;
 +
 +	cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n");
 +
 +	if (smb_buf->WordCount != 4) {
 +		rc = -EIO;
 +		cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
 +		goto out;
 +	}
 +
 +	ses->Suid = smb_buf->Uid;   /* UID left in wire format (le) */
 +	cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
 +
 +	bytes_remaining = get_bcc(smb_buf);
 +	bcc_ptr = pByteArea(smb_buf);
 +
 +	blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
 +	if (blob_len > bytes_remaining) {
 +		cifs_dbg(VFS, "bad security blob length %d\n",
 +				blob_len);
 +		rc = -EINVAL;
 +		goto out;
 +	}
 +
 +	rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses);
 +out:
 +	sess_free_buffer(sess_data);
if (!rc) {
 -		mutex_lock(&ses->server->srv_mutex);
 -		if (!ses->server->session_estab) {
 -			if (ses->server->sign) {
 -				ses->server->session_key.response =
 -					kmemdup(ses->auth_key.response,
 -					ses->auth_key.len, GFP_KERNEL);
 -				if (!ses->server->session_key.response) {
 -					rc = -ENOMEM;
 -					mutex_unlock(&ses->server->srv_mutex);
 -					goto keycp_exit;
 -				}
 -				ses->server->session_key.len =
 -							ses->auth_key.len;
 -			}
 -			ses->server->sequence_number = 0x2;
 -			ses->server->session_estab = true;
 -		}
 -		mutex_unlock(&ses->server->srv_mutex);
 +		sess_data->func = sess_auth_rawntlmssp_authenticate;
 +		return;
 +	}
 +
 +	/* Else error. Cleanup */
 +	kfree(ses->auth_key.response);
 +	ses->auth_key.response = NULL;
 +	kfree(ses->ntlmssp);
 +	ses->ntlmssp = NULL;
 +
 +	sess_data->func = NULL;
 +	sess_data->result = rc;
 +}
-		cifs_dbg(FYI, "CIFS session established successfully\n");
 -		spin_lock(&GlobalMid_Lock);
 -		ses->status = CifsGood;
 -		ses->need_reconnect = false;
 -		spin_unlock(&GlobalMid_Lock);
 +static void
 +sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data)
 +{
 +	int rc;
 +	struct smb_hdr *smb_buf;
 +	SESSION_SETUP_ANDX *pSMB;
 +	struct cifs_ses *ses = sess_data->ses;
 +	__u16 bytes_remaining;
 +	char *bcc_ptr;
 +	char *ntlmsspblob = NULL;
 +	u16 blob_len;
 +
 +	cifs_dbg(FYI, "rawntlmssp session setup authenticate phase\n");
 +
 +	/* wct = 12 */
 +	rc = sess_alloc_buffer(sess_data, 12);
 +	if (rc)
 +		goto out;
 +
 +	/* Build security blob before we assemble the request */
 +	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
 +	smb_buf = (struct smb_hdr *)pSMB;
 +	/*
 +	 * 5 is an empirical value, large enough to hold
 +	 * authenticate message plus max 10 of av pairs,
 +	 * domain, user, workstation names, flags, etc.
 +	 */
 +	ntlmsspblob = kzalloc(5*sizeof(struct _AUTHENTICATE_MESSAGE),
 +				GFP_KERNEL);
 +	if (!ntlmsspblob) {
 +		rc = -ENOMEM;
 +		goto out;
    }
-keycp_exit:
 +	rc = build_ntlmssp_auth_blob(ntlmsspblob,
 +					&blob_len, ses, sess_data->nls_cp);
 +	if (rc)
 +		goto out_free_ntlmsspblob;
 +	sess_data->iov[1].iov_len = blob_len;
 +	sess_data->iov[1].iov_base = ntlmsspblob;
 +	pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len);
 +	/*
 +	 * Make sure that we tell the server that we are using
 +	 * the uid that it just gave us back on the response
 +	 * (challenge)
 +	 */
 +	smb_buf->Uid = ses->Suid;
 +
 +	rc = _sess_auth_rawntlmssp_assemble_req(sess_data);
 +	if (rc)
 +		goto out_free_ntlmsspblob;
 +
 +	rc = sess_sendreceive(sess_data);
 +	if (rc)
 +		goto out_free_ntlmsspblob;
 +
 +	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
 +	smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
 +	if (smb_buf->WordCount != 4) {
 +		rc = -EIO;
 +		cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
 +		goto out_free_ntlmsspblob;
 +	}
 +
 +	if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
 +		cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
 +
 +	bytes_remaining = get_bcc(smb_buf);
 +	bcc_ptr = pByteArea(smb_buf);
 +	blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
 +	if (blob_len > bytes_remaining) {
 +		cifs_dbg(VFS, "bad security blob length %d\n",
 +				blob_len);
 +		rc = -EINVAL;
 +		goto out_free_ntlmsspblob;
 +	}
 +	bcc_ptr += blob_len;
 +	bytes_remaining -= blob_len;
 +
 +
 +	/* BB check if Unicode and decode strings */
 +	if (bytes_remaining == 0) {
 +		/* no string area to decode, do nothing */
 +	} else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
 +		/* unicode string area must be word-aligned */
 +		if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
 +			++bcc_ptr;
 +			--bytes_remaining;
 +		}
 +		decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
 +				      sess_data->nls_cp);
 +	} else {
 +		decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
 +				    sess_data->nls_cp);
 +	}
 +
 +out_free_ntlmsspblob:
 +	kfree(ntlmsspblob);
 +out:
 +	sess_free_buffer(sess_data);
 +
 +	 if (!rc)
 +		rc = sess_establish_session(sess_data);
 +
 +	/* Cleanup */
    kfree(ses->auth_key.response);
    ses->auth_key.response = NULL;
    kfree(ses->ntlmssp);
 +	ses->ntlmssp = NULL;
 +
 +	sess_data->func = NULL;
 +	sess_data->result = rc;
 +}
 +
 +static int select_sec(struct cifs_ses *ses, struct sess_data *sess_data)
 +{
 +	int type;
 +
 +	type = select_sectype(ses->server, ses->sectype);
 +	cifs_dbg(FYI, "sess setup type %d\n", type);
 +	if (type == Unspecified) {
 +		cifs_dbg(VFS,
 +			"Unable to select appropriate authentication method!");
 +		return -EINVAL;
 +	}
 +
 +	switch (type) {
 +	case LANMAN:
 +		/* LANMAN and plaintext are less secure and off by default.
 +		 * So we make this explicitly be turned on in kconfig (in the
 +		 * build) and turned on at runtime (changed from the default)
 +		 * in proc/fs/cifs or via mount parm.  Unfortunately this is
 +		 * needed for old Win (e.g. Win95), some obscure NAS and OS/2 */
 +#ifdef CONFIG_CIFS_WEAK_PW_HASH
 +		sess_data->func = sess_auth_lanman;
 +		break;
 +#else
 +		return -EOPNOTSUPP;
 +#endif
 +	case NTLM:
 +		sess_data->func = sess_auth_ntlm;
 +		break;
 +	case NTLMv2:
 +		sess_data->func = sess_auth_ntlmv2;
 +		break;
 +	case Kerberos:
 +#ifdef CONFIG_CIFS_UPCALL
 +		sess_data->func = sess_auth_kerberos;
 +		break;
 +#else
 +		cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n");
 +		return -ENOSYS;
 +		break;
 +#endif /* CONFIG_CIFS_UPCALL */
 +	case RawNTLMSSP:
 +		sess_data->func = sess_auth_rawntlmssp_negotiate;
 +		break;
 +	default:
 +		cifs_dbg(VFS, "secType %d not supported!\n", type);
 +		return -ENOSYS;
 +	}
 +
 +	return 0;
 +}
 +
 +int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
 +		    const struct nls_table *nls_cp)
 +{
 +	int rc = 0;
 +	struct sess_data *sess_data;
 +
 +	if (ses == NULL) {
 +		WARN(1, "%s: ses == NULL!", __func__);
 +		return -EINVAL;
 +	}
 +
 +	sess_data = kzalloc(sizeof(struct sess_data), GFP_KERNEL);
 +	if (!sess_data)
 +		return -ENOMEM;
 +
 +	rc = select_sec(ses, sess_data);
 +	if (rc)
 +		goto out;
 +
 +	sess_data->xid = xid;
 +	sess_data->ses = ses;
 +	sess_data->buf0_type = CIFS_NO_BUFFER;
 +	sess_data->nls_cp = (struct nls_table *) nls_cp;
 +
 +	while (sess_data->func)
 +		sess_data->func(sess_data);
 +
 +	/* Store result before we free sess_data */
 +	rc = sess_data->result;
+out:
 +	kfree(sess_data);
    return rc;
  }
diff --combined fs/cifs/smb2ops.c
index 081529f,7f99a0f..59437c5
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@@ -112,53 -112,6 +112,53 @@@ smb2_get_credits(struct mid_q_entry *mi
    return le16_to_cpu(((struct smb2_hdr *)mid->resp_buf)->CreditRequest);
  }
+static int
 +smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
 +		      unsigned int *num, unsigned int *credits)
 +{
 +	int rc = 0;
 +	unsigned int scredits;
 +
 +	spin_lock(&server->req_lock);
 +	while (1) {
 +		if (server->credits <= 0) {
 +			spin_unlock(&server->req_lock);
 +			cifs_num_waiters_inc(server);
 +			rc = wait_event_killable(server->request_q,
 +					has_credits(server, &server->credits));
 +			cifs_num_waiters_dec(server);
 +			if (rc)
 +				return rc;
 +			spin_lock(&server->req_lock);
 +		} else {
 +			if (server->tcpStatus == CifsExiting) {
 +				spin_unlock(&server->req_lock);
 +				return -ENOENT;
 +			}
 +
 +			scredits = server->credits;
 +			/* can deadlock with reopen */
 +			if (scredits == 1) {
 +				*num = SMB2_MAX_BUFFER_SIZE;
 +				*credits = 0;
 +				break;
 +			}
 +
 +			/* leave one credit for a possible reopen */
 +			scredits--;
 +			*num = min_t(unsigned int, size,
 +				     scredits * SMB2_MAX_BUFFER_SIZE);
 +
 +			*credits = DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE);
 +			server->credits -= *credits;
 +			server->in_flight++;
 +			break;
 +		}
 +	}
 +	spin_unlock(&server->req_lock);
 +	return rc;
 +}
 +
  static __u64
  smb2_get_next_mid(struct TCP_Server_Info *server)
  {
@@@ -229,6 -182,8 +229,6 @@@ smb2_negotiate_wsize(struct cifs_tcon *
    /* start with specified wsize, or default */
    wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE;
    wsize = min_t(unsigned int, wsize, server->max_write);
 -	/* set it to the maximum buffer size value we can send with 1 credit */
 -	wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
return wsize;
  }
@@@ -242,6 -197,8 +242,6 @@@ smb2_negotiate_rsize(struct cifs_tcon *
    /* start with specified rsize, or default */
    rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE;
    rsize = min_t(unsigned int, rsize, server->max_read);
 -	/* set it to the maximum buffer size value we can send with 1 credit */
 -	rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
return rsize;
  }
@@@ -590,7 -547,7 +590,7 @@@ smb2_clone_range(const unsigned int xid
    	goto cchunk_out;
/* For now array only one chunk long, will make more flexible later */
- 	pcchunk->ChunkCount = __constant_cpu_to_le32(1);
+ 	pcchunk->ChunkCount = cpu_to_le32(1);
    pcchunk->Reserved = 0;
    pcchunk->Reserved2 = 0;
@@@ -1147,13 -1104,6 +1147,13 @@@ smb3_parse_lease_buf(void *buf, unsigne
    return le32_to_cpu(lc->lcontext.LeaseState);
  }
+static unsigned int
 +smb2_wp_retry_size(struct inode *inode)
 +{
 +	return min_t(unsigned int, CIFS_SB(inode->i_sb)->wsize,
 +		     SMB2_MAX_BUFFER_SIZE);
 +}
 +
  struct smb_version_operations smb20_operations = {
    .compare_fids = smb2_compare_fids,
    .setup_request = smb2_setup_request,
@@@ -1163,7 -1113,6 +1163,7 @@@
    .set_credits = smb2_set_credits,
    .get_credits_field = smb2_get_credits_field,
    .get_credits = smb2_get_credits,
 +	.wait_mtu_credits = cifs_wait_mtu_credits,
    .get_next_mid = smb2_get_next_mid,
    .read_data_offset = smb2_read_data_offset,
    .read_data_length = smb2_read_data_length,
@@@ -1228,7 -1177,6 +1228,7 @@@
    .create_lease_buf = smb2_create_lease_buf,
    .parse_lease_buf = smb2_parse_lease_buf,
    .clone_range = smb2_clone_range,
 +	.wp_retry_size = smb2_wp_retry_size,
  };
struct smb_version_operations smb21_operations = {
@@@ -1240,7 -1188,6 +1240,7 @@@
    .set_credits = smb2_set_credits,
    .get_credits_field = smb2_get_credits_field,
    .get_credits = smb2_get_credits,
 +	.wait_mtu_credits = smb2_wait_mtu_credits,
    .get_next_mid = smb2_get_next_mid,
    .read_data_offset = smb2_read_data_offset,
    .read_data_length = smb2_read_data_length,
@@@ -1305,7 -1252,6 +1305,7 @@@
    .create_lease_buf = smb2_create_lease_buf,
    .parse_lease_buf = smb2_parse_lease_buf,
    .clone_range = smb2_clone_range,
 +	.wp_retry_size = smb2_wp_retry_size,
  };
struct smb_version_operations smb30_operations = {
@@@ -1317,7 -1263,6 +1317,7 @@@
    .set_credits = smb2_set_credits,
    .get_credits_field = smb2_get_credits_field,
    .get_credits = smb2_get_credits,
 +	.wait_mtu_credits = smb2_wait_mtu_credits,
    .get_next_mid = smb2_get_next_mid,
    .read_data_offset = smb2_read_data_offset,
    .read_data_length = smb2_read_data_length,
@@@ -1385,7 -1330,6 +1385,7 @@@
    .parse_lease_buf = smb3_parse_lease_buf,
    .clone_range = smb2_clone_range,
    .validate_negotiate = smb3_validate_negotiate,
 +	.wp_retry_size = smb2_wp_retry_size,
  };
struct smb_version_values smb20_values = {
diff --combined fs/cifs/smb2pdu.c
index 768cddb,a9b03c2..2057250
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@@ -245,6 -245,10 +245,6 @@@ smb2_reconnect(__le16 smb2_command, str
    if (rc)
    	goto out;
    atomic_inc(&tconInfoReconnectCount);
 -	/*
 -	 * BB FIXME add code to check if wsize needs update due to negotiated
 -	 * smb buffer size shrinking.
 -	 */
  out:
    /*
     * Check if handle based operation so we know whether we can continue
@@@ -305,6 -309,16 +305,6 @@@ small_smb2_init(__le16 smb2_command, st
    return rc;
  }
-static void
 -free_rsp_buf(int resp_buftype, void *rsp)
 -{
 -	if (resp_buftype == CIFS_SMALL_BUFFER)
 -		cifs_small_buf_release(rsp);
 -	else if (resp_buftype == CIFS_LARGE_BUFFER)
 -		cifs_buf_release(rsp);
 -}
 -
 -
  /*
   *
   *	SMB2 Worker functions follow:
@@@ -1355,7 -1369,7 +1355,7 @@@ SMB2_set_compression(const unsigned in
    char *ret_data = NULL;
fsctl_input.CompressionState =
- 			__constant_cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
+ 			cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
    		FSCTL_SET_COMPRESSION, true /* is_fsctl */,
@@@ -1724,18 -1738,12 +1724,18 @@@ smb2_readv_callback(struct mid_q_entry 
    				 rc);
    	}
    	/* FIXME: should this be counted toward the initiating task? */
 -		task_io_account_read(rdata->bytes);
 -		cifs_stats_bytes_read(tcon, rdata->bytes);
 +		task_io_account_read(rdata->got_bytes);
 +		cifs_stats_bytes_read(tcon, rdata->got_bytes);
    	break;
    case MID_REQUEST_SUBMITTED:
    case MID_RETRY_NEEDED:
    	rdata->result = -EAGAIN;
 +		if (server->sign && rdata->got_bytes)
 +			/* reset bytes number since we can not check a sign */
 +			rdata->got_bytes = 0;
 +		/* FIXME: should this be counted toward the initiating task? */
 +		task_io_account_read(rdata->got_bytes);
 +		cifs_stats_bytes_read(tcon, rdata->got_bytes);
    	break;
    default:
    	if (rdata->result != -ENODATA)
@@@ -1754,12 -1762,11 +1754,12 @@@
  int
  smb2_async_readv(struct cifs_readdata *rdata)
  {
 -	int rc;
 +	int rc, flags = 0;
    struct smb2_hdr *buf;
    struct cifs_io_parms io_parms;
    struct smb_rqst rqst = { .rq_iov = &rdata->iov,
    			 .rq_nvec = 1 };
 +	struct TCP_Server_Info *server;
cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n",
    	 __func__, rdata->offset, rdata->bytes);
@@@ -1770,41 -1777,18 +1770,41 @@@
    io_parms.persistent_fid = rdata->cfile->fid.persistent_fid;
    io_parms.volatile_fid = rdata->cfile->fid.volatile_fid;
    io_parms.pid = rdata->pid;
 +
 +	server = io_parms.tcon->ses->server;
 +
    rc = smb2_new_read_req(&rdata->iov, &io_parms, 0, 0);
 -	if (rc)
 +	if (rc) {
 +		if (rc == -EAGAIN && rdata->credits) {
 +			/* credits were reset by reconnect */
 +			rdata->credits = 0;
 +			/* reduce in_flight value since we won't send the req */
 +			spin_lock(&server->req_lock);
 +			server->in_flight--;
 +			spin_unlock(&server->req_lock);
 +		}
    	return rc;
 +	}
buf = (struct smb2_hdr *)rdata->iov.iov_base;
    /* 4 for rfc1002 length field */
    rdata->iov.iov_len = get_rfc1002_length(rdata->iov.iov_base) + 4;
+	if (rdata->credits) {
 +		buf->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes,
 +						SMB2_MAX_BUFFER_SIZE));
 +		spin_lock(&server->req_lock);
 +		server->credits += rdata->credits -
 +						le16_to_cpu(buf->CreditCharge);
 +		spin_unlock(&server->req_lock);
 +		wake_up(&server->request_q);
 +		flags = CIFS_HAS_CREDITS;
 +	}
 +
    kref_get(&rdata->refcount);
    rc = cifs_call_async(io_parms.tcon->ses->server, &rqst,
    		     cifs_readv_receive, smb2_readv_callback,
 -			     rdata, 0);
 +			     rdata, flags);
    if (rc) {
    	kref_put(&rdata->refcount, cifs_readdata_release);
    	cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE);
@@@ -1922,25 -1906,15 +1922,25 @@@ in
  smb2_async_writev(struct cifs_writedata *wdata,
    	  void (*release)(struct kref *kref))
  {
 -	int rc = -EACCES;
 +	int rc = -EACCES, flags = 0;
    struct smb2_write_req *req = NULL;
    struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
 +	struct TCP_Server_Info *server = tcon->ses->server;
    struct kvec iov;
    struct smb_rqst rqst;
rc = small_smb2_init(SMB2_WRITE, tcon, (void **) &req);
 -	if (rc)
 +	if (rc) {
 +		if (rc == -EAGAIN && wdata->credits) {
 +			/* credits were reset by reconnect */
 +			wdata->credits = 0;
 +			/* reduce in_flight value since we won't send the req */
 +			spin_lock(&server->req_lock);
 +			server->in_flight--;
 +			spin_unlock(&server->req_lock);
 +		}
    	goto async_writev_out;
 +	}
req->hdr.ProcessId = cpu_to_le32(wdata->cfile->pid);
@@@ -1973,20 -1947,9 +1973,20 @@@
inc_rfc1001_len(&req->hdr, wdata->bytes - 1 /* Buffer */);
+	if (wdata->credits) {
 +		req->hdr.CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes,
 +						    SMB2_MAX_BUFFER_SIZE));
 +		spin_lock(&server->req_lock);
 +		server->credits += wdata->credits -
 +					le16_to_cpu(req->hdr.CreditCharge);
 +		spin_unlock(&server->req_lock);
 +		wake_up(&server->request_q);
 +		flags = CIFS_HAS_CREDITS;
 +	}
 +
    kref_get(&wdata->refcount);
 -	rc = cifs_call_async(tcon->ses->server, &rqst, NULL,
 -				smb2_writev_callback, wdata, 0);
 +	rc = cifs_call_async(server, &rqst, NULL, smb2_writev_callback, wdata,
 +			     flags);
if (rc) {
    	kref_put(&wdata->refcount, release);
diff --combined fs/exec.c
index ab1f120,2ef2751..a2b42a9
--- a/fs/exec.c
+++ b/fs/exec.c
@@@ -368,10 -368,6 +368,6 @@@ static int bprm_mm_init(struct linux_bi
    if (!mm)
    	goto err;
- 	err = init_new_context(current, mm);
- 	if (err)
- 		goto err;
- 
    err = __bprm_mm_init(bprm);
    if (err)
    	goto err;
@@@ -1216,7 -1212,7 +1212,7 @@@ EXPORT_SYMBOL(install_exec_creds)
  /*
   * determine how safe it is to execute the proposed program
   * - the caller must hold ->cred_guard_mutex to protect against
 - *   PTRACE_ATTACH
 + *   PTRACE_ATTACH or seccomp thread-sync
   */
  static void check_unsafe_exec(struct linux_binprm *bprm)
  {
@@@ -1234,7 -1230,7 +1230,7 @@@
     * This isn't strictly necessary, but it makes it harder for LSMs to
     * mess up.
     */
 -	if (current->no_new_privs)
 +	if (task_no_new_privs(current))
    	bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
t = p;
@@@ -1272,7 -1268,7 +1268,7 @@@ int prepare_binprm(struct linux_binprm 
    bprm->cred->egid = current_egid();
if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) &&
 -	    !current->no_new_privs &&
 +	    !task_no_new_privs(current) &&
        kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) &&
        kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) {
    	/* Set-uid? */
diff --combined fs/fscache/main.c
index a31b83c,3248c15..b39d487
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@@ -67,7 -67,7 +67,7 @@@ static int fscache_max_active_sysctl(st
    return ret;
  }
- struct ctl_table fscache_sysctls[] = {
+ static struct ctl_table fscache_sysctls[] = {
    {
    	.procname	= "object_max_active",
    	.data		= &fscache_object_max_active,
@@@ -87,7 -87,7 +87,7 @@@
    {}
  };
- struct ctl_table fscache_sysctls_root[] = {
+ static struct ctl_table fscache_sysctls_root[] = {
    {
    	.procname	= "fscache",
    	.mode		= 0555,
@@@ -197,6 -197,24 +197,6 @@@ static void __exit fscache_exit(void
  module_exit(fscache_exit);
/*
 - * wait_on_bit() sleep function for uninterruptible waiting
 - */
 -int fscache_wait_bit(void *flags)
 -{
 -	schedule();
 -	return 0;
 -}
 -
 -/*
 - * wait_on_bit() sleep function for interruptible waiting
 - */
 -int fscache_wait_bit_interruptible(void *flags)
 -{
 -	schedule();
 -	return signal_pending(current);
 -}
 -
 -/*
   * wait_on_atomic_t() sleep function for uninterruptible waiting
   */
  int fscache_wait_atomic_t(atomic_t *p)
diff --combined fs/namespace.c
index b10db3d,2a1447c..019ff81
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@@ -225,7 -225,6 +225,7 @@@ static struct mount *alloc_vfsmnt(cons
    	INIT_LIST_HEAD(&mnt->mnt_share);
    	INIT_LIST_HEAD(&mnt->mnt_slave_list);
    	INIT_LIST_HEAD(&mnt->mnt_slave);
 +		INIT_LIST_HEAD(&mnt->mnt_mp_list);
  #ifdef CONFIG_FSNOTIFY
    	INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
  #endif
@@@ -668,45 -667,11 +668,45 @@@ struct vfsmount *lookup_mnt(struct pat
    return m;
  }
-static struct mountpoint *new_mountpoint(struct dentry *dentry)
 +/*
 + * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
 + *                         current mount namespace.
 + *
 + * The common case is dentries are not mountpoints at all and that
 + * test is handled inline.  For the slow case when we are actually
 + * dealing with a mountpoint of some kind, walk through all of the
 + * mounts in the current mount namespace and test to see if the dentry
 + * is a mountpoint.
 + *
 + * The mount_hashtable is not usable in the context because we
 + * need to identify all mounts that may be in the current mount
 + * namespace not just a mount that happens to have some specified
 + * parent mount.
 + */
 +bool __is_local_mountpoint(struct dentry *dentry)
 +{
 +	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
 +	struct mount *mnt;
 +	bool is_covered = false;
 +
 +	if (!d_mountpoint(dentry))
 +		goto out;
 +
 +	down_read(&namespace_sem);
 +	list_for_each_entry(mnt, &ns->list, mnt_list) {
 +		is_covered = (mnt->mnt_mountpoint == dentry);
 +		if (is_covered)
 +			break;
 +	}
 +	up_read(&namespace_sem);
 +out:
 +	return is_covered;
 +}
 +
 +static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
  {
    struct hlist_head *chain = mp_hash(dentry);
    struct mountpoint *mp;
 -	int ret;
hlist_for_each_entry(mp, chain, m_hash) {
    	if (mp->m_dentry == dentry) {
@@@ -717,14 -682,6 +717,14 @@@
    		return mp;
    	}
    }
 +	return NULL;
 +}
 +
 +static struct mountpoint *new_mountpoint(struct dentry *dentry)
 +{
 +	struct hlist_head *chain = mp_hash(dentry);
 +	struct mountpoint *mp;
 +	int ret;
mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
    if (!mp)
@@@ -739,7 -696,6 +739,7 @@@
    mp->m_dentry = dentry;
    mp->m_count = 1;
    hlist_add_head(&mp->m_hash, chain);
 +	INIT_LIST_HEAD(&mp->m_list);
    return mp;
  }
@@@ -747,7 -703,6 +747,7 @@@ static void put_mountpoint(struct mount
  {
    if (!--mp->m_count) {
    	struct dentry *dentry = mp->m_dentry;
 +		BUG_ON(!list_empty(&mp->m_list));
    	spin_lock(&dentry->d_lock);
    	dentry->d_flags &= ~DCACHE_MOUNTED;
    	spin_unlock(&dentry->d_lock);
@@@ -794,7 -749,6 +794,7 @@@ static void detach_mnt(struct mount *mn
    mnt->mnt_mountpoint = mnt->mnt.mnt_root;
    list_del_init(&mnt->mnt_child);
    hlist_del_init_rcu(&mnt->mnt_hash);
 +	list_del_init(&mnt->mnt_mp_list);
    put_mountpoint(mnt->mnt_mp);
    mnt->mnt_mp = NULL;
  }
@@@ -811,7 -765,6 +811,7 @@@ void mnt_set_mountpoint(struct mount *m
    child_mnt->mnt_mountpoint = dget(mp->m_dentry);
    child_mnt->mnt_parent = mnt;
    child_mnt->mnt_mp = mp;
 +	list_add_tail(&child_mnt->mnt_mp_list, &mp->m_list);
  }
/*
@@@ -845,7 -798,7 +845,7 @@@ static void commit_tree(struct mount *m
    list_splice(&head, n->list.prev);
if (shadows)
- 		hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash);
+ 		hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
    else
    	hlist_add_head_rcu(&mnt->mnt_hash,
    			m_hash(&parent->mnt, mnt->mnt_mountpoint));
@@@ -983,25 -936,9 +983,25 @@@ static struct mount *clone_mnt(struct m
    return ERR_PTR(err);
  }
+static void cleanup_mnt(struct mount *mnt)
 +{
 +	fsnotify_vfsmount_delete(&mnt->mnt);
 +	dput(mnt->mnt.mnt_root);
 +	deactivate_super(mnt->mnt.mnt_sb);
 +	mnt_free_id(mnt);
 +	complete(mnt->mnt_undone);
 +	call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
 +}
 +
 +static void cleanup_mnt_work(struct work_struct *work)
 +{
 +	cleanup_mnt(container_of(work, struct mount, mnt_cleanup_work));
 +}
 +
  static void mntput_no_expire(struct mount *mnt)
  {
 -put_again:
 +	struct completion undone;
 +
    rcu_read_lock();
    mnt_add_count(mnt, -1);
    if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
@@@ -1015,15 -952,12 +1015,15 @@@
    	return;
    }
    if (unlikely(mnt->mnt_pinned)) {
 -		mnt_add_count(mnt, mnt->mnt_pinned + 1);
 +		init_completion(&undone);
 +		mnt->mnt_undone = &undone;
 +		mnt_add_count(mnt, mnt->mnt_pinned);
    	mnt->mnt_pinned = 0;
    	rcu_read_unlock();
    	unlock_mount_hash();
    	acct_auto_close_mnt(&mnt->mnt);
 -		goto put_again;
 +		wait_for_completion(&undone);
 +		return;
    }
    if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
    	rcu_read_unlock();
@@@ -1047,19 -981,11 +1047,19 @@@
     * so mnt_get_writers() below is safe.
     */
    WARN_ON(mnt_get_writers(mnt));
 -	fsnotify_vfsmount_delete(&mnt->mnt);
 -	dput(mnt->mnt.mnt_root);
 -	deactivate_super(mnt->mnt.mnt_sb);
 -	mnt_free_id(mnt);
 -	call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
 +	/* The stack may be deep here, cleanup the mount on a work
 +	 * queue where the stack is guaranteed to be shallow.
 +	 */
 +	init_completion(&undone);
 +	if (!mnt->mnt_undone)
 +		mnt->mnt_undone = &undone;
 +	else
 +		complete(&undone);
 +
 +	INIT_WORK(&mnt->mnt_cleanup_work, cleanup_mnt_work);
 +	schedule_work(&mnt->mnt_cleanup_work);
 +
 +	wait_for_completion(&undone);
  }
void mntput(struct vfsmount *mnt)
@@@ -1335,7 -1261,6 +1335,7 @@@ void umount_tree(struct mount *mnt, in
    		p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
    	list_del_init(&p->mnt_child);
    	if (mnt_has_parent(p)) {
 +			list_del_init(&p->mnt_mp_list);
    		put_mountpoint(p->mnt_mp);
    		/* move the reference to mountpoint into ->mnt_ex_mountpoint */
    		p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint;
@@@ -1448,37 -1373,6 +1448,37 @@@ static int do_umount(struct mount *mnt
    return retval;
  }
+/*
 + * __detach_mounts - lazily unmount all mounts on the specified dentry
 + *
 + * During unlink, rmdir, and d_drop it is possible to lose the path
 + * to an existing mountpoint, and wind up leaking the mount.
 + * detach_mounts allows lazily unmounting those mounts instead of
 + * leaking them.
 + * 
 + * The caller may hold dentry->d_inode->i_mutex.
 + */
 +void __detach_mounts(struct dentry *dentry)
 +{
 +	struct mountpoint *mp;
 +	struct mount *mnt;
 +
 +	namespace_lock();
 +	mp = lookup_mountpoint(dentry);
 +	if (!mp)
 +		goto out_unlock;
 +
 +	lock_mount_hash();
 +	while (!list_empty(&mp->m_list)) {
 +		mnt = list_first_entry(&mp->m_list, struct mount, mnt_mp_list);
 +		umount_tree(mnt, 2);
 +	}
 +	unlock_mount_hash();
 +	put_mountpoint(mp);
 +out_unlock:
 +	namespace_unlock();
 +}
 +
  /* 
   * Is the caller allowed to modify his namespace?
   */
@@@ -1828,9 -1722,7 +1828,9 @@@ retry
    namespace_lock();
    mnt = lookup_mnt(path);
    if (likely(!mnt)) {
 -		struct mountpoint *mp = new_mountpoint(dentry);
 +		struct mountpoint *mp = lookup_mountpoint(dentry);
 +		if (!mp)
 +			mp = new_mountpoint(dentry);
    	if (IS_ERR(mp)) {
    		namespace_unlock();
    		mutex_unlock(&dentry->d_inode->i_mutex);
diff --combined fs/proc/base.c
index e442784,043c83c..2105331
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@@ -105,7 -105,7 +105,7 @@@
   */
struct pid_entry {
- 	char *name;
+ 	const char *name;
    int len;
    umode_t mode;
    const struct inode_operations *iop;
@@@ -130,10 -130,6 +130,6 @@@
    	{ .proc_get_link = get_link } )
  #define REG(NAME, MODE, fops)				\
    NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
- #define INF(NAME, MODE, read)				\
- 	NOD(NAME, (S_IFREG|(MODE)), 			\
- 		NULL, &proc_info_file_operations,	\
- 		{ .proc_read = read } )
  #define ONE(NAME, MODE, show)				\
    NOD(NAME, (S_IFREG|(MODE)), 			\
    	NULL, &proc_single_file_operations,	\
@@@ -200,27 -196,32 +196,32 @@@ static int proc_root_link(struct dentr
    return result;
  }
- static int proc_pid_cmdline(struct task_struct *task, char *buffer)
+ static int proc_pid_cmdline(struct seq_file *m, struct pid_namespace *ns,
+ 			    struct pid *pid, struct task_struct *task)
  {
- 	return get_cmdline(task, buffer, PAGE_SIZE);
+ 	/*
+ 	 * Rely on struct seq_operations::show() being called once
+ 	 * per internal buffer allocation. See single_open(), traverse().
+ 	 */
+ 	BUG_ON(m->size < PAGE_SIZE);
+ 	m->count += get_cmdline(task, m->buf, PAGE_SIZE);
+ 	return 0;
  }
- static int proc_pid_auxv(struct task_struct *task, char *buffer)
+ static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns,
+ 			 struct pid *pid, struct task_struct *task)
  {
    struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ);
- 	int res = PTR_ERR(mm);
    if (mm && !IS_ERR(mm)) {
    	unsigned int nwords = 0;
    	do {
    		nwords += 2;
    	} while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
- 		res = nwords * sizeof(mm->saved_auxv[0]);
- 		if (res > PAGE_SIZE)
- 			res = PAGE_SIZE;
- 		memcpy(buffer, mm->saved_auxv, res);
+ 		seq_write(m, mm->saved_auxv, nwords * sizeof(mm->saved_auxv[0]));
    	mmput(mm);
- 	}
- 	return res;
+ 		return 0;
+ 	} else
+ 		return PTR_ERR(mm);
  }
@@@ -229,7 -230,8 +230,8 @@@
   * Provides a wchan file via kallsyms in a proper one-value-per-file format.
   * Returns the resolved symbol.  If that fails, simply return the address.
   */
- static int proc_pid_wchan(struct task_struct *task, char *buffer)
+ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
+ 			  struct pid *pid, struct task_struct *task)
  {
    unsigned long wchan;
    char symname[KSYM_NAME_LEN];
@@@ -240,9 -242,9 +242,9 @@@
    	if (!ptrace_may_access(task, PTRACE_MODE_READ))
    		return 0;
    	else
- 			return sprintf(buffer, "%lu", wchan);
+ 			return seq_printf(m, "%lu", wchan);
    else
- 		return sprintf(buffer, "%s", symname);
+ 		return seq_printf(m, "%s", symname);
  }
  #endif /* CONFIG_KALLSYMS */
@@@ -304,9 -306,10 +306,10 @@@ static int proc_pid_stack(struct seq_fi
  /*
   * Provides /proc/PID/schedstat
   */
- static int proc_pid_schedstat(struct task_struct *task, char *buffer)
+ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
+ 			      struct pid *pid, struct task_struct *task)
  {
- 	return sprintf(buffer, "%llu %llu %lu\n",
+ 	return seq_printf(m, "%llu %llu %lu\n",
    		(unsigned long long)task->se.sum_exec_runtime,
    		(unsigned long long)task->sched_info.run_delay,
    		task->sched_info.pcount);
@@@ -404,7 -407,8 +407,8 @@@ static const struct file_operations pro
  };
  #endif
- static int proc_oom_score(struct task_struct *task, char *buffer)
+ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
+ 			  struct pid *pid, struct task_struct *task)
  {
    unsigned long totalpages = totalram_pages + total_swap_pages;
    unsigned long points = 0;
@@@ -414,12 -418,12 +418,12 @@@
    	points = oom_badness(task, NULL, NULL, totalpages) *
    					1000 / totalpages;
    read_unlock(&tasklist_lock);
- 	return sprintf(buffer, "%lu\n", points);
+ 	return seq_printf(m, "%lu\n", points);
  }
struct limit_names {
- 	char *name;
- 	char *unit;
+ 	const char *name;
+ 	const char *unit;
  };
static const struct limit_names lnames[RLIM_NLIMITS] = {
@@@ -442,12 -446,11 +446,11 @@@
  };
/* Display limits for a process */
- static int proc_pid_limits(struct task_struct *task, char *buffer)
+ static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns,
+ 			   struct pid *pid, struct task_struct *task)
  {
    unsigned int i;
- 	int count = 0;
    unsigned long flags;
- 	char *bufptr = buffer;
struct rlimit rlim[RLIM_NLIMITS];
@@@ -459,35 -462,34 +462,34 @@@
    /*
     * print the file header
     */
- 	count += sprintf(&bufptr[count], "%-25s %-20s %-20s %-10s\n",
+        seq_printf(m, "%-25s %-20s %-20s %-10s\n",
    		"Limit", "Soft Limit", "Hard Limit", "Units");
for (i = 0; i < RLIM_NLIMITS; i++) {
    	if (rlim[i].rlim_cur == RLIM_INFINITY)
- 			count += sprintf(&bufptr[count], "%-25s %-20s ",
+ 			seq_printf(m, "%-25s %-20s ",
    				 lnames[i].name, "unlimited");
    	else
- 			count += sprintf(&bufptr[count], "%-25s %-20lu ",
+ 			seq_printf(m, "%-25s %-20lu ",
    				 lnames[i].name, rlim[i].rlim_cur);
if (rlim[i].rlim_max == RLIM_INFINITY)
- 			count += sprintf(&bufptr[count], "%-20s ", "unlimited");
+ 			seq_printf(m, "%-20s ", "unlimited");
    	else
- 			count += sprintf(&bufptr[count], "%-20lu ",
- 					 rlim[i].rlim_max);
+ 			seq_printf(m, "%-20lu ", rlim[i].rlim_max);
if (lnames[i].unit)
- 			count += sprintf(&bufptr[count], "%-10s\n",
- 					 lnames[i].unit);
+ 			seq_printf(m, "%-10s\n", lnames[i].unit);
    	else
- 			count += sprintf(&bufptr[count], "\n");
+ 			seq_putc(m, '\n');
    }
- 	return count;
+ 	return 0;
  }
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
- static int proc_pid_syscall(struct task_struct *task, char *buffer)
+ static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
+ 			    struct pid *pid, struct task_struct *task)
  {
    long nr;
    unsigned long args[6], sp, pc;
@@@ -496,11 -498,11 +498,11 @@@
    	return res;
if (task_current_syscall(task, &nr, args, 6, &sp, &pc))
- 		res = sprintf(buffer, "running\n");
+ 		seq_puts(m, "running\n");
    else if (nr < 0)
- 		res = sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
+ 		seq_printf(m, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
    else
- 		res = sprintf(buffer,
+ 		seq_printf(m,
    	       "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n",
    	       nr,
    	       args[0], args[1], args[2], args[3], args[4], args[5],
@@@ -598,43 -600,6 +600,6 @@@ static const struct inode_operations pr
    .setattr	= proc_setattr,
  };
- #define PROC_BLOCK_SIZE	(3*1024)		/* 4K page size but our output routines use some slack for overruns */
- 
- static ssize_t proc_info_read(struct file * file, char __user * buf,
- 			  size_t count, loff_t *ppos)
- {
- 	struct inode * inode = file_inode(file);
- 	unsigned long page;
- 	ssize_t length;
- 	struct task_struct *task = get_proc_task(inode);
- 
- 	length = -ESRCH;
- 	if (!task)
- 		goto out_no_task;
- 
- 	if (count > PROC_BLOCK_SIZE)
- 		count = PROC_BLOCK_SIZE;
- 
- 	length = -ENOMEM;
- 	if (!(page = __get_free_page(GFP_TEMPORARY)))
- 		goto out;
- 
- 	length = PROC_I(inode)->op.proc_read(task, (char*)page);
- 
- 	if (length >= 0)
- 		length = simple_read_from_buffer(buf, count, ppos, (char *)page, length);
- 	free_page(page);
- out:
- 	put_task_struct(task);
- out_no_task:
- 	return length;
- }
- 
- static const struct file_operations proc_info_file_operations = {
- 	.read		= proc_info_read,
- 	.llseek		= generic_file_llseek,
- };
- 
  static int proc_single_show(struct seq_file *m, void *v)
  {
    struct inode *inode = m->private;
@@@ -1625,6 -1590,7 +1590,6 @@@ int pid_revalidate(struct dentry *dentr
    	put_task_struct(task);
    	return 1;
    }
 -	d_drop(dentry);
    return 0;
  }
@@@ -1761,6 -1727,9 +1726,6 @@@ out
    put_task_struct(task);
out_notask:
 -	if (status <= 0)
 -		d_drop(dentry);
 -
    return status;
  }
@@@ -2052,7 -2021,7 +2017,7 @@@ static int show_timer(struct seq_file *
    struct k_itimer *timer;
    struct timers_private *tp = m->private;
    int notify;
- 	static char *nstr[] = {
+ 	static const char * const nstr[] = {
    	[SIGEV_SIGNAL] = "signal",
    	[SIGEV_NONE] = "none",
    	[SIGEV_THREAD] = "thread",
@@@ -2388,7 -2357,7 +2353,7 @@@ static const struct file_operations pro
  #endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
- static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
+ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole)
  {
    struct task_io_accounting acct = task->ioac;
    unsigned long flags;
@@@ -2412,7 -2381,7 +2377,7 @@@
unlock_task_sighand(task, &flags);
    }
- 	result = sprintf(buffer,
+ 	result = seq_printf(m,
    		"rchar: %llu\n"
    		"wchar: %llu\n"
    		"syscr: %llu\n"
@@@ -2432,20 -2401,22 +2397,22 @@@ out_unlock
    return result;
  }
- static int proc_tid_io_accounting(struct task_struct *task, char *buffer)
+ static int proc_tid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
+ 				  struct pid *pid, struct task_struct *task)
  {
- 	return do_io_accounting(task, buffer, 0);
+ 	return do_io_accounting(task, m, 0);
  }
- static int proc_tgid_io_accounting(struct task_struct *task, char *buffer)
+ static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
+ 				   struct pid *pid, struct task_struct *task)
  {
- 	return do_io_accounting(task, buffer, 1);
+ 	return do_io_accounting(task, m, 1);
  }
  #endif /* CONFIG_TASK_IO_ACCOUNTING */
#ifdef CONFIG_USER_NS
  static int proc_id_map_open(struct inode *inode, struct file *file,
- 	struct seq_operations *seq_ops)
+ 	const struct seq_operations *seq_ops)
  {
    struct user_namespace *ns = NULL;
    struct task_struct *task;
@@@ -2553,10 -2524,10 +2520,10 @@@ static const struct pid_entry tgid_base
    DIR("net",        S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
  #endif
    REG("environ",    S_IRUSR, proc_environ_operations),
- 	INF("auxv",       S_IRUSR, proc_pid_auxv),
+ 	ONE("auxv",       S_IRUSR, proc_pid_auxv),
    ONE("status",     S_IRUGO, proc_pid_status),
    ONE("personality", S_IRUSR, proc_pid_personality),
- 	INF("limits",	  S_IRUGO, proc_pid_limits),
+ 	ONE("limits",	  S_IRUGO, proc_pid_limits),
  #ifdef CONFIG_SCHED_DEBUG
    REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
  #endif
@@@ -2565,9 -2536,9 +2532,9 @@@
  #endif
    REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
  #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
- 	INF("syscall",    S_IRUSR, proc_pid_syscall),
+ 	ONE("syscall",    S_IRUSR, proc_pid_syscall),
  #endif
- 	INF("cmdline",    S_IRUGO, proc_pid_cmdline),
+ 	ONE("cmdline",    S_IRUGO, proc_pid_cmdline),
    ONE("stat",       S_IRUGO, proc_tgid_stat),
    ONE("statm",      S_IRUGO, proc_pid_statm),
    REG("maps",       S_IRUGO, proc_pid_maps_operations),
@@@ -2590,13 -2561,13 +2557,13 @@@
    DIR("attr",       S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
  #endif
  #ifdef CONFIG_KALLSYMS
- 	INF("wchan",      S_IRUGO, proc_pid_wchan),
+ 	ONE("wchan",      S_IRUGO, proc_pid_wchan),
  #endif
  #ifdef CONFIG_STACKTRACE
    ONE("stack",      S_IRUSR, proc_pid_stack),
  #endif
  #ifdef CONFIG_SCHEDSTATS
- 	INF("schedstat",  S_IRUGO, proc_pid_schedstat),
+ 	ONE("schedstat",  S_IRUGO, proc_pid_schedstat),
  #endif
  #ifdef CONFIG_LATENCYTOP
    REG("latency",  S_IRUGO, proc_lstats_operations),
@@@ -2607,7 -2578,7 +2574,7 @@@
  #ifdef CONFIG_CGROUPS
    REG("cgroup",  S_IRUGO, proc_cgroup_operations),
  #endif
- 	INF("oom_score",  S_IRUGO, proc_oom_score),
+ 	ONE("oom_score",  S_IRUGO, proc_oom_score),
    REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adj_operations),
    REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
  #ifdef CONFIG_AUDITSYSCALL
@@@ -2621,10 -2592,10 +2588,10 @@@
    REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
  #endif
  #ifdef CONFIG_TASK_IO_ACCOUNTING
- 	INF("io",	S_IRUSR, proc_tgid_io_accounting),
+ 	ONE("io",	S_IRUSR, proc_tgid_io_accounting),
  #endif
  #ifdef CONFIG_HARDWALL
- 	INF("hardwall",   S_IRUGO, proc_pid_hardwall),
+ 	ONE("hardwall",   S_IRUGO, proc_pid_hardwall),
  #endif
  #ifdef CONFIG_USER_NS
    REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
@@@ -2672,7 -2643,8 +2639,7 @@@ static void proc_flush_task_mnt(struct 
    /* no ->d_hash() rejects on procfs */
    dentry = d_hash_and_lookup(mnt->mnt_root, &name);
    if (dentry) {
 -		shrink_dcache_parent(dentry);
 -		d_drop(dentry);
 +		d_invalidate(dentry);
    	dput(dentry);
    }
@@@ -2692,7 -2664,8 +2659,7 @@@
    name.len = snprintf(buf, sizeof(buf), "%d", pid);
    dentry = d_hash_and_lookup(dir, &name);
    if (dentry) {
 -		shrink_dcache_parent(dentry);
 -		d_drop(dentry);
 +		d_invalidate(dentry);
    	dput(dentry);
    }
@@@ -2774,12 -2747,12 +2741,12 @@@ out
struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
  {
- 	int result = 0;
+ 	int result = -ENOENT;
    struct task_struct *task;
    unsigned tgid;
    struct pid_namespace *ns;
- 	tgid = name_to_int(dentry);
+ 	tgid = name_to_int(&dentry->d_name);
    if (tgid == ~0U)
    	goto out;
@@@ -2890,18 -2863,18 +2857,18 @@@ static const struct pid_entry tid_base_
    DIR("fdinfo",    S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
    DIR("ns",	 S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
    REG("environ",   S_IRUSR, proc_environ_operations),
- 	INF("auxv",      S_IRUSR, proc_pid_auxv),
+ 	ONE("auxv",      S_IRUSR, proc_pid_auxv),
    ONE("status",    S_IRUGO, proc_pid_status),
    ONE("personality", S_IRUSR, proc_pid_personality),
- 	INF("limits",	 S_IRUGO, proc_pid_limits),
+ 	ONE("limits",	 S_IRUGO, proc_pid_limits),
  #ifdef CONFIG_SCHED_DEBUG
    REG("sched",     S_IRUGO|S_IWUSR, proc_pid_sched_operations),
  #endif
    REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
  #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
- 	INF("syscall",   S_IRUSR, proc_pid_syscall),
+ 	ONE("syscall",   S_IRUSR, proc_pid_syscall),
  #endif
- 	INF("cmdline",   S_IRUGO, proc_pid_cmdline),
+ 	ONE("cmdline",   S_IRUGO, proc_pid_cmdline),
    ONE("stat",      S_IRUGO, proc_tid_stat),
    ONE("statm",     S_IRUGO, proc_pid_statm),
    REG("maps",      S_IRUGO, proc_tid_maps_operations),
@@@ -2926,13 -2899,13 +2893,13 @@@
    DIR("attr",      S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
  #endif
  #ifdef CONFIG_KALLSYMS
- 	INF("wchan",     S_IRUGO, proc_pid_wchan),
+ 	ONE("wchan",     S_IRUGO, proc_pid_wchan),
  #endif
  #ifdef CONFIG_STACKTRACE
    ONE("stack",      S_IRUSR, proc_pid_stack),
  #endif
  #ifdef CONFIG_SCHEDSTATS
- 	INF("schedstat", S_IRUGO, proc_pid_schedstat),
+ 	ONE("schedstat", S_IRUGO, proc_pid_schedstat),
  #endif
  #ifdef CONFIG_LATENCYTOP
    REG("latency",  S_IRUGO, proc_lstats_operations),
@@@ -2943,7 -2916,7 +2910,7 @@@
  #ifdef CONFIG_CGROUPS
    REG("cgroup",  S_IRUGO, proc_cgroup_operations),
  #endif
- 	INF("oom_score", S_IRUGO, proc_oom_score),
+ 	ONE("oom_score", S_IRUGO, proc_oom_score),
    REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adj_operations),
    REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
  #ifdef CONFIG_AUDITSYSCALL
@@@ -2954,10 -2927,10 +2921,10 @@@
    REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
  #endif
  #ifdef CONFIG_TASK_IO_ACCOUNTING
- 	INF("io",	S_IRUSR, proc_tid_io_accounting),
+ 	ONE("io",	S_IRUSR, proc_tid_io_accounting),
  #endif
  #ifdef CONFIG_HARDWALL
- 	INF("hardwall",   S_IRUGO, proc_pid_hardwall),
+ 	ONE("hardwall",   S_IRUGO, proc_pid_hardwall),
  #endif
  #ifdef CONFIG_USER_NS
    REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
@@@ -3027,7 -3000,7 +2994,7 @@@ static struct dentry *proc_task_lookup(
    if (!leader)
    	goto out_no_task;
- 	tid = name_to_int(dentry);
+ 	tid = name_to_int(&dentry->d_name);
    if (tid == ~0U)
    	goto out;
diff --combined fs/proc/fd.c
index eb82e9f,955bb55..e11d7c5
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@@ -129,6 -129,8 +129,6 @@@ static int tid_fd_revalidate(struct den
    	}
    	put_task_struct(task);
    }
 -
 -	d_drop(dentry);
    return 0;
  }
@@@ -204,7 -206,7 +204,7 @@@ static struct dentry *proc_lookupfd_com
  {
    struct task_struct *task = get_proc_task(dir);
    int result = -ENOENT;
- 	unsigned fd = name_to_int(dentry);
+ 	unsigned fd = name_to_int(&dentry->d_name);
if (!task)
    	goto out_no_task;
diff --combined include/linux/fs.h
index 2daccaf,8b4a021..1ab6c69
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@@ -833,7 -833,7 +833,7 @@@ static inline struct file *get_file(str
   *
   * Lockd stuffs a "host" pointer into this.
   */
 -typedef struct files_struct *fl_owner_t;
 +typedef void *fl_owner_t;
struct file_lock_operations {
    void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
@@@ -2688,7 -2688,7 +2688,7 @@@ static const struct file_operations __f
    .read	 = simple_attr_read,					\
    .write	 = simple_attr_write,					\
    .llseek	 = generic_file_llseek,					\
- };
+ }
static inline __printf(1, 2)
  void __simple_attr_check_format(const char *fmt, ...)
diff --combined include/linux/kernel.h
index a9e2268,44a498d..e989204
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@@ -470,6 -470,7 +470,7 @@@ extern enum system_states 
  #define TAINT_FIRMWARE_WORKAROUND	11
  #define TAINT_OOT_MODULE		12
  #define TAINT_UNSIGNED_MODULE		13
+ #define TAINT_SOFTLOCKUP		14
extern const char hex_asc[];
  #define hex_asc_lo(x)	hex_asc[((x) & 0x0f)]
@@@ -493,15 -494,10 +494,10 @@@ static inline char *hex_byte_pack_upper
    return buf;
  }
extern int hex_to_bin(char ch);
  extern int __must_check hex2bin(u8 *dst, const char *src, size_t count);
-int mac_pton(const char *s, u8 *mac);
 +bool mac_pton(const char *s, u8 *mac);
/*
   * General tracing related utility functions - trace_printk(),
@@@ -719,23 -715,8 +715,8 @@@ static inline void ftrace_dump(enum ftr
    (void) (&_max1 == &_max2);		\
    _max1 > _max2 ? _max1 : _max2; })
- #define min3(x, y, z) ({			\
- 	typeof(x) _min1 = (x);			\
- 	typeof(y) _min2 = (y);			\
- 	typeof(z) _min3 = (z);			\
- 	(void) (&_min1 == &_min2);		\
- 	(void) (&_min1 == &_min3);		\
- 	_min1 < _min2 ? (_min1 < _min3 ? _min1 : _min3) : \
- 		(_min2 < _min3 ? _min2 : _min3); })
- 
- #define max3(x, y, z) ({			\
- 	typeof(x) _max1 = (x);			\
- 	typeof(y) _max2 = (y);			\
- 	typeof(z) _max3 = (z);			\
- 	(void) (&_max1 == &_max2);		\
- 	(void) (&_max1 == &_max3);		\
- 	_max1 > _max2 ? (_max1 > _max3 ? _max1 : _max3) : \
- 		(_max2 > _max3 ? _max2 : _max3); })
+ #define min3(x, y, z) min((typeof(x))min(x, y), z)
+ #define max3(x, y, z) max((typeof(x))max(x, y), z)
/**
   * min_not_zero - return the minimum that is _not_ zero, unless both are zero
@@@ -750,20 -731,13 +731,13 @@@
  /**
   * clamp - return a value clamped to a given range with strict typechecking
   * @val: current value
-  * @min: minimum allowable value
-  * @max: maximum allowable value
+  * @lo: lowest allowable value
+  * @hi: highest allowable value
   *
   * This macro does strict typechecking of min/max to make sure they are of the
   * same type as val.  See the unnecessary pointer comparisons.
   */
- #define clamp(val, min, max) ({			\
- 	typeof(val) __val = (val);		\
- 	typeof(min) __min = (min);		\
- 	typeof(max) __max = (max);		\
- 	(void) (&__val == &__min);		\
- 	(void) (&__val == &__max);		\
- 	__val = __val < __min ? __min: __val;	\
- 	__val > __max ? __max: __val; })
+ #define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)
/*
   * ..and if you can't take the strict
diff --combined include/linux/sched.h
index fa964cf,b9d5364..89f531e
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -33,6 -33,7 +33,7 @@@ struct sched_param
#include <linux/smp.h>
  #include <linux/sem.h>
+ #include <linux/shm.h>
  #include <linux/signal.h>
  #include <linux/compiler.h>
  #include <linux/completion.h>
@@@ -1270,6 -1271,9 +1271,6 @@@ struct task_struct 
  #ifdef CONFIG_TREE_PREEMPT_RCU
    struct rcu_node *rcu_blocked_node;
  #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
 -#ifdef CONFIG_RCU_BOOST
 -	struct rt_mutex *rcu_boost_mutex;
 -#endif /* #ifdef CONFIG_RCU_BOOST */
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
    struct sched_info sched_info;
@@@ -1304,12 -1308,13 +1305,12 @@@
    			 * execve */
    unsigned in_iowait:1;
-	/* task may not gain privileges */
 -	unsigned no_new_privs:1;
 -
    /* Revert to default priority/policy when forking */
    unsigned sched_reset_on_fork:1;
    unsigned sched_contributes_to_load:1;
+	unsigned long atomic_flags; /* Flags needing atomic access. */
 +
    pid_t pid;
    pid_t tgid;
@@@ -1385,6 -1390,7 +1386,7 @@@
  #ifdef CONFIG_SYSVIPC
  /* ipc stuff */
    struct sysv_sem sysvsem;
+ 	struct sysv_shm sysvshm;
  #endif
  #ifdef CONFIG_DETECT_HUNG_TASK
  /* hung task detection */
@@@ -1436,6 -1442,8 +1438,6 @@@
    struct rb_node *pi_waiters_leftmost;
    /* Deadlock detection and priority inheritance handling */
    struct rt_mutex_waiter *pi_blocked_on;
 -	/* Top pi_waiters task */
 -	struct task_struct *pi_top_task;
  #endif
#ifdef CONFIG_DEBUG_MUTEXES
@@@ -1628,12 -1636,6 +1630,6 @@@
    unsigned long trace_recursion;
  #endif /* CONFIG_TRACING */
  #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */
- 	struct memcg_batch_info {
- 		int do_batch;	/* incremented when batch uncharge started */
- 		struct mem_cgroup *memcg; /* target memcg of uncharge */
- 		unsigned long nr_pages;	/* uncharged usage */
- 		unsigned long memsw_nr_pages; /* uncharged mem+swap usage */
- 	} memcg_batch;
    unsigned int memcg_kmem_skip_account;
    struct memcg_oom_info {
    	struct mem_cgroup *memcg;
@@@ -1961,19 -1963,6 +1957,19 @@@ static inline void memalloc_noio_restor
    current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags;
  }
+/* Per-process atomic flags. */
 +#define PFA_NO_NEW_PRIVS 0x00000001	/* May not gain new privileges. */
 +
 +static inline bool task_no_new_privs(struct task_struct *p)
 +{
 +	return test_bit(PFA_NO_NEW_PRIVS, &p->atomic_flags);
 +}
 +
 +static inline void task_set_no_new_privs(struct task_struct *p)
 +{
 +	set_bit(PFA_NO_NEW_PRIVS, &p->atomic_flags);
 +}
 +
  /*
   * task->jobctl flags
   */
@@@ -2016,6 -2005,9 +2012,6 @@@ static inline void rcu_copy_process(str
  #ifdef CONFIG_TREE_PREEMPT_RCU
    p->rcu_blocked_node = NULL;
  #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
 -#ifdef CONFIG_RCU_BOOST
 -	p->rcu_boost_mutex = NULL;
 -#endif /* #ifdef CONFIG_RCU_BOOST */
    INIT_LIST_HEAD(&p->rcu_node_entry);
  }
@@@ -2364,10 -2356,8 +2360,10 @@@ static inline int on_sig_stack(unsigne
static inline int sas_ss_flags(unsigned long sp)
  {
 -	return (current->sas_ss_size == 0 ? SS_DISABLE
 -		: on_sig_stack(sp) ? SS_ONSTACK : 0);
 +	if (!current->sas_ss_size)
 +		return SS_DISABLE;
 +
 +	return on_sig_stack(sp) ? SS_ONSTACK : 0;
  }
static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig)
@@@ -2794,7 -2784,7 +2790,7 @@@ static inline bool __must_check current
/*
     * Polling state must be visible before we test NEED_RESCHED,
 -	 * paired by resched_task()
 +	 * paired by resched_curr()
     */
    smp_mb__after_atomic();
@@@ -2812,7 -2802,7 +2808,7 @@@ static inline bool __must_check current
/*
     * Polling state must be visible before we test NEED_RESCHED,
 -	 * paired by resched_task()
 +	 * paired by resched_curr()
     */
    smp_mb__after_atomic();
@@@ -2844,7 -2834,7 +2840,7 @@@ static inline void current_clr_polling(
     * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also
     * fold.
     */
 -	smp_mb(); /* paired with resched_task() */
 +	smp_mb(); /* paired with resched_curr() */
preempt_fold_need_resched();
  }
@@@ -2969,15 -2959,10 +2965,10 @@@ static inline void inc_syscw(struct tas
#ifdef CONFIG_MEMCG
  extern void mm_update_next_owner(struct mm_struct *mm);
- extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
  #else
  static inline void mm_update_next_owner(struct mm_struct *mm)
  {
  }
- 
- static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
- {
- }
  #endif /* CONFIG_MEMCG */
static inline unsigned long task_rlimit(const struct task_struct *tsk,
diff --combined include/scsi/scsi.h
index 91e2e42,d34cf2d..4b69139
--- a/include/scsi/scsi.h
+++ b/include/scsi/scsi.h
@@@ -31,7 -31,7 +31,7 @@@ enum scsi_timeouts 
   * Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit
   * is totally arbitrary, a setting of 2048 will get you at least 8mb ios.
   */
- #ifdef ARCH_HAS_SG_CHAIN
+ #ifdef CONFIG_ARCH_HAS_SG_CHAIN
  #define SCSI_MAX_SG_CHAIN_SEGMENTS	2048
  #else
  #define SCSI_MAX_SG_CHAIN_SEGMENTS	SCSI_MAX_SG_SEGMENTS
@@@ -385,7 -385,7 +385,7 @@@ struct scsi_lun 
  #define SCSI_W_LUN_ACCESS_CONTROL (SCSI_W_LUN_BASE + 2)
  #define SCSI_W_LUN_TARGET_LOG_PAGE (SCSI_W_LUN_BASE + 3)
-static inline int scsi_is_wlun(unsigned int lun)
 +static inline int scsi_is_wlun(u64 lun)
  {
    return (lun & 0xff00) == SCSI_W_LUN_BASE;
  }
diff --combined init/Kconfig
index 85fb985,77dc4cb..d3ef635
--- a/init/Kconfig
+++ b/init/Kconfig
@@@ -505,7 -505,7 +505,7 @@@ config PREEMPT_RC
    def_bool TREE_PREEMPT_RCU
    help
      This option enables preemptible-RCU code that is common between
 -	  the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations.
 +	  TREE_PREEMPT_RCU and, in the old days, TINY_PREEMPT_RCU.
config RCU_STALL_COMMON
    def_bool ( TREE_RCU || TREE_PREEMPT_RCU || RCU_TRACE )
@@@ -737,7 -737,7 +737,7 @@@ choic
config RCU_NOCB_CPU_NONE
    bool "No build_forced no-CBs CPUs"
 -	depends on RCU_NOCB_CPU && !NO_HZ_FULL
 +	depends on RCU_NOCB_CPU && !NO_HZ_FULL_ALL
    help
      This option does not force any of the CPUs to be no-CBs CPUs.
      Only CPUs designated by the rcu_nocbs= boot parameter will be
@@@ -751,7 -751,7 +751,7 @@@
config RCU_NOCB_CPU_ZERO
    bool "CPU 0 is a build_forced no-CBs CPU"
 -	depends on RCU_NOCB_CPU && !NO_HZ_FULL
 +	depends on RCU_NOCB_CPU && !NO_HZ_FULL_ALL
    help
      This option forces CPU 0 to be a no-CBs CPU, so that its RCU
      callbacks are invoked by a per-CPU kthread whose name begins
@@@ -807,15 -807,53 +807,53 @@@ config LOG_BUF_SHIF
    range 12 21
    default 17
    help
- 	  Select kernel log buffer size as a power of 2.
+ 	  Select the minimal kernel log buffer size as a power of 2.
+ 	  The final size is affected by LOG_CPU_MAX_BUF_SHIFT config
+ 	  parameter, see below. Any higher size also might be forced
+ 	  by "log_buf_len" boot parameter.
+ 
      Examples:
- 	  	     17 => 128 KB
+ 		     17 => 128 KB
    	     16 => 64 KB
- 	             15 => 32 KB
- 	             14 => 16 KB
+ 		     15 => 32 KB
+ 		     14 => 16 KB
    	     13 =>  8 KB
    	     12 =>  4 KB
+ config LOG_CPU_MAX_BUF_SHIFT
+ 	int "CPU kernel log buffer size contribution (13 => 8 KB, 17 => 128KB)"
+ 	range 0 21
+ 	default 12 if !BASE_SMALL
+ 	default 0 if BASE_SMALL
+ 	help
+ 	  This option allows to increase the default ring buffer size
+ 	  according to the number of CPUs. The value defines the contribution
+ 	  of each CPU as a power of 2. The used space is typically only few
+ 	  lines however it might be much more when problems are reported,
+ 	  e.g. backtraces.
+ 
+ 	  The increased size means that a new buffer has to be allocated and
+ 	  the original static one is unused. It makes sense only on systems
+ 	  with more CPUs. Therefore this value is used only when the sum of
+ 	  contributions is greater than the half of the default kernel ring
+ 	  buffer as defined by LOG_BUF_SHIFT. The default values are set
+ 	  so that more than 64 CPUs are needed to trigger the allocation.
+ 
+ 	  Also this option is ignored when "log_buf_len" kernel parameter is
+ 	  used as it forces an exact (power of two) size of the ring buffer.
+ 
+ 	  The number of possible CPUs is used for this computation ignoring
+ 	  hotplugging making the computation optimal for the worst case
+ 	  scenario while allowing a simple algorithm to be used from bootup.
+ 
+ 	  Examples shift values and their meaning:
+ 		     17 => 128 KB for each CPU
+ 		     16 =>  64 KB for each CPU
+ 		     15 =>  32 KB for each CPU
+ 		     14 =>  16 KB for each CPU
+ 		     13 =>   8 KB for each CPU
+ 		     12 =>   4 KB for each CPU
+ 
  #
  # Architectures with an unreliable sched_clock() should select this:
  #
@@@ -1264,77 -1302,6 +1302,77 @@@ config CC_OPTIMIZE_FOR_SIZ
If unsure, say N.
+config LTO_MENU
 +	bool "Enable gcc link time optimization (LTO)"
 +	# Only tested on X86 for now. For other architectures you likely
 +	# have to fix some things first, like adding asmlinkages etc.
 +	depends on X86
 +	# lto does not support excluding flags for specific files
 +	# right now. Can be removed if that is fixed.
 +	depends on !FUNCTION_TRACER
 +	help
 +	  With this option gcc will do whole program optimizations for
 +	  the whole kernel and module. This increases compile time, but can
 +	  lead to better code. It allows gcc to inline functions between
 +	  different files and do other optimization.  It might also trigger
 +	  bugs due to more aggressive optimization. It allows gcc to drop unused
 +	  code. On smaller monolithic kernel configurations
 +	  it usually leads to smaller kernels, especially when modules
 +	  are disabled.
 +
 +	  With this option gcc will also do some global checking over
 +	  different source files. It also disables a number of kernel
 +	  features.
 +
 +	  This option is recommended for release builds. With LTO
 +	  the kernel always has to be re-optimized (but not re-parsed)
 +	  on each build.
 +
 +	  This requires a gcc 4.8 or later compiler and
 +	  Linux binutils 2.21.51.0.3 or later.  gcc 4.9 builds significantly
 +	  faster than 4.8. It does not currently work with a FSF release of
 +	  binutils or with the gold linker.
 +
 +	  On larger configurations this may need more than 4GB of RAM.
 +	  It will likely not work on those with a 32bit compiler.
 +
 +	  When the toolchain support is not available this will (hopefully)
 +	  be automatically disabled.
 +
 +	  For more information see Documentation/lto-build
 +
 +config LTO_DISABLE
 +         bool "Disable LTO again"
 +         depends on LTO_MENU
 +         default n
 +         help
 +           This option is merely here so that allyesconfig or allmodconfig do
 +           not enable LTO. If you want to actually use LTO do not enable.
 +
 +config LTO
 +	bool
 +	default y
 +	depends on LTO_MENU && !LTO_DISABLE
 +
 +config LTO_DEBUG
 +	bool "Enable LTO compile time debugging"
 +	depends on LTO
 +	help
 +	  Enable LTO debugging in the compiler. The compiler dumps
 +	  some log files that make it easier to figure out LTO
 +	  behavior. The log files also allow to reconstruct
 +	  the global inlining and a global callgraph.
 +	  They however add some (single threaded) cost to the
 +	  compilation.  When in doubt do not enable.
 +
 +config LTO_CP_CLONE
 +	bool "Allow aggressive cloning for function specialization"
 +	depends on LTO
 +	help
 +	  Allow the compiler to clone and specialize functions for specific
 +	  arguments when it determines these arguments are very commonly
 +	  called.  Experimental. Will increase text size.
 +
  config SYSCTL
    bool
@@@ -1834,8 -1801,6 +1872,8 @@@ config MODULE_FORCE_UNLOA
config MODVERSIONS
    bool "Module versioning support"
 +	# LTO should work with gcc 4.9
 +	depends on !LTO
    help
      Usually, you have to use modules compiled with your kernel.
      Saying Y here makes it sometimes possible to use modules
diff --combined kernel/acct.c
index 3cec8c4,1bfdda0..98c4a20
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@@ -93,7 -93,6 +93,7 @@@ struct bsd_acct_struct
static DEFINE_SPINLOCK(acct_lock);
  static LIST_HEAD(acct_list);
 +static LIST_HEAD(acct_close_list);
/*
   * Check the amount of free space and suspend/resume accordingly.
@@@ -142,12 -141,12 +142,12 @@@ static int check_free_space(struct bsd_
    if (acct->active) {
    	if (act < 0) {
    		acct->active = 0;
- 			printk(KERN_INFO "Process accounting paused\n");
+ 			pr_info("Process accounting paused\n");
    	}
    } else {
    	if (act > 0) {
    		acct->active = 1;
- 			printk(KERN_INFO "Process accounting resumed\n");
+ 			pr_info("Process accounting resumed\n");
    	}
    }
@@@ -262,6 -261,7 +262,7 @@@ SYSCALL_DEFINE1(acct, const char __use
if (name) {
    	struct filename *tmp = getname(name);
+ 
    	if (IS_ERR(tmp))
    		return PTR_ERR(tmp);
    	error = acct_on(tmp);
@@@ -281,20 -281,6 +282,20 @@@
    return error;
  }
+static void acct_close_mnts(struct work_struct *unused)
 +{
 +	struct bsd_acct_struct *acct;
 +
 +	spin_lock(&acct_lock);
 +restart:
 +	list_for_each_entry(acct, &acct_close_list, list) {
 +		acct_file_reopen(acct, NULL, NULL);
 +		goto restart;
 +	}
 +	spin_unlock(&acct_lock);
 +}
 +static DECLARE_WORK(acct_close_work, acct_close_mnts);
 +
  /**
   * acct_auto_close - turn off a filesystem's accounting if it is on
   * @m: vfsmount being shut down
@@@ -304,15 -290,15 +305,15 @@@
   */
  void acct_auto_close_mnt(struct vfsmount *m)
  {
 -	struct bsd_acct_struct *acct;
 +	struct bsd_acct_struct *acct, *tmp;
spin_lock(&acct_lock);
 -restart:
 -	list_for_each_entry(acct, &acct_list, list)
 +	list_for_each_entry_safe(acct, tmp, &acct_list, list) {
    	if (acct->file && acct->file->f_path.mnt == m) {
 -			acct_file_reopen(acct, NULL, NULL);
 -			goto restart;
 +			list_move_tail(&acct->list, &acct_close_list);
 +			schedule_work(&acct_close_work);
    	}
 +	}
    spin_unlock(&acct_lock);
  }
@@@ -391,7 -377,7 +392,7 @@@ static comp_t encode_comp_t(unsigned lo
    return exp;
  }
- #if ACCT_VERSION==1 || ACCT_VERSION==2
+ #if ACCT_VERSION == 1 || ACCT_VERSION == 2
  /*
   * encode an u64 into a comp2_t (24 bits)
   *
@@@ -404,7 -390,7 +405,7 @@@
  #define MANTSIZE2       20                      /* 20 bit mantissa. */
  #define EXPSIZE2        5                       /* 5 bit base 2 exponent. */
  #define MAXFRACT2       ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */
- #define MAXEXP2         ((1 <<EXPSIZE2) - 1)    /* Maximum exponent. */
+ #define MAXEXP2         ((1 << EXPSIZE2) - 1)    /* Maximum exponent. */
static comp2_t encode_comp2_t(u64 value)
  {
@@@ -435,7 -421,7 +436,7 @@@
  }
  #endif
- #if ACCT_VERSION==3
+ #if ACCT_VERSION == 3
  /*
   * encode an u64 into a 32 bit IEEE float
   */
@@@ -444,8 -430,9 +445,9 @@@ static u32 encode_float(u64 value
    unsigned exp = 190;
    unsigned u;
- 	if (value==0) return 0;
- 	while ((s64)value > 0){
+ 	if (value == 0)
+ 		return 0;
+ 	while ((s64)value > 0) {
    	value <<= 1;
    	exp--;
    }
@@@ -499,22 -486,23 +501,23 @@@ static void do_acct_process(struct bsd_
    strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
/* calculate run_time in nsec*/
 -	do_posix_clock_monotonic_gettime(&uptime);
 +	ktime_get_ts(&uptime);
    run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
    run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC
    	       + current->group_leader->start_time.tv_nsec;
    /* convert nsec -> AHZ */
    elapsed = nsec_to_AHZ(run_time);
- #if ACCT_VERSION==3
+ #if ACCT_VERSION == 3
    ac.ac_etime = encode_float(elapsed);
  #else
    ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
- 	                       (unsigned long) elapsed : (unsigned long) -1l);
+ 				(unsigned long) elapsed : (unsigned long) -1l);
  #endif
- #if ACCT_VERSION==1 || ACCT_VERSION==2
+ #if ACCT_VERSION == 1 || ACCT_VERSION == 2
    {
    	/* new enlarged etime field */
    	comp2_t etime = encode_comp2_t(elapsed);
+ 
    	ac.ac_etime_hi = etime >> 16;
    	ac.ac_etime_lo = (u16) etime;
    }
@@@ -524,15 -512,15 +527,15 @@@
    /* we really need to bite the bullet and change layout */
    ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
    ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
- #if ACCT_VERSION==2
+ #if ACCT_VERSION == 2
    ac.ac_ahz = AHZ;
  #endif
- #if ACCT_VERSION==1 || ACCT_VERSION==2
+ #if ACCT_VERSION == 1 || ACCT_VERSION == 2
    /* backward-compatible 16 bit fields */
    ac.ac_uid16 = ac.ac_uid;
    ac.ac_gid16 = ac.ac_gid;
  #endif
- #if ACCT_VERSION==3
+ #if ACCT_VERSION == 3
    ac.ac_pid = task_tgid_nr_ns(current, ns);
    rcu_read_lock();
    ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
@@@ -593,6 -581,7 +596,7 @@@ void acct_collect(long exitcode, int gr
if (group_dead && current->mm) {
    	struct vm_area_struct *vma;
+ 
    	down_read(&current->mm->mmap_sem);
    	vma = current->mm->mmap;
    	while (vma) {
diff --combined kernel/fork.c
index 7657301,735ea98..38dcf83
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@@ -315,15 -315,6 +315,15 @@@ static struct task_struct *dup_task_str
    	goto free_ti;
tsk->stack = ti;
 +#ifdef CONFIG_SECCOMP
 +	/*
 +	 * We must handle setting up seccomp filters once we're under
 +	 * the sighand lock in case orig has changed between now and
 +	 * then. Until then, filter must be NULL to avoid messing up
 +	 * the usage counts on the error path calling free_task.
 +	 */
 +	tsk->seccomp.filter = NULL;
 +#endif
setup_thread_stack(tsk, orig);
    clear_user_return_notifier(tsk);
@@@ -374,12 -365,11 +374,11 @@@ static int dup_mmap(struct mm_struct *m
     */
    down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
- 	mm->locked_vm = 0;
- 	mm->mmap = NULL;
- 	mm->vmacache_seqnum = 0;
- 	mm->map_count = 0;
- 	cpumask_clear(mm_cpumask(mm));
- 	mm->mm_rb = RB_ROOT;
+ 	mm->total_vm = oldmm->total_vm;
+ 	mm->shared_vm = oldmm->shared_vm;
+ 	mm->exec_vm = oldmm->exec_vm;
+ 	mm->stack_vm = oldmm->stack_vm;
+ 
    rb_link = &mm->mm_rb.rb_node;
    rb_parent = NULL;
    pprev = &mm->mmap;
@@@ -536,19 -526,37 +535,37 @@@ static void mm_init_aio(struct mm_struc
  #endif
  }
+ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
+ {
+ #ifdef CONFIG_MEMCG
+ 	mm->owner = p;
+ #endif
+ }
+ 
  static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
  {
+ 	mm->mmap = NULL;
+ 	mm->mm_rb = RB_ROOT;
+ 	mm->vmacache_seqnum = 0;
    atomic_set(&mm->mm_users, 1);
    atomic_set(&mm->mm_count, 1);
    init_rwsem(&mm->mmap_sem);
    INIT_LIST_HEAD(&mm->mmlist);
    mm->core_state = NULL;
    atomic_long_set(&mm->nr_ptes, 0);
+ 	mm->map_count = 0;
+ 	mm->locked_vm = 0;
+ 	mm->pinned_vm = 0;
    memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
    spin_lock_init(&mm->page_table_lock);
+ 	mm_init_cpumask(mm);
    mm_init_aio(mm);
    mm_init_owner(mm, p);
+ 	mmu_notifier_mm_init(mm);
    clear_tlb_flush_pending(mm);
+ #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+ 	mm->pmd_huge_pte = NULL;
+ #endif
if (current->mm) {
    	mm->flags = current->mm->flags & MMF_INIT_MASK;
@@@ -558,11 -566,17 +575,17 @@@
    	mm->def_flags = 0;
    }
- 	if (likely(!mm_alloc_pgd(mm))) {
- 		mmu_notifier_mm_init(mm);
- 		return mm;
- 	}
+ 	if (mm_alloc_pgd(mm))
+ 		goto fail_nopgd;
+ 
+ 	if (init_new_context(p, mm))
+ 		goto fail_nocontext;
+ 
+ 	return mm;
+ fail_nocontext:
+ 	mm_free_pgd(mm);
+ fail_nopgd:
    free_mm(mm);
    return NULL;
  }
@@@ -596,7 -610,6 +619,6 @@@ struct mm_struct *mm_alloc(void
    	return NULL;
memset(mm, 0, sizeof(*mm));
- 	mm_init_cpumask(mm);
    return mm_init(mm, current);
  }
@@@ -828,17 -841,10 +850,10 @@@ static struct mm_struct *dup_mm(struct 
    	goto fail_nomem;
memcpy(mm, oldmm, sizeof(*mm));
- 	mm_init_cpumask(mm);
if (!mm_init(mm, tsk))
    	goto fail_nomem;
- 	if (init_new_context(tsk, mm))
- 		goto fail_nocontext;
- 
    dup_mm_exe_file(oldmm, mm);
err = dup_mmap(mm, oldmm);
@@@ -860,15 -866,6 +875,6 @@@ free_pt
fail_nomem:
    return NULL;
- 
- fail_nocontext:
- 	/*
- 	 * If init_new_context() failed, we cannot use mmput() to free the mm
- 	 * because it calls destroy_context()
- 	 */
- 	mm_free_pgd(mm);
- 	free_mm(mm);
- 	return NULL;
  }
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
@@@ -1090,39 -1087,6 +1096,39 @@@ static int copy_signal(unsigned long cl
    return 0;
  }
+static void copy_seccomp(struct task_struct *p)
 +{
 +#ifdef CONFIG_SECCOMP
 +	/*
 +	 * Must be called with sighand->lock held, which is common to
 +	 * all threads in the group. Holding cred_guard_mutex is not
 +	 * needed because this new task is not yet running and cannot
 +	 * be racing exec.
 +	 */
 +	BUG_ON(!spin_is_locked(&current->sighand->siglock));
 +
 +	/* Ref-count the new filter user, and assign it. */
 +	get_seccomp_filter(current);
 +	p->seccomp = current->seccomp;
 +
 +	/*
 +	 * Explicitly enable no_new_privs here in case it got set
 +	 * between the task_struct being duplicated and holding the
 +	 * sighand lock. The seccomp state and nnp must be in sync.
 +	 */
 +	if (task_no_new_privs(current))
 +		task_set_no_new_privs(p);
 +
 +	/*
 +	 * If the parent gained a seccomp mode after copying thread
 +	 * flags and before we held the sighand lock, we have
 +	 * to manually enable the seccomp thread flag here.
 +	 */
 +	if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
 +		set_tsk_thread_flag(p, TIF_SECCOMP);
 +#endif
 +}
 +
  SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
  {
    current->clear_child_tid = tidptr;
@@@ -1137,16 -1101,10 +1143,9 @@@ static void rt_mutex_init_task(struct t
    p->pi_waiters = RB_ROOT;
    p->pi_waiters_leftmost = NULL;
    p->pi_blocked_on = NULL;
  #endif
  }
- #ifdef CONFIG_MEMCG
- void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
- {
- 	mm->owner = p;
- }
- #endif /* CONFIG_MEMCG */
- 
  /*
   * Initialize POSIX timer handling for a single task.
   */
@@@ -1237,6 -1195,7 +1236,6 @@@ static struct task_struct *copy_process
    	goto fork_out;
ftrace_graph_init_task(p);
 -	get_seccomp_filter(p);
rt_mutex_init_task(p);
@@@ -1302,7 -1261,7 +1301,7 @@@
posix_cpu_timers_init(p);
-	do_posix_clock_monotonic_gettime(&p->start_time);
 +	ktime_get_ts(&p->start_time);
    p->real_start_time = p->start_time;
    monotonic_to_bootbased(&p->real_start_time);
    p->io_context = NULL;
@@@ -1347,10 -1306,6 +1346,6 @@@
  #ifdef CONFIG_DEBUG_MUTEXES
    p->blocked_on = NULL; /* not blocked yet */
  #endif
- #ifdef CONFIG_MEMCG
- 	p->memcg_batch.do_batch = 0;
- 	p->memcg_batch.memcg = NULL;
- #endif
  #ifdef CONFIG_BCACHE
    p->sequential_io	= 0;
    p->sequential_io_avg	= 0;
@@@ -1368,6 -1323,7 +1363,7 @@@
    if (retval)
    	goto bad_fork_cleanup_policy;
    /* copy all the process information */
+ 	shm_init_task(p);
    retval = copy_semundo(clone_flags, p);
    if (retval)
    	goto bad_fork_cleanup_audit;
@@@ -1477,12 -1433,6 +1473,12 @@@
    spin_lock(&current->sighand->siglock);
/*
 +	 * Copy seccomp details explicitly here, in case they were changed
 +	 * before holding sighand lock.
 +	 */
 +	copy_seccomp(p);
 +
 +	/*
     * Process group and session signals need to be delivered to just the
     * parent before the fork or both the parent and the child after the
     * fork. Restart if a signal comes in before we add the new process to
@@@ -1919,6 -1869,11 +1915,11 @@@ SYSCALL_DEFINE1(unshare, unsigned long
    		 */
    		exit_sem(current);
    	}
+ 		if (unshare_flags & CLONE_NEWIPC) {
+ 			/* Orphan segments in old ns (see sem above). */
+ 			exit_shm(current);
+ 			shm_init_task(current);
+ 		}
if (new_nsproxy)
    		switch_task_namespaces(current, new_nsproxy);
diff --combined lib/Kconfig.debug
index 066936a,fd939e1..ff15fb6
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@@ -15,7 -15,7 +15,7 @@@ config PRINTK_TIM
      The behavior is also controlled by the kernel command line
      parameter printk.time=1. See Documentation/kernel-parameters.txt
- config DEFAULT_MESSAGE_LOGLEVEL
+ config MESSAGE_LOGLEVEL_DEFAULT
    int "Default message log level (1-7)"
    range 1 7
    default "4"
@@@ -180,7 -180,7 +180,7 @@@ config STRIP_ASM_SYM
config READABLE_ASM
          bool "Generate readable assembler code"
 -        depends on DEBUG_KERNEL
 +        depends on DEBUG_KERNEL && !LTO
          help
            Disable some compiler optimizations that tend to generate human unreadable
            assembler output. This may make the kernel slightly slower, but it helps
@@@ -835,7 -835,7 +835,7 @@@ config DEBUG_RT_MUTEXE
config RT_MUTEX_TESTER
    bool "Built-in scriptable tester for rt-mutexes"
 -	depends on DEBUG_KERNEL && RT_MUTEXES
 +	depends on DEBUG_KERNEL && RT_MUTEXES && BROKEN
    help
      This option enables a rt-mutex tester.
@@@ -1131,6 -1131,20 +1131,6 @@@ config PROVE_RCU_REPEATEDL
Say N if you are unsure.
-config PROVE_RCU_DELAY
 -	bool "RCU debugging: preemptible RCU race provocation"
 -	depends on DEBUG_KERNEL && PREEMPT_RCU
 -	default n
 -	help
 -	 There is a class of races that involve an unlikely preemption
 -	 of __rcu_read_unlock() just after ->rcu_read_lock_nesting has
 -	 been set to INT_MIN.  This feature inserts a delay at that
 -	 point to increase the probability of these races.
 -
 -	 Say Y to increase probability of preemption of __rcu_read_unlock().
 -
 -	 Say N if you are unsure.
 -
  config SPARSE_RCU_POINTER
    bool "RCU debugging: sparse-based checks for pointer usage"
    default n
@@@ -1635,19 -1649,6 +1635,19 @@@ config TEST_BP
If unsure, say N.
+config TEST_FIRMWARE
 +	tristate "Test firmware loading via userspace interface"
 +	default n
 +	depends on FW_LOADER
 +	help
 +	  This builds the "test_firmware" module that creates a userspace
 +	  interface for testing firmware loading. This can be used to
 +	  control the triggering of firmware loading without needing an
 +	  actual firmware-using device. The contents can be rechecked by
 +	  userspace.
 +
 +	  If unsure, say N.
 +
  source "samples/Kconfig"
source "lib/Kconfig.kgdb"
diff --combined lib/Makefile
index 230b4b1,e48067c..44dbcee
--- a/lib/Makefile
+++ b/lib/Makefile
@@@ -34,7 -34,6 +34,7 @@@ obj-$(CONFIG_TEST_KSTRTOX) += test-kstr
  obj-$(CONFIG_TEST_MODULE) += test_module.o
  obj-$(CONFIG_TEST_USER_COPY) += test_user_copy.o
  obj-$(CONFIG_TEST_BPF) += test_bpf.o
 +obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o
ifeq ($(CONFIG_DEBUG_KOBJECT),y)
  CFLAGS_kobject.o += -DDEBUG
@@@ -72,6 -71,7 +72,7 @@@ obj-$(CONFIG_CRC32)	+= crc32.
  obj-$(CONFIG_CRC7)	+= crc7.o
  obj-$(CONFIG_LIBCRC32C)	+= libcrc32c.o
  obj-$(CONFIG_CRC8)	+= crc8.o
+ obj-$(CONFIG_CRC64_ECMA)	+= crc64_ecma.o
  obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/
@@@ -137,6 -137,8 +138,8 @@@ obj-$(CONFIG_CORDIC) += cordic.
obj-$(CONFIG_DQL) += dynamic_queue_limits.o
+ obj-$(CONFIG_GLOB) += glob.o
+ 
  obj-$(CONFIG_MPILIB) += mpi/
  obj-$(CONFIG_SIGNATURE) += digsig.o
diff --combined mm/filemap.c
index d175917,fb74fb8..367ea2c
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@@ -31,6 -31,7 +31,7 @@@
  #include <linux/security.h>
  #include <linux/cpuset.h>
  #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
+ #include <linux/hugetlb.h>
  #include <linux/memcontrol.h>
  #include <linux/cleancache.h>
  #include <linux/rmap.h>
@@@ -233,7 -234,6 +234,6 @@@ void delete_from_page_cache(struct pag
    spin_lock_irq(&mapping->tree_lock);
    __delete_from_page_cache(page, NULL);
    spin_unlock_irq(&mapping->tree_lock);
- 	mem_cgroup_uncharge_cache_page(page);
if (freepage)
    	freepage(page);
@@@ -241,6 -241,18 +241,6 @@@
  }
  EXPORT_SYMBOL(delete_from_page_cache);
-static int sleep_on_page(void *word)
 -{
 -	io_schedule();
 -	return 0;
 -}
 -
 -static int sleep_on_page_killable(void *word)
 -{
 -	sleep_on_page(word);
 -	return fatal_signal_pending(current) ? -EINTR : 0;
 -}
 -
  static int filemap_check_errors(struct address_space *mapping)
  {
    int ret = 0;
@@@ -489,8 -501,7 +489,7 @@@ int replace_page_cache_page(struct pag
    	if (PageSwapBacked(new))
    		__inc_zone_page_state(new, NR_SHMEM);
    	spin_unlock_irq(&mapping->tree_lock);
- 		/* mem_cgroup codes must not be called under tree_lock */
- 		mem_cgroup_replace_page_cache(old, new);
+ 		mem_cgroup_migrate(old, new, true);
    	radix_tree_preload_end();
    	if (freepage)
    		freepage(old);
@@@ -548,19 -559,24 +547,24 @@@ static int __add_to_page_cache_locked(s
    			      pgoff_t offset, gfp_t gfp_mask,
    			      void **shadowp)
  {
+ 	int huge = PageHuge(page);
+ 	struct mem_cgroup *memcg;
    int error;
VM_BUG_ON_PAGE(!PageLocked(page), page);
    VM_BUG_ON_PAGE(PageSwapBacked(page), page);
- 	error = mem_cgroup_charge_file(page, current->mm,
- 					gfp_mask & GFP_RECLAIM_MASK);
- 	if (error)
- 		return error;
+ 	if (!huge) {
+ 		error = mem_cgroup_try_charge(page, current->mm,
+ 					      gfp_mask, &memcg);
+ 		if (error)
+ 			return error;
+ 	}
error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
    if (error) {
- 		mem_cgroup_uncharge_cache_page(page);
+ 		if (!huge)
+ 			mem_cgroup_cancel_charge(page, memcg);
    	return error;
    }
@@@ -575,13 -591,16 +579,16 @@@
    	goto err_insert;
    __inc_zone_page_state(page, NR_FILE_PAGES);
    spin_unlock_irq(&mapping->tree_lock);
+ 	if (!huge)
+ 		mem_cgroup_commit_charge(page, memcg, false);
    trace_mm_filemap_add_to_page_cache(page);
    return 0;
  err_insert:
    page->mapping = NULL;
    /* Leave page->index set: truncation relies upon it */
    spin_unlock_irq(&mapping->tree_lock);
- 	mem_cgroup_uncharge_cache_page(page);
+ 	if (!huge)
+ 		mem_cgroup_cancel_charge(page, memcg);
    page_cache_release(page);
    return error;
  }
@@@ -680,7 -699,7 +687,7 @@@ void wait_on_page_bit(struct page *page
    DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
if (test_bit(bit_nr, &page->flags))
 -		__wait_on_bit(page_waitqueue(page), &wait, sleep_on_page,
 +		__wait_on_bit(page_waitqueue(page), &wait, bit_wait_io,
    						TASK_UNINTERRUPTIBLE);
  }
  EXPORT_SYMBOL(wait_on_page_bit);
@@@ -693,7 -712,7 +700,7 @@@ int wait_on_page_bit_killable(struct pa
    	return 0;
return __wait_on_bit(page_waitqueue(page), &wait,
 -			     sleep_on_page_killable, TASK_KILLABLE);
 +			     bit_wait_io, TASK_KILLABLE);
  }
/**
@@@ -794,7 -813,7 +801,7 @@@ void __lock_page(struct page *page
  {
    DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
-	__wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page,
 +	__wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io,
    						TASK_UNINTERRUPTIBLE);
  }
  EXPORT_SYMBOL(__lock_page);
@@@ -804,10 -823,21 +811,21 @@@ int __lock_page_killable(struct page *p
    DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
return __wait_on_bit_lock(page_waitqueue(page), &wait,
 -					sleep_on_page_killable, TASK_KILLABLE);
 +					bit_wait_io, TASK_KILLABLE);
  }
  EXPORT_SYMBOL_GPL(__lock_page_killable);
+ /*
+  * Return values:
+  * 1 - page is locked; mmap_sem is still held.
+  * 0 - page is not locked.
+  *     mmap_sem has been released (up_read()), unless flags had both
+  *     FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
+  *     which case mmap_sem is still held.
+  *
+  * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1
+  * with the page locked and the mmap_sem unperturbed.
+  */
  int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
    		 unsigned int flags)
  {
@@@ -1088,9 -1118,9 +1106,9 @@@ no_page
    	if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
    		fgp_flags |= FGP_LOCK;
- 		/* Init accessed so avoit atomic mark_page_accessed later */
+ 		/* Init accessed so avoid atomic mark_page_accessed later */
    	if (fgp_flags & FGP_ACCESSED)
- 			init_page_accessed(page);
+ 			__SetPageReferenced(page);
err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask);
    	if (unlikely(err)) {
@@@ -1824,6 -1854,18 +1842,18 @@@ static void do_async_mmap_readahead(str
   * The goto's are kind of ugly, but this streamlines the normal case of having
   * it in the page cache, and handles the special cases reasonably without
   * having a lot of duplicated code.
+  *
+  * vma->vm_mm->mmap_sem must be held on entry.
+  *
+  * If our return value has VM_FAULT_RETRY set, it's because
+  * lock_page_or_retry() returned 0.
+  * The mmap_sem has usually been released in this case.
+  * See __lock_page_or_retry() for the exception.
+  *
+  * If our return value does not have VM_FAULT_RETRY set, the mmap_sem
+  * has not been released.
+  *
+  * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
   */
  int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
  {
diff --combined mm/memcontrol.c
index 45c10c6,d44bf3e..6f81411
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@@ -648,10 -648,8 +648,8 @@@ EXPORT_SYMBOL(memcg_kmem_enabled_key)
static void disarm_kmem_keys(struct mem_cgroup *memcg)
  {
- 	if (memcg_kmem_is_active(memcg)) {
+ 	if (memcg_kmem_is_active(memcg))
    	static_key_slow_dec(&memcg_kmem_enabled_key);
- 		ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
- 	}
    /*
     * This check can't live in kmem destruction function,
     * since the charges will outlive the cgroup
@@@ -754,9 -752,11 +752,11 @@@ static void __mem_cgroup_remove_exceede
  static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
    			       struct mem_cgroup_tree_per_zone *mctz)
  {
- 	spin_lock(&mctz->lock);
+ 	unsigned long flags;
+ 
+ 	spin_lock_irqsave(&mctz->lock, flags);
    __mem_cgroup_remove_exceeded(mz, mctz);
- 	spin_unlock(&mctz->lock);
+ 	spin_unlock_irqrestore(&mctz->lock, flags);
  }
@@@ -779,7 -779,9 +779,9 @@@ static void mem_cgroup_update_tree(stru
    	 * mem is over its softlimit.
    	 */
    	if (excess || mz->on_tree) {
- 			spin_lock(&mctz->lock);
+ 			unsigned long flags;
+ 
+ 			spin_lock_irqsave(&mctz->lock, flags);
    		/* if on-tree, remove it */
    		if (mz->on_tree)
    			__mem_cgroup_remove_exceeded(mz, mctz);
@@@ -788,7 -790,7 +790,7 @@@
    		 * If excess is 0, no tree ops.
    		 */
    		__mem_cgroup_insert_exceeded(mz, mctz, excess);
- 			spin_unlock(&mctz->lock);
+ 			spin_unlock_irqrestore(&mctz->lock, flags);
    	}
    }
  }
@@@ -839,9 -841,9 +841,9 @@@ mem_cgroup_largest_soft_limit_node(stru
  {
    struct mem_cgroup_per_zone *mz;
- 	spin_lock(&mctz->lock);
+ 	spin_lock_irq(&mctz->lock);
    mz = __mem_cgroup_largest_soft_limit_node(mctz);
- 	spin_unlock(&mctz->lock);
+ 	spin_unlock_irq(&mctz->lock);
    return mz;
  }
@@@ -882,13 -884,6 +884,6 @@@ static long mem_cgroup_read_stat(struc
    return val;
  }
- static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
- 					 bool charge)
- {
- 	int val = (charge) ? 1 : -1;
- 	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
- }
- 
  static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
    				    enum mem_cgroup_events_index idx)
  {
@@@ -909,13 -904,13 +904,13 @@@
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
    				 struct page *page,
- 					 bool anon, int nr_pages)
+ 					 int nr_pages)
  {
    /*
     * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
     * counted as CACHE even if it's on ANON LRU.
     */
- 	if (anon)
+ 	if (PageAnon(page))
    	__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
    			nr_pages);
    else
@@@ -1013,7 -1008,6 +1008,6 @@@ static bool mem_cgroup_event_ratelimit(
   */
  static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
  {
- 	preempt_disable();
    /* threshold event is triggered in finer grain than soft limit */
    if (unlikely(mem_cgroup_event_ratelimit(memcg,
    					MEM_CGROUP_TARGET_THRESH))) {
@@@ -1026,8 -1020,6 +1020,6 @@@
    	do_numainfo = mem_cgroup_event_ratelimit(memcg,
    					MEM_CGROUP_TARGET_NUMAINFO);
  #endif
- 		preempt_enable();
- 
    	mem_cgroup_threshold(memcg);
    	if (unlikely(do_softlimit))
    		mem_cgroup_update_tree(memcg, page);
@@@ -1035,8 -1027,7 +1027,7 @@@
    	if (unlikely(do_numainfo))
    		atomic_inc(&memcg->numainfo_events);
  #endif
- 	} else
- 		preempt_enable();
+ 	}
  }
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
@@@ -1347,20 -1338,6 +1338,6 @@@ out
    return lruvec;
  }
- /*
-  * Following LRU functions are allowed to be used without PCG_LOCK.
-  * Operations are called by routine of global LRU independently from memcg.
-  * What we have to take care of here is validness of pc->mem_cgroup.
-  *
-  * Changes to pc->mem_cgroup happens when
-  * 1. charge
-  * 2. moving account
-  * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
-  * It is added to LRU before charge.
-  * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
-  * When moving account, the page is not on LRU. It's isolated.
-  */
- 
  /**
   * mem_cgroup_page_lruvec - return lruvec for adding an lru page
   * @page: the page
@@@ -2261,22 -2238,14 +2238,14 @@@ cleanup
   *
   * Notes: Race condition
   *
-  * We usually use lock_page_cgroup() for accessing page_cgroup member but
-  * it tends to be costly. But considering some conditions, we doesn't need
-  * to do so _always_.
+  * Charging occurs during page instantiation, while the page is
+  * unmapped and locked in page migration, or while the page table is
+  * locked in THP migration.  No race is possible.
   *
-  * Considering "charge", lock_page_cgroup() is not required because all
-  * file-stat operations happen after a page is attached to radix-tree. There
-  * are no race with "charge".
+  * Uncharge happens to pages with zero references, no race possible.
   *
-  * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
-  * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
-  * if there are race with "uncharge". Statistics itself is properly handled
-  * by flags.
-  *
-  * Considering "move", this is an only case we see a race. To make the race
-  * small, we check memcg->moving_account and detect there are possibility
-  * of race or not. If there is, we take a lock.
+  * Charge moving between groups is protected by checking
+  * memcg->moving_account and taking the move_lock in the slowpath.
   */
void __mem_cgroup_begin_update_page_stat(struct page *page,
@@@ -2551,55 -2520,63 +2520,63 @@@ static int memcg_cpu_hotplug_callback(s
    return NOTIFY_OK;
  }
- 
- /* See mem_cgroup_try_charge() for details */
- enum {
- 	CHARGE_OK,		/* success */
- 	CHARGE_RETRY,		/* need to retry but retry is not bad */
- 	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
- 	CHARGE_WOULDBLOCK,	/* GFP_WAIT wasn't set and no enough res. */
- };
- 
- static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
- 				unsigned int nr_pages, unsigned int min_pages,
- 				bool invoke_oom)
+ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
+ 		      unsigned int nr_pages)
  {
- 	unsigned long csize = nr_pages * PAGE_SIZE;
+ 	unsigned int batch = max(CHARGE_BATCH, nr_pages);
+ 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
    struct mem_cgroup *mem_over_limit;
    struct res_counter *fail_res;
+ 	unsigned long nr_reclaimed;
    unsigned long flags = 0;
- 	int ret;
+ 	unsigned long long size;
+ 	int ret = 0;
- 	ret = res_counter_charge(&memcg->res, csize, &fail_res);
+ retry:
+ 	if (consume_stock(memcg, nr_pages))
+ 		goto done;
- 	if (likely(!ret)) {
+ 	size = batch * PAGE_SIZE;
+ 	if (!res_counter_charge(&memcg->res, size, &fail_res)) {
    	if (!do_swap_account)
- 			return CHARGE_OK;
- 		ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
- 		if (likely(!ret))
- 			return CHARGE_OK;
- 
- 		res_counter_uncharge(&memcg->res, csize);
+ 			goto done_restock;
+ 		if (!res_counter_charge(&memcg->memsw, size, &fail_res))
+ 			goto done_restock;
+ 		res_counter_uncharge(&memcg->res, size);
    	mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
    	flags |= MEM_CGROUP_RECLAIM_NOSWAP;
    } else
    	mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+ 
+ 	if (batch > nr_pages) {
+ 		batch = nr_pages;
+ 		goto retry;
+ 	}
+ 
    /*
- 	 * Never reclaim on behalf of optional batching, retry with a
- 	 * single page instead.
+ 	 * Unlike in global OOM situations, memcg is not in a physical
+ 	 * memory shortage.  Allow dying and OOM-killed tasks to
+ 	 * bypass the last charges so that they can exit quickly and
+ 	 * free their memory.
     */
- 	if (nr_pages > min_pages)
- 		return CHARGE_RETRY;
+ 	if (unlikely(test_thread_flag(TIF_MEMDIE) ||
+ 		     fatal_signal_pending(current) ||
+ 		     current->flags & PF_EXITING))
+ 		goto bypass;
+ 
+ 	if (unlikely(task_in_memcg_oom(current)))
+ 		goto nomem;
if (!(gfp_mask & __GFP_WAIT))
- 		return CHARGE_WOULDBLOCK;
+ 		goto nomem;
- 	if (gfp_mask & __GFP_NORETRY)
- 		return CHARGE_NOMEM;
+ 	nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
+ 
+ 	if (mem_cgroup_margin(mem_over_limit) >= batch)
+ 		goto retry;
- 	ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
- 	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
- 		return CHARGE_RETRY;
+ 	if (gfp_mask & __GFP_NORETRY)
+ 		goto nomem;
    /*
     * Even though the limit is exceeded at this point, reclaim
     * may have been able to free some pages.  Retry the charge
@@@ -2609,142 -2586,47 +2586,47 @@@
     * unlikely to succeed so close to the limit, and we fall back
     * to regular pages anyway in case of failure.
     */
- 	if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
- 		return CHARGE_RETRY;
- 
+ 	if (nr_reclaimed && batch <= (1 << PAGE_ALLOC_COSTLY_ORDER))
+ 		goto retry;
    /*
     * At task move, charge accounts can be doubly counted. So, it's
     * better to wait until the end of task_move if something is going on.
     */
    if (mem_cgroup_wait_acct_move(mem_over_limit))
- 		return CHARGE_RETRY;
- 
- 	if (invoke_oom)
- 		mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
- 
- 	return CHARGE_NOMEM;
- }
- 
- /**
-  * mem_cgroup_try_charge - try charging a memcg
-  * @memcg: memcg to charge
-  * @nr_pages: number of pages to charge
-  * @oom: trigger OOM if reclaim fails
-  *
-  * Returns 0 if @memcg was charged successfully, -EINTR if the charge
-  * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed.
-  */
- static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
- 				 gfp_t gfp_mask,
- 				 unsigned int nr_pages,
- 				 bool oom)
- {
- 	unsigned int batch = max(CHARGE_BATCH, nr_pages);
- 	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
- 	int ret;
- 
- 	if (mem_cgroup_is_root(memcg))
- 		goto done;
- 	/*
- 	 * Unlike in global OOM situations, memcg is not in a physical
- 	 * memory shortage.  Allow dying and OOM-killed tasks to
- 	 * bypass the last charges so that they can exit quickly and
- 	 * free their memory.
- 	 */
- 	if (unlikely(test_thread_flag(TIF_MEMDIE) ||
- 		     fatal_signal_pending(current) ||
- 		     current->flags & PF_EXITING))
- 		goto bypass;
+ 		goto retry;
- 	if (unlikely(task_in_memcg_oom(current)))
- 		goto nomem;
+ 	if (nr_retries--)
+ 		goto retry;
if (gfp_mask & __GFP_NOFAIL)
- 		oom = false;
- again:
- 	if (consume_stock(memcg, nr_pages))
- 		goto done;
- 
- 	do {
- 		bool invoke_oom = oom && !nr_oom_retries;
- 
- 		/* If killed, bypass charge */
- 		if (fatal_signal_pending(current))
- 			goto bypass;
+ 		goto bypass;
- 		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
- 					   nr_pages, invoke_oom);
- 		switch (ret) {
- 		case CHARGE_OK:
- 			break;
- 		case CHARGE_RETRY: /* not in OOM situation but retry */
- 			batch = nr_pages;
- 			goto again;
- 		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
- 			goto nomem;
- 		case CHARGE_NOMEM: /* OOM routine works */
- 			if (!oom || invoke_oom)
- 				goto nomem;
- 			nr_oom_retries--;
- 			break;
- 		}
- 	} while (ret != CHARGE_OK);
+ 	if (fatal_signal_pending(current))
+ 		goto bypass;
- 	if (batch > nr_pages)
- 		refill_stock(memcg, batch - nr_pages);
- done:
- 	return 0;
+ 	mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(batch));
  nomem:
    if (!(gfp_mask & __GFP_NOFAIL))
    	return -ENOMEM;
  bypass:
- 	return -EINTR;
- }
- 
- /**
-  * mem_cgroup_try_charge_mm - try charging a mm
-  * @mm: mm_struct to charge
-  * @nr_pages: number of pages to charge
-  * @oom: trigger OOM if reclaim fails
-  *
-  * Returns the charged mem_cgroup associated with the given mm_struct or
-  * NULL the charge failed.
-  */
- static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm,
- 				 gfp_t gfp_mask,
- 				 unsigned int nr_pages,
- 				 bool oom)
- 
- {
- 	struct mem_cgroup *memcg;
- 	int ret;
- 
- 	memcg = get_mem_cgroup_from_mm(mm);
- 	ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom);
- 	css_put(&memcg->css);
- 	if (ret == -EINTR)
- 		memcg = root_mem_cgroup;
- 	else if (ret)
- 		memcg = NULL;
+ 	memcg = root_mem_cgroup;
+ 	ret = -EINTR;
+ 	goto retry;
- 	return memcg;
+ done_restock:
+ 	if (batch > nr_pages)
+ 		refill_stock(memcg, batch - nr_pages);
+ done:
+ 	return ret;
  }
- /*
-  * Somemtimes we have to undo a charge we got by try_charge().
-  * This function is for that and do uncharge, put css's refcnt.
-  * gotten by try_charge().
-  */
- static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
- 				       unsigned int nr_pages)
+ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
  {
- 	if (!mem_cgroup_is_root(memcg)) {
- 		unsigned long bytes = nr_pages * PAGE_SIZE;
+ 	unsigned long bytes = nr_pages * PAGE_SIZE;
- 		res_counter_uncharge(&memcg->res, bytes);
- 		if (do_swap_account)
- 			res_counter_uncharge(&memcg->memsw, bytes);
- 	}
+ 	res_counter_uncharge(&memcg->res, bytes);
+ 	if (do_swap_account)
+ 		res_counter_uncharge(&memcg->memsw, bytes);
  }
/*
@@@ -2756,9 -2638,6 +2638,6 @@@ static void __mem_cgroup_cancel_local_c
  {
    unsigned long bytes = nr_pages * PAGE_SIZE;
- 	if (mem_cgroup_is_root(memcg))
- 		return;
- 
    res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
    if (do_swap_account)
    	res_counter_uncharge_until(&memcg->memsw,
@@@ -2779,6 -2658,16 +2658,16 @@@ static struct mem_cgroup *mem_cgroup_lo
    return mem_cgroup_from_id(id);
  }
+ /*
+  * try_get_mem_cgroup_from_page - look up page's memcg association
+  * @page: the page
+  *
+  * Look up, get a css reference, and return the memcg that owns @page.
+  *
+  * The page must be locked to prevent racing with swap-in and page
+  * cache charges.  If coming from an unlocked page table, the caller
+  * must ensure the page is on the LRU or this can race with charging.
+  */
  struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
  {
    struct mem_cgroup *memcg = NULL;
@@@ -2789,7 -2678,6 +2678,6 @@@
    VM_BUG_ON_PAGE(!PageLocked(page), page);
pc = lookup_page_cgroup(page);
- 	lock_page_cgroup(pc);
    if (PageCgroupUsed(pc)) {
    	memcg = pc->mem_cgroup;
    	if (memcg && !css_tryget_online(&memcg->css))
@@@ -2803,23 -2691,46 +2691,46 @@@
    		memcg = NULL;
    	rcu_read_unlock();
    }
    return memcg;
  }
- static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
- 				       struct page *page,
- 				       unsigned int nr_pages,
- 				       enum charge_type ctype,
- 				       bool lrucare)
+ static void lock_page_lru(struct page *page, int *isolated)
+ {
+ 	struct zone *zone = page_zone(page);
+ 
+ 	spin_lock_irq(&zone->lru_lock);
+ 	if (PageLRU(page)) {
+ 		struct lruvec *lruvec;
+ 
+ 		lruvec = mem_cgroup_page_lruvec(page, zone);
+ 		ClearPageLRU(page);
+ 		del_page_from_lru_list(page, lruvec, page_lru(page));
+ 		*isolated = 1;
+ 	} else
+ 		*isolated = 0;
+ }
+ 
+ static void unlock_page_lru(struct page *page, int isolated)
+ {
+ 	struct zone *zone = page_zone(page);
+ 
+ 	if (isolated) {
+ 		struct lruvec *lruvec;
+ 
+ 		lruvec = mem_cgroup_page_lruvec(page, zone);
+ 		VM_BUG_ON_PAGE(PageLRU(page), page);
+ 		SetPageLRU(page);
+ 		add_page_to_lru_list(page, lruvec, page_lru(page));
+ 	}
+ 	spin_unlock_irq(&zone->lru_lock);
+ }
+ 
+ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
+ 			  unsigned int nr_pages, bool lrucare)
  {
    struct page_cgroup *pc = lookup_page_cgroup(page);
- 	struct zone *uninitialized_var(zone);
- 	struct lruvec *lruvec;
- 	bool was_on_lru = false;
- 	bool anon;
+ 	int isolated;
- 	lock_page_cgroup(pc);
    VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
    /*
     * we don't need page_cgroup_lock about tail pages, becase they are not
@@@ -2830,52 -2741,38 +2741,38 @@@
     * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
     * may already be on some other mem_cgroup's LRU.  Take care of it.
     */
- 	if (lrucare) {
- 		zone = page_zone(page);
- 		spin_lock_irq(&zone->lru_lock);
- 		if (PageLRU(page)) {
- 			lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
- 			ClearPageLRU(page);
- 			del_page_from_lru_list(page, lruvec, page_lru(page));
- 			was_on_lru = true;
- 		}
- 	}
+ 	if (lrucare)
+ 		lock_page_lru(page, &isolated);
- 	pc->mem_cgroup = memcg;
    /*
- 	 * We access a page_cgroup asynchronously without lock_page_cgroup().
- 	 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
- 	 * is accessed after testing USED bit. To make pc->mem_cgroup visible
- 	 * before USED bit, we need memory barrier here.
- 	 * See mem_cgroup_add_lru_list(), etc.
+ 	 * Nobody should be changing or seriously looking at
+ 	 * pc->mem_cgroup and pc->flags at this point:
+ 	 *
+ 	 * - the page is uncharged
+ 	 *
+ 	 * - the page is off-LRU
+ 	 *
+ 	 * - an anonymous fault has exclusive page access, except for
+ 	 *   a locked page table
+ 	 *
+ 	 * - a page cache insertion, a swapin fault, or a migration
+ 	 *   have the page locked
     */
- 	smp_wmb();
- 	SetPageCgroupUsed(pc);
- 
- 	if (lrucare) {
- 		if (was_on_lru) {
- 			lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
- 			VM_BUG_ON_PAGE(PageLRU(page), page);
- 			SetPageLRU(page);
- 			add_page_to_lru_list(page, lruvec, page_lru(page));
- 		}
- 		spin_unlock_irq(&zone->lru_lock);
- 	}
- 
- 	if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
- 		anon = true;
- 	else
- 		anon = false;
+ 	pc->mem_cgroup = memcg;
+ 	pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0);
- 	mem_cgroup_charge_statistics(memcg, page, anon, nr_pages);
- 	unlock_page_cgroup(pc);
+ 	if (lrucare)
+ 		unlock_page_lru(page, isolated);
+ 	local_irq_disable();
+ 	mem_cgroup_charge_statistics(memcg, page, nr_pages);
    /*
     * "charge_statistics" updated event counter. Then, check it.
     * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
     * if they exceeds softlimit.
     */
    memcg_check_events(memcg, page);
+ 	local_irq_enable();
  }
static DEFINE_MUTEX(set_limit_mutex);
@@@ -2896,16 -2793,13 +2793,13 @@@ static inline bool memcg_can_account_km
  }
/*
-  * This is a bit cumbersome, but it is rarely used and avoids a backpointer
-  * in the memcg_cache_params struct.
+  * helper for accessing a memcg's index. It will be used as an index in the
+  * child cache array in kmem_cache, and also to derive its name. This function
+  * will return -1 when this is not a kmem-limited memcg.
   */
- static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
+ static inline int memcg_cache_id(struct mem_cgroup *memcg)
  {
- 	struct kmem_cache *cachep;
- 
- 	VM_BUG_ON(p->is_root_cache);
- 	cachep = p->root_cache;
- 	return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
+ 	return memcg ? memcg->kmemcg_id : -1;
  }
#ifdef CONFIG_SLABINFO
@@@ -2921,7 -2815,7 +2815,7 @@@ static int mem_cgroup_slabinfo_read(str
mutex_lock(&memcg_slab_mutex);
    list_for_each_entry(params, &memcg->memcg_slab_caches, list)
- 		cache_show(memcg_params_to_cache(params), m);
+ 		cache_show(params->cachep, m);
    mutex_unlock(&memcg_slab_mutex);
return 0;
@@@ -2937,22 -2831,21 +2831,21 @@@ static int memcg_charge_kmem(struct mem
    if (ret)
    	return ret;
- 	ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT,
- 				    oom_gfp_allowed(gfp));
+ 	ret = try_charge(memcg, gfp, size >> PAGE_SHIFT);
    if (ret == -EINTR)  {
    	/*
- 		 * mem_cgroup_try_charge() chosed to bypass to root due to
- 		 * OOM kill or fatal signal.  Since our only options are to
- 		 * either fail the allocation or charge it to this cgroup, do
- 		 * it as a temporary condition. But we can't fail. From a
- 		 * kmem/slab perspective, the cache has already been selected,
- 		 * by mem_cgroup_kmem_get_cache(), so it is too late to change
+ 		 * try_charge() chose to bypass to root due to OOM kill or
+ 		 * fatal signal.  Since our only options are to either fail
+ 		 * the allocation or charge it to this cgroup, do it as a
+ 		 * temporary condition. But we can't fail. From a kmem/slab
+ 		 * perspective, the cache has already been selected, by
+ 		 * mem_cgroup_kmem_get_cache(), so it is too late to change
    	 * our minds.
    	 *
    	 * This condition will only trigger if the task entered
- 		 * memcg_charge_kmem in a sane state, but was OOM-killed during
- 		 * mem_cgroup_try_charge() above. Tasks that were already
- 		 * dying when the allocation triggers should have been already
+ 		 * memcg_charge_kmem in a sane state, but was OOM-killed
+ 		 * during try_charge() above. Tasks that were already dying
+ 		 * when the allocation triggers should have been already
    	 * directed to the root cgroup in memcontrol.h
    	 */
    	res_counter_charge_nofail(&memcg->res, size, &fail_res);
@@@ -2988,16 -2881,6 +2881,6 @@@ static void memcg_uncharge_kmem(struct 
    	css_put(&memcg->css);
  }
- /*
-  * helper for acessing a memcg's index. It will be used as an index in the
-  * child cache array in kmem_cache, and also to derive its name. This function
-  * will return -1 when this is not a kmem-limited memcg.
-  */
- int memcg_cache_id(struct mem_cgroup *memcg)
- {
- 	return memcg ? memcg->kmemcg_id : -1;
- }
- 
  static size_t memcg_caches_array_size(int num_groups)
  {
    ssize_t size;
@@@ -3043,6 -2926,10 +2926,10 @@@ int memcg_update_cache_size(struct kmem
    		return -ENOMEM;
new_params->is_root_cache = true;
+ 		INIT_LIST_HEAD(&new_params->children);
+ 		if (cur_params)
+ 			list_replace(&cur_params->children,
+ 				     &new_params->children);
/*
    	 * There is the chance it will be bigger than
@@@ -3095,11 -2982,14 +2982,14 @@@ int memcg_alloc_cache_params(struct mem
    	return -ENOMEM;
if (memcg) {
+ 		s->memcg_params->cachep = s;
    	s->memcg_params->memcg = memcg;
    	s->memcg_params->root_cache = root_cache;
    	css_get(&memcg->css);
- 	} else
+ 	} else {
    	s->memcg_params->is_root_cache = true;
+ 		INIT_LIST_HEAD(&s->memcg_params->children);
+ 	}
return 0;
  }
@@@ -3119,11 -3009,18 +3009,18 @@@ static void memcg_register_cache(struc
    static char memcg_name_buf[NAME_MAX + 1]; /* protected by
    					     memcg_slab_mutex */
    struct kmem_cache *cachep;
+ 	char *cache_name;
    int id;
lockdep_assert_held(&memcg_slab_mutex);
id = memcg_cache_id(memcg);
+ 	/*
+ 	 * The cgroup was taken offline while the create work was pending,
+ 	 * nothing to do then.
+ 	 */
+ 	if (id < 0)
+ 		return;
/*
     * Since per-memcg caches are created asynchronously on first
@@@ -3134,14 -3031,22 +3031,22 @@@
    	return;
cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1);
- 	cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf);
+ 
+ 	cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
+ 			       mem_cgroup_id(memcg), memcg_name_buf);
+ 	if (!cache_name)
+ 		return;
+ 
+ 	cachep = memcg_create_kmem_cache(memcg, root_cache, cache_name);
    /*
     * If we could not create a memcg cache, do not complain, because
     * that's not critical at all as we can always proceed with the root
     * cache.
     */
- 	if (!cachep)
+ 	if (!cachep) {
+ 		kfree(cache_name);
    	return;
+ 	}
list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
@@@ -3170,8 -3075,17 +3075,17 @@@ static void memcg_unregister_cache(stru
    memcg = cachep->memcg_params->memcg;
    id = memcg_cache_id(memcg);
- 	BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep);
- 	root_cache->memcg_params->memcg_caches[id] = NULL;
+ 	/*
+ 	 * This function can be called both after and before css offline. If
+ 	 * it's called before css offline, which happens on the root cache
+ 	 * destruction, we should clear the slot corresponding to the cache in
+ 	 * memcg_caches array. Otherwise the slot must have already been
+ 	 * cleared in memcg_unregister_all_caches.
+ 	 */
+ 	if (id >= 0) {
+ 		BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep);
+ 		root_cache->memcg_params->memcg_caches[id] = NULL;
+ 	}
list_del(&cachep->memcg_params->list);
@@@ -3209,42 -3123,41 +3123,41 @@@ static inline void memcg_resume_kmem_ac
    current->memcg_kmem_skip_account--;
  }
- int __memcg_cleanup_cache_params(struct kmem_cache *s)
+ void __memcg_cleanup_cache_params(struct kmem_cache *s)
  {
- 	struct kmem_cache *c;
- 	int i, failed = 0;
+ 	struct memcg_cache_params *params, *tmp;
mutex_lock(&memcg_slab_mutex);
- 	for_each_memcg_cache_index(i) {
- 		c = cache_from_memcg_idx(s, i);
- 		if (!c)
- 			continue;
- 
- 		memcg_unregister_cache(c);
- 
- 		if (cache_from_memcg_idx(s, i))
- 			failed++;
- 	}
+ 	list_for_each_entry_safe(params, tmp,
+ 			&s->memcg_params->children, siblings)
+ 		memcg_unregister_cache(params->cachep);
    mutex_unlock(&memcg_slab_mutex);
- 	return failed;
  }
static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
  {
- 	struct kmem_cache *cachep;
    struct memcg_cache_params *params, *tmp;
+ 	int id = memcg_cache_id(memcg);
if (!memcg_kmem_is_active(memcg))
    	return;
mutex_lock(&memcg_slab_mutex);
+ 	memcg->kmemcg_id = -1;
    list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
- 		cachep = memcg_params_to_cache(params);
+ 		struct kmem_cache *cachep = params->cachep;
+ 		struct kmem_cache *root_cache = params->root_cache;
+ 
+ 		BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep);
+ 		root_cache->memcg_params->memcg_caches[id] = NULL;
+ 
    	kmem_cache_shrink(cachep);
    	if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
    		memcg_unregister_cache(cachep);
    }
    mutex_unlock(&memcg_slab_mutex);
+ 
+ 	ida_simple_remove(&kmem_limited_groups, id);
  }
struct memcg_register_cache_work {
@@@ -3343,6 -3256,7 +3256,7 @@@ struct kmem_cache *__memcg_kmem_get_cac
  {
    struct mem_cgroup *memcg;
    struct kmem_cache *memcg_cachep;
+ 	int id;
VM_BUG_ON(!cachep->memcg_params);
    VM_BUG_ON(!cachep->memcg_params->is_root_cache);
@@@ -3356,7 -3270,15 +3270,15 @@@
    if (!memcg_can_account_kmem(memcg))
    	goto out;
- 	memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
+ 	id = memcg_cache_id(memcg);
+ 	/*
+ 	 * This can happen if current was migrated to another cgroup and this
+ 	 * cgroup was taken offline after we issued mem_cgroup_from_task above.
+ 	 */
+ 	if (unlikely(id < 0))
+ 		goto out;
+ 
+ 	memcg_cachep = cache_from_memcg_idx(cachep, id);
    if (likely(memcg_cachep)) {
    	cachep = memcg_cachep;
    	goto out;
@@@ -3463,12 -3385,13 +3385,13 @@@ void __memcg_kmem_commit_charge(struct 
    	memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
    	return;
    }
- 
+ 	/*
+ 	 * The page is freshly allocated and not visible to any
+ 	 * outside callers yet.  Set up pc non-atomically.
+ 	 */
    pc = lookup_page_cgroup(page);
- 	lock_page_cgroup(pc);
    pc->mem_cgroup = memcg;
- 	SetPageCgroupUsed(pc);
- 	unlock_page_cgroup(pc);
+ 	pc->flags = PCG_USED;
  }
void __memcg_kmem_uncharge_pages(struct page *page, int order)
@@@ -3478,19 -3401,11 +3401,11 @@@
pc = lookup_page_cgroup(page);
    if (!PageCgroupUsed(pc))
    	return;
- 	lock_page_cgroup(pc);
- 	if (PageCgroupUsed(pc)) {
- 		memcg = pc->mem_cgroup;
- 		ClearPageCgroupUsed(pc);
- 	}
- 	unlock_page_cgroup(pc);
+ 	memcg = pc->mem_cgroup;
+ 	pc->flags = 0;
/*
     * We trust that only if there is a memcg associated with the page, it
@@@ -3510,7 -3425,6 +3425,6 @@@ static inline void memcg_unregister_all
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
  /*
   * Because tail pages are not marked as "used", set it. We're under
   * zone->lru_lock, 'splitting on pmd' and compound_lock.
@@@ -3531,8 -3445,7 +3445,7 @@@ void mem_cgroup_split_huge_fixup(struc
    for (i = 1; i < HPAGE_PMD_NR; i++) {
    	pc = head_pc + i;
    	pc->mem_cgroup = memcg;
- 		smp_wmb();/* see __commit_charge() */
- 		pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
+ 		pc->flags = head_pc->flags;
    }
    __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
    	       HPAGE_PMD_NR);
@@@ -3562,7 -3475,6 +3475,6 @@@ static int mem_cgroup_move_account(stru
  {
    unsigned long flags;
    int ret;
- 	bool anon = PageAnon(page);
VM_BUG_ON(from == to);
    VM_BUG_ON_PAGE(PageLRU(page), page);
@@@ -3576,15 -3488,21 +3488,21 @@@
    if (nr_pages > 1 && !PageTransHuge(page))
    	goto out;
- 	lock_page_cgroup(pc);
+ 	/*
+ 	 * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup
+ 	 * of its source page while we change it: page migration takes
+ 	 * both pages off the LRU, but page cache replacement doesn't.
+ 	 */
+ 	if (!trylock_page(page))
+ 		goto out;
ret = -EINVAL;
    if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
- 		goto unlock;
+ 		goto out_unlock;
move_lock_mem_cgroup(from, &flags);
- 	if (!anon && page_mapped(page)) {
+ 	if (!PageAnon(page) && page_mapped(page)) {
    	__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
    		       nr_pages);
    	__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
@@@ -3598,20 -3516,25 +3516,25 @@@
    		       nr_pages);
    }
- 	mem_cgroup_charge_statistics(from, page, anon, -nr_pages);
+ 	/*
+ 	 * It is safe to change pc->mem_cgroup here because the page
+ 	 * is referenced, charged, and isolated - we can't race with
+ 	 * uncharging, charging, migration, or LRU putback.
+ 	 */
/* caller should have done css_get */
    pc->mem_cgroup = to;
- 	mem_cgroup_charge_statistics(to, page, anon, nr_pages);
    move_unlock_mem_cgroup(from, &flags);
    ret = 0;
- unlock:
- 	unlock_page_cgroup(pc);
- 	/*
- 	 * check events
- 	 */
+ 
+ 	local_irq_disable();
+ 	mem_cgroup_charge_statistics(to, page, nr_pages);
    memcg_check_events(to, page);
+ 	mem_cgroup_charge_statistics(from, page, -nr_pages);
    memcg_check_events(from, page);
+ 	local_irq_enable();
+ out_unlock:
+ 	unlock_page(page);
  out:
    return ret;
  }
@@@ -3682,483 -3605,39 +3605,39 @@@ out
    return ret;
  }
- int mem_cgroup_charge_anon(struct page *page,
- 			      struct mm_struct *mm, gfp_t gfp_mask)
+ #ifdef CONFIG_MEMCG_SWAP
+ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
+ 					 bool charge)
  {
- 	unsigned int nr_pages = 1;
- 	struct mem_cgroup *memcg;
- 	bool oom = true;
+ 	int val = (charge) ? 1 : -1;
+ 	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
+ }
- 	if (mem_cgroup_disabled())
- 		return 0;
+ /**
+  * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
+  * @entry: swap entry to be moved
+  * @from:  mem_cgroup which the entry is moved from
+  * @to:  mem_cgroup which the entry is moved to
+  *
+  * It succeeds only when the swap_cgroup's record for this entry is the same
+  * as the mem_cgroup's id of @from.
+  *
+  * Returns 0 on success, -EINVAL on failure.
+  *
+  * The caller must have charged to @to, IOW, called res_counter_charge() about
+  * both res and memsw, and called css_get().
+  */
+ static int mem_cgroup_move_swap_account(swp_entry_t entry,
+ 				struct mem_cgroup *from, struct mem_cgroup *to)
+ {
+ 	unsigned short old_id, new_id;
- 	VM_BUG_ON_PAGE(page_mapped(page), page);
- 	VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
- 	VM_BUG_ON(!mm);
+ 	old_id = mem_cgroup_id(from);
+ 	new_id = mem_cgroup_id(to);
- 	if (PageTransHuge(page)) {
- 		nr_pages <<= compound_order(page);
- 		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- 		/*
- 		 * Never OOM-kill a process for a huge page.  The
- 		 * fault handler will fall back to regular pages.
- 		 */
- 		oom = false;
- 	}
- 
- 	memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom);
- 	if (!memcg)
- 		return -ENOMEM;
- 	__mem_cgroup_commit_charge(memcg, page, nr_pages,
- 				   MEM_CGROUP_CHARGE_TYPE_ANON, false);
- 	return 0;
- }
- 
- /*
-  * While swap-in, try_charge -> commit or cancel, the page is locked.
-  * And when try_charge() successfully returns, one refcnt to memcg without
-  * struct page_cgroup is acquired. This refcnt will be consumed by
-  * "commit()" or removed by "cancel()"
-  */
- static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
- 					  struct page *page,
- 					  gfp_t mask,
- 					  struct mem_cgroup **memcgp)
- {
- 	struct mem_cgroup *memcg = NULL;
- 	struct page_cgroup *pc;
- 	int ret;
- 
- 	pc = lookup_page_cgroup(page);
- 	/*
- 	 * Every swap fault against a single page tries to charge the
- 	 * page, bail as early as possible.  shmem_unuse() encounters
- 	 * already charged pages, too.  The USED bit is protected by
- 	 * the page lock, which serializes swap cache removal, which
- 	 * in turn serializes uncharging.
- 	 */
- 	if (PageCgroupUsed(pc))
- 		goto out;
- 	if (do_swap_account)
- 		memcg = try_get_mem_cgroup_from_page(page);
- 	if (!memcg)
- 		memcg = get_mem_cgroup_from_mm(mm);
- 	ret = mem_cgroup_try_charge(memcg, mask, 1, true);
- 	css_put(&memcg->css);
- 	if (ret == -EINTR)
- 		memcg = root_mem_cgroup;
- 	else if (ret)
- 		return ret;
- out:
- 	*memcgp = memcg;
- 	return 0;
- }
- 
- int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
- 				 gfp_t gfp_mask, struct mem_cgroup **memcgp)
- {
- 	if (mem_cgroup_disabled()) {
- 		*memcgp = NULL;
- 		return 0;
- 	}
- 	/*
- 	 * A racing thread's fault, or swapoff, may have already
- 	 * updated the pte, and even removed page from swap cache: in
- 	 * those cases unuse_pte()'s pte_same() test will fail; but
- 	 * there's also a KSM case which does need to charge the page.
- 	 */
- 	if (!PageSwapCache(page)) {
- 		struct mem_cgroup *memcg;
- 
- 		memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
- 		if (!memcg)
- 			return -ENOMEM;
- 		*memcgp = memcg;
- 		return 0;
- 	}
- 	return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
- }
- 
- void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
- {
- 	if (mem_cgroup_disabled())
- 		return;
- 	if (!memcg)
- 		return;
- 	__mem_cgroup_cancel_charge(memcg, 1);
- }
- 
- static void
- __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
- 					enum charge_type ctype)
- {
- 	if (mem_cgroup_disabled())
- 		return;
- 	if (!memcg)
- 		return;
- 
- 	__mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
- 	/*
- 	 * Now swap is on-memory. This means this page may be
- 	 * counted both as mem and swap....double count.
- 	 * Fix it by uncharging from memsw. Basically, this SwapCache is stable
- 	 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
- 	 * may call delete_from_swap_cache() before reach here.
- 	 */
- 	if (do_swap_account && PageSwapCache(page)) {
- 		swp_entry_t ent = {.val = page_private(page)};
- 		mem_cgroup_uncharge_swap(ent);
- 	}
- }
- 
- void mem_cgroup_commit_charge_swapin(struct page *page,
- 				     struct mem_cgroup *memcg)
- {
- 	__mem_cgroup_commit_charge_swapin(page, memcg,
- 					  MEM_CGROUP_CHARGE_TYPE_ANON);
- }
- 
- int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm,
- 				gfp_t gfp_mask)
- {
- 	enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
- 	struct mem_cgroup *memcg;
- 	int ret;
- 
- 	if (mem_cgroup_disabled())
- 		return 0;
- 	if (PageCompound(page))
- 		return 0;
- 
- 	if (PageSwapCache(page)) { /* shmem */
- 		ret = __mem_cgroup_try_charge_swapin(mm, page,
- 						     gfp_mask, &memcg);
- 		if (ret)
- 			return ret;
- 		__mem_cgroup_commit_charge_swapin(page, memcg, type);
- 		return 0;
- 	}
- 
- 	memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
- 	if (!memcg)
- 		return -ENOMEM;
- 	__mem_cgroup_commit_charge(memcg, page, 1, type, false);
- 	return 0;
- }
- 
- static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
- 				   unsigned int nr_pages,
- 				   const enum charge_type ctype)
- {
- 	struct memcg_batch_info *batch = NULL;
- 	bool uncharge_memsw = true;
- 
- 	/* If swapout, usage of swap doesn't decrease */
- 	if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
- 		uncharge_memsw = false;
- 
- 	batch = &current->memcg_batch;
- 	/*
- 	 * In usual, we do css_get() when we remember memcg pointer.
- 	 * But in this case, we keep res->usage until end of a series of
- 	 * uncharges. Then, it's ok to ignore memcg's refcnt.
- 	 */
- 	if (!batch->memcg)
- 		batch->memcg = memcg;
- 	/*
- 	 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
- 	 * In those cases, all pages freed continuously can be expected to be in
- 	 * the same cgroup and we have chance to coalesce uncharges.
- 	 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
- 	 * because we want to do uncharge as soon as possible.
- 	 */
- 
- 	if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
- 		goto direct_uncharge;
- 
- 	if (nr_pages > 1)
- 		goto direct_uncharge;
- 
- 	/*
- 	 * In typical case, batch->memcg == mem. This means we can
- 	 * merge a series of uncharges to an uncharge of res_counter.
- 	 * If not, we uncharge res_counter ony by one.
- 	 */
- 	if (batch->memcg != memcg)
- 		goto direct_uncharge;
- 	/* remember freed charge and uncharge it later */
- 	batch->nr_pages++;
- 	if (uncharge_memsw)
- 		batch->memsw_nr_pages++;
- 	return;
- direct_uncharge:
- 	res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
- 	if (uncharge_memsw)
- 		res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
- 	if (unlikely(batch->memcg != memcg))
- 		memcg_oom_recover(memcg);
- }
- 
- /*
-  * uncharge if !page_mapped(page)
-  */
- static struct mem_cgroup *
- __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
- 			     bool end_migration)
- {
- 	struct mem_cgroup *memcg = NULL;
- 	unsigned int nr_pages = 1;
- 	struct page_cgroup *pc;
- 	bool anon;
- 
- 	if (mem_cgroup_disabled())
- 		return NULL;
- 
- 	if (PageTransHuge(page)) {
- 		nr_pages <<= compound_order(page);
- 		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- 	}
- 	/*
- 	 * Check if our page_cgroup is valid
- 	 */
- 	pc = lookup_page_cgroup(page);
- 	if (unlikely(!PageCgroupUsed(pc)))
- 		return NULL;
- 
- 	lock_page_cgroup(pc);
- 
- 	memcg = pc->mem_cgroup;
- 
- 	if (!PageCgroupUsed(pc))
- 		goto unlock_out;
- 
- 	anon = PageAnon(page);
- 
- 	switch (ctype) {
- 	case MEM_CGROUP_CHARGE_TYPE_ANON:
- 		/*
- 		 * Generally PageAnon tells if it's the anon statistics to be
- 		 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
- 		 * used before page reached the stage of being marked PageAnon.
- 		 */
- 		anon = true;
- 		/* fallthrough */
- 	case MEM_CGROUP_CHARGE_TYPE_DROP:
- 		/* See mem_cgroup_prepare_migration() */
- 		if (page_mapped(page))
- 			goto unlock_out;
- 		/*
- 		 * Pages under migration may not be uncharged.  But
- 		 * end_migration() /must/ be the one uncharging the
- 		 * unused post-migration page and so it has to call
- 		 * here with the migration bit still set.  See the
- 		 * res_counter handling below.
- 		 */
- 		if (!end_migration && PageCgroupMigration(pc))
- 			goto unlock_out;
- 		break;
- 	case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
- 		if (!PageAnon(page)) {	/* Shared memory */
- 			if (page->mapping && !page_is_file_cache(page))
- 				goto unlock_out;
- 		} else if (page_mapped(page)) /* Anon */
- 				goto unlock_out;
- 		break;
- 	default:
- 		break;
- 	}
- 
- 	mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages);
- 
- 	ClearPageCgroupUsed(pc);
- 	/*
- 	 * pc->mem_cgroup is not cleared here. It will be accessed when it's
- 	 * freed from LRU. This is safe because uncharged page is expected not
- 	 * to be reused (freed soon). Exception is SwapCache, it's handled by
- 	 * special functions.
- 	 */
- 
- 	unlock_page_cgroup(pc);
- 	/*
- 	 * even after unlock, we have memcg->res.usage here and this memcg
- 	 * will never be freed, so it's safe to call css_get().
- 	 */
- 	memcg_check_events(memcg, page);
- 	if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
- 		mem_cgroup_swap_statistics(memcg, true);
- 		css_get(&memcg->css);
- 	}
- 	/*
- 	 * Migration does not charge the res_counter for the
- 	 * replacement page, so leave it alone when phasing out the
- 	 * page that is unused after the migration.
- 	 */
- 	if (!end_migration && !mem_cgroup_is_root(memcg))
- 		mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
- 
- 	return memcg;
- 
- unlock_out:
- 	unlock_page_cgroup(pc);
- 	return NULL;
- }
- 
- void mem_cgroup_uncharge_page(struct page *page)
- {
- 	/* early check. */
- 	if (page_mapped(page))
- 		return;
- 	VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
- 	/*
- 	 * If the page is in swap cache, uncharge should be deferred
- 	 * to the swap path, which also properly accounts swap usage
- 	 * and handles memcg lifetime.
- 	 *
- 	 * Note that this check is not stable and reclaim may add the
- 	 * page to swap cache at any time after this.  However, if the
- 	 * page is not in swap cache by the time page->mapcount hits
- 	 * 0, there won't be any page table references to the swap
- 	 * slot, and reclaim will free it and not actually write the
- 	 * page to disk.
- 	 */
- 	if (PageSwapCache(page))
- 		return;
- 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
- }
- 
- void mem_cgroup_uncharge_cache_page(struct page *page)
- {
- 	VM_BUG_ON_PAGE(page_mapped(page), page);
- 	VM_BUG_ON_PAGE(page->mapping, page);
- 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
- }
- 
- /*
-  * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate.
-  * In that cases, pages are freed continuously and we can expect pages
-  * are in the same memcg. All these calls itself limits the number of
-  * pages freed at once, then uncharge_start/end() is called properly.
-  * This may be called prural(2) times in a context,
-  */
- 
- void mem_cgroup_uncharge_start(void)
- {
- 	current->memcg_batch.do_batch++;
- 	/* We can do nest. */
- 	if (current->memcg_batch.do_batch == 1) {
- 		current->memcg_batch.memcg = NULL;
- 		current->memcg_batch.nr_pages = 0;
- 		current->memcg_batch.memsw_nr_pages = 0;
- 	}
- }
- 
- void mem_cgroup_uncharge_end(void)
- {
- 	struct memcg_batch_info *batch = &current->memcg_batch;
- 
- 	if (!batch->do_batch)
- 		return;
- 
- 	batch->do_batch--;
- 	if (batch->do_batch) /* If stacked, do nothing. */
- 		return;
- 
- 	if (!batch->memcg)
- 		return;
- 	/*
- 	 * This "batch->memcg" is valid without any css_get/put etc...
- 	 * bacause we hide charges behind us.
- 	 */
- 	if (batch->nr_pages)
- 		res_counter_uncharge(&batch->memcg->res,
- 				     batch->nr_pages * PAGE_SIZE);
- 	if (batch->memsw_nr_pages)
- 		res_counter_uncharge(&batch->memcg->memsw,
- 				     batch->memsw_nr_pages * PAGE_SIZE);
- 	memcg_oom_recover(batch->memcg);
- 	/* forget this pointer (for sanity check) */
- 	batch->memcg = NULL;
- }
- 
- #ifdef CONFIG_SWAP
- /*
-  * called after __delete_from_swap_cache() and drop "page" account.
-  * memcg information is recorded to swap_cgroup of "ent"
-  */
- void
- mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
- {
- 	struct mem_cgroup *memcg;
- 	int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
- 
- 	if (!swapout) /* this was a swap cache but the swap is unused ! */
- 		ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
- 
- 	memcg = __mem_cgroup_uncharge_common(page, ctype, false);
- 
- 	/*
- 	 * record memcg information,  if swapout && memcg != NULL,
- 	 * css_get() was called in uncharge().
- 	 */
- 	if (do_swap_account && swapout && memcg)
- 		swap_cgroup_record(ent, mem_cgroup_id(memcg));
- }
- #endif
- 
- #ifdef CONFIG_MEMCG_SWAP
- /*
-  * called from swap_entry_free(). remove record in swap_cgroup and
-  * uncharge "memsw" account.
-  */
- void mem_cgroup_uncharge_swap(swp_entry_t ent)
- {
- 	struct mem_cgroup *memcg;
- 	unsigned short id;
- 
- 	if (!do_swap_account)
- 		return;
- 
- 	id = swap_cgroup_record(ent, 0);
- 	rcu_read_lock();
- 	memcg = mem_cgroup_lookup(id);
- 	if (memcg) {
- 		/*
- 		 * We uncharge this because swap is freed.  This memcg can
- 		 * be obsolete one. We avoid calling css_tryget_online().
- 		 */
- 		if (!mem_cgroup_is_root(memcg))
- 			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
- 		mem_cgroup_swap_statistics(memcg, false);
- 		css_put(&memcg->css);
- 	}
- 	rcu_read_unlock();
- }
- 
- /**
-  * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
-  * @entry: swap entry to be moved
-  * @from:  mem_cgroup which the entry is moved from
-  * @to:  mem_cgroup which the entry is moved to
-  *
-  * It succeeds only when the swap_cgroup's record for this entry is the same
-  * as the mem_cgroup's id of @from.
-  *
-  * Returns 0 on success, -EINVAL on failure.
-  *
-  * The caller must have charged to @to, IOW, called res_counter_charge() about
-  * both res and memsw, and called css_get().
-  */
- static int mem_cgroup_move_swap_account(swp_entry_t entry,
- 				struct mem_cgroup *from, struct mem_cgroup *to)
- {
- 	unsigned short old_id, new_id;
- 
- 	old_id = mem_cgroup_id(from);
- 	new_id = mem_cgroup_id(to);
- 
- 	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
- 		mem_cgroup_swap_statistics(from, false);
- 		mem_cgroup_swap_statistics(to, true);
+ 	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
+ 		mem_cgroup_swap_statistics(from, false);
+ 		mem_cgroup_swap_statistics(to, true);
    	/*
    	 * This function is only called from task migration context now.
    	 * It postpones res_counter and refcount handling till the end
@@@ -4183,175 -3662,6 +3662,6 @@@ static inline int mem_cgroup_move_swap_
  }
  #endif
- /*
-  * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
-  * page belongs to.
-  */
- void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
- 				  struct mem_cgroup **memcgp)
- {
- 	struct mem_cgroup *memcg = NULL;
- 	unsigned int nr_pages = 1;
- 	struct page_cgroup *pc;
- 	enum charge_type ctype;
- 
- 	*memcgp = NULL;
- 
- 	if (mem_cgroup_disabled())
- 		return;
- 
- 	if (PageTransHuge(page))
- 		nr_pages <<= compound_order(page);
- 
- 	pc = lookup_page_cgroup(page);
- 	lock_page_cgroup(pc);
- 	if (PageCgroupUsed(pc)) {
- 		memcg = pc->mem_cgroup;
- 		css_get(&memcg->css);
- 		/*
- 		 * At migrating an anonymous page, its mapcount goes down
- 		 * to 0 and uncharge() will be called. But, even if it's fully
- 		 * unmapped, migration may fail and this page has to be
- 		 * charged again. We set MIGRATION flag here and delay uncharge
- 		 * until end_migration() is called
- 		 *
- 		 * Corner Case Thinking
- 		 * A)
- 		 * When the old page was mapped as Anon and it's unmap-and-freed
- 		 * while migration was ongoing.
- 		 * If unmap finds the old page, uncharge() of it will be delayed
- 		 * until end_migration(). If unmap finds a new page, it's
- 		 * uncharged when it make mapcount to be 1->0. If unmap code
- 		 * finds swap_migration_entry, the new page will not be mapped
- 		 * and end_migration() will find it(mapcount==0).
- 		 *
- 		 * B)
- 		 * When the old page was mapped but migraion fails, the kernel
- 		 * remaps it. A charge for it is kept by MIGRATION flag even
- 		 * if mapcount goes down to 0. We can do remap successfully
- 		 * without charging it again.
- 		 *
- 		 * C)
- 		 * The "old" page is under lock_page() until the end of
- 		 * migration, so, the old page itself will not be swapped-out.
- 		 * If the new page is swapped out before end_migraton, our
- 		 * hook to usual swap-out path will catch the event.
- 		 */
- 		if (PageAnon(page))
- 			SetPageCgroupMigration(pc);
- 	}
- 	unlock_page_cgroup(pc);
- 	/*
- 	 * If the page is not charged at this point,
- 	 * we return here.
- 	 */
- 	if (!memcg)
- 		return;
- 
- 	*memcgp = memcg;
- 	/*
- 	 * We charge new page before it's used/mapped. So, even if unlock_page()
- 	 * is called before end_migration, we can catch all events on this new
- 	 * page. In the case new page is migrated but not remapped, new page's
- 	 * mapcount will be finally 0 and we call uncharge in end_migration().
- 	 */
- 	if (PageAnon(page))
- 		ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
- 	else
- 		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
- 	/*
- 	 * The page is committed to the memcg, but it's not actually
- 	 * charged to the res_counter since we plan on replacing the
- 	 * old one and only one page is going to be left afterwards.
- 	 */
- 	__mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
- }
- 
- /* remove redundant charge if migration failed*/
- void mem_cgroup_end_migration(struct mem_cgroup *memcg,
- 	struct page *oldpage, struct page *newpage, bool migration_ok)
- {
- 	struct page *used, *unused;
- 	struct page_cgroup *pc;
- 	bool anon;
- 
- 	if (!memcg)
- 		return;
- 
- 	if (!migration_ok) {
- 		used = oldpage;
- 		unused = newpage;
- 	} else {
- 		used = newpage;
- 		unused = oldpage;
- 	}
- 	anon = PageAnon(used);
- 	__mem_cgroup_uncharge_common(unused,
- 				     anon ? MEM_CGROUP_CHARGE_TYPE_ANON
- 				     : MEM_CGROUP_CHARGE_TYPE_CACHE,
- 				     true);
- 	css_put(&memcg->css);
- 	/*
- 	 * We disallowed uncharge of pages under migration because mapcount
- 	 * of the page goes down to zero, temporarly.
- 	 * Clear the flag and check the page should be charged.
- 	 */
- 	pc = lookup_page_cgroup(oldpage);
- 	lock_page_cgroup(pc);
- 	ClearPageCgroupMigration(pc);
- 	unlock_page_cgroup(pc);
- 
- 	/*
- 	 * If a page is a file cache, radix-tree replacement is very atomic
- 	 * and we can skip this check. When it was an Anon page, its mapcount
- 	 * goes down to 0. But because we added MIGRATION flage, it's not
- 	 * uncharged yet. There are several case but page->mapcount check
- 	 * and USED bit check in mem_cgroup_uncharge_page() will do enough
- 	 * check. (see prepare_charge() also)
- 	 */
- 	if (anon)
- 		mem_cgroup_uncharge_page(used);
- }
- 
- /*
-  * At replace page cache, newpage is not under any memcg but it's on
-  * LRU. So, this function doesn't touch res_counter but handles LRU
-  * in correct way. Both pages are locked so we cannot race with uncharge.
-  */
- void mem_cgroup_replace_page_cache(struct page *oldpage,
- 				  struct page *newpage)
- {
- 	struct mem_cgroup *memcg = NULL;
- 	struct page_cgroup *pc;
- 	enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
- 
- 	if (mem_cgroup_disabled())
- 		return;
- 
- 	pc = lookup_page_cgroup(oldpage);
- 	/* fix accounting on old pages */
- 	lock_page_cgroup(pc);
- 	if (PageCgroupUsed(pc)) {
- 		memcg = pc->mem_cgroup;
- 		mem_cgroup_charge_statistics(memcg, oldpage, false, -1);
- 		ClearPageCgroupUsed(pc);
- 	}
- 	unlock_page_cgroup(pc);
- 
- 	/*
- 	 * When called from shmem_replace_page(), in some cases the
- 	 * oldpage has already been charged, and in some cases not.
- 	 */
- 	if (!memcg)
- 		return;
- 	/*
- 	 * Even if newpage->mapping was NULL before starting replacement,
- 	 * the newpage may be on LRU(or pagevec for LRU) already. We lock
- 	 * LRU while we overwrite pc->mem_cgroup.
- 	 */
- 	__mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
- }
- 
  #ifdef CONFIG_DEBUG_VM
  static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
  {
@@@ -4550,7 -3860,7 +3860,7 @@@ unsigned long mem_cgroup_soft_limit_rec
    					    gfp_mask, &nr_scanned);
    	nr_reclaimed += reclaimed;
    	*total_scanned += nr_scanned;
- 		spin_lock(&mctz->lock);
+ 		spin_lock_irq(&mctz->lock);
/*
    	 * If we failed to reclaim anything from this memory cgroup
@@@ -4590,7 -3900,7 +3900,7 @@@
    	 */
    	/* If excess == 0, no tree ops */
    	__mem_cgroup_insert_exceeded(mz, mctz, excess);
- 		spin_unlock(&mctz->lock);
+ 		spin_unlock_irq(&mctz->lock);
    	css_put(&mz->memcg->css);
    	loop++;
    	/*
@@@ -4809,86 -4119,32 +4119,32 @@@ static int mem_cgroup_hierarchy_write(s
    	else
    		retval = -EBUSY;
    } else
- 		retval = -EINVAL;
- 
- out:
- 	mutex_unlock(&memcg_create_mutex);
- 
- 	return retval;
- }
- 
- 
- static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
- 					       enum mem_cgroup_stat_index idx)
- {
- 	struct mem_cgroup *iter;
- 	long val = 0;
- 
- 	/* Per-cpu values can be negative, use a signed accumulator */
- 	for_each_mem_cgroup_tree(iter, memcg)
- 		val += mem_cgroup_read_stat(iter, idx);
- 
- 	if (val < 0) /* race ? */
- 		val = 0;
- 	return val;
- }
- 
- static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
- {
- 	u64 val;
- 
- 	if (!mem_cgroup_is_root(memcg)) {
- 		if (!swap)
- 			return res_counter_read_u64(&memcg->res, RES_USAGE);
- 		else
- 			return res_counter_read_u64(&memcg->memsw, RES_USAGE);
- 	}
- 
- 	/*
- 	 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
- 	 * as well as in MEM_CGROUP_STAT_RSS_HUGE.
- 	 */
- 	val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
- 	val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
+ 		retval = -EINVAL;
- 	if (swap)
- 		val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
+ out:
+ 	mutex_unlock(&memcg_create_mutex);
- 	return val << PAGE_SHIFT;
+ 	return retval;
  }
static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
- 				   struct cftype *cft)
+ 			       struct cftype *cft)
  {
    struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- 	u64 val;
- 	int name;
- 	enum res_type type;
- 
- 	type = MEMFILE_TYPE(cft->private);
- 	name = MEMFILE_ATTR(cft->private);
+ 	enum res_type type = MEMFILE_TYPE(cft->private);
+ 	int name = MEMFILE_ATTR(cft->private);
switch (type) {
    case _MEM:
- 		if (name == RES_USAGE)
- 			val = mem_cgroup_usage(memcg, false);
- 		else
- 			val = res_counter_read_u64(&memcg->res, name);
- 		break;
+ 		return res_counter_read_u64(&memcg->res, name);
    case _MEMSWAP:
- 		if (name == RES_USAGE)
- 			val = mem_cgroup_usage(memcg, true);
- 		else
- 			val = res_counter_read_u64(&memcg->memsw, name);
- 		break;
+ 		return res_counter_read_u64(&memcg->memsw, name);
    case _KMEM:
- 		val = res_counter_read_u64(&memcg->kmem, name);
+ 		return res_counter_read_u64(&memcg->kmem, name);
    	break;
    default:
    	BUG();
    }
- 
- 	return val;
  }
#ifdef CONFIG_MEMCG_KMEM
@@@ -5350,7 -4606,10 +4606,10 @@@ static void __mem_cgroup_threshold(stru
    if (!t)
    	goto unlock;
- 	usage = mem_cgroup_usage(memcg, swap);
+ 	if (!swap)
+ 		usage = res_counter_read_u64(&memcg->res, RES_USAGE);
+ 	else
+ 		usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
/*
     * current_threshold points to threshold just below or equal to usage.
@@@ -5442,15 -4701,15 +4701,15 @@@ static int __mem_cgroup_usage_register_
mutex_lock(&memcg->thresholds_lock);
- 	if (type == _MEM)
+ 	if (type == _MEM) {
    	thresholds = &memcg->thresholds;
- 	else if (type == _MEMSWAP)
+ 		usage = res_counter_read_u64(&memcg->res, RES_USAGE);
+ 	} else if (type == _MEMSWAP) {
    	thresholds = &memcg->memsw_thresholds;
- 	else
+ 		usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+ 	} else
    	BUG();
- 	usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
- 
    /* Check if a threshold crossed before adding a new one */
    if (thresholds->primary)
    	__mem_cgroup_threshold(memcg, type == _MEMSWAP);
@@@ -5530,18 -4789,19 +4789,19 @@@ static void __mem_cgroup_usage_unregist
    int i, j, size;
mutex_lock(&memcg->thresholds_lock);
- 	if (type == _MEM)
+ 
+ 	if (type == _MEM) {
    	thresholds = &memcg->thresholds;
- 	else if (type == _MEMSWAP)
+ 		usage = res_counter_read_u64(&memcg->res, RES_USAGE);
+ 	} else if (type == _MEMSWAP) {
    	thresholds = &memcg->memsw_thresholds;
- 	else
+ 		usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+ 	} else
    	BUG();
if (!thresholds->primary)
    	goto unlock;
- 	usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
- 
    /* Check if a threshold crossed before removing */
    __mem_cgroup_threshold(memcg, type == _MEMSWAP);
@@@ -6003,6 -5263,7 +5263,6 @@@ static struct cftype mem_cgroup_files[
    },
    {
    	.name = "use_hierarchy",
 -		.flags = CFTYPE_INSANE,
    	.write_u64 = mem_cgroup_hierarchy_write,
    	.read_u64 = mem_cgroup_hierarchy_read,
    },
@@@ -6295,9 -5556,9 +5555,9 @@@ mem_cgroup_css_online(struct cgroup_sub
    	 * core guarantees its existence.
    	 */
    } else {
- 		res_counter_init(&memcg->res, NULL);
- 		res_counter_init(&memcg->memsw, NULL);
- 		res_counter_init(&memcg->kmem, NULL);
+ 		res_counter_init(&memcg->res, &root_mem_cgroup->res);
+ 		res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw);
+ 		res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
    	/*
    	 * Deeper hierachy with use_hierarchy == false doesn't make
    	 * much sense so let cgroup subsystem know about this
@@@ -6406,80 -5667,40 +5666,63 @@@ static void mem_cgroup_css_free(struct 
    __mem_cgroup_free(memcg);
  }
+/**
 + * mem_cgroup_css_reset - reset the states of a mem_cgroup
 + * @css: the target css
 + *
 + * Reset the states of the mem_cgroup associated with @css.  This is
 + * invoked when the userland requests disabling on the default hierarchy
 + * but the memcg is pinned through dependency.  The memcg should stop
 + * applying policies and should revert to the vanilla state as it may be
 + * made visible again.
 + *
 + * The current implementation only resets the essential configurations.
 + * This needs to be expanded to cover all the visible parts.
 + */
 +static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
 +{
 +	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 +
 +	mem_cgroup_resize_limit(memcg, ULLONG_MAX);
 +	mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX);
 +	memcg_update_kmem_limit(memcg, ULLONG_MAX);
 +	res_counter_set_soft_limit(&memcg->res, ULLONG_MAX);
 +}
 +
  #ifdef CONFIG_MMU
  /* Handlers for move charge at task migration. */
- #define PRECHARGE_COUNT_AT_ONCE	256
  static int mem_cgroup_do_precharge(unsigned long count)
  {
- 	int ret = 0;
- 	int batch_count = PRECHARGE_COUNT_AT_ONCE;
- 	struct mem_cgroup *memcg = mc.to;
+ 	int ret;
- 	if (mem_cgroup_is_root(memcg)) {
+ 	/* Try a single bulk charge without reclaim first */
+ 	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count);
+ 	if (!ret) {
    	mc.precharge += count;
- 		/* we don't need css_get for root */
    	return ret;
    }
- 	/* try to charge at once */
- 	if (count > 1) {
- 		struct res_counter *dummy;
- 		/*
- 		 * "memcg" cannot be under rmdir() because we've already checked
- 		 * by cgroup_lock_live_cgroup() that it is not removed and we
- 		 * are still under the same cgroup_mutex. So we can postpone
- 		 * css_get().
- 		 */
- 		if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
- 			goto one_by_one;
- 		if (do_swap_account && res_counter_charge(&memcg->memsw,
- 						PAGE_SIZE * count, &dummy)) {
- 			res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
- 			goto one_by_one;
- 		}
- 		mc.precharge += count;
+ 	if (ret == -EINTR) {
+ 		cancel_charge(root_mem_cgroup, count);
    	return ret;
    }
- one_by_one:
- 	/* fall back to one by one charge */
+ 
+ 	/* Try charges one by one with reclaim */
    while (count--) {
- 		if (signal_pending(current)) {
- 			ret = -EINTR;
- 			break;
- 		}
- 		if (!batch_count--) {
- 			batch_count = PRECHARGE_COUNT_AT_ONCE;
- 			cond_resched();
- 		}
- 		ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false);
+ 		ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
+ 		/*
+ 		 * In case of failure, any residual charges against
+ 		 * mc.to will be dropped by mem_cgroup_clear_mc()
+ 		 * later on.  However, cancel any charges that are
+ 		 * bypassed to root right away or they'll be lost.
+ 		 */
+ 		if (ret == -EINTR)
+ 			cancel_charge(root_mem_cgroup, 1);
    	if (ret)
- 			/* mem_cgroup_clear_mc() will do uncharge later */
    		return ret;
    	mc.precharge++;
+ 		cond_resched();
    }
- 	return ret;
+ 	return 0;
  }
/**
@@@ -6615,9 -5836,9 +5858,9 @@@ static enum mc_target_type get_mctgt_ty
    if (page) {
    	pc = lookup_page_cgroup(page);
    	/*
- 		 * Do only loose check w/o page_cgroup lock.
- 		 * mem_cgroup_move_account() checks the pc is valid or not under
- 		 * the lock.
+ 		 * Do only loose check w/o serialization.
+ 		 * mem_cgroup_move_account() checks the pc is valid or
+ 		 * not under LRU exclusion.
    	 */
    	if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
    		ret = MC_TARGET_PAGE;
@@@ -6742,7 -5963,7 +5985,7 @@@ static void __mem_cgroup_clear_mc(void
/* we must uncharge all the leftover precharges from mc.to */
    if (mc.precharge) {
- 		__mem_cgroup_cancel_charge(mc.to, mc.precharge);
+ 		cancel_charge(mc.to, mc.precharge);
    	mc.precharge = 0;
    }
    /*
@@@ -6750,27 -5971,24 +5993,24 @@@
     * we must uncharge here.
     */
    if (mc.moved_charge) {
- 		__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
+ 		cancel_charge(mc.from, mc.moved_charge);
    	mc.moved_charge = 0;
    }
    /* we must fixup refcnts and charges */
    if (mc.moved_swap) {
    	/* uncharge swap account from the old cgroup */
- 		if (!mem_cgroup_is_root(mc.from))
- 			res_counter_uncharge(&mc.from->memsw,
- 						PAGE_SIZE * mc.moved_swap);
+ 		res_counter_uncharge(&mc.from->memsw,
+ 				     PAGE_SIZE * mc.moved_swap);
for (i = 0; i < mc.moved_swap; i++)
    		css_put(&mc.from->css);
- 		if (!mem_cgroup_is_root(mc.to)) {
- 			/*
- 			 * we charged both to->res and to->memsw, so we should
- 			 * uncharge to->res.
- 			 */
- 			res_counter_uncharge(&mc.to->res,
- 						PAGE_SIZE * mc.moved_swap);
- 		}
+ 		/*
+ 		 * we charged both to->res and to->memsw, so we should
+ 		 * uncharge to->res.
+ 		 */
+ 		res_counter_uncharge(&mc.to->res,
+ 				     PAGE_SIZE * mc.moved_swap);
    	/* we've already done css_get(mc.to) */
    	mc.moved_swap = 0;
    }
@@@ -7023,17 -6241,16 +6263,17 @@@ static void mem_cgroup_move_task(struc
/*
   * Cgroup retains root cgroups across [un]mount cycles making it necessary
 - * to verify sane_behavior flag on each mount attempt.
 + * to verify whether we're attached to the default hierarchy on each mount
 + * attempt.
   */
  static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
  {
    /*
 -	 * use_hierarchy is forced with sane_behavior.  cgroup core
 +	 * use_hierarchy is forced on the default hierarchy.  cgroup core
     * guarantees that @root doesn't have any children, so turning it
     * on for the root memcg is enough.
     */
 -	if (cgroup_sane_behavior(root_css->cgroup))
 +	if (cgroup_on_dfl(root_css->cgroup))
    	mem_cgroup_from_css(root_css)->use_hierarchy = true;
  }
@@@ -7042,12 -6259,11 +6282,12 @@@ struct cgroup_subsys memory_cgrp_subsy
    .css_online = mem_cgroup_css_online,
    .css_offline = mem_cgroup_css_offline,
    .css_free = mem_cgroup_css_free,
 +	.css_reset = mem_cgroup_css_reset,
    .can_attach = mem_cgroup_can_attach,
    .cancel_attach = mem_cgroup_cancel_attach,
    .attach = mem_cgroup_move_task,
    .bind = mem_cgroup_bind,
 -	.base_cftypes = mem_cgroup_files,
 +	.legacy_cftypes = mem_cgroup_files,
    .early_init = 0,
  };
@@@ -7064,8 -6280,7 +6304,8 @@@ __setup("swapaccount=", enable_swap_acc
static void __init memsw_file_init(void)
  {
 -	WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files));
 +	WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
 +					  memsw_cgroup_files));
  }
static void __init enable_swap_cgroup(void)
@@@ -7082,6 -6297,397 +6322,397 @@@ static void __init enable_swap_cgroup(v
  }
  #endif
+ #ifdef CONFIG_MEMCG_SWAP
+ /**
+  * mem_cgroup_swapout - transfer a memsw charge to swap
+  * @page: page whose memsw charge to transfer
+  * @entry: swap entry to move the charge to
+  *
+  * Transfer the memsw charge of @page to @entry.
+  */
+ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
+ {
+ 	struct page_cgroup *pc;
+ 	unsigned short oldid;
+ 
+ 	VM_BUG_ON_PAGE(PageLRU(page), page);
+ 	VM_BUG_ON_PAGE(page_count(page), page);
+ 
+ 	if (!do_swap_account)
+ 		return;
+ 
+ 	pc = lookup_page_cgroup(page);
+ 
+ 	/* Readahead page, never charged */
+ 	if (!PageCgroupUsed(pc))
+ 		return;
+ 
+ 	VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page);
+ 
+ 	oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup));
+ 	VM_BUG_ON_PAGE(oldid, page);
+ 
+ 	pc->flags &= ~PCG_MEMSW;
+ 	css_get(&pc->mem_cgroup->css);
+ 	mem_cgroup_swap_statistics(pc->mem_cgroup, true);
+ }
+ 
+ /**
+  * mem_cgroup_uncharge_swap - uncharge a swap entry
+  * @entry: swap entry to uncharge
+  *
+  * Drop the memsw charge associated with @entry.
+  */
+ void mem_cgroup_uncharge_swap(swp_entry_t entry)
+ {
+ 	struct mem_cgroup *memcg;
+ 	unsigned short id;
+ 
+ 	if (!do_swap_account)
+ 		return;
+ 
+ 	id = swap_cgroup_record(entry, 0);
+ 	rcu_read_lock();
+ 	memcg = mem_cgroup_lookup(id);
+ 	if (memcg) {
+ 		res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+ 		mem_cgroup_swap_statistics(memcg, false);
+ 		css_put(&memcg->css);
+ 	}
+ 	rcu_read_unlock();
+ }
+ #endif
+ 
+ /**
+  * mem_cgroup_try_charge - try charging a page
+  * @page: page to charge
+  * @mm: mm context of the victim
+  * @gfp_mask: reclaim mode
+  * @memcgp: charged memcg return
+  *
+  * Try to charge @page to the memcg that @mm belongs to, reclaiming
+  * pages according to @gfp_mask if necessary.
+  *
+  * Returns 0 on success, with *@memcgp pointing to the charged memcg.
+  * Otherwise, an error code is returned.
+  *
+  * After page->mapping has been set up, the caller must finalize the
+  * charge with mem_cgroup_commit_charge().  Or abort the transaction
+  * with mem_cgroup_cancel_charge() in case page instantiation fails.
+  */
+ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
+ 			  gfp_t gfp_mask, struct mem_cgroup **memcgp)
+ {
+ 	struct mem_cgroup *memcg = NULL;
+ 	unsigned int nr_pages = 1;
+ 	int ret = 0;
+ 
+ 	if (mem_cgroup_disabled())
+ 		goto out;
+ 
+ 	if (PageSwapCache(page)) {
+ 		struct page_cgroup *pc = lookup_page_cgroup(page);
+ 		/*
+ 		 * Every swap fault against a single page tries to charge the
+ 		 * page, bail as early as possible.  shmem_unuse() encounters
+ 		 * already charged pages, too.  The USED bit is protected by
+ 		 * the page lock, which serializes swap cache removal, which
+ 		 * in turn serializes uncharging.
+ 		 */
+ 		if (PageCgroupUsed(pc))
+ 			goto out;
+ 	}
+ 
+ 	if (PageTransHuge(page)) {
+ 		nr_pages <<= compound_order(page);
+ 		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ 	}
+ 
+ 	if (do_swap_account && PageSwapCache(page))
+ 		memcg = try_get_mem_cgroup_from_page(page);
+ 	if (!memcg)
+ 		memcg = get_mem_cgroup_from_mm(mm);
+ 
+ 	ret = try_charge(memcg, gfp_mask, nr_pages);
+ 
+ 	css_put(&memcg->css);
+ 
+ 	if (ret == -EINTR) {
+ 		memcg = root_mem_cgroup;
+ 		ret = 0;
+ 	}
+ out:
+ 	*memcgp = memcg;
+ 	return ret;
+ }
+ 
+ /**
+  * mem_cgroup_commit_charge - commit a page charge
+  * @page: page to charge
+  * @memcg: memcg to charge the page to
+  * @lrucare: page might be on LRU already
+  *
+  * Finalize a charge transaction started by mem_cgroup_try_charge(),
+  * after page->mapping has been set up.  This must happen atomically
+  * as part of the page instantiation, i.e. under the page table lock
+  * for anonymous pages, under the page lock for page and swap cache.
+  *
+  * In addition, the page must not be on the LRU during the commit, to
+  * prevent racing with task migration.  If it might be, use @lrucare.
+  *
+  * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
+  */
+ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
+ 			      bool lrucare)
+ {
+ 	unsigned int nr_pages = 1;
+ 
+ 	VM_BUG_ON_PAGE(!page->mapping, page);
+ 	VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
+ 
+ 	if (mem_cgroup_disabled())
+ 		return;
+ 	/*
+ 	 * Swap faults will attempt to charge the same page multiple
+ 	 * times.  But reuse_swap_page() might have removed the page
+ 	 * from swapcache already, so we can't check PageSwapCache().
+ 	 */
+ 	if (!memcg)
+ 		return;
+ 
+ 	if (PageTransHuge(page)) {
+ 		nr_pages <<= compound_order(page);
+ 		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ 	}
+ 
+ 	commit_charge(page, memcg, nr_pages, lrucare);
+ 
+ 	if (do_swap_account && PageSwapCache(page)) {
+ 		swp_entry_t entry = { .val = page_private(page) };
+ 		/*
+ 		 * The swap entry might not get freed for a long time,
+ 		 * let's not wait for it.  The page already received a
+ 		 * memory+swap charge, drop the swap entry duplicate.
+ 		 */
+ 		mem_cgroup_uncharge_swap(entry);
+ 	}
+ }
+ 
+ /**
+  * mem_cgroup_cancel_charge - cancel a page charge
+  * @page: page whose pending charge is cancelled
+  * @memcg: memcg the page was tentatively charged to
+  *
+  * Cancel a charge transaction started by mem_cgroup_try_charge().
+  */
+ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
+ {
+ 	unsigned int nr_pages = 1;
+ 
+ 	if (mem_cgroup_disabled())
+ 		return;
+ 	/*
+ 	 * Swap faults will attempt to charge the same page multiple
+ 	 * times.  But reuse_swap_page() might have removed the page
+ 	 * from swapcache already, so we can't check PageSwapCache().
+ 	 */
+ 	if (!memcg)
+ 		return;
+ 
+ 	if (PageTransHuge(page)) {
+ 		nr_pages <<= compound_order(page);
+ 		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ 	}
+ 
+ 	cancel_charge(memcg, nr_pages);
+ }
+ 
+ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
+ 			   unsigned long nr_mem, unsigned long nr_memsw,
+ 			   unsigned long nr_anon, unsigned long nr_file,
+ 			   unsigned long nr_huge, struct page *dummy_page)
+ {
+ 	unsigned long flags;
+ 
+ 	if (nr_mem)
+ 		res_counter_uncharge(&memcg->res, nr_mem * PAGE_SIZE);
+ 	if (nr_memsw)
+ 		res_counter_uncharge(&memcg->memsw, nr_memsw * PAGE_SIZE);
+ 
+ 	memcg_oom_recover(memcg);
+ 
+ 	local_irq_save(flags);
+ 	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
+ 	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
+ 	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
+ 	__this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
+ 	__this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file);
+ 	memcg_check_events(memcg, dummy_page);
+ 	local_irq_restore(flags);
+ }
+ 
+ static void uncharge_list(struct list_head *page_list)
+ {
+ 	struct mem_cgroup *memcg = NULL;
+ 	unsigned long nr_memsw = 0;
+ 	unsigned long nr_anon = 0;
+ 	unsigned long nr_file = 0;
+ 	unsigned long nr_huge = 0;
+ 	unsigned long pgpgout = 0;
+ 	unsigned long nr_mem = 0;
+ 	struct list_head *next;
+ 	struct page *page;
+ 
+ 	next = page_list->next;
+ 	do {
+ 		unsigned int nr_pages = 1;
+ 		struct page_cgroup *pc;
+ 
+ 		page = list_entry(next, struct page, lru);
+ 		next = page->lru.next;
+ 
+ 		VM_BUG_ON_PAGE(PageLRU(page), page);
+ 		VM_BUG_ON_PAGE(page_count(page), page);
+ 
+ 		pc = lookup_page_cgroup(page);
+ 		if (!PageCgroupUsed(pc))
+ 			continue;
+ 
+ 		/*
+ 		 * Nobody should be changing or seriously looking at
+ 		 * pc->mem_cgroup and pc->flags at this point, we have
+ 		 * fully exclusive access to the page.
+ 		 */
+ 
+ 		if (memcg != pc->mem_cgroup) {
+ 			if (memcg) {
+ 				uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw,
+ 					       nr_anon, nr_file, nr_huge, page);
+ 				pgpgout = nr_mem = nr_memsw = 0;
+ 				nr_anon = nr_file = nr_huge = 0;
+ 			}
+ 			memcg = pc->mem_cgroup;
+ 		}
+ 
+ 		if (PageTransHuge(page)) {
+ 			nr_pages <<= compound_order(page);
+ 			VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ 			nr_huge += nr_pages;
+ 		}
+ 
+ 		if (PageAnon(page))
+ 			nr_anon += nr_pages;
+ 		else
+ 			nr_file += nr_pages;
+ 
+ 		if (pc->flags & PCG_MEM)
+ 			nr_mem += nr_pages;
+ 		if (pc->flags & PCG_MEMSW)
+ 			nr_memsw += nr_pages;
+ 		pc->flags = 0;
+ 
+ 		pgpgout++;
+ 	} while (next != page_list);
+ 
+ 	if (memcg)
+ 		uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw,
+ 			       nr_anon, nr_file, nr_huge, page);
+ }
+ 
+ /**
+  * mem_cgroup_uncharge - uncharge a page
+  * @page: page to uncharge
+  *
+  * Uncharge a page previously charged with mem_cgroup_try_charge() and
+  * mem_cgroup_commit_charge().
+  */
+ void mem_cgroup_uncharge(struct page *page)
+ {
+ 	struct page_cgroup *pc;
+ 
+ 	if (mem_cgroup_disabled())
+ 		return;
+ 
+ 	/* Don't touch page->lru of any random page, pre-check: */
+ 	pc = lookup_page_cgroup(page);
+ 	if (!PageCgroupUsed(pc))
+ 		return;
+ 
+ 	INIT_LIST_HEAD(&page->lru);
+ 	uncharge_list(&page->lru);
+ }
+ 
+ /**
+  * mem_cgroup_uncharge_list - uncharge a list of pages
+  * @page_list: list of pages to uncharge
+  *
+  * Uncharge a list of pages previously charged with
+  * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
+  */
+ void mem_cgroup_uncharge_list(struct list_head *page_list)
+ {
+ 	if (mem_cgroup_disabled())
+ 		return;
+ 
+ 	if (!list_empty(page_list))
+ 		uncharge_list(page_list);
+ }
+ 
+ /**
+  * mem_cgroup_migrate - migrate a charge to another page
+  * @oldpage: currently charged page
+  * @newpage: page to transfer the charge to
+  * @lrucare: both pages might be on the LRU already
+  *
+  * Migrate the charge from @oldpage to @newpage.
+  *
+  * Both pages must be locked, @newpage->mapping must be set up.
+  */
+ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
+ 			bool lrucare)
+ {
+ 	unsigned int nr_pages = 1;
+ 	struct page_cgroup *pc;
+ 	int isolated;
+ 
+ 	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
+ 	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
+ 	VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage);
+ 	VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage);
+ 	VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
+ 
+ 	if (mem_cgroup_disabled())
+ 		return;
+ 
+ 	pc = lookup_page_cgroup(oldpage);
+ 	if (!PageCgroupUsed(pc))
+ 		return;
+ 
+ 	VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage);
+ 	VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage);
+ 
+ 	if (PageTransHuge(oldpage)) {
+ 		nr_pages <<= compound_order(oldpage);
+ 		VM_BUG_ON_PAGE(!PageTransHuge(oldpage), oldpage);
+ 		VM_BUG_ON_PAGE(!PageTransHuge(newpage), newpage);
+ 	}
+ 
+ 	if (lrucare)
+ 		lock_page_lru(oldpage, &isolated);
+ 
+ 	pc->flags = 0;
+ 
+ 	if (lrucare)
+ 		unlock_page_lru(oldpage, isolated);
+ 
+ 	local_irq_disable();
+ 	mem_cgroup_charge_statistics(pc->mem_cgroup, oldpage, -nr_pages);
+ 	memcg_check_events(pc->mem_cgroup, oldpage);
+ 	local_irq_enable();
+ 
+ 	commit_charge(newpage, pc->mem_cgroup, nr_pages, lrucare);
+ }
+ 
  /*
   * subsys_initcall() for memory controller.
   *
diff --combined mm/shmem.c
index af68b15,b16d3e7..6dc80d2
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@@ -149,6 -149,19 +149,19 @@@ static inline void shmem_unacct_size(un
    	vm_unacct_memory(VM_ACCT(size));
  }
+ static inline int shmem_reacct_size(unsigned long flags,
+ 		loff_t oldsize, loff_t newsize)
+ {
+ 	if (!(flags & VM_NORESERVE)) {
+ 		if (VM_ACCT(newsize) > VM_ACCT(oldsize))
+ 			return security_vm_enough_memory_mm(current->mm,
+ 					VM_ACCT(newsize) - VM_ACCT(oldsize));
+ 		else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
+ 			vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
+ 	}
+ 	return 0;
+ }
+ 
  /*
   * ... whereas tmpfs objects are accounted incrementally as
   * pages are allocated, in order to allow huge sparse files.
@@@ -280,7 -293,7 +293,7 @@@ static bool shmem_confirm_swap(struct a
   */
  static int shmem_add_to_page_cache(struct page *page,
    			   struct address_space *mapping,
- 				   pgoff_t index, gfp_t gfp, void *expected)
+ 				   pgoff_t index, void *expected)
  {
    int error;
@@@ -406,7 -419,6 +419,6 @@@ static void shmem_undo_range(struct ino
    		pvec.pages, indices);
    	if (!pvec.nr)
    		break;
- 		mem_cgroup_uncharge_start();
    	for (i = 0; i < pagevec_count(&pvec); i++) {
    		struct page *page = pvec.pages[i];
@@@ -434,7 -446,6 +446,6 @@@
    	}
    	pagevec_remove_exceptionals(&pvec);
    	pagevec_release(&pvec);
- 		mem_cgroup_uncharge_end();
    	cond_resched();
    	index++;
    }
@@@ -482,7 -493,6 +493,6 @@@
    		index = start;
    		continue;
    	}
- 		mem_cgroup_uncharge_start();
    	for (i = 0; i < pagevec_count(&pvec); i++) {
    		struct page *page = pvec.pages[i];
@@@ -518,7 -528,6 +528,6 @@@
    	}
    	pagevec_remove_exceptionals(&pvec);
    	pagevec_release(&pvec);
- 		mem_cgroup_uncharge_end();
    	index++;
    }
@@@ -549,6 -558,10 +558,10 @@@ static int shmem_setattr(struct dentry 
    	loff_t newsize = attr->ia_size;
if (newsize != oldsize) {
+ 			error = shmem_reacct_size(SHMEM_I(inode)->flags,
+ 					oldsize, newsize);
+ 			if (error)
+ 				return error;
    		i_size_write(inode, newsize);
    		inode->i_ctime = inode->i_mtime = CURRENT_TIME;
    	}
@@@ -604,7 -617,7 +617,7 @@@ static int shmem_unuse_inode(struct shm
    radswap = swp_to_radix_entry(swap);
    index = radix_tree_locate_item(&mapping->page_tree, radswap);
    if (index == -1)
- 		return 0;
+ 		return -EAGAIN;	/* tell shmem_unuse we found nothing */
/*
     * Move _head_ to start search for next from here.
@@@ -649,7 -662,7 +662,7 @@@
     */
    if (!error)
    	error = shmem_add_to_page_cache(*pagep, mapping, index,
- 						GFP_NOWAIT, radswap);
+ 						radswap);
    if (error != -ENOMEM) {
    	/*
    	 * Truncation and eviction use free_swap_and_cache(), which
@@@ -663,7 -676,6 +676,6 @@@
    		spin_unlock(&info->lock);
    		swap_free(swap);
    	}
- 		error = 1;	/* not an error, but entry was found */
    }
    return error;
  }
@@@ -675,7 -687,7 +687,7 @@@ int shmem_unuse(swp_entry_t swap, struc
  {
    struct list_head *this, *next;
    struct shmem_inode_info *info;
- 	int found = 0;
+ 	struct mem_cgroup *memcg;
    int error = 0;
/*
@@@ -690,26 -702,32 +702,32 @@@
     * the shmem_swaplist_mutex which might hold up shmem_writepage().
     * Charged back to the user (not to caller) when swap account is used.
     */
- 	error = mem_cgroup_charge_file(page, current->mm, GFP_KERNEL);
+ 	error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg);
    if (error)
    	goto out;
    /* No radix_tree_preload: swap entry keeps a place for page in tree */
+ 	error = -EAGAIN;
mutex_lock(&shmem_swaplist_mutex);
    list_for_each_safe(this, next, &shmem_swaplist) {
    	info = list_entry(this, struct shmem_inode_info, swaplist);
    	if (info->swapped)
- 			found = shmem_unuse_inode(info, swap, &page);
+ 			error = shmem_unuse_inode(info, swap, &page);
    	else
    		list_del_init(&info->swaplist);
    	cond_resched();
- 		if (found)
+ 		if (error != -EAGAIN)
    		break;
+ 		/* found nothing in this: move on to search the next */
    }
    mutex_unlock(&shmem_swaplist_mutex);
- 	if (found < 0)
- 		error = found;
+ 	if (error) {
+ 		if (error != -ENOMEM)
+ 			error = 0;
+ 		mem_cgroup_cancel_charge(page, memcg);
+ 	} else
+ 		mem_cgroup_commit_charge(page, memcg, true);
  out:
    unlock_page(page);
    page_cache_release(page);
@@@ -813,7 -831,7 +831,7 @@@ static int shmem_writepage(struct page 
    }
mutex_unlock(&shmem_swaplist_mutex);
- 	swapcache_free(swap, NULL);
+ 	swapcache_free(swap);
  redirty:
    set_page_dirty(page);
    if (wbc->for_reclaim)
@@@ -986,7 -1004,7 +1004,7 @@@ static int shmem_replace_page(struct pa
    	 */
    	oldpage = newpage;
    } else {
- 		mem_cgroup_replace_page_cache(oldpage, newpage);
+ 		mem_cgroup_migrate(oldpage, newpage, false);
    	lru_cache_add_anon(newpage);
    	*pagep = newpage;
    }
@@@ -1013,6 -1031,7 +1031,7 @@@ static int shmem_getpage_gfp(struct ino
    struct address_space *mapping = inode->i_mapping;
    struct shmem_inode_info *info;
    struct shmem_sb_info *sbinfo;
+ 	struct mem_cgroup *memcg;
    struct page *page;
    swp_entry_t swap;
    int error;
@@@ -1091,11 -1110,10 +1110,10 @@@ repeat
    			goto failed;
    	}
- 		error = mem_cgroup_charge_file(page, current->mm,
- 						gfp & GFP_RECLAIM_MASK);
+ 		error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
    	if (!error) {
    		error = shmem_add_to_page_cache(page, mapping, index,
- 						gfp, swp_to_radix_entry(swap));
+ 						swp_to_radix_entry(swap));
    		/*
    		 * We already confirmed swap under page lock, and make
    		 * no memory allocation here, so usually no possibility
@@@ -1108,12 -1126,16 +1126,16 @@@
    		 * Reset swap.val? No, leave it so "failed" goes back to
    		 * "repeat": reading a hole and writing should succeed.
    		 */
- 			if (error)
+ 			if (error) {
+ 				mem_cgroup_cancel_charge(page, memcg);
    			delete_from_swap_cache(page);
+ 			}
    	}
    	if (error)
    		goto failed;
+ 		mem_cgroup_commit_charge(page, memcg, true);
+ 
    	spin_lock(&info->lock);
    	info->swapped--;
    	shmem_recalc_inode(inode);
@@@ -1149,22 -1171,22 +1171,22 @@@
    	__SetPageSwapBacked(page);
    	__set_page_locked(page);
    	if (sgp == SGP_WRITE)
- 			init_page_accessed(page);
+ 			__SetPageReferenced(page);
- 		error = mem_cgroup_charge_file(page, current->mm,
- 						gfp & GFP_RECLAIM_MASK);
+ 		error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
    	if (error)
    		goto decused;
    	error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
    	if (!error) {
    		error = shmem_add_to_page_cache(page, mapping, index,
- 							gfp, NULL);
+ 							NULL);
    		radix_tree_preload_end();
    	}
    	if (error) {
- 			mem_cgroup_uncharge_cache_page(page);
+ 			mem_cgroup_cancel_charge(page, memcg);
    		goto decused;
    	}
+ 		mem_cgroup_commit_charge(page, memcg, false);
    	lru_cache_add_anon(page);
spin_lock(&info->lock);
@@@ -1289,7 -1311,7 +1311,7 @@@ static int shmem_fault(struct vm_area_s
shmem_falloc_waitq = shmem_falloc->waitq;
    		prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
 -					TASK_KILLABLE);
 +					TASK_UNINTERRUPTIBLE);
    		spin_unlock(&inode->i_lock);
    		schedule();
@@@ -2932,16 -2954,16 +2954,16 @@@ static struct file *__shmem_file_setup(
    this.len = strlen(name);
    this.hash = 0; /* will go */
    sb = shm_mnt->mnt_sb;
+ 	path.mnt = mntget(shm_mnt);
    path.dentry = d_alloc_pseudo(sb, &this);
    if (!path.dentry)
    	goto put_memory;
    d_set_d_op(path.dentry, &anon_ops);
- 	path.mnt = mntget(shm_mnt);
res = ERR_PTR(-ENOSPC);
    inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
    if (!inode)
- 		goto put_dentry;
+ 		goto put_memory;
inode->i_flags |= i_flags;
    d_instantiate(path.dentry, inode);
@@@ -2949,19 -2971,19 +2971,19 @@@
    clear_nlink(inode);	/* It is unlinked */
    res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
    if (IS_ERR(res))
- 		goto put_dentry;
+ 		goto put_path;
res = alloc_file(&path, FMODE_WRITE | FMODE_READ,
    	  &shmem_file_operations);
    if (IS_ERR(res))
- 		goto put_dentry;
+ 		goto put_path;
return res;
- put_dentry:
- 	path_put(&path);
  put_memory:
    shmem_unacct_size(flags, size);
+ put_path:
+ 	path_put(&path);
    return res;
  }
diff --combined mm/slab_common.c
index d31c4ba,8b711f5..d80ec43
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@@ -19,6 -19,8 +19,8 @@@
  #include <asm/tlbflush.h>
  #include <asm/page.h>
  #include <linux/memcontrol.h>
+ 
+ #define CREATE_TRACE_POINTS
  #include <trace/events/kmem.h>
#include "slab.h"
@@@ -55,7 -57,7 +57,7 @@@ static int kmem_cache_sanity_check(cons
    		continue;
    	}
-#if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON)
 +#if !defined(CONFIG_SLUB)
    	if (!strcmp(s->name, name)) {
    		pr_err("%s (%s): Cache name already exists.\n",
    		       __func__, name);
@@@ -264,7 -266,7 +266,7 @@@ EXPORT_SYMBOL(kmem_cache_create)
   * memcg_create_kmem_cache - Create a cache for a memory cgroup.
   * @memcg: The memory cgroup the new cache is for.
   * @root_cache: The parent of the new cache.
-  * @memcg_name: The name of the memory cgroup (used for naming the new cache).
+  * @cache_name: The string to be used as the new cache name.
   *
   * This function attempts to create a kmem cache that will serve allocation
   * requests going from @memcg to @root_cache. The new cache inherits properties
@@@ -272,31 -274,25 +274,25 @@@
   */
  struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
    				   struct kmem_cache *root_cache,
- 					   const char *memcg_name)
+ 					   char *cache_name)
  {
    struct kmem_cache *s = NULL;
- 	char *cache_name;
get_online_cpus();
    get_online_mems();
mutex_lock(&slab_mutex);
- 	cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
- 			       memcg_cache_id(memcg), memcg_name);
- 	if (!cache_name)
- 		goto out_unlock;
- 
    s = do_kmem_cache_create(cache_name, root_cache->object_size,
    			 root_cache->size, root_cache->align,
    			 root_cache->flags, root_cache->ctor,
    			 memcg, root_cache);
- 	if (IS_ERR(s)) {
- 		kfree(cache_name);
+ 	if (!IS_ERR(s))
+ 		list_add(&s->memcg_params->siblings,
+ 			 &root_cache->memcg_params->children);
+ 	else
    	s = NULL;
- 	}
- out_unlock:
    mutex_unlock(&slab_mutex);
put_online_mems();
@@@ -307,17 -303,15 +303,15 @@@
static int memcg_cleanup_cache_params(struct kmem_cache *s)
  {
- 	int rc;
- 
    if (!s->memcg_params ||
        !s->memcg_params->is_root_cache)
    	return 0;
mutex_unlock(&slab_mutex);
- 	rc = __memcg_cleanup_cache_params(s);
+ 	__memcg_cleanup_cache_params(s);
    mutex_lock(&slab_mutex);
- 	return rc;
+ 	return !list_empty(&s->memcg_params->children);
  }
  #else
  static int memcg_cleanup_cache_params(struct kmem_cache *s)
@@@ -354,6 -348,10 +348,10 @@@ void kmem_cache_destroy(struct kmem_cac
    }
list_del(&s->list);
+ #ifdef CONFIG_MEMCG_KMEM
+ 	if (!is_root_cache(s))
+ 		list_del(&s->memcg_params->siblings);
+ #endif
mutex_unlock(&slab_mutex);
    if (s->flags & SLAB_DESTROY_BY_RCU)
@@@ -692,20 -690,17 +690,17 @@@ void slab_stop(struct seq_file *m, voi
  static void
  memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
  {
- 	struct kmem_cache *c;
+ #ifdef CONFIG_MEMCG_KMEM
+ 	struct memcg_cache_params *params;
    struct slabinfo sinfo;
- 	int i;
- 	if (!is_root_cache(s))
+ 	if (!s->memcg_params ||
+ 	    !s->memcg_params->is_root_cache)
    	return;
- 	for_each_memcg_cache_index(i) {
- 		c = cache_from_memcg_idx(s, i);
- 		if (!c)
- 			continue;
- 
+ 	list_for_each_entry(params, &s->memcg_params->children, siblings) {
    	memset(&sinfo, 0, sizeof(sinfo));
- 		get_slabinfo(c, &sinfo);
+ 		get_slabinfo(params->cachep, &sinfo);
info->active_slabs += sinfo.active_slabs;
    	info->num_slabs += sinfo.num_slabs;
@@@ -713,6 -708,7 +708,7 @@@
    	info->active_objs += sinfo.active_objs;
    	info->num_objs += sinfo.num_objs;
    }
+ #endif
  }
int cache_show(struct kmem_cache *s, struct seq_file *m)
@@@ -787,3 -783,102 +783,102 @@@ static int __init slab_proc_init(void
  }
  module_init(slab_proc_init);
  #endif /* CONFIG_SLABINFO */
+ 
+ static __always_inline void *__do_krealloc(const void *p, size_t new_size,
+ 					   gfp_t flags)
+ {
+ 	void *ret;
+ 	size_t ks = 0;
+ 
+ 	if (p)
+ 		ks = ksize(p);
+ 
+ 	if (ks >= new_size)
+ 		return (void *)p;
+ 
+ 	ret = kmalloc_track_caller(new_size, flags);
+ 	if (ret && p)
+ 		memcpy(ret, p, ks);
+ 
+ 	return ret;
+ }
+ 
+ /**
+  * __krealloc - like krealloc() but don't free @p.
+  * @p: object to reallocate memory for.
+  * @new_size: how many bytes of memory are required.
+  * @flags: the type of memory to allocate.
+  *
+  * This function is like krealloc() except it never frees the originally
+  * allocated buffer. Use this if you don't want to free the buffer immediately
+  * like, for example, with RCU.
+  */
+ void *__krealloc(const void *p, size_t new_size, gfp_t flags)
+ {
+ 	if (unlikely(!new_size))
+ 		return ZERO_SIZE_PTR;
+ 
+ 	return __do_krealloc(p, new_size, flags);
+ 
+ }
+ EXPORT_SYMBOL(__krealloc);
+ 
+ /**
+  * krealloc - reallocate memory. The contents will remain unchanged.
+  * @p: object to reallocate memory for.
+  * @new_size: how many bytes of memory are required.
+  * @flags: the type of memory to allocate.
+  *
+  * The contents of the object pointed to are preserved up to the
+  * lesser of the new and old sizes.  If @p is %NULL, krealloc()
+  * behaves exactly like kmalloc().  If @new_size is 0 and @p is not a
+  * %NULL pointer, the object pointed to is freed.
+  */
+ void *krealloc(const void *p, size_t new_size, gfp_t flags)
+ {
+ 	void *ret;
+ 
+ 	if (unlikely(!new_size)) {
+ 		kfree(p);
+ 		return ZERO_SIZE_PTR;
+ 	}
+ 
+ 	ret = __do_krealloc(p, new_size, flags);
+ 	if (ret && p != ret)
+ 		kfree(p);
+ 
+ 	return ret;
+ }
+ EXPORT_SYMBOL(krealloc);
+ 
+ /**
+  * kzfree - like kfree but zero memory
+  * @p: object to free memory of
+  *
+  * The memory of the object @p points to is zeroed before freed.
+  * If @p is %NULL, kzfree() does nothing.
+  *
+  * Note: this function zeroes the whole allocated buffer which can be a good
+  * deal bigger than the requested buffer size passed to kmalloc(). So be
+  * careful when using this function in performance sensitive code.
+  */
+ void kzfree(const void *p)
+ {
+ 	size_t ks;
+ 	void *mem = (void *)p;
+ 
+ 	if (unlikely(ZERO_OR_NULL_PTR(mem)))
+ 		return;
+ 	ks = ksize(mem);
+ 	memset(mem, 0, ks);
+ 	kfree(mem);
+ }
+ EXPORT_SYMBOL(kzfree);
+ 
+ /* Tracepoints definitions. */
+ EXPORT_TRACEPOINT_SYMBOL(kmalloc);
+ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
+ EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
+ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node);
+ EXPORT_TRACEPOINT_SYMBOL(kfree);
+ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
diff --combined mm/slub.c
index 8c24a23,8588e82..4114beb
--- a/mm/slub.c
+++ b/mm/slub.c
@@@ -233,11 -233,6 +233,6 @@@ static inline void stat(const struct km
   * 			Core slab cache functions
   *******************************************************************/
- static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
- {
- 	return s->node[node];
- }
- 
  /* Verify that a pointer has an address that is valid within a slab page */
  static inline int check_valid_pointer(struct kmem_cache *s,
    			struct page *page, const void *object)
@@@ -288,6 -283,10 +283,10 @@@ static inline void set_freepointer(stru
    for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
    		__p += (__s)->size)
+ #define for_each_object_idx(__p, __idx, __s, __addr, __objects) \
+ 	for (__p = (__addr), __idx = 1; __idx <= __objects;\
+ 			__p += (__s)->size, __idx++)
+ 
  /* Determine object index from a given position */
  static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
  {
@@@ -382,9 -381,9 +381,9 @@@ static inline bool __cmpxchg_double_sla
      defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
    if (s->flags & __CMPXCHG_DOUBLE) {
    	if (cmpxchg_double(&page->freelist, &page->counters,
 -			freelist_old, counters_old,
 -			freelist_new, counters_new))
 -		return 1;
 +				   freelist_old, counters_old,
 +				   freelist_new, counters_new))
 +			return 1;
    } else
  #endif
    {
@@@ -418,9 -417,9 +417,9 @@@ static inline bool cmpxchg_double_slab(
      defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
    if (s->flags & __CMPXCHG_DOUBLE) {
    	if (cmpxchg_double(&page->freelist, &page->counters,
 -			freelist_old, counters_old,
 -			freelist_new, counters_new))
 -		return 1;
 +				   freelist_old, counters_old,
 +				   freelist_new, counters_new))
 +			return 1;
    } else
  #endif
    {
@@@ -945,60 -944,6 +944,6 @@@ static void trace(struct kmem_cache *s
  }
/*
-  * Hooks for other subsystems that check memory allocations. In a typical
-  * production configuration these hooks all should produce no code at all.
-  */
- static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
- {
- 	kmemleak_alloc(ptr, size, 1, flags);
- }
- 
- static inline void kfree_hook(const void *x)
- {
- 	kmemleak_free(x);
- }
- 
- static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
- {
- 	flags &= gfp_allowed_mask;
- 	lockdep_trace_alloc(flags);
- 	might_sleep_if(flags & __GFP_WAIT);
- 
- 	return should_failslab(s->object_size, flags, s->flags);
- }
- 
- static inline void slab_post_alloc_hook(struct kmem_cache *s,
- 					gfp_t flags, void *object)
- {
- 	flags &= gfp_allowed_mask;
- 	kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
- 	kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
- }
- 
- static inline void slab_free_hook(struct kmem_cache *s, void *x)
- {
- 	kmemleak_free_recursive(x, s->flags);
- 
- 	/*
- 	 * Trouble is that we may no longer disable interrupts in the fast path
- 	 * So in order to make the debug calls that expect irqs to be
- 	 * disabled we need to disable interrupts temporarily.
- 	 */
- #if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
- 	{
- 		unsigned long flags;
- 
- 		local_irq_save(flags);
- 		kmemcheck_slab_free(s, x, s->object_size);
- 		debug_check_no_locks_freed(x, s->object_size);
- 		local_irq_restore(flags);
- 	}
- #endif
- 	if (!(s->flags & SLAB_DEBUG_OBJECTS))
- 		debug_check_no_obj_freed(x, s->object_size);
- }
- 
- /*
   * Tracking of fully allocated slabs for debugging purposes.
   */
  static void add_full(struct kmem_cache *s,
@@@ -1282,6 -1227,12 +1227,12 @@@ static inline void inc_slabs_node(struc
  static inline void dec_slabs_node(struct kmem_cache *s, int node,
    						int objects) {}
+ #endif /* CONFIG_SLUB_DEBUG */
+ 
+ /*
+  * Hooks for other subsystems that check memory allocations. In a typical
+  * production configuration these hooks all should produce no code at all.
+  */
  static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
  {
    kmemleak_alloc(ptr, size, 1, flags);
@@@ -1293,21 -1244,44 +1244,44 @@@ static inline void kfree_hook(const voi
  }
static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
- 							{ return 0; }
+ {
+ 	flags &= gfp_allowed_mask;
+ 	lockdep_trace_alloc(flags);
+ 	might_sleep_if(flags & __GFP_WAIT);
+ 
+ 	return should_failslab(s->object_size, flags, s->flags);
+ }
- static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
- 		void *object)
+ static inline void slab_post_alloc_hook(struct kmem_cache *s,
+ 					gfp_t flags, void *object)
  {
- 	kmemleak_alloc_recursive(object, s->object_size, 1, s->flags,
- 		flags & gfp_allowed_mask);
+ 	flags &= gfp_allowed_mask;
+ 	kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
+ 	kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
  }
static inline void slab_free_hook(struct kmem_cache *s, void *x)
  {
    kmemleak_free_recursive(x, s->flags);
- }
- #endif /* CONFIG_SLUB_DEBUG */
+ 	/*
+ 	 * Trouble is that we may no longer disable interrupts in the fast path
+ 	 * So in order to make the debug calls that expect irqs to be
+ 	 * disabled we need to disable interrupts temporarily.
+ 	 */
+ #if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
+ 	{
+ 		unsigned long flags;
+ 
+ 		local_irq_save(flags);
+ 		kmemcheck_slab_free(s, x, s->object_size);
+ 		debug_check_no_locks_freed(x, s->object_size);
+ 		local_irq_restore(flags);
+ 	}
+ #endif
+ 	if (!(s->flags & SLAB_DEBUG_OBJECTS))
+ 		debug_check_no_obj_freed(x, s->object_size);
+ }
/*
   * Slab allocation and freeing
@@@ -1409,9 -1383,9 +1383,9 @@@ static struct page *new_slab(struct kme
  {
    struct page *page;
    void *start;
- 	void *last;
    void *p;
    int order;
+ 	int idx;
BUG_ON(flags & GFP_SLAB_BUG_MASK);
@@@ -1432,14 -1406,13 +1406,13 @@@
    if (unlikely(s->flags & SLAB_POISON))
    	memset(start, POISON_INUSE, PAGE_SIZE << order);
- 	last = start;
- 	for_each_object(p, s, start, page->objects) {
- 		setup_object(s, page, last);
- 		set_freepointer(s, last, p);
- 		last = p;
+ 	for_each_object_idx(p, idx, s, start, page->objects) {
+ 		setup_object(s, page, p);
+ 		if (likely(idx < page->objects))
+ 			set_freepointer(s, p, p + s->size);
+ 		else
+ 			set_freepointer(s, p, NULL);
    }
- 	setup_object(s, page, last);
- 	set_freepointer(s, last, NULL);
page->freelist = start;
    page->inuse = page->objects;
@@@ -2162,6 -2135,7 +2135,7 @@@ slab_out_of_memory(struct kmem_cache *s
    static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
    			      DEFAULT_RATELIMIT_BURST);
    int node;
+ 	struct kmem_cache_node *n;
if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
    	return;
@@@ -2176,15 -2150,11 +2150,11 @@@
    	pr_warn("  %s debugging increased min order, use slub_debug=O to disable.\n",
    		s->name);
- 	for_each_online_node(node) {
- 		struct kmem_cache_node *n = get_node(s, node);
+ 	for_each_kmem_cache_node(s, node, n) {
    	unsigned long nr_slabs;
    	unsigned long nr_objs;
    	unsigned long nr_free;
- 		if (!n)
- 			continue;
- 
    	nr_free  = count_partial(n, count_free);
    	nr_slabs = node_nr_slabs(n);
    	nr_objs  = node_nr_objs(n);
@@@ -2928,13 -2898,10 +2898,10 @@@ static void early_kmem_cache_node_alloc
  static void free_kmem_cache_nodes(struct kmem_cache *s)
  {
    int node;
+ 	struct kmem_cache_node *n;
- 	for_each_node_state(node, N_NORMAL_MEMORY) {
- 		struct kmem_cache_node *n = s->node[node];
- 
- 		if (n)
- 			kmem_cache_free(kmem_cache_node, n);
- 
+ 	for_each_kmem_cache_node(s, node, n) {
+ 		kmem_cache_free(kmem_cache_node, n);
    	s->node[node] = NULL;
    }
  }
@@@ -3199,13 -3166,12 +3166,13 @@@ static void list_slab_objects(struct km
  /*
   * Attempt to free all partial slabs on a node.
   * This is called from kmem_cache_close(). We must be the last thread
 - * using the cache and therefore we do not need to lock anymore.
 + * using the cache, but we still have to lock for lockdep's sake.
   */
  static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
  {
    struct page *page, *h;
+	spin_lock_irq(&n->list_lock);
    list_for_each_entry_safe(page, h, &n->partial, lru) {
    	if (!page->inuse) {
    		__remove_partial(n, page);
@@@ -3215,7 -3181,6 +3182,7 @@@
    		"Objects remaining in %s on kmem_cache_close()");
    	}
    }
 +	spin_unlock_irq(&n->list_lock);
  }
/*
@@@ -3224,12 -3189,11 +3191,11 @@@
  static inline int kmem_cache_close(struct kmem_cache *s)
  {
    int node;
+ 	struct kmem_cache_node *n;
flush_all(s);
    /* Attempt to free all objects */
- 	for_each_node_state(node, N_NORMAL_MEMORY) {
- 		struct kmem_cache_node *n = get_node(s, node);
- 
+ 	for_each_kmem_cache_node(s, node, n) {
    	free_partial(s, n);
    	if (n->nr_partial || slabs_node(s, node))
    		return 1;
@@@ -3414,9 -3378,7 +3380,7 @@@ int __kmem_cache_shrink(struct kmem_cac
    	return -ENOMEM;
flush_all(s);
- 	for_each_node_state(node, N_NORMAL_MEMORY) {
- 		n = get_node(s, node);
- 
+ 	for_each_kmem_cache_node(s, node, n) {
    	if (!n->nr_partial)
    		continue;
@@@ -3588,6 -3550,7 +3552,7 @@@ static struct kmem_cache * __init boots
  {
    int node;
    struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
+ 	struct kmem_cache_node *n;
memcpy(s, static_cache, kmem_cache->object_size);
@@@ -3597,19 -3560,16 +3562,16 @@@
     * IPIs around.
     */
    __flush_cpu_slab(s, smp_processor_id());
- 	for_each_node_state(node, N_NORMAL_MEMORY) {
- 		struct kmem_cache_node *n = get_node(s, node);
+ 	for_each_kmem_cache_node(s, node, n) {
    	struct page *p;
- 		if (n) {
- 			list_for_each_entry(p, &n->partial, lru)
- 				p->slab_cache = s;
+ 		list_for_each_entry(p, &n->partial, lru)
+ 			p->slab_cache = s;
#ifdef CONFIG_SLUB_DEBUG
- 			list_for_each_entry(p, &n->full, lru)
- 				p->slab_cache = s;
+ 		list_for_each_entry(p, &n->full, lru)
+ 			p->slab_cache = s;
  #endif
- 		}
    }
    list_add(&s->list, &slab_caches);
    return s;
@@@ -3728,6 -3688,23 +3690,23 @@@ static struct kmem_cache *find_mergeabl
    return NULL;
  }
+ static void memcg_slab_merge(struct kmem_cache *s, size_t size)
+ {
+ #ifdef CONFIG_MEMCG_KMEM
+ 	struct kmem_cache *c;
+ 	struct memcg_cache_params *params;
+ 
+ 	if (!s->memcg_params)
+ 		return;
+ 
+ 	list_for_each_entry(params, &s->memcg_params->children, siblings) {
+ 		c = params->cachep;
+ 		c->object_size = s->object_size;
+ 		c->inuse = max_t(int, c->inuse, ALIGN(size, sizeof(void *)));
+ 	}
+ #endif
+ }
+ 
  struct kmem_cache *
  __kmem_cache_alias(const char *name, size_t size, size_t align,
    	   unsigned long flags, void (*ctor)(void *))
@@@ -3736,9 -3713,6 +3715,6 @@@
s = find_mergeable(size, align, flags, name, ctor);
    if (s) {
- 		int i;
- 		struct kmem_cache *c;
- 
    	s->refcount++;
/*
@@@ -3748,14 -3722,7 +3724,7 @@@
    	s->object_size = max(s->object_size, (int)size);
    	s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
- 		for_each_memcg_cache_index(i) {
- 			c = cache_from_memcg_idx(s, i);
- 			if (!c)
- 				continue;
- 			c->object_size = s->object_size;
- 			c->inuse = max_t(int, c->inuse,
- 					 ALIGN(size, sizeof(void *)));
- 		}
+ 		memcg_slab_merge(s, size);
if (sysfs_slab_alias(s, name)) {
    		s->refcount--;
@@@ -3962,16 -3929,14 +3931,14 @@@ static long validate_slab_cache(struct 
    unsigned long count = 0;
    unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
    			sizeof(unsigned long), GFP_KERNEL);
+ 	struct kmem_cache_node *n;
if (!map)
    	return -ENOMEM;
flush_all(s);
- 	for_each_node_state(node, N_NORMAL_MEMORY) {
- 		struct kmem_cache_node *n = get_node(s, node);
- 
+ 	for_each_kmem_cache_node(s, node, n)
    	count += validate_slab_node(s, n, map);
- 	}
    kfree(map);
    return count;
  }
@@@ -4125,6 -4090,7 +4092,7 @@@ static int list_locations(struct kmem_c
    int node;
    unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
    			     sizeof(unsigned long), GFP_KERNEL);
+ 	struct kmem_cache_node *n;
if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
    			     GFP_TEMPORARY)) {
@@@ -4134,8 -4100,7 +4102,7 @@@
    /* Push back cpu slabs */
    flush_all(s);
- 	for_each_node_state(node, N_NORMAL_MEMORY) {
- 		struct kmem_cache_node *n = get_node(s, node);
+ 	for_each_kmem_cache_node(s, node, n) {
    	unsigned long flags;
    	struct page *page;
@@@ -4207,7 -4172,7 +4174,7 @@@
  #endif
#ifdef SLUB_RESILIENCY_TEST
- static void resiliency_test(void)
+ static void __init resiliency_test(void)
  {
    u8 *p;
@@@ -4334,8 -4299,9 +4301,9 @@@ static ssize_t show_slab_objects(struc
    get_online_mems();
  #ifdef CONFIG_SLUB_DEBUG
    if (flags & SO_ALL) {
- 		for_each_node_state(node, N_NORMAL_MEMORY) {
- 			struct kmem_cache_node *n = get_node(s, node);
+ 		struct kmem_cache_node *n;
+ 
+ 		for_each_kmem_cache_node(s, node, n) {
if (flags & SO_TOTAL)
    			x = atomic_long_read(&n->total_objects);
@@@ -4351,9 -4317,9 +4319,9 @@@
    } else
  #endif
    if (flags & SO_PARTIAL) {
- 		for_each_node_state(node, N_NORMAL_MEMORY) {
- 			struct kmem_cache_node *n = get_node(s, node);
+ 		struct kmem_cache_node *n;
+ 		for_each_kmem_cache_node(s, node, n) {
    		if (flags & SO_TOTAL)
    			x = count_partial(n, count_total);
    		else if (flags & SO_OBJECTS)
@@@ -4366,7 -4332,7 +4334,7 @@@
    }
    x = sprintf(buf, "%lu", total);
  #ifdef CONFIG_NUMA
- 	for_each_node_state(node, N_NORMAL_MEMORY)
+ 	for (node = 0; node < nr_node_ids; node++)
    	if (nodes[node])
    		x += sprintf(buf + x, " N%d=%lu",
    				node, nodes[node]);
@@@ -4380,16 -4346,12 +4348,12 @@@
  static int any_slab_objects(struct kmem_cache *s)
  {
    int node;
+ 	struct kmem_cache_node *n;
- 	for_each_online_node(node) {
- 		struct kmem_cache_node *n = get_node(s, node);
- 
- 		if (!n)
- 			continue;
- 
+ 	for_each_kmem_cache_node(s, node, n)
    	if (atomic_long_read(&n->total_objects))
    		return 1;
- 	}
+ 
    return 0;
  }
  #endif
@@@ -5011,7 -4973,7 +4975,7 @@@ static ssize_t slab_attr_store(struct k
    err = attribute->store(s, buf, len);
  #ifdef CONFIG_MEMCG_KMEM
    if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
- 		int i;
+ 		struct memcg_cache_params *params;
mutex_lock(&slab_mutex);
    	if (s->max_attr_size < len)
@@@ -5034,10 -4996,10 +4998,10 @@@
    	 * directly either failed or succeeded, in which case we loop
    	 * through the descendants with best-effort propagation.
    	 */
- 		for_each_memcg_cache_index(i) {
- 			struct kmem_cache *c = cache_from_memcg_idx(s, i);
- 			if (c)
- 				attribute->store(c, buf, len);
+ 		if (s->memcg_params) {
+ 			list_for_each_entry(params,
+ 					&s->memcg_params->children, siblings)
+ 				attribute->store(params->cachep, buf, len);
    	}
    	mutex_unlock(&slab_mutex);
    }
@@@ -5173,12 -5135,6 +5137,6 @@@ static char *create_unique_id(struct km
    	*p++ = '-';
    p += sprintf(p, "%07d", s->size);
- #ifdef CONFIG_MEMCG_KMEM
- 	if (!is_root_cache(s))
- 		p += sprintf(p, "-%08d",
- 				memcg_cache_id(s->memcg_params->memcg));
- #endif
- 
    BUG_ON(p > name + ID_STR_LENGTH - 1);
    return name;
  }
@@@ -5344,13 -5300,9 +5302,9 @@@ void get_slabinfo(struct kmem_cache *s
    unsigned long nr_objs = 0;
    unsigned long nr_free = 0;
    int node;
+ 	struct kmem_cache_node *n;
- 	for_each_online_node(node) {
- 		struct kmem_cache_node *n = get_node(s, node);
- 
- 		if (!n)
- 			continue;
- 
+ 	for_each_kmem_cache_node(s, node, n) {
    	nr_slabs += node_nr_slabs(n);
    	nr_objs += node_nr_objs(n);
    	nr_free += count_partial(n, count_free);
diff --combined net/bridge/br_multicast.c
index b4845f4,d9c4f57..7751c92
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@@ -1174,7 -1174,7 +1174,7 @@@ static void br_multicast_add_router(str
    }
if (slot)
- 		hlist_add_after_rcu(slot, &port->rlist);
+ 		hlist_add_behind_rcu(&port->rlist, slot);
    else
    	hlist_add_head_rcu(&port->rlist, &br->router_list);
  }
@@@ -2216,43 -2216,6 +2216,43 @@@ unlock
  EXPORT_SYMBOL_GPL(br_multicast_list_adjacent);
/**
 + * br_multicast_has_querier_anywhere - Checks for a querier on a bridge
 + * @dev: The bridge port providing the bridge on which to check for a querier
 + * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6
 + *
 + * Checks whether the given interface has a bridge on top and if so returns
 + * true if a valid querier exists anywhere on the bridged link layer.
 + * Otherwise returns false.
 + */
 +bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto)
 +{
 +	struct net_bridge *br;
 +	struct net_bridge_port *port;
 +	struct ethhdr eth;
 +	bool ret = false;
 +
 +	rcu_read_lock();
 +	if (!br_port_exists(dev))
 +		goto unlock;
 +
 +	port = br_port_get_rcu(dev);
 +	if (!port || !port->br)
 +		goto unlock;
 +
 +	br = port->br;
 +
 +	memset(&eth, 0, sizeof(eth));
 +	eth.h_proto = htons(proto);
 +
 +	ret = br_multicast_querier_exists(br, &eth);
 +
 +unlock:
 +	rcu_read_unlock();
 +	return ret;
 +}
 +EXPORT_SYMBOL_GPL(br_multicast_has_querier_anywhere);
 +
 +/**
   * br_multicast_has_querier_adjacent - Checks for a querier behind a bridge port
   * @dev: The bridge port adjacent to which to check for a querier
   * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6
diff --combined net/xfrm/xfrm_policy.c
index 0525d78,92cb08d..beeed60
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@@ -389,7 -389,7 +389,7 @@@ redo
    		if (h != h0)
    			continue;
    		hlist_del(&pol->bydst);
- 			hlist_add_after(entry0, &pol->bydst);
+ 			hlist_add_behind(&pol->bydst, entry0);
    	}
    	entry0 = &pol->bydst;
    }
@@@ -654,7 -654,7 +654,7 @@@ int xfrm_policy_insert(int dir, struct 
    		break;
    }
    if (newpos)
- 		hlist_add_after(newpos, &policy->bydst);
+ 		hlist_add_behind(&policy->bydst, newpos);
    else
    	hlist_add_head(&policy->bydst, chain);
    xfrm_pol_hold(policy);
@@@ -2097,8 -2097,6 +2097,8 @@@ struct dst_entry *xfrm_lookup(struct ne
    			goto no_transform;
    		}
+			dst_hold(&xdst->u.dst);
 +			xdst->u.dst.flags |= DST_NOCACHE;
    		route = xdst->route;
    	}
    }
-- 
LinuxNextTracking

    

2025

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

[linux-next] LinuxNextTracking branch, master, updated. next-20140724