The following commit has been merged in the master branch:
commit feac39277b1a4030257b3b8254880ed6dbed01cf
Merge: 500bb5b64921e84fb52202eea43af15dd238bd73 929c76e4e6292770322822a50e08b8feaa4afbbf
Author: Stephen Rothwell <sfr@canb.auug.org.au>
Date:   Wed Aug 6 16:32:17 2014 +1000
Merge branch 'akpm-current/current'
Conflicts:
	arch/arm64/Kconfig
	drivers/rapidio/devices/tsi721_dma.c
	kernel/kexec.c
diff --combined Documentation/devicetree/bindings/i2c/trivial-devices.txt
index 37803eb,c75046a..6af570e
--- a/Documentation/devicetree/bindings/i2c/trivial-devices.txt
+++ b/Documentation/devicetree/bindings/i2c/trivial-devices.txt
@@@ -50,7 -50,6 +50,7 @@@ epson,rx8581		I2C-BUS INTERFACE REAL TI
  fsl,mag3110		MAG3110: Xtrinsic High Accuracy, 3D Magnetometer
  fsl,mc13892		MC13892: Power Management Integrated Circuit (PMIC) for i.MX35/51
  fsl,mma8450		MMA8450Q: Xtrinsic Low-power, 3-axis Xtrinsic Accelerometer
 +fsl,mma8452		MMA8452Q: 3-axis 12-bit / 8-bit Digital Accelerometer
  fsl,mpr121		MPR121: Proximity Capacitive Touch Sensor Controller
  fsl,sgtl5000		SGTL5000: Ultra Low-Power Audio Codec
  gmt,g751		G751: Digital Temperature Sensor and Thermal Watchdog with Two-Wire Interface
@@@ -70,6 -69,7 +70,7 @@@ nuvoton,npct501		i2c trusted platform m
  nxp,pca9556		Octal SMBus and I2C registered interface
  nxp,pca9557		8-bit I2C-bus and SMBus I/O port with reset
  nxp,pcf8563		Real-time clock/calendar
+ nxp,pcf85063		Tiny Real-Time Clock
  ovti,ov5642		OV5642: Color CMOS QSXGA (5-megapixel) Image Sensor with OmniBSI and Embedded TrueFocus
  pericom,pt7c4338	Real-time Clock Module
  plx,pex8648		48-Lane, 12-Port PCI Express Gen 2 (5.0 GT/s) Switch
@@@ -84,6 -84,5 +85,6 @@@ stm,m41t80		M41T80 - SERIAL ACCESS RTC
  taos,tsl2550		Ambient Light Sensor with SMBUS/Two Wire Serial Interface
  ti,tsc2003		I2C Touch-Screen Controller
  ti,tmp102		Low Power Digital Temperature Sensor with SMBUS/Two Wire Serial Interface
 +ti,tmp103		Low Power Digital Temperature Sensor with SMBUS/Two Wire Serial Interface
  ti,tmp275		Digital Temperature Sensor
  winbond,wpct301		i2c trusted platform module (TPM)
diff --combined Documentation/kernel-parameters.txt
index 572ad02,6824f37..5ae8608
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@@ -566,17 -566,6 +566,17 @@@ bytes respectively. Such letter suffixe
  			possible to determine what the correct size should be.
  			This option provides an override for these situations.
 +	ca_keys=	[KEYS] This parameter identifies a specific key(s) on
 +			the system trusted keyring to be used for certificate
 +			trust validation.
 +			format: { id:<keyid> | builtin }
 +
 +	cca=		[MIPS] Override the kernel pages' cache coherency
 +			algorithm. Accepted values range from 0 to 7
 +			inclusive. See arch/mips/include/asm/pgtable-bits.h
 +			for platform specific values (SB1, Loongson3 and
 +			others).
 +
  	ccw_timeout_log	[S390]
  			See Documentation/s390/CommonIO for details.
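For illustration, a hypothetical kernel command line using ca_keys= and cca= (the values are example choices within the documented formats, not taken from this patch):

    ca_keys=builtin cca=3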
@@@ -1108,12 -1097,6 +1108,12 @@@ that can be changed at run time by the
  			set_graph_function file in the debugfs tracing directory.
 +	ftrace_graph_notrace=[function-list]
 +			[FTRACE] Do not trace from the functions specified in
 +			function-list. This list is a comma separated list of
 +			functions that can be changed at run time by the
 +			set_graph_notrace file in the debugfs tracing directory.
 +
  	gamecon.map[2|3]=
  			[HW,JOY] Multisystem joystick and NES/SNES/PSX pad
  			support via parallel port (up to 5 devices per port)
@@@ -1330,23 -1313,6 +1330,23 @@@
  			Formats: { "ima" | "ima-ng" }
  			Default: "ima-ng"
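For illustration of ftrace_graph_notrace= (the function names are placeholders, and debugfs is assumed to be mounted at /sys/kernel/debug):

    ftrace_graph_notrace=kfree,kmem_cache_free
    echo kfree > /sys/kernel/debug/tracing/set_graph_notrace

The first form sets the exclusion list at boot; the second adjusts it at run time through the set_graph_notrace file mentioned above.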
 +	ima.ahash_minsize= [IMA] Minimum file size for asynchronous hash usage
 +			Format: <min_file_size>
 +			Set the minimal file size for using asynchronous hash.
 +			If left unspecified, ahash usage is disabled.
 +
 +			ahash performance varies for different data sizes on
 +			different crypto accelerators. This option can be used
 +			to achieve the best performance for a particular HW.
 +
 +	ima.ahash_bufsize= [IMA] Asynchronous hash buffer size
 +			Format: <bufsize>
 +			Set hashing buffer size. Default: 4k.
 +
 +			ahash performance varies for different chunk sizes on
 +			different crypto accelerators. This option can be used
 +			to achieve best performance for particular HW.
 +
  	init=		[KNL]
  			Format: <full_path>
  			Run specified binary instead of /sbin/init as init
@@@ -1450,6 -1416,10 +1450,6 @@@
  	ip=		[IP_PNP]
  			See Documentation/filesystems/nfs/nfsroot.txt.
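For illustration of the two ima.ahash options, a hypothetical boot line enabling asynchronous hashing for files of 256 KiB and larger with a 16 KiB hashing buffer (the sizes are example values that would normally be tuned to the crypto accelerator in use):

    ima.ahash_minsize=262144 ima.ahash_bufsize=16384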
- 	ip2=		[HW] Set IO/IRQ pairs for up to 4 IntelliPort boards
- 			See comment before ip2_setup() in
- 			drivers/char/ip2/ip2base.c.
- 
  	irqfixup	[HW]
  			When an interrupt is not handled search all handlers
  			for it. Intended to get systems with badly broken
@@@ -1722,8 -1692,12 +1722,12 @@@
  			7 (KERN_DEBUG)   debug-level messages
  	log_buf_len=n[KMG]	Sets the size of the printk ring buffer,
- 			in bytes. n must be a power of two. The default
- 			size is set in the kernel config file.
+ 			in bytes. n must be a power of two and greater
+ 			than the minimal size. The minimal size is defined
+ 			by LOG_BUF_SHIFT kernel config parameter. There is
+ 			also CONFIG_LOG_CPU_MAX_BUF_SHIFT config parameter
+ 			that allows to increase the default size depending on
+ 			the number of CPUs. See init/Kconfig for more details.
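For illustration, a hypothetical boot line growing the printk ring buffer to 4 MiB using the [KMG] suffix described earlier in this file:

    log_buf_len=4M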
  	logo.nologo	[FB] Disables display of the built-in Linux logo.
  			This may be used to provide more screen space for
@@@ -2196,21 -2170,6 +2200,21 @@@
  			and restore using xsave. The kernel will fallback to
  			enabling legacy floating-point and sse state.
 +	noxsaveopt	[X86] Disables xsaveopt used in saving x86 extended
 +			register states. The kernel will fall back to use
 +			xsave to save the states. By using this parameter,
 +			performance of saving the states is degraded because
 +			xsave doesn't support modified optimization while
 +			xsaveopt supports it on xsaveopt enabled systems.
 +
 +	noxsaves	[X86] Disables xsaves and xrstors used in saving and
 +			restoring x86 extended register state in compacted
 +			form of xsave area. The kernel will fall back to use
 +			xsaveopt and xrstor to save and restore the states
 +			in standard form of xsave area. By using this
 +			parameter, xsave area per process might occupy more
 +			memory on xsaves enabled systems.
 +
  	eagerfpu=	[X86]
  			on	enable eager fpu restore
  			off	disable eager fpu restore
@@@ -2852,13 -2811,6 +2856,13 @@@
  			quiescent states. Units are jiffies, minimum
  			value is one, and maximum value is HZ.
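For illustration, a hypothetical boot line disabling both instructions, e.g. to compare behaviour against the plain xsave path (accepting the documented cost in performance and per-process memory):

    noxsaveopt noxsaves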
 +	rcutree.rcu_nocb_leader_stride= [KNL]
 +			Set the number of NOCB kthread groups, which
 +			defaults to the square root of the number of
 +			CPUs. Larger numbers reduces the wakeup overhead
 +			on the per-CPU grace-period kthreads, but increases
 +			that same overhead on each group's leader.
 +
  	rcutree.qhimark= [KNL]
  			Set threshold of queued RCU callbacks beyond which
  			batch limiting is disabled.
@@@ -3075,13 -3027,6 +3079,13 @@@
S [KNL] Run init in single mode
 +	s390_iommu=	[HW,S390]
 +			Set s390 IOTLB flushing mode
 +		strict
 +			With strict flushing every unmap operation will result in
 +			an IOTLB flush. Default is lazy flushing before reuse,
 +			which is faster.
 +
  	sa1100ir	[NET]
  			See drivers/net/irda/sa1100_ir.c.
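For illustration, a hypothetical boot line selecting strict flushing on an s390 system (trading speed for an IOTLB flush on every unmap):

    s390_iommu=strict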
@@@ -3756,10 -3701,6 +3760,10 @@@
  			Disables the ticketlock slowpath using Xen PV
  			optimizations.
 +	xen_nopv	[X86]
 +			Disables the PV optimizations forcing the HVM guest to
 +			run as generic HVM guest with no PV drivers.
 +
  	xirc2ps_cs=	[NET,PCMCIA]
  			Format:
  			<irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
diff --combined Makefile
index c076c33,f30b312..8a2217b
--- a/Makefile
+++ b/Makefile
@@@ -1,7 -1,7 +1,7 @@@
  VERSION = 3
  PATCHLEVEL = 16
  SUBLEVEL = 0
 -EXTRAVERSION = -rc7
 +EXTRAVERSION =
  NAME = Shuffling Zombie Juror
# *DOCUMENTATION* @@@ -360,14 -360,9 +360,14 @@@ include $(srctree)/scripts/Kbuild.inclu # Make variables (CC, etc...) AS = $(CROSS_COMPILE)as LD = $(CROSS_COMPILE)ld +LDFINAL = $(LD) CC = $(CROSS_COMPILE)gcc CPP = $(CC) -E +ifdef CONFIG_LTO +AR = $(CROSS_COMPILE)gcc-ar +else AR = $(CROSS_COMPILE)ar +endif NM = $(CROSS_COMPILE)nm STRIP = $(CROSS_COMPILE)strip OBJCOPY = $(CROSS_COMPILE)objcopy @@@ -377,7 -372,6 +377,7 @@@ GENKSYMS = scripts/genksyms/genksym INSTALLKERNEL := installkernel DEPMOD = /sbin/depmod PERL = perl +PYTHON = python CHECK = sparse
CHECKFLAGS := -D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ \ @@@ -427,8 -421,8 +427,8 @@@ KERNELVERSION = $(VERSION)$(if $(PATCHL
export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION export ARCH SRCARCH CONFIG_SHELL HOSTCC HOSTCFLAGS CROSS_COMPILE AS LD CC -export CPP AR NM STRIP OBJCOPY OBJDUMP -export MAKE AWK GENKSYMS INSTALLKERNEL PERL UTS_MACHINE +export CPP AR NM STRIP OBJCOPY OBJDUMP LDFINAL +export MAKE AWK GENKSYMS INSTALLKERNEL PERL PYTHON UTS_MACHINE export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS
export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS @@@ -438,17 -432,6 +438,17 @@@ export KBUILD_AFLAGS_MODULE KBUILD_CFLA export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL export KBUILD_ARFLAGS
 +ifdef CONFIG_LTO
 +# LTO gcc creates a lot of files in TMPDIR, and with /tmp as tmpfs
 +# it's easy to drive the machine OOM. Use the object directory
 +# instead.
 +ifndef TMPDIR
 +TMPDIR ?= $(objtree)
 +export TMPDIR
 +$(info setting TMPDIR=$(objtree) for LTO build)
 +endif
 +endif
 +
  # When compiling out-of-tree modules, put MODVERDIR in the module
  # tree rather than in the kernel tree. The kernel tree might
  # even be read-only.
@@@ -638,6 -621,9 +638,9 @@@ els
  KBUILD_CFLAGS	+= -O2
  endif
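The TMPDIR block above only takes effect when TMPDIR is not already defined, so an LTO build can still direct gcc's intermediate files elsewhere by exporting TMPDIR itself. A hypothetical invocation (target and -j value are examples only):

    TMPDIR=/var/tmp make -j8 vmlinux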
+ # Tell gcc to never replace conditional load with a non-conditional one
+ KBUILD_CFLAGS	+= $(call cc-option,--param=allow-store-data-races=0)
+ 
  ifdef CONFIG_READABLE_ASM
  # Disable optimizations that make assembler listings hard to read.
  # reorder blocks reorders the control in the function
@@@ -653,6 -639,22 +656,22 @@@ KBUILD_CFLAGS += $(call cc-option,-Wfra
  endif
  # Handle stack protector mode.
+ #
+ # Since kbuild can potentially perform two passes (first with the old
+ # .config values and then with updated .config values), we cannot error out
+ # if a desired compiler option is unsupported. If we were to error, kbuild
+ # could never get to the second pass and actually notice that we changed
+ # the option to something that was supported.
+ #
+ # Additionally, we don't want to fallback and/or silently change which compiler
+ # flags will be used, since that leads to producing kernels with different
+ # security feature characteristics depending on the compiler used. ("But I
+ # selected CC_STACKPROTECTOR_STRONG! Why did it build with _REGULAR?!")
+ #
+ # The middle ground is to warn here so that the failed option is obvious, but
+ # to let the build fail with bad compiler flags so that we can't produce a
+ # kernel when there is a CONFIG and compiler mismatch.
+ #
  ifdef CONFIG_CC_STACKPROTECTOR_REGULAR
  stackp-flag := -fstack-protector
  ifeq ($(call cc-option, $(stackp-flag)),)
@@@ -685,7 -687,6 +704,7 @@@ KBUILD_CFLAGS += $(call cc-disable-warn
  # source of a reference will be _MergedGlobals and not on of the whitelisted names.
  # See modpost pattern 2
  KBUILD_CFLAGS	+= $(call cc-option, -mno-global-merge,)
 +KBUILD_CFLAGS	+= $(call cc-option, -fcatch-undefined-behavior)
  else
# This warning generated too much noise in a regular build. @@@ -709,16 -710,9 +728,16 @@@ endi KBUILD_CFLAGS += $(call cc-option, -fno-var-tracking-assignments)
ifdef CONFIG_DEBUG_INFO +ifdef CONFIG_DEBUG_INFO_SPLIT +KBUILD_CFLAGS += $(call cc-option, -gsplit-dwarf, -g) +else KBUILD_CFLAGS += -g +endif KBUILD_AFLAGS += -Wa,-gdwarf-2 endif +ifdef CONFIG_DEBUG_INFO_DWARF4 +KBUILD_CFLAGS += $(call cc-option, -gdwarf-4,) +endif
ifdef CONFIG_DEBUG_INFO_REDUCED KBUILD_CFLAGS += $(call cc-option, -femit-struct-debug-baseonly) \ @@@ -778,7 -772,6 +797,7 @@@ ifeq ($(shell $(CONFIG_SHELL) $(srctree endif
include $(srctree)/scripts/Makefile.extrawarn +include ${srctree}/scripts/Makefile.lto
# Add user supplied CPPFLAGS, AFLAGS and CFLAGS as the last assignments KBUILD_CPPFLAGS += $(KCPPFLAGS) @@@ -1248,9 -1241,9 +1267,9 @@@ help @echo ' tags/TAGS - Generate tags file for editors' @echo ' cscope - Generate cscope index' @echo ' gtags - Generate GNU GLOBAL index' - @echo ' kernelrelease - Output the release version string' - @echo ' kernelversion - Output the version stored in Makefile' - @echo ' image_name - Output the image name' + @echo ' kernelrelease - Output the release version string (use with make -s)' + @echo ' kernelversion - Output the version stored in Makefile (use with make -s)' + @echo ' image_name - Output the image name (use with make -s)' @echo ' headers_install - Install sanitised kernel headers to INSTALL_HDR_PATH'; \ echo ' (default: $(INSTALL_HDR_PATH))'; \ echo '' @@@ -1405,7 -1398,6 +1424,7 @@@ clean: $(clean-dirs @find $(if $(KBUILD_EXTMOD), $(KBUILD_EXTMOD), .) $(RCS_FIND_IGNORE) \ ( -name '*.[oas]' -o -name '*.ko' -o -name '.*.cmd' \ -o -name '*.ko.*' \ + -o -name '*.dwo' \ -o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \ -o -name '*.symtypes' -o -name 'modules.order' \ -o -name modules.builtin -o -name '.tmp_*.o.*' \ diff --combined arch/arm/Kconfig index 916cedbd,551e526..32cbbd5 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@@ -65,6 -65,7 +65,6 @@@ config AR select HAVE_UID16 select HAVE_VIRT_CPU_ACCOUNTING_GEN select IRQ_FORCED_THREADING - select KTIME_SCALAR select MODULES_USE_ELF_REL select NO_BOOTMEM select OLD_SIGACTION @@@ -83,6 -84,7 +83,7 @@@ http://www.arm.linux.org.uk/.
config ARM_HAS_SG_CHAIN + select ARCH_HAS_SG_CHAIN bool
config NEED_SG_DMA_LENGTH @@@ -239,6 -241,13 +240,6 @@@ config ARM_PATCH_PHYS_VIR this feature (eg, building a kernel for a single machine) and you need to shrink the kernel to the minimal size.
-config NEED_MACH_GPIO_H - bool - help - Select this when mach/gpio.h is required to provide special - definitions for this platform. The need for mach/gpio.h should - be avoided when possible. - config NEED_MACH_IO_H bool help @@@ -255,22 -264,8 +256,22 @@@ config NEED_MACH_MEMORY_
config PHYS_OFFSET hex "Physical address of main memory" if MMU - depends on !ARM_PATCH_PHYS_VIRT && !NEED_MACH_MEMORY_H + depends on !ARM_PATCH_PHYS_VIRT default DRAM_BASE if !MMU + default 0x00000000 if ARCH_EBSA110 || \ + EP93XX_SDCE3_SYNC_PHYS_OFFSET || \ + ARCH_FOOTBRIDGE || \ + ARCH_INTEGRATOR || \ + ARCH_IOP13XX || \ + ARCH_KS8695 || \ + (ARCH_REALVIEW && !REALVIEW_HIGH_PHYS_OFFSET) + default 0x10000000 if ARCH_OMAP1 || ARCH_RPC + default 0x20000000 if ARCH_S5PV210 + default 0x70000000 if REALVIEW_HIGH_PHYS_OFFSET + default 0xc0000000 if EP93XX_SDCE0_PHYS_OFFSET || ARCH_SA1100 + default 0xd0000000 if EP93XX_SDCE1_PHYS_OFFSET + default 0xe0000000 if EP93XX_SDCE2_PHYS_OFFSET + default 0xf0000000 if EP93XX_SDCE3_ASYNC_PHYS_OFFSET help Please provide the physical address corresponding to the location of main memory in your system. @@@ -319,7 -314,7 +320,7 @@@ config ARCH_MULTIPLATFOR config ARCH_INTEGRATOR bool "ARM Ltd. Integrator family" select ARM_AMBA - select ARM_PATCH_PHYS_VIRT + select ARM_PATCH_PHYS_VIRT if MMU select AUTO_ZRELADDR select COMMON_CLK select COMMON_CLK_VERSATILE @@@ -327,6 -322,7 +328,6 @@@ select HAVE_TCM select ICST select MULTI_IRQ_HANDLER - select NEED_MACH_MEMORY_H select PLAT_VERSATILE select SPARSE_IRQ select USE_OF @@@ -346,6 -342,7 +347,6 @@@ config ARCH_REALVIE select ICST select NEED_MACH_MEMORY_H select PLAT_VERSATILE - select PLAT_VERSATILE_CLCD help This enables support for ARM Ltd RealView boards.
@@@ -360,6 -357,7 +361,6 @@@ config ARCH_VERSATIL select HAVE_MACH_CLKDEV select ICST select PLAT_VERSATILE - select PLAT_VERSATILE_CLCD select PLAT_VERSATILE_CLOCK select VERSATILE_FPGA_IRQ help @@@ -439,6 -437,7 +440,6 @@@ config ARCH_EP93X select ARM_VIC select CLKDEV_LOOKUP select CPU_ARM920T - select NEED_MACH_MEMORY_H help This enables support for the Cirrus EP93xx series of CPUs.
@@@ -531,6 -530,21 +532,6 @@@ config ARCH_DOV help Support for the Marvell Dove SoC 88AP510
-config ARCH_KIRKWOOD - bool "Marvell Kirkwood" - select ARCH_REQUIRE_GPIOLIB - select CPU_FEROCEON - select GENERIC_CLOCKEVENTS - select MVEBU_MBUS - select PCI - select PCI_QUIRKS - select PINCTRL - select PINCTRL_KIRKWOOD - select PLAT_ORION_LEGACY - help - Support for the following Marvell Kirkwood series SoCs: - 88F6180, 88F6192 and 88F6281. - config ARCH_MV78XX0 bool "Marvell MV78xx0" select ARCH_REQUIRE_GPIOLIB @@@ -622,7 -636,6 +623,7 @@@ config ARCH_PX select AUTO_ZRELADDR select CLKDEV_LOOKUP select CLKSRC_MMIO + select CLKSRC_OF select GENERIC_CLOCKEVENTS select GPIO_PXA select HAVE_IDE @@@ -647,7 -660,7 +648,7 @@@ config ARCH_MS config ARCH_SHMOBILE_LEGACY bool "Renesas ARM SoCs (non-multiplatform)" select ARCH_SHMOBILE - select ARM_PATCH_PHYS_VIRT + select ARM_PATCH_PHYS_VIRT if MMU select CLKDEV_LOOKUP select GENERIC_CLOCKEVENTS select HAVE_ARM_SCU if SMP @@@ -747,6 -760,61 +748,6 @@@ config ARCH_S3C64X help Samsung S3C64XX series based systems
-config ARCH_S5P64X0 - bool "Samsung S5P6440 S5P6450" - select ATAGS - select CLKDEV_LOOKUP - select CLKSRC_SAMSUNG_PWM - select CPU_V6 - select GENERIC_CLOCKEVENTS - select GPIO_SAMSUNG - select HAVE_S3C2410_I2C if I2C - select HAVE_S3C2410_WATCHDOG if WATCHDOG - select HAVE_S3C_RTC if RTC_CLASS - select NEED_MACH_GPIO_H - select SAMSUNG_ATAGS - select SAMSUNG_WDT_RESET - help - Samsung S5P64X0 CPU based systems, such as the Samsung SMDK6440, - SMDK6450. - -config ARCH_S5PC100 - bool "Samsung S5PC100" - select ARCH_REQUIRE_GPIOLIB - select ATAGS - select CLKDEV_LOOKUP - select CLKSRC_SAMSUNG_PWM - select CPU_V7 - select GENERIC_CLOCKEVENTS - select GPIO_SAMSUNG - select HAVE_S3C2410_I2C if I2C - select HAVE_S3C2410_WATCHDOG if WATCHDOG - select HAVE_S3C_RTC if RTC_CLASS - select NEED_MACH_GPIO_H - select SAMSUNG_ATAGS - select SAMSUNG_WDT_RESET - help - Samsung S5PC100 series based systems - -config ARCH_S5PV210 - bool "Samsung S5PV210/S5PC110" - select ARCH_HAS_HOLES_MEMORYMODEL - select ARCH_SPARSEMEM_ENABLE - select ATAGS - select CLKDEV_LOOKUP - select CLKSRC_SAMSUNG_PWM - select CPU_V7 - select GENERIC_CLOCKEVENTS - select GPIO_SAMSUNG - select HAVE_S3C2410_I2C if I2C - select HAVE_S3C2410_WATCHDOG if WATCHDOG - select HAVE_S3C_RTC if RTC_CLASS - select NEED_MACH_GPIO_H - select NEED_MACH_MEMORY_H - select SAMSUNG_ATAGS - help - Samsung S5PV210/S5PC110 series based systems - config ARCH_DAVINCI bool "TI DaVinci" select ARCH_HAS_HOLES_MEMORYMODEL @@@ -885,6 -953,8 +886,6 @@@ source "arch/arm/mach-ixp4xx/Kconfig
source "arch/arm/mach-keystone/Kconfig"
-source "arch/arm/mach-kirkwood/Kconfig" - source "arch/arm/mach-ks8695/Kconfig"
source "arch/arm/mach-msm/Kconfig" @@@ -895,8 -965,6 +896,8 @@@ source "arch/arm/mach-mv78xx0/Kconfig
source "arch/arm/mach-imx/Kconfig"
+source "arch/arm/mach-mediatek/Kconfig" + source "arch/arm/mach-mxs/Kconfig"
source "arch/arm/mach-netx/Kconfig" @@@ -938,6 -1006,10 +939,6 @@@ source "arch/arm/mach-s3c24xx/Kconfig
source "arch/arm/mach-s3c64xx/Kconfig"
-source "arch/arm/mach-s5p64x0/Kconfig" - -source "arch/arm/mach-s5pc100/Kconfig" - source "arch/arm/mach-s5pv210/Kconfig"
source "arch/arm/mach-exynos/Kconfig" @@@ -1484,12 -1556,10 +1485,12 @@@ config ARM_PSC config ARCH_NR_GPIO int default 1024 if ARCH_SHMOBILE || ARCH_TEGRA - default 512 if ARCH_EXYNOS || ARCH_KEYSTONE || SOC_OMAP5 || SOC_DRA7XX || ARCH_S3C24XX || ARCH_S3C64XX + default 512 if ARCH_EXYNOS || ARCH_KEYSTONE || SOC_OMAP5 || \ + SOC_DRA7XX || ARCH_S3C24XX || ARCH_S3C64XX || ARCH_S5PV210 default 416 if ARCH_SUNXI default 392 if ARCH_U8500 default 352 if ARCH_VT8500 + default 288 if ARCH_ROCKCHIP default 264 if MACH_H4700 default 0 help @@@ -1501,7 -1571,7 +1502,7 @@@ source kernel/Kconfig.preemp
config HZ_FIXED int - default 200 if ARCH_EBSA110 || ARCH_S3C24XX || ARCH_S5P64X0 || \ + default 200 if ARCH_EBSA110 || ARCH_S3C24XX || \ ARCH_S5PV210 || ARCH_EXYNOS4 default AT91_TIMER_HZ if ARCH_AT91 default SHMOBILE_TIMER_HZ if ARCH_SHMOBILE_LEGACY @@@ -2126,6 -2196,7 +2127,6 @@@ menu "Power management options source "kernel/power/Kconfig"
config ARCH_SUSPEND_POSSIBLE - depends on !ARCH_S5PC100 depends on CPU_ARM920T || CPU_ARM926T || CPU_FEROCEON || CPU_SA1100 || \ CPU_V6 || CPU_V6K || CPU_V7 || CPU_V7M || CPU_XSC3 || CPU_XSCALE || CPU_MOHAWK def_bool y diff --combined arch/arm/mm/dma-mapping.c index 1f88db0,3116880..7a996aa --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@@ -26,6 -26,7 +26,7 @@@ #include <linux/io.h> #include <linux/vmalloc.h> #include <linux/sizes.h> + #include <linux/cma.h>
#include <asm/memory.h> #include <asm/highmem.h> @@@ -461,21 -462,12 +462,21 @@@ void __init dma_contiguous_remap(void map.type = MT_MEMORY_DMA_READY;
/* - * Clear previous low-memory mapping + * Clear previous low-memory mapping to ensure that the + * TLB does not see any conflicting entries, then flush + * the TLB of the old entries before creating new mappings. + * + * This ensures that any speculatively loaded TLB entries + * (even though they may be rare) can not cause any problems, + * and ensures that this code is architecturally compliant. */ for (addr = __phys_to_virt(start); addr < __phys_to_virt(end); addr += PMD_SIZE) pmd_clear(pmd_off_k(addr));
+ flush_tlb_kernel_range(__phys_to_virt(start), + __phys_to_virt(end)); + iotable_init(&map, 1); } } diff --combined arch/arm64/Kconfig index a5dc5ff,7bc7b74..62b4ae1 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@@ -1,6 -1,8 +1,7 @@@ config ARM64 def_bool y select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE - select ARCH_HAS_OPP + select ARCH_HAS_SG_CHAIN select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_USE_CMPXCHG_LOCKREF select ARCH_SUPPORTS_ATOMIC_RMW @@@ -10,9 -12,6 +11,9 @@@ select ARM_AMBA select ARM_ARCH_TIMER select ARM_GIC + select ARM_GIC_V2M if (PCI && PCI_MSI) + select ARM_GIC_V3 + select AUDIT_ARCH_COMPAT_GENERIC select BUILDTIME_EXTABLE_SORT select CLONE_BACKWARDS select COMMON_CLK @@@ -31,12 -30,10 +32,12 @@@ select GENERIC_STRNLEN_USER select GENERIC_TIME_VSYSCALL select HARDIRQS_SW_RESEND + select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK select HAVE_C_RECORDMCOUNT + select HAVE_CC_STACKPROTECTOR select HAVE_DEBUG_BUGVERBOSE select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_API_DEBUG @@@ -67,7 -64,6 +68,7 @@@ select RTC_LIB select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE + select HAVE_CONTEXT_TRACKING help ARM 64-bit (AArch64) Linux support.
@@@ -160,63 -156,14 +161,63 @@@ endmen
menu "Kernel Features"
+choice + prompt "Page size" + default ARM64_4K_PAGES + help + Page size (translation granule) configuration. + +config ARM64_4K_PAGES + bool "4KB" + help + This feature enables 4KB pages support. + config ARM64_64K_PAGES - bool "Enable 64KB pages support" + bool "64KB" help This feature enables 64KB pages support (4KB by default) allowing only two levels of page tables and faster TLB look-up. AArch32 emulation is not available when this feature is enabled.
+endchoice + +choice + prompt "Virtual address space size" + default ARM64_VA_BITS_39 if ARM64_4K_PAGES + default ARM64_VA_BITS_42 if ARM64_64K_PAGES + help + Allows choosing one of multiple possible virtual address + space sizes. The level of translation table is determined by + a combination of page size and virtual address space size. + +config ARM64_VA_BITS_39 + bool "39-bit" + depends on ARM64_4K_PAGES + +config ARM64_VA_BITS_42 + bool "42-bit" + depends on ARM64_64K_PAGES + +config ARM64_VA_BITS_48 + bool "48-bit" + depends on BROKEN + +endchoice + +config ARM64_VA_BITS + int + default 39 if ARM64_VA_BITS_39 + default 42 if ARM64_VA_BITS_42 + default 48 if ARM64_VA_BITS_48 + +config ARM64_PGTABLE_LEVELS + int + default 2 if ARM64_64K_PAGES && ARM64_VA_BITS_42 + default 3 if ARM64_64K_PAGES && ARM64_VA_BITS_48 + default 3 if ARM64_4K_PAGES && ARM64_VA_BITS_39 + default 4 if ARM64_4K_PAGES && ARM64_VA_BITS_48 + config CPU_BIG_ENDIAN bool "Build big-endian kernel" help @@@ -348,18 -295,12 +349,18 @@@ config CMDLINE_FORC This is useful if you cannot or don't want to change the command-line options your boot loader passes to the kernel.
+config EFI_STUB + bool + config EFI bool "UEFI runtime support" depends on OF && !CPU_BIG_ENDIAN select LIBFDT select UCS2_STRING select EFI_PARAMS_FROM_FDT + select EFI_RUNTIME_WRAPPERS + select EFI_STUB + select EFI_ARMSTUB default y help This option provides support for runtime services provided diff --combined arch/ia64/Kconfig index 44a6915,56986a0..c84c88b --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@@ -10,7 -10,6 +10,7 @@@ config IA6 select ARCH_MIGHT_HAVE_PC_SERIO select PCI if (!IA64_HP_SIM) select ACPI if (!IA64_HP_SIM) + select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select PM if (!IA64_HP_SIM) select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE @@@ -28,6 -27,7 +28,7 @@@ select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP select HAVE_VIRT_CPU_ACCOUNTING + select ARCH_HAS_SG_CHAIN select VIRT_TO_BUS select ARCH_DISCARD_MEMBLOCK select GENERIC_IRQ_PROBE diff --combined arch/ia64/kernel/time.c index 3e71ef8,a149c67..9a0104a --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@@ -384,21 -384,6 +384,6 @@@ static struct irqaction timer_irqactio .name = "timer" };
- static struct platform_device rtc_efi_dev = { - .name = "rtc-efi", - .id = -1, - }; - - static int __init rtc_init(void) - { - if (platform_device_register(&rtc_efi_dev) < 0) - printk(KERN_ERR "unable to register rtc device...\n"); - - /* not necessarily an error */ - return 0; - } - module_init(rtc_init); - void read_persistent_clock(struct timespec *ts) { efi_gettimeofday(ts); @@@ -441,7 -426,7 +426,7 @@@ void update_vsyscall_tz(void }
void update_vsyscall_old(struct timespec *wall, struct timespec *wtm, - struct clocksource *c, u32 mult) + struct clocksource *c, u32 mult, cycle_t cycle_last) { write_seqcount_begin(&fsyscall_gtod_data.seq);
@@@ -450,7 -435,7 +435,7 @@@ fsyscall_gtod_data.clk_mult = mult; fsyscall_gtod_data.clk_shift = c->shift; fsyscall_gtod_data.clk_fsys_mmio = c->archdata.fsys_mmio; - fsyscall_gtod_data.clk_cycle_last = c->cycle_last; + fsyscall_gtod_data.clk_cycle_last = cycle_last;
/* copy kernel time structures */ fsyscall_gtod_data.wall_time.tv_sec = wall->tv_sec; diff --combined arch/powerpc/kvm/Makefile index 1ccd7a1,72905c3..91cfb0d --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@@ -10,17 -10,27 +10,17 @@@ KVM := ../../../virt/kv common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ $(KVM)/eventfd.o
CFLAGS_e500_mmu.o := -I. CFLAGS_e500_mmu_host.o := -I. CFLAGS_emulate.o := -I. +CFLAGS_emulate_loadstore.o := -I.
-common-objs-y += powerpc.o emulate.o +common-objs-y += powerpc.o emulate.o emulate_loadstore.o obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o
AFLAGS_booke_interrupts.o := -I$(obj)
-kvm-440-objs := \ - $(common-objs-y) \ - booke.o \ - booke_emulate.o \ - booke_interrupts.o \ - 44x.o \ - 44x_tlb.o \ - 44x_emulate.o -kvm-objs-$(CONFIG_KVM_440) := $(kvm-440-objs) - kvm-e500-objs := \ $(common-objs-y) \ booke.o \ @@@ -80,7 -90,6 +80,6 @@@ kvm-book3s_64-builtin-objs-$(CONFIG_KVM book3s_hv_rm_mmu.o \ book3s_hv_ras.o \ book3s_hv_builtin.o \ - book3s_hv_cma.o \ $(kvm-book3s_64-builtin-xics-objs-y) endif
@@@ -92,7 -101,6 +91,7 @@@ kvm-book3s_64-module-objs += $(KVM)/eventfd.o \ powerpc.o \ emulate.o \ + emulate_loadstore.o \ book3s.o \ book3s_64_vio.o \ book3s_rtas.o \ @@@ -118,6 -126,7 +117,6 @@@ kvm-objs-$(CONFIG_HAVE_KVM_IRQ_ROUTING
kvm-objs := $(kvm-objs-m) $(kvm-objs-y)
-obj-$(CONFIG_KVM_440) += kvm.o obj-$(CONFIG_KVM_E500V2) += kvm.o obj-$(CONFIG_KVM_E500MC) += kvm.o obj-$(CONFIG_KVM_BOOK3S_64) += kvm.o diff --combined arch/powerpc/kvm/book3s_64_mmu_hv.c index e3d17f5,a01744f..72c20bb --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@@ -37,8 -37,6 +37,6 @@@ #include <asm/ppc-opcode.h> #include <asm/cputable.h>
- #include "book3s_hv_cma.h" - /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ #define MAX_LPID_970 63
@@@ -64,10 -62,10 +62,10 @@@ long kvmppc_alloc_hpt(struct kvm *kvm, }
kvm->arch.hpt_cma_alloc = 0; page = kvm_alloc_hpt(1 << (order - PAGE_SHIFT)); if (page) { hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); + memset((void *)hpt, 0, (1 << order)); kvm->arch.hpt_cma_alloc = 1; }
@@@ -450,7 -448,7 +448,7 @@@ static int kvmppc_mmu_book3s_64_hv_xlat unsigned long slb_v; unsigned long pp, key; unsigned long v, gr; - unsigned long *hptep; + __be64 *hptep; int index; int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);
@@@ -473,13 -471,13 +471,13 @@@ preempt_enable(); return -ENOENT; } - hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); - v = hptep[0] & ~HPTE_V_HVLOCK; + hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); + v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; gr = kvm->arch.revmap[index].guest_rpte;
/* Unlock the HPTE */ asm volatile("lwsync" : : : "memory"); - hptep[0] = v; + hptep[0] = cpu_to_be64(v); preempt_enable();
gpte->eaddr = eaddr; @@@ -530,14 -528,21 +528,14 @@@ static int instruction_is_store(unsigne static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned long gpa, gva_t ea, int is_store) { - int ret; u32 last_inst; - unsigned long srr0 = kvmppc_get_pc(vcpu);
- /* We try to load the last instruction. We don't let - * emulate_instruction do it as it doesn't check what - * kvmppc_ld returns. + /* * If we fail, we just return to the guest and try executing it again. */ - if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED) { - ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false); - if (ret != EMULATE_DONE || last_inst == KVM_INST_FETCH_FAILED) - return RESUME_GUEST; - vcpu->arch.last_inst = last_inst; - } + if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) != + EMULATE_DONE) + return RESUME_GUEST;
/* * WARNING: We do not know for sure whether the instruction we just @@@ -551,7 -556,7 +549,7 @@@ * we just return and retry the instruction. */
- if (instruction_is_store(kvmppc_get_last_inst(vcpu)) != !!is_store) + if (instruction_is_store(last_inst) != !!is_store) return RESUME_GUEST;
/* @@@ -576,8 -581,7 +574,8 @@@ int kvmppc_book3s_hv_page_fault(struct unsigned long ea, unsigned long dsisr) { struct kvm *kvm = vcpu->kvm; - unsigned long *hptep, hpte[3], r; + unsigned long hpte[3], r; + __be64 *hptep; unsigned long mmu_seq, psize, pte_size; unsigned long gpa_base, gfn_base; unsigned long gpa, gfn, hva, pfn; @@@ -600,16 -604,16 +598,16 @@@ if (ea != vcpu->arch.pgfault_addr) return RESUME_GUEST; index = vcpu->arch.pgfault_index; - hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); + hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); rev = &kvm->arch.revmap[index]; preempt_disable(); while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) cpu_relax(); - hpte[0] = hptep[0] & ~HPTE_V_HVLOCK; - hpte[1] = hptep[1]; + hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; + hpte[1] = be64_to_cpu(hptep[1]); hpte[2] = r = rev->guest_rpte; asm volatile("lwsync" : : : "memory"); - hptep[0] = hpte[0]; + hptep[0] = cpu_to_be64(hpte[0]); preempt_enable();
if (hpte[0] != vcpu->arch.pgfault_hpte[0] || @@@ -725,9 -729,8 +723,9 @@@ preempt_disable(); while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) cpu_relax(); - if ((hptep[0] & ~HPTE_V_HVLOCK) != hpte[0] || hptep[1] != hpte[1] || - rev->guest_rpte != hpte[2]) + if ((be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK) != hpte[0] || + be64_to_cpu(hptep[1]) != hpte[1] || + rev->guest_rpte != hpte[2]) /* HPTE has been changed under us; let the guest retry */ goto out_unlock; hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; @@@ -747,20 -750,20 +745,20 @@@ rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT; r &= rcbits | ~(HPTE_R_R | HPTE_R_C);
- if (hptep[0] & HPTE_V_VALID) { + if (be64_to_cpu(hptep[0]) & HPTE_V_VALID) { /* HPTE was previously valid, so we need to invalidate it */ unlock_rmap(rmap); - hptep[0] |= HPTE_V_ABSENT; + hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); kvmppc_invalidate_hpte(kvm, hptep, index); /* don't lose previous R and C bits */ - r |= hptep[1] & (HPTE_R_R | HPTE_R_C); + r |= be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); } else { kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0); }
- hptep[1] = r; + hptep[1] = cpu_to_be64(r); eieio(); - hptep[0] = hpte[0]; + hptep[0] = cpu_to_be64(hpte[0]); asm volatile("ptesync" : : : "memory"); preempt_enable(); if (page && hpte_is_writable(r)) @@@ -779,7 -782,7 +777,7 @@@ return ret;
out_unlock: - hptep[0] &= ~HPTE_V_HVLOCK; + hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); preempt_enable(); goto out_put; } @@@ -855,7 -858,7 +853,7 @@@ static int kvm_unmap_rmapp(struct kvm * { struct revmap_entry *rev = kvm->arch.revmap; unsigned long h, i, j; - unsigned long *hptep; + __be64 *hptep; unsigned long ptel, psize, rcbits;
for (;;) { @@@ -871,11 -874,11 +869,11 @@@ * rmap chain lock. */ i = *rmapp & KVMPPC_RMAP_INDEX; - hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); + hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { /* unlock rmap before spinning on the HPTE lock */ unlock_rmap(rmapp); - while (hptep[0] & HPTE_V_HVLOCK) + while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK) cpu_relax(); continue; } @@@ -894,14 -897,14 +892,14 @@@
/* Now check and modify the HPTE */ ptel = rev[i].guest_rpte; - psize = hpte_page_size(hptep[0], ptel); - if ((hptep[0] & HPTE_V_VALID) && + psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel); + if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && hpte_rpn(ptel, psize) == gfn) { if (kvm->arch.using_mmu_notifiers) - hptep[0] |= HPTE_V_ABSENT; + hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); kvmppc_invalidate_hpte(kvm, hptep, i); /* Harvest R and C */ - rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C); + rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; if (rcbits & ~rev[i].guest_rpte) { rev[i].guest_rpte = ptel | rcbits; @@@ -909,7 -912,7 +907,7 @@@ } } unlock_rmap(rmapp); - hptep[0] &= ~HPTE_V_HVLOCK; + hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); } return 0; } @@@ -956,7 -959,7 +954,7 @@@ static int kvm_age_rmapp(struct kvm *kv { struct revmap_entry *rev = kvm->arch.revmap; unsigned long head, i, j; - unsigned long *hptep; + __be64 *hptep; int ret = 0;
retry: @@@ -972,24 -975,23 +970,24 @@@
i = head = *rmapp & KVMPPC_RMAP_INDEX; do { - hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); + hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); j = rev[i].forw;
/* If this HPTE isn't referenced, ignore it */ - if (!(hptep[1] & HPTE_R_R)) + if (!(be64_to_cpu(hptep[1]) & HPTE_R_R)) continue;
if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { /* unlock rmap before spinning on the HPTE lock */ unlock_rmap(rmapp); - while (hptep[0] & HPTE_V_HVLOCK) + while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK) cpu_relax(); goto retry; }
/* Now check and modify the HPTE */ - if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_R)) { + if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && + (be64_to_cpu(hptep[1]) & HPTE_R_R)) { kvmppc_clear_ref_hpte(kvm, hptep, i); if (!(rev[i].guest_rpte & HPTE_R_R)) { rev[i].guest_rpte |= HPTE_R_R; @@@ -997,7 -999,7 +995,7 @@@ } ret = 1; } - hptep[0] &= ~HPTE_V_HVLOCK; + hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); } while ((i = j) != head);
unlock_rmap(rmapp); @@@ -1031,7 -1033,7 +1029,7 @@@ static int kvm_test_age_rmapp(struct kv do { hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4)); j = rev[i].forw; - if (hp[1] & HPTE_R_R) + if (be64_to_cpu(hp[1]) & HPTE_R_R) goto out; } while ((i = j) != head); } @@@ -1071,7 -1073,7 +1069,7 @@@ static int kvm_test_clear_dirty_npages( unsigned long head, i, j; unsigned long n; unsigned long v, r; - unsigned long *hptep; + __be64 *hptep; int npages_dirty = 0;
retry: @@@ -1087,8 -1089,7 +1085,8 @@@
i = head = *rmapp & KVMPPC_RMAP_INDEX; do { - hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); + unsigned long hptep1; + hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); j = rev[i].forw;
/* @@@ -1105,30 -1106,29 +1103,30 @@@ * Otherwise we need to do the tlbie even if C==0 in * order to pick up any delayed writeback of C. */ - if (!(hptep[1] & HPTE_R_C) && - (!hpte_is_writable(hptep[1]) || vcpus_running(kvm))) + hptep1 = be64_to_cpu(hptep[1]); + if (!(hptep1 & HPTE_R_C) && + (!hpte_is_writable(hptep1) || vcpus_running(kvm))) continue;
if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { /* unlock rmap before spinning on the HPTE lock */ unlock_rmap(rmapp); - while (hptep[0] & HPTE_V_HVLOCK) + while (hptep[0] & cpu_to_be64(HPTE_V_HVLOCK)) cpu_relax(); goto retry; }
/* Now check and modify the HPTE */ - if (!(hptep[0] & HPTE_V_VALID)) + if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) continue;
/* need to make it temporarily absent so C is stable */ - hptep[0] |= HPTE_V_ABSENT; + hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); kvmppc_invalidate_hpte(kvm, hptep, i); - v = hptep[0]; - r = hptep[1]; + v = be64_to_cpu(hptep[0]); + r = be64_to_cpu(hptep[1]); if (r & HPTE_R_C) { - hptep[1] = r & ~HPTE_R_C; + hptep[1] = cpu_to_be64(r & ~HPTE_R_C); if (!(rev[i].guest_rpte & HPTE_R_C)) { rev[i].guest_rpte |= HPTE_R_C; note_hpte_modification(kvm, &rev[i]); @@@ -1141,7 -1141,7 +1139,7 @@@ } v &= ~(HPTE_V_ABSENT | HPTE_V_HVLOCK); v |= HPTE_V_VALID; - hptep[0] = v; + hptep[0] = cpu_to_be64(v); } while ((i = j) != head);
unlock_rmap(rmapp); @@@ -1305,7 -1305,7 +1303,7 @@@ struct kvm_htab_ctx * Returns 1 if this HPT entry has been modified or has pending * R/C bit changes. */ -static int hpte_dirty(struct revmap_entry *revp, unsigned long *hptp) +static int hpte_dirty(struct revmap_entry *revp, __be64 *hptp) { unsigned long rcbits_unset;
@@@ -1314,14 -1314,13 +1312,14 @@@
/* Also need to consider changes in reference and changed bits */ rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); - if ((hptp[0] & HPTE_V_VALID) && (hptp[1] & rcbits_unset)) + if ((be64_to_cpu(hptp[0]) & HPTE_V_VALID) && + (be64_to_cpu(hptp[1]) & rcbits_unset)) return 1;
return 0; }
-static long record_hpte(unsigned long flags, unsigned long *hptp, +static long record_hpte(unsigned long flags, __be64 *hptp, unsigned long *hpte, struct revmap_entry *revp, int want_valid, int first_pass) { @@@ -1336,10 -1335,10 +1334,10 @@@ return 0;
valid = 0; - if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) { + if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) { valid = 1; if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && - !(hptp[0] & HPTE_V_BOLTED)) + !(be64_to_cpu(hptp[0]) & HPTE_V_BOLTED)) valid = 0; } if (valid != want_valid) @@@ -1351,7 -1350,7 +1349,7 @@@ preempt_disable(); while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) cpu_relax(); - v = hptp[0]; + v = be64_to_cpu(hptp[0]);
/* re-evaluate valid and dirty from synchronized HPTE value */ valid = !!(v & HPTE_V_VALID); @@@ -1359,9 -1358,9 +1357,9 @@@
/* Harvest R and C into guest view if necessary */ rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); - if (valid && (rcbits_unset & hptp[1])) { - revp->guest_rpte |= (hptp[1] & (HPTE_R_R | HPTE_R_C)) | - HPTE_GR_MODIFIED; + if (valid && (rcbits_unset & be64_to_cpu(hptp[1]))) { + revp->guest_rpte |= (be64_to_cpu(hptp[1]) & + (HPTE_R_R | HPTE_R_C)) | HPTE_GR_MODIFIED; dirty = 1; }
@@@ -1380,13 -1379,13 +1378,13 @@@ revp->guest_rpte = r; } asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); - hptp[0] &= ~HPTE_V_HVLOCK; + hptp[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); preempt_enable(); if (!(valid == want_valid && (first_pass || dirty))) ok = 0; } - hpte[0] = v; - hpte[1] = r; + hpte[0] = cpu_to_be64(v); + hpte[1] = cpu_to_be64(r); return ok; }
@@@ -1396,7 -1395,7 +1394,7 @@@ static ssize_t kvm_htab_read(struct fil struct kvm_htab_ctx *ctx = file->private_data; struct kvm *kvm = ctx->kvm; struct kvm_get_htab_header hdr; - unsigned long *hptp; + __be64 *hptp; struct revmap_entry *revp; unsigned long i, nb, nw; unsigned long __user *lbuf; @@@ -1412,7 -1411,7 +1410,7 @@@ flags = ctx->flags;
i = ctx->index; - hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); + hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); revp = kvm->arch.revmap + i; lbuf = (unsigned long __user *)buf;
@@@ -1496,7 -1495,7 +1494,7 @@@ static ssize_t kvm_htab_write(struct fi unsigned long i, j; unsigned long v, r; unsigned long __user *lbuf; - unsigned long *hptp; + __be64 *hptp; unsigned long tmp[2]; ssize_t nb; long int err, ret; @@@ -1538,7 -1537,7 +1536,7 @@@ i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte) break;
- hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); + hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); lbuf = (unsigned long __user *)buf; for (j = 0; j < hdr.n_valid; ++j) { err = -EFAULT; @@@ -1550,7 -1549,7 +1548,7 @@@ lbuf += 2; nb += HPTE_SIZE;
- if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) + if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) kvmppc_do_h_remove(kvm, 0, i, 0, tmp); err = -EIO; ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r, @@@ -1576,7 -1575,7 +1574,7 @@@ }
for (j = 0; j < hdr.n_invalid; ++j) { - if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) + if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) kvmppc_do_h_remove(kvm, 0, i, 0, tmp); ++i; hptp += 2; diff --combined arch/powerpc/kvm/book3s_hv_builtin.c index 3b41447,6cf498a..329d7fd --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@@ -16,12 -16,14 +16,14 @@@ #include <linux/init.h> #include <linux/memblock.h> #include <linux/sizes.h> + #include <linux/cma.h>
#include <asm/cputable.h> #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h>
- #include "book3s_hv_cma.h" + #define KVM_CMA_CHUNK_ORDER 18 + /* * Hash page table alignment on newer cpus(CPU_FTR_ARCH_206) * should be power of 2. @@@ -43,6 -45,8 +45,8 @@@ static unsigned long kvm_cma_resv_rati unsigned long kvm_rma_pages = (1 << 27) >> PAGE_SHIFT; /* 128MB */ EXPORT_SYMBOL_GPL(kvm_rma_pages);
+ static struct cma *kvm_cma; + /* Work out RMLS (real mode limit selector) field value for a given RMA size. Assumes POWER7 or PPC970. */ static inline int lpcr_rmls(unsigned long rma_size) @@@ -97,7 -101,7 +101,7 @@@ struct kvm_rma_info *kvm_alloc_rma( ri = kmalloc(sizeof(struct kvm_rma_info), GFP_KERNEL); if (!ri) return NULL; - page = kvm_alloc_cma(kvm_rma_pages, kvm_rma_pages); + page = cma_alloc(kvm_cma, kvm_rma_pages, get_order(kvm_rma_pages)); if (!page) goto err_out; atomic_set(&ri->use_count, 1); @@@ -112,7 -116,7 +116,7 @@@ EXPORT_SYMBOL_GPL(kvm_alloc_rma) void kvm_release_rma(struct kvm_rma_info *ri) { if (atomic_dec_and_test(&ri->use_count)) { - kvm_release_cma(pfn_to_page(ri->base_pfn), kvm_rma_pages); + cma_release(kvm_cma, pfn_to_page(ri->base_pfn), kvm_rma_pages); kfree(ri); } } @@@ -131,16 -135,18 +135,18 @@@ struct page *kvm_alloc_hpt(unsigned lon { unsigned long align_pages = HPT_ALIGN_PAGES;
+ VM_BUG_ON(get_order(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); + /* Old CPUs require HPT aligned on a multiple of its size */ if (!cpu_has_feature(CPU_FTR_ARCH_206)) align_pages = nr_pages; - return kvm_alloc_cma(nr_pages, align_pages); + return cma_alloc(kvm_cma, nr_pages, get_order(align_pages)); } EXPORT_SYMBOL_GPL(kvm_alloc_hpt);
void kvm_release_hpt(struct page *page, unsigned long nr_pages) { - kvm_release_cma(page, nr_pages); + cma_release(kvm_cma, page, nr_pages); } EXPORT_SYMBOL_GPL(kvm_release_hpt);
@@@ -179,7 -185,8 +185,8 @@@ void __init kvm_cma_reserve(void align_size = HPT_ALIGN_PAGES << PAGE_SHIFT;
align_size = max(kvm_rma_pages << PAGE_SHIFT, align_size); - kvm_cma_declare_contiguous(selected_size, align_size); + cma_declare_contiguous(0, selected_size, 0, align_size, + KVM_CMA_CHUNK_ORDER - PAGE_SHIFT, false, &kvm_cma); } }
@@@ -212,16 -219,3 +219,16 @@@ bool kvm_hv_mode_active(void { return atomic_read(&hv_vm_count) != 0; } + +extern int hcall_real_table[], hcall_real_table_end[]; + +int kvmppc_hcall_impl_hv_realmode(unsigned long cmd) +{ + cmd /= 4; + if (cmd < hcall_real_table_end - hcall_real_table && + hcall_real_table[cmd]) + return 1; + + return 0; +} +EXPORT_SYMBOL_GPL(kvmppc_hcall_impl_hv_realmode); diff --combined arch/s390/Kconfig index 720a11d,d12d40e..0c96f38 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@@ -116,6 -116,7 +116,6 @@@ config S39 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER - select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_GZIP @@@ -136,6 -137,7 +136,6 @@@ select HAVE_SYSCALL_TRACEPOINTS select HAVE_UID16 if 32BIT select HAVE_VIRT_CPU_ACCOUNTING - select KTIME_SCALAR if 32BIT select MODULES_USE_ELF_RELA select NO_BOOTMEM select OLD_SIGACTION @@@ -144,6 -146,7 +144,7 @@@ select TTY select VIRT_CPU_ACCOUNTING select VIRT_TO_BUS + select ARCH_HAS_SG_CHAIN
config SCHED_OMIT_FRAME_POINTER def_bool y diff --combined arch/sparc/Kconfig index 4692c90,bff3192..a537816 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@@ -42,6 -42,7 +42,7 @@@ config SPAR select MODULES_USE_ELF_RELA select ODD_RT_SIGACTION select OLD_SIGSUSPEND + select ARCH_HAS_SG_CHAIN
config SPARC32 def_bool !64BIT @@@ -55,6 -56,7 +56,6 @@@ config SPARC6 select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_GRAPH_FP_TEST - select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_KRETPROBES select HAVE_KPROBES select HAVE_RCU_TABLE_FREE if SMP diff --combined arch/x86/Kconfig index cb101ff,2ae952c..8f0d295 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@@ -21,7 -21,6 +21,7 @@@ config X86_6 ### Arch settings config X86 def_bool y + select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO @@@ -55,6 -54,7 +55,6 @@@ select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_GRAPH_FP_TEST - select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_SYSCALL_TRACEPOINTS select SYSCTL_EXCEPTION_TRACE select HAVE_KVM @@@ -96,6 -96,7 +96,7 @@@ select IRQ_FORCED_THREADING select HAVE_BPF_JIT if X86_64 select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select ARCH_HAS_SG_CHAIN select CLKEVT_I8253 select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_IOMAP @@@ -109,9 -110,9 +110,9 @@@ select CLOCKSOURCE_WATCHDOG select GENERIC_CLOCKEVENTS select ARCH_CLOCKSOURCE_DATA + select CLOCKSOURCE_VALIDATE_LAST_CYCLE select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC) select GENERIC_TIME_VSYSCALL - select KTIME_SCALAR if X86_32 select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select HAVE_CONTEXT_TRACKING if X86_64 @@@ -132,9 -133,6 +133,9 @@@ select GENERIC_CPU_AUTOPROBE select HAVE_ARCH_AUDITSYSCALL select ARCH_SUPPORTS_ATOMIC_RMW + select ACPI_LEGACY_TABLES_LOOKUP if ACPI + select HAVE_ACPI_APEI if ACPI + select HAVE_ACPI_APEI_NMI if ACPI
config INSTRUCTION_DECODER def_bool y @@@ -433,7 -431,6 +434,7 @@@ config X86_INTEL_C bool "CE4100 TV platform" depends on PCI depends on PCI_GODIRECT + depends on X86_IO_APIC depends on X86_32 depends on X86_EXTENDED_PLATFORM select X86_REBOOTFIXUPS @@@ -541,7 -538,7 +542,7 @@@ config X86_32_IRI
config SCHED_OMIT_FRAME_POINTER def_bool y - prompt "Single-depth WCHAN output" + prompt "Single-depth WCHAN output" if !LTO && !FRAME_POINTER depends on X86 ---help--- Calculate simpler /proc/<PID>/wchan values. If this option @@@ -840,7 -837,6 +841,7 @@@ config X86_IO_API def_bool y depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC || PCI_MSI select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ + select IRQ_DOMAIN
config X86_REROUTE_FOR_BROKEN_BOOT_IRQS bool "Reroute for broken boot IRQs" @@@ -1528,7 -1524,6 +1529,7 @@@ config EF bool "EFI runtime service support" depends on ACPI select UCS2_STRING + select EFI_RUNTIME_WRAPPERS ---help--- This enables the kernel to use EFI runtime services that are available (such as the EFI variable services). @@@ -1542,7 -1537,7 +1543,7 @@@
config EFI_STUB bool "EFI stub support" - depends on EFI + depends on EFI && !X86_USE_3DNOW ---help--- This kernel feature allows a bzImage to be loaded directly by EFI firmware without the use of a bootloader. @@@ -2410,10 -2405,6 +2411,10 @@@ config IOSF_MB default m depends on PCI
+config PMC_ATOM + def_bool y + depends on PCI + source "net/Kconfig"
source "drivers/Kconfig" diff --combined arch/x86/mm/fault.c index 1dbade8,d30b78b..d393ac6 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@@ -350,7 -350,7 +350,7 @@@ out
void vmalloc_sync_all(void) { - sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); + sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END, 0); }
/* @@@ -577,8 -577,6 +577,8 @@@ static int is_f00f_bug(struct pt_regs *
static const char nx_warning[] = KERN_CRIT "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; +static const char smep_warning[] = KERN_CRIT +"unable to execute userspace code (SMEP?) (uid: %d)\n";
static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, @@@ -599,10 -597,6 +599,10 @@@
if (pte && pte_present(*pte) && !pte_exec(*pte)) printk(nx_warning, from_kuid(&init_user_ns, current_uid())); + if (pte && pte_present(*pte) && pte_exec(*pte) && + (pgd_flags(*pgd) & _PAGE_USER) && + (read_cr4() & X86_CR4_SMEP)) + printk(smep_warning, from_kuid(&init_user_ns, current_uid())); }
printk(KERN_ALERT "BUG: unable to handle kernel "); @@@ -1218,7 -1212,8 +1218,8 @@@ good_area /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo - * the fault: + * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if + * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked. */ fault = handle_mm_fault(mm, vma, address, flags);
diff --combined block/bio-integrity.c index bc423f7b,56754c4..38c8ac2 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@@ -70,10 -70,8 +70,10 @@@ struct bio_integrity_payload *bio_integ bs->bvec_integrity_pool); if (!bip->bip_vec) goto err; + bip->bip_max_vcnt = bvec_nr_vecs(idx); } else { bip->bip_vec = bip->bip_inline_vecs; + bip->bip_max_vcnt = inline_vecs; }
bip->bip_slab = idx; @@@ -116,6 -114,14 +116,6 @@@ void bio_integrity_free(struct bio *bio } EXPORT_SYMBOL(bio_integrity_free);
-static inline unsigned int bip_integrity_vecs(struct bio_integrity_payload *bip) -{ - if (bip->bip_slab == BIO_POOL_NONE) - return BIP_INLINE_VECS; - - return bvec_nr_vecs(bip->bip_slab); -} - /** * bio_integrity_add_page - Attach integrity metadata * @bio: bio to update @@@ -131,7 -137,7 +131,7 @@@ int bio_integrity_add_page(struct bio * struct bio_integrity_payload *bip = bio->bi_integrity; struct bio_vec *iv;
- if (bip->bip_vcnt >= bip_integrity_vecs(bip)) { + if (bip->bip_vcnt >= bip->bip_max_vcnt) { printk(KERN_ERR "%s: bip_vec full\n", __func__); return 0; } @@@ -646,6 -652,4 +646,4 @@@ void __init bio_integrity_init(void sizeof(struct bio_integrity_payload) + sizeof(struct bio_vec) * BIP_INLINE_VECS, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - if (!bip_slab) - panic("Failed to create slab\n"); } diff --combined drivers/ata/Kconfig index e65d400,b0d5b5a..e1b9278 --- a/drivers/ata/Kconfig +++ b/drivers/ata/Kconfig @@@ -16,6 -16,7 +16,7 @@@ menuconfig AT depends on BLOCK depends on !(M32R || M68K || S390) || BROKEN select SCSI + select GLOB ---help--- If you want to use an ATA hard disk, ATA tape drive, ATA CD-ROM or any other ATA device under Linux, say Y and make sure that you know @@@ -141,15 -142,6 +142,15 @@@ config AHCI_SUNX
If unsure, say N.
+config AHCI_TEGRA + tristate "NVIDIA Tegra124 AHCI SATA support" + depends on ARCH_TEGRA + help + This option enables support for the NVIDIA Tegra124 SoC's + onboard AHCI SATA. + + If unsure, say N. + config AHCI_XGENE tristate "APM X-Gene 6.0Gbps AHCI SATA host controller support" depends on PHY_XGENE diff --combined drivers/base/Kconfig index 88500fe,9d5fed1..4e7f0ff --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@@ -149,21 -149,15 +149,21 @@@ config EXTRA_FIRMWARE_DI some other directory containing the firmware files.
config FW_LOADER_USER_HELPER + bool + +config FW_LOADER_USER_HELPER_FALLBACK bool "Fallback user-helper invocation for firmware loading" depends on FW_LOADER - default y + select FW_LOADER_USER_HELPER help This option enables / disables the invocation of user-helper (e.g. udev) for loading firmware files as a fallback after the direct file loading in kernel fails. The user-mode helper is no longer required unless you have a special firmware file that - resides in a non-standard path. + resides in a non-standard path. Moreover, the udev support has + been deprecated upstream. + + If you are unsure about this, say N here.
config DEBUG_DRIVER bool "Driver Core verbose debug messages" @@@ -214,15 -208,6 +214,15 @@@ config DMA_SHARED_BUFFE APIs extension; the file's descriptor can then be passed on to other driver.
+config FENCE_TRACE + bool "Enable verbose FENCE_TRACE messages" + depends on DMA_SHARED_BUFFER + help + Enable the FENCE_TRACE printks. This will add extra + spam to the console log, but will make it easier to diagnose + lockup related problems for dma-buffers shared across multiple + devices. + config DMA_CMA bool "DMA Contiguous Memory Allocator" depends on HAVE_DMA_CONTIGUOUS && CMA @@@ -289,16 -274,6 +289,6 @@@ config CMA_ALIGNMEN
If unsure, leave the default value "8".
- config CMA_AREAS - int "Maximum count of the CMA device-private areas" - default 7 - help - CMA allows to create CMA areas for particular devices. This parameter - sets the maximum number of such device private CMA areas in the - system. - - If unsure, leave the default value "7". - endif
endmenu diff --combined drivers/leds/Kconfig index 8c96e2d,6784c17..f6e32ba --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@@ -11,9 -11,6 +11,6 @@@ menuconfig NEW_LED Say Y to enable Linux LED support. This allows control of supported LEDs from both userspace and optionally, by kernel events (triggers).
- This is not related to standard keyboard LEDs which are controlled - via the input system. - if NEW_LEDS
config LEDS_CLASS @@@ -32,6 -29,14 +29,6 @@@ config LEDS_88PM860 This option enables support for on-chip LED drivers found on Marvell Semiconductor 88PM8606 PMIC.
-config LEDS_ATMEL_PWM - tristate "LED Support using Atmel PWM outputs" - depends on LEDS_CLASS - depends on ATMEL_PWM - help - This option enables support for LEDs driven using outputs - of the dedicated PWM controller found on newer Atmel SOCs. - config LEDS_LM3530 tristate "LCD Backlight driver for LM3530" depends on LEDS_CLASS @@@ -135,13 -140,6 +132,13 @@@ config LEDS_SUNFIR This option enables support for the Left, Middle, and Right LEDs on the I/O and CPU boards of SunFire UltraSPARC servers.
+config LEDS_IPAQ_MICRO + tristate "LED Support for the Compaq iPAQ h3xxx" + depends on MFD_IPAQ_MICRO + help + Choose this option if you want to use the notification LED on + Compaq/HP iPAQ h3100 and h3600. + config LEDS_HP6XX tristate "LED Support for the HP Jornada 6xx" depends on LEDS_CLASS diff --combined drivers/net/ethernet/intel/i40e/i40e_ethtool.c index 681a9e8,c57b085..e8ba747 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@@ -155,19 -155,6 +155,19 @@@ static struct i40e_stats i40e_gstrings_ I40E_PF_STAT("rx_lpi_count", stats.rx_lpi_count), };
+#ifdef I40E_FCOE +static const struct i40e_stats i40e_gstrings_fcoe_stats[] = { + I40E_VSI_STAT("fcoe_bad_fccrc", fcoe_stats.fcoe_bad_fccrc), + I40E_VSI_STAT("rx_fcoe_dropped", fcoe_stats.rx_fcoe_dropped), + I40E_VSI_STAT("rx_fcoe_packets", fcoe_stats.rx_fcoe_packets), + I40E_VSI_STAT("rx_fcoe_dwords", fcoe_stats.rx_fcoe_dwords), + I40E_VSI_STAT("fcoe_ddp_count", fcoe_stats.fcoe_ddp_count), + I40E_VSI_STAT("fcoe_last_error", fcoe_stats.fcoe_last_error), + I40E_VSI_STAT("tx_fcoe_packets", fcoe_stats.tx_fcoe_packets), + I40E_VSI_STAT("tx_fcoe_dwords", fcoe_stats.tx_fcoe_dwords), +}; + +#endif /* I40E_FCOE */ #define I40E_QUEUE_STATS_LEN(n) \ (((struct i40e_netdev_priv *)netdev_priv((n)))->vsi->num_queue_pairs \ * 2 /* Tx and Rx together */ \ @@@ -175,17 -162,9 +175,17 @@@ #define I40E_GLOBAL_STATS_LEN ARRAY_SIZE(i40e_gstrings_stats) #define I40E_NETDEV_STATS_LEN ARRAY_SIZE(i40e_gstrings_net_stats) #define I40E_MISC_STATS_LEN ARRAY_SIZE(i40e_gstrings_misc_stats) +#ifdef I40E_FCOE +#define I40E_FCOE_STATS_LEN ARRAY_SIZE(i40e_gstrings_fcoe_stats) +#define I40E_VSI_STATS_LEN(n) (I40E_NETDEV_STATS_LEN + \ + I40E_FCOE_STATS_LEN + \ + I40E_MISC_STATS_LEN + \ + I40E_QUEUE_STATS_LEN((n))) +#else #define I40E_VSI_STATS_LEN(n) (I40E_NETDEV_STATS_LEN + \ I40E_MISC_STATS_LEN + \ I40E_QUEUE_STATS_LEN((n))) +#endif /* I40E_FCOE */ #define I40E_PFC_STATS_LEN ( \ (FIELD_SIZEOF(struct i40e_pf, stats.priority_xoff_rx) + \ FIELD_SIZEOF(struct i40e_pf, stats.priority_xon_rx) + \ @@@ -236,135 -215,52 +236,135 @@@ static int i40e_get_settings(struct net /* hardware is either in 40G mode or 10G mode * NOTE: this section initializes supported and advertising */ + if (!link_up) { + /* link is down and the driver needs to fall back on + * device ID to determine what kinds of info to display, + * it's mostly a guess that may change when link is up + */ + switch (hw->device_id) { + case I40E_DEV_ID_QSFP_A: + case I40E_DEV_ID_QSFP_B: + case I40E_DEV_ID_QSFP_C: + /* pluggable QSFP */ + ecmd->supported = SUPPORTED_40000baseSR4_Full | + SUPPORTED_40000baseCR4_Full | + SUPPORTED_40000baseLR4_Full; + ecmd->advertising = ADVERTISED_40000baseSR4_Full | + ADVERTISED_40000baseCR4_Full | + ADVERTISED_40000baseLR4_Full; + break; + case I40E_DEV_ID_KX_B: + /* backplane 40G */ + ecmd->supported = SUPPORTED_40000baseKR4_Full; + ecmd->advertising = ADVERTISED_40000baseKR4_Full; + break; + case I40E_DEV_ID_KX_C: + /* backplane 10G */ + ecmd->supported = SUPPORTED_10000baseKR_Full; + ecmd->advertising = ADVERTISED_10000baseKR_Full; + break; + default: + /* all the rest are 10G/1G */ + ecmd->supported = SUPPORTED_10000baseT_Full | + SUPPORTED_1000baseT_Full; + ecmd->advertising = ADVERTISED_10000baseT_Full | + ADVERTISED_1000baseT_Full; + break; + } + + /* skip phy_type use as it is zero when link is down */ + goto no_valid_phy_type; + } + switch (hw_link_info->phy_type) { case I40E_PHY_TYPE_40GBASE_CR4: case I40E_PHY_TYPE_40GBASE_CR4_CU: - ecmd->supported = SUPPORTED_40000baseCR4_Full; - ecmd->advertising = ADVERTISED_40000baseCR4_Full; + ecmd->supported = SUPPORTED_Autoneg | + SUPPORTED_40000baseCR4_Full; + ecmd->advertising = ADVERTISED_Autoneg | + ADVERTISED_40000baseCR4_Full; break; case I40E_PHY_TYPE_40GBASE_KR4: - ecmd->supported = SUPPORTED_40000baseKR4_Full; - ecmd->advertising = ADVERTISED_40000baseKR4_Full; + ecmd->supported = SUPPORTED_Autoneg | + SUPPORTED_40000baseKR4_Full; + ecmd->advertising = ADVERTISED_Autoneg | + ADVERTISED_40000baseKR4_Full; break; case I40E_PHY_TYPE_40GBASE_SR4: + case I40E_PHY_TYPE_XLPPI: + case 
I40E_PHY_TYPE_XLAUI: ecmd->supported = SUPPORTED_40000baseSR4_Full; break; case I40E_PHY_TYPE_40GBASE_LR4: ecmd->supported = SUPPORTED_40000baseLR4_Full; - ecmd->advertising = ADVERTISED_40000baseLR4_Full; break; case I40E_PHY_TYPE_10GBASE_KX4: - ecmd->supported = SUPPORTED_10000baseKX4_Full; - ecmd->advertising = ADVERTISED_10000baseKX4_Full; + ecmd->supported = SUPPORTED_Autoneg | + SUPPORTED_10000baseKX4_Full; + ecmd->advertising = ADVERTISED_Autoneg | + ADVERTISED_10000baseKX4_Full; break; case I40E_PHY_TYPE_10GBASE_KR: - ecmd->supported = SUPPORTED_10000baseKR_Full; - ecmd->advertising = ADVERTISED_10000baseKR_Full; + ecmd->supported = SUPPORTED_Autoneg | + SUPPORTED_10000baseKR_Full; + ecmd->advertising = ADVERTISED_Autoneg | + ADVERTISED_10000baseKR_Full; break; - default: - if (i40e_is_40G_device(hw->device_id)) { - ecmd->supported = SUPPORTED_40000baseSR4_Full; - ecmd->advertising = ADVERTISED_40000baseSR4_Full; - } else { - ecmd->supported = SUPPORTED_10000baseT_Full; - ecmd->advertising = ADVERTISED_10000baseT_Full; - } + case I40E_PHY_TYPE_10GBASE_SR: + case I40E_PHY_TYPE_10GBASE_LR: + ecmd->supported = SUPPORTED_10000baseT_Full; + break; + case I40E_PHY_TYPE_10GBASE_CR1_CU: + case I40E_PHY_TYPE_10GBASE_CR1: + case I40E_PHY_TYPE_10GBASE_T: + ecmd->supported = SUPPORTED_Autoneg | + SUPPORTED_10000baseT_Full; + ecmd->advertising = ADVERTISED_Autoneg | + ADVERTISED_10000baseT_Full; + break; + case I40E_PHY_TYPE_XAUI: + case I40E_PHY_TYPE_XFI: + case I40E_PHY_TYPE_SFI: + case I40E_PHY_TYPE_10GBASE_SFPP_CU: + ecmd->supported = SUPPORTED_10000baseT_Full; break; + case I40E_PHY_TYPE_1000BASE_KX: + case I40E_PHY_TYPE_1000BASE_T: + ecmd->supported = SUPPORTED_Autoneg | + SUPPORTED_1000baseT_Full; + ecmd->advertising = ADVERTISED_Autoneg | + ADVERTISED_1000baseT_Full; + break; + case I40E_PHY_TYPE_100BASE_TX: + ecmd->supported = SUPPORTED_Autoneg | + SUPPORTED_100baseT_Full; + ecmd->advertising = ADVERTISED_Autoneg | + ADVERTISED_100baseT_Full; + break; + case I40E_PHY_TYPE_SGMII: + ecmd->supported = SUPPORTED_Autoneg | + SUPPORTED_1000baseT_Full | + SUPPORTED_100baseT_Full; + ecmd->advertising = ADVERTISED_Autoneg | + ADVERTISED_1000baseT_Full | + ADVERTISED_100baseT_Full; + break; + default: + /* if we got here and link is up something bad is afoot */ + WARN_ON(link_up); }
- ecmd->supported |= SUPPORTED_Autoneg; - ecmd->advertising |= ADVERTISED_Autoneg; +no_valid_phy_type: + /* this is if autoneg is enabled or disabled */ ecmd->autoneg = ((hw_link_info->an_info & I40E_AQ_AN_COMPLETED) ? AUTONEG_ENABLE : AUTONEG_DISABLE);
switch (hw->phy.media_type) { case I40E_MEDIA_TYPE_BACKPLANE: - ecmd->supported |= SUPPORTED_Backplane; - ecmd->advertising |= ADVERTISED_Backplane; + ecmd->supported |= SUPPORTED_Autoneg | + SUPPORTED_Backplane; + ecmd->advertising |= ADVERTISED_Autoneg | + ADVERTISED_Backplane; ecmd->port = PORT_NONE; break; case I40E_MEDIA_TYPE_BASET: @@@ -380,6 -276,7 +380,6 @@@ break; case I40E_MEDIA_TYPE_FIBER: ecmd->supported |= SUPPORTED_FIBRE; - ecmd->advertising |= ADVERTISED_FIBRE; ecmd->port = PORT_FIBRE; break; case I40E_MEDIA_TYPE_UNKNOWN: @@@ -390,25 -287,6 +390,25 @@@
ecmd->transceiver = XCVR_EXTERNAL;
+ ecmd->supported |= SUPPORTED_Pause; + + switch (hw->fc.current_mode) { + case I40E_FC_FULL: + ecmd->advertising |= ADVERTISED_Pause; + break; + case I40E_FC_TX_PAUSE: + ecmd->advertising |= ADVERTISED_Asym_Pause; + break; + case I40E_FC_RX_PAUSE: + ecmd->advertising |= (ADVERTISED_Pause | + ADVERTISED_Asym_Pause); + break; + default: + ecmd->advertising &= ~(ADVERTISED_Pause | + ADVERTISED_Asym_Pause); + break; + } + if (link_up) { switch (link_speed) { case I40E_LINK_SPEED_40GB: @@@ -418,9 -296,6 +418,9 @@@ case I40E_LINK_SPEED_10GB: ethtool_cmd_speed_set(ecmd, SPEED_10000); break; + case I40E_LINK_SPEED_1GB: + ethtool_cmd_speed_set(ecmd, SPEED_1000); + break; default: break; } @@@ -434,182 -309,6 +434,182 @@@ }
/** + * i40e_set_settings - Set Speed and Duplex + * @netdev: network interface device structure + * @ecmd: ethtool command + * + * Set speed/duplex per media_types advertised/forced + **/ +static int i40e_set_settings(struct net_device *netdev, + struct ethtool_cmd *ecmd) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_aq_get_phy_abilities_resp abilities; + struct i40e_aq_set_phy_config config; + struct i40e_pf *pf = np->vsi->back; + struct i40e_vsi *vsi = np->vsi; + struct i40e_hw *hw = &pf->hw; + struct ethtool_cmd safe_ecmd; + i40e_status status = 0; + bool change = false; + int err = 0; + u8 autoneg; + u32 advertise; + + if (vsi != pf->vsi[pf->lan_vsi]) + return -EOPNOTSUPP; + + if (hw->phy.media_type != I40E_MEDIA_TYPE_BASET && + hw->phy.media_type != I40E_MEDIA_TYPE_FIBER && + hw->phy.media_type != I40E_MEDIA_TYPE_BACKPLANE) + return -EOPNOTSUPP; + + /* get our own copy of the bits to check against */ + memset(&safe_ecmd, 0, sizeof(struct ethtool_cmd)); + i40e_get_settings(netdev, &safe_ecmd); + + /* save autoneg and speed out of ecmd */ + autoneg = ecmd->autoneg; + advertise = ecmd->advertising; + + /* set autoneg and speed back to what they currently are */ + ecmd->autoneg = safe_ecmd.autoneg; + ecmd->advertising = safe_ecmd.advertising; + + ecmd->cmd = safe_ecmd.cmd; + /* If ecmd and safe_ecmd are not the same now, then they are + * trying to set something that we do not support + */ + if (memcmp(ecmd, &safe_ecmd, sizeof(struct ethtool_cmd))) + return -EOPNOTSUPP; + + while (test_bit(__I40E_CONFIG_BUSY, &vsi->state)) + usleep_range(1000, 2000); + + /* Get the current phy config */ + status = i40e_aq_get_phy_capabilities(hw, false, false, &abilities, + NULL); + if (status) + return -EAGAIN; + + /* Copy link_speed and abilities to config in case they are not + * set below + */ + memset(&config, 0, sizeof(struct i40e_aq_set_phy_config)); + config.link_speed = abilities.link_speed; + config.abilities = abilities.abilities; + + /* Check autoneg */ + if (autoneg == AUTONEG_ENABLE) { + /* If autoneg is not supported, return error */ + if (!(safe_ecmd.supported & SUPPORTED_Autoneg)) { + netdev_info(netdev, "Autoneg not supported on this phy\n"); + return -EINVAL; + } + /* If autoneg was not already enabled */ + if (!(hw->phy.link_info.an_info & I40E_AQ_AN_COMPLETED)) { + config.abilities = abilities.abilities | + I40E_AQ_PHY_ENABLE_AN; + change = true; + } + } else { + /* If autoneg is supported 10GBASE_T is the only phy that + * can disable it, so otherwise return error + */ + if (safe_ecmd.supported & SUPPORTED_Autoneg && + hw->phy.link_info.phy_type != I40E_PHY_TYPE_10GBASE_T) { + netdev_info(netdev, "Autoneg cannot be disabled on this phy\n"); + return -EINVAL; + } + /* If autoneg is currently enabled */ + if (hw->phy.link_info.an_info & I40E_AQ_AN_COMPLETED) { + config.abilities = abilities.abilities | + ~I40E_AQ_PHY_ENABLE_AN; + change = true; + } + } + + if (advertise & ~safe_ecmd.supported) + return -EINVAL; + + if (advertise & ADVERTISED_100baseT_Full) + if (!(abilities.link_speed & I40E_LINK_SPEED_100MB)) { + config.link_speed |= I40E_LINK_SPEED_100MB; + change = true; + } + if (advertise & ADVERTISED_1000baseT_Full || + advertise & ADVERTISED_1000baseKX_Full) + if (!(abilities.link_speed & I40E_LINK_SPEED_1GB)) { + config.link_speed |= I40E_LINK_SPEED_1GB; + change = true; + } + if (advertise & ADVERTISED_10000baseT_Full || + advertise & ADVERTISED_10000baseKX4_Full || + advertise & ADVERTISED_10000baseKR_Full) + if (!(abilities.link_speed & 
I40E_LINK_SPEED_10GB)) { + config.link_speed |= I40E_LINK_SPEED_10GB; + change = true; + } + if (advertise & ADVERTISED_40000baseKR4_Full || + advertise & ADVERTISED_40000baseCR4_Full || + advertise & ADVERTISED_40000baseSR4_Full || + advertise & ADVERTISED_40000baseLR4_Full) + if (!(abilities.link_speed & I40E_LINK_SPEED_40GB)) { + config.link_speed |= I40E_LINK_SPEED_40GB; + change = true; + } + + if (change) { + /* copy over the rest of the abilities */ + config.phy_type = abilities.phy_type; + config.eee_capability = abilities.eee_capability; + config.eeer = abilities.eeer_val; + config.low_power_ctrl = abilities.d3_lpan; + + /* If link is up set link and an so changes take effect */ + if (hw->phy.link_info.link_info & I40E_AQ_LINK_UP) + config.abilities |= I40E_AQ_PHY_ENABLE_ATOMIC_LINK; + + /* make the aq call */ + status = i40e_aq_set_phy_config(hw, &config, NULL); + if (status) { + netdev_info(netdev, "Set phy config failed with error %d.\n", + status); + return -EAGAIN; + } + + status = i40e_update_link_info(hw, true); + if (status) + netdev_info(netdev, "Updating link info failed with error %d\n", + status); + + } else { + netdev_info(netdev, "Nothing changed, exiting without setting anything.\n"); + } + + return err; +} + +static int i40e_nway_reset(struct net_device *netdev) +{ + /* restart autonegotiation */ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_pf *pf = np->vsi->back; + struct i40e_hw *hw = &pf->hw; + bool link_up = hw->phy.link_info.link_info & I40E_AQ_LINK_UP; + i40e_status ret = 0; + + ret = i40e_aq_set_link_restart_an(hw, link_up, NULL); + if (ret) { + netdev_info(netdev, "link restart failed, aq_err=%d\n", + pf->hw.aq.asq_last_status); + return -EIO; + } + + return 0; +} + +/** * i40e_get_pauseparam - Get Flow Control status * Return tx/rx-pause status **/ @@@ -635,85 -334,6 +635,85 @@@ static void i40e_get_pauseparam(struct } }
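A note on the i40e_set_settings() hunk above: the driver is only willing to change autoneg and the advertised link modes, and it rejects every other request by snapshotting the current settings and comparing. The core of that check, distilled from the hunk (locking and the phy-config call trimmed, so this is a sketch rather than the full function):

        struct ethtool_cmd safe_ecmd;
        u8 autoneg;
        u32 advertise;

        /* snapshot what the device currently reports */
        memset(&safe_ecmd, 0, sizeof(struct ethtool_cmd));
        i40e_get_settings(netdev, &safe_ecmd);

        /* keep the two fields we are prepared to change... */
        autoneg = ecmd->autoneg;
        advertise = ecmd->advertising;

        /* ...and force everything else back to the current values */
        ecmd->autoneg = safe_ecmd.autoneg;
        ecmd->advertising = safe_ecmd.advertising;
        ecmd->cmd = safe_ecmd.cmd;

        /* any remaining difference is something we do not support */
        if (memcmp(ecmd, &safe_ecmd, sizeof(struct ethtool_cmd)))
                return -EOPNOTSUPP;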
+/** + * i40e_set_pauseparam - Set Flow Control parameter + * @netdev: network interface device structure + * @pause: return tx/rx flow control status + **/ +static int i40e_set_pauseparam(struct net_device *netdev, + struct ethtool_pauseparam *pause) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_pf *pf = np->vsi->back; + struct i40e_vsi *vsi = np->vsi; + struct i40e_hw *hw = &pf->hw; + struct i40e_link_status *hw_link_info = &hw->phy.link_info; + bool link_up = hw_link_info->link_info & I40E_AQ_LINK_UP; + i40e_status status; + u8 aq_failures; + int err = 0; + + if (vsi != pf->vsi[pf->lan_vsi]) + return -EOPNOTSUPP; + + if (pause->autoneg != ((hw_link_info->an_info & I40E_AQ_AN_COMPLETED) ? + AUTONEG_ENABLE : AUTONEG_DISABLE)) { + netdev_info(netdev, "To change autoneg please use: ethtool -s <dev> autoneg <on|off>\n"); + return -EOPNOTSUPP; + } + + /* If we have link and don't have autoneg */ + if (!test_bit(__I40E_DOWN, &pf->state) && + !(hw_link_info->an_info & I40E_AQ_AN_COMPLETED)) { + /* Send message that it might not necessarily work*/ + netdev_info(netdev, "Autoneg did not complete so changing settings may not result in an actual change.\n"); + } + + if (hw->fc.current_mode == I40E_FC_PFC) { + netdev_info(netdev, "Priority flow control enabled. Cannot set link flow control.\n"); + return -EOPNOTSUPP; + } + + if (pause->rx_pause && pause->tx_pause) + hw->fc.requested_mode = I40E_FC_FULL; + else if (pause->rx_pause && !pause->tx_pause) + hw->fc.requested_mode = I40E_FC_RX_PAUSE; + else if (!pause->rx_pause && pause->tx_pause) + hw->fc.requested_mode = I40E_FC_TX_PAUSE; + else if (!pause->rx_pause && !pause->tx_pause) + hw->fc.requested_mode = I40E_FC_NONE; + else + return -EINVAL; + + /* Set the fc mode and only restart an if link is up*/ + status = i40e_set_fc(hw, &aq_failures, link_up); + + if (aq_failures & I40E_SET_FC_AQ_FAIL_GET) { + netdev_info(netdev, "Set fc failed on the get_phy_capabilities call with error %d and status %d\n", + status, hw->aq.asq_last_status); + err = -EAGAIN; + } + if (aq_failures & I40E_SET_FC_AQ_FAIL_SET) { + netdev_info(netdev, "Set fc failed on the set_phy_config call with error %d and status %d\n", + status, hw->aq.asq_last_status); + err = -EAGAIN; + } + if (aq_failures & I40E_SET_FC_AQ_FAIL_UPDATE) { + netdev_info(netdev, "Set fc failed on the update_link_info call with error %d and status %d\n", + status, hw->aq.asq_last_status); + err = -EAGAIN; + } + + if (!test_bit(__I40E_DOWN, &pf->state)) { + /* Give it a little more time to try to come back */ + msleep(75); + if (!test_bit(__I40E_DOWN, &pf->state)) + return i40e_nway_reset(netdev); + } + + return err; +} + static u32 i40e_get_msglevel(struct net_device *netdev) { struct i40e_netdev_priv *np = netdev_priv(netdev); @@@ -784,33 -404,10 +784,33 @@@ static int i40e_get_eeprom(struct net_d u8 *eeprom_buff; u16 i, sectors; bool last; + u32 magic; + #define I40E_NVM_SECTOR_SIZE 4096 if (eeprom->len == 0) return -EINVAL;
+ /* check for NVMUpdate access method */ + magic = hw->vendor_id | (hw->device_id << 16); + if (eeprom->magic && eeprom->magic != magic) { + int errno; + + /* make sure it is the right magic for NVMUpdate */ + if ((eeprom->magic >> 16) != hw->device_id) + return -EINVAL; + + ret_val = i40e_nvmupd_command(hw, + (struct i40e_nvm_access *)eeprom, + bytes, &errno); + if (ret_val) + dev_info(&pf->pdev->dev, + "NVMUpdate read failed err=%d status=0x%x\n", + ret_val, hw->aq.asq_last_status); + + return errno; + } + + /* normal ethtool get_eeprom support */ eeprom->magic = hw->vendor_id | (hw->device_id << 16);
eeprom_buff = kzalloc(eeprom->len, GFP_KERNEL); @@@ -837,7 -434,7 +837,7 @@@ ret_val = i40e_aq_read_nvm(hw, 0x0, eeprom->offset + (I40E_NVM_SECTOR_SIZE * i), len, - eeprom_buff + (I40E_NVM_SECTOR_SIZE * i), + (u8 *)eeprom_buff + (I40E_NVM_SECTOR_SIZE * i), last, NULL); if (ret_val) { dev_info(&pf->pdev->dev, @@@ -849,7 -446,7 +849,7 @@@
release_nvm: i40e_release_nvm(hw); - memcpy(bytes, eeprom_buff, eeprom->len); + memcpy(bytes, (u8 *)eeprom_buff, eeprom->len); free_buff: kfree(eeprom_buff); return ret_val; @@@ -869,39 -466,6 +869,39 @@@ static int i40e_get_eeprom_len(struct n return val; }
+static int i40e_set_eeprom(struct net_device *netdev, + struct ethtool_eeprom *eeprom, u8 *bytes) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_hw *hw = &np->vsi->back->hw; + struct i40e_pf *pf = np->vsi->back; + int ret_val = 0; + int errno; + u32 magic; + + /* normal ethtool set_eeprom is not supported */ + magic = hw->vendor_id | (hw->device_id << 16); + if (eeprom->magic == magic) + return -EOPNOTSUPP; + + /* check for NVMUpdate access method */ + if (!eeprom->magic || (eeprom->magic >> 16) != hw->device_id) + return -EINVAL; + + if (test_bit(__I40E_RESET_RECOVERY_PENDING, &pf->state) || + test_bit(__I40E_RESET_INTR_RECEIVED, &pf->state)) + return -EBUSY; + + ret_val = i40e_nvmupd_command(hw, (struct i40e_nvm_access *)eeprom, + bytes, &errno); + if (ret_val) + dev_info(&pf->pdev->dev, + "NVMUpdate write failed err=%d status=0x%x\n", + ret_val, hw->aq.asq_last_status); + + return errno; +} + static void i40e_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) { @@@ -1133,13 -697,6 +1133,13 @@@ static void i40e_get_ethtool_stats(stru data[i++] = (i40e_gstrings_misc_stats[j].sizeof_stat == sizeof(u64)) ? *(u64 *)p : *(u32 *)p; } +#ifdef I40E_FCOE + for (j = 0; j < I40E_FCOE_STATS_LEN; j++) { + p = (char *)vsi + i40e_gstrings_fcoe_stats[j].stat_offset; + data[i++] = (i40e_gstrings_fcoe_stats[j].sizeof_stat == + sizeof(u64)) ? *(u64 *)p : *(u32 *)p; + } +#endif rcu_read_lock(); for (j = 0; j < vsi->num_queue_pairs; j++) { tx_ring = ACCESS_ONCE(vsi->tx_rings[j]); @@@ -1221,13 -778,6 +1221,13 @@@ static void i40e_get_strings(struct net i40e_gstrings_misc_stats[i].stat_string); p += ETH_GSTRING_LEN; } +#ifdef I40E_FCOE + for (i = 0; i < I40E_FCOE_STATS_LEN; i++) { + snprintf(p, ETH_GSTRING_LEN, "%s", + i40e_gstrings_fcoe_stats[i].stat_string); + p += ETH_GSTRING_LEN; + } +#endif for (i = 0; i < vsi->num_queue_pairs; i++) { snprintf(p, ETH_GSTRING_LEN, "tx-%u.tx_packets", i); p += ETH_GSTRING_LEN; @@@ -1471,6 -1021,24 +1471,6 @@@ static int i40e_set_wol(struct net_devi return 0; }
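The new get_eeprom/set_eeprom paths above also serve as the transport for the NVMUpdate flow: reads with magic 0 or the usual vendor_id | (device_id << 16) value take the normal EEPROM path (plain-magic writes are rejected), while anything else must carry the device ID in the upper 16 bits and is handed to i40e_nvmupd_command(). The same test written out as a small standalone helper (illustrative only; the helper name is not part of the driver):

        #include <stdbool.h>
        #include <stdint.h>

        /* mirrors the magic check in i40e_get_eeprom()/i40e_set_eeprom() */
        static bool magic_is_nvmupd(uint32_t magic, uint16_t vendor_id,
                                    uint16_t device_id)
        {
                uint32_t plain = (uint32_t)vendor_id | ((uint32_t)device_id << 16);

                if (!magic || magic == plain)
                        return false;   /* plain ethtool EEPROM access */
                return (magic >> 16) == device_id;      /* NVMUpdate request */
        }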
-static int i40e_nway_reset(struct net_device *netdev) -{ - /* restart autonegotiation */ - struct i40e_netdev_priv *np = netdev_priv(netdev); - struct i40e_pf *pf = np->vsi->back; - struct i40e_hw *hw = &pf->hw; - i40e_status ret = 0; - - ret = i40e_aq_set_link_restart_an(hw, NULL); - if (ret) { - netdev_info(netdev, "link restart failed, aq_err=%d\n", - pf->hw.aq.asq_last_status); - return -EIO; - } - - return 0; -} - static int i40e_set_phys_id(struct net_device *netdev, enum ethtool_phys_id_state state) { @@@ -1537,36 -1105,17 +1537,36 @@@ static int i40e_set_coalesce(struct net if (ec->tx_max_coalesced_frames_irq || ec->rx_max_coalesced_frames_irq) vsi->work_limit = ec->tx_max_coalesced_frames_irq;
+ vector = vsi->base_vector; if ((ec->rx_coalesce_usecs >= (I40E_MIN_ITR << 1)) && - (ec->rx_coalesce_usecs <= (I40E_MAX_ITR << 1))) + (ec->rx_coalesce_usecs <= (I40E_MAX_ITR << 1))) { vsi->rx_itr_setting = ec->rx_coalesce_usecs; - else + } else if (ec->rx_coalesce_usecs == 0) { + vsi->rx_itr_setting = ec->rx_coalesce_usecs; + i40e_irq_dynamic_disable(vsi, vector); + if (ec->use_adaptive_rx_coalesce) + netif_info(pf, drv, netdev, + "Rx-usecs=0, need to disable adaptive-Rx for a complete disable\n"); + } else { + netif_info(pf, drv, netdev, + "Invalid value, Rx-usecs range is 0, 8-8160\n"); return -EINVAL; + }
if ((ec->tx_coalesce_usecs >= (I40E_MIN_ITR << 1)) && - (ec->tx_coalesce_usecs <= (I40E_MAX_ITR << 1))) + (ec->tx_coalesce_usecs <= (I40E_MAX_ITR << 1))) { vsi->tx_itr_setting = ec->tx_coalesce_usecs; - else + } else if (ec->tx_coalesce_usecs == 0) { + vsi->tx_itr_setting = ec->tx_coalesce_usecs; + i40e_irq_dynamic_disable(vsi, vector); + if (ec->use_adaptive_tx_coalesce) + netif_info(pf, drv, netdev, + "Tx-usecs=0, need to disable adaptive-Tx for a complete disable\n"); + } else { + netif_info(pf, drv, netdev, + "Invalid value, Tx-usecs range is 0, 8-8160\n"); return -EINVAL; + }
if (ec->use_adaptive_rx_coalesce) vsi->rx_itr_setting |= I40E_ITR_DYNAMIC; @@@ -1578,6 -1127,7 +1578,6 @@@ else vsi->tx_itr_setting &= ~I40E_ITR_DYNAMIC;
- vector = vsi->base_vector; for (i = 0; i < vsi->num_q_vectors; i++, vector++) { q_vector = vsi->q_vectors[i]; q_vector->rx.itr = ITR_TO_REG(vsi->rx_itr_setting); @@@ -1948,7 -1498,7 +1948,7 @@@ static int i40e_update_ethtool_fdir_ent
/* add filter to the list */ if (parent) - hlist_add_after(&parent->fdir_node, &input->fdir_node); + hlist_add_behind(&input->fdir_node, &parent->fdir_node); else hlist_add_head(&input->fdir_node, &pf->fdir_filter_list); @@@ -2181,7 -1731,6 +2181,7 @@@ static int i40e_set_channels(struct net
static const struct ethtool_ops i40e_ethtool_ops = { .get_settings = i40e_get_settings, + .set_settings = i40e_set_settings, .get_drvinfo = i40e_get_drvinfo, .get_regs_len = i40e_get_regs_len, .get_regs = i40e_get_regs, @@@ -2189,13 -1738,11 +2189,13 @@@ .get_link = ethtool_op_get_link, .get_wol = i40e_get_wol, .set_wol = i40e_set_wol, + .set_eeprom = i40e_set_eeprom, .get_eeprom_len = i40e_get_eeprom_len, .get_eeprom = i40e_get_eeprom, .get_ringparam = i40e_get_ringparam, .set_ringparam = i40e_set_ringparam, .get_pauseparam = i40e_get_pauseparam, + .set_pauseparam = i40e_set_pauseparam, .get_msglevel = i40e_get_msglevel, .set_msglevel = i40e_set_msglevel, .get_rxnfc = i40e_get_rxnfc, diff --combined drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index 94a1c07,a6e5bcc..e4100b5 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@@ -1408,6 -1408,7 +1408,6 @@@ static int ixgbe_reg_test(struct ixgbe_ default: *data = 1; return 1; - break; }
/* @@@ -2517,7 -2518,7 +2517,7 @@@ static int ixgbe_update_ethtool_fdir_en
/* add filter to the list */ if (parent) - hlist_add_after(&parent->fdir_node, &input->fdir_node); + hlist_add_behind(&input->fdir_node, &parent->fdir_node); else hlist_add_head(&input->fdir_node, &adapter->fdir_filter_list); @@@ -2865,6 -2866,7 +2865,6 @@@ static int ixgbe_get_ts_info(struct net break; default: return ethtool_op_get_ts_info(dev, info); - break; } return 0; } diff --combined drivers/staging/android/binder.c index 02b0379,0ca9785..4f34dc0 --- a/drivers/staging/android/binder.c +++ b/drivers/staging/android/binder.c @@@ -454,8 -454,9 +454,8 @@@ static size_t binder_buffer_size(struc { if (list_is_last(&buffer->entry, &proc->buffers)) return proc->buffer + proc->buffer_size - (void *)buffer->data; - else - return (size_t)list_entry(buffer->entry.next, - struct binder_buffer, entry) - (size_t)buffer->data; + return (size_t)list_entry(buffer->entry.next, + struct binder_buffer, entry) - (size_t)buffer->data; }
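The hlist_add_after() -> hlist_add_behind() conversions scattered through this merge (the i40e and ixgbe fdir lists above, the libcfs hash further down) are mechanical, but the argument order flips: the entry being inserted now comes first. A minimal fragment, assuming <linux/list.h>:

        struct hlist_node *n, *prev;

        /* old API: existing node first, new entry second */
        hlist_add_after(prev, n);

        /* new API: new entry first, then the node it goes behind */
        hlist_add_behind(n, prev);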
static void binder_insert_free_buffer(struct binder_proc *proc, @@@ -585,7 -586,6 +585,6 @@@ static int binder_update_page_range(str
for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) { int ret; - struct page **page_array_ptr;
page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE];
@@@ -598,8 -598,7 +597,7 @@@ } tmp_area.addr = page_addr; tmp_area.size = PAGE_SIZE + PAGE_SIZE /* guard page? */; - page_array_ptr = page; - ret = map_vm_area(&tmp_area, PAGE_KERNEL, &page_array_ptr); + ret = map_vm_area(&tmp_area, PAGE_KERNEL, page); if (ret) { pr_err("%d: binder_alloc_buf failed to map page at %p in kernel\n", proc->pid, page_addr); @@@ -1185,7 -1184,6 +1183,7 @@@ static void binder_send_failed_reply(st uint32_t error_code) { struct binder_thread *target_thread; + struct binder_transaction *next;
BUG_ON(t->flags & TF_ONE_WAY); while (1) { @@@ -1213,23 -1211,24 +1211,23 @@@ target_thread->return_error); } return; - } else { - struct binder_transaction *next = t->from_parent; + } + next = t->from_parent;
- binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, - "send failed reply for transaction %d, target dead\n", - t->debug_id); + binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, + "send failed reply for transaction %d, target dead\n", + t->debug_id);
- binder_pop_transaction(target_thread, t); - if (next == NULL) { - binder_debug(BINDER_DEBUG_DEAD_BINDER, - "reply failed, no target thread at root\n"); - return; - } - t = next; + binder_pop_transaction(target_thread, t); + if (next == NULL) { binder_debug(BINDER_DEBUG_DEAD_BINDER, - "reply failed, no target thread -- retry %d\n", - t->debug_id); + "reply failed, no target thread at root\n"); + return; } + t = next; + binder_debug(BINDER_DEBUG_DEAD_BINDER, + "reply failed, no target thread -- retry %d\n", + t->debug_id); } }
@@@ -2593,106 -2592,6 +2591,106 @@@ static unsigned int binder_poll(struct return 0; }
+static int binder_ioctl_write_read(struct file *filp, + unsigned int cmd, unsigned long arg, + struct binder_thread *thread) +{ + int ret = 0; + struct binder_proc *proc = filp->private_data; + unsigned int size = _IOC_SIZE(cmd); + void __user *ubuf = (void __user *)arg; + struct binder_write_read bwr; + + if (size != sizeof(struct binder_write_read)) { + ret = -EINVAL; + goto out; + } + if (copy_from_user(&bwr, ubuf, sizeof(bwr))) { + ret = -EFAULT; + goto out; + } + binder_debug(BINDER_DEBUG_READ_WRITE, + "%d:%d write %lld at %016llx, read %lld at %016llx\n", + proc->pid, thread->pid, + (u64)bwr.write_size, (u64)bwr.write_buffer, + (u64)bwr.read_size, (u64)bwr.read_buffer); + + if (bwr.write_size > 0) { + ret = binder_thread_write(proc, thread, + bwr.write_buffer, + bwr.write_size, + &bwr.write_consumed); + trace_binder_write_done(ret); + if (ret < 0) { + bwr.read_consumed = 0; + if (copy_to_user(ubuf, &bwr, sizeof(bwr))) + ret = -EFAULT; + goto out; + } + } + if (bwr.read_size > 0) { + ret = binder_thread_read(proc, thread, bwr.read_buffer, + bwr.read_size, + &bwr.read_consumed, + filp->f_flags & O_NONBLOCK); + trace_binder_read_done(ret); + if (!list_empty(&proc->todo)) + wake_up_interruptible(&proc->wait); + if (ret < 0) { + if (copy_to_user(ubuf, &bwr, sizeof(bwr))) + ret = -EFAULT; + goto out; + } + } + binder_debug(BINDER_DEBUG_READ_WRITE, + "%d:%d wrote %lld of %lld, read return %lld of %lld\n", + proc->pid, thread->pid, + (u64)bwr.write_consumed, (u64)bwr.write_size, + (u64)bwr.read_consumed, (u64)bwr.read_size); + if (copy_to_user(ubuf, &bwr, sizeof(bwr))) { + ret = -EFAULT; + goto out; + } +out: + return ret; +} + +static int binder_ioctl_set_ctx_mgr(struct file *filp) +{ + int ret = 0; + struct binder_proc *proc = filp->private_data; + kuid_t curr_euid = current_euid(); + + if (binder_context_mgr_node != NULL) { + pr_err("BINDER_SET_CONTEXT_MGR already set\n"); + ret = -EBUSY; + goto out; + } + if (uid_valid(binder_context_mgr_uid)) { + if (!uid_eq(binder_context_mgr_uid, curr_euid)) { + pr_err("BINDER_SET_CONTEXT_MGR bad uid %d != %d\n", + from_kuid(&init_user_ns, curr_euid), + from_kuid(&init_user_ns, + binder_context_mgr_uid)); + ret = -EPERM; + goto out; + } + } else { + binder_context_mgr_uid = curr_euid; + } + binder_context_mgr_node = binder_new_node(proc, 0, 0); + if (binder_context_mgr_node == NULL) { + ret = -ENOMEM; + goto out; + } + binder_context_mgr_node->local_weak_refs++; + binder_context_mgr_node->local_strong_refs++; + binder_context_mgr_node->has_strong_ref = 1; + binder_context_mgr_node->has_weak_ref = 1; +out: + return ret; +} + static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int ret; @@@ -2700,9 -2599,9 +2698,9 @@@ struct binder_thread *thread; unsigned int size = _IOC_SIZE(cmd); void __user *ubuf = (void __user *)arg; - kuid_t curr_euid = current_euid();
- /*pr_info("binder_ioctl: %d:%d %x %lx\n", proc->pid, current->pid, cmd, arg);*/ + /*pr_info("binder_ioctl: %d:%d %x %lx\n", + proc->pid, current->pid, cmd, arg);*/
trace_binder_ioctl(cmd, arg);
@@@ -2718,11 -2617,61 +2716,11 @@@ }
switch (cmd) { - case BINDER_WRITE_READ: { - struct binder_write_read bwr; - - if (size != sizeof(struct binder_write_read)) { - ret = -EINVAL; + case BINDER_WRITE_READ: + ret = binder_ioctl_write_read(filp, cmd, arg, thread); + if (ret) goto err; - } - if (copy_from_user(&bwr, ubuf, sizeof(bwr))) { - ret = -EFAULT; - goto err; - } - binder_debug(BINDER_DEBUG_READ_WRITE, - "%d:%d write %lld at %016llx, read %lld at %016llx\n", - proc->pid, thread->pid, - (u64)bwr.write_size, (u64)bwr.write_buffer, - (u64)bwr.read_size, (u64)bwr.read_buffer); - - if (bwr.write_size > 0) { - ret = binder_thread_write(proc, thread, - bwr.write_buffer, - bwr.write_size, - &bwr.write_consumed); - trace_binder_write_done(ret); - if (ret < 0) { - bwr.read_consumed = 0; - if (copy_to_user(ubuf, &bwr, sizeof(bwr))) - ret = -EFAULT; - goto err; - } - } - if (bwr.read_size > 0) { - ret = binder_thread_read(proc, thread, bwr.read_buffer, - bwr.read_size, - &bwr.read_consumed, - filp->f_flags & O_NONBLOCK); - trace_binder_read_done(ret); - if (!list_empty(&proc->todo)) - wake_up_interruptible(&proc->wait); - if (ret < 0) { - if (copy_to_user(ubuf, &bwr, sizeof(bwr))) - ret = -EFAULT; - goto err; - } - } - binder_debug(BINDER_DEBUG_READ_WRITE, - "%d:%d wrote %lld of %lld, read return %lld of %lld\n", - proc->pid, thread->pid, - (u64)bwr.write_consumed, (u64)bwr.write_size, - (u64)bwr.read_consumed, (u64)bwr.read_size); - if (copy_to_user(ubuf, &bwr, sizeof(bwr))) { - ret = -EFAULT; - goto err; - } break; - } case BINDER_SET_MAX_THREADS: if (copy_from_user(&proc->max_threads, ubuf, sizeof(proc->max_threads))) { ret = -EINVAL; @@@ -2730,9 -2679,31 +2728,9 @@@ } break; case BINDER_SET_CONTEXT_MGR: - if (binder_context_mgr_node != NULL) { - pr_err("BINDER_SET_CONTEXT_MGR already set\n"); - ret = -EBUSY; - goto err; - } - if (uid_valid(binder_context_mgr_uid)) { - if (!uid_eq(binder_context_mgr_uid, curr_euid)) { - pr_err("BINDER_SET_CONTEXT_MGR bad uid %d != %d\n", - from_kuid(&init_user_ns, curr_euid), - from_kuid(&init_user_ns, binder_context_mgr_uid)); - ret = -EPERM; - goto err; - } - } else { - binder_context_mgr_uid = curr_euid; - } - binder_context_mgr_node = binder_new_node(proc, 0, 0); - if (binder_context_mgr_node == NULL) { - ret = -ENOMEM; + ret = binder_ioctl_set_ctx_mgr(filp); + if (ret) goto err; - } - binder_context_mgr_node->local_weak_refs++; - binder_context_mgr_node->local_strong_refs++; - binder_context_mgr_node->has_strong_ref = 1; - binder_context_mgr_node->has_weak_ref = 1; break; case BINDER_THREAD_EXIT: binder_debug(BINDER_DEBUG_THREADS, "%d:%d exit\n", @@@ -2796,15 -2767,9 +2794,15 @@@ static void binder_vma_close(struct vm_ binder_defer_work(proc, BINDER_DEFERRED_PUT_FILES); }
+static int binder_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + static struct vm_operations_struct binder_vm_ops = { .open = binder_vma_open, .close = binder_vma_close, + .fault = binder_vm_fault, };
static int binder_mmap(struct file *filp, struct vm_area_struct *vma) diff --combined drivers/staging/lustre/lustre/libcfs/hash.c index 5dde794,6db7391..8ef1deb --- a/drivers/staging/lustre/lustre/libcfs/hash.c +++ b/drivers/staging/lustre/lustre/libcfs/hash.c @@@ -107,7 -107,7 +107,7 @@@ * table. Also, user can break the iteration by return 1 in callback. */
-#include <linux/libcfs/libcfs.h> +#include "../../include/linux/libcfs/libcfs.h" #include <linux/seq_file.h>
#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 @@@ -351,7 -351,7 +351,7 @@@ cfs_hash_dh_hnode_add(struct cfs_hash * cfs_hash_dhead_t, dh_head);
if (dh->dh_tail != NULL) /* not empty */ - hlist_add_after(dh->dh_tail, hnode); + hlist_add_behind(hnode, dh->dh_tail); else /* empty list */ hlist_add_head(hnode, &dh->dh_head); dh->dh_tail = hnode; @@@ -406,7 -406,7 +406,7 @@@ cfs_hash_dd_hnode_add(struct cfs_hash * cfs_hash_dhead_dep_t, dd_head);
if (dh->dd_tail != NULL) /* not empty */ - hlist_add_after(dh->dd_tail, hnode); + hlist_add_behind(hnode, dh->dd_tail); else /* empty list */ hlist_add_head(hnode, &dh->dd_head); dh->dd_tail = hnode; diff --combined drivers/video/backlight/backlight.c index bddc8b1,19b170d..0ce8823 --- a/drivers/video/backlight/backlight.c +++ b/drivers/video/backlight/backlight.c @@@ -190,8 -190,6 +190,6 @@@ static ssize_t brightness_store(struct } mutex_unlock(&bd->ops_lock);
- backlight_generate_event(bd, BACKLIGHT_UPDATE_SYSFS); - return rc; } static DEVICE_ATTR_RW(brightness); @@@ -223,8 -221,6 +221,8 @@@ static ssize_t actual_brightness_show(s mutex_lock(&bd->ops_lock); if (bd->ops && bd->ops->get_brightness) rc = sprintf(buf, "%d\n", bd->ops->get_brightness(bd)); + else + rc = sprintf(buf, "%d\n", bd->props.brightness); mutex_unlock(&bd->ops_lock);
return rc; diff --combined fs/cifs/cifssmb.c index 66f6500,c3dc52e..86a2aa5 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@@ -196,6 -196,10 +196,6 @@@ cifs_reconnect_tcon(struct cifs_tcon *t if (rc) goto out;
- /* - * FIXME: check if wsize needs updated due to negotiated smb buffer - * size shrinking - */ atomic_inc(&tconInfoReconnectCount);
/* tell server Unix caps we support */ @@@ -1513,6 -1517,7 +1513,6 @@@ cifs_readv_receive(struct TCP_Server_In return length;
server->total_read += length; - rdata->bytes = length;
cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n", server->total_read, buflen, data_len); @@@ -1555,18 -1560,12 +1555,18 @@@ cifs_readv_callback(struct mid_q_entry rc); } /* FIXME: should this be counted toward the initiating task? */ - task_io_account_read(rdata->bytes); - cifs_stats_bytes_read(tcon, rdata->bytes); + task_io_account_read(rdata->got_bytes); + cifs_stats_bytes_read(tcon, rdata->got_bytes); break; case MID_REQUEST_SUBMITTED: case MID_RETRY_NEEDED: rdata->result = -EAGAIN; + if (server->sign && rdata->got_bytes) + /* reset bytes number since we can not check a sign */ + rdata->got_bytes = 0; + /* FIXME: should this be counted toward the initiating task? */ + task_io_account_read(rdata->got_bytes); + cifs_stats_bytes_read(tcon, rdata->got_bytes); break; default: rdata->result = -EIO; @@@ -1735,7 -1734,10 +1735,7 @@@ CIFSSMBRead(const unsigned int xid, str
/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ if (*buf) { - if (resp_buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(iov[0].iov_base); - else if (resp_buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base); + free_rsp_buf(resp_buf_type, iov[0].iov_base); } else if (resp_buf_type != CIFS_NO_BUFFER) { /* return buffer to caller to free */ *buf = iov[0].iov_base; @@@ -1897,80 -1899,28 +1897,80 @@@ cifs_writedata_release(struct kref *ref static void cifs_writev_requeue(struct cifs_writedata *wdata) { - int i, rc; + int i, rc = 0; struct inode *inode = wdata->cfile->dentry->d_inode; struct TCP_Server_Info *server; + unsigned int rest_len;
- for (i = 0; i < wdata->nr_pages; i++) { - lock_page(wdata->pages[i]); - clear_page_dirty_for_io(wdata->pages[i]); - } - + server = tlink_tcon(wdata->cfile->tlink)->ses->server; + i = 0; + rest_len = wdata->bytes; do { - server = tlink_tcon(wdata->cfile->tlink)->ses->server; - rc = server->ops->async_writev(wdata, cifs_writedata_release); - } while (rc == -EAGAIN); + struct cifs_writedata *wdata2; + unsigned int j, nr_pages, wsize, tailsz, cur_len; + + wsize = server->ops->wp_retry_size(inode); + if (wsize < rest_len) { + nr_pages = wsize / PAGE_CACHE_SIZE; + if (!nr_pages) { + rc = -ENOTSUPP; + break; + } + cur_len = nr_pages * PAGE_CACHE_SIZE; + tailsz = PAGE_CACHE_SIZE; + } else { + nr_pages = DIV_ROUND_UP(rest_len, PAGE_CACHE_SIZE); + cur_len = rest_len; + tailsz = rest_len - (nr_pages - 1) * PAGE_CACHE_SIZE; + }
- for (i = 0; i < wdata->nr_pages; i++) { - unlock_page(wdata->pages[i]); - if (rc != 0) { - SetPageError(wdata->pages[i]); - end_page_writeback(wdata->pages[i]); - page_cache_release(wdata->pages[i]); + wdata2 = cifs_writedata_alloc(nr_pages, cifs_writev_complete); + if (!wdata2) { + rc = -ENOMEM; + break; } - } + + for (j = 0; j < nr_pages; j++) { + wdata2->pages[j] = wdata->pages[i + j]; + lock_page(wdata2->pages[j]); + clear_page_dirty_for_io(wdata2->pages[j]); + } + + wdata2->sync_mode = wdata->sync_mode; + wdata2->nr_pages = nr_pages; + wdata2->offset = page_offset(wdata2->pages[0]); + wdata2->pagesz = PAGE_CACHE_SIZE; + wdata2->tailsz = tailsz; + wdata2->bytes = cur_len; + + wdata2->cfile = find_writable_file(CIFS_I(inode), false); + if (!wdata2->cfile) { + cifs_dbg(VFS, "No writable handles for inode\n"); + rc = -EBADF; + break; + } + wdata2->pid = wdata2->cfile->pid; + rc = server->ops->async_writev(wdata2, cifs_writedata_release); + + for (j = 0; j < nr_pages; j++) { + unlock_page(wdata2->pages[j]); + if (rc != 0 && rc != -EAGAIN) { + SetPageError(wdata2->pages[j]); + end_page_writeback(wdata2->pages[j]); + page_cache_release(wdata2->pages[j]); + } + } + + if (rc) { + kref_put(&wdata2->refcount, cifs_writedata_release); + if (rc == -EAGAIN) + continue; + break; + } + + rest_len -= cur_len; + i += nr_pages; + } while (i < wdata->nr_pages);
mapping_set_error(inode->i_mapping, rc); kref_put(&wdata->refcount, cifs_writedata_release); @@@ -2253,7 -2203,10 +2253,7 @@@ CIFSSMBWrite2(const unsigned int xid, s }
/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ - if (resp_buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(iov[0].iov_base); - else if (resp_buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base); + free_rsp_buf(resp_buf_type, iov[0].iov_base);
/* Note: On -EAGAIN error only caller can retry on handle based calls since file handle passed in no longer valid */ @@@ -2477,14 -2430,14 +2477,14 @@@ CIFSSMBPosixLock(const unsigned int xid } parm_data = (struct cifs_posix_lock *) ((char *)&pSMBr->hdr.Protocol + data_offset); - if (parm_data->lock_type == __constant_cpu_to_le16(CIFS_UNLCK)) + if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK)) pLockData->fl_type = F_UNLCK; else { if (parm_data->lock_type == - __constant_cpu_to_le16(CIFS_RDLCK)) + cpu_to_le16(CIFS_RDLCK)) pLockData->fl_type = F_RDLCK; else if (parm_data->lock_type == - __constant_cpu_to_le16(CIFS_WRLCK)) + cpu_to_le16(CIFS_WRLCK)) pLockData->fl_type = F_WRLCK;
pLockData->fl_start = le64_to_cpu(parm_data->start); @@@ -2498,7 -2451,10 +2498,7 @@@ plk_err_exit if (pSMB) cifs_small_buf_release(pSMB);
- if (resp_buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(iov[0].iov_base); - else if (resp_buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base); + free_rsp_buf(resp_buf_type, iov[0].iov_base);
/* Note: On -EAGAIN error only caller can retry on handle based calls since file handle passed in no longer valid */ @@@ -3276,25 -3232,25 +3276,25 @@@ CIFSSMB_set_compression(const unsigned pSMB->compression_state = cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
pSMB->TotalParameterCount = 0; - pSMB->TotalDataCount = __constant_cpu_to_le32(2); + pSMB->TotalDataCount = cpu_to_le32(2); pSMB->MaxParameterCount = 0; pSMB->MaxDataCount = 0; pSMB->MaxSetupCount = 4; pSMB->Reserved = 0; pSMB->ParameterOffset = 0; - pSMB->DataCount = __constant_cpu_to_le32(2); + pSMB->DataCount = cpu_to_le32(2); pSMB->DataOffset = cpu_to_le32(offsetof(struct smb_com_transaction_compr_ioctl_req, compression_state) - 4); /* 84 */ pSMB->SetupCount = 4; - pSMB->SubCommand = __constant_cpu_to_le16(NT_TRANSACT_IOCTL); + pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL); pSMB->ParameterCount = 0; - pSMB->FunctionCode = __constant_cpu_to_le32(FSCTL_SET_COMPRESSION); + pSMB->FunctionCode = cpu_to_le32(FSCTL_SET_COMPRESSION); pSMB->IsFsctl = 1; /* FSCTL */ pSMB->IsRootFlag = 0; pSMB->Fid = fid; /* file handle always le */ /* 3 byte pad, followed by 2 byte compress state */ - pSMB->ByteCount = __constant_cpu_to_le16(5); + pSMB->ByteCount = cpu_to_le16(5); inc_rfc1001_len(pSMB, 5);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@@ -3430,10 -3386,10 +3430,10 @@@ static __u16 ACL_to_cifs_posix(char *pa cifs_acl->version = cpu_to_le16(1); if (acl_type == ACL_TYPE_ACCESS) { cifs_acl->access_entry_count = cpu_to_le16(count); - cifs_acl->default_entry_count = __constant_cpu_to_le16(0xFFFF); + cifs_acl->default_entry_count = cpu_to_le16(0xFFFF); } else if (acl_type == ACL_TYPE_DEFAULT) { cifs_acl->default_entry_count = cpu_to_le16(count); - cifs_acl->access_entry_count = __constant_cpu_to_le16(0xFFFF); + cifs_acl->access_entry_count = cpu_to_le16(0xFFFF); } else { cifs_dbg(FYI, "unknown ACL type %d\n", acl_type); return 0; @@@ -3882,7 -3838,10 +3882,7 @@@ CIFSSMBGetCIFSACL(const unsigned int xi } } qsec_out: - if (buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(iov[0].iov_base); - else if (buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base); + free_rsp_buf(buf_type, iov[0].iov_base); /* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ return rc; } diff --combined fs/cifs/file.c index 4ab2f79,3c1967c..553747f --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@@ -1058,7 -1058,7 +1058,7 @@@ cifs_push_mandatory_locks(struct cifsFi
max_num = (max_buf - sizeof(struct smb_hdr)) / sizeof(LOCKING_ANDX_RANGE); - buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); + buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); if (!buf) { free_xid(xid); return -ENOMEM; @@@ -1393,7 -1393,7 +1393,7 @@@ cifs_unlock_range(struct cifsFileInfo *
max_num = (max_buf - sizeof(struct smb_hdr)) / sizeof(LOCKING_ANDX_RANGE); - buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); + buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); if (!buf) return -ENOMEM;
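The kzalloc(n * size) -> kcalloc(n, size) conversions in the two locking paths above are equivalent here, with the bonus that kcalloc checks the n * size multiplication for overflow before allocating:

        /* zeroed array allocation with overflow checking on the multiply */
        buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
        if (!buf)
                return -ENOMEM;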
@@@ -1670,8 -1670,8 +1670,8 @@@ cifs_write(struct cifsFileInfo *open_fi break; }
- len = min((size_t)cifs_sb->wsize, - write_size - total_written); + len = min(server->ops->wp_retry_size(dentry->d_inode), + (unsigned int)write_size - total_written); /* iov[0] is reserved for smb header */ iov[1].iov_base = (char *)write_data + total_written; iov[1].iov_len = len; @@@ -1878,163 -1878,15 +1878,163 @@@ static int cifs_partialpagewrite(struc return rc; }
+static struct cifs_writedata * +wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping, + pgoff_t end, pgoff_t *index, + unsigned int *found_pages) +{ + unsigned int nr_pages; + struct page **pages; + struct cifs_writedata *wdata; + + wdata = cifs_writedata_alloc((unsigned int)tofind, + cifs_writev_complete); + if (!wdata) + return NULL; + + /* + * find_get_pages_tag seems to return a max of 256 on each + * iteration, so we must call it several times in order to + * fill the array or the wsize is effectively limited to + * 256 * PAGE_CACHE_SIZE. + */ + *found_pages = 0; + pages = wdata->pages; + do { + nr_pages = find_get_pages_tag(mapping, index, + PAGECACHE_TAG_DIRTY, tofind, + pages); + *found_pages += nr_pages; + tofind -= nr_pages; + pages += nr_pages; + } while (nr_pages && tofind && *index <= end); + + return wdata; +} + +static unsigned int +wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages, + struct address_space *mapping, + struct writeback_control *wbc, + pgoff_t end, pgoff_t *index, pgoff_t *next, bool *done) +{ + unsigned int nr_pages = 0, i; + struct page *page; + + for (i = 0; i < found_pages; i++) { + page = wdata->pages[i]; + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or + * invalidated (changing page->mapping to NULL), or even + * swizzled back from swapper_space to tmpfs file + * mapping + */ + + if (nr_pages == 0) + lock_page(page); + else if (!trylock_page(page)) + break; + + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + break; + } + + if (!wbc->range_cyclic && page->index > end) { + *done = true; + unlock_page(page); + break; + } + + if (*next && (page->index != *next)) { + /* Not next consecutive page */ + unlock_page(page); + break; + } + + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + + if (PageWriteback(page) || + !clear_page_dirty_for_io(page)) { + unlock_page(page); + break; + } + + /* + * This actually clears the dirty bit in the radix tree. + * See cifs_writepage() for more commentary. 
+ */ + set_page_writeback(page); + if (page_offset(page) >= i_size_read(mapping->host)) { + *done = true; + unlock_page(page); + end_page_writeback(page); + break; + } + + wdata->pages[i] = page; + *next = page->index + 1; + ++nr_pages; + } + + /* reset index to refind any pages skipped */ + if (nr_pages == 0) + *index = wdata->pages[0]->index + 1; + + /* put any pages we aren't going to use */ + for (i = nr_pages; i < found_pages; i++) { + page_cache_release(wdata->pages[i]); + wdata->pages[i] = NULL; + } + + return nr_pages; +} + +static int +wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages, + struct address_space *mapping, struct writeback_control *wbc) +{ + int rc = 0; + struct TCP_Server_Info *server; + unsigned int i; + + wdata->sync_mode = wbc->sync_mode; + wdata->nr_pages = nr_pages; + wdata->offset = page_offset(wdata->pages[0]); + wdata->pagesz = PAGE_CACHE_SIZE; + wdata->tailsz = min(i_size_read(mapping->host) - + page_offset(wdata->pages[nr_pages - 1]), + (loff_t)PAGE_CACHE_SIZE); + wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + wdata->tailsz; + + if (wdata->cfile != NULL) + cifsFileInfo_put(wdata->cfile); + wdata->cfile = find_writable_file(CIFS_I(mapping->host), false); + if (!wdata->cfile) { + cifs_dbg(VFS, "No writable handles for inode\n"); + rc = -EBADF; + } else { + wdata->pid = wdata->cfile->pid; + server = tlink_tcon(wdata->cfile->tlink)->ses->server; + rc = server->ops->async_writev(wdata, cifs_writedata_release); + } + + for (i = 0; i < nr_pages; ++i) + unlock_page(wdata->pages[i]); + + return rc; +} + static int cifs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct cifs_sb_info *cifs_sb = CIFS_SB(mapping->host->i_sb); + struct TCP_Server_Info *server; bool done = false, scanned = false, range_whole = false; pgoff_t end, index; struct cifs_writedata *wdata; - struct TCP_Server_Info *server; - struct page *page; int rc = 0;
/* @@@ -2054,50 -1906,152 +2054,50 @@@ range_whole = true; scanned = true; } + server = cifs_sb_master_tcon(cifs_sb)->ses->server; retry: while (!done && index <= end) { - unsigned int i, nr_pages, found_pages; - pgoff_t next = 0, tofind; - struct page **pages; + unsigned int i, nr_pages, found_pages, wsize, credits; + pgoff_t next = 0, tofind, saved_index = index; + + rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize, + &wsize, &credits); + if (rc) + break;
- tofind = min((cifs_sb->wsize / PAGE_CACHE_SIZE) - 1, - end - index) + 1; + tofind = min((wsize / PAGE_CACHE_SIZE) - 1, end - index) + 1;
- wdata = cifs_writedata_alloc((unsigned int)tofind, - cifs_writev_complete); + wdata = wdata_alloc_and_fillpages(tofind, mapping, end, &index, + &found_pages); if (!wdata) { rc = -ENOMEM; + add_credits_and_wake_if(server, credits, 0); break; }
- /* - * find_get_pages_tag seems to return a max of 256 on each - * iteration, so we must call it several times in order to - * fill the array or the wsize is effectively limited to - * 256 * PAGE_CACHE_SIZE. - */ - found_pages = 0; - pages = wdata->pages; - do { - nr_pages = find_get_pages_tag(mapping, &index, - PAGECACHE_TAG_DIRTY, - tofind, pages); - found_pages += nr_pages; - tofind -= nr_pages; - pages += nr_pages; - } while (nr_pages && tofind && index <= end); - if (found_pages == 0) { kref_put(&wdata->refcount, cifs_writedata_release); + add_credits_and_wake_if(server, credits, 0); break; }
- nr_pages = 0; - for (i = 0; i < found_pages; i++) { - page = wdata->pages[i]; - /* - * At this point we hold neither mapping->tree_lock nor - * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even - * swizzled back from swapper_space to tmpfs file - * mapping - */ - - if (nr_pages == 0) - lock_page(page); - else if (!trylock_page(page)) - break; - - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - break; - } - - if (!wbc->range_cyclic && page->index > end) { - done = true; - unlock_page(page); - break; - } - - if (next && (page->index != next)) { - /* Not next consecutive page */ - unlock_page(page); - break; - } - - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - - if (PageWriteback(page) || - !clear_page_dirty_for_io(page)) { - unlock_page(page); - break; - } - - /* - * This actually clears the dirty bit in the radix tree. - * See cifs_writepage() for more commentary. - */ - set_page_writeback(page); - - if (page_offset(page) >= i_size_read(mapping->host)) { - done = true; - unlock_page(page); - end_page_writeback(page); - break; - } - - wdata->pages[i] = page; - next = page->index + 1; - ++nr_pages; - } - - /* reset index to refind any pages skipped */ - if (nr_pages == 0) - index = wdata->pages[0]->index + 1; - - /* put any pages we aren't going to use */ - for (i = nr_pages; i < found_pages; i++) { - page_cache_release(wdata->pages[i]); - wdata->pages[i] = NULL; - } + nr_pages = wdata_prepare_pages(wdata, found_pages, mapping, wbc, + end, &index, &next, &done);
/* nothing to write? */ if (nr_pages == 0) { kref_put(&wdata->refcount, cifs_writedata_release); + add_credits_and_wake_if(server, credits, 0); continue; }
- wdata->sync_mode = wbc->sync_mode; - wdata->nr_pages = nr_pages; - wdata->offset = page_offset(wdata->pages[0]); - wdata->pagesz = PAGE_CACHE_SIZE; - wdata->tailsz = - min(i_size_read(mapping->host) - - page_offset(wdata->pages[nr_pages - 1]), - (loff_t)PAGE_CACHE_SIZE); - wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + - wdata->tailsz; - - do { - if (wdata->cfile != NULL) - cifsFileInfo_put(wdata->cfile); - wdata->cfile = find_writable_file(CIFS_I(mapping->host), - false); - if (!wdata->cfile) { - cifs_dbg(VFS, "No writable handles for inode\n"); - rc = -EBADF; - break; - } - wdata->pid = wdata->cfile->pid; - server = tlink_tcon(wdata->cfile->tlink)->ses->server; - rc = server->ops->async_writev(wdata, - cifs_writedata_release); - } while (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN); + wdata->credits = credits;
- for (i = 0; i < nr_pages; ++i) - unlock_page(wdata->pages[i]); + rc = wdata_send_pages(wdata, nr_pages, mapping, wbc);
/* send failure -- clean up the mess */ if (rc != 0) { + add_credits_and_wake_if(server, wdata->credits, 0); for (i = 0; i < nr_pages; ++i) { if (rc == -EAGAIN) redirty_page_for_writepage(wbc, @@@ -2112,11 -2066,6 +2112,11 @@@ } kref_put(&wdata->refcount, cifs_writedata_release);
+ if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN) { + index = saved_index; + continue; + } + wbc->nr_to_write -= nr_pages; if (wbc->nr_to_write <= 0) done = true; @@@ -2413,109 -2362,123 +2413,109 @@@ cifs_uncached_writev_complete(struct wo kref_put(&wdata->refcount, cifs_uncached_writedata_release); }
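The cifs_writepages() rework above introduces a per-request credit flow: credits are reserved before the page array is built, attached to the wdata for the async send, and handed back with add_credits_and_wake_if() on every early-exit or failure path. Reduced to the bare pattern (taken from the hunks above, with the intermediate page handling omitted):

        rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize,
                                           &wsize, &credits);
        if (rc)
                break;

        wdata = wdata_alloc_and_fillpages(tofind, mapping, end, &index,
                                          &found_pages);
        if (!wdata) {
                add_credits_and_wake_if(server, credits, 0);
                break;
        }

        wdata->credits = credits;
        rc = wdata_send_pages(wdata, nr_pages, mapping, wbc);
        if (rc)
                add_credits_and_wake_if(server, wdata->credits, 0);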
-/* attempt to send write to server, retry on any -EAGAIN errors */ static int -cifs_uncached_retry_writev(struct cifs_writedata *wdata) +wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from, + size_t *len, unsigned long *num_pages) { - int rc; - struct TCP_Server_Info *server; + size_t save_len, copied, bytes, cur_len = *len; + unsigned long i, nr_pages = *num_pages;
- server = tlink_tcon(wdata->cfile->tlink)->ses->server; + save_len = cur_len; + for (i = 0; i < nr_pages; i++) { + bytes = min_t(const size_t, cur_len, PAGE_SIZE); + copied = copy_page_from_iter(wdata->pages[i], 0, bytes, from); + cur_len -= copied; + /* + * If we didn't copy as much as we expected, then that + * may mean we trod into an unmapped area. Stop copying + * at that point. On the next pass through the big + * loop, we'll likely end up getting a zero-length + * write and bailing out of it. + */ + if (copied < bytes) + break; + } + cur_len = save_len - cur_len; + *len = cur_len;
- do { - if (wdata->cfile->invalidHandle) { - rc = cifs_reopen_file(wdata->cfile, false); - if (rc != 0) - continue; - } - rc = server->ops->async_writev(wdata, - cifs_uncached_writedata_release); - } while (rc == -EAGAIN); + /* + * If we have no data to send, then that probably means that + * the copy above failed altogether. That's most likely because + * the address in the iovec was bogus. Return -EFAULT and let + * the caller free anything we allocated and bail out. + */ + if (!cur_len) + return -EFAULT;
- return rc; + /* + * i + 1 now represents the number of pages we actually used in + * the copy phase above. + */ + *num_pages = i + 1; + return 0; }
-static ssize_t -cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset) +static int +cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, + struct cifsFileInfo *open_file, + struct cifs_sb_info *cifs_sb, struct list_head *wdata_list) { - unsigned long nr_pages, i; - size_t bytes, copied, len, cur_len; - ssize_t total_written = 0; - loff_t offset; - struct cifsFileInfo *open_file; - struct cifs_tcon *tcon; - struct cifs_sb_info *cifs_sb; - struct cifs_writedata *wdata, *tmp; - struct list_head wdata_list; - int rc; + int rc = 0; + size_t cur_len; + unsigned long nr_pages, num_pages, i; + struct cifs_writedata *wdata; + struct iov_iter saved_from; + loff_t saved_offset = offset; pid_t pid; - - len = iov_iter_count(from); - rc = generic_write_checks(file, poffset, &len, 0); - if (rc) - return rc; - - if (!len) - return 0; - - iov_iter_truncate(from, len); - - INIT_LIST_HEAD(&wdata_list); - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - open_file = file->private_data; - tcon = tlink_tcon(open_file->tlink); - - if (!tcon->ses->server->ops->async_writev) - return -ENOSYS; - - offset = *poffset; + struct TCP_Server_Info *server;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; else pid = current->tgid;
+ server = tlink_tcon(open_file->tlink)->ses->server; + memcpy(&saved_from, from, sizeof(struct iov_iter)); + do { - size_t save_len; + unsigned int wsize, credits;
- nr_pages = get_numpages(cifs_sb->wsize, len, &cur_len); + rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize, + &wsize, &credits); + if (rc) + break; + + nr_pages = get_numpages(wsize, len, &cur_len); wdata = cifs_writedata_alloc(nr_pages, cifs_uncached_writev_complete); if (!wdata) { rc = -ENOMEM; + add_credits_and_wake_if(server, credits, 0); break; }
rc = cifs_write_allocate_pages(wdata->pages, nr_pages); if (rc) { kfree(wdata); + add_credits_and_wake_if(server, credits, 0); break; }
- save_len = cur_len; - for (i = 0; i < nr_pages; i++) { - bytes = min_t(size_t, cur_len, PAGE_SIZE); - copied = copy_page_from_iter(wdata->pages[i], 0, bytes, - from); - cur_len -= copied; - /* - * If we didn't copy as much as we expected, then that - * may mean we trod into an unmapped area. Stop copying - * at that point. On the next pass through the big - * loop, we'll likely end up getting a zero-length - * write and bailing out of it. - */ - if (copied < bytes) - break; - } - cur_len = save_len - cur_len; - - /* - * If we have no data to send, then that probably means that - * the copy above failed altogether. That's most likely because - * the address in the iovec was bogus. Set the rc to -EFAULT, - * free anything we allocated and bail out. - */ - if (!cur_len) { + num_pages = nr_pages; + rc = wdata_fill_from_iovec(wdata, from, &cur_len, &num_pages); + if (rc) { for (i = 0; i < nr_pages; i++) put_page(wdata->pages[i]); kfree(wdata); - rc = -EFAULT; + add_credits_and_wake_if(server, credits, 0); break; }
/* - * i + 1 now represents the number of pages we actually used in - * the copy phase above. Bring nr_pages down to that, and free - * any pages that we didn't use. + * Bring nr_pages down to the number of pages we actually used, + * and free any pages that we didn't use. */ - for ( ; nr_pages > i + 1; nr_pages--) + for ( ; nr_pages > num_pages; nr_pages--) put_page(wdata->pages[nr_pages - 1]);
wdata->sync_mode = WB_SYNC_ALL; @@@ -2526,69 -2489,18 +2526,69 @@@ wdata->bytes = cur_len; wdata->pagesz = PAGE_SIZE; wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE); - rc = cifs_uncached_retry_writev(wdata); + wdata->credits = credits; + + if (!wdata->cfile->invalidHandle || + !cifs_reopen_file(wdata->cfile, false)) + rc = server->ops->async_writev(wdata, + cifs_uncached_writedata_release); if (rc) { + add_credits_and_wake_if(server, wdata->credits, 0); kref_put(&wdata->refcount, cifs_uncached_writedata_release); + if (rc == -EAGAIN) { + memcpy(from, &saved_from, + sizeof(struct iov_iter)); + iov_iter_advance(from, offset - saved_offset); + continue; + } break; }
- list_add_tail(&wdata->list, &wdata_list); + list_add_tail(&wdata->list, wdata_list); offset += cur_len; len -= cur_len; } while (len > 0);
+ return rc; +} + +static ssize_t +cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset) +{ + size_t len; + ssize_t total_written = 0; + struct cifsFileInfo *open_file; + struct cifs_tcon *tcon; + struct cifs_sb_info *cifs_sb; + struct cifs_writedata *wdata, *tmp; + struct list_head wdata_list; + struct iov_iter saved_from; + int rc; + + len = iov_iter_count(from); + rc = generic_write_checks(file, poffset, &len, 0); + if (rc) + return rc; + + if (!len) + return 0; + + iov_iter_truncate(from, len); + + INIT_LIST_HEAD(&wdata_list); + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + open_file = file->private_data; + tcon = tlink_tcon(open_file->tlink); + + if (!tcon->ses->server->ops->async_writev) + return -ENOSYS; + + memcpy(&saved_from, from, sizeof(struct iov_iter)); + + rc = cifs_write_from_iter(*poffset, len, from, open_file, cifs_sb, + &wdata_list); + /* * If at least one write was successfully sent, then discard any rc * value from the later writes. If the other write succeeds, then @@@ -2617,25 -2529,7 +2617,25 @@@ restart_loop
/* resend call if it's a retryable error */ if (rc == -EAGAIN) { - rc = cifs_uncached_retry_writev(wdata); + struct list_head tmp_list; + struct iov_iter tmp_from; + + INIT_LIST_HEAD(&tmp_list); + list_del_init(&wdata->list); + + memcpy(&tmp_from, &saved_from, + sizeof(struct iov_iter)); + iov_iter_advance(&tmp_from, + wdata->offset - *poffset); + + rc = cifs_write_from_iter(wdata->offset, + wdata->bytes, &tmp_from, + open_file, cifs_sb, &tmp_list); + + list_splice(&tmp_list, &wdata_list); + + kref_put(&wdata->refcount, + cifs_uncached_writedata_release); goto restart_loop; } } @@@ -2828,6 -2722,26 +2828,6 @@@ cifs_uncached_readdata_release(struct k cifs_readdata_release(refcount); }
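The uncached write retry above hinges on keeping an untouched copy of the iov_iter: cifs_iovec_write() saves it before any data is consumed, and a wdata that fails with -EAGAIN is resent by copying the saved iterator and advancing it to the failed offset. In outline (lifted from the hunks above):

        struct iov_iter saved_from, tmp_from;

        memcpy(&saved_from, from, sizeof(struct iov_iter));
        rc = cifs_write_from_iter(*poffset, len, from, open_file, cifs_sb,
                                  &wdata_list);

        /* ... later, while walking wdata_list ... */
        if (rc == -EAGAIN) {
                memcpy(&tmp_from, &saved_from, sizeof(struct iov_iter));
                iov_iter_advance(&tmp_from, wdata->offset - *poffset);
                rc = cifs_write_from_iter(wdata->offset, wdata->bytes,
                                          &tmp_from, open_file, cifs_sb,
                                          &tmp_list);
        }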
-static int -cifs_retry_async_readv(struct cifs_readdata *rdata) -{ - int rc; - struct TCP_Server_Info *server; - - server = tlink_tcon(rdata->cfile->tlink)->ses->server; - - do { - if (rdata->cfile->invalidHandle) { - rc = cifs_reopen_file(rdata->cfile, true); - if (rc != 0) - continue; - } - rc = server->ops->async_readv(rdata); - } while (rc == -EAGAIN); - - return rc; -} - /** * cifs_readdata_to_iov - copy data from pages in response to an iovec * @rdata: the readdata response with list of pages holding data @@@ -2840,7 -2754,7 +2840,7 @@@ static int cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) { - size_t remaining = rdata->bytes; + size_t remaining = rdata->got_bytes; unsigned int i;
for (i = 0; i < rdata->nr_pages; i++) { @@@ -2868,12 -2782,11 +2868,12 @@@ static in cifs_uncached_read_into_pages(struct TCP_Server_Info *server, struct cifs_readdata *rdata, unsigned int len) { - int total_read = 0, result = 0; + int result = 0; unsigned int i; unsigned int nr_pages = rdata->nr_pages; struct kvec iov;
+ rdata->got_bytes = 0; rdata->tailsz = PAGE_SIZE; for (i = 0; i < nr_pages; i++) { struct page *page = rdata->pages[i]; @@@ -2907,45 -2820,55 +2907,45 @@@ if (result < 0) break;
- total_read += result; + rdata->got_bytes += result; }
- return total_read > 0 ? total_read : result; + return rdata->got_bytes > 0 && result != -ECONNABORTED ? + rdata->got_bytes : result; }
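Note: the read-into-pages helper now records how much data actually arrived in rdata->got_bytes and reports that partial count unless the connection aborted. The uncached read path later in this hunk uses the count to hand over what was received and reissue only the missing tail; a rough sketch of that consumer, assuming the tmp_list/to locals of that loop:

	if (rdata->result == -EAGAIN && rdata->got_bytes &&
	    rdata->got_bytes < rdata->bytes) {
		/* copy the bytes we already have into the user's iterator */
		rc = cifs_readdata_to_iov(rdata, to);
		if (!rc)
			/* then re-request only the part that is still missing */
			rc = cifs_send_async_read(rdata->offset + rdata->got_bytes,
						  rdata->bytes - rdata->got_bytes,
						  rdata->cfile, cifs_sb,
						  &tmp_list);
	}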
-ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) +static int +cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, + struct cifs_sb_info *cifs_sb, struct list_head *rdata_list) { - struct file *file = iocb->ki_filp; - ssize_t rc; - size_t len, cur_len; - ssize_t total_read = 0; - loff_t offset = iocb->ki_pos; - unsigned int npages; - struct cifs_sb_info *cifs_sb; - struct cifs_tcon *tcon; - struct cifsFileInfo *open_file; - struct cifs_readdata *rdata, *tmp; - struct list_head rdata_list; + struct cifs_readdata *rdata; + unsigned int npages, rsize, credits; + size_t cur_len; + int rc; pid_t pid; + struct TCP_Server_Info *server;
- len = iov_iter_count(to); - if (!len) - return 0; - - INIT_LIST_HEAD(&rdata_list); - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - open_file = file->private_data; - tcon = tlink_tcon(open_file->tlink); - - if (!tcon->ses->server->ops->async_readv) - return -ENOSYS; + server = tlink_tcon(open_file->tlink)->ses->server;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; else pid = current->tgid;
- if ((file->f_flags & O_ACCMODE) == O_WRONLY) - cifs_dbg(FYI, "attempting read on write only file instance\n"); - do { - cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize); + rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize, + &rsize, &credits); + if (rc) + break; + + cur_len = min_t(const size_t, len, rsize); npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
/* allocate a readdata struct */ rdata = cifs_readdata_alloc(npages, cifs_uncached_readv_complete); if (!rdata) { + add_credits_and_wake_if(server, credits, 0); rc = -ENOMEM; break; } @@@ -2961,113 -2884,44 +2961,113 @@@ rdata->pid = pid; rdata->pagesz = PAGE_SIZE; rdata->read_into_pages = cifs_uncached_read_into_pages; + rdata->credits = credits;
- rc = cifs_retry_async_readv(rdata); + if (!rdata->cfile->invalidHandle || + !cifs_reopen_file(rdata->cfile, true)) + rc = server->ops->async_readv(rdata); error: if (rc) { + add_credits_and_wake_if(server, rdata->credits, 0); kref_put(&rdata->refcount, cifs_uncached_readdata_release); + if (rc == -EAGAIN) + continue; break; }
- list_add_tail(&rdata->list, &rdata_list); + list_add_tail(&rdata->list, rdata_list); offset += cur_len; len -= cur_len; } while (len > 0);
+ return rc; +} + +ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + ssize_t rc; + size_t len; + ssize_t total_read = 0; + loff_t offset = iocb->ki_pos; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + struct cifsFileInfo *open_file; + struct cifs_readdata *rdata, *tmp; + struct list_head rdata_list; + + len = iov_iter_count(to); + if (!len) + return 0; + + INIT_LIST_HEAD(&rdata_list); + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + open_file = file->private_data; + tcon = tlink_tcon(open_file->tlink); + + if (!tcon->ses->server->ops->async_readv) + return -ENOSYS; + + if ((file->f_flags & O_ACCMODE) == O_WRONLY) + cifs_dbg(FYI, "attempting read on write only file instance\n"); + + rc = cifs_send_async_read(offset, len, open_file, cifs_sb, &rdata_list); + /* if at least one read request send succeeded, then reset rc */ if (!list_empty(&rdata_list)) rc = 0;
len = iov_iter_count(to); /* the loop below should proceed in the order of increasing offsets */ +again: list_for_each_entry_safe(rdata, tmp, &rdata_list, list) { - again: if (!rc) { /* FIXME: freezable sleep too? */ rc = wait_for_completion_killable(&rdata->done); if (rc) rc = -EINTR; - else if (rdata->result) { - rc = rdata->result; + else if (rdata->result == -EAGAIN) { /* resend call if it's a retryable error */ - if (rc == -EAGAIN) { - rc = cifs_retry_async_readv(rdata); - goto again; + struct list_head tmp_list; + unsigned int got_bytes = rdata->got_bytes; + + list_del_init(&rdata->list); + INIT_LIST_HEAD(&tmp_list); + + /* + * Got a part of data and then reconnect has + * happened -- fill the buffer and continue + * reading. + */ + if (got_bytes && got_bytes < rdata->bytes) { + rc = cifs_readdata_to_iov(rdata, to); + if (rc) { + kref_put(&rdata->refcount, + cifs_uncached_readdata_release); + continue; + } } - } else { + + rc = cifs_send_async_read( + rdata->offset + got_bytes, + rdata->bytes - got_bytes, + rdata->cfile, cifs_sb, + &tmp_list); + + list_splice(&tmp_list, &rdata_list); + + kref_put(&rdata->refcount, + cifs_uncached_readdata_release); + goto again; + } else if (rdata->result) + rc = rdata->result; + else rc = cifs_readdata_to_iov(rdata, to); - }
+ /* if there was a short read -- discard anything left */ + if (rdata->got_bytes && rdata->got_bytes < rdata->bytes) + rc = -ENODATA; } list_del_init(&rdata->list); kref_put(&rdata->refcount, cifs_uncached_readdata_release); @@@ -3176,19 -3030,18 +3176,19 @@@ cifs_read(struct file *file, char *read
for (total_read = 0, cur_offset = read_data; read_size > total_read; total_read += bytes_read, cur_offset += bytes_read) { - current_read_size = min_t(uint, read_size - total_read, rsize); - /* - * For windows me and 9x we do not want to request more than it - * negotiated since it will refuse the read then. - */ - if ((tcon->ses) && !(tcon->ses->capabilities & + do { + current_read_size = min_t(uint, read_size - total_read, + rsize); + /* + * For windows me and 9x we do not want to request more + * than it negotiated since it will refuse the read + * then. + */ + if ((tcon->ses) && !(tcon->ses->capabilities & tcon->ses->server->vals->cap_large_files)) { - current_read_size = min_t(uint, current_read_size, - CIFSMaxBufSize); - } - rc = -EAGAIN; - while (rc == -EAGAIN) { + current_read_size = min_t(uint, + current_read_size, CIFSMaxBufSize); + } if (open_file->invalidHandle) { rc = cifs_reopen_file(open_file, true); if (rc != 0) @@@ -3201,8 -3054,7 +3201,8 @@@ rc = server->ops->sync_read(xid, open_file, &io_parms, &bytes_read, &cur_offset, &buf_type); - } + } while (rc == -EAGAIN); + if (rc || (bytes_read == 0)) { if (total_read) { break; @@@ -3281,30 -3133,25 +3281,30 @@@ int cifs_file_mmap(struct file *file, s static void cifs_readv_complete(struct work_struct *work) { - unsigned int i; + unsigned int i, got_bytes; struct cifs_readdata *rdata = container_of(work, struct cifs_readdata, work);
+ got_bytes = rdata->got_bytes; for (i = 0; i < rdata->nr_pages; i++) { struct page *page = rdata->pages[i];
lru_cache_add_file(page);
- if (rdata->result == 0) { + if (rdata->result == 0 || + (rdata->result == -EAGAIN && got_bytes)) { flush_dcache_page(page); SetPageUptodate(page); }
unlock_page(page);
- if (rdata->result == 0) + if (rdata->result == 0 || + (rdata->result == -EAGAIN && got_bytes)) cifs_readpage_to_fscache(rdata->mapping->host, page);
+ got_bytes -= min_t(unsigned int, PAGE_CACHE_SIZE, got_bytes); + page_cache_release(page); rdata->pages[i] = NULL; } @@@ -3315,7 -3162,7 +3315,7 @@@ static in cifs_readpages_read_into_pages(struct TCP_Server_Info *server, struct cifs_readdata *rdata, unsigned int len) { - int total_read = 0, result = 0; + int result = 0; unsigned int i; u64 eof; pgoff_t eof_index; @@@ -3327,7 -3174,6 +3327,7 @@@ eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0; cifs_dbg(FYI, "eof=%llu eof_index=%lu\n", eof, eof_index);
+ rdata->got_bytes = 0; rdata->tailsz = PAGE_CACHE_SIZE; for (i = 0; i < nr_pages; i++) { struct page *page = rdata->pages[i]; @@@ -3382,70 -3228,10 +3382,70 @@@ if (result < 0) break;
- total_read += result; + rdata->got_bytes += result; }
- return total_read > 0 ? total_read : result; + return rdata->got_bytes > 0 && result != -ECONNABORTED ? + rdata->got_bytes : result; +} + +static int +readpages_get_pages(struct address_space *mapping, struct list_head *page_list, + unsigned int rsize, struct list_head *tmplist, + unsigned int *nr_pages, loff_t *offset, unsigned int *bytes) +{ + struct page *page, *tpage; + unsigned int expected_index; + int rc; + + INIT_LIST_HEAD(tmplist); + + page = list_entry(page_list->prev, struct page, lru); + + /* + * Lock the page and put it in the cache. Since no one else + * should have access to this page, we're safe to simply set + * PG_locked without checking it first. + */ + __set_page_locked(page); + rc = add_to_page_cache_locked(page, mapping, + page->index, GFP_KERNEL); + + /* give up if we can't stick it in the cache */ + if (rc) { + __clear_page_locked(page); + return rc; + } + + /* move first page to the tmplist */ + *offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + *bytes = PAGE_CACHE_SIZE; + *nr_pages = 1; + list_move_tail(&page->lru, tmplist); + + /* now try and add more pages onto the request */ + expected_index = page->index + 1; + list_for_each_entry_safe_reverse(page, tpage, page_list, lru) { + /* discontinuity ? */ + if (page->index != expected_index) + break; + + /* would this page push the read over the rsize? */ + if (*bytes + PAGE_CACHE_SIZE > rsize) + break; + + __set_page_locked(page); + if (add_to_page_cache_locked(page, mapping, page->index, + GFP_KERNEL)) { + __clear_page_locked(page); + break; + } + list_move_tail(&page->lru, tmplist); + (*bytes) += PAGE_CACHE_SIZE; + expected_index++; + (*nr_pages)++; + } + return rc; }
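Note: readpages_get_pages() factors out the page batching that used to live inline in cifs_readpages(): it peels a run of index-contiguous pages off the tail of page_list, locks and inserts them into the page cache, and stops as soon as the run would exceed rsize. A simplified sketch of a caller, assuming rsize was already obtained from wait_mtu_credits():

	unsigned int nr_pages, bytes;
	loff_t offset;
	struct list_head tmplist;
	int rc;

	rc = readpages_get_pages(mapping, page_list, rsize, &tmplist,
				 &nr_pages, &offset, &bytes);
	if (rc)
		return rc;	/* not even the first page could be cached */

	/* tmplist now holds nr_pages locked pages covering the byte
	 * range [offset, offset + bytes) */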
static int cifs_readpages(struct file *file, struct address_space *mapping, @@@ -3455,10 -3241,19 +3455,10 @@@ struct list_head tmplist; struct cifsFileInfo *open_file = file->private_data; struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - unsigned int rsize = cifs_sb->rsize; + struct TCP_Server_Info *server; pid_t pid;
/* - * Give up immediately if rsize is too small to read an entire page. - * The VFS will fall back to readpage. We should never reach this - * point however since we set ra_pages to 0 when the rsize is smaller - * than a cache page. - */ - if (unlikely(rsize < PAGE_CACHE_SIZE)) - return 0; - - /* * Reads as many pages as possible from fscache. Returns -ENOBUFS * immediately if the cookie is negative * @@@ -3476,7 -3271,7 +3476,7 @@@ pid = current->tgid;
rc = 0; - INIT_LIST_HEAD(&tmplist); + server = tlink_tcon(open_file->tlink)->ses->server;
cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n", __func__, file, mapping, num_pages); @@@ -3493,35 -3288,58 +3493,35 @@@ * the rdata->pages, then we want them in increasing order. */ while (!list_empty(page_list)) { - unsigned int i; - unsigned int bytes = PAGE_CACHE_SIZE; - unsigned int expected_index; - unsigned int nr_pages = 1; + unsigned int i, nr_pages, bytes, rsize; loff_t offset; struct page *page, *tpage; struct cifs_readdata *rdata; + unsigned credits;
- page = list_entry(page_list->prev, struct page, lru); + rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize, + &rsize, &credits); + if (rc) + break;
/* - * Lock the page and put it in the cache. Since no one else - * should have access to this page, we're safe to simply set - * PG_locked without checking it first. + * Give up immediately if rsize is too small to read an entire + * page. The VFS will fall back to readpage. We should never + * reach this point however since we set ra_pages to 0 when the + * rsize is smaller than a cache page. */ - __set_page_locked(page); - rc = add_to_page_cache_locked(page, mapping, - page->index, GFP_KERNEL); + if (unlikely(rsize < PAGE_CACHE_SIZE)) { + add_credits_and_wake_if(server, credits, 0); + return 0; + }
- /* give up if we can't stick it in the cache */ + rc = readpages_get_pages(mapping, page_list, rsize, &tmplist, + &nr_pages, &offset, &bytes); if (rc) { - __clear_page_locked(page); + add_credits_and_wake_if(server, credits, 0); break; }
- /* move first page to the tmplist */ - offset = (loff_t)page->index << PAGE_CACHE_SHIFT; - list_move_tail(&page->lru, &tmplist); - - /* now try and add more pages onto the request */ - expected_index = page->index + 1; - list_for_each_entry_safe_reverse(page, tpage, page_list, lru) { - /* discontinuity ? */ - if (page->index != expected_index) - break; - - /* would this page push the read over the rsize? */ - if (bytes + PAGE_CACHE_SIZE > rsize) - break; - - __set_page_locked(page); - if (add_to_page_cache_locked(page, mapping, - page->index, GFP_KERNEL)) { - __clear_page_locked(page); - break; - } - list_move_tail(&page->lru, &tmplist); - bytes += PAGE_CACHE_SIZE; - expected_index++; - nr_pages++; - } - rdata = cifs_readdata_alloc(nr_pages, cifs_readv_complete); if (!rdata) { /* best to give up if we're out of mem */ @@@ -3532,7 -3350,6 +3532,7 @@@ page_cache_release(page); } rc = -ENOMEM; + add_credits_and_wake_if(server, credits, 0); break; }
@@@ -3543,32 -3360,21 +3543,32 @@@ rdata->pid = pid; rdata->pagesz = PAGE_CACHE_SIZE; rdata->read_into_pages = cifs_readpages_read_into_pages; + rdata->credits = credits;
list_for_each_entry_safe(page, tpage, &tmplist, lru) { list_del(&page->lru); rdata->pages[rdata->nr_pages++] = page; }
- rc = cifs_retry_async_readv(rdata); - if (rc != 0) { + if (!rdata->cfile->invalidHandle || + !cifs_reopen_file(rdata->cfile, true)) + rc = server->ops->async_readv(rdata); + if (rc) { + add_credits_and_wake_if(server, rdata->credits, 0); for (i = 0; i < rdata->nr_pages; i++) { page = rdata->pages[i]; lru_cache_add_file(page); unlock_page(page); page_cache_release(page); + if (rc == -EAGAIN) + list_add_tail(&page->lru, &tmplist); } kref_put(&rdata->refcount, cifs_readdata_release); + if (rc == -EAGAIN) { + /* Re-add pages to the page_list and retry */ + list_splice(&tmplist, page_list); + continue; + } break; }
@@@ -3812,6 -3618,13 +3812,6 @@@ static int cifs_launder_page(struct pag return rc; }
-static int -cifs_pending_writers_wait(void *unused) -{ - schedule(); - return 0; -} - void cifs_oplock_break(struct work_struct *work) { struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, @@@ -3823,7 -3636,7 +3823,7 @@@ int rc = 0;
wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS, - cifs_pending_writers_wait, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE);
server->ops->downgrade_oplock(server, cinode, test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags)); diff --combined fs/cifs/sess.c index 39ee326,27e6175..39b8507 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@@ -46,7 -46,7 +46,7 @@@ static __u32 cifs_ssetup_hdr(struct cif CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4, USHRT_MAX)); pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); - pSMB->req.VcNumber = __constant_cpu_to_le16(1); + pSMB->req.VcNumber = cpu_to_le16(1);
/* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
@@@ -520,559 -520,382 +520,559 @@@ select_sectype(struct TCP_Server_Info * } }
-int -CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, - const struct nls_table *nls_cp) +struct sess_data { + unsigned int xid; + struct cifs_ses *ses; + struct nls_table *nls_cp; + void (*func)(struct sess_data *); + int result; + + /* we will send the SMB in three pieces: + * a fixed length beginning part, an optional + * SPNEGO blob (which can be zero length), and a + * last part which will include the strings + * and rest of bcc area. This allows us to avoid + * a large buffer 17K allocation + */ + int buf0_type; + struct kvec iov[3]; +}; + +static int +sess_alloc_buffer(struct sess_data *sess_data, int wct) { - int rc = 0; - int wct; + int rc; + struct cifs_ses *ses = sess_data->ses; struct smb_hdr *smb_buf; - char *bcc_ptr; - char *str_area; - SESSION_SETUP_ANDX *pSMB; - __u32 capabilities; - __u16 count; - int resp_buf_type; - struct kvec iov[3]; - enum securityEnum type; - __u16 action, bytes_remaining; - struct key *spnego_key = NULL; - __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ - u16 blob_len; - char *ntlmsspblob = NULL;
- if (ses == NULL) { - WARN(1, "%s: ses == NULL!", __func__); - return -EINVAL; - } + rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses, + (void **)&smb_buf);
- type = select_sectype(ses->server, ses->sectype); - cifs_dbg(FYI, "sess setup type %d\n", type); - if (type == Unspecified) { - cifs_dbg(VFS, - "Unable to select appropriate authentication method!"); - return -EINVAL; + if (rc) + return rc; + + sess_data->iov[0].iov_base = (char *)smb_buf; + sess_data->iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4; + /* + * This variable will be used to clear the buffer + * allocated above in case of any error in the calling function. + */ + sess_data->buf0_type = CIFS_SMALL_BUFFER; + + /* 2000 big enough to fit max user, domain, NOS name etc. */ + sess_data->iov[2].iov_base = kmalloc(2000, GFP_KERNEL); + if (!sess_data->iov[2].iov_base) { + rc = -ENOMEM; + goto out_free_smb_buf; }
- if (type == RawNTLMSSP) { - /* if memory allocation is successful, caller of this function - * frees it. - */ - ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); - if (!ses->ntlmssp) - return -ENOMEM; - ses->ntlmssp->sesskey_per_smbsess = false; + return 0; + +out_free_smb_buf: + kfree(smb_buf); + sess_data->iov[0].iov_base = NULL; + sess_data->iov[0].iov_len = 0; + sess_data->buf0_type = CIFS_NO_BUFFER; + return rc; +} + +static void +sess_free_buffer(struct sess_data *sess_data) +{
+ free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base); + sess_data->buf0_type = CIFS_NO_BUFFER; + kfree(sess_data->iov[2].iov_base); +} + +static int +sess_establish_session(struct sess_data *sess_data) +{ + struct cifs_ses *ses = sess_data->ses; + + mutex_lock(&ses->server->srv_mutex); + if (!ses->server->session_estab) { + if (ses->server->sign) { + ses->server->session_key.response = + kmemdup(ses->auth_key.response, + ses->auth_key.len, GFP_KERNEL); + if (!ses->server->session_key.response) { + mutex_unlock(&ses->server->srv_mutex); + return -ENOMEM; + } + ses->server->session_key.len = + ses->auth_key.len; + } + ses->server->sequence_number = 0x2; + ses->server->session_estab = true; } + mutex_unlock(&ses->server->srv_mutex);
-ssetup_ntlmssp_authenticate: - if (phase == NtLmChallenge) - phase = NtLmAuthenticate; /* if ntlmssp, now final phase */ + cifs_dbg(FYI, "CIFS session established successfully\n"); + spin_lock(&GlobalMid_Lock); + ses->status = CifsGood; + ses->need_reconnect = false; + spin_unlock(&GlobalMid_Lock);
- if (type == LANMAN) { -#ifndef CONFIG_CIFS_WEAK_PW_HASH - /* LANMAN and plaintext are less secure and off by default. - So we make this explicitly be turned on in kconfig (in the - build) and turned on at runtime (changed from the default) - in proc/fs/cifs or via mount parm. Unfortunately this is - needed for old Win (e.g. Win95), some obscure NAS and OS/2 */ - return -EOPNOTSUPP; -#endif - wct = 10; /* lanman 2 style sessionsetup */ - } else if ((type == NTLM) || (type == NTLMv2)) { - /* For NTLMv2 failures eventually may need to retry NTLM */ - wct = 13; /* old style NTLM sessionsetup */ - } else /* same size: negotiate or auth, NTLMSSP or extended security */ - wct = 12; + return 0; +}
- rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses, - (void **)&smb_buf); - if (rc) - return rc; +static int +sess_sendreceive(struct sess_data *sess_data) +{ + int rc; + struct smb_hdr *smb_buf = (struct smb_hdr *) sess_data->iov[0].iov_base; + __u16 count;
- pSMB = (SESSION_SETUP_ANDX *)smb_buf; + count = sess_data->iov[1].iov_len + sess_data->iov[2].iov_len; + smb_buf->smb_buf_length = + cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count); + put_bcc(count, smb_buf); + + rc = SendReceive2(sess_data->xid, sess_data->ses, + sess_data->iov, 3 /* num_iovecs */, + &sess_data->buf0_type, + CIFS_LOG_ERROR); + + return rc; +}
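Note: with sess_alloc_buffer(), sess_sendreceive(), sess_establish_session() and sess_free_buffer() in place, each sess_auth_* helper added below follows the same shape: allocate the three-piece request, fill iov[1]/iov[2] for the particular authentication type, send, parse the response, and record the result in sess_data. A hedged sketch of that common skeleton (fill_auth_specific_parts() is a hypothetical stand-in for the per-type body):

static void sess_auth_example(struct sess_data *sess_data)
{
	int rc;

	/* word count varies per auth type: 10 for LANMAN, 13 for NTLM
	 * and NTLMv2, 12 for extended security */
	rc = sess_alloc_buffer(sess_data, 12);
	if (rc)
		goto out;

	rc = fill_auth_specific_parts(sess_data);	/* hypothetical */
	if (rc)
		goto out;

	rc = sess_sendreceive(sess_data);
	if (rc)
		goto out;

	/* ... decode the SESSION_SETUP_ANDX response here ... */

	rc = sess_establish_session(sess_data);
out:
	sess_data->result = rc;
	sess_data->func = NULL;		/* or the next NTLMSSP phase */
	sess_free_buffer(sess_data);
}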
+/* + * LANMAN and plaintext are less secure and off by default. + * So we make this explicitly be turned on in kconfig (in the + * build) and turned on at runtime (changed from the default) + * in proc/fs/cifs or via mount parm. Unfortunately this is + * needed for old Win (e.g. Win95), some obscure NAS and OS/2 + */ +#ifdef CONFIG_CIFS_WEAK_PW_HASH +static void +sess_auth_lanman(struct sess_data *sess_data) +{ + int rc = 0; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + char *bcc_ptr; + struct cifs_ses *ses = sess_data->ses; + char lnm_session_key[CIFS_AUTH_RESP_SIZE]; + __u32 capabilities; + __u16 bytes_remaining; + + /* lanman 2 style sessionsetup */ + /* wct = 10 */ + rc = sess_alloc_buffer(sess_data, 10); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + bcc_ptr = sess_data->iov[2].iov_base; capabilities = cifs_ssetup_hdr(ses, pSMB);
- /* we will send the SMB in three pieces: - a fixed length beginning part, an optional - SPNEGO blob (which can be zero length), and a - last part which will include the strings - and rest of bcc area. This allows us to avoid - a large buffer 17K allocation */ - iov[0].iov_base = (char *)pSMB; - iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4; - - /* setting this here allows the code at the end of the function - to free the request buffer if there's an error */ - resp_buf_type = CIFS_SMALL_BUFFER; + pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
- /* 2000 big enough to fit max user, domain, NOS name etc. */ - str_area = kmalloc(2000, GFP_KERNEL); - if (str_area == NULL) { - rc = -ENOMEM; - goto ssetup_exit; - } - bcc_ptr = str_area; + /* no capabilities flags in old lanman negotiation */ + pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
- iov[1].iov_base = NULL; - iov[1].iov_len = 0; + /* Calculate hash with password and copy into bcc_ptr. + * Encryption Key (stored as in cryptkey) gets used if the + * security mode bit in Negotiate Protocol response states + * to use challenge/response method (i.e. Password bit is 1). + */ + rc = calc_lanman_hash(ses->password, ses->server->cryptkey, + ses->server->sec_mode & SECMODE_PW_ENCRYPT ? + true : false, lnm_session_key);
- if (type == LANMAN) { -#ifdef CONFIG_CIFS_WEAK_PW_HASH - char lnm_session_key[CIFS_AUTH_RESP_SIZE]; + memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + + /* + * can not sign if LANMAN negotiated so no need + * to calculate signing key? but what if server + * changed to do higher than lanman dialect and + * we reconnected would we ever calc signing_key? + */
- pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; + cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n"); + /* Unicode not allowed for LANMAN dialects */ + ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
- /* no capabilities flags in old lanman negotiation */ + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base;
- pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); + rc = sess_sendreceive(sess_data); + if (rc) + goto out;
- /* Calculate hash with password and copy into bcc_ptr. - * Encryption Key (stored as in cryptkey) gets used if the - * security mode bit in Negottiate Protocol response states - * to use challenge/response method (i.e. Password bit is 1). - */ + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
- rc = calc_lanman_hash(ses->password, ses->server->cryptkey, - ses->server->sec_mode & SECMODE_PW_ENCRYPT ? - true : false, lnm_session_key); + /* lanman response has a word count of 3 */ + if (smb_buf->WordCount != 3) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out; + }
- memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE); - bcc_ptr += CIFS_AUTH_RESP_SIZE; + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) + cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
- /* can not sign if LANMAN negotiated so no need - to calculate signing key? but what if server - changed to do higher than lanman dialect and - we reconnected would we ever calc signing_key? */ + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf);
- cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n"); - /* Unicode not allowed for LANMAN dialects */ - ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); + /* BB check if Unicode and decode strings */ + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; + } + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } else { + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } + + rc = sess_establish_session(sess_data); +out: + sess_data->result = rc; + sess_data->func = NULL; + sess_free_buffer(sess_data); +} + +#else + +static void +sess_auth_lanman(struct sess_data *sess_data) +{ + sess_data->result = -EOPNOTSUPP; + sess_data->func = NULL; +} #endif - } else if (type == NTLM) { - pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); - pSMB->req_no_secext.CaseInsensitivePasswordLength = + +static void +sess_auth_ntlm(struct sess_data *sess_data) +{ + int rc = 0; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + char *bcc_ptr; + struct cifs_ses *ses = sess_data->ses; + __u32 capabilities; + __u16 bytes_remaining; + + /* old style NTLM sessionsetup */ + /* wct = 13 */ + rc = sess_alloc_buffer(sess_data, 13); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + bcc_ptr = sess_data->iov[2].iov_base; + capabilities = cifs_ssetup_hdr(ses, pSMB); + + pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); + pSMB->req_no_secext.CaseInsensitivePasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); - pSMB->req_no_secext.CaseSensitivePasswordLength = + pSMB->req_no_secext.CaseSensitivePasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
- /* calculate ntlm response and session key */ - rc = setup_ntlm_response(ses, nls_cp); - if (rc) { - cifs_dbg(VFS, "Error %d during NTLM authentication\n", + /* calculate ntlm response and session key */ + rc = setup_ntlm_response(ses, sess_data->nls_cp); + if (rc) { + cifs_dbg(VFS, "Error %d during NTLM authentication\n", rc); - goto ssetup_exit; - } + goto out; + }
- /* copy ntlm response */ - memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, - CIFS_AUTH_RESP_SIZE); - bcc_ptr += CIFS_AUTH_RESP_SIZE; - memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, - CIFS_AUTH_RESP_SIZE); - bcc_ptr += CIFS_AUTH_RESP_SIZE; - - if (ses->capabilities & CAP_UNICODE) { - /* unicode strings must be word aligned */ - if (iov[0].iov_len % 2) { - *bcc_ptr = 0; - bcc_ptr++; - } - unicode_ssetup_strings(&bcc_ptr, ses, nls_cp); - } else - ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); - } else if (type == NTLMv2) { - pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); - - /* LM2 password would be here if we supported it */ - pSMB->req_no_secext.CaseInsensitivePasswordLength = 0; - - /* calculate nlmv2 response and session key */ - rc = setup_ntlmv2_rsp(ses, nls_cp); - if (rc) { - cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n", - rc); - goto ssetup_exit; + /* copy ntlm response */ + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + + if (ses->capabilities & CAP_UNICODE) { + /* unicode strings must be word aligned */ + if (sess_data->iov[0].iov_len % 2) { + *bcc_ptr = 0; + bcc_ptr++; } - memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, - ses->auth_key.len - CIFS_SESS_KEY_SIZE); - bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE; - - /* set case sensitive password length after tilen may get - * assigned, tilen is 0 otherwise. - */ - pSMB->req_no_secext.CaseSensitivePasswordLength = - cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); + unicode_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); + } else { + ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); + }
- if (ses->capabilities & CAP_UNICODE) { - if (iov[0].iov_len % 2) { - *bcc_ptr = 0; - bcc_ptr++; - } - unicode_ssetup_strings(&bcc_ptr, ses, nls_cp); - } else - ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); - } else if (type == Kerberos) { -#ifdef CONFIG_CIFS_UPCALL - struct cifs_spnego_msg *msg;
- spnego_key = cifs_get_spnego_key(ses); - if (IS_ERR(spnego_key)) { - rc = PTR_ERR(spnego_key); - spnego_key = NULL; - goto ssetup_exit; - } + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base;
- msg = spnego_key->payload.data; - /* check version field to make sure that cifs.upcall is - sending us a response in an expected form */ - if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { - cifs_dbg(VFS, "incorrect version of cifs.upcall " - "expected %d but got %d)", - CIFS_SPNEGO_UPCALL_VERSION, msg->version); - rc = -EKEYREJECTED; - goto ssetup_exit; - } + rc = sess_sendreceive(sess_data); + if (rc) + goto out;
- ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, - GFP_KERNEL); - if (!ses->auth_key.response) { - cifs_dbg(VFS, - "Kerberos can't allocate (%u bytes) memory", - msg->sesskey_len); - rc = -ENOMEM; - goto ssetup_exit; - } - ses->auth_key.len = msg->sesskey_len; - - pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; - capabilities |= CAP_EXTENDED_SECURITY; - pSMB->req.Capabilities = cpu_to_le32(capabilities); - iov[1].iov_base = msg->data + msg->sesskey_len; - iov[1].iov_len = msg->secblob_len; - pSMB->req.SecurityBlobLength = cpu_to_le16(iov[1].iov_len); - - if (ses->capabilities & CAP_UNICODE) { - /* unicode strings must be word aligned */ - if ((iov[0].iov_len + iov[1].iov_len) % 2) { - *bcc_ptr = 0; - bcc_ptr++; - } - unicode_oslm_strings(&bcc_ptr, nls_cp); - unicode_domain_string(&bcc_ptr, ses, nls_cp); - } else - /* BB: is this right? */ - ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); -#else /* ! CONFIG_CIFS_UPCALL */ - cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n"); - rc = -ENOSYS; - goto ssetup_exit; -#endif /* CONFIG_CIFS_UPCALL */ - } else if (type == RawNTLMSSP) { - if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { - cifs_dbg(VFS, "NTLMSSP requires Unicode support\n"); - rc = -ENOSYS; - goto ssetup_exit; - } + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
- cifs_dbg(FYI, "ntlmssp session setup phase %d\n", phase); - pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; - capabilities |= CAP_EXTENDED_SECURITY; - pSMB->req.Capabilities |= cpu_to_le32(capabilities); - switch(phase) { - case NtLmNegotiate: - build_ntlmssp_negotiate_blob( - pSMB->req.SecurityBlob, ses); - iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE); - iov[1].iov_base = pSMB->req.SecurityBlob; - pSMB->req.SecurityBlobLength = - cpu_to_le16(sizeof(NEGOTIATE_MESSAGE)); - break; - case NtLmAuthenticate: - /* - * 5 is an empirical value, large enough to hold - * authenticate message plus max 10 of av paris, - * domain, user, workstation names, flags, etc. - */ - ntlmsspblob = kzalloc( - 5*sizeof(struct _AUTHENTICATE_MESSAGE), - GFP_KERNEL); - if (!ntlmsspblob) { - rc = -ENOMEM; - goto ssetup_exit; - } + if (smb_buf->WordCount != 3) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out; + }
- rc = build_ntlmssp_auth_blob(ntlmsspblob, - &blob_len, ses, nls_cp); - if (rc) - goto ssetup_exit; - iov[1].iov_len = blob_len; - iov[1].iov_base = ntlmsspblob; - pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len); - /* - * Make sure that we tell the server that we are using - * the uid that it just gave us back on the response - * (challenge) - */ - smb_buf->Uid = ses->Suid; - break; - default: - cifs_dbg(VFS, "invalid phase %d\n", phase); - rc = -ENOSYS; - goto ssetup_exit; + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) + cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cifs_dbg(FYI, "UID = %llu\n", ses->Suid); + + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf); + + /* BB check if Unicode and decode strings */ + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; } - /* unicode strings must be word aligned */ - if ((iov[0].iov_len + iov[1].iov_len) % 2) { + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } else { + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } + + rc = sess_establish_session(sess_data); +out: + sess_data->result = rc; + sess_data->func = NULL; + sess_free_buffer(sess_data); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; +} + +static void +sess_auth_ntlmv2(struct sess_data *sess_data) +{ + int rc = 0; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + char *bcc_ptr; + struct cifs_ses *ses = sess_data->ses; + __u32 capabilities; + __u16 bytes_remaining; + + /* old style NTLM sessionsetup */ + /* wct = 13 */ + rc = sess_alloc_buffer(sess_data, 13); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + bcc_ptr = sess_data->iov[2].iov_base; + capabilities = cifs_ssetup_hdr(ses, pSMB); + + pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); + + /* LM2 password would be here if we supported it */ + pSMB->req_no_secext.CaseInsensitivePasswordLength = 0; + + /* calculate nlmv2 response and session key */ + rc = setup_ntlmv2_rsp(ses, sess_data->nls_cp); + if (rc) { + cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n", rc); + goto out; + } + + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + ses->auth_key.len - CIFS_SESS_KEY_SIZE); + bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE; + + /* set case sensitive password length after tilen may get + * assigned, tilen is 0 otherwise. + */ + pSMB->req_no_secext.CaseSensitivePasswordLength = + cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); + + if (ses->capabilities & CAP_UNICODE) { + if (sess_data->iov[0].iov_len % 2) { *bcc_ptr = 0; bcc_ptr++; } - unicode_oslm_strings(&bcc_ptr, nls_cp); + unicode_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); } else { - cifs_dbg(VFS, "secType %d not supported!\n", type); - rc = -ENOSYS; - goto ssetup_exit; + ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); }
- iov[2].iov_base = str_area; - iov[2].iov_len = (long) bcc_ptr - (long) str_area;
- count = iov[1].iov_len + iov[2].iov_len; - smb_buf->smb_buf_length = - cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count); + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base;
- put_bcc(count, smb_buf); + rc = sess_sendreceive(sess_data); + if (rc) + goto out;
- rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type, - CIFS_LOG_ERROR); - /* SMB request buf freed in SendReceive2 */ + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; + + if (smb_buf->WordCount != 3) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out; + } + + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) + cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
- pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; - smb_buf = (struct smb_hdr *)iov[0].iov_base; + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf);
- if ((type == RawNTLMSSP) && (resp_buf_type != CIFS_NO_BUFFER) && - (smb_buf->Status.CifsError == - cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) { - if (phase != NtLmNegotiate) { - cifs_dbg(VFS, "Unexpected more processing error\n"); - goto ssetup_exit; + /* BB check if Unicode and decode strings */ + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; } - /* NTLMSSP Negotiate sent now processing challenge (response) */ - phase = NtLmChallenge; /* process ntlmssp challenge */ - rc = 0; /* MORE_PROC rc is not an error here, but expected */ + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } else { + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); } + + rc = sess_establish_session(sess_data); +out: + sess_data->result = rc; + sess_data->func = NULL; + sess_free_buffer(sess_data); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; +} + +#ifdef CONFIG_CIFS_UPCALL +static void +sess_auth_kerberos(struct sess_data *sess_data) +{ + int rc = 0; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + char *bcc_ptr; + struct cifs_ses *ses = sess_data->ses; + __u32 capabilities; + __u16 bytes_remaining; + struct key *spnego_key = NULL; + struct cifs_spnego_msg *msg; + u16 blob_len; + + /* extended security */ + /* wct = 12 */ + rc = sess_alloc_buffer(sess_data, 12); if (rc) - goto ssetup_exit; + goto out;
- if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) { + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + bcc_ptr = sess_data->iov[2].iov_base; + capabilities = cifs_ssetup_hdr(ses, pSMB); + + spnego_key = cifs_get_spnego_key(ses); + if (IS_ERR(spnego_key)) { + rc = PTR_ERR(spnego_key); + spnego_key = NULL; + goto out; + } + + msg = spnego_key->payload.data; + /* + * check version field to make sure that cifs.upcall is + * sending us a response in an expected form + */ + if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { + cifs_dbg(VFS, + "incorrect version of cifs.upcall (expected %d but got %d)", + CIFS_SPNEGO_UPCALL_VERSION, msg->version); + rc = -EKEYREJECTED; + goto out_put_spnego_key; + } + + ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, + GFP_KERNEL); + if (!ses->auth_key.response) { + cifs_dbg(VFS, "Kerberos can't allocate (%u bytes) memory", + msg->sesskey_len); + rc = -ENOMEM; + goto out_put_spnego_key; + } + ses->auth_key.len = msg->sesskey_len; + + pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; + capabilities |= CAP_EXTENDED_SECURITY; + pSMB->req.Capabilities = cpu_to_le32(capabilities); + sess_data->iov[1].iov_base = msg->data + msg->sesskey_len; + sess_data->iov[1].iov_len = msg->secblob_len; + pSMB->req.SecurityBlobLength = cpu_to_le16(sess_data->iov[1].iov_len); + + if (ses->capabilities & CAP_UNICODE) { + /* unicode strings must be word aligned */ + if ((sess_data->iov[0].iov_len + + sess_data->iov[1].iov_len) % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } + unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp); + unicode_domain_string(&bcc_ptr, ses, sess_data->nls_cp); + } else { + /* BB: is this right? */ + ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); + } + + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base; + + rc = sess_sendreceive(sess_data); + if (rc) + goto out_put_spnego_key; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; + + if (smb_buf->WordCount != 4) { rc = -EIO; cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); - goto ssetup_exit; + goto out_put_spnego_key; } - action = le16_to_cpu(pSMB->resp.Action); - if (action & GUEST_LOGIN) + + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ cifs_dbg(FYI, "UID = %llu\n", ses->Suid); - /* response can have either 3 or 4 word count - Samba sends 3 */ - /* and lanman response is 3 */ + bytes_remaining = get_bcc(smb_buf); bcc_ptr = pByteArea(smb_buf);
- if (smb_buf->WordCount == 4) { - blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); - if (blob_len > bytes_remaining) { - cifs_dbg(VFS, "bad security blob length %d\n", - blob_len); - rc = -EINVAL; - goto ssetup_exit; - } - if (phase == NtLmChallenge) { - rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses); - /* now goto beginning for ntlmssp authenticate phase */ - if (rc) - goto ssetup_exit; - } - bcc_ptr += blob_len; - bytes_remaining -= blob_len; + blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); + if (blob_len > bytes_remaining) { + cifs_dbg(VFS, "bad security blob length %d\n", + blob_len); + rc = -EINVAL; + goto out_put_spnego_key; } + bcc_ptr += blob_len; + bytes_remaining -= blob_len;
/* BB check if Unicode and decode strings */ if (bytes_remaining == 0) { @@@ -1083,371 -906,60 +1083,371 @@@ ++bcc_ptr; --bytes_remaining; } - decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp); + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); } else { - decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp); + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); }
-ssetup_exit: - if (spnego_key) { - key_invalidate(spnego_key); - key_put(spnego_key); + rc = sess_establish_session(sess_data); +out_put_spnego_key: + key_invalidate(spnego_key); + key_put(spnego_key); +out: + sess_data->result = rc; + sess_data->func = NULL; + sess_free_buffer(sess_data); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; +} + +#else + +static void +sess_auth_kerberos(struct sess_data *sess_data) +{ + cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n"); + sess_data->result = -ENOSYS; + sess_data->func = NULL; +} +#endif /* ! CONFIG_CIFS_UPCALL */ + +/* + * The required kvec buffers have to be allocated before calling this + * function. + */ +static int +_sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data) +{ + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + struct cifs_ses *ses = sess_data->ses; + __u32 capabilities; + char *bcc_ptr; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)pSMB; + + capabilities = cifs_ssetup_hdr(ses, pSMB); + if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { + cifs_dbg(VFS, "NTLMSSP requires Unicode support\n"); + return -ENOSYS; } - kfree(str_area); - kfree(ntlmsspblob); - ntlmsspblob = NULL; - if (resp_buf_type == CIFS_SMALL_BUFFER) { - cifs_dbg(FYI, "ssetup freeing small buf %p\n", iov[0].iov_base); - cifs_small_buf_release(iov[0].iov_base); - } else if (resp_buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base);
- /* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */ - if ((phase == NtLmChallenge) && (rc == 0)) - goto ssetup_ntlmssp_authenticate; + pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; + capabilities |= CAP_EXTENDED_SECURITY; + pSMB->req.Capabilities |= cpu_to_le32(capabilities); + + bcc_ptr = sess_data->iov[2].iov_base; + /* unicode strings must be word aligned */ + if ((sess_data->iov[0].iov_len + sess_data->iov[1].iov_len) % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } + unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp); + + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base; + + return 0; +} + +static void +sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data); + +static void +sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data) +{ + int rc; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + struct cifs_ses *ses = sess_data->ses; + __u16 bytes_remaining; + char *bcc_ptr; + u16 blob_len; + + cifs_dbg(FYI, "rawntlmssp session setup negotiate phase\n"); + + /* + * if memory allocation is successful, caller of this function + * frees it. + */ + ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); + if (!ses->ntlmssp) { + rc = -ENOMEM; + goto out; + } + ses->ntlmssp->sesskey_per_smbsess = false; + + /* wct = 12 */ + rc = sess_alloc_buffer(sess_data, 12); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + + /* Build security blob before we assemble the request */ + build_ntlmssp_negotiate_blob(pSMB->req.SecurityBlob, ses); + sess_data->iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE); + sess_data->iov[1].iov_base = pSMB->req.SecurityBlob; + pSMB->req.SecurityBlobLength = cpu_to_le16(sizeof(NEGOTIATE_MESSAGE)); + + rc = _sess_auth_rawntlmssp_assemble_req(sess_data); + if (rc) + goto out; + + rc = sess_sendreceive(sess_data); + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; + + /* If true, rc here is expected and not an error */ + if (sess_data->buf0_type != CIFS_NO_BUFFER && + smb_buf->Status.CifsError == + cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED)) + rc = 0; + + if (rc) + goto out; + + cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n"); + + if (smb_buf->WordCount != 4) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out; + } + + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cifs_dbg(FYI, "UID = %llu\n", ses->Suid); + + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf); + + blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); + if (blob_len > bytes_remaining) { + cifs_dbg(VFS, "bad security blob length %d\n", + blob_len); + rc = -EINVAL; + goto out; + } + + rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses); +out: + sess_free_buffer(sess_data);
if (!rc) { - mutex_lock(&ses->server->srv_mutex); - if (!ses->server->session_estab) { - if (ses->server->sign) { - ses->server->session_key.response = - kmemdup(ses->auth_key.response, - ses->auth_key.len, GFP_KERNEL); - if (!ses->server->session_key.response) { - rc = -ENOMEM; - mutex_unlock(&ses->server->srv_mutex); - goto keycp_exit; - } - ses->server->session_key.len = - ses->auth_key.len; - } - ses->server->sequence_number = 0x2; - ses->server->session_estab = true; - } - mutex_unlock(&ses->server->srv_mutex); + sess_data->func = sess_auth_rawntlmssp_authenticate; + return; + } + + /* Else error. Cleanup */ + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + kfree(ses->ntlmssp); + ses->ntlmssp = NULL; + + sess_data->func = NULL; + sess_data->result = rc; +}
- cifs_dbg(FYI, "CIFS session established successfully\n"); - spin_lock(&GlobalMid_Lock); - ses->status = CifsGood; - ses->need_reconnect = false; - spin_unlock(&GlobalMid_Lock); +static void +sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data) +{ + int rc; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + struct cifs_ses *ses = sess_data->ses; + __u16 bytes_remaining; + char *bcc_ptr; + char *ntlmsspblob = NULL; + u16 blob_len; + + cifs_dbg(FYI, "rawntlmssp session setup authenticate phase\n"); + + /* wct = 12 */ + rc = sess_alloc_buffer(sess_data, 12); + if (rc) + goto out; + + /* Build security blob before we assemble the request */ + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)pSMB; + /* + * 5 is an empirical value, large enough to hold + * authenticate message plus max 10 of av paris, + * domain, user, workstation names, flags, etc. + */ + ntlmsspblob = kzalloc(5*sizeof(struct _AUTHENTICATE_MESSAGE), + GFP_KERNEL); + if (!ntlmsspblob) { + rc = -ENOMEM; + goto out; }
-keycp_exit: + rc = build_ntlmssp_auth_blob(ntlmsspblob, + &blob_len, ses, sess_data->nls_cp); + if (rc) + goto out_free_ntlmsspblob; + sess_data->iov[1].iov_len = blob_len; + sess_data->iov[1].iov_base = ntlmsspblob; + pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len); + /* + * Make sure that we tell the server that we are using + * the uid that it just gave us back on the response + * (challenge) + */ + smb_buf->Uid = ses->Suid; + + rc = _sess_auth_rawntlmssp_assemble_req(sess_data); + if (rc) + goto out_free_ntlmsspblob; + + rc = sess_sendreceive(sess_data); + if (rc) + goto out_free_ntlmsspblob; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; + if (smb_buf->WordCount != 4) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out_free_ntlmsspblob; + } + + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) + cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf); + blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); + if (blob_len > bytes_remaining) { + cifs_dbg(VFS, "bad security blob length %d\n", + blob_len); + rc = -EINVAL; + goto out_free_ntlmsspblob; + } + bcc_ptr += blob_len; + bytes_remaining -= blob_len; + + + /* BB check if Unicode and decode strings */ + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; + } + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } else { + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } + +out_free_ntlmsspblob: + kfree(ntlmsspblob); +out: + sess_free_buffer(sess_data); + + if (!rc) + rc = sess_establish_session(sess_data); + + /* Cleanup */ kfree(ses->auth_key.response); ses->auth_key.response = NULL; kfree(ses->ntlmssp); + ses->ntlmssp = NULL; + + sess_data->func = NULL; + sess_data->result = rc; +} + +static int select_sec(struct cifs_ses *ses, struct sess_data *sess_data) +{ + int type; + + type = select_sectype(ses->server, ses->sectype); + cifs_dbg(FYI, "sess setup type %d\n", type); + if (type == Unspecified) { + cifs_dbg(VFS, + "Unable to select appropriate authentication method!"); + return -EINVAL; + } + + switch (type) { + case LANMAN: + /* LANMAN and plaintext are less secure and off by default. + * So we make this explicitly be turned on in kconfig (in the + * build) and turned on at runtime (changed from the default) + * in proc/fs/cifs or via mount parm. Unfortunately this is + * needed for old Win (e.g. 
Win95), some obscure NAS and OS/2 */ +#ifdef CONFIG_CIFS_WEAK_PW_HASH + sess_data->func = sess_auth_lanman; + break; +#else + return -EOPNOTSUPP; +#endif + case NTLM: + sess_data->func = sess_auth_ntlm; + break; + case NTLMv2: + sess_data->func = sess_auth_ntlmv2; + break; + case Kerberos: +#ifdef CONFIG_CIFS_UPCALL + sess_data->func = sess_auth_kerberos; + break; +#else + cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n"); + return -ENOSYS; + break; +#endif /* CONFIG_CIFS_UPCALL */ + case RawNTLMSSP: + sess_data->func = sess_auth_rawntlmssp_negotiate; + break; + default: + cifs_dbg(VFS, "secType %d not supported!\n", type); + return -ENOSYS; + } + + return 0; +} + +int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + int rc = 0; + struct sess_data *sess_data; + + if (ses == NULL) { + WARN(1, "%s: ses == NULL!", __func__); + return -EINVAL; + } + + sess_data = kzalloc(sizeof(struct sess_data), GFP_KERNEL); + if (!sess_data) + return -ENOMEM; + + rc = select_sec(ses, sess_data); + if (rc) + goto out; + + sess_data->xid = xid; + sess_data->ses = ses; + sess_data->buf0_type = CIFS_NO_BUFFER; + sess_data->nls_cp = (struct nls_table *) nls_cp; + + while (sess_data->func) + sess_data->func(sess_data); + + /* Store result before we free sess_data */ + rc = sess_data->result;
+out: + kfree(sess_data); return rc; } diff --combined fs/cifs/smb2misc.c index f2e6ac2,36867bd..da0faf6 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@@ -67,27 -67,27 +67,27 @@@ check_smb2_hdr(struct smb2_hdr *hdr, __ * indexed by command in host byte order */ static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = { - /* SMB2_NEGOTIATE */ __constant_cpu_to_le16(65), - /* SMB2_SESSION_SETUP */ __constant_cpu_to_le16(9), - /* SMB2_LOGOFF */ __constant_cpu_to_le16(4), - /* SMB2_TREE_CONNECT */ __constant_cpu_to_le16(16), - /* SMB2_TREE_DISCONNECT */ __constant_cpu_to_le16(4), - /* SMB2_CREATE */ __constant_cpu_to_le16(89), - /* SMB2_CLOSE */ __constant_cpu_to_le16(60), - /* SMB2_FLUSH */ __constant_cpu_to_le16(4), - /* SMB2_READ */ __constant_cpu_to_le16(17), - /* SMB2_WRITE */ __constant_cpu_to_le16(17), - /* SMB2_LOCK */ __constant_cpu_to_le16(4), - /* SMB2_IOCTL */ __constant_cpu_to_le16(49), + /* SMB2_NEGOTIATE */ cpu_to_le16(65), + /* SMB2_SESSION_SETUP */ cpu_to_le16(9), + /* SMB2_LOGOFF */ cpu_to_le16(4), + /* SMB2_TREE_CONNECT */ cpu_to_le16(16), + /* SMB2_TREE_DISCONNECT */ cpu_to_le16(4), + /* SMB2_CREATE */ cpu_to_le16(89), + /* SMB2_CLOSE */ cpu_to_le16(60), + /* SMB2_FLUSH */ cpu_to_le16(4), + /* SMB2_READ */ cpu_to_le16(17), + /* SMB2_WRITE */ cpu_to_le16(17), + /* SMB2_LOCK */ cpu_to_le16(4), + /* SMB2_IOCTL */ cpu_to_le16(49), /* BB CHECK this ... not listed in documentation */ - /* SMB2_CANCEL */ __constant_cpu_to_le16(0), - /* SMB2_ECHO */ __constant_cpu_to_le16(4), - /* SMB2_QUERY_DIRECTORY */ __constant_cpu_to_le16(9), - /* SMB2_CHANGE_NOTIFY */ __constant_cpu_to_le16(9), - /* SMB2_QUERY_INFO */ __constant_cpu_to_le16(9), - /* SMB2_SET_INFO */ __constant_cpu_to_le16(2), + /* SMB2_CANCEL */ cpu_to_le16(0), + /* SMB2_ECHO */ cpu_to_le16(4), + /* SMB2_QUERY_DIRECTORY */ cpu_to_le16(9), + /* SMB2_CHANGE_NOTIFY */ cpu_to_le16(9), + /* SMB2_QUERY_INFO */ cpu_to_le16(9), + /* SMB2_SET_INFO */ cpu_to_le16(2), /* BB FIXME can also be 44 for lease break */ - /* SMB2_OPLOCK_BREAK */ __constant_cpu_to_le16(24) + /* SMB2_OPLOCK_BREAK */ cpu_to_le16(24) };
int @@@ -437,7 -437,7 +437,7 @@@ smb2_tcon_has_lease(struct cifs_tcon *t continue;
cifs_dbg(FYI, "found in the open list\n"); - cifs_dbg(FYI, "lease key match, lease break 0x%d\n", + cifs_dbg(FYI, "lease key match, lease break 0x%x\n", le32_to_cpu(rsp->NewLeaseState));
server->ops->set_oplock_level(cinode, lease_state, 0, NULL); @@@ -467,7 -467,7 +467,7 @@@ }
cifs_dbg(FYI, "found in the pending open list\n"); - cifs_dbg(FYI, "lease key match, lease break 0x%d\n", + cifs_dbg(FYI, "lease key match, lease break 0x%x\n", le32_to_cpu(rsp->NewLeaseState));
open->oplock = lease_state; @@@ -546,7 -546,7 +546,7 @@@ smb2_is_valid_oplock_break(char *buffer return false; }
- cifs_dbg(FYI, "oplock level 0x%d\n", rsp->OplockLevel); + cifs_dbg(FYI, "oplock level 0x%x\n", rsp->OplockLevel);
/* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); diff --combined fs/cifs/smb2ops.c index 77f8aeb,7f99a0f..5278331 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@@ -19,7 -19,6 +19,7 @@@
#include <linux/pagemap.h> #include <linux/vfs.h> +#include <linux/falloc.h> #include "cifsglob.h" #include "smb2pdu.h" #include "smb2proto.h" @@@ -113,53 -112,6 +113,53 @@@ smb2_get_credits(struct mid_q_entry *mi return le16_to_cpu(((struct smb2_hdr *)mid->resp_buf)->CreditRequest); }
+static int +smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, + unsigned int *num, unsigned int *credits) +{ + int rc = 0; + unsigned int scredits; + + spin_lock(&server->req_lock); + while (1) { + if (server->credits <= 0) { + spin_unlock(&server->req_lock); + cifs_num_waiters_inc(server); + rc = wait_event_killable(server->request_q, + has_credits(server, &server->credits)); + cifs_num_waiters_dec(server); + if (rc) + return rc; + spin_lock(&server->req_lock); + } else { + if (server->tcpStatus == CifsExiting) { + spin_unlock(&server->req_lock); + return -ENOENT; + } + + scredits = server->credits; + /* can deadlock with reopen */ + if (scredits == 1) { + *num = SMB2_MAX_BUFFER_SIZE; + *credits = 0; + break; + } + + /* leave one credit for a possible reopen */ + scredits--; + *num = min_t(unsigned int, size, + scredits * SMB2_MAX_BUFFER_SIZE); + + *credits = DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE); + server->credits -= *credits; + server->in_flight++; + break; + } + } + spin_unlock(&server->req_lock); + return rc; +} + static __u64 smb2_get_next_mid(struct TCP_Server_Info *server) { @@@ -230,9 -182,8 +230,9 @@@ smb2_negotiate_wsize(struct cifs_tcon * /* start with specified wsize, or default */ wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE; wsize = min_t(unsigned int, wsize, server->max_write); - /* set it to the maximum buffer size value we can send with 1 credit */ - wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); + + if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) + wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
return wsize; } @@@ -246,9 -197,8 +246,9 @@@ smb2_negotiate_rsize(struct cifs_tcon * /* start with specified rsize, or default */ rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE; rsize = min_t(unsigned int, rsize, server->max_read); - /* set it to the maximum buffer size value we can send with 1 credit */ - rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE); + + if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) + rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
return rsize; } @@@ -597,7 -547,7 +597,7 @@@ smb2_clone_range(const unsigned int xid goto cchunk_out;
/* For now array only one chunk long, will make more flexible later */ - pcchunk->ChunkCount = __constant_cpu_to_le32(1); + pcchunk->ChunkCount = cpu_to_le32(1); pcchunk->Reserved = 0; pcchunk->Reserved2 = 0;
@@@ -737,7 -687,7 +737,7 @@@ smb2_set_file_size(const unsigned int x { __le64 eof = cpu_to_le64(size); return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, - cfile->fid.volatile_fid, cfile->pid, &eof); + cfile->fid.volatile_fid, cfile->pid, &eof, false); }
static int @@@ -1154,13 -1104,6 +1154,13 @@@ smb3_parse_lease_buf(void *buf, unsigne return le32_to_cpu(lc->lcontext.LeaseState); }
+static unsigned int +smb2_wp_retry_size(struct inode *inode) +{ + return min_t(unsigned int, CIFS_SB(inode->i_sb)->wsize, + SMB2_MAX_BUFFER_SIZE); +} + struct smb_version_operations smb20_operations = { .compare_fids = smb2_compare_fids, .setup_request = smb2_setup_request, @@@ -1170,7 -1113,6 +1170,7 @@@ .set_credits = smb2_set_credits, .get_credits_field = smb2_get_credits_field, .get_credits = smb2_get_credits, + .wait_mtu_credits = cifs_wait_mtu_credits, .get_next_mid = smb2_get_next_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, @@@ -1235,7 -1177,6 +1235,7 @@@ .create_lease_buf = smb2_create_lease_buf, .parse_lease_buf = smb2_parse_lease_buf, .clone_range = smb2_clone_range, + .wp_retry_size = smb2_wp_retry_size, };
struct smb_version_operations smb21_operations = { @@@ -1247,7 -1188,6 +1247,7 @@@ .set_credits = smb2_set_credits, .get_credits_field = smb2_get_credits_field, .get_credits = smb2_get_credits, + .wait_mtu_credits = smb2_wait_mtu_credits, .get_next_mid = smb2_get_next_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, @@@ -1312,7 -1252,6 +1312,7 @@@ .create_lease_buf = smb2_create_lease_buf, .parse_lease_buf = smb2_parse_lease_buf, .clone_range = smb2_clone_range, + .wp_retry_size = smb2_wp_retry_size, };
struct smb_version_operations smb30_operations = { @@@ -1324,7 -1263,6 +1324,7 @@@ .set_credits = smb2_set_credits, .get_credits_field = smb2_get_credits_field, .get_credits = smb2_get_credits, + .wait_mtu_credits = smb2_wait_mtu_credits, .get_next_mid = smb2_get_next_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, @@@ -1392,7 -1330,6 +1392,7 @@@ .parse_lease_buf = smb3_parse_lease_buf, .clone_range = smb2_clone_range, .validate_negotiate = smb3_validate_negotiate, + .wp_retry_size = smb2_wp_retry_size, };
struct smb_version_values smb20_values = { diff --combined fs/cifs/smb2pdu.c index 42ebc1a,a9b03c2..a5f2a5c --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@@ -108,6 -108,7 +108,6 @@@ smb2_hdr_assemble(struct smb2_hdr *hdr if (!tcon) goto out;
- /* BB FIXME when we do write > 64K add +1 for every 64K in req or rsp */ /* GLOBAL_CAP_LARGE_MTU will only be set if dialect > SMB2.02 */ /* See sections 2.2.4 and 3.2.4.1.5 of MS-SMB2 */ if ((tcon->ses) && @@@ -244,6 -245,10 +244,6 @@@ smb2_reconnect(__le16 smb2_command, str if (rc) goto out; atomic_inc(&tconInfoReconnectCount); - /* - * BB FIXME add code to check if wsize needs update due to negotiated - * smb buffer size shrinking. - */ out: /* * Check if handle based operation so we know whether we can continue @@@ -304,6 -309,16 +304,6 @@@ small_smb2_init(__le16 smb2_command, st return rc; }
-static void -free_rsp_buf(int resp_buftype, void *rsp) -{ - if (resp_buftype == CIFS_SMALL_BUFFER) - cifs_small_buf_release(rsp); - else if (resp_buftype == CIFS_LARGE_BUFFER) - cifs_buf_release(rsp); -} - - /* * * SMB2 Worker functions follow: @@@ -1354,7 -1369,7 +1354,7 @@@ SMB2_set_compression(const unsigned in char *ret_data = NULL;
fsctl_input.CompressionState = - __constant_cpu_to_le16(COMPRESSION_FORMAT_DEFAULT); + cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, FSCTL_SET_COMPRESSION, true /* is_fsctl */, @@@ -1723,18 -1738,12 +1723,18 @@@ smb2_readv_callback(struct mid_q_entry rc); } /* FIXME: should this be counted toward the initiating task? */ - task_io_account_read(rdata->bytes); - cifs_stats_bytes_read(tcon, rdata->bytes); + task_io_account_read(rdata->got_bytes); + cifs_stats_bytes_read(tcon, rdata->got_bytes); break; case MID_REQUEST_SUBMITTED: case MID_RETRY_NEEDED: rdata->result = -EAGAIN; + if (server->sign && rdata->got_bytes) + /* reset bytes number since we can not check a sign */ + rdata->got_bytes = 0; + /* FIXME: should this be counted toward the initiating task? */ + task_io_account_read(rdata->got_bytes); + cifs_stats_bytes_read(tcon, rdata->got_bytes); break; default: if (rdata->result != -ENODATA) @@@ -1753,12 -1762,11 +1753,12 @@@ int smb2_async_readv(struct cifs_readdata *rdata) { - int rc; + int rc, flags = 0; struct smb2_hdr *buf; struct cifs_io_parms io_parms; struct smb_rqst rqst = { .rq_iov = &rdata->iov, .rq_nvec = 1 }; + struct TCP_Server_Info *server;
cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n", __func__, rdata->offset, rdata->bytes); @@@ -1769,41 -1777,18 +1769,41 @@@ io_parms.persistent_fid = rdata->cfile->fid.persistent_fid; io_parms.volatile_fid = rdata->cfile->fid.volatile_fid; io_parms.pid = rdata->pid; + + server = io_parms.tcon->ses->server; + rc = smb2_new_read_req(&rdata->iov, &io_parms, 0, 0); - if (rc) + if (rc) { + if (rc == -EAGAIN && rdata->credits) { + /* credits was reset by reconnect */ + rdata->credits = 0; + /* reduce in_flight value since we won't send the req */ + spin_lock(&server->req_lock); + server->in_flight--; + spin_unlock(&server->req_lock); + } return rc; + }
buf = (struct smb2_hdr *)rdata->iov.iov_base; /* 4 for rfc1002 length field */ rdata->iov.iov_len = get_rfc1002_length(rdata->iov.iov_base) + 4;
+ if (rdata->credits) { + buf->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes, + SMB2_MAX_BUFFER_SIZE)); + spin_lock(&server->req_lock); + server->credits += rdata->credits - + le16_to_cpu(buf->CreditCharge); + spin_unlock(&server->req_lock); + wake_up(&server->request_q); + flags = CIFS_HAS_CREDITS; + } + kref_get(&rdata->refcount); rc = cifs_call_async(io_parms.tcon->ses->server, &rqst, cifs_readv_receive, smb2_readv_callback, - rdata, 0); + rdata, flags); if (rc) { kref_put(&rdata->refcount, cifs_readdata_release); cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE); @@@ -1921,25 -1906,15 +1921,25 @@@ in smb2_async_writev(struct cifs_writedata *wdata, void (*release)(struct kref *kref)) { - int rc = -EACCES; + int rc = -EACCES, flags = 0; struct smb2_write_req *req = NULL; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; struct kvec iov; struct smb_rqst rqst;
rc = small_smb2_init(SMB2_WRITE, tcon, (void **) &req); - if (rc) + if (rc) { + if (rc == -EAGAIN && wdata->credits) { + /* credits was reset by reconnect */ + wdata->credits = 0; + /* reduce in_flight value since we won't send the req */ + spin_lock(&server->req_lock); + server->in_flight--; + spin_unlock(&server->req_lock); + } goto async_writev_out; + }
req->hdr.ProcessId = cpu_to_le32(wdata->cfile->pid);
@@@ -1972,20 -1947,9 +1972,20 @@@
inc_rfc1001_len(&req->hdr, wdata->bytes - 1 /* Buffer */);
+ if (wdata->credits) { + req->hdr.CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes, + SMB2_MAX_BUFFER_SIZE)); + spin_lock(&server->req_lock); + server->credits += wdata->credits - + le16_to_cpu(req->hdr.CreditCharge); + spin_unlock(&server->req_lock); + wake_up(&server->request_q); + flags = CIFS_HAS_CREDITS; + } + kref_get(&wdata->refcount); - rc = cifs_call_async(tcon->ses->server, &rqst, NULL, - smb2_writev_callback, wdata, 0); + rc = cifs_call_async(server, &rqst, NULL, smb2_writev_callback, wdata, + flags);
if (rc) { kref_put(&wdata->refcount, release); @@@ -2361,7 -2325,7 +2361,7 @@@ SMB2_set_hardlink(const unsigned int xi
int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, - u64 volatile_fid, u32 pid, __le64 *eof) + u64 volatile_fid, u32 pid, __le64 *eof, bool is_falloc) { struct smb2_file_eof_info info; void *data; @@@ -2372,12 -2336,8 +2372,12 @@@ data = &info; size = sizeof(struct smb2_file_eof_info);
- return send_set_info(xid, tcon, persistent_fid, volatile_fid, pid, - FILE_END_OF_FILE_INFORMATION, 1, &data, &size); + if (is_falloc) + return send_set_info(xid, tcon, persistent_fid, volatile_fid, + pid, FILE_ALLOCATION_INFORMATION, 1, &data, &size); + else + return send_set_info(xid, tcon, persistent_fid, volatile_fid, + pid, FILE_END_OF_FILE_INFORMATION, 1, &data, &size); }
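[Editor's note: the CreditCharge handling added to smb2_async_readv() and smb2_async_writev() above pairs with smb2_wait_mtu_credits(): a large request is charged one credit per buffer-sized chunk, and the waiter keeps one credit in reserve for a possible reopen. A minimal userspace model of that arithmetic (my own simplification, not kernel code; SMB2_MAX_BUFFER_SIZE assumed to be 64 KB, its usual value):]

	#include <stdio.h>

	#define SMB2_MAX_BUFFER_SIZE	(64 * 1024)
	#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

	/* Decide how many bytes one request may carry and how many credits
	 * it will be charged, given the credits currently available. */
	static unsigned int wait_mtu_credits(unsigned int avail, unsigned int want,
					     unsigned int *charge)
	{
		unsigned int len;

		if (avail == 1) {		/* can deadlock with reopen */
			*charge = 0;
			return SMB2_MAX_BUFFER_SIZE;
		}
		avail--;			/* leave one credit for a reopen */
		len = want < avail * SMB2_MAX_BUFFER_SIZE ?
			want : avail * SMB2_MAX_BUFFER_SIZE;
		*charge = DIV_ROUND_UP(len, SMB2_MAX_BUFFER_SIZE);
		return len;
	}

	int main(void)
	{
		unsigned int charge;
		unsigned int len = wait_mtu_credits(10, 1024 * 1024, &charge);

		printf("send %u bytes, CreditCharge %u\n", len, charge); /* 589824, 9 */
		return 0;
	}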
int diff --combined fs/exec.c index ab1f120,2ef2751..a2b42a9 --- a/fs/exec.c +++ b/fs/exec.c @@@ -368,10 -368,6 +368,6 @@@ static int bprm_mm_init(struct linux_bi if (!mm) goto err;
- err = init_new_context(current, mm); - if (err) - goto err; - err = __bprm_mm_init(bprm); if (err) goto err; @@@ -1216,7 -1212,7 +1212,7 @@@ EXPORT_SYMBOL(install_exec_creds) /* * determine how safe it is to execute the proposed program * - the caller must hold ->cred_guard_mutex to protect against - * PTRACE_ATTACH + * PTRACE_ATTACH or seccomp thread-sync */ static void check_unsafe_exec(struct linux_binprm *bprm) { @@@ -1234,7 -1230,7 +1230,7 @@@ * This isn't strictly necessary, but it makes it harder for LSMs to * mess up. */ - if (current->no_new_privs) + if (task_no_new_privs(current)) bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
t = p; @@@ -1272,7 -1268,7 +1268,7 @@@ int prepare_binprm(struct linux_binprm bprm->cred->egid = current_egid();
if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) && - !current->no_new_privs && + !task_no_new_privs(current) && kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) && kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) { /* Set-uid? */ diff --combined fs/fscache/main.c index a31b83c,3248c15..b39d487 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@@ -67,7 -67,7 +67,7 @@@ static int fscache_max_active_sysctl(st return ret; }
- struct ctl_table fscache_sysctls[] = { + static struct ctl_table fscache_sysctls[] = { { .procname = "object_max_active", .data = &fscache_object_max_active, @@@ -87,7 -87,7 +87,7 @@@ {} };
- struct ctl_table fscache_sysctls_root[] = { + static struct ctl_table fscache_sysctls_root[] = { { .procname = "fscache", .mode = 0555, @@@ -197,6 -197,24 +197,6 @@@ static void __exit fscache_exit(void module_exit(fscache_exit);
/* - * wait_on_bit() sleep function for uninterruptible waiting - */ -int fscache_wait_bit(void *flags) -{ - schedule(); - return 0; -} - -/* - * wait_on_bit() sleep function for interruptible waiting - */ -int fscache_wait_bit_interruptible(void *flags) -{ - schedule(); - return signal_pending(current); -} - -/* * wait_on_atomic_t() sleep function for uninterruptible waiting */ int fscache_wait_atomic_t(atomic_t *p) diff --combined fs/namespace.c index 7886176,2a1447c..0acabea --- a/fs/namespace.c +++ b/fs/namespace.c @@@ -798,7 -798,7 +798,7 @@@ static void commit_tree(struct mount *m list_splice(&head, n->list.prev);
if (shadows) - hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash); + hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash); else hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mnt->mnt_mountpoint)); @@@ -890,21 -890,8 +890,21 @@@ static struct mount *clone_mnt(struct m
mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); /* Don't allow unprivileged users to change mount flags */ - if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) - mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; + if (flag & CL_UNPRIVILEGED) { + mnt->mnt.mnt_flags |= MNT_LOCK_ATIME; + + if (mnt->mnt.mnt_flags & MNT_READONLY) + mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; + + if (mnt->mnt.mnt_flags & MNT_NODEV) + mnt->mnt.mnt_flags |= MNT_LOCK_NODEV; + + if (mnt->mnt.mnt_flags & MNT_NOSUID) + mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID; + + if (mnt->mnt.mnt_flags & MNT_NOEXEC) + mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC; + }
/* Don't allow unprivileged users to reveal what is under a mount */ if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire)) @@@ -1909,6 -1896,9 +1909,6 @@@ static int change_mount_flags(struct vf if (readonly_request == __mnt_is_readonly(mnt)) return 0;
- if (mnt->mnt_flags & MNT_LOCK_READONLY) - return -EPERM; - if (readonly_request) error = mnt_make_readonly(real_mount(mnt)); else @@@ -1934,33 -1924,6 +1934,33 @@@ static int do_remount(struct path *path if (path->dentry != path->mnt->mnt_root) return -EINVAL;
+ /* Don't allow changing of locked mnt flags. + * + * No locks need to be held here while testing the various + * MNT_LOCK flags because those flags can never be cleared + * once they are set. + */ + if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) && + !(mnt_flags & MNT_READONLY)) { + return -EPERM; + } + if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) && + !(mnt_flags & MNT_NODEV)) { + return -EPERM; + } + if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) && + !(mnt_flags & MNT_NOSUID)) { + return -EPERM; + } + if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) && + !(mnt_flags & MNT_NOEXEC)) { + return -EPERM; + } + if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) && + ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) { + return -EPERM; + } + err = security_sb_remount(sb, data); if (err) return err; @@@ -1974,7 -1937,7 +1974,7 @@@ err = do_remount_sb(sb, flags, data, 0); if (!err) { lock_mount_hash(); - mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK; + mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK; mnt->mnt.mnt_flags = mnt_flags; touch_mnt_namespace(mnt->mnt_ns); unlock_mount_hash(); @@@ -2159,7 -2122,7 +2159,7 @@@ static int do_new_mount(struct path *pa */ if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) { flags |= MS_NODEV; - mnt_flags |= MNT_NODEV; + mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV; } }
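[Editor's note: the intent of the MNT_LOCK_* checks added above can be shown with a tiny model: once a flag such as nodev has been locked (for example because a user namespace forced it on), a later remount that tries to clear it must be refused. Simplified userspace sketch with made-up flag values, not the real mount flag definitions:]

	#include <stdio.h>

	#define MNT_NODEV	0x01	/* made-up values for the sketch */
	#define MNT_LOCK_NODEV	0x02
	#define EPERM		1

	static int remount_allowed(unsigned int cur_flags, unsigned int new_flags)
	{
		if ((cur_flags & MNT_LOCK_NODEV) && !(new_flags & MNT_NODEV))
			return -EPERM;	/* a locked flag may not be cleared */
		return 0;
	}

	int main(void)
	{
		unsigned int cur = MNT_NODEV | MNT_LOCK_NODEV;

		printf("keep nodev: %d\n", remount_allowed(cur, MNT_NODEV)); /* 0 */
		printf("drop nodev: %d\n", remount_allowed(cur, 0));         /* -1 */
		return 0;
	}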
@@@ -2473,14 -2436,6 +2473,14 @@@ long do_mount(const char *dev_name, con if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY;
+ /* The default atime for remount is preservation */ + if ((flags & MS_REMOUNT) && + ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME | + MS_STRICTATIME)) == 0)) { + mnt_flags &= ~MNT_ATIME_MASK; + mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK; + } + flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | MS_STRICTATIME); @@@ -3017,13 -2972,13 +3017,13 @@@ static void *mntns_get(struct task_stru struct mnt_namespace *ns = NULL; struct nsproxy *nsproxy;
- rcu_read_lock(); - nsproxy = task_nsproxy(task); + task_lock(task); + nsproxy = task->nsproxy; if (nsproxy) { ns = nsproxy->mnt_ns; get_mnt_ns(ns); } - rcu_read_unlock(); + task_unlock(task);
return ns; } diff --combined fs/nilfs2/super.c index ac91499,48c9ce8..0584706 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@@ -942,7 -942,7 +942,7 @@@ static int nilfs_get_root_dentry(struc iput(inode); } } else { - dentry = d_obtain_alias(inode); + dentry = d_obtain_root(inode); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); goto failed_dentry; @@@ -1014,7 -1014,7 +1014,7 @@@ int nilfs_checkpoint_is_mounted(struct struct dentry *dentry; int ret;
- if (cno < 0 || cno > nilfs->ns_cno) + if (cno > nilfs->ns_cno) return false;
if (cno >= nilfs_last_cno(nilfs)) @@@ -1452,13 -1452,19 +1452,19 @@@ static int __init init_nilfs_fs(void if (err) goto fail;
- err = register_filesystem(&nilfs_fs_type); + err = nilfs_sysfs_init(); if (err) goto free_cachep;
+ err = register_filesystem(&nilfs_fs_type); + if (err) + goto deinit_sysfs_entry; + printk(KERN_INFO "NILFS version 2 loaded\n"); return 0;
+ deinit_sysfs_entry: + nilfs_sysfs_exit(); free_cachep: nilfs_destroy_cachep(); fail: @@@ -1468,6 -1474,7 +1474,7 @@@ static void __exit exit_nilfs_fs(void) { nilfs_destroy_cachep(); + nilfs_sysfs_exit(); unregister_filesystem(&nilfs_fs_type); }
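[Editor's note: the nilfs_sysfs_init()/nilfs_sysfs_exit() hunks above follow the usual module init pattern: set up in order, unwind in reverse order on failure and at exit. A compact standalone sketch of the same control flow, with stub functions standing in for the real nilfs hooks:]

	#include <stdio.h>

	static int  sysfs_init(void)     { puts("sysfs init");     return 0; }
	static void sysfs_exit(void)     { puts("sysfs exit"); }
	static int  register_fs(void)    { puts("register fs");    return -1; /* simulate failure */ }
	static void destroy_caches(void) { puts("destroy caches"); }

	static int init_fs_module(void)
	{
		int err;

		err = sysfs_init();
		if (err)
			goto free_cachep;

		err = register_fs();
		if (err)
			goto deinit_sysfs_entry;

		return 0;

	deinit_sysfs_entry:
		sysfs_exit();
	free_cachep:
		destroy_caches();
		return err;
	}

	int main(void)
	{
		return init_fs_module() ? 1 : 0;
	}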
diff --combined fs/proc/base.c index 0131156,043c83c..baf852b --- a/fs/proc/base.c +++ b/fs/proc/base.c @@@ -105,7 -105,7 +105,7 @@@ */
struct pid_entry { - char *name; + const char *name; int len; umode_t mode; const struct inode_operations *iop; @@@ -130,10 -130,6 +130,6 @@@ { .proc_get_link = get_link } ) #define REG(NAME, MODE, fops) \ NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {}) - #define INF(NAME, MODE, read) \ - NOD(NAME, (S_IFREG|(MODE)), \ - NULL, &proc_info_file_operations, \ - { .proc_read = read } ) #define ONE(NAME, MODE, show) \ NOD(NAME, (S_IFREG|(MODE)), \ NULL, &proc_single_file_operations, \ @@@ -200,27 -196,32 +196,32 @@@ static int proc_root_link(struct dentr return result; }
- static int proc_pid_cmdline(struct task_struct *task, char *buffer) + static int proc_pid_cmdline(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { - return get_cmdline(task, buffer, PAGE_SIZE); + /* + * Rely on struct seq_operations::show() being called once + * per internal buffer allocation. See single_open(), traverse(). + */ + BUG_ON(m->size < PAGE_SIZE); + m->count += get_cmdline(task, m->buf, PAGE_SIZE); + return 0; }
- static int proc_pid_auxv(struct task_struct *task, char *buffer) + static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ); - int res = PTR_ERR(mm); if (mm && !IS_ERR(mm)) { unsigned int nwords = 0; do { nwords += 2; } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ - res = nwords * sizeof(mm->saved_auxv[0]); - if (res > PAGE_SIZE) - res = PAGE_SIZE; - memcpy(buffer, mm->saved_auxv, res); + seq_write(m, mm->saved_auxv, nwords * sizeof(mm->saved_auxv[0])); mmput(mm); - } - return res; + return 0; + } else + return PTR_ERR(mm); }
@@@ -229,7 -230,8 +230,8 @@@ * Provides a wchan file via kallsyms in a proper one-value-per-file format. * Returns the resolved symbol. If that fails, simply return the address. */ - static int proc_pid_wchan(struct task_struct *task, char *buffer) + static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { unsigned long wchan; char symname[KSYM_NAME_LEN]; @@@ -240,9 -242,9 +242,9 @@@ if (!ptrace_may_access(task, PTRACE_MODE_READ)) return 0; else - return sprintf(buffer, "%lu", wchan); + return seq_printf(m, "%lu", wchan); else - return sprintf(buffer, "%s", symname); + return seq_printf(m, "%s", symname); } #endif /* CONFIG_KALLSYMS */
@@@ -304,9 -306,10 +306,10 @@@ static int proc_pid_stack(struct seq_fi /* * Provides /proc/PID/schedstat */ - static int proc_pid_schedstat(struct task_struct *task, char *buffer) + static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { - return sprintf(buffer, "%llu %llu %lu\n", + return seq_printf(m, "%llu %llu %lu\n", (unsigned long long)task->se.sum_exec_runtime, (unsigned long long)task->sched_info.run_delay, task->sched_info.pcount); @@@ -404,7 -407,8 +407,8 @@@ static const struct file_operations pro }; #endif
- static int proc_oom_score(struct task_struct *task, char *buffer) + static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { unsigned long totalpages = totalram_pages + total_swap_pages; unsigned long points = 0; @@@ -414,12 -418,12 +418,12 @@@ points = oom_badness(task, NULL, NULL, totalpages) * 1000 / totalpages; read_unlock(&tasklist_lock); - return sprintf(buffer, "%lu\n", points); + return seq_printf(m, "%lu\n", points); }
struct limit_names { - char *name; - char *unit; + const char *name; + const char *unit; };
static const struct limit_names lnames[RLIM_NLIMITS] = { @@@ -442,12 -446,11 +446,11 @@@ };
/* Display limits for a process */ - static int proc_pid_limits(struct task_struct *task, char *buffer) + static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { unsigned int i; - int count = 0; unsigned long flags; - char *bufptr = buffer;
struct rlimit rlim[RLIM_NLIMITS];
@@@ -459,35 -462,34 +462,34 @@@ /* * print the file header */ - count += sprintf(&bufptr[count], "%-25s %-20s %-20s %-10s\n", + seq_printf(m, "%-25s %-20s %-20s %-10s\n", "Limit", "Soft Limit", "Hard Limit", "Units");
for (i = 0; i < RLIM_NLIMITS; i++) { if (rlim[i].rlim_cur == RLIM_INFINITY) - count += sprintf(&bufptr[count], "%-25s %-20s ", + seq_printf(m, "%-25s %-20s ", lnames[i].name, "unlimited"); else - count += sprintf(&bufptr[count], "%-25s %-20lu ", + seq_printf(m, "%-25s %-20lu ", lnames[i].name, rlim[i].rlim_cur);
if (rlim[i].rlim_max == RLIM_INFINITY) - count += sprintf(&bufptr[count], "%-20s ", "unlimited"); + seq_printf(m, "%-20s ", "unlimited"); else - count += sprintf(&bufptr[count], "%-20lu ", - rlim[i].rlim_max); + seq_printf(m, "%-20lu ", rlim[i].rlim_max);
if (lnames[i].unit) - count += sprintf(&bufptr[count], "%-10s\n", - lnames[i].unit); + seq_printf(m, "%-10s\n", lnames[i].unit); else - count += sprintf(&bufptr[count], "\n"); + seq_putc(m, '\n'); }
- return count; + return 0; }
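[Editor's note: the proc_pid_limits() rewrite above is representative of the whole INF() -> ONE() conversion in this file: instead of formatting into a raw page with sprintf() and returning a byte count, each show routine now prints through a seq_file and returns 0 or -errno. A userspace approximation of the new shape, with FILE * standing in for struct seq_file:]

	#include <stdio.h>

	struct rlimit_row { const char *name; long cur; long max; const char *unit; };

	static int show_limits(FILE *m, const struct rlimit_row *r, int n)
	{
		int i;

		fprintf(m, "%-25s %-20s %-20s %-10s\n",
			"Limit", "Soft Limit", "Hard Limit", "Units");
		for (i = 0; i < n; i++)
			fprintf(m, "%-25s %-20ld %-20ld %-10s\n",
				r[i].name, r[i].cur, r[i].max, r[i].unit);
		return 0;		/* no byte count to track any more */
	}

	int main(void)
	{
		const struct rlimit_row rows[] = {
			{ "Max open files", 1024, 4096, "files" },
		};

		return show_limits(stdout, rows, 1);
	}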
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK - static int proc_pid_syscall(struct task_struct *task, char *buffer) + static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { long nr; unsigned long args[6], sp, pc; @@@ -496,11 -498,11 +498,11 @@@ return res;
if (task_current_syscall(task, &nr, args, 6, &sp, &pc)) - res = sprintf(buffer, "running\n"); + seq_puts(m, "running\n"); else if (nr < 0) - res = sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc); + seq_printf(m, "%ld 0x%lx 0x%lx\n", nr, sp, pc); else - res = sprintf(buffer, + seq_printf(m, "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", nr, args[0], args[1], args[2], args[3], args[4], args[5], @@@ -598,43 -600,6 +600,6 @@@ static const struct inode_operations pr .setattr = proc_setattr, };
- #define PROC_BLOCK_SIZE (3*1024) /* 4K page size but our output routines use some slack for overruns */ - - static ssize_t proc_info_read(struct file * file, char __user * buf, - size_t count, loff_t *ppos) - { - struct inode * inode = file_inode(file); - unsigned long page; - ssize_t length; - struct task_struct *task = get_proc_task(inode); - - length = -ESRCH; - if (!task) - goto out_no_task; - - if (count > PROC_BLOCK_SIZE) - count = PROC_BLOCK_SIZE; - - length = -ENOMEM; - if (!(page = __get_free_page(GFP_TEMPORARY))) - goto out; - - length = PROC_I(inode)->op.proc_read(task, (char*)page); - - if (length >= 0) - length = simple_read_from_buffer(buf, count, ppos, (char *)page, length); - free_page(page); - out: - put_task_struct(task); - out_no_task: - return length; - } - - static const struct file_operations proc_info_file_operations = { - .read = proc_info_read, - .llseek = generic_file_llseek, - }; - static int proc_single_show(struct seq_file *m, void *v) { struct inode *inode = m->private; @@@ -2056,7 -2021,7 +2021,7 @@@ static int show_timer(struct seq_file * struct k_itimer *timer; struct timers_private *tp = m->private; int notify; - static char *nstr[] = { + static const char * const nstr[] = { [SIGEV_SIGNAL] = "signal", [SIGEV_NONE] = "none", [SIGEV_THREAD] = "thread", @@@ -2392,7 -2357,7 +2357,7 @@@ static const struct file_operations pro #endif
#ifdef CONFIG_TASK_IO_ACCOUNTING - static int do_io_accounting(struct task_struct *task, char *buffer, int whole) + static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole) { struct task_io_accounting acct = task->ioac; unsigned long flags; @@@ -2416,7 -2381,7 +2381,7 @@@
unlock_task_sighand(task, &flags); } - result = sprintf(buffer, + result = seq_printf(m, "rchar: %llu\n" "wchar: %llu\n" "syscr: %llu\n" @@@ -2436,20 -2401,22 +2401,22 @@@ out_unlock return result; }
- static int proc_tid_io_accounting(struct task_struct *task, char *buffer) + static int proc_tid_io_accounting(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { - return do_io_accounting(task, buffer, 0); + return do_io_accounting(task, m, 0); }
- static int proc_tgid_io_accounting(struct task_struct *task, char *buffer) + static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { - return do_io_accounting(task, buffer, 1); + return do_io_accounting(task, m, 1); } #endif /* CONFIG_TASK_IO_ACCOUNTING */
#ifdef CONFIG_USER_NS static int proc_id_map_open(struct inode *inode, struct file *file, - struct seq_operations *seq_ops) + const struct seq_operations *seq_ops) { struct user_namespace *ns = NULL; struct task_struct *task; @@@ -2557,10 -2524,10 +2524,10 @@@ static const struct pid_entry tgid_base DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), #endif REG("environ", S_IRUSR, proc_environ_operations), - INF("auxv", S_IRUSR, proc_pid_auxv), + ONE("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), - INF("limits", S_IRUGO, proc_pid_limits), + ONE("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif @@@ -2569,9 -2536,9 +2536,9 @@@ #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK - INF("syscall", S_IRUSR, proc_pid_syscall), + ONE("syscall", S_IRUSR, proc_pid_syscall), #endif - INF("cmdline", S_IRUGO, proc_pid_cmdline), + ONE("cmdline", S_IRUGO, proc_pid_cmdline), ONE("stat", S_IRUGO, proc_tgid_stat), ONE("statm", S_IRUGO, proc_pid_statm), REG("maps", S_IRUGO, proc_pid_maps_operations), @@@ -2594,13 -2561,13 +2561,13 @@@ DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), #endif #ifdef CONFIG_KALLSYMS - INF("wchan", S_IRUGO, proc_pid_wchan), + ONE("wchan", S_IRUGO, proc_pid_wchan), #endif #ifdef CONFIG_STACKTRACE ONE("stack", S_IRUSR, proc_pid_stack), #endif #ifdef CONFIG_SCHEDSTATS - INF("schedstat", S_IRUGO, proc_pid_schedstat), + ONE("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP REG("latency", S_IRUGO, proc_lstats_operations), @@@ -2611,7 -2578,7 +2578,7 @@@ #ifdef CONFIG_CGROUPS REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif - INF("oom_score", S_IRUGO, proc_oom_score), + ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL @@@ -2625,10 -2592,10 +2592,10 @@@ REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUSR, proc_tgid_io_accounting), + ONE("io", S_IRUSR, proc_tgid_io_accounting), #endif #ifdef CONFIG_HARDWALL - INF("hardwall", S_IRUGO, proc_pid_hardwall), + ONE("hardwall", S_IRUGO, proc_pid_hardwall), #endif #ifdef CONFIG_USER_NS REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), @@@ -2780,12 -2747,12 +2747,12 @@@ out
struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { - int result = 0; + int result = -ENOENT; struct task_struct *task; unsigned tgid; struct pid_namespace *ns;
- tgid = name_to_int(dentry); + tgid = name_to_int(&dentry->d_name); if (tgid == ~0U) goto out;
@@@ -2847,7 -2814,7 +2814,7 @@@ retry return iter; }
-#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 1) +#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2)
/* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file *file, struct dir_context *ctx) @@@ -2859,19 -2826,14 +2826,19 @@@ if (pos >= PID_MAX_LIMIT + TGID_OFFSET) return 0;
- if (pos == TGID_OFFSET - 1) { + if (pos == TGID_OFFSET - 2) { struct inode *inode = ns->proc_self->d_inode; if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK)) return 0; - iter.tgid = 0; - } else { - iter.tgid = pos - TGID_OFFSET; + ctx->pos = pos = pos + 1; + } + if (pos == TGID_OFFSET - 1) { + struct inode *inode = ns->proc_thread_self->d_inode; + if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK)) + return 0; + ctx->pos = pos = pos + 1; } + iter.tgid = pos - TGID_OFFSET; iter.task = NULL; for (iter = next_tgid(ns, iter); iter.task; @@@ -2900,22 -2862,19 +2867,22 @@@ static const struct pid_entry tid_base_ DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), +#ifdef CONFIG_NET + DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), +#endif REG("environ", S_IRUSR, proc_environ_operations), - INF("auxv", S_IRUSR, proc_pid_auxv), + ONE("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), - INF("limits", S_IRUGO, proc_pid_limits), + ONE("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK - INF("syscall", S_IRUSR, proc_pid_syscall), + ONE("syscall", S_IRUSR, proc_pid_syscall), #endif - INF("cmdline", S_IRUGO, proc_pid_cmdline), + ONE("cmdline", S_IRUGO, proc_pid_cmdline), ONE("stat", S_IRUGO, proc_tid_stat), ONE("statm", S_IRUGO, proc_pid_statm), REG("maps", S_IRUGO, proc_tid_maps_operations), @@@ -2940,13 -2899,13 +2907,13 @@@ DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), #endif #ifdef CONFIG_KALLSYMS - INF("wchan", S_IRUGO, proc_pid_wchan), + ONE("wchan", S_IRUGO, proc_pid_wchan), #endif #ifdef CONFIG_STACKTRACE ONE("stack", S_IRUSR, proc_pid_stack), #endif #ifdef CONFIG_SCHEDSTATS - INF("schedstat", S_IRUGO, proc_pid_schedstat), + ONE("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP REG("latency", S_IRUGO, proc_lstats_operations), @@@ -2957,7 -2916,7 +2924,7 @@@ #ifdef CONFIG_CGROUPS REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif - INF("oom_score", S_IRUGO, proc_oom_score), + ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL @@@ -2968,10 -2927,10 +2935,10 @@@ REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUSR, proc_tid_io_accounting), + ONE("io", S_IRUSR, proc_tid_io_accounting), #endif #ifdef CONFIG_HARDWALL - INF("hardwall", S_IRUGO, proc_pid_hardwall), + ONE("hardwall", S_IRUGO, proc_pid_hardwall), #endif #ifdef CONFIG_USER_NS REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), @@@ -3041,7 -3000,7 +3008,7 @@@ static struct dentry *proc_task_lookup( if (!leader) goto out_no_task;
- tid = name_to_int(dentry); + tid = name_to_int(&dentry->d_name); if (tid == ~0U) goto out;
diff --combined fs/proc/internal.h index ee04619,a024cf7..7da13e4 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@@ -52,7 -52,6 +52,6 @@@ struct proc_dir_entry
union proc_op { int (*proc_get_link)(struct dentry *, struct path *); - int (*proc_read)(struct task_struct *task, char *page); int (*proc_show)(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); @@@ -112,10 -111,10 +111,10 @@@ static inline int task_dumpable(struct return 0; }
- static inline unsigned name_to_int(struct dentry *dentry) + static inline unsigned name_to_int(const struct qstr *qstr) { - const char *name = dentry->d_name.name; - int len = dentry->d_name.len; + const char *name = qstr->name; + int len = qstr->len; unsigned n = 0;
if (len > 1 && *name == '0') @@@ -178,8 -177,6 +177,6 @@@ extern bool proc_fill_cache(struct fil /* * generic.c */ - extern spinlock_t proc_subdir_lock; - extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *, struct dentry *); @@@ -234,12 -231,6 +231,12 @@@ static inline int proc_net_init(void) extern int proc_setup_self(struct super_block *);
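[Editor's note: for readers following the name_to_int() change above (it now takes a name/length pair via struct qstr instead of a dentry), the helper turns a /proc directory entry name into a pid or tid, rejecting leading zeroes and anything non-numeric. A self-contained approximation of that parsing logic:]

	#include <stdio.h>

	static unsigned name_to_int(const char *name, int len)
	{
		unsigned n = 0;

		if (len > 1 && *name == '0')
			return ~0U;		/* "007" is not a valid pid name */
		while (len-- > 0) {
			unsigned c = *name++ - '0';

			if (c > 9)
				return ~0U;	/* non-digit */
			if (n >= (~0U - 9) / 10)
				return ~0U;	/* would overflow */
			n = n * 10 + c;
		}
		return n;
	}

	int main(void)
	{
		printf("%u %u %u\n", name_to_int("42", 2),
		       name_to_int("042", 3), name_to_int("self", 4));
		return 0;
	}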
/* + * proc_thread_self.c + */ +extern int proc_setup_thread_self(struct super_block *); +extern void proc_thread_self_init(void); + +/* * proc_sysctl.c */ #ifdef CONFIG_PROC_SYSCTL diff --combined fs/proc/root.c index 92c12c2,574bafc..6296c76 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@@ -149,8 -149,6 +149,8 @@@ static void proc_kill_sb(struct super_b ns = (struct pid_namespace *)sb->s_fs_info; if (ns->proc_self) dput(ns->proc_self); + if (ns->proc_thread_self) + dput(ns->proc_thread_self); kill_anon_super(sb); put_pid_ns(ns); } @@@ -172,8 -170,7 +172,8 @@@ void __init proc_root_init(void return;
proc_self_init(); - proc_symlink("mounts", NULL, "self/mounts"); + proc_thread_self_init(); + proc_symlink("mounts", NULL, "thread-self/mounts");
proc_net_init();
@@@ -202,10 -199,10 +202,10 @@@ static int proc_root_getattr(struct vfs
static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) { - if (!proc_lookup(dir, dentry, flags)) + if (!proc_pid_lookup(dir, dentry, flags)) return NULL; - return proc_pid_lookup(dir, dentry, flags); + return proc_lookup(dir, dentry, flags); }
static int proc_root_readdir(struct file *file, struct dir_context *ctx) diff --combined fs/reiserfs/do_balan.c index 4d5e529,5739cb9..9c02d96 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@@ -10,7 -10,7 +10,7 @@@ * and using buffers obtained after all above. */
- #include <asm/uaccess.h> + #include <linux/uaccess.h> #include <linux/time.h> #include "reiserfs.h" #include <linux/buffer_head.h> @@@ -286,14 -286,12 +286,14 @@@ static int balance_leaf_when_delete(str return 0; }
-static void balance_leaf_insert_left(struct tree_balance *tb, - struct item_head *ih, const char *body) +static unsigned int balance_leaf_insert_left(struct tree_balance *tb, + struct item_head *const ih, + const char * const body) { int ret; struct buffer_info bi; int n = B_NR_ITEMS(tb->L[0]); + unsigned body_shift_bytes = 0;
if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) { /* part of new item falls into L[0] */ @@@ -331,7 -329,7 +331,7 @@@
put_ih_item_len(ih, new_item_len); if (tb->lbytes > tb->zeroes_num) { - body += (tb->lbytes - tb->zeroes_num); + body_shift_bytes = tb->lbytes - tb->zeroes_num; tb->zeroes_num = 0; } else tb->zeroes_num -= tb->lbytes; @@@ -351,12 -349,11 +351,12 @@@ tb->insert_size[0] = 0; tb->zeroes_num = 0; } + return body_shift_bytes; }
static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb, - struct item_head *ih, - const char *body) + struct item_head * const ih, + const char * const body) { int n = B_NR_ITEMS(tb->L[0]); struct buffer_info bi; @@@ -416,18 -413,17 +416,18 @@@ tb->pos_in_item -= tb->lbytes; }
-static void balance_leaf_paste_left_shift(struct tree_balance *tb, - struct item_head *ih, - const char *body) +static unsigned int balance_leaf_paste_left_shift(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); int n = B_NR_ITEMS(tb->L[0]); struct buffer_info bi; + int body_shift_bytes = 0;
if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) { balance_leaf_paste_left_shift_dirent(tb, ih, body); - return; + return 0; }
RFALSE(tb->lbytes <= 0, @@@ -501,7 -497,7 +501,7 @@@ * insert_size[0] */ if (l_n > tb->zeroes_num) { - body += (l_n - tb->zeroes_num); + body_shift_bytes = l_n - tb->zeroes_num; tb->zeroes_num = 0; } else tb->zeroes_num -= l_n; @@@ -530,14 -526,13 +530,14 @@@ */ leaf_shift_left(tb, tb->lnum[0], tb->lbytes); } + return body_shift_bytes; }
/* appended item will be in L[0] in whole */ static void balance_leaf_paste_left_whole(struct tree_balance *tb, - struct item_head *ih, - const char *body) + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); int n = B_NR_ITEMS(tb->L[0]); @@@ -589,44 -584,39 +589,44 @@@ tb->zeroes_num = 0; }
-static void balance_leaf_paste_left(struct tree_balance *tb, - struct item_head *ih, const char *body) +static unsigned int balance_leaf_paste_left(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) { /* we must shift the part of the appended item */ if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) - balance_leaf_paste_left_shift(tb, ih, body); + return balance_leaf_paste_left_shift(tb, ih, body); else balance_leaf_paste_left_whole(tb, ih, body); + return 0; }
/* Shift lnum[0] items from S[0] to the left neighbor L[0] */ -static void balance_leaf_left(struct tree_balance *tb, struct item_head *ih, - const char *body, int flag) +static unsigned int balance_leaf_left(struct tree_balance *tb, + struct item_head * const ih, + const char * const body, int flag) { if (tb->lnum[0] <= 0) - return; + return 0;
/* new item or it part falls to L[0], shift it too */ if (tb->item_pos < tb->lnum[0]) { BUG_ON(flag != M_INSERT && flag != M_PASTE);
if (flag == M_INSERT) - balance_leaf_insert_left(tb, ih, body); + return balance_leaf_insert_left(tb, ih, body); else /* M_PASTE */ - balance_leaf_paste_left(tb, ih, body); + return balance_leaf_paste_left(tb, ih, body); } else /* new item doesn't fall into L[0] */ leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + return 0; }
static void balance_leaf_insert_right(struct tree_balance *tb, - struct item_head *ih, const char *body) + struct item_head * const ih, + const char * const body) {
struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); @@@ -714,8 -704,7 +714,8 @@@
static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb, - struct item_head *ih, const char *body) + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); struct buffer_info bi; @@@ -765,8 -754,7 +765,8 @@@ }
static void balance_leaf_paste_right_shift(struct tree_balance *tb, - struct item_head *ih, const char *body) + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); int n_shift, n_rem, r_zeroes_number, version; @@@ -843,8 -831,7 +843,8 @@@ }
static void balance_leaf_paste_right_whole(struct tree_balance *tb, - struct item_head *ih, const char *body) + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); int n = B_NR_ITEMS(tbS0); @@@ -887,8 -874,7 +887,8 @@@ }
static void balance_leaf_paste_right(struct tree_balance *tb, - struct item_head *ih, const char *body) + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); int n = B_NR_ITEMS(tbS0); @@@ -910,9 -896,8 +910,9 @@@ }
/* shift rnum[0] items from S[0] to the right neighbor R[0] */ -static void balance_leaf_right(struct tree_balance *tb, struct item_head *ih, - const char *body, int flag) +static void balance_leaf_right(struct tree_balance *tb, + struct item_head * const ih, + const char * const body, int flag) { if (tb->rnum[0] <= 0) return; @@@ -926,8 -911,8 +926,8 @@@ }
static void balance_leaf_new_nodes_insert(struct tree_balance *tb, - struct item_head *ih, - const char *body, + struct item_head * const ih, + const char * const body, struct item_head *insert_key, struct buffer_head **insert_ptr, int i) @@@ -1018,8 -1003,8 +1018,8 @@@
/* we append to directory item */ static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb, - struct item_head *ih, - const char *body, + struct item_head * const ih, + const char * const body, struct item_head *insert_key, struct buffer_head **insert_ptr, int i) @@@ -1073,8 -1058,8 +1073,8 @@@ }
static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb, - struct item_head *ih, - const char *body, + struct item_head * const ih, + const char * const body, struct item_head *insert_key, struct buffer_head **insert_ptr, int i) @@@ -1146,8 -1131,8 +1146,8 @@@ }
static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb, - struct item_head *ih, - const char *body, + struct item_head * const ih, + const char * const body, struct item_head *insert_key, struct buffer_head **insert_ptr, int i) @@@ -1199,8 -1184,8 +1199,8 @@@
} static void balance_leaf_new_nodes_paste(struct tree_balance *tb, - struct item_head *ih, - const char *body, + struct item_head * const ih, + const char * const body, struct item_head *insert_key, struct buffer_head **insert_ptr, int i) @@@ -1229,8 -1214,8 +1229,8 @@@
/* Fill new nodes that appear in place of S[0] */ static void balance_leaf_new_nodes(struct tree_balance *tb, - struct item_head *ih, - const char *body, + struct item_head * const ih, + const char * const body, struct item_head *insert_key, struct buffer_head **insert_ptr, int flag) @@@ -1269,8 -1254,8 +1269,8 @@@ }
static void balance_leaf_finish_node_insert(struct tree_balance *tb, - struct item_head *ih, - const char *body) + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); struct buffer_info bi; @@@ -1286,8 -1271,8 +1286,8 @@@ }
static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb, - struct item_head *ih, - const char *body) + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); struct item_head *pasted = item_head(tbS0, tb->item_pos); @@@ -1320,8 -1305,8 +1320,8 @@@ }
static void balance_leaf_finish_node_paste(struct tree_balance *tb, - struct item_head *ih, - const char *body) + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); struct buffer_info bi; @@@ -1364,8 -1349,8 +1364,8 @@@ * of the affected item which remains in S */ static void balance_leaf_finish_node(struct tree_balance *tb, - struct item_head *ih, - const char *body, int flag) + struct item_head * const ih, + const char * const body, int flag) { /* if we must insert or append into buffer S[0] */ if (0 <= tb->item_pos && tb->item_pos < tb->s0num) { @@@ -1417,7 -1402,7 +1417,7 @@@ static int balance_leaf(struct tree_bal && is_indirect_le_ih(item_head(tbS0, tb->item_pos))) tb->pos_in_item *= UNFM_P_SIZE;
- balance_leaf_left(tb, ih, body, flag); + body += balance_leaf_left(tb, ih, body, flag);
/* tb->lnum[0] > 0 */ /* Calculate new item position */ diff --combined fs/reiserfs/lbalance.c index 3a74d15,814dda3..249594a --- a/fs/reiserfs/lbalance.c +++ b/fs/reiserfs/lbalance.c @@@ -2,7 -2,7 +2,7 @@@ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README */
- #include <asm/uaccess.h> + #include <linux/uaccess.h> #include <linux/string.h> #include <linux/time.h> #include "reiserfs.h" @@@ -899,9 -899,8 +899,9 @@@ void leaf_delete_items(struct buffer_in
/* insert item into the leaf node in position before */ void leaf_insert_into_buf(struct buffer_info *bi, int before, - struct item_head *inserted_item_ih, - const char *inserted_item_body, int zeros_number) + struct item_head * const inserted_item_ih, + const char * const inserted_item_body, + int zeros_number) { struct buffer_head *bh = bi->bi_bh; int nr, free_space; diff --combined include/linux/fs.h index 2daccaf,8b4a021..1ab6c69 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@@ -833,7 -833,7 +833,7 @@@ static inline struct file *get_file(str * * Lockd stuffs a "host" pointer into this. */ -typedef struct files_struct *fl_owner_t; +typedef void *fl_owner_t;
struct file_lock_operations { void (*fl_copy_lock)(struct file_lock *, struct file_lock *); @@@ -2688,7 -2688,7 +2688,7 @@@ static const struct file_operations __f .read = simple_attr_read, \ .write = simple_attr_write, \ .llseek = generic_file_llseek, \ - }; + }
static inline __printf(1, 2) void __simple_attr_check_format(const char *fmt, ...) diff --combined include/linux/kernel.h index a9e2268,44a498d..e989204 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@@ -470,6 -470,7 +470,7 @@@ extern enum system_states #define TAINT_FIRMWARE_WORKAROUND 11 #define TAINT_OOT_MODULE 12 #define TAINT_UNSIGNED_MODULE 13 + #define TAINT_SOFTLOCKUP 14
extern const char hex_asc[]; #define hex_asc_lo(x) hex_asc[((x) & 0x0f)] @@@ -493,15 -494,10 +494,10 @@@ static inline char *hex_byte_pack_upper return buf; }
extern int hex_to_bin(char ch); extern int __must_check hex2bin(u8 *dst, const char *src, size_t count);
-int mac_pton(const char *s, u8 *mac); +bool mac_pton(const char *s, u8 *mac);
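[Editor's note: mac_pton() above now reports success as a bool instead of an int. A simplified userspace stand-in that shows the shape of the interface; this is not the kernel implementation, only a sketch of a "xx:xx:xx:xx:xx:xx" parser:]

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	static int hex_val(char ch)
	{
		if (ch >= '0' && ch <= '9') return ch - '0';
		if (ch >= 'a' && ch <= 'f') return ch - 'a' + 10;
		if (ch >= 'A' && ch <= 'F') return ch - 'A' + 10;
		return -1;
	}

	static bool mac_pton(const char *s, uint8_t *mac)
	{
		int i;

		if (strlen(s) < 17)		/* "xx:xx:xx:xx:xx:xx" */
			return false;
		for (i = 0; i < 6; i++) {
			int hi = hex_val(s[i * 3]);
			int lo = hex_val(s[i * 3 + 1]);

			if (hi < 0 || lo < 0)
				return false;
			if (i < 5 && s[i * 3 + 2] != ':')
				return false;
			mac[i] = (uint8_t)(hi << 4 | lo);
		}
		return true;
	}

	int main(void)
	{
		uint8_t mac[6];

		printf("%d\n", mac_pton("00:1a:2b:3c:4d:5e", mac)); /* 1 */
		printf("%d\n", mac_pton("not a mac", mac));          /* 0 */
		return 0;
	}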
/* * General tracing related utility functions - trace_printk(), @@@ -719,23 -715,8 +715,8 @@@ static inline void ftrace_dump(enum ftr (void) (&_max1 == &_max2); \ _max1 > _max2 ? _max1 : _max2; })
- #define min3(x, y, z) ({ \ - typeof(x) _min1 = (x); \ - typeof(y) _min2 = (y); \ - typeof(z) _min3 = (z); \ - (void) (&_min1 == &_min2); \ - (void) (&_min1 == &_min3); \ - _min1 < _min2 ? (_min1 < _min3 ? _min1 : _min3) : \ - (_min2 < _min3 ? _min2 : _min3); }) - - #define max3(x, y, z) ({ \ - typeof(x) _max1 = (x); \ - typeof(y) _max2 = (y); \ - typeof(z) _max3 = (z); \ - (void) (&_max1 == &_max2); \ - (void) (&_max1 == &_max3); \ - _max1 > _max2 ? (_max1 > _max3 ? _max1 : _max3) : \ - (_max2 > _max3 ? _max2 : _max3); }) + #define min3(x, y, z) min((typeof(x))min(x, y), z) + #define max3(x, y, z) max((typeof(x))max(x, y), z)
/** * min_not_zero - return the minimum that is _not_ zero, unless both are zero @@@ -750,20 -731,13 +731,13 @@@ /** * clamp - return a value clamped to a given range with strict typechecking * @val: current value - * @min: minimum allowable value - * @max: maximum allowable value + * @lo: lowest allowable value + * @hi: highest allowable value * * This macro does strict typechecking of min/max to make sure they are of the * same type as val. See the unnecessary pointer comparisons. */ - #define clamp(val, min, max) ({ \ - typeof(val) __val = (val); \ - typeof(min) __min = (min); \ - typeof(max) __max = (max); \ - (void) (&__val == &__min); \ - (void) (&__val == &__max); \ - __val = __val < __min ? __min: __val; \ - __val > __max ? __max: __val; }) + #define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)
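[Editor's note: the simplified min3/max3/clamp definitions above work by nesting the existing strictly typed min()/max(). A quick standalone check of the resulting behaviour; it uses GNU C statement expressions, so it needs gcc or clang, and min()/max() are re-declared locally for the demo:]

	#include <stdio.h>

	#define min(x, y) ({ typeof(x) _x = (x); typeof(y) _y = (y); \
			     (void)(&_x == &_y); _x < _y ? _x : _y; })
	#define max(x, y) ({ typeof(x) _x = (x); typeof(y) _y = (y); \
			     (void)(&_x == &_y); _x > _y ? _x : _y; })

	#define min3(x, y, z) min((typeof(x))min(x, y), z)
	#define max3(x, y, z) max((typeof(x))max(x, y), z)
	#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)

	int main(void)
	{
		printf("min3(7, 3, 5)    = %d\n", min3(7, 3, 5));    /* 3 */
		printf("max3(7, 3, 5)    = %d\n", max3(7, 3, 5));    /* 7 */
		printf("clamp(12, 0, 10) = %d\n", clamp(12, 0, 10)); /* 10 */
		return 0;
	}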
/* * ..and if you can't take the strict diff --combined include/linux/mm_types.h index 796deac,21bff4b..6e0b286 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@@ -461,6 -461,7 +461,7 @@@ static inline void mm_init_cpumask(stru #ifdef CONFIG_CPUMASK_OFFSTACK mm->cpu_vm_mask_var = &mm->cpumask_allocation; #endif + cpumask_clear(mm->cpu_vm_mask_var); }
/* Future-safe accessor for struct mm_struct's cpu_vm_mask. */ @@@ -516,12 -517,4 +517,12 @@@ struct vm_special_mappin struct page **pages; };
+enum tlb_flush_reason { + TLB_FLUSH_ON_TASK_SWITCH, + TLB_REMOTE_SHOOTDOWN, + TLB_LOCAL_SHOOTDOWN, + TLB_LOCAL_MM_SHOOTDOWN, + NR_TLB_FLUSH_REASONS, +}; + #endif /* _LINUX_MM_TYPES_H */ diff --combined include/linux/scatterlist.h index f4ec8bb,4b152c8..ed8f9e7 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@@ -136,7 -136,7 +136,7 @@@ static inline void sg_set_buf(struct sc static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents, struct scatterlist *sgl) { - #ifndef ARCH_HAS_SG_CHAIN + #ifndef CONFIG_ARCH_HAS_SG_CHAIN BUG(); #endif
@@@ -229,10 -229,10 +229,10 @@@ void sg_init_one(struct scatterlist *, typedef struct scatterlist *(sg_alloc_fn)(unsigned int, gfp_t); typedef void (sg_free_fn)(struct scatterlist *, unsigned int);
-void __sg_free_table(struct sg_table *, unsigned int, sg_free_fn *); +void __sg_free_table(struct sg_table *, unsigned int, bool, sg_free_fn *); void sg_free_table(struct sg_table *); -int __sg_alloc_table(struct sg_table *, unsigned int, unsigned int, gfp_t, - sg_alloc_fn *); +int __sg_alloc_table(struct sg_table *, unsigned int, unsigned int, + struct scatterlist *, gfp_t, sg_alloc_fn *); int sg_alloc_table(struct sg_table *, unsigned int, gfp_t); int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages, unsigned int n_pages, diff --combined include/linux/sched.h index a555f37,b9d5364..857ba40 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@@ -33,6 -33,7 +33,7 @@@ struct sched_param
#include <linux/smp.h> #include <linux/sem.h> + #include <linux/shm.h> #include <linux/signal.h> #include <linux/compiler.h> #include <linux/completion.h> @@@ -813,7 -814,7 +814,7 @@@ struct task_delay_info * associated with the operation is added to XXX_delay. * XXX_delay contains the accumulated delay time in nanoseconds. */ - struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */ + u64 blkio_start; /* Shared by blkio, swapin */ u64 blkio_delay; /* wait for sync block io completion */ u64 swapin_delay; /* wait for swapin block io completion */ u32 blkio_count; /* total count of the number of sync block */ @@@ -821,7 -822,7 +822,7 @@@ u32 swapin_count; /* total count of the number of swapin block */ /* io operations performed */
- struct timespec freepages_start, freepages_end; + u64 freepages_start; u64 freepages_delay; /* wait for memory reclaim */ u32 freepages_count; /* total count of memory reclaim */ }; @@@ -1270,6 -1271,9 +1271,6 @@@ struct task_struct #ifdef CONFIG_TREE_PREEMPT_RCU struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -#ifdef CONFIG_RCU_BOOST - struct rt_mutex *rcu_boost_mutex; -#endif /* #ifdef CONFIG_RCU_BOOST */
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; @@@ -1304,12 -1308,13 +1305,12 @@@ * execve */ unsigned in_iowait:1;
- /* task may not gain privileges */ - unsigned no_new_privs:1; - /* Revert to default priority/policy when forking */ unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1;
+ unsigned long atomic_flags; /* Flags needing atomic access. */ + pid_t pid; pid_t tgid;
@@@ -1363,8 -1368,8 +1364,8 @@@ } vtime_snap_whence; #endif unsigned long nvcsw, nivcsw; /* context switch counts */ - struct timespec start_time; /* monotonic time */ - struct timespec real_start_time; /* boot based time */ + u64 start_time; /* monotonic time in nsec */ + u64 real_start_time; /* boot based time in nsec */ /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ unsigned long min_flt, maj_flt;
@@@ -1385,6 -1390,7 +1386,7 @@@ #ifdef CONFIG_SYSVIPC /* ipc stuff */ struct sysv_sem sysvsem; + struct sysv_shm sysvshm; #endif #ifdef CONFIG_DETECT_HUNG_TASK /* hung task detection */ @@@ -1436,6 -1442,8 +1438,6 @@@ struct rb_node *pi_waiters_leftmost; /* Deadlock detection and priority inheritance handling */ struct rt_mutex_waiter *pi_blocked_on; - /* Top pi_waiters task */ - struct task_struct *pi_top_task; #endif
#ifdef CONFIG_DEBUG_MUTEXES @@@ -1628,12 -1636,6 +1630,6 @@@ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */ - struct memcg_batch_info { - int do_batch; /* incremented when batch uncharge started */ - struct mem_cgroup *memcg; /* target memcg of uncharge */ - unsigned long nr_pages; /* uncharged usage */ - unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ - } memcg_batch; unsigned int memcg_kmem_skip_account; struct memcg_oom_info { struct mem_cgroup *memcg; @@@ -1961,19 -1963,6 +1957,19 @@@ static inline void memalloc_noio_restor current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags; }
+/* Per-process atomic flags. */ +#define PFA_NO_NEW_PRIVS 0x00000001 /* May not gain new privileges. */ + +static inline bool task_no_new_privs(struct task_struct *p) +{ + return test_bit(PFA_NO_NEW_PRIVS, &p->atomic_flags); +} + +static inline void task_set_no_new_privs(struct task_struct *p) +{ + set_bit(PFA_NO_NEW_PRIVS, &p->atomic_flags); +} + /* * task->jobctl flags */ @@@ -2016,6 -2005,9 +2012,6 @@@ static inline void rcu_copy_process(str #ifdef CONFIG_TREE_PREEMPT_RCU p->rcu_blocked_node = NULL; #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -#ifdef CONFIG_RCU_BOOST - p->rcu_boost_mutex = NULL; -#endif /* #ifdef CONFIG_RCU_BOOST */ INIT_LIST_HEAD(&p->rcu_node_entry); }
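[Editor's note: the accessors above move no_new_privs out of the packed bitfield into the new atomic_flags word so the bit can be updated atomically (the nearby "seccomp thread-sync" comment hints at why). A userspace model of the same idea, using the GCC/clang __atomic builtins in place of the kernel's set_bit()/test_bit(); the bit number is chosen arbitrarily for the sketch:]

	#include <stdio.h>

	#define PFA_NO_NEW_PRIVS 0	/* bit number used by this sketch */

	struct task_sketch {
		unsigned long atomic_flags;
	};

	static void task_set_no_new_privs(struct task_sketch *p)
	{
		__atomic_fetch_or(&p->atomic_flags, 1UL << PFA_NO_NEW_PRIVS,
				  __ATOMIC_SEQ_CST);
	}

	static int task_no_new_privs(struct task_sketch *p)
	{
		return (__atomic_load_n(&p->atomic_flags, __ATOMIC_SEQ_CST) >>
			PFA_NO_NEW_PRIVS) & 1;
	}

	int main(void)
	{
		struct task_sketch t = { 0 };

		printf("before: %d\n", task_no_new_privs(&t));	/* 0 */
		task_set_no_new_privs(&t);
		printf("after:  %d\n", task_no_new_privs(&t));	/* 1 */
		return 0;
	}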
@@@ -2364,10 -2356,8 +2360,10 @@@ static inline int on_sig_stack(unsigne
static inline int sas_ss_flags(unsigned long sp) { - return (current->sas_ss_size == 0 ? SS_DISABLE - : on_sig_stack(sp) ? SS_ONSTACK : 0); + if (!current->sas_ss_size) + return SS_DISABLE; + + return on_sig_stack(sp) ? SS_ONSTACK : 0; }
static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig) @@@ -2794,7 -2784,7 +2790,7 @@@ static inline bool __must_check current
/* * Polling state must be visible before we test NEED_RESCHED, - * paired by resched_task() + * paired by resched_curr() */ smp_mb__after_atomic();
@@@ -2812,7 -2802,7 +2808,7 @@@ static inline bool __must_check current
/* * Polling state must be visible before we test NEED_RESCHED, - * paired by resched_task() + * paired by resched_curr() */ smp_mb__after_atomic();
@@@ -2844,7 -2834,7 +2840,7 @@@ static inline void current_clr_polling( * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also * fold. */ - smp_mb(); /* paired with resched_task() */ + smp_mb(); /* paired with resched_curr() */
preempt_fold_need_resched(); } @@@ -2969,15 -2959,10 +2965,10 @@@ static inline void inc_syscw(struct tas
#ifdef CONFIG_MEMCG extern void mm_update_next_owner(struct mm_struct *mm); - extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); #else static inline void mm_update_next_owner(struct mm_struct *mm) { } - - static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p) - { - } #endif /* CONFIG_MEMCG */
static inline unsigned long task_rlimit(const struct task_struct *tsk, diff --combined include/scsi/scsi.h index e6df23c,d34cf2d..261e708 --- a/include/scsi/scsi.h +++ b/include/scsi/scsi.h @@@ -31,7 -31,7 +31,7 @@@ enum scsi_timeouts * Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit * is totally arbitrary, a setting of 2048 will get you at least 8mb ios. */ - #ifdef ARCH_HAS_SG_CHAIN + #ifdef CONFIG_ARCH_HAS_SG_CHAIN #define SCSI_MAX_SG_CHAIN_SEGMENTS 2048 #else #define SCSI_MAX_SG_CHAIN_SEGMENTS SCSI_MAX_SG_SEGMENTS @@@ -332,7 -332,6 +332,7 @@@ static inline int scsi_status_is_good(i #define TYPE_ENCLOSURE 0x0d /* Enclosure Services Device */ #define TYPE_RBC 0x0e #define TYPE_OSD 0x11 +#define TYPE_ZBC 0x14 #define TYPE_NO_LUN 0x7f
/* SCSI protocols; these are taken from SPC-3 section 7.5 */ @@@ -386,7 -385,7 +386,7 @@@ struct scsi_lun #define SCSI_W_LUN_ACCESS_CONTROL (SCSI_W_LUN_BASE + 2) #define SCSI_W_LUN_TARGET_LOG_PAGE (SCSI_W_LUN_BASE + 3)
-static inline int scsi_is_wlun(unsigned int lun) +static inline int scsi_is_wlun(u64 lun) { return (lun & 0xff00) == SCSI_W_LUN_BASE; } diff --combined init/Kconfig index 85fb985,77dc4cb..d3ef635 --- a/init/Kconfig +++ b/init/Kconfig @@@ -505,7 -505,7 +505,7 @@@ config PREEMPT_RC def_bool TREE_PREEMPT_RCU help This option enables preemptible-RCU code that is common between - the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations. + TREE_PREEMPT_RCU and, in the old days, TINY_PREEMPT_RCU.
config RCU_STALL_COMMON def_bool ( TREE_RCU || TREE_PREEMPT_RCU || RCU_TRACE ) @@@ -737,7 -737,7 +737,7 @@@ choic
config RCU_NOCB_CPU_NONE bool "No build_forced no-CBs CPUs" - depends on RCU_NOCB_CPU && !NO_HZ_FULL + depends on RCU_NOCB_CPU && !NO_HZ_FULL_ALL help This option does not force any of the CPUs to be no-CBs CPUs. Only CPUs designated by the rcu_nocbs= boot parameter will be @@@ -751,7 -751,7 +751,7 @@@
config RCU_NOCB_CPU_ZERO bool "CPU 0 is a build_forced no-CBs CPU" - depends on RCU_NOCB_CPU && !NO_HZ_FULL + depends on RCU_NOCB_CPU && !NO_HZ_FULL_ALL help This option forces CPU 0 to be a no-CBs CPU, so that its RCU callbacks are invoked by a per-CPU kthread whose name begins @@@ -807,15 -807,53 +807,53 @@@ config LOG_BUF_SHIF range 12 21 default 17 help - Select kernel log buffer size as a power of 2. + Select the minimal kernel log buffer size as a power of 2. + The final size is affected by LOG_CPU_MAX_BUF_SHIFT config + parameter, see below. Any higher size also might be forced + by "log_buf_len" boot parameter. + Examples: - 17 => 128 KB + 17 => 128 KB 16 => 64 KB - 15 => 32 KB - 14 => 16 KB + 15 => 32 KB + 14 => 16 KB 13 => 8 KB 12 => 4 KB
+ config LOG_CPU_MAX_BUF_SHIFT + int "CPU kernel log buffer size contribution (13 => 8 KB, 17 => 128KB)" + range 0 21 + default 12 if !BASE_SMALL + default 0 if BASE_SMALL + help + This option allows increasing the default ring buffer size + according to the number of CPUs. The value defines the contribution + of each CPU as a power of 2. The used space is typically only a few + lines; however, it might be much more when problems are reported, + e.g. backtraces. + + The increased size means that a new buffer has to be allocated and + the original static one is unused. It makes sense only on systems + with more CPUs. Therefore this value is used only when the sum of + contributions is greater than half of the default kernel ring + buffer as defined by LOG_BUF_SHIFT. The default values are set + so that more than 64 CPUs are needed to trigger the allocation. + + Also, this option is ignored when the "log_buf_len" kernel parameter is + used, as it forces an exact (power of two) size of the ring buffer. + + The number of possible CPUs is used for this computation, ignoring + hotplugging, making the computation optimal for the worst case + scenario while allowing a simple algorithm to be used from bootup. + + Example shift values and their meaning: + 17 => 128 KB for each CPU + 16 => 64 KB for each CPU + 15 => 32 KB for each CPU + 14 => 16 KB for each CPU + 13 => 8 KB for each CPU + 12 => 4 KB for each CPU + # # Architectures with an unreliable sched_clock() should select this: # @@@ -1264,77 -1302,6 +1302,77 @@@ config CC_OPTIMIZE_FOR_SIZ
If unsure, say N.
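For reference, a minimal standalone sketch of the sizing rule described in the LOG_CPU_MAX_BUF_SHIFT help text above; the helper name and its standalone form are illustrative assumptions, not the kernel's actual setup_log_buf() path.

	/*
	 * Illustrative only: how the per-CPU contribution described above
	 * extends the static ring buffer chosen by LOG_BUF_SHIFT.
	 */
	static unsigned long sketch_log_buf_len(unsigned int possible_cpus,
						unsigned int log_buf_shift,
						unsigned int cpu_buf_shift)
	{
		unsigned long base = 1UL << log_buf_shift;
		unsigned long cpu_extra = (possible_cpus - 1) * (1UL << cpu_buf_shift);

		/* The per-CPU extra is used only when it exceeds half of
		 * the static default; otherwise the static buffer stays. */
		if (cpu_extra <= base / 2)
			return base;
		return base + cpu_extra;
	}

In other words, with the defaults the dynamic buffer is only allocated once the summed per-CPU contribution outgrows half of the LOG_BUF_SHIFT buffer.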
+config LTO_MENU + bool "Enable gcc link time optimization (LTO)" + # Only tested on X86 for now. For other architectures you likely + # have to fix some things first, like adding asmlinkages etc. + depends on X86 + # lto does not support excluding flags for specific files + # right now. Can be removed if that is fixed. + depends on !FUNCTION_TRACER + help + With this option gcc will do whole program optimizations for + the whole kernel and its modules. This increases compile time, but can + lead to better code. It allows gcc to inline functions between + different files and do other optimizations. It might also trigger + bugs due to more aggressive optimization. It allows gcc to drop unused + code. On smaller monolithic kernel configurations + it usually leads to smaller kernels, especially when modules + are disabled. + + With this option gcc will also do some global checking over + different source files. It also disables a number of kernel + features. + + This option is recommended for release builds. With LTO + the kernel always has to be re-optimized (but not re-parsed) + on each build. + + This requires a gcc 4.8 or later compiler and + Linux binutils 2.21.51.0.3 or later. gcc 4.9 builds significantly + faster than 4.8. It does not currently work with an FSF release of + binutils or with the gold linker. + + On larger configurations this may need more than 4GB of RAM. + It will likely not work with a 32-bit compiler. + + When toolchain support is not available, this will (hopefully) + be disabled automatically. + + For more information see Documentation/lto-build + +config LTO_DISABLE + bool "Disable LTO again" + depends on LTO_MENU + default n + help + This option is merely here so that allyesconfig or allmodconfig do + not enable LTO. If you actually want to use LTO, do not enable this option. + +config LTO + bool + default y + depends on LTO_MENU && !LTO_DISABLE + +config LTO_DEBUG + bool "Enable LTO compile time debugging" + depends on LTO + help + Enable LTO debugging in the compiler. The compiler dumps + some log files that make it easier to figure out LTO + behavior. The log files also allow reconstructing + the global inlining decisions and a global callgraph. + However, they add some (single-threaded) cost to the + compilation. When in doubt, do not enable. + +config LTO_CP_CLONE + bool "Allow aggressive cloning for function specialization" + depends on LTO + help + Allow the compiler to clone and specialize functions for specific + arguments when it determines these arguments are very commonly + used. Experimental. Will increase text size. + config SYSCTL bool
@@@ -1834,8 -1801,6 +1872,8 @@@ config MODULE_FORCE_UNLOA
config MODVERSIONS bool "Module versioning support" + # LTO should work with gcc 4.9 + depends on !LTO help Usually, you have to use modules compiled with your kernel. Saying Y here makes it sometimes possible to use modules diff --combined kernel/acct.c index a1844f1,1bfdda0..5179352 --- a/kernel/acct.c +++ b/kernel/acct.c @@@ -141,12 -141,12 +141,12 @@@ static int check_free_space(struct bsd_ if (acct->active) { if (act < 0) { acct->active = 0; - printk(KERN_INFO "Process accounting paused\n"); + pr_info("Process accounting paused\n"); } } else { if (act > 0) { acct->active = 1; - printk(KERN_INFO "Process accounting resumed\n"); + pr_info("Process accounting resumed\n"); } }
@@@ -261,6 -261,7 +261,7 @@@ SYSCALL_DEFINE1(acct, const char __use
if (name) { struct filename *tmp = getname(name); + if (IS_ERR(tmp)) return PTR_ERR(tmp); error = acct_on(tmp); @@@ -376,7 -377,7 +377,7 @@@ static comp_t encode_comp_t(unsigned lo return exp; }
- #if ACCT_VERSION==1 || ACCT_VERSION==2 + #if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* * encode an u64 into a comp2_t (24 bits) * @@@ -389,7 -390,7 +390,7 @@@ #define MANTSIZE2 20 /* 20 bit mantissa. */ #define EXPSIZE2 5 /* 5 bit base 2 exponent. */ #define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */ - #define MAXEXP2 ((1 <<EXPSIZE2) - 1) /* Maximum exponent. */ + #define MAXEXP2 ((1 << EXPSIZE2) - 1) /* Maximum exponent. */
static comp2_t encode_comp2_t(u64 value) { @@@ -420,7 -421,7 +421,7 @@@ } #endif
- #if ACCT_VERSION==3 + #if ACCT_VERSION == 3 /* * encode an u64 into a 32 bit IEEE float */ @@@ -429,8 -430,9 +430,9 @@@ static u32 encode_float(u64 value unsigned exp = 190; unsigned u;
- if (value==0) return 0; - while ((s64)value > 0){ + if (value == 0) + return 0; + while ((s64)value > 0) { value <<= 1; exp--; } @@@ -458,7 -460,9 +460,7 @@@ static void do_acct_process(struct bsd_ acct_t ac; mm_segment_t fs; unsigned long flim; - u64 elapsed; - u64 run_time; - struct timespec uptime; + u64 elapsed, run_time; struct tty_struct *tty; const struct cred *orig_cred;
@@@ -482,20 -486,23 +484,21 @@@ strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
/* calculate run_time in nsec*/ - do_posix_clock_monotonic_gettime(&uptime); - run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec; - run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC - + current->group_leader->start_time.tv_nsec; + run_time = ktime_get_ns(); + run_time -= current->group_leader->start_time; /* convert nsec -> AHZ */ elapsed = nsec_to_AHZ(run_time); - #if ACCT_VERSION==3 + #if ACCT_VERSION == 3 ac.ac_etime = encode_float(elapsed); #else ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? - (unsigned long) elapsed : (unsigned long) -1l); + (unsigned long) elapsed : (unsigned long) -1l); #endif - #if ACCT_VERSION==1 || ACCT_VERSION==2 + #if ACCT_VERSION == 1 || ACCT_VERSION == 2 { /* new enlarged etime field */ comp2_t etime = encode_comp2_t(elapsed); + ac.ac_etime_hi = etime >> 16; ac.ac_etime_lo = (u16) etime; } @@@ -505,15 -512,15 +508,15 @@@ /* we really need to bite the bullet and change layout */ ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); - #if ACCT_VERSION==2 + #if ACCT_VERSION == 2 ac.ac_ahz = AHZ; #endif - #if ACCT_VERSION==1 || ACCT_VERSION==2 + #if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* backward-compatible 16 bit fields */ ac.ac_uid16 = ac.ac_uid; ac.ac_gid16 = ac.ac_gid; #endif - #if ACCT_VERSION==3 + #if ACCT_VERSION == 3 ac.ac_pid = task_tgid_nr_ns(current, ns); rcu_read_lock(); ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); @@@ -574,6 -581,7 +577,7 @@@ void acct_collect(long exitcode, int gr
if (group_dead && current->mm) { struct vm_area_struct *vma; + down_read(¤t->mm->mmap_sem); vma = current->mm->mmap; while (vma) { diff --combined kernel/fork.c index fbd3497,735ea98..fa91243 --- a/kernel/fork.c +++ b/kernel/fork.c @@@ -315,15 -315,6 +315,15 @@@ static struct task_struct *dup_task_str goto free_ti;
tsk->stack = ti; +#ifdef CONFIG_SECCOMP + /* + * We must handle setting up seccomp filters once we're under + * the sighand lock in case orig has changed between now and + * then. Until then, filter must be NULL to avoid messing up + * the usage counts on the error path calling free_task. + */ + tsk->seccomp.filter = NULL; +#endif
setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); @@@ -374,12 -365,11 +374,11 @@@ static int dup_mmap(struct mm_struct *m */ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
- mm->locked_vm = 0; - mm->mmap = NULL; - mm->vmacache_seqnum = 0; - mm->map_count = 0; - cpumask_clear(mm_cpumask(mm)); - mm->mm_rb = RB_ROOT; + mm->total_vm = oldmm->total_vm; + mm->shared_vm = oldmm->shared_vm; + mm->exec_vm = oldmm->exec_vm; + mm->stack_vm = oldmm->stack_vm; + rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; @@@ -536,19 -526,37 +535,37 @@@ static void mm_init_aio(struct mm_struc #endif }
+ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) + { + #ifdef CONFIG_MEMCG + mm->owner = p; + #endif + } + static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) { + mm->mmap = NULL; + mm->mm_rb = RB_ROOT; + mm->vmacache_seqnum = 0; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); + mm->map_count = 0; + mm->locked_vm = 0; + mm->pinned_vm = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); + mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); + mmu_notifier_mm_init(mm); clear_tlb_flush_pending(mm); + #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS + mm->pmd_huge_pte = NULL; + #endif
if (current->mm) { mm->flags = current->mm->flags & MMF_INIT_MASK; @@@ -558,11 -566,17 +575,17 @@@ mm->def_flags = 0; }
- if (likely(!mm_alloc_pgd(mm))) { - mmu_notifier_mm_init(mm); - return mm; - } + if (mm_alloc_pgd(mm)) + goto fail_nopgd; + + if (init_new_context(p, mm)) + goto fail_nocontext; + + return mm;
+ fail_nocontext: + mm_free_pgd(mm); + fail_nopgd: free_mm(mm); return NULL; } @@@ -596,7 -610,6 +619,6 @@@ struct mm_struct *mm_alloc(void return NULL;
memset(mm, 0, sizeof(*mm)); - mm_init_cpumask(mm); return mm_init(mm, current); }
@@@ -828,17 -841,10 +850,10 @@@ static struct mm_struct *dup_mm(struct goto fail_nomem;
memcpy(mm, oldmm, sizeof(*mm)); - mm_init_cpumask(mm);
if (!mm_init(mm, tsk)) goto fail_nomem;
- if (init_new_context(tsk, mm)) - goto fail_nocontext; - dup_mm_exe_file(oldmm, mm);
err = dup_mmap(mm, oldmm); @@@ -860,15 -866,6 +875,6 @@@ free_pt
fail_nomem: return NULL; - - fail_nocontext: - /* - * If init_new_context() failed, we cannot use mmput() to free the mm - * because it calls destroy_context() - */ - mm_free_pgd(mm); - free_mm(mm); - return NULL; }
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) @@@ -1090,39 -1087,6 +1096,39 @@@ static int copy_signal(unsigned long cl return 0; }
+static void copy_seccomp(struct task_struct *p) +{ +#ifdef CONFIG_SECCOMP + /* + * Must be called with sighand->lock held, which is common to + * all threads in the group. Holding cred_guard_mutex is not + * needed because this new task is not yet running and cannot + * be racing exec. + */ + BUG_ON(!spin_is_locked(¤t->sighand->siglock)); + + /* Ref-count the new filter user, and assign it. */ + get_seccomp_filter(current); + p->seccomp = current->seccomp; + + /* + * Explicitly enable no_new_privs here in case it got set + * between the task_struct being duplicated and holding the + * sighand lock. The seccomp state and nnp must be in sync. + */ + if (task_no_new_privs(current)) + task_set_no_new_privs(p); + + /* + * If the parent gained a seccomp mode after copying thread + * flags and between before we held the sighand lock, we have + * to manually enable the seccomp thread flag here. + */ + if (p->seccomp.mode != SECCOMP_MODE_DISABLED) + set_tsk_thread_flag(p, TIF_SECCOMP); +#endif +} + SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) { current->clear_child_tid = tidptr; @@@ -1137,16 -1101,10 +1143,9 @@@ static void rt_mutex_init_task(struct t p->pi_waiters = RB_ROOT; p->pi_waiters_leftmost = NULL; p->pi_blocked_on = NULL; #endif }
- #ifdef CONFIG_MEMCG - void mm_init_owner(struct mm_struct *mm, struct task_struct *p) - { - mm->owner = p; - } - #endif /* CONFIG_MEMCG */ - /* * Initialize POSIX timer handling for a single task. */ @@@ -1237,6 -1195,7 +1236,6 @@@ static struct task_struct *copy_process goto fork_out;
ftrace_graph_init_task(p); - get_seccomp_filter(p);
rt_mutex_init_task(p);
@@@ -1302,8 -1261,9 +1301,8 @@@
posix_cpu_timers_init(p);
- do_posix_clock_monotonic_gettime(&p->start_time); - p->real_start_time = p->start_time; - monotonic_to_bootbased(&p->real_start_time); + p->start_time = ktime_get_ns(); + p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; p->audit_context = NULL; if (clone_flags & CLONE_THREAD) @@@ -1346,10 -1306,6 +1345,6 @@@ #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif - #ifdef CONFIG_MEMCG - p->memcg_batch.do_batch = 0; - p->memcg_batch.memcg = NULL; - #endif #ifdef CONFIG_BCACHE p->sequential_io = 0; p->sequential_io_avg = 0; @@@ -1367,6 -1323,7 +1362,7 @@@ if (retval) goto bad_fork_cleanup_policy; /* copy all the process information */ + shm_init_task(p); retval = copy_semundo(clone_flags, p); if (retval) goto bad_fork_cleanup_audit; @@@ -1476,12 -1433,6 +1472,12 @@@ spin_lock(¤t->sighand->siglock);
/* + * Copy seccomp details explicitly here, in case they were changed + * before holding sighand lock. + */ + copy_seccomp(p); + + /* * Process group and session signals need to be delivered to just the * parent before the fork or both the parent and the child after the * fork. Restart if a signal comes in before we add the new process to @@@ -1918,6 -1869,11 +1914,11 @@@ SYSCALL_DEFINE1(unshare, unsigned long */ exit_sem(current); } + if (unshare_flags & CLONE_NEWIPC) { + /* Orphan segments in old ns (see sem above). */ + exit_shm(current); + shm_init_task(current); + }
if (new_nsproxy) switch_task_namespaces(current, new_nsproxy); diff --combined lib/Kconfig index a8a775730,fdf90f3..2accc79 --- a/lib/Kconfig +++ b/lib/Kconfig @@@ -177,6 -177,13 +177,13 @@@ config CRC when they need to do cyclic redundancy check according CRC8 algorithm. Module will be called crc8.
+ config CRC64_ECMA + tristate "CRC64 ECMA function" + help + This option provides the CRC64 ECMA function. Drivers may select this + when they need to do a cyclic redundancy check according to the CRC64 + ECMA algorithm. + config AUDIT_GENERIC bool depends on AUDIT && !AUDIT_ARCH @@@ -396,6 -403,39 +403,39 @@@ config CPU_RMA config DQL bool
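The crc64_ecma() API itself is not visible in this hunk, so its exact kernel signature is not assumed here. As a reference for the algorithm the option describes, below is a self-contained, bit-at-a-time CRC-64/ECMA-182 sketch (polynomial 0x42F0E1EBA9EA3693, zero initial value, no final XOR); the helper name is illustrative only.

	#include <linux/types.h>

	#define CRC64_ECMA182_POLY 0x42F0E1EBA9EA3693ULL

	/* Illustrative bitwise CRC-64/ECMA-182 over a byte buffer. */
	static u64 sketch_crc64_ecma(const u8 *data, size_t len)
	{
		u64 crc = 0;
		size_t i;
		int bit;

		for (i = 0; i < len; i++) {
			crc ^= (u64)data[i] << 56;
			for (bit = 0; bit < 8; bit++) {
				if (crc & (1ULL << 63))
					crc = (crc << 1) ^ CRC64_ECMA182_POLY;
				else
					crc <<= 1;
			}
		}
		return crc;
	}

A table-driven version would be used in practice; the bitwise form is just the shortest way to state the polynomial division.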
+ config GLOB + bool + # This actually supports modular compilation, but the module overhead + # is ridiculous for the amount of code involved. Until an out-of-tree + # driver asks for it, we'll just link it directly into the kernel + # when required. Since we're ignoring out-of-tree users, there's also + # no need to bother prompting for a manual decision: + # prompt "glob_match() function" + help + This option provides a glob_match function for performing + simple text pattern matching. It originated in the ATA code + to blacklist particular drive models, but other device drivers + may need similar functionality. + + All drivers in the Linux kernel tree that require this function + should automatically select this option. Say N unless you + are compiling an out-of-tree driver which tells you that it + depends on this. + + config GLOB_SELFTEST + bool "glob self-test on init" + default n + depends on GLOB + help + This option enables a simple self-test of the glob_match + function on startup. It is primarily useful for people + working on the code to ensure they haven't introduced any + regressions. + + It only adds a little bit of code and slows kernel boot (or + module load) by a small amount, so you're welcome to play with + it, but you probably don't need it. + # # Netlink attribute parsing support is select'ed if needed # @@@ -451,8 -491,7 +491,8 @@@ config MPILI
config SIGNATURE tristate - depends on KEYS && CRYPTO + depends on KEYS + select CRYPTO select CRYPTO_SHA1 select MPILIB help @@@ -475,4 -514,11 +515,11 @@@ config UCS2_STRIN
source "lib/fonts/Kconfig"
+ # + # sg chaining option + # + + config ARCH_HAS_SG_CHAIN + def_bool n + endmenu diff --combined lib/Kconfig.debug index df5661c,fd939e1..0d1bc8d --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@@ -15,7 -15,7 +15,7 @@@ config PRINTK_TIM The behavior is also controlled by the kernel command line parameter printk.time=1. See Documentation/kernel-parameters.txt
- config DEFAULT_MESSAGE_LOGLEVEL + config MESSAGE_LOGLEVEL_DEFAULT int "Default message log level (1-7)" range 1 7 default "4" @@@ -143,30 -143,6 +143,30 @@@ config DEBUG_INFO_REDUCE DEBUG_INFO build and compile times are reduced too. Only works with newer gcc versions.
+config DEBUG_INFO_SPLIT + bool "Produce split debuginfo in .dwo files" + depends on DEBUG_INFO + help + Generate debug info into separate .dwo files. This significantly + reduces the build directory size for builds with DEBUG_INFO, + because it stores the information only once on disk in .dwo + files instead of multiple times in object files and executables. + In addition the debug information is also compressed. + + Requires recent gcc (4.7+) and recent gdb/binutils. + Any tool that packages or reads debug information would need + to know about the .dwo files and include them. + Incompatible with older versions of ccache. + +config DEBUG_INFO_DWARF4 + bool "Generate dwarf4 debuginfo" + depends on DEBUG_INFO + help + Generate dwarf4 debug info. This requires recent versions + of gcc and gdb. It makes the debug information larger. + But it significantly improves the success of resolving + variables in gdb on optimized code. + config ENABLE_WARN_DEPRECATED bool "Enable __deprecated logic" default y @@@ -204,7 -180,7 +204,7 @@@ config STRIP_ASM_SYM
config READABLE_ASM bool "Generate readable assembler code" - depends on DEBUG_KERNEL + depends on DEBUG_KERNEL && !LTO help Disable some compiler optimizations that tend to generate human unreadable assembler output. This may make the kernel slightly slower, but it helps @@@ -859,7 -835,7 +859,7 @@@ config DEBUG_RT_MUTEXE
config RT_MUTEX_TESTER bool "Built-in scriptable tester for rt-mutexes" - depends on DEBUG_KERNEL && RT_MUTEXES + depends on DEBUG_KERNEL && RT_MUTEXES && BROKEN help This option enables a rt-mutex tester.
@@@ -1155,6 -1131,20 +1155,6 @@@ config PROVE_RCU_REPEATEDL
Say N if you are unsure.
-config PROVE_RCU_DELAY - bool "RCU debugging: preemptible RCU race provocation" - depends on DEBUG_KERNEL && PREEMPT_RCU - default n - help - There is a class of races that involve an unlikely preemption - of __rcu_read_unlock() just after ->rcu_read_lock_nesting has - been set to INT_MIN. This feature inserts a delay at that - point to increase the probability of these races. - - Say Y to increase probability of preemption of __rcu_read_unlock(). - - Say N if you are unsure. - config SPARSE_RCU_POINTER bool "RCU debugging: sparse-based checks for pointer usage" default n @@@ -1560,14 -1550,6 +1560,14 @@@ config TEST_STRING_HELPER config TEST_KSTRTOX tristate "Test kstrto*() family of functions at runtime"
+config TEST_RHASHTABLE + bool "Perform selftest on resizable hash table" + default n + help + Enable this option to test the rhashtable functions at boot. + + If unsure, say N. + endmenu # runtime tests
config PROVIDE_OHCI1394_DMA_INIT @@@ -1667,28 -1649,6 +1667,28 @@@ config TEST_BP
If unsure, say N.
+config TEST_FIRMWARE + tristate "Test firmware loading via userspace interface" + default n + depends on FW_LOADER + help + This builds the "test_firmware" module that creates a userspace + interface for testing firmware loading. This can be used to + control the triggering of firmware loading without needing an + actual firmware-using device. The contents can be rechecked by + userspace. + + If unsure, say N. + +config TEST_UDELAY + tristate "udelay test driver" + default n + help + This builds the "udelay_test" module that helps to make sure + that udelay() is working properly. + + If unsure, say N. + source "samples/Kconfig"
source "lib/Kconfig.kgdb" diff --combined lib/Makefile index 8427df9,e48067c..b73c3c3 --- a/lib/Makefile +++ b/lib/Makefile @@@ -26,7 -26,7 +26,7 @@@ obj-y += bcd.o div64.o sort.o parser.o bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o clz_ctz.o \ bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \ - percpu-refcount.o percpu_ida.o hash.o + percpu-refcount.o percpu_ida.o hash.o rhashtable.o obj-y += string_helpers.o obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o obj-y += kstrtox.o @@@ -34,7 -34,6 +34,7 @@@ obj-$(CONFIG_TEST_KSTRTOX) += test-kstr obj-$(CONFIG_TEST_MODULE) += test_module.o obj-$(CONFIG_TEST_USER_COPY) += test_user_copy.o obj-$(CONFIG_TEST_BPF) += test_bpf.o +obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o
ifeq ($(CONFIG_DEBUG_KOBJECT),y) CFLAGS_kobject.o += -DDEBUG @@@ -72,6 -71,7 +72,7 @@@ obj-$(CONFIG_CRC32) += crc32. obj-$(CONFIG_CRC7) += crc7.o obj-$(CONFIG_LIBCRC32C) += libcrc32c.o obj-$(CONFIG_CRC8) += crc8.o + obj-$(CONFIG_CRC64_ECMA) += crc64_ecma.o obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/ @@@ -137,6 -137,8 +138,8 @@@ obj-$(CONFIG_CORDIC) += cordic.
obj-$(CONFIG_DQL) += dynamic_queue_limits.o
+ obj-$(CONFIG_GLOB) += glob.o + obj-$(CONFIG_MPILIB) += mpi/ obj-$(CONFIG_SIGNATURE) += digsig.o
diff --combined lib/scatterlist.c index b4415fc,4251cbd..9cdf62f --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@@ -73,7 -73,7 +73,7 @@@ EXPORT_SYMBOL(sg_nents) **/ struct scatterlist *sg_last(struct scatterlist *sgl, unsigned int nents) { - #ifndef ARCH_HAS_SG_CHAIN + #ifndef CONFIG_ARCH_HAS_SG_CHAIN struct scatterlist *ret = &sgl[nents - 1]; #else struct scatterlist *sg, *ret = NULL; @@@ -165,7 -165,6 +165,7 @@@ static void sg_kfree(struct scatterlis * __sg_free_table - Free a previously mapped sg table * @table: The sg table header to use * @max_ents: The maximum number of entries per single scatterlist + * @skip_first_chunk: don't free the (preallocated) first scatterlist chunk * @free_fn: Free function * * Description: @@@ -175,7 -174,7 +175,7 @@@ * **/ void __sg_free_table(struct sg_table *table, unsigned int max_ents, - sg_free_fn *free_fn) + bool skip_first_chunk, sg_free_fn *free_fn) { struct scatterlist *sgl, *next;
@@@ -203,10 -202,7 +203,10 @@@ }
table->orig_nents -= sg_size; - free_fn(sgl, alloc_size); + if (!skip_first_chunk) { + free_fn(sgl, alloc_size); + skip_first_chunk = false; + } sgl = next; }
@@@ -221,7 -217,7 +221,7 @@@ EXPORT_SYMBOL(__sg_free_table) **/ void sg_free_table(struct sg_table *table) { - __sg_free_table(table, SG_MAX_SINGLE_ALLOC, sg_kfree); + __sg_free_table(table, SG_MAX_SINGLE_ALLOC, false, sg_kfree); } EXPORT_SYMBOL(sg_free_table);
@@@ -245,8 -241,8 +245,8 @@@ * **/ int __sg_alloc_table(struct sg_table *table, unsigned int nents, - unsigned int max_ents, gfp_t gfp_mask, - sg_alloc_fn *alloc_fn) + unsigned int max_ents, struct scatterlist *first_chunk, + gfp_t gfp_mask, sg_alloc_fn *alloc_fn) { struct scatterlist *sg, *prv; unsigned int left; @@@ -255,7 -251,7 +255,7 @@@
if (nents == 0) return -EINVAL; - #ifndef ARCH_HAS_SG_CHAIN + #ifndef CONFIG_ARCH_HAS_SG_CHAIN if (WARN_ON_ONCE(nents > max_ents)) return -EINVAL; #endif @@@ -273,12 -269,7 +273,12 @@@
left -= sg_size;
- sg = alloc_fn(alloc_size, gfp_mask); + if (first_chunk) { + sg = first_chunk; + first_chunk = NULL; + } else { + sg = alloc_fn(alloc_size, gfp_mask); + } if (unlikely(!sg)) { /* * Adjust entry count to reflect that the last @@@ -333,9 -324,9 +333,9 @@@ int sg_alloc_table(struct sg_table *tab int ret;
ret = __sg_alloc_table(table, nents, SG_MAX_SINGLE_ALLOC, - gfp_mask, sg_kmalloc); + NULL, gfp_mask, sg_kmalloc); if (unlikely(ret)) - __sg_free_table(table, SG_MAX_SINGLE_ALLOC, sg_kfree); + __sg_free_table(table, SG_MAX_SINGLE_ALLOC, false, sg_kfree);
return ret; } diff --combined mm/filemap.c index 65d44fd,02b3b10d..f501b56 --- a/mm/filemap.c +++ b/mm/filemap.c @@@ -31,6 -31,7 +31,7 @@@ #include <linux/security.h> #include <linux/cpuset.h> #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ + #include <linux/hugetlb.h> #include <linux/memcontrol.h> #include <linux/cleancache.h> #include <linux/rmap.h> @@@ -233,7 -234,6 +234,6 @@@ void delete_from_page_cache(struct pag spin_lock_irq(&mapping->tree_lock); __delete_from_page_cache(page, NULL); spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_cache_page(page);
if (freepage) freepage(page); @@@ -241,6 -241,18 +241,6 @@@ } EXPORT_SYMBOL(delete_from_page_cache);
-static int sleep_on_page(void *word) -{ - io_schedule(); - return 0; -} - -static int sleep_on_page_killable(void *word) -{ - sleep_on_page(word); - return fatal_signal_pending(current) ? -EINTR : 0; -} - static int filemap_check_errors(struct address_space *mapping) { int ret = 0; @@@ -489,8 -501,7 +489,7 @@@ int replace_page_cache_page(struct pag if (PageSwapBacked(new)) __inc_zone_page_state(new, NR_SHMEM); spin_unlock_irq(&mapping->tree_lock); - /* mem_cgroup codes must not be called under tree_lock */ - mem_cgroup_replace_page_cache(old, new); + mem_cgroup_migrate(old, new, true); radix_tree_preload_end(); if (freepage) freepage(old); @@@ -548,19 -559,24 +547,24 @@@ static int __add_to_page_cache_locked(s pgoff_t offset, gfp_t gfp_mask, void **shadowp) { + int huge = PageHuge(page); + struct mem_cgroup *memcg; int error;
VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapBacked(page), page);
- error = mem_cgroup_charge_file(page, current->mm, - gfp_mask & GFP_RECLAIM_MASK); - if (error) - return error; + if (!huge) { + error = mem_cgroup_try_charge(page, current->mm, + gfp_mask, &memcg); + if (error) + return error; + }
error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); if (error) { - mem_cgroup_uncharge_cache_page(page); + if (!huge) + mem_cgroup_cancel_charge(page, memcg); return error; }
@@@ -575,13 -591,16 +579,16 @@@ goto err_insert; __inc_zone_page_state(page, NR_FILE_PAGES); spin_unlock_irq(&mapping->tree_lock); + if (!huge) + mem_cgroup_commit_charge(page, memcg, false); trace_mm_filemap_add_to_page_cache(page); return 0; err_insert: page->mapping = NULL; /* Leave page->index set: truncation relies upon it */ spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_cache_page(page); + if (!huge) + mem_cgroup_cancel_charge(page, memcg); page_cache_release(page); return error; } @@@ -680,7 -699,7 +687,7 @@@ void wait_on_page_bit(struct page *page DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
if (test_bit(bit_nr, &page->flags)) - __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page, + __wait_on_bit(page_waitqueue(page), &wait, bit_wait_io, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(wait_on_page_bit); @@@ -693,7 -712,7 +700,7 @@@ int wait_on_page_bit_killable(struct pa return 0;
return __wait_on_bit(page_waitqueue(page), &wait, - sleep_on_page_killable, TASK_KILLABLE); + bit_wait_io, TASK_KILLABLE); }
/** @@@ -794,7 -813,7 +801,7 @@@ void __lock_page(struct page *page { DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
- __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page, + __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__lock_page); @@@ -804,10 -823,21 +811,21 @@@ int __lock_page_killable(struct page *p DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
return __wait_on_bit_lock(page_waitqueue(page), &wait, - sleep_on_page_killable, TASK_KILLABLE); + bit_wait_io, TASK_KILLABLE); } EXPORT_SYMBOL_GPL(__lock_page_killable);
+ /* + * Return values: + * 1 - page is locked; mmap_sem is still held. + * 0 - page is not locked. + * mmap_sem has been released (up_read()), unless flags had both + * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in + * which case mmap_sem is still held. + * + * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1 + * with the page locked and the mmap_sem unperturbed. + */ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags) { @@@ -1091,9 -1121,9 +1109,9 @@@ no_page if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) fgp_flags |= FGP_LOCK;
- /* Init accessed so avoit atomic mark_page_accessed later */ + /* Init accessed so avoid atomic mark_page_accessed later */ if (fgp_flags & FGP_ACCESSED) - init_page_accessed(page); + __SetPageReferenced(page);
err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask); if (unlikely(err)) { @@@ -1827,6 -1857,18 +1845,18 @@@ static void do_async_mmap_readahead(str * The goto's are kind of ugly, but this streamlines the normal case of having * it in the page cache, and handles the special cases reasonably without * having a lot of duplicated code. + * + * vma->vm_mm->mmap_sem must be held on entry. + * + * If our return value has VM_FAULT_RETRY set, it's because + * lock_page_or_retry() returned 0. + * The mmap_sem has usually been released in this case. + * See __lock_page_or_retry() for the exception. + * + * If our return value does not have VM_FAULT_RETRY set, the mmap_sem + * has not been released. + * + * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. */ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { diff --combined mm/memcontrol.c index f009a14,cd1e90b..475ecadd --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@@ -754,9 -754,11 +754,11 @@@ static void __mem_cgroup_remove_exceede static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, struct mem_cgroup_tree_per_zone *mctz) { - spin_lock(&mctz->lock); + unsigned long flags; + + spin_lock_irqsave(&mctz->lock, flags); __mem_cgroup_remove_exceeded(mz, mctz); - spin_unlock(&mctz->lock); + spin_unlock_irqrestore(&mctz->lock, flags); }
@@@ -779,7 -781,9 +781,9 @@@ static void mem_cgroup_update_tree(stru * mem is over its softlimit. */ if (excess || mz->on_tree) { - spin_lock(&mctz->lock); + unsigned long flags; + + spin_lock_irqsave(&mctz->lock, flags); /* if on-tree, remove it */ if (mz->on_tree) __mem_cgroup_remove_exceeded(mz, mctz); @@@ -788,7 -792,7 +792,7 @@@ * If excess is 0, no tree ops. */ __mem_cgroup_insert_exceeded(mz, mctz, excess); - spin_unlock(&mctz->lock); + spin_unlock_irqrestore(&mctz->lock, flags); } } } @@@ -839,9 -843,9 +843,9 @@@ mem_cgroup_largest_soft_limit_node(stru { struct mem_cgroup_per_zone *mz;
- spin_lock(&mctz->lock); + spin_lock_irq(&mctz->lock); mz = __mem_cgroup_largest_soft_limit_node(mctz); - spin_unlock(&mctz->lock); + spin_unlock_irq(&mctz->lock); return mz; }
@@@ -882,13 -886,6 +886,6 @@@ static long mem_cgroup_read_stat(struc return val; }
- static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, - bool charge) - { - int val = (charge) ? 1 : -1; - this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); - } - static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, enum mem_cgroup_events_index idx) { @@@ -909,13 -906,13 +906,13 @@@
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, struct page *page, - bool anon, int nr_pages) + int nr_pages) { /* * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is * counted as CACHE even if it's on ANON LRU. */ - if (anon) + if (PageAnon(page)) __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); else @@@ -1013,7 -1010,6 +1010,6 @@@ static bool mem_cgroup_event_ratelimit( */ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) { - preempt_disable(); /* threshold event is triggered in finer grain than soft limit */ if (unlikely(mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_THRESH))) { @@@ -1026,8 -1022,6 +1022,6 @@@ do_numainfo = mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_NUMAINFO); #endif - preempt_enable(); - mem_cgroup_threshold(memcg); if (unlikely(do_softlimit)) mem_cgroup_update_tree(memcg, page); @@@ -1035,8 -1029,7 +1029,7 @@@ if (unlikely(do_numainfo)) atomic_inc(&memcg->numainfo_events); #endif - } else - preempt_enable(); + } }
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) @@@ -1347,20 -1340,6 +1340,6 @@@ out return lruvec; }
- /* - * Following LRU functions are allowed to be used without PCG_LOCK. - * Operations are called by routine of global LRU independently from memcg. - * What we have to take care of here is validness of pc->mem_cgroup. - * - * Changes to pc->mem_cgroup happens when - * 1. charge - * 2. moving account - * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. - * It is added to LRU before charge. - * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. - * When moving account, the page is not on LRU. It's isolated. - */ - /** * mem_cgroup_page_lruvec - return lruvec for adding an lru page * @page: the page @@@ -2261,22 -2240,14 +2240,14 @@@ cleanup * * Notes: Race condition * - * We usually use lock_page_cgroup() for accessing page_cgroup member but - * it tends to be costly. But considering some conditions, we doesn't need - * to do so _always_. + * Charging occurs during page instantiation, while the page is + * unmapped and locked in page migration, or while the page table is + * locked in THP migration. No race is possible. * - * Considering "charge", lock_page_cgroup() is not required because all - * file-stat operations happen after a page is attached to radix-tree. There - * are no race with "charge". + * Uncharge happens to pages with zero references, no race possible. * - * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup - * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even - * if there are race with "uncharge". Statistics itself is properly handled - * by flags. - * - * Considering "move", this is an only case we see a race. To make the race - * small, we check memcg->moving_account and detect there are possibility - * of race or not. If there is, we take a lock. + * Charge moving between groups is protected by checking mm->moving + * account and taking the move_lock in the slowpath. */
void __mem_cgroup_begin_update_page_stat(struct page *page, @@@ -2551,55 -2522,63 +2522,63 @@@ static int memcg_cpu_hotplug_callback(s return NOTIFY_OK; }
- - /* See mem_cgroup_try_charge() for details */ - enum { - CHARGE_OK, /* success */ - CHARGE_RETRY, /* need to retry but retry is not bad */ - CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ - CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ - }; - - static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, - unsigned int nr_pages, unsigned int min_pages, - bool invoke_oom) + static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, + unsigned int nr_pages) { - unsigned long csize = nr_pages * PAGE_SIZE; + unsigned int batch = max(CHARGE_BATCH, nr_pages); + int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem_over_limit; struct res_counter *fail_res; + unsigned long nr_reclaimed; unsigned long flags = 0; - int ret; + unsigned long long size; + int ret = 0;
- ret = res_counter_charge(&memcg->res, csize, &fail_res); + retry: + if (consume_stock(memcg, nr_pages)) + goto done;
- if (likely(!ret)) { + size = batch * PAGE_SIZE; + if (!res_counter_charge(&memcg->res, size, &fail_res)) { if (!do_swap_account) - return CHARGE_OK; - ret = res_counter_charge(&memcg->memsw, csize, &fail_res); - if (likely(!ret)) - return CHARGE_OK; - - res_counter_uncharge(&memcg->res, csize); + goto done_restock; + if (!res_counter_charge(&memcg->memsw, size, &fail_res)) + goto done_restock; + res_counter_uncharge(&memcg->res, size); mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); flags |= MEM_CGROUP_RECLAIM_NOSWAP; } else mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); + + if (batch > nr_pages) { + batch = nr_pages; + goto retry; + } + /* - * Never reclaim on behalf of optional batching, retry with a - * single page instead. + * Unlike in global OOM situations, memcg is not in a physical + * memory shortage. Allow dying and OOM-killed tasks to + * bypass the last charges so that they can exit quickly and + * free their memory. */ - if (nr_pages > min_pages) - return CHARGE_RETRY; + if (unlikely(test_thread_flag(TIF_MEMDIE) || + fatal_signal_pending(current) || + current->flags & PF_EXITING)) + goto bypass; + + if (unlikely(task_in_memcg_oom(current))) + goto nomem;
if (!(gfp_mask & __GFP_WAIT)) - return CHARGE_WOULDBLOCK; + goto nomem;
- if (gfp_mask & __GFP_NORETRY) - return CHARGE_NOMEM; + nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
- ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); - if (mem_cgroup_margin(mem_over_limit) >= nr_pages) - return CHARGE_RETRY; + if (mem_cgroup_margin(mem_over_limit) >= batch) + goto retry; + + if (gfp_mask & __GFP_NORETRY) + goto nomem; /* * Even though the limit is exceeded at this point, reclaim * may have been able to free some pages. Retry the charge @@@ -2609,142 -2588,47 +2588,47 @@@ * unlikely to succeed so close to the limit, and we fall back * to regular pages anyway in case of failure. */ - if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) - return CHARGE_RETRY; - + if (nr_reclaimed && batch <= (1 << PAGE_ALLOC_COSTLY_ORDER)) + goto retry; /* * At task move, charge accounts can be doubly counted. So, it's * better to wait until the end of task_move if something is going on. */ if (mem_cgroup_wait_acct_move(mem_over_limit)) - return CHARGE_RETRY; - - if (invoke_oom) - mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize)); - - return CHARGE_NOMEM; - } - - /** - * mem_cgroup_try_charge - try charging a memcg - * @memcg: memcg to charge - * @nr_pages: number of pages to charge - * @oom: trigger OOM if reclaim fails - * - * Returns 0 if @memcg was charged successfully, -EINTR if the charge - * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed. - */ - static int mem_cgroup_try_charge(struct mem_cgroup *memcg, - gfp_t gfp_mask, - unsigned int nr_pages, - bool oom) - { - unsigned int batch = max(CHARGE_BATCH, nr_pages); - int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; - int ret; - - if (mem_cgroup_is_root(memcg)) - goto done; - /* - * Unlike in global OOM situations, memcg is not in a physical - * memory shortage. Allow dying and OOM-killed tasks to - * bypass the last charges so that they can exit quickly and - * free their memory. - */ - if (unlikely(test_thread_flag(TIF_MEMDIE) || - fatal_signal_pending(current) || - current->flags & PF_EXITING)) - goto bypass; + goto retry;
- if (unlikely(task_in_memcg_oom(current))) - goto nomem; + if (nr_retries--) + goto retry;
if (gfp_mask & __GFP_NOFAIL) - oom = false; - again: - if (consume_stock(memcg, nr_pages)) - goto done; - - do { - bool invoke_oom = oom && !nr_oom_retries; - - /* If killed, bypass charge */ - if (fatal_signal_pending(current)) - goto bypass; + goto bypass;
- ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, - nr_pages, invoke_oom); - switch (ret) { - case CHARGE_OK: - break; - case CHARGE_RETRY: /* not in OOM situation but retry */ - batch = nr_pages; - goto again; - case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ - goto nomem; - case CHARGE_NOMEM: /* OOM routine works */ - if (!oom || invoke_oom) - goto nomem; - nr_oom_retries--; - break; - } - } while (ret != CHARGE_OK); + if (fatal_signal_pending(current)) + goto bypass;
- if (batch > nr_pages) - refill_stock(memcg, batch - nr_pages); - done: - return 0; + mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(batch)); nomem: if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; bypass: - return -EINTR; - } - - /** - * mem_cgroup_try_charge_mm - try charging a mm - * @mm: mm_struct to charge - * @nr_pages: number of pages to charge - * @oom: trigger OOM if reclaim fails - * - * Returns the charged mem_cgroup associated with the given mm_struct or - * NULL the charge failed. - */ - static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, - gfp_t gfp_mask, - unsigned int nr_pages, - bool oom) - - { - struct mem_cgroup *memcg; - int ret; - - memcg = get_mem_cgroup_from_mm(mm); - ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom); - css_put(&memcg->css); - if (ret == -EINTR) - memcg = root_mem_cgroup; - else if (ret) - memcg = NULL; + memcg = root_mem_cgroup; + ret = -EINTR; + goto retry;
- return memcg; + done_restock: + if (batch > nr_pages) + refill_stock(memcg, batch - nr_pages); + done: + return ret; }
- /* - * Somemtimes we have to undo a charge we got by try_charge(). - * This function is for that and do uncharge, put css's refcnt. - * gotten by try_charge(). - */ - static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, - unsigned int nr_pages) + static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { - if (!mem_cgroup_is_root(memcg)) { - unsigned long bytes = nr_pages * PAGE_SIZE; + unsigned long bytes = nr_pages * PAGE_SIZE;
- res_counter_uncharge(&memcg->res, bytes); - if (do_swap_account) - res_counter_uncharge(&memcg->memsw, bytes); - } + res_counter_uncharge(&memcg->res, bytes); + if (do_swap_account) + res_counter_uncharge(&memcg->memsw, bytes); }
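Taken together with the __add_to_page_cache_locked() changes in the mm/filemap.c hunk above, the reworked charging API boils down to a try/commit/cancel protocol around whatever step makes the page visible. A hedged sketch of that calling pattern follows; the insertion helper is hypothetical and only stands in for the caller's own step.

	#include <linux/mm.h>
	#include <linux/memcontrol.h>

	static int sketch_insert_page(struct page *page);	/* hypothetical step */

	/* Illustrative only, not part of this patch. */
	static int sketch_charge_and_insert(struct page *page, struct mm_struct *mm,
					    gfp_t gfp_mask, bool lrucare)
	{
		struct mem_cgroup *memcg;
		int error;

		error = mem_cgroup_try_charge(page, mm, gfp_mask, &memcg);
		if (error)
			return error;

		error = sketch_insert_page(page);
		if (error) {
			/* Nothing refers to the page yet: hand the charge back. */
			mem_cgroup_cancel_charge(page, memcg);
			return error;
		}

		/* The page is now visible: bind it to the memcg. */
		mem_cgroup_commit_charge(page, memcg, lrucare);
		return 0;
	}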
/* @@@ -2756,9 -2640,6 +2640,6 @@@ static void __mem_cgroup_cancel_local_c { unsigned long bytes = nr_pages * PAGE_SIZE;
- if (mem_cgroup_is_root(memcg)) - return; - res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); if (do_swap_account) res_counter_uncharge_until(&memcg->memsw, @@@ -2779,6 -2660,16 +2660,16 @@@ static struct mem_cgroup *mem_cgroup_lo return mem_cgroup_from_id(id); }
+ /* + * try_get_mem_cgroup_from_page - look up page's memcg association + * @page: the page + * + * Look up, get a css reference, and return the memcg that owns @page. + * + * The page must be locked to prevent racing with swap-in and page + * cache charges. If coming from an unlocked page table, the caller + * must ensure the page is on the LRU or this can race with charging. + */ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) { struct mem_cgroup *memcg = NULL; @@@ -2789,7 -2680,6 +2680,6 @@@ VM_BUG_ON_PAGE(!PageLocked(page), page);
pc = lookup_page_cgroup(page); - lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { memcg = pc->mem_cgroup; if (memcg && !css_tryget_online(&memcg->css)) @@@ -2803,23 -2693,46 +2693,46 @@@ memcg = NULL; rcu_read_unlock(); } return memcg; }
- static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, - struct page *page, - unsigned int nr_pages, - enum charge_type ctype, - bool lrucare) + static void lock_page_lru(struct page *page, int *isolated) + { + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page)) { + struct lruvec *lruvec; + + lruvec = mem_cgroup_page_lruvec(page, zone); + ClearPageLRU(page); + del_page_from_lru_list(page, lruvec, page_lru(page)); + *isolated = 1; + } else + *isolated = 0; + } + + static void unlock_page_lru(struct page *page, int isolated) + { + struct zone *zone = page_zone(page); + + if (isolated) { + struct lruvec *lruvec; + + lruvec = mem_cgroup_page_lruvec(page, zone); + VM_BUG_ON_PAGE(PageLRU(page), page); + SetPageLRU(page); + add_page_to_lru_list(page, lruvec, page_lru(page)); + } + spin_unlock_irq(&zone->lru_lock); + } + + static void commit_charge(struct page *page, struct mem_cgroup *memcg, + unsigned int nr_pages, bool lrucare) { struct page_cgroup *pc = lookup_page_cgroup(page); - struct zone *uninitialized_var(zone); - struct lruvec *lruvec; - bool was_on_lru = false; - bool anon; + int isolated;
- lock_page_cgroup(pc); VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); /* * we don't need page_cgroup_lock about tail pages, becase they are not @@@ -2830,52 -2743,38 +2743,38 @@@ * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page * may already be on some other mem_cgroup's LRU. Take care of it. */ - if (lrucare) { - zone = page_zone(page); - spin_lock_irq(&zone->lru_lock); - if (PageLRU(page)) { - lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); - ClearPageLRU(page); - del_page_from_lru_list(page, lruvec, page_lru(page)); - was_on_lru = true; - } - } + if (lrucare) + lock_page_lru(page, &isolated);
- pc->mem_cgroup = memcg; /* - * We access a page_cgroup asynchronously without lock_page_cgroup(). - * Especially when a page_cgroup is taken from a page, pc->mem_cgroup - * is accessed after testing USED bit. To make pc->mem_cgroup visible - * before USED bit, we need memory barrier here. - * See mem_cgroup_add_lru_list(), etc. + * Nobody should be changing or seriously looking at + * pc->mem_cgroup and pc->flags at this point: + * + * - the page is uncharged + * + * - the page is off-LRU + * + * - an anonymous fault has exclusive page access, except for + * a locked page table + * + * - a page cache insertion, a swapin fault, or a migration + * have the page locked */ - smp_wmb(); - SetPageCgroupUsed(pc); - - if (lrucare) { - if (was_on_lru) { - lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); - VM_BUG_ON_PAGE(PageLRU(page), page); - SetPageLRU(page); - add_page_to_lru_list(page, lruvec, page_lru(page)); - } - spin_unlock_irq(&zone->lru_lock); - } - - if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON) - anon = true; - else - anon = false; + pc->mem_cgroup = memcg; + pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0);
- mem_cgroup_charge_statistics(memcg, page, anon, nr_pages); - unlock_page_cgroup(pc); + if (lrucare) + unlock_page_lru(page, isolated);
+ local_irq_disable(); + mem_cgroup_charge_statistics(memcg, page, nr_pages); /* * "charge_statistics" updated event counter. Then, check it. * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. * if they exceeds softlimit. */ memcg_check_events(memcg, page); + local_irq_enable(); }
static DEFINE_MUTEX(set_limit_mutex); @@@ -2937,22 -2836,21 +2836,21 @@@ static int memcg_charge_kmem(struct mem if (ret) return ret;
- ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT, - oom_gfp_allowed(gfp)); + ret = try_charge(memcg, gfp, size >> PAGE_SHIFT); if (ret == -EINTR) { /* - * mem_cgroup_try_charge() chosed to bypass to root due to - * OOM kill or fatal signal. Since our only options are to - * either fail the allocation or charge it to this cgroup, do - * it as a temporary condition. But we can't fail. From a - * kmem/slab perspective, the cache has already been selected, - * by mem_cgroup_kmem_get_cache(), so it is too late to change + * try_charge() chose to bypass to root due to OOM kill or + * fatal signal. Since our only options are to either fail + * the allocation or charge it to this cgroup, do it as a + * temporary condition. But we can't fail. From a kmem/slab + * perspective, the cache has already been selected, by + * mem_cgroup_kmem_get_cache(), so it is too late to change * our minds. * * This condition will only trigger if the task entered - * memcg_charge_kmem in a sane state, but was OOM-killed during - * mem_cgroup_try_charge() above. Tasks that were already - * dying when the allocation triggers should have been already + * memcg_charge_kmem in a sane state, but was OOM-killed + * during try_charge() above. Tasks that were already dying + * when the allocation triggers should have been already * directed to the root cgroup in memcontrol.h */ res_counter_charge_nofail(&memcg->res, size, &fail_res); @@@ -3463,12 -3361,13 +3361,13 @@@ void __memcg_kmem_commit_charge(struct memcg_uncharge_kmem(memcg, PAGE_SIZE << order); return; } - + /* + * The page is freshly allocated and not visible to any + * outside callers yet. Set up pc non-atomically. + */ pc = lookup_page_cgroup(page); - lock_page_cgroup(pc); pc->mem_cgroup = memcg; - SetPageCgroupUsed(pc); - unlock_page_cgroup(pc); + pc->flags = PCG_USED; }
void __memcg_kmem_uncharge_pages(struct page *page, int order) @@@ -3478,19 -3377,11 +3377,11 @@@
pc = lookup_page_cgroup(page); if (!PageCgroupUsed(pc)) return;
- lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - ClearPageCgroupUsed(pc); - } - unlock_page_cgroup(pc); + memcg = pc->mem_cgroup; + pc->flags = 0;
/* * We trust that only if there is a memcg associated with the page, it @@@ -3510,7 -3401,6 +3401,6 @@@ static inline void memcg_unregister_all
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) /* * Because tail pages are not marked as "used", set it. We're under * zone->lru_lock, 'splitting on pmd' and compound_lock. @@@ -3531,8 -3421,7 +3421,7 @@@ void mem_cgroup_split_huge_fixup(struc for (i = 1; i < HPAGE_PMD_NR; i++) { pc = head_pc + i; pc->mem_cgroup = memcg; - smp_wmb();/* see __commit_charge() */ - pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; + pc->flags = head_pc->flags; } __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], HPAGE_PMD_NR); @@@ -3562,7 -3451,6 +3451,6 @@@ static int mem_cgroup_move_account(stru { unsigned long flags; int ret; - bool anon = PageAnon(page);
VM_BUG_ON(from == to); VM_BUG_ON_PAGE(PageLRU(page), page); @@@ -3576,15 -3464,21 +3464,21 @@@ if (nr_pages > 1 && !PageTransHuge(page)) goto out;
- lock_page_cgroup(pc); + /* + * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup + * of its source page while we change it: page migration takes + * both pages off the LRU, but page cache replacement doesn't. + */ + if (!trylock_page(page)) + goto out;
ret = -EINVAL; if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) - goto unlock; + goto out_unlock;
move_lock_mem_cgroup(from, &flags);
- if (!anon && page_mapped(page)) { + if (!PageAnon(page) && page_mapped(page)) { __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], nr_pages); __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], @@@ -3598,20 -3492,25 +3492,25 @@@ nr_pages); }
- mem_cgroup_charge_statistics(from, page, anon, -nr_pages); + /* + * It is safe to change pc->mem_cgroup here because the page + * is referenced, charged, and isolated - we can't race with + * uncharging, charging, migration, or LRU putback. + */
/* caller should have done css_get */ pc->mem_cgroup = to; - mem_cgroup_charge_statistics(to, page, anon, nr_pages); move_unlock_mem_cgroup(from, &flags); ret = 0; - unlock: - unlock_page_cgroup(pc); - /* - * check events - */ + + local_irq_disable(); + mem_cgroup_charge_statistics(to, page, nr_pages); memcg_check_events(to, page); + mem_cgroup_charge_statistics(from, page, -nr_pages); memcg_check_events(from, page); + local_irq_enable(); + out_unlock: + unlock_page(page); out: return ret; } @@@ -3682,690 -3581,77 +3581,77 @@@ out return ret; }
- int mem_cgroup_charge_anon(struct page *page, - struct mm_struct *mm, gfp_t gfp_mask) + #ifdef CONFIG_MEMCG_SWAP + static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, + bool charge) { - unsigned int nr_pages = 1; - struct mem_cgroup *memcg; - bool oom = true; + int val = (charge) ? 1 : -1; + this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); + }
- if (mem_cgroup_disabled()) - return 0; + /** + * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. + * @entry: swap entry to be moved + * @from: mem_cgroup which the entry is moved from + * @to: mem_cgroup which the entry is moved to + * + * It succeeds only when the swap_cgroup's record for this entry is the same + * as the mem_cgroup's id of @from. + * + * Returns 0 on success, -EINVAL on failure. + * + * The caller must have charged to @to, IOW, called res_counter_charge() about + * both res and memsw, and called css_get(). + */ + static int mem_cgroup_move_swap_account(swp_entry_t entry, + struct mem_cgroup *from, struct mem_cgroup *to) + { + unsigned short old_id, new_id;
- VM_BUG_ON_PAGE(page_mapped(page), page); - VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); - VM_BUG_ON(!mm); + old_id = mem_cgroup_id(from); + new_id = mem_cgroup_id(to);
- if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); + if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { + mem_cgroup_swap_statistics(from, false); + mem_cgroup_swap_statistics(to, true); /* - * Never OOM-kill a process for a huge page. The - * fault handler will fall back to regular pages. + * This function is only called from task migration context now. + * It postpones res_counter and refcount handling till the end + * of task migration(mem_cgroup_clear_mc()) for performance + * improvement. But we cannot postpone css_get(to) because if + * the process that has been moved to @to does swap-in, the + * refcount of @to might be decreased to 0. + * + * We are in attach() phase, so the cgroup is guaranteed to be + * alive, so we can just call css_get(). */ - oom = false; + css_get(&to->css); + return 0; } - - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom); - if (!memcg) - return -ENOMEM; - __mem_cgroup_commit_charge(memcg, page, nr_pages, - MEM_CGROUP_CHARGE_TYPE_ANON, false); - return 0; + return -EINVAL; + } + #else + static inline int mem_cgroup_move_swap_account(swp_entry_t entry, + struct mem_cgroup *from, struct mem_cgroup *to) + { + return -EINVAL; } + #endif
- /* - * While swap-in, try_charge -> commit or cancel, the page is locked. - * And when try_charge() successfully returns, one refcnt to memcg without - * struct page_cgroup is acquired. This refcnt will be consumed by - * "commit()" or removed by "cancel()" - */ - static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, - struct page *page, - gfp_t mask, - struct mem_cgroup **memcgp) + #ifdef CONFIG_DEBUG_VM + static struct page_cgroup *lookup_page_cgroup_used(struct page *page) { - struct mem_cgroup *memcg = NULL; struct page_cgroup *pc;
pc = lookup_page_cgroup(page); /* - * Every swap fault against a single page tries to charge the - * page, bail as early as possible. shmem_unuse() encounters - * already charged pages, too. The USED bit is protected by - * the page lock, which serializes swap cache removal, which - * in turn serializes uncharging. + * Can be NULL while feeding pages into the page allocator for + * the first time, i.e. during boot or memory hotplug; + * or when mem_cgroup_disabled(). */ - if (PageCgroupUsed(pc)) - goto out; - if (do_swap_account) - memcg = try_get_mem_cgroup_from_page(page); - if (!memcg) - memcg = get_mem_cgroup_from_mm(mm); - ret = mem_cgroup_try_charge(memcg, mask, 1, true); - css_put(&memcg->css); - if (ret == -EINTR) - memcg = root_mem_cgroup; - else if (ret) - return ret; - out: - *memcgp = memcg; - return 0; - } - - int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, - gfp_t gfp_mask, struct mem_cgroup **memcgp) - { - if (mem_cgroup_disabled()) { - *memcgp = NULL; - return 0; - } - /* - * A racing thread's fault, or swapoff, may have already - * updated the pte, and even removed page from swap cache: in - * those cases unuse_pte()'s pte_same() test will fail; but - * there's also a KSM case which does need to charge the page. - */ - if (!PageSwapCache(page)) { - struct mem_cgroup *memcg; - - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); - if (!memcg) - return -ENOMEM; - *memcgp = memcg; - return 0; - } - return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); - } - - void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) - { - if (mem_cgroup_disabled()) - return; - if (!memcg) - return; - __mem_cgroup_cancel_charge(memcg, 1); - } - - static void - __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, - enum charge_type ctype) - { - if (mem_cgroup_disabled()) - return; - if (!memcg) - return; - - __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); - /* - * Now swap is on-memory. This means this page may be - * counted both as mem and swap....double count. - * Fix it by uncharging from memsw. Basically, this SwapCache is stable - * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() - * may call delete_from_swap_cache() before reach here. 
- */ - if (do_swap_account && PageSwapCache(page)) { - swp_entry_t ent = {.val = page_private(page)}; - mem_cgroup_uncharge_swap(ent); - } - } - - void mem_cgroup_commit_charge_swapin(struct page *page, - struct mem_cgroup *memcg) - { - __mem_cgroup_commit_charge_swapin(page, memcg, - MEM_CGROUP_CHARGE_TYPE_ANON); - } - - int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask) - { - enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; - struct mem_cgroup *memcg; - int ret; - - if (mem_cgroup_disabled()) - return 0; - if (PageCompound(page)) - return 0; - - if (PageSwapCache(page)) { /* shmem */ - ret = __mem_cgroup_try_charge_swapin(mm, page, - gfp_mask, &memcg); - if (ret) - return ret; - __mem_cgroup_commit_charge_swapin(page, memcg, type); - return 0; - } - - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); - if (!memcg) - return -ENOMEM; - __mem_cgroup_commit_charge(memcg, page, 1, type, false); - return 0; - } - - static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, - unsigned int nr_pages, - const enum charge_type ctype) - { - struct memcg_batch_info *batch = NULL; - bool uncharge_memsw = true; - - /* If swapout, usage of swap doesn't decrease */ - if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) - uncharge_memsw = false; - - batch = ¤t->memcg_batch; - /* - * In usual, we do css_get() when we remember memcg pointer. - * But in this case, we keep res->usage until end of a series of - * uncharges. Then, it's ok to ignore memcg's refcnt. - */ - if (!batch->memcg) - batch->memcg = memcg; - /* - * do_batch > 0 when unmapping pages or inode invalidate/truncate. - * In those cases, all pages freed continuously can be expected to be in - * the same cgroup and we have chance to coalesce uncharges. - * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) - * because we want to do uncharge as soon as possible. - */ - - if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) - goto direct_uncharge; - - if (nr_pages > 1) - goto direct_uncharge; - - /* - * In typical case, batch->memcg == mem. This means we can - * merge a series of uncharges to an uncharge of res_counter. - * If not, we uncharge res_counter ony by one. - */ - if (batch->memcg != memcg) - goto direct_uncharge; - /* remember freed charge and uncharge it later */ - batch->nr_pages++; - if (uncharge_memsw) - batch->memsw_nr_pages++; - return; - direct_uncharge: - res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE); - if (uncharge_memsw) - res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); - if (unlikely(batch->memcg != memcg)) - memcg_oom_recover(memcg); - } - - /* - * uncharge if !page_mapped(page) - */ - static struct mem_cgroup * - __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, - bool end_migration) - { - struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = 1; - struct page_cgroup *pc; - bool anon; - - if (mem_cgroup_disabled()) - return NULL; - - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - } - /* - * Check if our page_cgroup is valid - */ - pc = lookup_page_cgroup(page); - if (unlikely(!PageCgroupUsed(pc))) - return NULL; - - lock_page_cgroup(pc); - - memcg = pc->mem_cgroup; - - if (!PageCgroupUsed(pc)) - goto unlock_out; - - anon = PageAnon(page); - - switch (ctype) { - case MEM_CGROUP_CHARGE_TYPE_ANON: - /* - * Generally PageAnon tells if it's the anon statistics to be - * updated; but sometimes e.g. 
mem_cgroup_uncharge_page() is - * used before page reached the stage of being marked PageAnon. - */ - anon = true; - /* fallthrough */ - case MEM_CGROUP_CHARGE_TYPE_DROP: - /* See mem_cgroup_prepare_migration() */ - if (page_mapped(page)) - goto unlock_out; - /* - * Pages under migration may not be uncharged. But - * end_migration() /must/ be the one uncharging the - * unused post-migration page and so it has to call - * here with the migration bit still set. See the - * res_counter handling below. - */ - if (!end_migration && PageCgroupMigration(pc)) - goto unlock_out; - break; - case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: - if (!PageAnon(page)) { /* Shared memory */ - if (page->mapping && !page_is_file_cache(page)) - goto unlock_out; - } else if (page_mapped(page)) /* Anon */ - goto unlock_out; - break; - default: - break; - } - - mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages); - - ClearPageCgroupUsed(pc); - /* - * pc->mem_cgroup is not cleared here. It will be accessed when it's - * freed from LRU. This is safe because uncharged page is expected not - * to be reused (freed soon). Exception is SwapCache, it's handled by - * special functions. - */ - - unlock_page_cgroup(pc); - /* - * even after unlock, we have memcg->res.usage here and this memcg - * will never be freed, so it's safe to call css_get(). - */ - memcg_check_events(memcg, page); - if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { - mem_cgroup_swap_statistics(memcg, true); - css_get(&memcg->css); - } - /* - * Migration does not charge the res_counter for the - * replacement page, so leave it alone when phasing out the - * page that is unused after the migration. - */ - if (!end_migration && !mem_cgroup_is_root(memcg)) - mem_cgroup_do_uncharge(memcg, nr_pages, ctype); - - return memcg; - - unlock_out: - unlock_page_cgroup(pc); - return NULL; - } - - void mem_cgroup_uncharge_page(struct page *page) - { - /* early check. */ - if (page_mapped(page)) - return; - VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); - /* - * If the page is in swap cache, uncharge should be deferred - * to the swap path, which also properly accounts swap usage - * and handles memcg lifetime. - * - * Note that this check is not stable and reclaim may add the - * page to swap cache at any time after this. However, if the - * page is not in swap cache by the time page->mapcount hits - * 0, there won't be any page table references to the swap - * slot, and reclaim will free it and not actually write the - * page to disk. - */ - if (PageSwapCache(page)) - return; - __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false); - } - - void mem_cgroup_uncharge_cache_page(struct page *page) - { - VM_BUG_ON_PAGE(page_mapped(page), page); - VM_BUG_ON_PAGE(page->mapping, page); - __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); - } - - /* - * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate. - * In that cases, pages are freed continuously and we can expect pages - * are in the same memcg. All these calls itself limits the number of - * pages freed at once, then uncharge_start/end() is called properly. - * This may be called prural(2) times in a context, - */ - - void mem_cgroup_uncharge_start(void) - { - current->memcg_batch.do_batch++; - /* We can do nest. 
*/ - if (current->memcg_batch.do_batch == 1) { - current->memcg_batch.memcg = NULL; - current->memcg_batch.nr_pages = 0; - current->memcg_batch.memsw_nr_pages = 0; - } - } - - void mem_cgroup_uncharge_end(void) - { - struct memcg_batch_info *batch = ¤t->memcg_batch; - - if (!batch->do_batch) - return; - - batch->do_batch--; - if (batch->do_batch) /* If stacked, do nothing. */ - return; - - if (!batch->memcg) - return; - /* - * This "batch->memcg" is valid without any css_get/put etc... - * bacause we hide charges behind us. - */ - if (batch->nr_pages) - res_counter_uncharge(&batch->memcg->res, - batch->nr_pages * PAGE_SIZE); - if (batch->memsw_nr_pages) - res_counter_uncharge(&batch->memcg->memsw, - batch->memsw_nr_pages * PAGE_SIZE); - memcg_oom_recover(batch->memcg); - /* forget this pointer (for sanity check) */ - batch->memcg = NULL; - } - - #ifdef CONFIG_SWAP - /* - * called after __delete_from_swap_cache() and drop "page" account. - * memcg information is recorded to swap_cgroup of "ent" - */ - void - mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) - { - struct mem_cgroup *memcg; - int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; - - if (!swapout) /* this was a swap cache but the swap is unused ! */ - ctype = MEM_CGROUP_CHARGE_TYPE_DROP; - - memcg = __mem_cgroup_uncharge_common(page, ctype, false); - - /* - * record memcg information, if swapout && memcg != NULL, - * css_get() was called in uncharge(). - */ - if (do_swap_account && swapout && memcg) - swap_cgroup_record(ent, mem_cgroup_id(memcg)); - } - #endif - - #ifdef CONFIG_MEMCG_SWAP - /* - * called from swap_entry_free(). remove record in swap_cgroup and - * uncharge "memsw" account. - */ - void mem_cgroup_uncharge_swap(swp_entry_t ent) - { - struct mem_cgroup *memcg; - unsigned short id; - - if (!do_swap_account) - return; - - id = swap_cgroup_record(ent, 0); - rcu_read_lock(); - memcg = mem_cgroup_lookup(id); - if (memcg) { - /* - * We uncharge this because swap is freed. This memcg can - * be obsolete one. We avoid calling css_tryget_online(). - */ - if (!mem_cgroup_is_root(memcg)) - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); - mem_cgroup_swap_statistics(memcg, false); - css_put(&memcg->css); - } - rcu_read_unlock(); - } - - /** - * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. - * @entry: swap entry to be moved - * @from: mem_cgroup which the entry is moved from - * @to: mem_cgroup which the entry is moved to - * - * It succeeds only when the swap_cgroup's record for this entry is the same - * as the mem_cgroup's id of @from. - * - * Returns 0 on success, -EINVAL on failure. - * - * The caller must have charged to @to, IOW, called res_counter_charge() about - * both res and memsw, and called css_get(). - */ - static int mem_cgroup_move_swap_account(swp_entry_t entry, - struct mem_cgroup *from, struct mem_cgroup *to) - { - unsigned short old_id, new_id; - - old_id = mem_cgroup_id(from); - new_id = mem_cgroup_id(to); - - if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { - mem_cgroup_swap_statistics(from, false); - mem_cgroup_swap_statistics(to, true); - /* - * This function is only called from task migration context now. - * It postpones res_counter and refcount handling till the end - * of task migration(mem_cgroup_clear_mc()) for performance - * improvement. But we cannot postpone css_get(to) because if - * the process that has been moved to @to does swap-in, the - * refcount of @to might be decreased to 0. 
- * - * We are in attach() phase, so the cgroup is guaranteed to be - * alive, so we can just call css_get(). - */ - css_get(&to->css); - return 0; - } - return -EINVAL; - } - #else - static inline int mem_cgroup_move_swap_account(swp_entry_t entry, - struct mem_cgroup *from, struct mem_cgroup *to) - { - return -EINVAL; - } - #endif - - /* - * Before starting migration, account PAGE_SIZE to mem_cgroup that the old - * page belongs to. - */ - void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, - struct mem_cgroup **memcgp) - { - struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = 1; - struct page_cgroup *pc; - enum charge_type ctype; - - *memcgp = NULL; - - if (mem_cgroup_disabled()) - return; - - if (PageTransHuge(page)) - nr_pages <<= compound_order(page); - - pc = lookup_page_cgroup(page); - lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - css_get(&memcg->css); - /* - * At migrating an anonymous page, its mapcount goes down - * to 0 and uncharge() will be called. But, even if it's fully - * unmapped, migration may fail and this page has to be - * charged again. We set MIGRATION flag here and delay uncharge - * until end_migration() is called - * - * Corner Case Thinking - * A) - * When the old page was mapped as Anon and it's unmap-and-freed - * while migration was ongoing. - * If unmap finds the old page, uncharge() of it will be delayed - * until end_migration(). If unmap finds a new page, it's - * uncharged when it make mapcount to be 1->0. If unmap code - * finds swap_migration_entry, the new page will not be mapped - * and end_migration() will find it(mapcount==0). - * - * B) - * When the old page was mapped but migraion fails, the kernel - * remaps it. A charge for it is kept by MIGRATION flag even - * if mapcount goes down to 0. We can do remap successfully - * without charging it again. - * - * C) - * The "old" page is under lock_page() until the end of - * migration, so, the old page itself will not be swapped-out. - * If the new page is swapped out before end_migraton, our - * hook to usual swap-out path will catch the event. - */ - if (PageAnon(page)) - SetPageCgroupMigration(pc); - } - unlock_page_cgroup(pc); - /* - * If the page is not charged at this point, - * we return here. - */ - if (!memcg) - return; - - *memcgp = memcg; - /* - * We charge new page before it's used/mapped. So, even if unlock_page() - * is called before end_migration, we can catch all events on this new - * page. In the case new page is migrated but not remapped, new page's - * mapcount will be finally 0 and we call uncharge in end_migration(). - */ - if (PageAnon(page)) - ctype = MEM_CGROUP_CHARGE_TYPE_ANON; - else - ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; - /* - * The page is committed to the memcg, but it's not actually - * charged to the res_counter since we plan on replacing the - * old one and only one page is going to be left afterwards. - */ - __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false); - } - - /* remove redundant charge if migration failed*/ - void mem_cgroup_end_migration(struct mem_cgroup *memcg, - struct page *oldpage, struct page *newpage, bool migration_ok) - { - struct page *used, *unused; - struct page_cgroup *pc; - bool anon; - - if (!memcg) - return; - - if (!migration_ok) { - used = oldpage; - unused = newpage; - } else { - used = newpage; - unused = oldpage; - } - anon = PageAnon(used); - __mem_cgroup_uncharge_common(unused, - anon ? 
MEM_CGROUP_CHARGE_TYPE_ANON - : MEM_CGROUP_CHARGE_TYPE_CACHE, - true); - css_put(&memcg->css); - /* - * We disallowed uncharge of pages under migration because mapcount - * of the page goes down to zero, temporarly. - * Clear the flag and check the page should be charged. - */ - pc = lookup_page_cgroup(oldpage); - lock_page_cgroup(pc); - ClearPageCgroupMigration(pc); - unlock_page_cgroup(pc); - - /* - * If a page is a file cache, radix-tree replacement is very atomic - * and we can skip this check. When it was an Anon page, its mapcount - * goes down to 0. But because we added MIGRATION flage, it's not - * uncharged yet. There are several case but page->mapcount check - * and USED bit check in mem_cgroup_uncharge_page() will do enough - * check. (see prepare_charge() also) - */ - if (anon) - mem_cgroup_uncharge_page(used); - } - - /* - * At replace page cache, newpage is not under any memcg but it's on - * LRU. So, this function doesn't touch res_counter but handles LRU - * in correct way. Both pages are locked so we cannot race with uncharge. - */ - void mem_cgroup_replace_page_cache(struct page *oldpage, - struct page *newpage) - { - struct mem_cgroup *memcg = NULL; - struct page_cgroup *pc; - enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; - - if (mem_cgroup_disabled()) - return; - - pc = lookup_page_cgroup(oldpage); - /* fix accounting on old pages */ - lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - mem_cgroup_charge_statistics(memcg, oldpage, false, -1); - ClearPageCgroupUsed(pc); - } - unlock_page_cgroup(pc); - - /* - * When called from shmem_replace_page(), in some cases the - * oldpage has already been charged, and in some cases not. - */ - if (!memcg) - return; - /* - * Even if newpage->mapping was NULL before starting replacement, - * the newpage may be on LRU(or pagevec for LRU) already. We lock - * LRU while we overwrite pc->mem_cgroup. - */ - __mem_cgroup_commit_charge(memcg, newpage, 1, type, true); - } - - #ifdef CONFIG_DEBUG_VM - static struct page_cgroup *lookup_page_cgroup_used(struct page *page) - { - struct page_cgroup *pc; - - pc = lookup_page_cgroup(page); - /* - * Can be NULL while feeding pages into the page allocator for - * the first time, i.e. during boot or memory hotplug; - * or when mem_cgroup_disabled(). - */ - if (likely(pc) && PageCgroupUsed(pc)) - return pc; - return NULL; + if (likely(pc) && PageCgroupUsed(pc)) + return pc; + return NULL; }
bool mem_cgroup_bad_page_check(struct page *page) @@@ -4550,7 -3836,7 +3836,7 @@@ unsigned long mem_cgroup_soft_limit_rec gfp_mask, &nr_scanned); nr_reclaimed += reclaimed; *total_scanned += nr_scanned; - spin_lock(&mctz->lock); + spin_lock_irq(&mctz->lock);
/* * If we failed to reclaim anything from this memory cgroup @@@ -4590,7 -3876,7 +3876,7 @@@ */ /* If excess == 0, no tree ops */ __mem_cgroup_insert_exceeded(mz, mctz, excess); - spin_unlock(&mctz->lock); + spin_unlock_irq(&mctz->lock); css_put(&mz->memcg->css); loop++; /* @@@ -4814,81 -4100,27 +4100,27 @@@ static int mem_cgroup_hierarchy_write(s out: mutex_unlock(&memcg_create_mutex);
- return retval; - } - - - static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx) - { - struct mem_cgroup *iter; - long val = 0; - - /* Per-cpu values can be negative, use a signed accumulator */ - for_each_mem_cgroup_tree(iter, memcg) - val += mem_cgroup_read_stat(iter, idx); - - if (val < 0) /* race ? */ - val = 0; - return val; - } - - static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) - { - u64 val; - - if (!mem_cgroup_is_root(memcg)) { - if (!swap) - return res_counter_read_u64(&memcg->res, RES_USAGE); - else - return res_counter_read_u64(&memcg->memsw, RES_USAGE); - } - - /* - * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS - * as well as in MEM_CGROUP_STAT_RSS_HUGE. - */ - val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); - - if (swap) - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); - - return val << PAGE_SHIFT; + return retval; }
static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) + struct cftype *cft) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - u64 val; - int name; - enum res_type type; - - type = MEMFILE_TYPE(cft->private); - name = MEMFILE_ATTR(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); + int name = MEMFILE_ATTR(cft->private);
switch (type) { case _MEM: - if (name == RES_USAGE) - val = mem_cgroup_usage(memcg, false); - else - val = res_counter_read_u64(&memcg->res, name); - break; + return res_counter_read_u64(&memcg->res, name); case _MEMSWAP: - if (name == RES_USAGE) - val = mem_cgroup_usage(memcg, true); - else - val = res_counter_read_u64(&memcg->memsw, name); - break; + return res_counter_read_u64(&memcg->memsw, name); case _KMEM: - val = res_counter_read_u64(&memcg->kmem, name); + return res_counter_read_u64(&memcg->kmem, name); break; default: BUG(); } - - return val; }
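The read handler above relies on MEMFILE_TYPE()/MEMFILE_ATTR() to split cft->private back into a resource type and a res_counter member. Those macros are not shown in this hunk; the sketch below assumes the usual memcontrol.c layout (type in the high 16 bits, attribute in the low 16 bits) and is illustrative only.

/* Hypothetical standalone test of the assumed cft->private encoding. */
#include <assert.h>

#define MEMFILE_PRIVATE(type, attr)	(((type) << 16) | (attr))
#define MEMFILE_TYPE(val)		(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)		((val) & 0xffff)

int main(void)
{
	int priv = MEMFILE_PRIVATE(2, 1);	/* arbitrary type/attr values */

	assert(MEMFILE_TYPE(priv) == 2);
	assert(MEMFILE_ATTR(priv) == 1);
	return 0;
}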
#ifdef CONFIG_MEMCG_KMEM @@@ -5350,7 -4582,10 +4582,10 @@@ static void __mem_cgroup_threshold(stru if (!t) goto unlock;
- usage = mem_cgroup_usage(memcg, swap); + if (!swap) + usage = res_counter_read_u64(&memcg->res, RES_USAGE); + else + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
/* * current_threshold points to threshold just below or equal to usage. @@@ -5446,15 -4681,15 +4681,15 @@@ static int __mem_cgroup_usage_register_
mutex_lock(&memcg->thresholds_lock);
- if (type == _MEM) + if (type == _MEM) { thresholds = &memcg->thresholds; - else if (type == _MEMSWAP) + usage = res_counter_read_u64(&memcg->res, RES_USAGE); + } else if (type == _MEMSWAP) { thresholds = &memcg->memsw_thresholds; - else + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + } else BUG();
- usage = mem_cgroup_usage(memcg, type == _MEMSWAP); - /* Check if a threshold crossed before adding a new one */ if (thresholds->primary) __mem_cgroup_threshold(memcg, type == _MEMSWAP); @@@ -5534,18 -4769,19 +4769,19 @@@ static void __mem_cgroup_usage_unregist int i, j, size;
mutex_lock(&memcg->thresholds_lock); - if (type == _MEM) + + if (type == _MEM) { thresholds = &memcg->thresholds; - else if (type == _MEMSWAP) + usage = res_counter_read_u64(&memcg->res, RES_USAGE); + } else if (type == _MEMSWAP) { thresholds = &memcg->memsw_thresholds; - else + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + } else BUG();
if (!thresholds->primary) goto unlock;
- usage = mem_cgroup_usage(memcg, type == _MEMSWAP); - /* Check if a threshold crossed before removing */ __mem_cgroup_threshold(memcg, type == _MEMSWAP);
@@@ -6007,6 -5243,7 +5243,6 @@@ static struct cftype mem_cgroup_files[ }, { .name = "use_hierarchy", - .flags = CFTYPE_INSANE, .write_u64 = mem_cgroup_hierarchy_write, .read_u64 = mem_cgroup_hierarchy_read, }, @@@ -6299,9 -5536,9 +5535,9 @@@ mem_cgroup_css_online(struct cgroup_sub * core guarantees its existence. */ } else { - res_counter_init(&memcg->res, NULL); - res_counter_init(&memcg->memsw, NULL); - res_counter_init(&memcg->kmem, NULL); + res_counter_init(&memcg->res, &root_mem_cgroup->res); + res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw); + res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem); /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this @@@ -6410,80 -5647,40 +5646,63 @@@ static void mem_cgroup_css_free(struct __mem_cgroup_free(memcg); }
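The css_online hunk above parents a !use_hierarchy memcg's counters to root_mem_cgroup instead of leaving them unparented. A rough kernel-style fragment of what that buys (illustrative only, not meant to compile outside memcontrol.c; fail_res is a throwaway local used by the charge API):

	struct res_counter *fail_res;

	res_counter_init(&memcg->res, &root_mem_cgroup->res);

	/* charging the child also charges its parent chain, here the root */
	if (!res_counter_charge(&memcg->res, PAGE_SIZE, &fail_res))
		res_counter_uncharge(&memcg->res, PAGE_SIZE);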
+/** + * mem_cgroup_css_reset - reset the states of a mem_cgroup + * @css: the target css + * + * Reset the states of the mem_cgroup associated with @css. This is + * invoked when the userland requests disabling on the default hierarchy + * but the memcg is pinned through dependency. The memcg should stop + * applying policies and should revert to the vanilla state as it may be + * made visible again. + * + * The current implementation only resets the essential configurations. + * This needs to be expanded to cover all the visible parts. + */ +static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + mem_cgroup_resize_limit(memcg, ULLONG_MAX); + mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX); + memcg_update_kmem_limit(memcg, ULLONG_MAX); + res_counter_set_soft_limit(&memcg->res, ULLONG_MAX); +} + #ifdef CONFIG_MMU /* Handlers for move charge at task migration. */ - #define PRECHARGE_COUNT_AT_ONCE 256 static int mem_cgroup_do_precharge(unsigned long count) { - int ret = 0; - int batch_count = PRECHARGE_COUNT_AT_ONCE; - struct mem_cgroup *memcg = mc.to; + int ret;
- if (mem_cgroup_is_root(memcg)) { + /* Try a single bulk charge without reclaim first */ + ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); + if (!ret) { mc.precharge += count; - /* we don't need css_get for root */ return ret; } - /* try to charge at once */ - if (count > 1) { - struct res_counter *dummy; - /* - * "memcg" cannot be under rmdir() because we've already checked - * by cgroup_lock_live_cgroup() that it is not removed and we - * are still under the same cgroup_mutex. So we can postpone - * css_get(). - */ - if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy)) - goto one_by_one; - if (do_swap_account && res_counter_charge(&memcg->memsw, - PAGE_SIZE * count, &dummy)) { - res_counter_uncharge(&memcg->res, PAGE_SIZE * count); - goto one_by_one; - } - mc.precharge += count; + if (ret == -EINTR) { + cancel_charge(root_mem_cgroup, count); return ret; } - one_by_one: - /* fall back to one by one charge */ + + /* Try charges one by one with reclaim */ while (count--) { - if (signal_pending(current)) { - ret = -EINTR; - break; - } - if (!batch_count--) { - batch_count = PRECHARGE_COUNT_AT_ONCE; - cond_resched(); - } - ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false); + ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); + /* + * In case of failure, any residual charges against + * mc.to will be dropped by mem_cgroup_clear_mc() + * later on. However, cancel any charges that are + * bypassed to root right away or they'll be lost. + */ + if (ret == -EINTR) + cancel_charge(root_mem_cgroup, 1); if (ret) - /* mem_cgroup_clear_mc() will do uncharge later */ return ret; mc.precharge++; + cond_resched(); } - return ret; + return 0; }
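The comment in the rewritten precharge loop leans on a try_charge() convention that is easy to miss: -EINTR signals that the charge was bypassed to root_mem_cgroup rather than applied to the intended memcg. A minimal illustrative fragment of that contract (not part of the patch):

	ret = try_charge(memcg, GFP_KERNEL, nr_pages);
	if (ret == -EINTR)
		/* the charge was bypassed to the root group; drop it there */
		cancel_charge(root_mem_cgroup, nr_pages);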
/** @@@ -6619,9 -5816,9 +5838,9 @@@ static enum mc_target_type get_mctgt_ty if (page) { pc = lookup_page_cgroup(page); /* - * Do only loose check w/o page_cgroup lock. - * mem_cgroup_move_account() checks the pc is valid or not under - * the lock. + * Do only loose check w/o serialization. + * mem_cgroup_move_account() checks the pc is valid or + * not under LRU exclusion. */ if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { ret = MC_TARGET_PAGE; @@@ -6746,7 -5943,7 +5965,7 @@@ static void __mem_cgroup_clear_mc(void
/* we must uncharge all the leftover precharges from mc.to */ if (mc.precharge) { - __mem_cgroup_cancel_charge(mc.to, mc.precharge); + cancel_charge(mc.to, mc.precharge); mc.precharge = 0; } /* @@@ -6754,27 -5951,24 +5973,24 @@@ * we must uncharge here. */ if (mc.moved_charge) { - __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); + cancel_charge(mc.from, mc.moved_charge); mc.moved_charge = 0; } /* we must fixup refcnts and charges */ if (mc.moved_swap) { /* uncharge swap account from the old cgroup */ - if (!mem_cgroup_is_root(mc.from)) - res_counter_uncharge(&mc.from->memsw, - PAGE_SIZE * mc.moved_swap); + res_counter_uncharge(&mc.from->memsw, + PAGE_SIZE * mc.moved_swap);
for (i = 0; i < mc.moved_swap; i++) css_put(&mc.from->css);
- if (!mem_cgroup_is_root(mc.to)) { - /* - * we charged both to->res and to->memsw, so we should - * uncharge to->res. - */ - res_counter_uncharge(&mc.to->res, - PAGE_SIZE * mc.moved_swap); - } + /* + * we charged both to->res and to->memsw, so we should + * uncharge to->res. + */ + res_counter_uncharge(&mc.to->res, + PAGE_SIZE * mc.moved_swap); /* we've already done css_get(mc.to) */ mc.moved_swap = 0; } @@@ -7027,17 -6221,16 +6243,17 @@@ static void mem_cgroup_move_task(struc
/* * Cgroup retains root cgroups across [un]mount cycles making it necessary - * to verify sane_behavior flag on each mount attempt. + * to verify whether we're attached to the default hierarchy on each mount + * attempt. */ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) { /* - * use_hierarchy is forced with sane_behavior. cgroup core + * use_hierarchy is forced on the default hierarchy. cgroup core * guarantees that @root doesn't have any children, so turning it * on for the root memcg is enough. */ - if (cgroup_sane_behavior(root_css->cgroup)) + if (cgroup_on_dfl(root_css->cgroup)) mem_cgroup_from_css(root_css)->use_hierarchy = true; }
@@@ -7046,12 -6239,11 +6262,12 @@@ struct cgroup_subsys memory_cgrp_subsy .css_online = mem_cgroup_css_online, .css_offline = mem_cgroup_css_offline, .css_free = mem_cgroup_css_free, + .css_reset = mem_cgroup_css_reset, .can_attach = mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, .attach = mem_cgroup_move_task, .bind = mem_cgroup_bind, - .base_cftypes = mem_cgroup_files, + .legacy_cftypes = mem_cgroup_files, .early_init = 0, };
@@@ -7068,8 -6260,7 +6284,8 @@@ __setup("swapaccount=", enable_swap_acc
static void __init memsw_file_init(void) { - WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files)); + WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, + memsw_cgroup_files)); }
static void __init enable_swap_cgroup(void) @@@ -7086,6 -6277,403 +6302,403 @@@ static void __init enable_swap_cgroup(v } #endif
+ #ifdef CONFIG_MEMCG_SWAP + /** + * mem_cgroup_swapout - transfer a memsw charge to swap + * @page: page whose memsw charge to transfer + * @entry: swap entry to move the charge to + * + * Transfer the memsw charge of @page to @entry. + */ + void mem_cgroup_swapout(struct page *page, swp_entry_t entry) + { + struct page_cgroup *pc; + unsigned short oldid; + + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(page_count(page), page); + + if (!do_swap_account) + return; + + pc = lookup_page_cgroup(page); + + /* Readahead page, never charged */ + if (!PageCgroupUsed(pc)) + return; + + VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page); + + oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup)); + VM_BUG_ON_PAGE(oldid, page); + + pc->flags &= ~PCG_MEMSW; + css_get(&pc->mem_cgroup->css); + mem_cgroup_swap_statistics(pc->mem_cgroup, true); + } + + /** + * mem_cgroup_uncharge_swap - uncharge a swap entry + * @entry: swap entry to uncharge + * + * Drop the memsw charge associated with @entry. + */ + void mem_cgroup_uncharge_swap(swp_entry_t entry) + { + struct mem_cgroup *memcg; + unsigned short id; + + if (!do_swap_account) + return; + + id = swap_cgroup_record(entry, 0); + rcu_read_lock(); + memcg = mem_cgroup_lookup(id); + if (memcg) { + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + mem_cgroup_swap_statistics(memcg, false); + css_put(&memcg->css); + } + rcu_read_unlock(); + } + #endif + + /** + * mem_cgroup_try_charge - try charging a page + * @page: page to charge + * @mm: mm context of the victim + * @gfp_mask: reclaim mode + * @memcgp: charged memcg return + * + * Try to charge @page to the memcg that @mm belongs to, reclaiming + * pages according to @gfp_mask if necessary. + * + * Returns 0 on success, with *@memcgp pointing to the charged memcg. + * Otherwise, an error code is returned. + * + * After page->mapping has been set up, the caller must finalize the + * charge with mem_cgroup_commit_charge(). Or abort the transaction + * with mem_cgroup_cancel_charge() in case page instantiation fails. + */ + int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask, struct mem_cgroup **memcgp) + { + struct mem_cgroup *memcg = NULL; + unsigned int nr_pages = 1; + int ret = 0; + + if (mem_cgroup_disabled()) + goto out; + + if (PageSwapCache(page)) { + struct page_cgroup *pc = lookup_page_cgroup(page); + /* + * Every swap fault against a single page tries to charge the + * page, bail as early as possible. shmem_unuse() encounters + * already charged pages, too. The USED bit is protected by + * the page lock, which serializes swap cache removal, which + * in turn serializes uncharging. + */ + if (PageCgroupUsed(pc)) + goto out; + } + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + } + + if (do_swap_account && PageSwapCache(page)) + memcg = try_get_mem_cgroup_from_page(page); + if (!memcg) + memcg = get_mem_cgroup_from_mm(mm); + + ret = try_charge(memcg, gfp_mask, nr_pages); + + css_put(&memcg->css); + + if (ret == -EINTR) { + memcg = root_mem_cgroup; + ret = 0; + } + out: + *memcgp = memcg; + return ret; + } + + /** + * mem_cgroup_commit_charge - commit a page charge + * @page: page to charge + * @memcg: memcg to charge the page to + * @lrucare: page might be on LRU already + * + * Finalize a charge transaction started by mem_cgroup_try_charge(), + * after page->mapping has been set up. This must happen atomically + * as part of the page instantiation, i.e. 
under the page table lock + * for anonymous pages, under the page lock for page and swap cache. + * + * In addition, the page must not be on the LRU during the commit, to + * prevent racing with task migration. If it might be, use @lrucare. + * + * Use mem_cgroup_cancel_charge() to cancel the transaction instead. + */ + void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, + bool lrucare) + { + unsigned int nr_pages = 1; + + VM_BUG_ON_PAGE(!page->mapping, page); + VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); + + if (mem_cgroup_disabled()) + return; + /* + * Swap faults will attempt to charge the same page multiple + * times. But reuse_swap_page() might have removed the page + * from swapcache already, so we can't check PageSwapCache(). + */ + if (!memcg) + return; + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + } + + commit_charge(page, memcg, nr_pages, lrucare); + + if (do_swap_account && PageSwapCache(page)) { + swp_entry_t entry = { .val = page_private(page) }; + /* + * The swap entry might not get freed for a long time, + * let's not wait for it. The page already received a + * memory+swap charge, drop the swap entry duplicate. + */ + mem_cgroup_uncharge_swap(entry); + } + } + + /** + * mem_cgroup_cancel_charge - cancel a page charge + * @page: page to charge + * @memcg: memcg to charge the page to + * + * Cancel a charge transaction started by mem_cgroup_try_charge(). + */ + void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) + { + unsigned int nr_pages = 1; + + if (mem_cgroup_disabled()) + return; + /* + * Swap faults will attempt to charge the same page multiple + * times. But reuse_swap_page() might have removed the page + * from swapcache already, so we can't check PageSwapCache(). 
+ */ + if (!memcg) + return; + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + } + + cancel_charge(memcg, nr_pages); + } + + static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, + unsigned long nr_mem, unsigned long nr_memsw, + unsigned long nr_anon, unsigned long nr_file, + unsigned long nr_huge, struct page *dummy_page) + { + unsigned long flags; + + if (nr_mem) + res_counter_uncharge(&memcg->res, nr_mem * PAGE_SIZE); + if (nr_memsw) + res_counter_uncharge(&memcg->memsw, nr_memsw * PAGE_SIZE); + + memcg_oom_recover(memcg); + + local_irq_save(flags); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); + __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); + __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file); + memcg_check_events(memcg, dummy_page); + local_irq_restore(flags); + } + + static void uncharge_list(struct list_head *page_list) + { + struct mem_cgroup *memcg = NULL; + unsigned long nr_memsw = 0; + unsigned long nr_anon = 0; + unsigned long nr_file = 0; + unsigned long nr_huge = 0; + unsigned long pgpgout = 0; + unsigned long nr_mem = 0; + struct list_head *next; + struct page *page; + + next = page_list->next; + do { + unsigned int nr_pages = 1; + struct page_cgroup *pc; + + page = list_entry(next, struct page, lru); + next = page->lru.next; + + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(page_count(page), page); + + pc = lookup_page_cgroup(page); + if (!PageCgroupUsed(pc)) + continue; + + /* + * Nobody should be changing or seriously looking at + * pc->mem_cgroup and pc->flags at this point, we have + * fully exclusive access to the page. + */ + + if (memcg != pc->mem_cgroup) { + if (memcg) { + uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, + nr_anon, nr_file, nr_huge, page); + pgpgout = nr_mem = nr_memsw = 0; + nr_anon = nr_file = nr_huge = 0; + } + memcg = pc->mem_cgroup; + } + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + nr_huge += nr_pages; + } + + if (PageAnon(page)) + nr_anon += nr_pages; + else + nr_file += nr_pages; + + if (pc->flags & PCG_MEM) + nr_mem += nr_pages; + if (pc->flags & PCG_MEMSW) + nr_memsw += nr_pages; + pc->flags = 0; + + pgpgout++; + } while (next != page_list); + + if (memcg) + uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, + nr_anon, nr_file, nr_huge, page); + } + + /** + * mem_cgroup_uncharge - uncharge a page + * @page: page to uncharge + * + * Uncharge a page previously charged with mem_cgroup_try_charge() and + * mem_cgroup_commit_charge(). + */ + void mem_cgroup_uncharge(struct page *page) + { + struct page_cgroup *pc; + + if (mem_cgroup_disabled()) + return; + + /* Don't touch page->lru of any random page, pre-check: */ + pc = lookup_page_cgroup(page); + if (!PageCgroupUsed(pc)) + return; + + INIT_LIST_HEAD(&page->lru); + uncharge_list(&page->lru); + } + + /** + * mem_cgroup_uncharge_list - uncharge a list of page + * @page_list: list of pages to uncharge + * + * Uncharge a list of pages previously charged with + * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). 
+ */ + void mem_cgroup_uncharge_list(struct list_head *page_list) + { + if (mem_cgroup_disabled()) + return; + + if (!list_empty(page_list)) + uncharge_list(page_list); + } + + /** + * mem_cgroup_migrate - migrate a charge to another page + * @oldpage: currently charged page + * @newpage: page to transfer the charge to + * @lrucare: both pages might be on the LRU already + * + * Migrate the charge from @oldpage to @newpage. + * + * Both pages must be locked, @newpage->mapping must be set up. + */ + void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, + bool lrucare) + { + unsigned int nr_pages = 1; + struct page_cgroup *pc; + int isolated; + + VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); + VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage); + VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage); + VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); + + if (mem_cgroup_disabled()) + return; + + /* Page cache replacement: new page already charged? */ + pc = lookup_page_cgroup(newpage); + if (PageCgroupUsed(pc)) + return; + + /* Re-entrant migration: old page already uncharged? */ + pc = lookup_page_cgroup(oldpage); + if (!PageCgroupUsed(pc)) + return; + + VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage); + VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage); + + if (PageTransHuge(oldpage)) { + nr_pages <<= compound_order(oldpage); + VM_BUG_ON_PAGE(!PageTransHuge(oldpage), oldpage); + VM_BUG_ON_PAGE(!PageTransHuge(newpage), newpage); + } + + if (lrucare) + lock_page_lru(oldpage, &isolated); + + pc->flags = 0; + + if (lrucare) + unlock_page_lru(oldpage, isolated); + + local_irq_disable(); + mem_cgroup_charge_statistics(pc->mem_cgroup, oldpage, -nr_pages); + memcg_check_events(pc->mem_cgroup, oldpage); + local_irq_enable(); + + commit_charge(newpage, pc->mem_cgroup, nr_pages, lrucare); + } + /* * subsys_initcall() for memory controller. * diff --combined mm/shmem.c index 0f01800,6dc80d2..5909f29 --- a/mm/shmem.c +++ b/mm/shmem.c @@@ -149,6 -149,19 +149,19 @@@ static inline void shmem_unacct_size(un vm_unacct_memory(VM_ACCT(size)); }
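Taken together, the kernel-doc above describes a three-step protocol: try_charge first, then either commit or cancel once page->mapping is settled. A condensed caller sketch, modelled on the shmem_getpage_gfp() conversion later in this diff (add_to_cache() is a hypothetical stand-in for the real insertion step):

	struct mem_cgroup *memcg;
	int error;

	error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
	if (error)
		return error;

	error = add_to_cache(page, mapping, index);	/* hypothetical step */
	if (error) {
		mem_cgroup_cancel_charge(page, memcg);
		return error;
	}

	mem_cgroup_commit_charge(page, memcg, false);	/* page not on LRU yet */
	lru_cache_add_anon(page);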
+ static inline int shmem_reacct_size(unsigned long flags, + loff_t oldsize, loff_t newsize) + { + if (!(flags & VM_NORESERVE)) { + if (VM_ACCT(newsize) > VM_ACCT(oldsize)) + return security_vm_enough_memory_mm(current->mm, + VM_ACCT(newsize) - VM_ACCT(oldsize)); + else if (VM_ACCT(newsize) < VM_ACCT(oldsize)) + vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize)); + } + return 0; + } + /* * ... whereas tmpfs objects are accounted incrementally as * pages are allocated, in order to allow huge sparse files. @@@ -280,7 -293,7 +293,7 @@@ static bool shmem_confirm_swap(struct a */ static int shmem_add_to_page_cache(struct page *page, struct address_space *mapping, - pgoff_t index, gfp_t gfp, void *expected) + pgoff_t index, void *expected) { int error;
@@@ -406,7 -419,6 +419,6 @@@ static void shmem_undo_range(struct ino pvec.pages, indices); if (!pvec.nr) break; - mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i];
@@@ -434,7 -446,6 +446,6 @@@ } pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); - mem_cgroup_uncharge_end(); cond_resched(); index++; } @@@ -482,7 -493,6 +493,6 @@@ index = start; continue; } - mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i];
@@@ -518,7 -528,6 +528,6 @@@ } pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); - mem_cgroup_uncharge_end(); index++; }
@@@ -549,6 -558,10 +558,10 @@@ static int shmem_setattr(struct dentry loff_t newsize = attr->ia_size;
if (newsize != oldsize) { + error = shmem_reacct_size(SHMEM_I(inode)->flags, + oldsize, newsize); + if (error) + return error; i_size_write(inode, newsize); inode->i_ctime = inode->i_mtime = CURRENT_TIME; } @@@ -604,7 -617,7 +617,7 @@@ static int shmem_unuse_inode(struct shm radswap = swp_to_radix_entry(swap); index = radix_tree_locate_item(&mapping->page_tree, radswap); if (index == -1) - return 0; + return -EAGAIN; /* tell shmem_unuse we found nothing */
/* * Move _head_ to start search for next from here. @@@ -649,7 -662,7 +662,7 @@@ */ if (!error) error = shmem_add_to_page_cache(*pagep, mapping, index, - GFP_NOWAIT, radswap); + radswap); if (error != -ENOMEM) { /* * Truncation and eviction use free_swap_and_cache(), which @@@ -663,7 -676,6 +676,6 @@@ spin_unlock(&info->lock); swap_free(swap); } - error = 1; /* not an error, but entry was found */ } return error; } @@@ -675,7 -687,7 +687,7 @@@ int shmem_unuse(swp_entry_t swap, struc { struct list_head *this, *next; struct shmem_inode_info *info; - int found = 0; + struct mem_cgroup *memcg; int error = 0;
/* @@@ -690,26 -702,32 +702,32 @@@ * the shmem_swaplist_mutex which might hold up shmem_writepage(). * Charged back to the user (not to caller) when swap account is used. */ - error = mem_cgroup_charge_file(page, current->mm, GFP_KERNEL); + error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg); if (error) goto out; /* No radix_tree_preload: swap entry keeps a place for page in tree */ + error = -EAGAIN;
mutex_lock(&shmem_swaplist_mutex); list_for_each_safe(this, next, &shmem_swaplist) { info = list_entry(this, struct shmem_inode_info, swaplist); if (info->swapped) - found = shmem_unuse_inode(info, swap, &page); + error = shmem_unuse_inode(info, swap, &page); else list_del_init(&info->swaplist); cond_resched(); - if (found) + if (error != -EAGAIN) break; + /* found nothing in this: move on to search the next */ } mutex_unlock(&shmem_swaplist_mutex);
- if (found < 0) - error = found; + if (error) { + if (error != -ENOMEM) + error = 0; + mem_cgroup_cancel_charge(page, memcg); + } else + mem_cgroup_commit_charge(page, memcg, true); out: unlock_page(page); page_cache_release(page); @@@ -813,7 -831,7 +831,7 @@@ static int shmem_writepage(struct page }
mutex_unlock(&shmem_swaplist_mutex); - swapcache_free(swap, NULL); + swapcache_free(swap); redirty: set_page_dirty(page); if (wbc->for_reclaim) @@@ -986,7 -1004,7 +1004,7 @@@ static int shmem_replace_page(struct pa */ oldpage = newpage; } else { - mem_cgroup_replace_page_cache(oldpage, newpage); + mem_cgroup_migrate(oldpage, newpage, false); lru_cache_add_anon(newpage); *pagep = newpage; } @@@ -1013,6 -1031,7 +1031,7 @@@ static int shmem_getpage_gfp(struct ino struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info; struct shmem_sb_info *sbinfo; + struct mem_cgroup *memcg; struct page *page; swp_entry_t swap; int error; @@@ -1091,11 -1110,10 +1110,10 @@@ repeat goto failed; }
- error = mem_cgroup_charge_file(page, current->mm, - gfp & GFP_RECLAIM_MASK); + error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, - gfp, swp_to_radix_entry(swap)); + swp_to_radix_entry(swap)); /* * We already confirmed swap under page lock, and make * no memory allocation here, so usually no possibility @@@ -1108,12 -1126,16 +1126,16 @@@ * Reset swap.val? No, leave it so "failed" goes back to * "repeat": reading a hole and writing should succeed. */ - if (error) + if (error) { + mem_cgroup_cancel_charge(page, memcg); delete_from_swap_cache(page); + } } if (error) goto failed;
+ mem_cgroup_commit_charge(page, memcg, true); + spin_lock(&info->lock); info->swapped--; shmem_recalc_inode(inode); @@@ -1149,22 -1171,22 +1171,22 @@@ __SetPageSwapBacked(page); __set_page_locked(page); if (sgp == SGP_WRITE) - init_page_accessed(page); + __SetPageReferenced(page);
- error = mem_cgroup_charge_file(page, current->mm, - gfp & GFP_RECLAIM_MASK); + error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg); if (error) goto decused; error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, - gfp, NULL); + NULL); radix_tree_preload_end(); } if (error) { - mem_cgroup_uncharge_cache_page(page); + mem_cgroup_cancel_charge(page, memcg); goto decused; } + mem_cgroup_commit_charge(page, memcg, false); lru_cache_add_anon(page);
spin_lock(&info->lock); @@@ -2048,45 -2070,17 +2070,45 @@@ static int shmem_rmdir(struct inode *di return shmem_unlink(dir, dentry); }
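The two commit calls in shmem_getpage_gfp() above illustrate the lrucare flag from the new API's kernel-doc: pass false only when the page cannot yet be on the LRU. A compact restatement, reusing the names from this diff:

	/* freshly allocated page, not yet visible to the LRU */
	mem_cgroup_commit_charge(page, memcg, false);

	/* swap-cache page that may already sit on the LRU */
	mem_cgroup_commit_charge(page, memcg, true);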
+static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) +{ + bool old_is_dir = S_ISDIR(old_dentry->d_inode->i_mode); + bool new_is_dir = S_ISDIR(new_dentry->d_inode->i_mode); + + if (old_dir != new_dir && old_is_dir != new_is_dir) { + if (old_is_dir) { + drop_nlink(old_dir); + inc_nlink(new_dir); + } else { + drop_nlink(new_dir); + inc_nlink(old_dir); + } + } + old_dir->i_ctime = old_dir->i_mtime = + new_dir->i_ctime = new_dir->i_mtime = + old_dentry->d_inode->i_ctime = + new_dentry->d_inode->i_ctime = CURRENT_TIME; + + return 0; +} + /* * The VFS layer already does all the dentry stuff for rename, * we just have to decrement the usage count for the target if * it exists so that the VFS layer correctly free's it when it * gets overwritten. */ -static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) +static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode *inode = old_dentry->d_inode; int they_are_dirs = S_ISDIR(inode->i_mode);
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + if (flags & RENAME_EXCHANGE) + return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry); + if (!simple_empty(new_dentry)) return -ENOTEMPTY;
@@@ -2769,7 -2763,7 +2791,7 @@@ static const struct inode_operations sh .mkdir = shmem_mkdir, .rmdir = shmem_rmdir, .mknod = shmem_mknod, - .rename = shmem_rename, + .rename2 = shmem_rename2, .tmpfile = shmem_tmpfile, #endif #ifdef CONFIG_TMPFS_XATTR @@@ -2960,16 -2954,16 +2982,16 @@@ static struct file *__shmem_file_setup( this.len = strlen(name); this.hash = 0; /* will go */ sb = shm_mnt->mnt_sb; + path.mnt = mntget(shm_mnt); path.dentry = d_alloc_pseudo(sb, &this); if (!path.dentry) goto put_memory; d_set_d_op(path.dentry, &anon_ops); - path.mnt = mntget(shm_mnt);
res = ERR_PTR(-ENOSPC); inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); if (!inode) - goto put_dentry; + goto put_memory;
inode->i_flags |= i_flags; d_instantiate(path.dentry, inode); @@@ -2977,19 -2971,19 +2999,19 @@@ clear_nlink(inode); /* It is unlinked */ res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); if (IS_ERR(res)) - goto put_dentry; + goto put_path;
res = alloc_file(&path, FMODE_WRITE | FMODE_READ, &shmem_file_operations); if (IS_ERR(res)) - goto put_dentry; + goto put_path;
return res;
- put_dentry: - path_put(&path); put_memory: shmem_unacct_size(flags, size); + put_path: + path_put(&path); return res; }
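The relabelled error path in __shmem_file_setup() is the standard goto-unwind idiom: take references in one order, release them under labels in the reverse order, so a single path_put() can drop both the dentry and the mount reference taken earlier. A self-contained user-space analogue (assumption: plain malloc/free stand in for the kernel objects):

#include <stdio.h>
#include <stdlib.h>

static int setup(void)
{
	char *a, *b;
	int ret = -1;

	a = malloc(16);		/* acquired first ... */
	if (!a)
		goto out;
	b = malloc(16);
	if (!b)
		goto free_a;

	printf("both allocations succeeded\n");
	ret = 0;
	free(b);
free_a:				/* ... released last */
	free(a);
out:
	return ret;
}

int main(void)
{
	return setup() ? EXIT_FAILURE : EXIT_SUCCESS;
}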
diff --combined net/bridge/br_multicast.c index b4845f4,d9c4f57..7751c92 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@@ -1174,7 -1174,7 +1174,7 @@@ static void br_multicast_add_router(str }
if (slot) - hlist_add_after_rcu(slot, &port->rlist); + hlist_add_behind_rcu(&port->rlist, slot); else hlist_add_head_rcu(&port->rlist, &br->router_list); } @@@ -2216,43 -2216,6 +2216,43 @@@ unlock EXPORT_SYMBOL_GPL(br_multicast_list_adjacent);
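The one-line change above swaps both the helper's name and its argument order: the old hlist_add_after_rcu(prev, n) took the existing node first, while hlist_add_behind_rcu(n, prev) takes the node being inserted first. A usage reminder, reusing the identifiers from the hunk:

	/* insert port->rlist immediately after the existing node "slot" */
	hlist_add_behind_rcu(&port->rlist, slot);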
/** + * br_multicast_has_querier_anywhere - Checks for a querier on a bridge + * @dev: The bridge port providing the bridge on which to check for a querier + * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6 + * + * Checks whether the given interface has a bridge on top and if so returns + * true if a valid querier exists anywhere on the bridged link layer. + * Otherwise returns false. + */ +bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto) +{ + struct net_bridge *br; + struct net_bridge_port *port; + struct ethhdr eth; + bool ret = false; + + rcu_read_lock(); + if (!br_port_exists(dev)) + goto unlock; + + port = br_port_get_rcu(dev); + if (!port || !port->br) + goto unlock; + + br = port->br; + + memset(ð, 0, sizeof(eth)); + eth.h_proto = htons(proto); + + ret = br_multicast_querier_exists(br, ð); + +unlock: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(br_multicast_has_querier_anywhere); + +/** * br_multicast_has_querier_adjacent - Checks for a querier behind a bridge port * @dev: The bridge port adjacent to which to check for a querier * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6