The following commit has been merged in the master branch:
commit d653b0b3717843efa3e72bdcaf37b2a0b598eea0
Merge: 3f607a5176e202974afb3dbb31d0f37cb2f4551f 76e1488440013a0d737fbb9d1f8efe226138f7f0
Author: Stephen Rothwell <sfr(a)canb.auug.org.au>
Date: Wed Jul 30 20:08:38 2014 +1000
Merge branch 'akpm-current/current'
Conflicts:
arch/arm64/Kconfig
mm/shmem.c
diff --combined Documentation/devicetree/bindings/i2c/trivial-devices.txt
index 37803eb,c75046a..6af570e
--- a/Documentation/devicetree/bindings/i2c/trivial-devices.txt
+++ b/Documentation/devicetree/bindings/i2c/trivial-devices.txt
@@@ -50,7 -50,6 +50,7 @@@ epson,rx8581 I2C-BUS INTERFACE REAL TI
fsl,mag3110 MAG3110: Xtrinsic High Accuracy, 3D Magnetometer
fsl,mc13892 MC13892: Power Management Integrated Circuit (PMIC) for i.MX35/51
fsl,mma8450 MMA8450Q: Xtrinsic Low-power, 3-axis Xtrinsic Accelerometer
+fsl,mma8452 MMA8452Q: 3-axis 12-bit / 8-bit Digital Accelerometer
fsl,mpr121 MPR121: Proximity Capacitive Touch Sensor Controller
fsl,sgtl5000 SGTL5000: Ultra Low-Power Audio Codec
gmt,g751 G751: Digital Temperature Sensor and Thermal Watchdog with Two-Wire Interface
@@@ -70,6 -69,7 +70,7 @@@ nuvoton,npct501 i2c trusted platform m
nxp,pca9556 Octal SMBus and I2C registered interface
nxp,pca9557 8-bit I2C-bus and SMBus I/O port with reset
nxp,pcf8563 Real-time clock/calendar
+ nxp,pcf85063 Tiny Real-Time Clock
ovti,ov5642 OV5642: Color CMOS QSXGA (5-megapixel) Image Sensor with OmniBSI and
Embedded TrueFocus
pericom,pt7c4338 Real-time Clock Module
plx,pex8648 48-Lane, 12-Port PCI Express Gen 2 (5.0 GT/s) Switch
@@@ -84,6 -84,5 +85,6 @@@ stm,m41t80 M41T80 - SERIAL ACCESS RTC
taos,tsl2550 Ambient Light Sensor with SMBUS/Two Wire Serial Interface
ti,tsc2003 I2C Touch-Screen Controller
ti,tmp102 Low Power Digital Temperature Sensor with SMBUS/Two Wire Serial Interface
+ti,tmp103 Low Power Digital Temperature Sensor with SMBUS/Two Wire Serial Interface
ti,tmp275 Digital Temperature Sensor
winbond,wpct301 i2c trusted platform module (TPM)
diff --combined Documentation/kernel-parameters.txt
index d2fc335,6824f37..f1d8047
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@@ -566,11 -566,6 +566,11 @@@ bytes respectively. Such letter suffixe
possible to determine what the correct size should be.
This option provides an override for these situations.
+ ca_keys= [KEYS] This parameter identifies a specific key(s) on
+ the system trusted keyring to be used for certificate
+ trust validation.
+ format: { id:<keyid> | builtin }
+
ccw_timeout_log [S390]
See Documentation/s390/CommonIO for details.
@@@ -1102,12 -1097,6 +1102,12 @@@
that can be changed at run time by the
set_graph_function file in the debugfs tracing directory.
+ ftrace_graph_notrace=[function-list]
+ [FTRACE] Do not trace from the functions specified in
+ function-list. This list is a comma separated list of
+ functions that can be changed at run time by the
+ set_graph_notrace file in the debugfs tracing directory.
+
gamecon.map[2|3]=
[HW,JOY] Multisystem joystick and NES/SNES/PSX pad
support via parallel port (up to 5 devices per port)
@@@ -1324,23 -1313,6 +1324,23 @@@
Formats: { "ima" | "ima-ng" }
Default: "ima-ng"
+ ima.ahash_minsize= [IMA] Minimum file size for asynchronous hash usage
+ Format: <min_file_size>
+ Set the minimal file size for using asynchronous hash.
+ If left unspecified, ahash usage is disabled.
+
+ ahash performance varies for different data sizes on
+ different crypto accelerators. This option can be used
+ to achieve the best performance for a particular HW.
+
+ ima.ahash_bufsize= [IMA] Asynchronous hash buffer size
+ Format: <bufsize>
+ Set hashing buffer size. Default: 4k.
+
+ ahash performance varies for different chunk sizes on
+ different crypto accelerators. This option can be used
+ to achieve the best performance for a particular HW.
+
init= [KNL]
Format: <full_path>
Run specified binary instead of /sbin/init as init
@@@ -1444,6 -1416,10 +1444,6 @@@
ip= [IP_PNP]
See Documentation/filesystems/nfs/nfsroot.txt.
- ip2= [HW] Set IO/IRQ pairs for up to 4 IntelliPort boards
- See comment before ip2_setup() in
- drivers/char/ip2/ip2base.c.
-
irqfixup [HW]
When an interrupt is not handled search all handlers
for it. Intended to get systems with badly broken
@@@ -1716,8 -1692,12 +1716,12 @@@
7 (KERN_DEBUG) debug-level messages
log_buf_len=n[KMG] Sets the size of the printk ring buffer,
- in bytes. n must be a power of two. The default
- size is set in the kernel config file.
+ in bytes. n must be a power of two and greater
+ than the minimal size. The minimal size is defined
+ by LOG_BUF_SHIFT kernel config parameter. There is
+ also CONFIG_LOG_CPU_MAX_BUF_SHIFT config parameter
+ that allows increasing the default size depending on
+ the number of CPUs. See init/Kconfig for more details.
logo.nologo [FB] Disables display of the built-in Linux logo.
This may be used to provide more screen space for
@@@ -2190,21 -2170,6 +2194,21 @@@
and restore using xsave. The kernel will fallback to
enabling legacy floating-point and sse state.
+ noxsaveopt [X86] Disables xsaveopt used in saving x86 extended
+ register states. The kernel will fall back to use
+ xsave to save the states. By using this parameter,
+ performance of saving the states is degraded because
+ xsave doesn't support modified optimization while
+ xsaveopt supports it on xsaveopt enabled systems.
+
+ noxsaves [X86] Disables xsaves and xrstors used in saving and
+ restoring x86 extended register state in compacted
+ form of xsave area. The kernel will fall back to use
+ xsaveopt and xrstor to save and restore the states
+ in standard form of xsave area. By using this
+ parameter, xsave area per process might occupy more
+ memory on xsaves enabled systems.
+
eagerfpu= [X86]
on enable eager fpu restore
off disable eager fpu restore
@@@ -2846,13 -2811,6 +2850,13 @@@
quiescent states. Units are jiffies, minimum
value is one, and maximum value is HZ.
+ rcutree.rcu_nocb_leader_stride= [KNL]
+ Set the number of NOCB kthread groups, which
+ defaults to the square root of the number of
+ CPUs. Larger numbers reduce the wakeup overhead
+ on the per-CPU grace-period kthreads, but increase
+ that same overhead on each group's leader.
+
rcutree.qhimark= [KNL]
Set threshold of queued RCU callbacks beyond which
batch limiting is disabled.
@@@ -3069,13 -3027,6 +3073,13 @@@
S [KNL] Run init in single mode
+ s390_iommu= [HW,S390]
+ Set s390 IOTLB flushing mode
+ strict
+ With strict flushing every unmap operation will result in
+ an IOTLB flush. Default is lazy flushing before reuse,
+ which is faster.
+
sa1100ir [NET]
See drivers/net/irda/sa1100_ir.c.
@@@ -3750,10 -3701,6 +3754,10 @@@
Disables the ticketlock slowpath using Xen PV
optimizations.
+ xen_nopv [X86]
+ Disables the PV optimizations forcing the HVM guest to
+ run as generic HVM guest with no PV drivers.
+
xirc2ps_cs= [NET,PCMCIA]
Format:
<irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
diff --combined Makefile
index 5def5e8,a4b34fe..c837e9a
--- a/Makefile
+++ b/Makefile
@@@ -1,7 -1,7 +1,7 @@@
VERSION = 3
PATCHLEVEL = 16
SUBLEVEL = 0
-EXTRAVERSION = -rc6
+EXTRAVERSION = -rc7
NAME = Shuffling Zombie Juror
# *DOCUMENTATION*
@@@ -360,14 -360,9 +360,14 @@@ include $(srctree)/scripts/Kbuild.inclu
# Make variables (CC, etc...)
AS = $(CROSS_COMPILE)as
LD = $(CROSS_COMPILE)ld
+LDFINAL = $(LD)
CC = $(CROSS_COMPILE)gcc
CPP = $(CC) -E
+ifdef CONFIG_LTO
+AR = $(CROSS_COMPILE)gcc-ar
+else
AR = $(CROSS_COMPILE)ar
+endif
NM = $(CROSS_COMPILE)nm
STRIP = $(CROSS_COMPILE)strip
OBJCOPY = $(CROSS_COMPILE)objcopy
@@@ -377,7 -372,6 +377,7 @@@ GENKSYMS = scripts/genksyms/genksym
INSTALLKERNEL := installkernel
DEPMOD = /sbin/depmod
PERL = perl
+PYTHON = python
CHECK = sparse
CHECKFLAGS := -D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ \
@@@ -427,8 -421,8 +427,8 @@@ KERNELVERSION = $(VERSION)$(if $(PATCHL
export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION
export ARCH SRCARCH CONFIG_SHELL HOSTCC HOSTCFLAGS CROSS_COMPILE AS LD CC
-export CPP AR NM STRIP OBJCOPY OBJDUMP
-export MAKE AWK GENKSYMS INSTALLKERNEL PERL UTS_MACHINE
+export CPP AR NM STRIP OBJCOPY OBJDUMP LDFINAL
+export MAKE AWK GENKSYMS INSTALLKERNEL PERL PYTHON UTS_MACHINE
export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS
export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS
@@@ -438,17 -432,6 +438,17 @@@ export KBUILD_AFLAGS_MODULE KBUILD_CFLA
export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL
export KBUILD_ARFLAGS
+ifdef CONFIG_LTO
+# LTO gcc creates a lot of files in TMPDIR, and with /tmp as tmpfs
+# it's easy to drive the machine OOM. Use the object directory
+# instead.
+ifndef TMPDIR
+TMPDIR ?= $(objtree)
+export TMPDIR
+$(info setting TMPDIR=$(objtree) for LTO build)
+endif
+endif
+
# When compiling out-of-tree modules, put MODVERDIR in the module
# tree rather than in the kernel tree. The kernel tree might
# even be read-only.
@@@ -638,6 -621,9 +638,9 @@@ els
KBUILD_CFLAGS += -O2
endif
+ # Tell gcc to never replace conditional load with a non-conditional one
+ KBUILD_CFLAGS += $(call cc-option,--param=allow-store-data-races=0)
+
ifdef CONFIG_READABLE_ASM
# Disable optimizations that make assembler listings hard to read.
# reorder blocks reorders the control in the function
@@@ -653,6 -639,22 +656,22 @@@ KBUILD_CFLAGS += $(call cc-option,-Wfra
endif
# Handle stack protector mode.
+ #
+ # Since kbuild can potentially perform two passes (first with the old
+ # .config values and then with updated .config values), we cannot error out
+ # if a desired compiler option is unsupported. If we were to error, kbuild
+ # could never get to the second pass and actually notice that we changed
+ # the option to something that was supported.
+ #
+ # Additionally, we don't want to fallback and/or silently change which compiler
+ # flags will be used, since that leads to producing kernels with different
+ # security feature characteristics depending on the compiler used. ("But I
+ # selected CC_STACKPROTECTOR_STRONG! Why did it build with _REGULAR?!")
+ #
+ # The middle ground is to warn here so that the failed option is obvious, but
+ # to let the build fail with bad compiler flags so that we can't produce a
+ # kernel when there is a CONFIG and compiler mismatch.
+ #
ifdef CONFIG_CC_STACKPROTECTOR_REGULAR
stackp-flag := -fstack-protector
ifeq ($(call cc-option, $(stackp-flag)),)
@@@ -705,8 -707,6 +724,8 @@@ KBUILD_CFLAGS += -fomit-frame-pointe
endif
endif
+KBUILD_CFLAGS += $(call cc-option, -fno-var-tracking-assignments)
+
ifdef CONFIG_DEBUG_INFO
KBUILD_CFLAGS += -g
KBUILD_AFLAGS += -Wa,-gdwarf-2
@@@ -770,7 -770,6 +789,7 @@@ ifeq ($(shell $(CONFIG_SHELL) $(srctree
endif
include $(srctree)/scripts/Makefile.extrawarn
+include ${srctree}/scripts/Makefile.lto
# Add user supplied CPPFLAGS, AFLAGS and CFLAGS as the last assignments
KBUILD_CPPFLAGS += $(KCPPFLAGS)
@@@ -1240,9 -1239,9 +1259,9 @@@ help
@echo ' tags/TAGS - Generate tags file for editors'
@echo ' cscope - Generate cscope index'
@echo ' gtags - Generate GNU GLOBAL index'
- @echo ' kernelrelease - Output the release version string'
- @echo ' kernelversion - Output the version stored in Makefile'
- @echo ' image_name - Output the image name'
+ @echo ' kernelrelease - Output the release version string (use with make -s)'
+ @echo ' kernelversion - Output the version stored in Makefile (use with make -s)'
+ @echo ' image_name - Output the image name (use with make -s)'
@echo ' headers_install - Install sanitised kernel headers to INSTALL_HDR_PATH'; \
echo ' (default: $(INSTALL_HDR_PATH))'; \
echo ''
diff --combined arch/arm/Kconfig
index 05e2e94,551e526..9c45bc5
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@@ -84,6 -84,7 +84,7 @@@ config AR
<http://www.arm.linux.org.uk/>.
config ARM_HAS_SG_CHAIN
+ select ARCH_HAS_SG_CHAIN
bool
config NEED_SG_DMA_LENGTH
@@@ -240,6 -241,13 +241,6 @@@ config ARM_PATCH_PHYS_VIR
this feature (eg, building a kernel for a single machine) and
you need to shrink the kernel to the minimal size.
-config NEED_MACH_GPIO_H
- bool
- help
- Select this when mach/gpio.h is required to provide special
- definitions for this platform. The need for mach/gpio.h should
- be avoided when possible.
-
config NEED_MACH_IO_H
bool
help
@@@ -256,22 -264,8 +257,22 @@@ config NEED_MACH_MEMORY_
config PHYS_OFFSET
hex "Physical address of main memory" if MMU
- depends on !ARM_PATCH_PHYS_VIRT && !NEED_MACH_MEMORY_H
+ depends on !ARM_PATCH_PHYS_VIRT
default DRAM_BASE if !MMU
+ default 0x00000000 if ARCH_EBSA110 || \
+ EP93XX_SDCE3_SYNC_PHYS_OFFSET || \
+ ARCH_FOOTBRIDGE || \
+ ARCH_INTEGRATOR || \
+ ARCH_IOP13XX || \
+ ARCH_KS8695 || \
+ (ARCH_REALVIEW && !REALVIEW_HIGH_PHYS_OFFSET)
+ default 0x10000000 if ARCH_OMAP1 || ARCH_RPC
+ default 0x20000000 if ARCH_S5PV210
+ default 0x70000000 if REALVIEW_HIGH_PHYS_OFFSET
+ default 0xc0000000 if EP93XX_SDCE0_PHYS_OFFSET || ARCH_SA1100
+ default 0xd0000000 if EP93XX_SDCE1_PHYS_OFFSET
+ default 0xe0000000 if EP93XX_SDCE2_PHYS_OFFSET
+ default 0xf0000000 if EP93XX_SDCE3_ASYNC_PHYS_OFFSET
help
Please provide the physical address corresponding to the
location of main memory in your system.
@@@ -320,7 -314,7 +321,7 @@@ config ARCH_MULTIPLATFOR
config ARCH_INTEGRATOR
bool "ARM Ltd. Integrator family"
select ARM_AMBA
- select ARM_PATCH_PHYS_VIRT
+ select ARM_PATCH_PHYS_VIRT if MMU
select AUTO_ZRELADDR
select COMMON_CLK
select COMMON_CLK_VERSATILE
@@@ -328,6 -322,7 +329,6 @@@
select HAVE_TCM
select ICST
select MULTI_IRQ_HANDLER
- select NEED_MACH_MEMORY_H
select PLAT_VERSATILE
select SPARSE_IRQ
select USE_OF
@@@ -347,6 -342,7 +348,6 @@@ config ARCH_REALVIE
select ICST
select NEED_MACH_MEMORY_H
select PLAT_VERSATILE
- select PLAT_VERSATILE_CLCD
help
This enables support for ARM Ltd RealView boards.
@@@ -361,6 -357,7 +362,6 @@@ config ARCH_VERSATIL
select HAVE_MACH_CLKDEV
select ICST
select PLAT_VERSATILE
- select PLAT_VERSATILE_CLCD
select PLAT_VERSATILE_CLOCK
select VERSATILE_FPGA_IRQ
help
@@@ -440,6 -437,7 +441,6 @@@ config ARCH_EP93X
select ARM_VIC
select CLKDEV_LOOKUP
select CPU_ARM920T
- select NEED_MACH_MEMORY_H
help
This enables support for the Cirrus EP93xx series of CPUs.
@@@ -532,6 -530,21 +533,6 @@@ config ARCH_DOV
help
Support for the Marvell Dove SoC 88AP510
-config ARCH_KIRKWOOD
- bool "Marvell Kirkwood"
- select ARCH_REQUIRE_GPIOLIB
- select CPU_FEROCEON
- select GENERIC_CLOCKEVENTS
- select MVEBU_MBUS
- select PCI
- select PCI_QUIRKS
- select PINCTRL
- select PINCTRL_KIRKWOOD
- select PLAT_ORION_LEGACY
- help
- Support for the following Marvell Kirkwood series SoCs:
- 88F6180, 88F6192 and 88F6281.
-
config ARCH_MV78XX0
bool "Marvell MV78xx0"
select ARCH_REQUIRE_GPIOLIB
@@@ -623,7 -636,6 +624,7 @@@ config ARCH_PX
select AUTO_ZRELADDR
select CLKDEV_LOOKUP
select CLKSRC_MMIO
+ select CLKSRC_OF
select GENERIC_CLOCKEVENTS
select GPIO_PXA
select HAVE_IDE
@@@ -648,7 -660,7 +649,7 @@@ config ARCH_MS
config ARCH_SHMOBILE_LEGACY
bool "Renesas ARM SoCs (non-multiplatform)"
select ARCH_SHMOBILE
- select ARM_PATCH_PHYS_VIRT
+ select ARM_PATCH_PHYS_VIRT if MMU
select CLKDEV_LOOKUP
select GENERIC_CLOCKEVENTS
select HAVE_ARM_SCU if SMP
@@@ -748,6 -760,61 +749,6 @@@ config ARCH_S3C64X
help
Samsung S3C64XX series based systems
-config ARCH_S5P64X0
- bool "Samsung S5P6440 S5P6450"
- select ATAGS
- select CLKDEV_LOOKUP
- select CLKSRC_SAMSUNG_PWM
- select CPU_V6
- select GENERIC_CLOCKEVENTS
- select GPIO_SAMSUNG
- select HAVE_S3C2410_I2C if I2C
- select HAVE_S3C2410_WATCHDOG if WATCHDOG
- select HAVE_S3C_RTC if RTC_CLASS
- select NEED_MACH_GPIO_H
- select SAMSUNG_ATAGS
- select SAMSUNG_WDT_RESET
- help
- Samsung S5P64X0 CPU based systems, such as the Samsung SMDK6440,
- SMDK6450.
-
-config ARCH_S5PC100
- bool "Samsung S5PC100"
- select ARCH_REQUIRE_GPIOLIB
- select ATAGS
- select CLKDEV_LOOKUP
- select CLKSRC_SAMSUNG_PWM
- select CPU_V7
- select GENERIC_CLOCKEVENTS
- select GPIO_SAMSUNG
- select HAVE_S3C2410_I2C if I2C
- select HAVE_S3C2410_WATCHDOG if WATCHDOG
- select HAVE_S3C_RTC if RTC_CLASS
- select NEED_MACH_GPIO_H
- select SAMSUNG_ATAGS
- select SAMSUNG_WDT_RESET
- help
- Samsung S5PC100 series based systems
-
-config ARCH_S5PV210
- bool "Samsung S5PV210/S5PC110"
- select ARCH_HAS_HOLES_MEMORYMODEL
- select ARCH_SPARSEMEM_ENABLE
- select ATAGS
- select CLKDEV_LOOKUP
- select CLKSRC_SAMSUNG_PWM
- select CPU_V7
- select GENERIC_CLOCKEVENTS
- select GPIO_SAMSUNG
- select HAVE_S3C2410_I2C if I2C
- select HAVE_S3C2410_WATCHDOG if WATCHDOG
- select HAVE_S3C_RTC if RTC_CLASS
- select NEED_MACH_GPIO_H
- select NEED_MACH_MEMORY_H
- select SAMSUNG_ATAGS
- help
- Samsung S5PV210/S5PC110 series based systems
-
config ARCH_DAVINCI
bool "TI DaVinci"
select ARCH_HAS_HOLES_MEMORYMODEL
@@@ -886,6 -953,8 +887,6 @@@ source "arch/arm/mach-ixp4xx/Kconfig
source "arch/arm/mach-keystone/Kconfig"
-source "arch/arm/mach-kirkwood/Kconfig"
-
source "arch/arm/mach-ks8695/Kconfig"
source "arch/arm/mach-msm/Kconfig"
@@@ -896,8 -965,6 +897,8 @@@ source "arch/arm/mach-mv78xx0/Kconfig
source "arch/arm/mach-imx/Kconfig"
+source "arch/arm/mach-mediatek/Kconfig"
+
source "arch/arm/mach-mxs/Kconfig"
source "arch/arm/mach-netx/Kconfig"
@@@ -939,6 -1006,10 +940,6 @@@ source "arch/arm/mach-s3c24xx/Kconfig
source "arch/arm/mach-s3c64xx/Kconfig"
-source "arch/arm/mach-s5p64x0/Kconfig"
-
-source "arch/arm/mach-s5pc100/Kconfig"
-
source "arch/arm/mach-s5pv210/Kconfig"
source "arch/arm/mach-exynos/Kconfig"
@@@ -1485,12 -1556,10 +1486,12 @@@ config ARM_PSC
config ARCH_NR_GPIO
int
default 1024 if ARCH_SHMOBILE || ARCH_TEGRA
- default 512 if ARCH_EXYNOS || ARCH_KEYSTONE || SOC_OMAP5 || SOC_DRA7XX || ARCH_S3C24XX || ARCH_S3C64XX
+ default 512 if ARCH_EXYNOS || ARCH_KEYSTONE || SOC_OMAP5 || \
+ SOC_DRA7XX || ARCH_S3C24XX || ARCH_S3C64XX || ARCH_S5PV210
default 416 if ARCH_SUNXI
default 392 if ARCH_U8500
default 352 if ARCH_VT8500
+ default 288 if ARCH_ROCKCHIP
default 264 if MACH_H4700
default 0
help
@@@ -1502,7 -1571,7 +1503,7 @@@ source kernel/Kconfig.preemp
config HZ_FIXED
int
- default 200 if ARCH_EBSA110 || ARCH_S3C24XX || ARCH_S5P64X0 || \
+ default 200 if ARCH_EBSA110 || ARCH_S3C24XX || \
ARCH_S5PV210 || ARCH_EXYNOS4
default AT91_TIMER_HZ if ARCH_AT91
default SHMOBILE_TIMER_HZ if ARCH_SHMOBILE_LEGACY
@@@ -2127,6 -2196,7 +2128,6 @@@ menu "Power management options
source "kernel/power/Kconfig"
config ARCH_SUSPEND_POSSIBLE
- depends on !ARCH_S5PC100
depends on CPU_ARM920T || CPU_ARM926T || CPU_FEROCEON || CPU_SA1100 || \
CPU_V6 || CPU_V6K || CPU_V7 || CPU_V7M || CPU_XSC3 || CPU_XSCALE || CPU_MOHAWK
def_bool y
diff --combined arch/arm/mm/dma-mapping.c
index 1f88db0,3116880..7a996aa
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@@ -26,6 -26,7 +26,7 @@@
#include <linux/io.h>
#include <linux/vmalloc.h>
#include <linux/sizes.h>
+ #include <linux/cma.h>
#include <asm/memory.h>
#include <asm/highmem.h>
@@@ -461,21 -462,12 +462,21 @@@ void __init dma_contiguous_remap(void
map.type = MT_MEMORY_DMA_READY;
/*
- * Clear previous low-memory mapping
+ * Clear previous low-memory mapping to ensure that the
+ * TLB does not see any conflicting entries, then flush
+ * the TLB of the old entries before creating new mappings.
+ *
+ * This ensures that any speculatively loaded TLB entries
+ * (even though they may be rare) can not cause any problems,
+ * and ensures that this code is architecturally compliant.
*/
for (addr = __phys_to_virt(start); addr < __phys_to_virt(end);
addr += PMD_SIZE)
pmd_clear(pmd_off_k(addr));
+ flush_tlb_kernel_range(__phys_to_virt(start),
+ __phys_to_virt(end));
+
iotable_init(&map, 1);
}
}
diff --combined arch/arm64/Kconfig
index 555ad3c,7bc7b74..4e40949
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@@ -1,6 -1,8 +1,7 @@@
config ARM64
def_bool y
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
- select ARCH_HAS_OPP
+ select ARCH_HAS_SG_CHAIN
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_USE_CMPXCHG_LOCKREF
select ARCH_SUPPORTS_ATOMIC_RMW
@@@ -10,9 -12,6 +11,9 @@@
select ARM_AMBA
select ARM_ARCH_TIMER
select ARM_GIC
+ select ARM_GIC_V2M if (PCI && PCI_MSI)
+ select ARM_GIC_V3
+ select AUDIT_ARCH_COMPAT_GENERIC
select BUILDTIME_EXTABLE_SORT
select CLONE_BACKWARDS
select COMMON_CLK
@@@ -31,12 -30,10 +32,12 @@@
select GENERIC_STRNLEN_USER
select GENERIC_TIME_VSYSCALL
select HARDIRQS_SW_RESEND
+ select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_KGDB
select HAVE_ARCH_TRACEHOOK
select HAVE_C_RECORDMCOUNT
+ select HAVE_CC_STACKPROTECTOR
select HAVE_DEBUG_BUGVERBOSE
select HAVE_DEBUG_KMEMLEAK
select HAVE_DMA_API_DEBUG
@@@ -67,7 -64,6 +68,7 @@@
select RTC_LIB
select SPARSE_IRQ
select SYSCTL_EXCEPTION_TRACE
+ select HAVE_CONTEXT_TRACKING
help
ARM 64-bit (AArch64) Linux support.
@@@ -160,63 -156,14 +161,63 @@@ endmen
menu "Kernel Features"
+choice
+ prompt "Page size"
+ default ARM64_4K_PAGES
+ help
+ Page size (translation granule) configuration.
+
+config ARM64_4K_PAGES
+ bool "4KB"
+ help
+ This feature enables 4KB pages support.
+
config ARM64_64K_PAGES
- bool "Enable 64KB pages support"
+ bool "64KB"
help
This feature enables 64KB pages support (4KB by default)
allowing only two levels of page tables and faster TLB
look-up. AArch32 emulation is not available when this feature
is enabled.
+endchoice
+
+choice
+ prompt "Virtual address space size"
+ default ARM64_VA_BITS_39 if ARM64_4K_PAGES
+ default ARM64_VA_BITS_42 if ARM64_64K_PAGES
+ help
+ Allows choosing one of multiple possible virtual address
+ space sizes. The level of translation table is determined by
+ a combination of page size and virtual address space size.
+
+config ARM64_VA_BITS_39
+ bool "39-bit"
+ depends on ARM64_4K_PAGES
+
+config ARM64_VA_BITS_42
+ bool "42-bit"
+ depends on ARM64_64K_PAGES
+
+config ARM64_VA_BITS_48
+ bool "48-bit"
+ depends on BROKEN
+
+endchoice
+
+config ARM64_VA_BITS
+ int
+ default 39 if ARM64_VA_BITS_39
+ default 42 if ARM64_VA_BITS_42
+ default 48 if ARM64_VA_BITS_48
+
+config ARM64_PGTABLE_LEVELS
+ int
+ default 2 if ARM64_64K_PAGES && ARM64_VA_BITS_42
+ default 3 if ARM64_64K_PAGES && ARM64_VA_BITS_48
+ default 3 if ARM64_4K_PAGES && ARM64_VA_BITS_39
+ default 4 if ARM64_4K_PAGES && ARM64_VA_BITS_48
+
config CPU_BIG_ENDIAN
bool "Build big-endian kernel"
help
@@@ -362,17 -309,6 +363,17 @@@ config EF
allow the kernel to be booted as an EFI application. This
is only useful on systems that have UEFI firmware.
+config DMI
+ bool "Enable support for SMBIOS (DMI) tables"
+ depends on EFI
+ default y
+ help
+ This enables SMBIOS/DMI feature for systems.
+
+ This option is only useful on systems that have UEFI firmware.
+ However, even with this option, the resultant kernel should
+ continue to boot on existing non-UEFI platforms.
+
endmenu
menu "Userspace binary formats"
diff --combined arch/ia64/Kconfig
index 44a6915,56986a0..c84c88b
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@@ -10,7 -10,6 +10,7 @@@ config IA6
select ARCH_MIGHT_HAVE_PC_SERIO
select PCI if (!IA64_HP_SIM)
select ACPI if (!IA64_HP_SIM)
+ select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select PM if (!IA64_HP_SIM)
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_IDE
@@@ -28,6 -27,7 +28,7 @@@
select HAVE_MEMBLOCK
select HAVE_MEMBLOCK_NODE_MAP
select HAVE_VIRT_CPU_ACCOUNTING
+ select ARCH_HAS_SG_CHAIN
select VIRT_TO_BUS
select ARCH_DISCARD_MEMBLOCK
select GENERIC_IRQ_PROBE
diff --combined arch/powerpc/kvm/book3s_64_mmu_hv.c
index 09a47ae,a01744f..ad463f8
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@@ -37,8 -37,6 +37,6 @@@
#include <asm/ppc-opcode.h>
#include <asm/cputable.h>
- #include "book3s_hv_cma.h"
-
/* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
#define MAX_LPID_970 63
@@@ -64,10 -62,10 +62,10 @@@ long kvmppc_alloc_hpt(struct kvm *kvm,
}
kvm->arch.hpt_cma_alloc = 0;
page = kvm_alloc_hpt(1 << (order - PAGE_SHIFT));
if (page) {
hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
+ memset((void *)hpt, 0, (1 << order));
kvm->arch.hpt_cma_alloc = 1;
}
@@@ -450,7 -448,7 +448,7 @@@ static int kvmppc_mmu_book3s_64_hv_xlat
unsigned long slb_v;
unsigned long pp, key;
unsigned long v, gr;
- unsigned long *hptep;
+ __be64 *hptep;
int index;
int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);
@@@ -473,13 -471,13 +471,13 @@@
preempt_enable();
return -ENOENT;
}
- hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
- v = hptep[0] & ~HPTE_V_HVLOCK;
+ hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
+ v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
gr = kvm->arch.revmap[index].guest_rpte;
/* Unlock the HPTE */
asm volatile("lwsync" : : : "memory");
- hptep[0] = v;
+ hptep[0] = cpu_to_be64(v);
preempt_enable();
gpte->eaddr = eaddr;
@@@ -583,8 -581,7 +581,8 @@@ int kvmppc_book3s_hv_page_fault(struct
unsigned long ea, unsigned long dsisr)
{
struct kvm *kvm = vcpu->kvm;
- unsigned long *hptep, hpte[3], r;
+ unsigned long hpte[3], r;
+ __be64 *hptep;
unsigned long mmu_seq, psize, pte_size;
unsigned long gpa_base, gfn_base;
unsigned long gpa, gfn, hva, pfn;
@@@ -607,16 -604,16 +605,16 @@@
if (ea != vcpu->arch.pgfault_addr)
return RESUME_GUEST;
index = vcpu->arch.pgfault_index;
- hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
+ hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
rev = &kvm->arch.revmap[index];
preempt_disable();
while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
cpu_relax();
- hpte[0] = hptep[0] & ~HPTE_V_HVLOCK;
- hpte[1] = hptep[1];
+ hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
+ hpte[1] = be64_to_cpu(hptep[1]);
hpte[2] = r = rev->guest_rpte;
asm volatile("lwsync" : : : "memory");
- hptep[0] = hpte[0];
+ hptep[0] = cpu_to_be64(hpte[0]);
preempt_enable();
if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
@@@ -732,9 -729,8 +730,9 @@@
preempt_disable();
while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
cpu_relax();
- if ((hptep[0] & ~HPTE_V_HVLOCK) != hpte[0] || hptep[1] != hpte[1] ||
- rev->guest_rpte != hpte[2])
+ if ((be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK) != hpte[0] ||
+ be64_to_cpu(hptep[1]) != hpte[1] ||
+ rev->guest_rpte != hpte[2])
/* HPTE has been changed under us; let the guest retry */
goto out_unlock;
hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
@@@ -754,20 -750,20 +752,20 @@@
rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
r &= rcbits | ~(HPTE_R_R | HPTE_R_C);
- if (hptep[0] & HPTE_V_VALID) {
+ if (be64_to_cpu(hptep[0]) & HPTE_V_VALID) {
/* HPTE was previously valid, so we need to invalidate it */
unlock_rmap(rmap);
- hptep[0] |= HPTE_V_ABSENT;
+ hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
kvmppc_invalidate_hpte(kvm, hptep, index);
/* don't lose previous R and C bits */
- r |= hptep[1] & (HPTE_R_R | HPTE_R_C);
+ r |= be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
} else {
kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
}
- hptep[1] = r;
+ hptep[1] = cpu_to_be64(r);
eieio();
- hptep[0] = hpte[0];
+ hptep[0] = cpu_to_be64(hpte[0]);
asm volatile("ptesync" : : : "memory");
preempt_enable();
if (page && hpte_is_writable(r))
@@@ -786,7 -782,7 +784,7 @@@
return ret;
out_unlock:
- hptep[0] &= ~HPTE_V_HVLOCK;
+ hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
preempt_enable();
goto out_put;
}
@@@ -862,7 -858,7 +860,7 @@@ static int kvm_unmap_rmapp(struct kvm *
{
struct revmap_entry *rev = kvm->arch.revmap;
unsigned long h, i, j;
- unsigned long *hptep;
+ __be64 *hptep;
unsigned long ptel, psize, rcbits;
for (;;) {
@@@ -878,11 -874,11 +876,11 @@@
* rmap chain lock.
*/
i = *rmapp & KVMPPC_RMAP_INDEX;
- hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
+ hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4));
if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
/* unlock rmap before spinning on the HPTE lock */
unlock_rmap(rmapp);
- while (hptep[0] & HPTE_V_HVLOCK)
+ while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK)
cpu_relax();
continue;
}
@@@ -901,14 -897,14 +899,14 @@@
/* Now check and modify the HPTE */
ptel = rev[i].guest_rpte;
- psize = hpte_page_size(hptep[0], ptel);
- if ((hptep[0] & HPTE_V_VALID) &&
+ psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel);
+ if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
hpte_rpn(ptel, psize) == gfn) {
if (kvm->arch.using_mmu_notifiers)
- hptep[0] |= HPTE_V_ABSENT;
+ hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
kvmppc_invalidate_hpte(kvm, hptep, i);
/* Harvest R and C */
- rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C);
+ rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
*rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
if (rcbits & ~rev[i].guest_rpte) {
rev[i].guest_rpte = ptel | rcbits;
@@@ -916,7 -912,7 +914,7 @@@
}
}
unlock_rmap(rmapp);
- hptep[0] &= ~HPTE_V_HVLOCK;
+ hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
}
return 0;
}
@@@ -963,7 -959,7 +961,7 @@@ static int kvm_age_rmapp(struct kvm *kv
{
struct revmap_entry *rev = kvm->arch.revmap;
unsigned long head, i, j;
- unsigned long *hptep;
+ __be64 *hptep;
int ret = 0;
retry:
@@@ -979,24 -975,23 +977,24 @@@
i = head = *rmapp & KVMPPC_RMAP_INDEX;
do {
- hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
+ hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4));
j = rev[i].forw;
/* If this HPTE isn't referenced, ignore it */
- if (!(hptep[1] & HPTE_R_R))
+ if (!(be64_to_cpu(hptep[1]) & HPTE_R_R))
continue;
if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
/* unlock rmap before spinning on the HPTE lock */
unlock_rmap(rmapp);
- while (hptep[0] & HPTE_V_HVLOCK)
+ while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK)
cpu_relax();
goto retry;
}
/* Now check and modify the HPTE */
- if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_R)) {
+ if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
+ (be64_to_cpu(hptep[1]) & HPTE_R_R)) {
kvmppc_clear_ref_hpte(kvm, hptep, i);
if (!(rev[i].guest_rpte & HPTE_R_R)) {
rev[i].guest_rpte |= HPTE_R_R;
@@@ -1004,7 -999,7 +1002,7 @@@
}
ret = 1;
}
- hptep[0] &= ~HPTE_V_HVLOCK;
+ hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
} while ((i = j) != head);
unlock_rmap(rmapp);
@@@ -1038,7 -1033,7 +1036,7 @@@ static int kvm_test_age_rmapp(struct kv
do {
hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4));
j = rev[i].forw;
- if (hp[1] & HPTE_R_R)
+ if (be64_to_cpu(hp[1]) & HPTE_R_R)
goto out;
} while ((i = j) != head);
}
@@@ -1078,7 -1073,7 +1076,7 @@@ static int kvm_test_clear_dirty_npages(
unsigned long head, i, j;
unsigned long n;
unsigned long v, r;
- unsigned long *hptep;
+ __be64 *hptep;
int npages_dirty = 0;
retry:
@@@ -1094,8 -1089,7 +1092,8 @@@
i = head = *rmapp & KVMPPC_RMAP_INDEX;
do {
- hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
+ unsigned long hptep1;
+ hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4));
j = rev[i].forw;
/*
@@@ -1112,30 -1106,29 +1110,30 @@@
* Otherwise we need to do the tlbie even if C==0 in
* order to pick up any delayed writeback of C.
*/
- if (!(hptep[1] & HPTE_R_C) &&
- (!hpte_is_writable(hptep[1]) || vcpus_running(kvm)))
+ hptep1 = be64_to_cpu(hptep[1]);
+ if (!(hptep1 & HPTE_R_C) &&
+ (!hpte_is_writable(hptep1) || vcpus_running(kvm)))
continue;
if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
/* unlock rmap before spinning on the HPTE lock */
unlock_rmap(rmapp);
- while (hptep[0] & HPTE_V_HVLOCK)
+ while (hptep[0] & cpu_to_be64(HPTE_V_HVLOCK))
cpu_relax();
goto retry;
}
/* Now check and modify the HPTE */
- if (!(hptep[0] & HPTE_V_VALID))
+ if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID)))
continue;
/* need to make it temporarily absent so C is stable */
- hptep[0] |= HPTE_V_ABSENT;
+ hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
kvmppc_invalidate_hpte(kvm, hptep, i);
- v = hptep[0];
- r = hptep[1];
+ v = be64_to_cpu(hptep[0]);
+ r = be64_to_cpu(hptep[1]);
if (r & HPTE_R_C) {
- hptep[1] = r & ~HPTE_R_C;
+ hptep[1] = cpu_to_be64(r & ~HPTE_R_C);
if (!(rev[i].guest_rpte & HPTE_R_C)) {
rev[i].guest_rpte |= HPTE_R_C;
note_hpte_modification(kvm, &rev[i]);
@@@ -1148,7 -1141,7 +1146,7 @@@
}
v &= ~(HPTE_V_ABSENT | HPTE_V_HVLOCK);
v |= HPTE_V_VALID;
- hptep[0] = v;
+ hptep[0] = cpu_to_be64(v);
} while ((i = j) != head);
unlock_rmap(rmapp);
@@@ -1312,7 -1305,7 +1310,7 @@@ struct kvm_htab_ctx
* Returns 1 if this HPT entry has been modified or has pending
* R/C bit changes.
*/
-static int hpte_dirty(struct revmap_entry *revp, unsigned long *hptp)
+static int hpte_dirty(struct revmap_entry *revp, __be64 *hptp)
{
unsigned long rcbits_unset;
@@@ -1321,14 -1314,13 +1319,14 @@@
/* Also need to consider changes in reference and changed bits */
rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
- if ((hptp[0] & HPTE_V_VALID) && (hptp[1] & rcbits_unset))
+ if ((be64_to_cpu(hptp[0]) & HPTE_V_VALID) &&
+ (be64_to_cpu(hptp[1]) & rcbits_unset))
return 1;
return 0;
}
-static long record_hpte(unsigned long flags, unsigned long *hptp,
+static long record_hpte(unsigned long flags, __be64 *hptp,
unsigned long *hpte, struct revmap_entry *revp,
int want_valid, int first_pass)
{
@@@ -1343,10 -1335,10 +1341,10 @@@
return 0;
valid = 0;
- if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) {
+ if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) {
valid = 1;
if ((flags & KVM_GET_HTAB_BOLTED_ONLY) &&
- !(hptp[0] & HPTE_V_BOLTED))
+ !(be64_to_cpu(hptp[0]) & HPTE_V_BOLTED))
valid = 0;
}
if (valid != want_valid)
@@@ -1358,7 -1350,7 +1356,7 @@@
preempt_disable();
while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
cpu_relax();
- v = hptp[0];
+ v = be64_to_cpu(hptp[0]);
/* re-evaluate valid and dirty from synchronized HPTE value */
valid = !!(v & HPTE_V_VALID);
@@@ -1366,9 -1358,9 +1364,9 @@@
/* Harvest R and C into guest view if necessary */
rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
- if (valid && (rcbits_unset & hptp[1])) {
- revp->guest_rpte |= (hptp[1] & (HPTE_R_R | HPTE_R_C)) |
- HPTE_GR_MODIFIED;
+ if (valid && (rcbits_unset & be64_to_cpu(hptp[1]))) {
+ revp->guest_rpte |= (be64_to_cpu(hptp[1]) &
+ (HPTE_R_R | HPTE_R_C)) | HPTE_GR_MODIFIED;
dirty = 1;
}
@@@ -1387,13 -1379,13 +1385,13 @@@
revp->guest_rpte = r;
}
asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
- hptp[0] &= ~HPTE_V_HVLOCK;
+ hptp[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
preempt_enable();
if (!(valid == want_valid && (first_pass || dirty)))
ok = 0;
}
- hpte[0] = v;
- hpte[1] = r;
+ hpte[0] = cpu_to_be64(v);
+ hpte[1] = cpu_to_be64(r);
return ok;
}
@@@ -1403,7 -1395,7 +1401,7 @@@ static ssize_t kvm_htab_read(struct fil
struct kvm_htab_ctx *ctx = file->private_data;
struct kvm *kvm = ctx->kvm;
struct kvm_get_htab_header hdr;
- unsigned long *hptp;
+ __be64 *hptp;
struct revmap_entry *revp;
unsigned long i, nb, nw;
unsigned long __user *lbuf;
@@@ -1419,7 -1411,7 +1417,7 @@@
flags = ctx->flags;
i = ctx->index;
- hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
+ hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
revp = kvm->arch.revmap + i;
lbuf = (unsigned long __user *)buf;
@@@ -1503,7 -1495,7 +1501,7 @@@ static ssize_t kvm_htab_write(struct fi
unsigned long i, j;
unsigned long v, r;
unsigned long __user *lbuf;
- unsigned long *hptp;
+ __be64 *hptp;
unsigned long tmp[2];
ssize_t nb;
long int err, ret;
@@@ -1545,7 -1537,7 +1543,7 @@@
i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte)
break;
- hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
+ hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
lbuf = (unsigned long __user *)buf;
for (j = 0; j < hdr.n_valid; ++j) {
err = -EFAULT;
@@@ -1557,7 -1549,7 +1555,7 @@@
lbuf += 2;
nb += HPTE_SIZE;
- if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT))
+ if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))
kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
err = -EIO;
ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r,
@@@ -1583,7 -1575,7 +1581,7 @@@
}
for (j = 0; j < hdr.n_invalid; ++j) {
- if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT))
+ if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))
kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
++i;
hptp += 2;
diff --combined arch/powerpc/kvm/book3s_hv_builtin.c
index 3b41447,6cf498a..329d7fd
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@@ -16,12 -16,14 +16,14 @@@
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/sizes.h>
+ #include <linux/cma.h>
#include <asm/cputable.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
- #include "book3s_hv_cma.h"
+ #define KVM_CMA_CHUNK_ORDER 18
+
/*
* Hash page table alignment on newer cpus(CPU_FTR_ARCH_206)
* should be power of 2.
@@@ -43,6 -45,8 +45,8 @@@ static unsigned long kvm_cma_resv_rati
unsigned long kvm_rma_pages = (1 << 27) >> PAGE_SHIFT; /* 128MB */
EXPORT_SYMBOL_GPL(kvm_rma_pages);
+ static struct cma *kvm_cma;
+
/* Work out RMLS (real mode limit selector) field value for a given RMA size.
Assumes POWER7 or PPC970. */
static inline int lpcr_rmls(unsigned long rma_size)
@@@ -97,7 -101,7 +101,7 @@@ struct kvm_rma_info *kvm_alloc_rma(
ri = kmalloc(sizeof(struct kvm_rma_info), GFP_KERNEL);
if (!ri)
return NULL;
- page = kvm_alloc_cma(kvm_rma_pages, kvm_rma_pages);
+ page = cma_alloc(kvm_cma, kvm_rma_pages, get_order(kvm_rma_pages));
if (!page)
goto err_out;
atomic_set(&ri->use_count, 1);
@@@ -112,7 -116,7 +116,7 @@@ EXPORT_SYMBOL_GPL(kvm_alloc_rma)
void kvm_release_rma(struct kvm_rma_info *ri)
{
if (atomic_dec_and_test(&ri->use_count)) {
- kvm_release_cma(pfn_to_page(ri->base_pfn), kvm_rma_pages);
+ cma_release(kvm_cma, pfn_to_page(ri->base_pfn), kvm_rma_pages);
kfree(ri);
}
}
@@@ -131,16 -135,18 +135,18 @@@ struct page *kvm_alloc_hpt(unsigned lon
{
unsigned long align_pages = HPT_ALIGN_PAGES;
+ VM_BUG_ON(get_order(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
+
/* Old CPUs require HPT aligned on a multiple of its size */
if (!cpu_has_feature(CPU_FTR_ARCH_206))
align_pages = nr_pages;
- return kvm_alloc_cma(nr_pages, align_pages);
+ return cma_alloc(kvm_cma, nr_pages, get_order(align_pages));
}
EXPORT_SYMBOL_GPL(kvm_alloc_hpt);
void kvm_release_hpt(struct page *page, unsigned long nr_pages)
{
- kvm_release_cma(page, nr_pages);
+ cma_release(kvm_cma, page, nr_pages);
}
EXPORT_SYMBOL_GPL(kvm_release_hpt);
@@@ -179,7 -185,8 +185,8 @@@ void __init kvm_cma_reserve(void
align_size = HPT_ALIGN_PAGES << PAGE_SHIFT;
align_size = max(kvm_rma_pages << PAGE_SHIFT, align_size);
- kvm_cma_declare_contiguous(selected_size, align_size);
+ cma_declare_contiguous(0, selected_size, 0, align_size,
+ KVM_CMA_CHUNK_ORDER - PAGE_SHIFT, false, &kvm_cma);
}
}
@@@ -212,16 -219,3 +219,16 @@@ bool kvm_hv_mode_active(void
{
return atomic_read(&hv_vm_count) != 0;
}
+
+extern int hcall_real_table[], hcall_real_table_end[];
+
+int kvmppc_hcall_impl_hv_realmode(unsigned long cmd)
+{
+ cmd /= 4;
+ if (cmd < hcall_real_table_end - hcall_real_table &&
+ hcall_real_table[cmd])
+ return 1;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvmppc_hcall_impl_hv_realmode);
diff --combined arch/s390/Kconfig
index f5af5f6,d12d40e..3c94ef3
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@@ -116,6 -116,7 +116,6 @@@ config S39
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
- select HAVE_FUNCTION_TRACE_MCOUNT_TEST
select HAVE_FUTEX_CMPXCHG if FUTEX
select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_GZIP
@@@ -145,6 -146,7 +145,7 @@@
select TTY
select VIRT_CPU_ACCOUNTING
select VIRT_TO_BUS
+ select ARCH_HAS_SG_CHAIN
config SCHED_OMIT_FRAME_POINTER
def_bool y
diff --combined arch/sparc/Kconfig
index 4692c90,bff3192..a537816
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@@ -42,6 -42,7 +42,7 @@@ config SPAR
select MODULES_USE_ELF_RELA
select ODD_RT_SIGACTION
select OLD_SIGSUSPEND
+ select ARCH_HAS_SG_CHAIN
config SPARC32
def_bool !64BIT
@@@ -55,6 -56,7 +56,6 @@@ config SPARC6
select HAVE_FUNCTION_TRACER
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_GRAPH_FP_TEST
- select HAVE_FUNCTION_TRACE_MCOUNT_TEST
select HAVE_KRETPROBES
select HAVE_KPROBES
select HAVE_RCU_TABLE_FREE if SMP
diff --combined arch/x86/Kconfig
index 503f35c,2ae952c..273d20d
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@@ -21,7 -21,6 +21,7 @@@ config X86_6
### Arch settings
config X86
def_bool y
+ select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
@@@ -55,6 -54,7 +55,6 @@@
select HAVE_FUNCTION_TRACER
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_GRAPH_FP_TEST
- select HAVE_FUNCTION_TRACE_MCOUNT_TEST
select HAVE_SYSCALL_TRACEPOINTS
select SYSCTL_EXCEPTION_TRACE
select HAVE_KVM
@@@ -96,6 -96,7 +96,7 @@@
select IRQ_FORCED_THREADING
select HAVE_BPF_JIT if X86_64
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+ select ARCH_HAS_SG_CHAIN
select CLKEVT_I8253
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select GENERIC_IOMAP
@@@ -132,7 -133,6 +133,7 @@@
select GENERIC_CPU_AUTOPROBE
select HAVE_ARCH_AUDITSYSCALL
select ARCH_SUPPORTS_ATOMIC_RMW
+ select ACPI_LEGACY_TABLES_LOOKUP if ACPI
config INSTRUCTION_DECODER
def_bool y
@@@ -431,7 -431,6 +432,7 @@@ config X86_INTEL_C
bool "CE4100 TV platform"
depends on PCI
depends on PCI_GODIRECT
+ depends on X86_IO_APIC
depends on X86_32
depends on X86_EXTENDED_PLATFORM
select X86_REBOOTFIXUPS
@@@ -539,7 -538,7 +540,7 @@@ config X86_32_IRI
config SCHED_OMIT_FRAME_POINTER
def_bool y
- prompt "Single-depth WCHAN output"
+ prompt "Single-depth WCHAN output" if !LTO && !FRAME_POINTER
depends on X86
---help---
Calculate simpler /proc/<PID>/wchan values. If this option
@@@ -838,7 -837,6 +839,7 @@@ config X86_IO_API
def_bool y
depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC || PCI_MSI
select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
+ select IRQ_DOMAIN
config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
bool "Reroute for broken boot IRQs"
diff --combined arch/x86/mm/fault.c
index 1dbade8,d30b78b..d393ac6
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@@ -350,7 -350,7 +350,7 @@@ out
void vmalloc_sync_all(void)
{
- sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
+ sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END, 0);
}
/*
@@@ -577,8 -577,6 +577,8 @@@ static int is_f00f_bug(struct pt_regs *
static const char nx_warning[] = KERN_CRIT
"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
+static const char smep_warning[] = KERN_CRIT
+"unable to execute userspace code (SMEP?) (uid: %d)\n";
static void
show_fault_oops(struct pt_regs *regs, unsigned long error_code,
@@@ -599,10 -597,6 +599,10 @@@
if (pte && pte_present(*pte) && !pte_exec(*pte))
printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
+ if (pte && pte_present(*pte) && pte_exec(*pte) &&
+ (pgd_flags(*pgd) & _PAGE_USER) &&
+ (read_cr4() & X86_CR4_SMEP))
+ printk(smep_warning, from_kuid(&init_user_ns, current_uid()));
}
printk(KERN_ALERT "BUG: unable to handle kernel ");
@@@ -1218,7 -1212,8 +1218,8 @@@ good_area
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
- * the fault:
+ * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if
+ * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
*/
fault = handle_mm_fault(mm, vma, address, flags);
diff --combined block/bio-integrity.c
index bc423f7b,56754c4..38c8ac2
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@@ -70,10 -70,8 +70,10 @@@ struct bio_integrity_payload *bio_integ
bs->bvec_integrity_pool);
if (!bip->bip_vec)
goto err;
+ bip->bip_max_vcnt = bvec_nr_vecs(idx);
} else {
bip->bip_vec = bip->bip_inline_vecs;
+ bip->bip_max_vcnt = inline_vecs;
}
bip->bip_slab = idx;
@@@ -116,6 -114,14 +116,6 @@@ void bio_integrity_free(struct bio *bio
}
EXPORT_SYMBOL(bio_integrity_free);
-static inline unsigned int bip_integrity_vecs(struct bio_integrity_payload *bip)
-{
- if (bip->bip_slab == BIO_POOL_NONE)
- return BIP_INLINE_VECS;
-
- return bvec_nr_vecs(bip->bip_slab);
-}
-
/**
* bio_integrity_add_page - Attach integrity metadata
* @bio: bio to update
@@@ -131,7 -137,7 +131,7 @@@ int bio_integrity_add_page(struct bio *
struct bio_integrity_payload *bip = bio->bi_integrity;
struct bio_vec *iv;
- if (bip->bip_vcnt >= bip_integrity_vecs(bip)) {
+ if (bip->bip_vcnt >= bip->bip_max_vcnt) {
printk(KERN_ERR "%s: bip_vec full\n", __func__);
return 0;
}
@@@ -646,6 -652,4 +646,4 @@@ void __init bio_integrity_init(void
sizeof(struct bio_integrity_payload) +
sizeof(struct bio_vec) * BIP_INLINE_VECS,
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
- if (!bip_slab)
- panic("Failed to create slab\n");
}
diff --combined drivers/ata/Kconfig
index e65d400,b0d5b5a..e1b9278
--- a/drivers/ata/Kconfig
+++ b/drivers/ata/Kconfig
@@@ -16,6 -16,7 +16,7 @@@ menuconfig AT
depends on BLOCK
depends on !(M32R || M68K || S390) || BROKEN
select SCSI
+ select GLOB
---help---
If you want to use an ATA hard disk, ATA tape drive, ATA CD-ROM or
any other ATA device under Linux, say Y and make sure that you know
@@@ -141,15 -142,6 +142,15 @@@ config AHCI_SUNX
If unsure, say N.
+config AHCI_TEGRA
+ tristate "NVIDIA Tegra124 AHCI SATA support"
+ depends on ARCH_TEGRA
+ help
+ This option enables support for the NVIDIA Tegra124 SoC's
+ onboard AHCI SATA.
+
+ If unsure, say N.
+
config AHCI_XGENE
tristate "APM X-Gene 6.0Gbps AHCI SATA host controller support"
depends on PHY_XGENE
diff --combined drivers/ata/libata-core.c
index 677c0c1,259d879..dbdc5d3
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@@ -59,6 -59,7 +59,7 @@@
#include <linux/async.h>
#include <linux/log2.h>
#include <linux/slab.h>
+ #include <linux/glob.h>
#include <scsi/scsi.h>
#include <scsi/scsi_cmnd.h>
#include <scsi/scsi_host.h>
@@@ -4250,73 -4251,6 +4251,6 @@@ static const struct ata_blacklist_entr
{ }
};
- /**
- * glob_match - match a text string against a glob-style pattern
- * @text: the string to be examined
- * @pattern: the glob-style pattern to be matched against
- *
- * Either/both of text and pattern can be empty strings.
- *
- * Match text against a glob-style pattern, with wildcards and simple sets:
- *
- * ? matches any single character.
- * * matches any run of characters.
- * [xyz] matches a single character from the set: x, y, or z.
- * [a-d] matches a single character from the range: a, b, c, or d.
- * [a-d0-9] matches a single character from either range.
- *
- * The special characters ?, [, -, or *, can be matched using a set, eg. [*]
- * Behaviour with malformed patterns is undefined, though generally reasonable.
- *
- * Sample patterns: "SD1?", "SD1[0-5]", "*R0", "SD*1?[012]*xx"
- *
- * This function uses one level of recursion per '*' in pattern.
- * Since it calls _nothing_ else, and has _no_ explicit local variables,
- * this will not cause stack problems for any reasonable use here.
- *
- * RETURNS:
- * 0 on match, 1 otherwise.
- */
- static int glob_match (const char *text, const char *pattern)
- {
- do {
- /* Match single character or a '?' wildcard */
- if (*text == *pattern || *pattern == '?') {
- if (!*pattern++)
- return 0; /* End of both strings: match */
- } else {
- /* Match single char against a '[' bracketed ']' pattern set */
- if (!*text || *pattern != '[')
- break; /* Not a pattern set */
- while (*++pattern && *pattern != ']' && *text != *pattern) {
- if (*pattern == '-' && *(pattern - 1) != '[')
- if (*text > *(pattern - 1) && *text < *(pattern + 1)) {
- ++pattern;
- break;
- }
- }
- if (!*pattern || *pattern == ']')
- return 1; /* No match */
- while (*pattern && *pattern++ != ']');
- }
- } while (*++text && *pattern);
-
- /* Match any run of chars against a '*' wildcard */
- if (*pattern == '*') {
- if (!*++pattern)
- return 0; /* Match: avoid recursion at end of pattern */
- /* Loop to handle additional pattern chars after the wildcard */
- while (*text) {
- if (glob_match(text, pattern) == 0)
- return 0; /* Remainder matched */
- ++text; /* Absorb (match) this char and try again */
- }
- }
- if (!*text && !*pattern)
- return 0; /* End of both strings: match */
- return 1; /* No match */
- }
-
static unsigned long ata_dev_blacklisted(const struct ata_device *dev)
{
unsigned char model_num[ATA_ID_PROD_LEN + 1];
@@@ -4327,10 -4261,10 +4261,10 @@@
ata_id_c_string(dev->id, model_rev, ATA_ID_FW_REV, sizeof(model_rev));
while (ad->model_num) {
- if (!glob_match(model_num, ad->model_num)) {
+ if (glob_match(model_num, ad->model_num)) {
if (ad->model_rev == NULL)
return ad->horkage;
- if (!glob_match(model_rev, ad->model_rev))
+ if (glob_match(model_rev, ad->model_rev))
return ad->horkage;
}
ad++;
@@@ -4798,8 -4732,9 +4732,8 @@@ void swap_buf_le16(u16 *buf, unsigned i
static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap)
{
struct ata_queued_cmd *qc = NULL;
- unsigned int i, tag, max_queue;
-
- max_queue = ap->scsi_host->can_queue;
+ unsigned int max_queue = ap->host->n_tags;
+ unsigned int i, tag;
/* no command while frozen */
if (unlikely(ap->pflags & ATA_PFLAG_FROZEN))
@@@ -6093,7 -6028,6 +6027,7 @@@ void ata_host_init(struct ata_host *hos
{
spin_lock_init(&host->lock);
mutex_init(&host->eh_mutex);
+ host->n_tags = ATA_MAX_QUEUE - 1;
host->dev = dev;
host->ops = ops;
}
@@@ -6175,7 -6109,15 +6109,7 @@@ int ata_host_register(struct ata_host *
{
int i, rc;
- /*
- * The max queue supported by hardware must not be greater than
- * ATA_MAX_QUEUE.
- */
- if (sht->can_queue > ATA_MAX_QUEUE) {
- dev_err(host->dev, "BUG: the hardware max queue is too large\n");
- WARN_ON(1);
- return -EINVAL;
- }
+ host->n_tags = clamp(sht->can_queue, 1, ATA_MAX_QUEUE - 1);
/* host must have been started */
if (!(host->flags & ATA_HOST_STARTED)) {
diff --combined drivers/base/Kconfig
index 88500fe,9d5fed1..4e7f0ff
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@@ -149,21 -149,15 +149,21 @@@ config EXTRA_FIRMWARE_DI
some other directory containing the firmware files.
config FW_LOADER_USER_HELPER
+ bool
+
+config FW_LOADER_USER_HELPER_FALLBACK
bool "Fallback user-helper invocation for firmware loading"
depends on FW_LOADER
- default y
+ select FW_LOADER_USER_HELPER
help
This option enables / disables the invocation of user-helper
(e.g. udev) for loading firmware files as a fallback after the
direct file loading in kernel fails. The user-mode helper is
no longer required unless you have a special firmware file that
- resides in a non-standard path.
+ resides in a non-standard path. Moreover, the udev support has
+ been deprecated upstream.
+
+ If you are unsure about this, say N here.
config DEBUG_DRIVER
bool "Driver Core verbose debug messages"
@@@ -214,15 -208,6 +214,15 @@@ config DMA_SHARED_BUFFE
APIs extension; the file's descriptor can then be passed on to other
driver.
+config FENCE_TRACE
+ bool "Enable verbose FENCE_TRACE messages"
+ depends on DMA_SHARED_BUFFER
+ help
+ Enable the FENCE_TRACE printks. This will add extra
+ spam to the console log, but will make it easier to diagnose
+ lockup related problems for dma-buffers shared across multiple
+ devices.
+
config DMA_CMA
bool "DMA Contiguous Memory Allocator"
depends on HAVE_DMA_CONTIGUOUS && CMA
@@@ -289,16 -274,6 +289,6 @@@ config CMA_ALIGNMEN
If unsure, leave the default value "8".
- config CMA_AREAS
- int "Maximum count of the CMA device-private areas"
- default 7
- help
- CMA allows to create CMA areas for particular devices. This parameter
- sets the maximum number of such device private CMA areas in the
- system.
-
- If unsure, leave the default value "7".
-
endif
endmenu
diff --combined drivers/input/input.c
index 29ca0bb,3b9284b..236bc56
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@@ -257,10 -257,9 +257,10 @@@ static int input_handle_abs_event(struc
}
static int input_get_disposition(struct input_dev *dev,
- unsigned int type, unsigned int code, int value)
+ unsigned int type, unsigned int code, int *pval)
{
int disposition = INPUT_IGNORE_EVENT;
+ int value = *pval;
switch (type) {
@@@ -358,7 -357,6 +358,7 @@@
break;
}
+ *pval = value;
return disposition;
}
@@@ -367,7 -365,7 +367,7 @@@ static void input_handle_event(struct i
{
int disposition;
- disposition = input_get_disposition(dev, type, code, value);
+ disposition = input_get_disposition(dev, type, code, &value);
if ((disposition & INPUT_PASS_TO_DEVICE) && dev->event)
dev->event(dev, type, code, value);
@@@ -710,6 -708,9 +710,9 @@@ static void input_disconnect_device(str
handle->open = 0;
spin_unlock_irq(&dev->event_lock);
+
+ if (is_event_supported(EV_LED, dev->evbit, EV_MAX))
+ input_led_disconnect(dev);
}
/**
@@@ -2136,6 -2137,9 +2139,9 @@@ int input_register_device(struct input_
list_add_tail(&dev->node, &input_dev_list);
+ if (is_event_supported(EV_LED, dev->evbit, EV_MAX))
+ input_led_connect(dev);
+
list_for_each_entry(handler, &input_handler_list, node)
input_attach_handler(dev, handler);
diff --combined drivers/leds/Kconfig
index 8c96e2d,6784c17..f6e32ba
--- a/drivers/leds/Kconfig
+++ b/drivers/leds/Kconfig
@@@ -11,9 -11,6 +11,6 @@@ menuconfig NEW_LED
Say Y to enable Linux LED support. This allows control of supported
LEDs from both userspace and optionally, by kernel events (triggers).
- This is not related to standard keyboard LEDs which are controlled
- via the input system.
-
if NEW_LEDS
config LEDS_CLASS
@@@ -32,6 -29,14 +29,6 @@@ config LEDS_88PM860
This option enables support for on-chip LED drivers found on Marvell
Semiconductor 88PM8606 PMIC.
-config LEDS_ATMEL_PWM
- tristate "LED Support using Atmel PWM outputs"
- depends on LEDS_CLASS
- depends on ATMEL_PWM
- help
- This option enables support for LEDs driven using outputs
- of the dedicated PWM controller found on newer Atmel SOCs.
-
config LEDS_LM3530
tristate "LCD Backlight driver for LM3530"
depends on LEDS_CLASS
@@@ -135,13 -140,6 +132,13 @@@ config LEDS_SUNFIR
This option enables support for the Left, Middle, and Right
LEDs on the I/O and CPU boards of SunFire UltraSPARC servers.
+config LEDS_IPAQ_MICRO
+ tristate "LED Support for the Compaq iPAQ h3xxx"
+ depends on MFD_IPAQ_MICRO
+ help
+ Choose this option if you want to use the notification LED on
+ Compaq/HP iPAQ h3100 and h3600.
+
config LEDS_HP6XX
tristate "LED Support for the HP Jornada 6xx"
depends on LEDS_CLASS
diff --combined drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 9c93ff2,c57b085..ae3f105
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@@ -215,135 -215,52 +215,135 @@@ static int i40e_get_settings(struct net
/* hardware is either in 40G mode or 10G mode
* NOTE: this section initializes supported and advertising
*/
+ if (!link_up) {
+ /* link is down and the driver needs to fall back on
+ * device ID to determine what kinds of info to display,
+ * it's mostly a guess that may change when link is up
+ */
+ switch (hw->device_id) {
+ case I40E_DEV_ID_QSFP_A:
+ case I40E_DEV_ID_QSFP_B:
+ case I40E_DEV_ID_QSFP_C:
+ /* pluggable QSFP */
+ ecmd->supported = SUPPORTED_40000baseSR4_Full |
+ SUPPORTED_40000baseCR4_Full |
+ SUPPORTED_40000baseLR4_Full;
+ ecmd->advertising = ADVERTISED_40000baseSR4_Full |
+ ADVERTISED_40000baseCR4_Full |
+ ADVERTISED_40000baseLR4_Full;
+ break;
+ case I40E_DEV_ID_KX_B:
+ /* backplane 40G */
+ ecmd->supported = SUPPORTED_40000baseKR4_Full;
+ ecmd->advertising = ADVERTISED_40000baseKR4_Full;
+ break;
+ case I40E_DEV_ID_KX_C:
+ /* backplane 10G */
+ ecmd->supported = SUPPORTED_10000baseKR_Full;
+ ecmd->advertising = ADVERTISED_10000baseKR_Full;
+ break;
+ default:
+ /* all the rest are 10G/1G */
+ ecmd->supported = SUPPORTED_10000baseT_Full |
+ SUPPORTED_1000baseT_Full;
+ ecmd->advertising = ADVERTISED_10000baseT_Full |
+ ADVERTISED_1000baseT_Full;
+ break;
+ }
+
+ /* skip phy_type use as it is zero when link is down */
+ goto no_valid_phy_type;
+ }
+
switch (hw_link_info->phy_type) {
case I40E_PHY_TYPE_40GBASE_CR4:
case I40E_PHY_TYPE_40GBASE_CR4_CU:
- ecmd->supported = SUPPORTED_40000baseCR4_Full;
- ecmd->advertising = ADVERTISED_40000baseCR4_Full;
+ ecmd->supported = SUPPORTED_Autoneg |
+ SUPPORTED_40000baseCR4_Full;
+ ecmd->advertising = ADVERTISED_Autoneg |
+ ADVERTISED_40000baseCR4_Full;
break;
case I40E_PHY_TYPE_40GBASE_KR4:
- ecmd->supported = SUPPORTED_40000baseKR4_Full;
- ecmd->advertising = ADVERTISED_40000baseKR4_Full;
+ ecmd->supported = SUPPORTED_Autoneg |
+ SUPPORTED_40000baseKR4_Full;
+ ecmd->advertising = ADVERTISED_Autoneg |
+ ADVERTISED_40000baseKR4_Full;
break;
case I40E_PHY_TYPE_40GBASE_SR4:
+ case I40E_PHY_TYPE_XLPPI:
+ case I40E_PHY_TYPE_XLAUI:
ecmd->supported = SUPPORTED_40000baseSR4_Full;
break;
case I40E_PHY_TYPE_40GBASE_LR4:
ecmd->supported = SUPPORTED_40000baseLR4_Full;
- ecmd->advertising = ADVERTISED_40000baseLR4_Full;
break;
case I40E_PHY_TYPE_10GBASE_KX4:
- ecmd->supported = SUPPORTED_10000baseKX4_Full;
- ecmd->advertising = ADVERTISED_10000baseKX4_Full;
+ ecmd->supported = SUPPORTED_Autoneg |
+ SUPPORTED_10000baseKX4_Full;
+ ecmd->advertising = ADVERTISED_Autoneg |
+ ADVERTISED_10000baseKX4_Full;
break;
case I40E_PHY_TYPE_10GBASE_KR:
- ecmd->supported = SUPPORTED_10000baseKR_Full;
- ecmd->advertising = ADVERTISED_10000baseKR_Full;
+ ecmd->supported = SUPPORTED_Autoneg |
+ SUPPORTED_10000baseKR_Full;
+ ecmd->advertising = ADVERTISED_Autoneg |
+ ADVERTISED_10000baseKR_Full;
break;
- default:
- if (i40e_is_40G_device(hw->device_id)) {
- ecmd->supported = SUPPORTED_40000baseSR4_Full;
- ecmd->advertising = ADVERTISED_40000baseSR4_Full;
- } else {
- ecmd->supported = SUPPORTED_10000baseT_Full;
- ecmd->advertising = ADVERTISED_10000baseT_Full;
- }
+ case I40E_PHY_TYPE_10GBASE_SR:
+ case I40E_PHY_TYPE_10GBASE_LR:
+ ecmd->supported = SUPPORTED_10000baseT_Full;
+ break;
+ case I40E_PHY_TYPE_10GBASE_CR1_CU:
+ case I40E_PHY_TYPE_10GBASE_CR1:
+ case I40E_PHY_TYPE_10GBASE_T:
+ ecmd->supported = SUPPORTED_Autoneg |
+ SUPPORTED_10000baseT_Full;
+ ecmd->advertising = ADVERTISED_Autoneg |
+ ADVERTISED_10000baseT_Full;
+ break;
+ case I40E_PHY_TYPE_XAUI:
+ case I40E_PHY_TYPE_XFI:
+ case I40E_PHY_TYPE_SFI:
+ case I40E_PHY_TYPE_10GBASE_SFPP_CU:
+ ecmd->supported = SUPPORTED_10000baseT_Full;
+ break;
+ case I40E_PHY_TYPE_1000BASE_KX:
+ case I40E_PHY_TYPE_1000BASE_T:
+ ecmd->supported = SUPPORTED_Autoneg |
+ SUPPORTED_1000baseT_Full;
+ ecmd->advertising = ADVERTISED_Autoneg |
+ ADVERTISED_1000baseT_Full;
+ break;
+ case I40E_PHY_TYPE_100BASE_TX:
+ ecmd->supported = SUPPORTED_Autoneg |
+ SUPPORTED_100baseT_Full;
+ ecmd->advertising = ADVERTISED_Autoneg |
+ ADVERTISED_100baseT_Full;
+ break;
+ case I40E_PHY_TYPE_SGMII:
+ ecmd->supported = SUPPORTED_Autoneg |
+ SUPPORTED_1000baseT_Full |
+ SUPPORTED_100baseT_Full;
+ ecmd->advertising = ADVERTISED_Autoneg |
+ ADVERTISED_1000baseT_Full |
+ ADVERTISED_100baseT_Full;
break;
+ default:
+ /* if we got here and link is up something bad is afoot */
+ WARN_ON(link_up);
}
- ecmd->supported |= SUPPORTED_Autoneg;
- ecmd->advertising |= ADVERTISED_Autoneg;
+no_valid_phy_type:
+ /* this is if autoneg is enabled or disabled */
ecmd->autoneg = ((hw_link_info->an_info & I40E_AQ_AN_COMPLETED) ?
AUTONEG_ENABLE : AUTONEG_DISABLE);
switch (hw->phy.media_type) {
case I40E_MEDIA_TYPE_BACKPLANE:
- ecmd->supported |= SUPPORTED_Backplane;
- ecmd->advertising |= ADVERTISED_Backplane;
+ ecmd->supported |= SUPPORTED_Autoneg |
+ SUPPORTED_Backplane;
+ ecmd->advertising |= ADVERTISED_Autoneg |
+ ADVERTISED_Backplane;
ecmd->port = PORT_NONE;
break;
case I40E_MEDIA_TYPE_BASET:
@@@ -359,6 -276,7 +359,6 @@@
break;
case I40E_MEDIA_TYPE_FIBER:
ecmd->supported |= SUPPORTED_FIBRE;
- ecmd->advertising |= ADVERTISED_FIBRE;
ecmd->port = PORT_FIBRE;
break;
case I40E_MEDIA_TYPE_UNKNOWN:
@@@ -369,25 -287,6 +369,25 @@@
ecmd->transceiver = XCVR_EXTERNAL;
+ ecmd->supported |= SUPPORTED_Pause;
+
+ switch (hw->fc.current_mode) {
+ case I40E_FC_FULL:
+ ecmd->advertising |= ADVERTISED_Pause;
+ break;
+ case I40E_FC_TX_PAUSE:
+ ecmd->advertising |= ADVERTISED_Asym_Pause;
+ break;
+ case I40E_FC_RX_PAUSE:
+ ecmd->advertising |= (ADVERTISED_Pause |
+ ADVERTISED_Asym_Pause);
+ break;
+ default:
+ ecmd->advertising &= ~(ADVERTISED_Pause |
+ ADVERTISED_Asym_Pause);
+ break;
+ }
+
if (link_up) {
switch (link_speed) {
case I40E_LINK_SPEED_40GB:
@@@ -397,9 -296,6 +397,9 @@@
case I40E_LINK_SPEED_10GB:
ethtool_cmd_speed_set(ecmd, SPEED_10000);
break;
+ case I40E_LINK_SPEED_1GB:
+ ethtool_cmd_speed_set(ecmd, SPEED_1000);
+ break;
default:
break;
}
@@@ -413,182 -309,6 +413,182 @@@
}
/**
+ * i40e_set_settings - Set Speed and Duplex
+ * @netdev: network interface device structure
+ * @ecmd: ethtool command
+ *
+ * Set speed/duplex per media_types advertised/forced
+ **/
+static int i40e_set_settings(struct net_device *netdev,
+ struct ethtool_cmd *ecmd)
+{
+ struct i40e_netdev_priv *np = netdev_priv(netdev);
+ struct i40e_aq_get_phy_abilities_resp abilities;
+ struct i40e_aq_set_phy_config config;
+ struct i40e_pf *pf = np->vsi->back;
+ struct i40e_vsi *vsi = np->vsi;
+ struct i40e_hw *hw = &pf->hw;
+ struct ethtool_cmd safe_ecmd;
+ i40e_status status = 0;
+ bool change = false;
+ int err = 0;
+ u8 autoneg;
+ u32 advertise;
+
+ if (vsi != pf->vsi[pf->lan_vsi])
+ return -EOPNOTSUPP;
+
+ if (hw->phy.media_type != I40E_MEDIA_TYPE_BASET &&
+ hw->phy.media_type != I40E_MEDIA_TYPE_FIBER &&
+ hw->phy.media_type != I40E_MEDIA_TYPE_BACKPLANE)
+ return -EOPNOTSUPP;
+
+ /* get our own copy of the bits to check against */
+ memset(&safe_ecmd, 0, sizeof(struct ethtool_cmd));
+ i40e_get_settings(netdev, &safe_ecmd);
+
+ /* save autoneg and speed out of ecmd */
+ autoneg = ecmd->autoneg;
+ advertise = ecmd->advertising;
+
+ /* set autoneg and speed back to what they currently are */
+ ecmd->autoneg = safe_ecmd.autoneg;
+ ecmd->advertising = safe_ecmd.advertising;
+
+ ecmd->cmd = safe_ecmd.cmd;
+ /* If ecmd and safe_ecmd are not the same now, then they are
+ * trying to set something that we do not support
+ */
+ if (memcmp(ecmd, &safe_ecmd, sizeof(struct ethtool_cmd)))
+ return -EOPNOTSUPP;
+
+ while (test_bit(__I40E_CONFIG_BUSY, &vsi->state))
+ usleep_range(1000, 2000);
+
+ /* Get the current phy config */
+ status = i40e_aq_get_phy_capabilities(hw, false, false, &abilities,
+ NULL);
+ if (status)
+ return -EAGAIN;
+
+ /* Copy link_speed and abilities to config in case they are not
+ * set below
+ */
+ memset(&config, 0, sizeof(struct i40e_aq_set_phy_config));
+ config.link_speed = abilities.link_speed;
+ config.abilities = abilities.abilities;
+
+ /* Check autoneg */
+ if (autoneg == AUTONEG_ENABLE) {
+ /* If autoneg is not supported, return error */
+ if (!(safe_ecmd.supported & SUPPORTED_Autoneg)) {
+ netdev_info(netdev, "Autoneg not supported on this phy\n");
+ return -EINVAL;
+ }
+ /* If autoneg was not already enabled */
+ if (!(hw->phy.link_info.an_info & I40E_AQ_AN_COMPLETED)) {
+ config.abilities = abilities.abilities |
+ I40E_AQ_PHY_ENABLE_AN;
+ change = true;
+ }
+ } else {
+ /* If autoneg is supported 10GBASE_T is the only phy that
+ * can disable it, so otherwise return error
+ */
+ if (safe_ecmd.supported & SUPPORTED_Autoneg &&
+ hw->phy.link_info.phy_type != I40E_PHY_TYPE_10GBASE_T) {
+ netdev_info(netdev, "Autoneg cannot be disabled on this phy\n");
+ return -EINVAL;
+ }
+ /* If autoneg is currently enabled */
+ if (hw->phy.link_info.an_info & I40E_AQ_AN_COMPLETED) {
+ config.abilities = abilities.abilities |
+ ~I40E_AQ_PHY_ENABLE_AN;
+ change = true;
+ }
+ }
+
+ if (advertise & ~safe_ecmd.supported)
+ return -EINVAL;
+
+ if (advertise & ADVERTISED_100baseT_Full)
+ if (!(abilities.link_speed & I40E_LINK_SPEED_100MB)) {
+ config.link_speed |= I40E_LINK_SPEED_100MB;
+ change = true;
+ }
+ if (advertise & ADVERTISED_1000baseT_Full ||
+ advertise & ADVERTISED_1000baseKX_Full)
+ if (!(abilities.link_speed & I40E_LINK_SPEED_1GB)) {
+ config.link_speed |= I40E_LINK_SPEED_1GB;
+ change = true;
+ }
+ if (advertise & ADVERTISED_10000baseT_Full ||
+ advertise & ADVERTISED_10000baseKX4_Full ||
+ advertise & ADVERTISED_10000baseKR_Full)
+ if (!(abilities.link_speed & I40E_LINK_SPEED_10GB)) {
+ config.link_speed |= I40E_LINK_SPEED_10GB;
+ change = true;
+ }
+ if (advertise & ADVERTISED_40000baseKR4_Full ||
+ advertise & ADVERTISED_40000baseCR4_Full ||
+ advertise & ADVERTISED_40000baseSR4_Full ||
+ advertise & ADVERTISED_40000baseLR4_Full)
+ if (!(abilities.link_speed & I40E_LINK_SPEED_40GB)) {
+ config.link_speed |= I40E_LINK_SPEED_40GB;
+ change = true;
+ }
+
+ if (change) {
+ /* copy over the rest of the abilities */
+ config.phy_type = abilities.phy_type;
+ config.eee_capability = abilities.eee_capability;
+ config.eeer = abilities.eeer_val;
+ config.low_power_ctrl = abilities.d3_lpan;
+
+ /* If link is up set link and an so changes take effect */
+ if (hw->phy.link_info.link_info & I40E_AQ_LINK_UP)
+ config.abilities |= I40E_AQ_PHY_ENABLE_ATOMIC_LINK;
+
+ /* make the aq call */
+ status = i40e_aq_set_phy_config(hw, &config, NULL);
+ if (status) {
+ netdev_info(netdev, "Set phy config failed with error %d.\n",
+ status);
+ return -EAGAIN;
+ }
+
+ status = i40e_update_link_info(hw, true);
+ if (status)
+ netdev_info(netdev, "Updating link info failed with error %d\n",
+ status);
+
+ } else {
+ netdev_info(netdev, "Nothing changed, exiting without setting anything.\n");
+ }
+
+ return err;
+}
+
+static int i40e_nway_reset(struct net_device *netdev)
+{
+ /* restart autonegotiation */
+ struct i40e_netdev_priv *np = netdev_priv(netdev);
+ struct i40e_pf *pf = np->vsi->back;
+ struct i40e_hw *hw = &pf->hw;
+ bool link_up = hw->phy.link_info.link_info & I40E_AQ_LINK_UP;
+ i40e_status ret = 0;
+
+ ret = i40e_aq_set_link_restart_an(hw, link_up, NULL);
+ if (ret) {
+ netdev_info(netdev, "link restart failed, aq_err=%d\n",
+ pf->hw.aq.asq_last_status);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+/**
* i40e_get_pauseparam - Get Flow Control status
* Return tx/rx-pause status
**/
@@@ -614,85 -334,6 +614,85 @@@ static void i40e_get_pauseparam(struct
}
}
+/**
+ * i40e_set_pauseparam - Set Flow Control parameter
+ * @netdev: network interface device structure
+ * @pause: return tx/rx flow control status
+ **/
+static int i40e_set_pauseparam(struct net_device *netdev,
+ struct ethtool_pauseparam *pause)
+{
+ struct i40e_netdev_priv *np = netdev_priv(netdev);
+ struct i40e_pf *pf = np->vsi->back;
+ struct i40e_vsi *vsi = np->vsi;
+ struct i40e_hw *hw = &pf->hw;
+ struct i40e_link_status *hw_link_info = &hw->phy.link_info;
+ bool link_up = hw_link_info->link_info & I40E_AQ_LINK_UP;
+ i40e_status status;
+ u8 aq_failures;
+ int err = 0;
+
+ if (vsi != pf->vsi[pf->lan_vsi])
+ return -EOPNOTSUPP;
+
+ if (pause->autoneg != ((hw_link_info->an_info & I40E_AQ_AN_COMPLETED) ?
+ AUTONEG_ENABLE : AUTONEG_DISABLE)) {
+ netdev_info(netdev, "To change autoneg please use: ethtool -s <dev> autoneg
<on|off>\n");
+ return -EOPNOTSUPP;
+ }
+
+ /* If we have link and don't have autoneg */
+ if (!test_bit(__I40E_DOWN, &pf->state) &&
+ !(hw_link_info->an_info & I40E_AQ_AN_COMPLETED)) {
+ /* Send message that it might not necessarily work*/
+ netdev_info(netdev, "Autoneg did not complete so changing settings may not result
in an actual change.\n");
+ }
+
+ if (hw->fc.current_mode == I40E_FC_PFC) {
+ netdev_info(netdev, "Priority flow control enabled. Cannot set link flow
control.\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (pause->rx_pause && pause->tx_pause)
+ hw->fc.requested_mode = I40E_FC_FULL;
+ else if (pause->rx_pause && !pause->tx_pause)
+ hw->fc.requested_mode = I40E_FC_RX_PAUSE;
+ else if (!pause->rx_pause && pause->tx_pause)
+ hw->fc.requested_mode = I40E_FC_TX_PAUSE;
+ else if (!pause->rx_pause && !pause->tx_pause)
+ hw->fc.requested_mode = I40E_FC_NONE;
+ else
+ return -EINVAL;
+
+ /* Set the fc mode and only restart an if link is up*/
+ status = i40e_set_fc(hw, &aq_failures, link_up);
+
+ if (aq_failures & I40E_SET_FC_AQ_FAIL_GET) {
+ netdev_info(netdev, "Set fc failed on the get_phy_capabilities call with error %d
and status %d\n",
+ status, hw->aq.asq_last_status);
+ err = -EAGAIN;
+ }
+ if (aq_failures & I40E_SET_FC_AQ_FAIL_SET) {
+ netdev_info(netdev, "Set fc failed on the set_phy_config call with error %d and
status %d\n",
+ status, hw->aq.asq_last_status);
+ err = -EAGAIN;
+ }
+ if (aq_failures & I40E_SET_FC_AQ_FAIL_UPDATE) {
+ netdev_info(netdev, "Set fc failed on the update_link_info call with error %d and
status %d\n",
+ status, hw->aq.asq_last_status);
+ err = -EAGAIN;
+ }
+
+ if (!test_bit(__I40E_DOWN, &pf->state)) {
+ /* Give it a little more time to try to come back */
+ msleep(75);
+ if (!test_bit(__I40E_DOWN, &pf->state))
+ return i40e_nway_reset(netdev);
+ }
+
+ return err;
+}
+
static u32 i40e_get_msglevel(struct net_device *netdev)
{
struct i40e_netdev_priv *np = netdev_priv(netdev);
@@@ -763,33 -404,10 +763,33 @@@ static int i40e_get_eeprom(struct net_d
u8 *eeprom_buff;
u16 i, sectors;
bool last;
+ u32 magic;
+
#define I40E_NVM_SECTOR_SIZE 4096
if (eeprom->len == 0)
return -EINVAL;
+ /* check for NVMUpdate access method */
+ magic = hw->vendor_id | (hw->device_id << 16);
+ if (eeprom->magic && eeprom->magic != magic) {
+ int errno;
+
+ /* make sure it is the right magic for NVMUpdate */
+ if ((eeprom->magic >> 16) != hw->device_id)
+ return -EINVAL;
+
+ ret_val = i40e_nvmupd_command(hw,
+ (struct i40e_nvm_access *)eeprom,
+ bytes, &errno);
+ if (ret_val)
+ dev_info(&pf->pdev->dev,
+ "NVMUpdate read failed err=%d status=0x%x\n",
+ ret_val, hw->aq.asq_last_status);
+
+ return errno;
+ }
+
+ /* normal ethtool get_eeprom support */
eeprom->magic = hw->vendor_id | (hw->device_id << 16);
eeprom_buff = kzalloc(eeprom->len, GFP_KERNEL);
@@@ -816,7 -434,7 +816,7 @@@
ret_val = i40e_aq_read_nvm(hw, 0x0,
eeprom->offset + (I40E_NVM_SECTOR_SIZE * i),
len,
- eeprom_buff + (I40E_NVM_SECTOR_SIZE * i),
+ (u8 *)eeprom_buff + (I40E_NVM_SECTOR_SIZE * i),
last, NULL);
if (ret_val) {
dev_info(&pf->pdev->dev,
@@@ -828,7 -446,7 +828,7 @@@
release_nvm:
i40e_release_nvm(hw);
- memcpy(bytes, eeprom_buff, eeprom->len);
+ memcpy(bytes, (u8 *)eeprom_buff, eeprom->len);
free_buff:
kfree(eeprom_buff);
return ret_val;
@@@ -848,39 -466,6 +848,39 @@@ static int i40e_get_eeprom_len(struct n
return val;
}
+static int i40e_set_eeprom(struct net_device *netdev,
+ struct ethtool_eeprom *eeprom, u8 *bytes)
+{
+ struct i40e_netdev_priv *np = netdev_priv(netdev);
+ struct i40e_hw *hw = &np->vsi->back->hw;
+ struct i40e_pf *pf = np->vsi->back;
+ int ret_val = 0;
+ int errno;
+ u32 magic;
+
+ /* normal ethtool set_eeprom is not supported */
+ magic = hw->vendor_id | (hw->device_id << 16);
+ if (eeprom->magic == magic)
+ return -EOPNOTSUPP;
+
+ /* check for NVMUpdate access method */
+ if (!eeprom->magic || (eeprom->magic >> 16) != hw->device_id)
+ return -EINVAL;
+
+ if (test_bit(__I40E_RESET_RECOVERY_PENDING, &pf->state) ||
+ test_bit(__I40E_RESET_INTR_RECEIVED, &pf->state))
+ return -EBUSY;
+
+ ret_val = i40e_nvmupd_command(hw, (struct i40e_nvm_access *)eeprom,
+ bytes, &errno);
+ if (ret_val)
+ dev_info(&pf->pdev->dev,
+ "NVMUpdate write failed err=%d status=0x%x\n",
+ ret_val, hw->aq.asq_last_status);
+
+ return errno;
+}
+
static void i40e_get_drvinfo(struct net_device *netdev,
struct ethtool_drvinfo *drvinfo)
{
@@@ -1436,6 -1021,24 +1436,6 @@@ static int i40e_set_wol(struct net_devi
return 0;
}
-static int i40e_nway_reset(struct net_device *netdev)
-{
- /* restart autonegotiation */
- struct i40e_netdev_priv *np = netdev_priv(netdev);
- struct i40e_pf *pf = np->vsi->back;
- struct i40e_hw *hw = &pf->hw;
- i40e_status ret = 0;
-
- ret = i40e_aq_set_link_restart_an(hw, NULL);
- if (ret) {
- netdev_info(netdev, "link restart failed, aq_err=%d\n",
- pf->hw.aq.asq_last_status);
- return -EIO;
- }
-
- return 0;
-}
-
static int i40e_set_phys_id(struct net_device *netdev,
enum ethtool_phys_id_state state)
{
@@@ -1502,36 -1105,17 +1502,36 @@@ static int i40e_set_coalesce(struct net
if (ec->tx_max_coalesced_frames_irq || ec->rx_max_coalesced_frames_irq)
vsi->work_limit = ec->tx_max_coalesced_frames_irq;
+ vector = vsi->base_vector;
if ((ec->rx_coalesce_usecs >= (I40E_MIN_ITR << 1)) &&
- (ec->rx_coalesce_usecs <= (I40E_MAX_ITR << 1)))
+ (ec->rx_coalesce_usecs <= (I40E_MAX_ITR << 1))) {
vsi->rx_itr_setting = ec->rx_coalesce_usecs;
- else
+ } else if (ec->rx_coalesce_usecs == 0) {
+ vsi->rx_itr_setting = ec->rx_coalesce_usecs;
+ i40e_irq_dynamic_disable(vsi, vector);
+ if (ec->use_adaptive_rx_coalesce)
+ netif_info(pf, drv, netdev,
+ "Rx-secs=0, need to disable adaptive-Rx for a complete disable\n");
+ } else {
+ netif_info(pf, drv, netdev,
+ "Invalid value, Rx-usecs range is 0, 8-8160\n");
return -EINVAL;
+ }
if ((ec->tx_coalesce_usecs >= (I40E_MIN_ITR << 1)) &&
- (ec->tx_coalesce_usecs <= (I40E_MAX_ITR << 1)))
+ (ec->tx_coalesce_usecs <= (I40E_MAX_ITR << 1))) {
vsi->tx_itr_setting = ec->tx_coalesce_usecs;
- else
+ } else if (ec->tx_coalesce_usecs == 0) {
+ vsi->tx_itr_setting = ec->tx_coalesce_usecs;
+ i40e_irq_dynamic_disable(vsi, vector);
+ if (ec->use_adaptive_tx_coalesce)
+ netif_info(pf, drv, netdev,
+ "Tx-secs=0, need to disable adaptive-Tx for a complete disable\n");
+ } else {
+ netif_info(pf, drv, netdev,
+ "Invalid value, Tx-usecs range is 0, 8-8160\n");
return -EINVAL;
+ }
if (ec->use_adaptive_rx_coalesce)
vsi->rx_itr_setting |= I40E_ITR_DYNAMIC;
@@@ -1543,6 -1127,7 +1543,6 @@@
else
vsi->tx_itr_setting &= ~I40E_ITR_DYNAMIC;
- vector = vsi->base_vector;
for (i = 0; i < vsi->num_q_vectors; i++, vector++) {
q_vector = vsi->q_vectors[i];
q_vector->rx.itr = ITR_TO_REG(vsi->rx_itr_setting);
@@@ -1913,7 -1498,7 +1913,7 @@@ static int i40e_update_ethtool_fdir_ent
/* add filter to the list */
if (parent)
- hlist_add_after(&parent->fdir_node, &input->fdir_node);
+ hlist_add_behind(&input->fdir_node, &parent->fdir_node);
else
hlist_add_head(&input->fdir_node,
&pf->fdir_filter_list);
@@@ -2146,7 -1731,6 +2146,7 @@@ static int i40e_set_channels(struct net
static const struct ethtool_ops i40e_ethtool_ops = {
.get_settings = i40e_get_settings,
+ .set_settings = i40e_set_settings,
.get_drvinfo = i40e_get_drvinfo,
.get_regs_len = i40e_get_regs_len,
.get_regs = i40e_get_regs,
@@@ -2154,13 -1738,11 +2154,13 @@@
.get_link = ethtool_op_get_link,
.get_wol = i40e_get_wol,
.set_wol = i40e_set_wol,
+ .set_eeprom = i40e_set_eeprom,
.get_eeprom_len = i40e_get_eeprom_len,
.get_eeprom = i40e_get_eeprom,
.get_ringparam = i40e_get_ringparam,
.set_ringparam = i40e_set_ringparam,
.get_pauseparam = i40e_get_pauseparam,
+ .set_pauseparam = i40e_set_pauseparam,
.get_msglevel = i40e_get_msglevel,
.set_msglevel = i40e_set_msglevel,
.get_rxnfc = i40e_get_rxnfc,
diff --combined drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index 94a1c07,a6e5bcc..e4100b5
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@@ -1408,6 -1408,7 +1408,6 @@@ static int ixgbe_reg_test(struct ixgbe_
default:
*data = 1;
return 1;
- break;
}
/*
@@@ -2517,7 -2518,7 +2517,7 @@@ static int ixgbe_update_ethtool_fdir_en
/* add filter to the list */
if (parent)
- hlist_add_after(&parent->fdir_node, &input->fdir_node);
+ hlist_add_behind(&input->fdir_node, &parent->fdir_node);
else
hlist_add_head(&input->fdir_node,
&adapter->fdir_filter_list);
@@@ -2865,6 -2866,7 +2865,6 @@@ static int ixgbe_get_ts_info(struct net
break;
default:
return ethtool_op_get_ts_info(dev, info);
- break;
}
return 0;
}
diff --combined drivers/staging/android/binder.c
index 02b0379,0ca9785..4f34dc0
--- a/drivers/staging/android/binder.c
+++ b/drivers/staging/android/binder.c
@@@ -454,8 -454,9 +454,8 @@@ static size_t binder_buffer_size(struc
{
if (list_is_last(&buffer->entry, &proc->buffers))
return proc->buffer + proc->buffer_size - (void *)buffer->data;
- else
- return (size_t)list_entry(buffer->entry.next,
- struct binder_buffer, entry) - (size_t)buffer->data;
+ return (size_t)list_entry(buffer->entry.next,
+ struct binder_buffer, entry) - (size_t)buffer->data;
}
static void binder_insert_free_buffer(struct binder_proc *proc,
@@@ -585,7 -586,6 +585,6 @@@ static int binder_update_page_range(str
for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) {
int ret;
- struct page **page_array_ptr;
page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE];
@@@ -598,8 -598,7 +597,7 @@@
}
tmp_area.addr = page_addr;
tmp_area.size = PAGE_SIZE + PAGE_SIZE /* guard page? */;
- page_array_ptr = page;
- ret = map_vm_area(&tmp_area, PAGE_KERNEL, &page_array_ptr);
+ ret = map_vm_area(&tmp_area, PAGE_KERNEL, page);
if (ret) {
pr_err("%d: binder_alloc_buf failed to map page at %p in kernel\n",
proc->pid, page_addr);
@@@ -1185,7 -1184,6 +1183,7 @@@ static void binder_send_failed_reply(st
uint32_t error_code)
{
struct binder_thread *target_thread;
+ struct binder_transaction *next;
BUG_ON(t->flags & TF_ONE_WAY);
while (1) {
@@@ -1213,23 -1211,24 +1211,23 @@@
target_thread->return_error);
}
return;
- } else {
- struct binder_transaction *next = t->from_parent;
+ }
+ next = t->from_parent;
- binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
- "send failed reply for transaction %d, target dead\n",
- t->debug_id);
+ binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
+ "send failed reply for transaction %d, target dead\n",
+ t->debug_id);
- binder_pop_transaction(target_thread, t);
- if (next == NULL) {
- binder_debug(BINDER_DEBUG_DEAD_BINDER,
- "reply failed, no target thread at root\n");
- return;
- }
- t = next;
+ binder_pop_transaction(target_thread, t);
+ if (next == NULL) {
binder_debug(BINDER_DEBUG_DEAD_BINDER,
- "reply failed, no target thread -- retry %d\n",
- t->debug_id);
+ "reply failed, no target thread at root\n");
+ return;
}
+ t = next;
+ binder_debug(BINDER_DEBUG_DEAD_BINDER,
+ "reply failed, no target thread -- retry %d\n",
+ t->debug_id);
}
}
@@@ -2593,106 -2592,6 +2591,106 @@@ static unsigned int binder_poll(struct
return 0;
}
+static int binder_ioctl_write_read(struct file *filp,
+ unsigned int cmd, unsigned long arg,
+ struct binder_thread *thread)
+{
+ int ret = 0;
+ struct binder_proc *proc = filp->private_data;
+ unsigned int size = _IOC_SIZE(cmd);
+ void __user *ubuf = (void __user *)arg;
+ struct binder_write_read bwr;
+
+ if (size != sizeof(struct binder_write_read)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (copy_from_user(&bwr, ubuf, sizeof(bwr))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ binder_debug(BINDER_DEBUG_READ_WRITE,
+ "%d:%d write %lld at %016llx, read %lld at %016llx\n",
+ proc->pid, thread->pid,
+ (u64)bwr.write_size, (u64)bwr.write_buffer,
+ (u64)bwr.read_size, (u64)bwr.read_buffer);
+
+ if (bwr.write_size > 0) {
+ ret = binder_thread_write(proc, thread,
+ bwr.write_buffer,
+ bwr.write_size,
+ &bwr.write_consumed);
+ trace_binder_write_done(ret);
+ if (ret < 0) {
+ bwr.read_consumed = 0;
+ if (copy_to_user(ubuf, &bwr, sizeof(bwr)))
+ ret = -EFAULT;
+ goto out;
+ }
+ }
+ if (bwr.read_size > 0) {
+ ret = binder_thread_read(proc, thread, bwr.read_buffer,
+ bwr.read_size,
+ &bwr.read_consumed,
+ filp->f_flags & O_NONBLOCK);
+ trace_binder_read_done(ret);
+ if (!list_empty(&proc->todo))
+ wake_up_interruptible(&proc->wait);
+ if (ret < 0) {
+ if (copy_to_user(ubuf, &bwr, sizeof(bwr)))
+ ret = -EFAULT;
+ goto out;
+ }
+ }
+ binder_debug(BINDER_DEBUG_READ_WRITE,
+ "%d:%d wrote %lld of %lld, read return %lld of %lld\n",
+ proc->pid, thread->pid,
+ (u64)bwr.write_consumed, (u64)bwr.write_size,
+ (u64)bwr.read_consumed, (u64)bwr.read_size);
+ if (copy_to_user(ubuf, &bwr, sizeof(bwr))) {
+ ret = -EFAULT;
+ goto out;
+ }
+out:
+ return ret;
+}
+
+static int binder_ioctl_set_ctx_mgr(struct file *filp)
+{
+ int ret = 0;
+ struct binder_proc *proc = filp->private_data;
+ kuid_t curr_euid = current_euid();
+
+ if (binder_context_mgr_node != NULL) {
+ pr_err("BINDER_SET_CONTEXT_MGR already set\n");
+ ret = -EBUSY;
+ goto out;
+ }
+ if (uid_valid(binder_context_mgr_uid)) {
+ if (!uid_eq(binder_context_mgr_uid, curr_euid)) {
+ pr_err("BINDER_SET_CONTEXT_MGR bad uid %d != %d\n",
+ from_kuid(&init_user_ns, curr_euid),
+ from_kuid(&init_user_ns,
+ binder_context_mgr_uid));
+ ret = -EPERM;
+ goto out;
+ }
+ } else {
+ binder_context_mgr_uid = curr_euid;
+ }
+ binder_context_mgr_node = binder_new_node(proc, 0, 0);
+ if (binder_context_mgr_node == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ binder_context_mgr_node->local_weak_refs++;
+ binder_context_mgr_node->local_strong_refs++;
+ binder_context_mgr_node->has_strong_ref = 1;
+ binder_context_mgr_node->has_weak_ref = 1;
+out:
+ return ret;
+}
+
static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
int ret;
@@@ -2700,9 -2599,9 +2698,9 @@@
struct binder_thread *thread;
unsigned int size = _IOC_SIZE(cmd);
void __user *ubuf = (void __user *)arg;
- kuid_t curr_euid = current_euid();
- /*pr_info("binder_ioctl: %d:%d %x %lx\n", proc->pid, current->pid, cmd,
arg);*/
+ /*pr_info("binder_ioctl: %d:%d %x %lx\n",
+ proc->pid, current->pid, cmd, arg);*/
trace_binder_ioctl(cmd, arg);
@@@ -2718,11 -2617,61 +2716,11 @@@
}
switch (cmd) {
- case BINDER_WRITE_READ: {
- struct binder_write_read bwr;
-
- if (size != sizeof(struct binder_write_read)) {
- ret = -EINVAL;
+ case BINDER_WRITE_READ:
+ ret = binder_ioctl_write_read(filp, cmd, arg, thread);
+ if (ret)
goto err;
- }
- if (copy_from_user(&bwr, ubuf, sizeof(bwr))) {
- ret = -EFAULT;
- goto err;
- }
- binder_debug(BINDER_DEBUG_READ_WRITE,
- "%d:%d write %lld at %016llx, read %lld at %016llx\n",
- proc->pid, thread->pid,
- (u64)bwr.write_size, (u64)bwr.write_buffer,
- (u64)bwr.read_size, (u64)bwr.read_buffer);
-
- if (bwr.write_size > 0) {
- ret = binder_thread_write(proc, thread,
- bwr.write_buffer,
- bwr.write_size,
- &bwr.write_consumed);
- trace_binder_write_done(ret);
- if (ret < 0) {
- bwr.read_consumed = 0;
- if (copy_to_user(ubuf, &bwr, sizeof(bwr)))
- ret = -EFAULT;
- goto err;
- }
- }
- if (bwr.read_size > 0) {
- ret = binder_thread_read(proc, thread, bwr.read_buffer,
- bwr.read_size,
- &bwr.read_consumed,
- filp->f_flags & O_NONBLOCK);
- trace_binder_read_done(ret);
- if (!list_empty(&proc->todo))
- wake_up_interruptible(&proc->wait);
- if (ret < 0) {
- if (copy_to_user(ubuf, &bwr, sizeof(bwr)))
- ret = -EFAULT;
- goto err;
- }
- }
- binder_debug(BINDER_DEBUG_READ_WRITE,
- "%d:%d wrote %lld of %lld, read return %lld of %lld\n",
- proc->pid, thread->pid,
- (u64)bwr.write_consumed, (u64)bwr.write_size,
- (u64)bwr.read_consumed, (u64)bwr.read_size);
- if (copy_to_user(ubuf, &bwr, sizeof(bwr))) {
- ret = -EFAULT;
- goto err;
- }
break;
- }
case BINDER_SET_MAX_THREADS:
if (copy_from_user(&proc->max_threads, ubuf, sizeof(proc->max_threads))) {
ret = -EINVAL;
@@@ -2730,9 -2679,31 +2728,9 @@@
}
break;
case BINDER_SET_CONTEXT_MGR:
- if (binder_context_mgr_node != NULL) {
- pr_err("BINDER_SET_CONTEXT_MGR already set\n");
- ret = -EBUSY;
- goto err;
- }
- if (uid_valid(binder_context_mgr_uid)) {
- if (!uid_eq(binder_context_mgr_uid, curr_euid)) {
- pr_err("BINDER_SET_CONTEXT_MGR bad uid %d != %d\n",
- from_kuid(&init_user_ns, curr_euid),
- from_kuid(&init_user_ns, binder_context_mgr_uid));
- ret = -EPERM;
- goto err;
- }
- } else {
- binder_context_mgr_uid = curr_euid;
- }
- binder_context_mgr_node = binder_new_node(proc, 0, 0);
- if (binder_context_mgr_node == NULL) {
- ret = -ENOMEM;
+ ret = binder_ioctl_set_ctx_mgr(filp);
+ if (ret)
goto err;
- }
- binder_context_mgr_node->local_weak_refs++;
- binder_context_mgr_node->local_strong_refs++;
- binder_context_mgr_node->has_strong_ref = 1;
- binder_context_mgr_node->has_weak_ref = 1;
break;
case BINDER_THREAD_EXIT:
binder_debug(BINDER_DEBUG_THREADS, "%d:%d exit\n",
@@@ -2796,15 -2767,9 +2794,15 @@@ static void binder_vma_close(struct vm_
binder_defer_work(proc, BINDER_DEFERRED_PUT_FILES);
}
+static int binder_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ return VM_FAULT_SIGBUS;
+}
+
static struct vm_operations_struct binder_vm_ops = {
.open = binder_vma_open,
.close = binder_vma_close,
+ .fault = binder_vm_fault,
};
static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
diff --combined drivers/staging/lustre/lustre/libcfs/hash.c
index 5dde794,6db7391..8ef1deb
--- a/drivers/staging/lustre/lustre/libcfs/hash.c
+++ b/drivers/staging/lustre/lustre/libcfs/hash.c
@@@ -107,7 -107,7 +107,7 @@@
* table. Also, user can break the iteration by return 1 in callback.
*/
-#include <linux/libcfs/libcfs.h>
+#include "../../include/linux/libcfs/libcfs.h"
#include <linux/seq_file.h>
#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1
@@@ -351,7 -351,7 +351,7 @@@ cfs_hash_dh_hnode_add(struct cfs_hash *
cfs_hash_dhead_t, dh_head);
if (dh->dh_tail != NULL) /* not empty */
- hlist_add_after(dh->dh_tail, hnode);
+ hlist_add_behind(hnode, dh->dh_tail);
else /* empty list */
hlist_add_head(hnode, &dh->dh_head);
dh->dh_tail = hnode;
@@@ -406,7 -406,7 +406,7 @@@ cfs_hash_dd_hnode_add(struct cfs_hash *
cfs_hash_dhead_dep_t, dd_head);
if (dh->dd_tail != NULL) /* not empty */
- hlist_add_after(dh->dd_tail, hnode);
+ hlist_add_behind(hnode, dh->dd_tail);
else /* empty list */
hlist_add_head(hnode, &dh->dd_head);
dh->dd_tail = hnode;
diff --combined drivers/video/backlight/backlight.c
index bddc8b1,19b170d..0ce8823
--- a/drivers/video/backlight/backlight.c
+++ b/drivers/video/backlight/backlight.c
@@@ -190,8 -190,6 +190,6 @@@ static ssize_t brightness_store(struct
}
mutex_unlock(&bd->ops_lock);
- backlight_generate_event(bd, BACKLIGHT_UPDATE_SYSFS);
-
return rc;
}
static DEVICE_ATTR_RW(brightness);
@@@ -223,8 -221,6 +221,8 @@@ static ssize_t actual_brightness_show(s
mutex_lock(&bd->ops_lock);
if (bd->ops && bd->ops->get_brightness)
rc = sprintf(buf, "%d\n", bd->ops->get_brightness(bd));
+ else
+ rc = sprintf(buf, "%d\n", bd->props.brightness);
mutex_unlock(&bd->ops_lock);
return rc;
diff --combined fs/cifs/cifssmb.c
index 7d4361f,c3dc52e..692d79f
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@@ -196,6 -196,10 +196,6 @@@ cifs_reconnect_tcon(struct cifs_tcon *t
if (rc)
goto out;
- /*
- * FIXME: check if wsize needs updated due to negotiated smb buffer
- * size shrinking
- */
atomic_inc(&tconInfoReconnectCount);
/* tell server Unix caps we support */
@@@ -1513,6 -1517,7 +1513,6 @@@ cifs_readv_receive(struct TCP_Server_In
return length;
server->total_read += length;
- rdata->bytes = length;
cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n",
server->total_read, buflen, data_len);
@@@ -1555,18 -1560,12 +1555,18 @@@ cifs_readv_callback(struct mid_q_entry
rc);
}
/* FIXME: should this be counted toward the initiating task? */
- task_io_account_read(rdata->bytes);
- cifs_stats_bytes_read(tcon, rdata->bytes);
+ task_io_account_read(rdata->got_bytes);
+ cifs_stats_bytes_read(tcon, rdata->got_bytes);
break;
case MID_REQUEST_SUBMITTED:
case MID_RETRY_NEEDED:
rdata->result = -EAGAIN;
+ if (server->sign && rdata->got_bytes)
+ /* reset bytes number since we can not check a sign */
+ rdata->got_bytes = 0;
+ /* FIXME: should this be counted toward the initiating task? */
+ task_io_account_read(rdata->got_bytes);
+ cifs_stats_bytes_read(tcon, rdata->got_bytes);
break;
default:
rdata->result = -EIO;
@@@ -1735,7 -1734,10 +1735,7 @@@ CIFSSMBRead(const unsigned int xid, str
/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
if (*buf) {
- if (resp_buf_type == CIFS_SMALL_BUFFER)
- cifs_small_buf_release(iov[0].iov_base);
- else if (resp_buf_type == CIFS_LARGE_BUFFER)
- cifs_buf_release(iov[0].iov_base);
+ free_rsp_buf(resp_buf_type, iov[0].iov_base);
} else if (resp_buf_type != CIFS_NO_BUFFER) {
/* return buffer to caller to free */
*buf = iov[0].iov_base;
@@@ -1900,79 -1902,27 +1900,79 @@@ cifs_writev_requeue(struct cifs_writeda
int i, rc;
struct inode *inode = wdata->cfile->dentry->d_inode;
struct TCP_Server_Info *server;
+ unsigned int rest_len;
- for (i = 0; i < wdata->nr_pages; i++) {
- lock_page(wdata->pages[i]);
- clear_page_dirty_for_io(wdata->pages[i]);
- }
-
+ server = tlink_tcon(wdata->cfile->tlink)->ses->server;
+ i = 0;
+ rest_len = wdata->bytes;
do {
- server = tlink_tcon(wdata->cfile->tlink)->ses->server;
- rc = server->ops->async_writev(wdata, cifs_writedata_release);
- } while (rc == -EAGAIN);
+ struct cifs_writedata *wdata2;
+ unsigned int j, nr_pages, wsize, tailsz, cur_len;
+
+ wsize = server->ops->wp_retry_size(inode);
+ if (wsize < rest_len) {
+ nr_pages = wsize / PAGE_CACHE_SIZE;
+ if (!nr_pages) {
+ rc = -ENOTSUPP;
+ break;
+ }
+ cur_len = nr_pages * PAGE_CACHE_SIZE;
+ tailsz = PAGE_CACHE_SIZE;
+ } else {
+ nr_pages = DIV_ROUND_UP(rest_len, PAGE_CACHE_SIZE);
+ cur_len = rest_len;
+ tailsz = rest_len - (nr_pages - 1) * PAGE_CACHE_SIZE;
+ }
- for (i = 0; i < wdata->nr_pages; i++) {
- unlock_page(wdata->pages[i]);
- if (rc != 0) {
- SetPageError(wdata->pages[i]);
- end_page_writeback(wdata->pages[i]);
- page_cache_release(wdata->pages[i]);
+ wdata2 = cifs_writedata_alloc(nr_pages, cifs_writev_complete);
+ if (!wdata2) {
+ rc = -ENOMEM;
+ break;
}
- }
- mapping_set_error(inode->i_mapping, rc);
+ for (j = 0; j < nr_pages; j++) {
+ wdata2->pages[j] = wdata->pages[i + j];
+ lock_page(wdata2->pages[j]);
+ clear_page_dirty_for_io(wdata2->pages[j]);
+ }
+
+ wdata2->sync_mode = wdata->sync_mode;
+ wdata2->nr_pages = nr_pages;
+ wdata2->offset = page_offset(wdata2->pages[0]);
+ wdata2->pagesz = PAGE_CACHE_SIZE;
+ wdata2->tailsz = tailsz;
+ wdata2->bytes = cur_len;
+
+ wdata2->cfile = find_writable_file(CIFS_I(inode), false);
+ if (!wdata2->cfile) {
+ cifs_dbg(VFS, "No writable handles for inode\n");
+ rc = -EBADF;
+ break;
+ }
+ wdata2->pid = wdata2->cfile->pid;
+ rc = server->ops->async_writev(wdata2, cifs_writedata_release);
+
+ for (j = 0; j < nr_pages; j++) {
+ unlock_page(wdata2->pages[j]);
+ if (rc != 0 && rc != -EAGAIN) {
+ SetPageError(wdata2->pages[j]);
+ end_page_writeback(wdata2->pages[j]);
+ page_cache_release(wdata2->pages[j]);
+ }
+ }
+
+ if (rc) {
+ kref_put(&wdata2->refcount, cifs_writedata_release);
+ if (rc == -EAGAIN)
+ continue;
+ mapping_set_error(inode->i_mapping, rc);
+ break;
+ }
+
+ rest_len -= cur_len;
+ i += nr_pages;
+ } while (i < wdata->nr_pages);
+
kref_put(&wdata->refcount, cifs_writedata_release);
}
@@@ -2253,7 -2203,10 +2253,7 @@@ CIFSSMBWrite2(const unsigned int xid, s
}
/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
- if (resp_buf_type == CIFS_SMALL_BUFFER)
- cifs_small_buf_release(iov[0].iov_base);
- else if (resp_buf_type == CIFS_LARGE_BUFFER)
- cifs_buf_release(iov[0].iov_base);
+ free_rsp_buf(resp_buf_type, iov[0].iov_base);
/* Note: On -EAGAIN error only caller can retry on handle based calls
since file handle passed in no longer valid */
@@@ -2477,14 -2430,14 +2477,14 @@@ CIFSSMBPosixLock(const unsigned int xid
}
parm_data = (struct cifs_posix_lock *)
((char *)&pSMBr->hdr.Protocol + data_offset);
- if (parm_data->lock_type == __constant_cpu_to_le16(CIFS_UNLCK))
+ if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK))
pLockData->fl_type = F_UNLCK;
else {
if (parm_data->lock_type ==
- __constant_cpu_to_le16(CIFS_RDLCK))
+ cpu_to_le16(CIFS_RDLCK))
pLockData->fl_type = F_RDLCK;
else if (parm_data->lock_type ==
- __constant_cpu_to_le16(CIFS_WRLCK))
+ cpu_to_le16(CIFS_WRLCK))
pLockData->fl_type = F_WRLCK;
pLockData->fl_start = le64_to_cpu(parm_data->start);
@@@ -2498,7 -2451,10 +2498,7 @@@ plk_err_exit
if (pSMB)
cifs_small_buf_release(pSMB);
- if (resp_buf_type == CIFS_SMALL_BUFFER)
- cifs_small_buf_release(iov[0].iov_base);
- else if (resp_buf_type == CIFS_LARGE_BUFFER)
- cifs_buf_release(iov[0].iov_base);
+ free_rsp_buf(resp_buf_type, iov[0].iov_base);
/* Note: On -EAGAIN error only caller can retry on handle based calls
since file handle passed in no longer valid */
@@@ -3276,25 -3232,25 +3276,25 @@@ CIFSSMB_set_compression(const unsigned
pSMB->compression_state = cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
pSMB->TotalParameterCount = 0;
- pSMB->TotalDataCount = __constant_cpu_to_le32(2);
+ pSMB->TotalDataCount = cpu_to_le32(2);
pSMB->MaxParameterCount = 0;
pSMB->MaxDataCount = 0;
pSMB->MaxSetupCount = 4;
pSMB->Reserved = 0;
pSMB->ParameterOffset = 0;
- pSMB->DataCount = __constant_cpu_to_le32(2);
+ pSMB->DataCount = cpu_to_le32(2);
pSMB->DataOffset =
cpu_to_le32(offsetof(struct smb_com_transaction_compr_ioctl_req,
compression_state) - 4); /* 84 */
pSMB->SetupCount = 4;
- pSMB->SubCommand = __constant_cpu_to_le16(NT_TRANSACT_IOCTL);
+ pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL);
pSMB->ParameterCount = 0;
- pSMB->FunctionCode = __constant_cpu_to_le32(FSCTL_SET_COMPRESSION);
+ pSMB->FunctionCode = cpu_to_le32(FSCTL_SET_COMPRESSION);
pSMB->IsFsctl = 1; /* FSCTL */
pSMB->IsRootFlag = 0;
pSMB->Fid = fid; /* file handle always le */
/* 3 byte pad, followed by 2 byte compress state */
- pSMB->ByteCount = __constant_cpu_to_le16(5);
+ pSMB->ByteCount = cpu_to_le16(5);
inc_rfc1001_len(pSMB, 5);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@@ -3430,10 -3386,10 +3430,10 @@@ static __u16 ACL_to_cifs_posix(char *pa
cifs_acl->version = cpu_to_le16(1);
if (acl_type == ACL_TYPE_ACCESS) {
cifs_acl->access_entry_count = cpu_to_le16(count);
- cifs_acl->default_entry_count = __constant_cpu_to_le16(0xFFFF);
+ cifs_acl->default_entry_count = cpu_to_le16(0xFFFF);
} else if (acl_type == ACL_TYPE_DEFAULT) {
cifs_acl->default_entry_count = cpu_to_le16(count);
- cifs_acl->access_entry_count = __constant_cpu_to_le16(0xFFFF);
+ cifs_acl->access_entry_count = cpu_to_le16(0xFFFF);
} else {
cifs_dbg(FYI, "unknown ACL type %d\n", acl_type);
return 0;
@@@ -3882,7 -3838,10 +3882,7 @@@ CIFSSMBGetCIFSACL(const unsigned int xi
}
}
qsec_out:
- if (buf_type == CIFS_SMALL_BUFFER)
- cifs_small_buf_release(iov[0].iov_base);
- else if (buf_type == CIFS_LARGE_BUFFER)
- cifs_buf_release(iov[0].iov_base);
+ free_rsp_buf(buf_type, iov[0].iov_base);
/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
return rc;
}
diff --combined fs/cifs/file.c
index 01a6339,3c1967c..03558d4
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@@ -1058,7 -1058,7 +1058,7 @@@ cifs_push_mandatory_locks(struct cifsFi
max_num = (max_buf - sizeof(struct smb_hdr)) /
sizeof(LOCKING_ANDX_RANGE);
- buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
+ buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
if (!buf) {
free_xid(xid);
return -ENOMEM;
@@@ -1393,7 -1393,7 +1393,7 @@@ cifs_unlock_range(struct cifsFileInfo *
max_num = (max_buf - sizeof(struct smb_hdr)) /
sizeof(LOCKING_ANDX_RANGE);
- buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
+ buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
if (!buf)
return -ENOMEM;
@@@ -1670,8 -1670,8 +1670,8 @@@ cifs_write(struct cifsFileInfo *open_fi
break;
}
- len = min((size_t)cifs_sb->wsize,
- write_size - total_written);
+ len = min(server->ops->wp_retry_size(dentry->d_inode),
+ (unsigned int)write_size - total_written);
/* iov[0] is reserved for smb header */
iov[1].iov_base = (char *)write_data + total_written;
iov[1].iov_len = len;
@@@ -1878,178 -1878,15 +1878,178 @@@ static int cifs_partialpagewrite(struc
return rc;
}
+static struct cifs_writedata *
+wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping,
+ pgoff_t end, pgoff_t *index,
+ unsigned int *found_pages)
+{
+ unsigned int nr_pages;
+ struct page **pages;
+ struct cifs_writedata *wdata;
+
+ wdata = cifs_writedata_alloc((unsigned int)tofind,
+ cifs_writev_complete);
+ if (!wdata)
+ return NULL;
+
+ /*
+ * find_get_pages_tag seems to return a max of 256 on each
+ * iteration, so we must call it several times in order to
+ * fill the array or the wsize is effectively limited to
+ * 256 * PAGE_CACHE_SIZE.
+ */
+ *found_pages = 0;
+ pages = wdata->pages;
+ do {
+ nr_pages = find_get_pages_tag(mapping, index,
+ PAGECACHE_TAG_DIRTY, tofind,
+ pages);
+ *found_pages += nr_pages;
+ tofind -= nr_pages;
+ pages += nr_pages;
+ } while (nr_pages && tofind && *index <= end);
+
+ return wdata;
+}
+
+static unsigned int
+wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages,
+ struct address_space *mapping,
+ struct writeback_control *wbc,
+ pgoff_t end, pgoff_t *index, pgoff_t *next, bool *done)
+{
+ unsigned int nr_pages = 0, i;
+ struct page *page;
+
+ for (i = 0; i < found_pages; i++) {
+ page = wdata->pages[i];
+ /*
+ * At this point we hold neither mapping->tree_lock nor
+ * lock on the page itself: the page may be truncated or
+ * invalidated (changing page->mapping to NULL), or even
+ * swizzled back from swapper_space to tmpfs file
+ * mapping
+ */
+
+ if (nr_pages == 0)
+ lock_page(page);
+ else if (!trylock_page(page))
+ break;
+
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ break;
+ }
+
+ if (!wbc->range_cyclic && page->index > end) {
+ *done = true;
+ unlock_page(page);
+ break;
+ }
+
+ if (*next && (page->index != *next)) {
+ /* Not next consecutive page */
+ unlock_page(page);
+ break;
+ }
+
+ if (wbc->sync_mode != WB_SYNC_NONE)
+ wait_on_page_writeback(page);
+
+ if (PageWriteback(page) ||
+ !clear_page_dirty_for_io(page)) {
+ unlock_page(page);
+ break;
+ }
+
+ /*
+ * This actually clears the dirty bit in the radix tree.
+ * See cifs_writepage() for more commentary.
+ */
+ set_page_writeback(page);
+ if (page_offset(page) >= i_size_read(mapping->host)) {
+ *done = true;
+ unlock_page(page);
+ end_page_writeback(page);
+ break;
+ }
+
+ wdata->pages[i] = page;
+ *next = page->index + 1;
+ ++nr_pages;
+ }
+
+ /* reset index to refind any pages skipped */
+ if (nr_pages == 0)
+ *index = wdata->pages[0]->index + 1;
+
+ /* put any pages we aren't going to use */
+ for (i = nr_pages; i < found_pages; i++) {
+ page_cache_release(wdata->pages[i]);
+ wdata->pages[i] = NULL;
+ }
+
+ return nr_pages;
+}
+
+static int
+wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages,
+ struct address_space *mapping, struct writeback_control *wbc)
+{
+ int rc = 0;
+ struct TCP_Server_Info *server;
+ unsigned int i;
+
+ wdata->sync_mode = wbc->sync_mode;
+ wdata->nr_pages = nr_pages;
+ wdata->offset = page_offset(wdata->pages[0]);
+ wdata->pagesz = PAGE_CACHE_SIZE;
+ wdata->tailsz = min(i_size_read(mapping->host) -
+ page_offset(wdata->pages[nr_pages - 1]),
+ (loff_t)PAGE_CACHE_SIZE);
+ wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + wdata->tailsz;
+
+ if (wdata->cfile != NULL)
+ cifsFileInfo_put(wdata->cfile);
+ wdata->cfile = find_writable_file(CIFS_I(mapping->host), false);
+ if (!wdata->cfile) {
+ cifs_dbg(VFS, "No writable handles for inode\n");
+ rc = -EBADF;
+ } else {
+ wdata->pid = wdata->cfile->pid;
+ server = tlink_tcon(wdata->cfile->tlink)->ses->server;
+ rc = server->ops->async_writev(wdata, cifs_writedata_release);
+ }
+
+ for (i = 0; i < nr_pages; ++i)
+ unlock_page(wdata->pages[i]);
+
+ if (!rc)
+ return rc;
+
+ /* send failure -- clean up the mess */
+ for (i = 0; i < nr_pages; ++i) {
+ if (rc == -EAGAIN)
+ redirty_page_for_writepage(wbc, wdata->pages[i]);
+ else
+ SetPageError(wdata->pages[i]);
+ end_page_writeback(wdata->pages[i]);
+ page_cache_release(wdata->pages[i]);
+ }
+ if (rc != -EAGAIN)
+ mapping_set_error(mapping, rc);
+
+ return rc;
+}
+
static int cifs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(mapping->host->i_sb);
+ struct TCP_Server_Info *server;
bool done = false, scanned = false, range_whole = false;
pgoff_t end, index;
struct cifs_writedata *wdata;
- struct TCP_Server_Info *server;
- struct page *page;
int rc = 0;
/*
@@@ -2069,55 -1906,165 +2069,55 @@@
range_whole = true;
scanned = true;
}
+ server = cifs_sb_master_tcon(cifs_sb)->ses->server;
retry:
while (!done && index <= end) {
- unsigned int i, nr_pages, found_pages;
- pgoff_t next = 0, tofind;
- struct page **pages;
+ unsigned int nr_pages, found_pages, wsize, credits;
+ pgoff_t next = 0, tofind, saved_index = index;
- tofind = min((cifs_sb->wsize / PAGE_CACHE_SIZE) - 1,
- end - index) + 1;
+ rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize,
+ &wsize, &credits);
+ if (rc)
+ break;
- wdata = cifs_writedata_alloc((unsigned int)tofind,
- cifs_writev_complete);
+ tofind = min((wsize / PAGE_CACHE_SIZE) - 1, end - index) + 1;
+
+ wdata = wdata_alloc_and_fillpages(tofind, mapping, end, &index,
+ &found_pages);
if (!wdata) {
rc = -ENOMEM;
+ add_credits_and_wake_if(server, credits, 0);
break;
}
- /*
- * find_get_pages_tag seems to return a max of 256 on each
- * iteration, so we must call it several times in order to
- * fill the array or the wsize is effectively limited to
- * 256 * PAGE_CACHE_SIZE.
- */
- found_pages = 0;
- pages = wdata->pages;
- do {
- nr_pages = find_get_pages_tag(mapping, &index,
- PAGECACHE_TAG_DIRTY,
- tofind, pages);
- found_pages += nr_pages;
- tofind -= nr_pages;
- pages += nr_pages;
- } while (nr_pages && tofind && index <= end);
-
if (found_pages == 0) {
kref_put(&wdata->refcount, cifs_writedata_release);
+ add_credits_and_wake_if(server, credits, 0);
break;
}
- nr_pages = 0;
- for (i = 0; i < found_pages; i++) {
- page = wdata->pages[i];
- /*
- * At this point we hold neither mapping->tree_lock nor
- * lock on the page itself: the page may be truncated or
- * invalidated (changing page->mapping to NULL), or even
- * swizzled back from swapper_space to tmpfs file
- * mapping
- */
-
- if (nr_pages == 0)
- lock_page(page);
- else if (!trylock_page(page))
- break;
-
- if (unlikely(page->mapping != mapping)) {
- unlock_page(page);
- break;
- }
-
- if (!wbc->range_cyclic && page->index > end) {
- done = true;
- unlock_page(page);
- break;
- }
-
- if (next && (page->index != next)) {
- /* Not next consecutive page */
- unlock_page(page);
- break;
- }
-
- if (wbc->sync_mode != WB_SYNC_NONE)
- wait_on_page_writeback(page);
-
- if (PageWriteback(page) ||
- !clear_page_dirty_for_io(page)) {
- unlock_page(page);
- break;
- }
-
- /*
- * This actually clears the dirty bit in the radix tree.
- * See cifs_writepage() for more commentary.
- */
- set_page_writeback(page);
-
- if (page_offset(page) >= i_size_read(mapping->host)) {
- done = true;
- unlock_page(page);
- end_page_writeback(page);
- break;
- }
-
- wdata->pages[i] = page;
- next = page->index + 1;
- ++nr_pages;
- }
-
- /* reset index to refind any pages skipped */
- if (nr_pages == 0)
- index = wdata->pages[0]->index + 1;
-
- /* put any pages we aren't going to use */
- for (i = nr_pages; i < found_pages; i++) {
- page_cache_release(wdata->pages[i]);
- wdata->pages[i] = NULL;
- }
+ nr_pages = wdata_prepare_pages(wdata, found_pages, mapping, wbc,
+ end, &index, &next, &done);
/* nothing to write? */
if (nr_pages == 0) {
kref_put(&wdata->refcount, cifs_writedata_release);
+ add_credits_and_wake_if(server, credits, 0);
continue;
}
- wdata->sync_mode = wbc->sync_mode;
- wdata->nr_pages = nr_pages;
- wdata->offset = page_offset(wdata->pages[0]);
- wdata->pagesz = PAGE_CACHE_SIZE;
- wdata->tailsz =
- min(i_size_read(mapping->host) -
- page_offset(wdata->pages[nr_pages - 1]),
- (loff_t)PAGE_CACHE_SIZE);
- wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) +
- wdata->tailsz;
+ wdata->credits = credits;
- do {
- if (wdata->cfile != NULL)
- cifsFileInfo_put(wdata->cfile);
- wdata->cfile = find_writable_file(CIFS_I(mapping->host),
- false);
- if (!wdata->cfile) {
- cifs_dbg(VFS, "No writable handles for inode\n");
- rc = -EBADF;
- break;
- }
- wdata->pid = wdata->cfile->pid;
- server = tlink_tcon(wdata->cfile->tlink)->ses->server;
- rc = server->ops->async_writev(wdata,
- cifs_writedata_release);
- } while (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN);
+ rc = wdata_send_pages(wdata, nr_pages, mapping, wbc);
+ if (rc)
+ add_credits_and_wake_if(server, wdata->credits, 0);
- for (i = 0; i < nr_pages; ++i)
- unlock_page(wdata->pages[i]);
+ kref_put(&wdata->refcount, cifs_writedata_release);
- /* send failure -- clean up the mess */
- if (rc != 0) {
- for (i = 0; i < nr_pages; ++i) {
- if (rc == -EAGAIN)
- redirty_page_for_writepage(wbc,
- wdata->pages[i]);
- else
- SetPageError(wdata->pages[i]);
- end_page_writeback(wdata->pages[i]);
- page_cache_release(wdata->pages[i]);
- }
- if (rc != -EAGAIN)
- mapping_set_error(mapping, rc);
+ if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN) {
+ index = saved_index;
+ continue;
}
- kref_put(&wdata->refcount, cifs_writedata_release);
wbc->nr_to_write -= nr_pages;
if (wbc->nr_to_write <= 0)
@@@ -2415,106 -2362,125 +2415,106 @@@ cifs_uncached_writev_complete(struct wo
kref_put(&wdata->refcount, cifs_uncached_writedata_release);
}
-/* attempt to send write to server, retry on any -EAGAIN errors */
static int
-cifs_uncached_retry_writev(struct cifs_writedata *wdata)
+wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from,
+ size_t *len, unsigned long nr_pages)
{
- int rc;
- struct TCP_Server_Info *server;
+ int rc = 0; /* never changed below; the only failure path returns -EFAULT directly */
+ size_t save_len, copied, bytes, cur_len = *len;
+ unsigned long i;
- server = tlink_tcon(wdata->cfile->tlink)->ses->server;
+ save_len = cur_len;
+ for (i = 0; i < nr_pages; i++) {
+ bytes = min_t(const size_t, cur_len, PAGE_SIZE);
+ copied = copy_page_from_iter(wdata->pages[i], 0, bytes, from);
+ cur_len -= copied;
+ /*
+ * If we didn't copy as much as we expected, then that
+ * may mean we trod into an unmapped area. Stop copying
+ * at that point. On the next pass through the big
+ * loop, we'll likely end up getting a zero-length
+ * write and bailing out of it.
+ */
+ if (copied < bytes)
+ break;
+ }
+ cur_len = save_len - cur_len;
+ *len = cur_len; /* hand the number of bytes actually copied back to the caller */
- do {
- if (wdata->cfile->invalidHandle) {
- rc = cifs_reopen_file(wdata->cfile, false);
- if (rc != 0)
- continue;
- }
- rc = server->ops->async_writev(wdata,
- cifs_uncached_writedata_release);
- } while (rc == -EAGAIN);
+ /*
+ * If we have no data to send, then that probably means that
+ * the copy above failed altogether. That's most likely because
+ * the address in the iovec was bogus. Return -EFAULT and let
+ * the caller free anything we allocated and bail out.
+ */
+ if (!cur_len)
+ return -EFAULT;
+ /*
+ * i + 1 now represents the number of pages we actually used in
+ * the copy phase above. Bring nr_pages down to that, and free
+ * any pages that we didn't use.
+ *
+ * NOTE(review): nr_pages is a by-value parameter, so the reduced
+ * count is only visible inside this function. The caller,
+ * cifs_write_from_iter(), goes on to use its own nr_pages for
+ * wdata->nr_pages and for the tailsz computation. After a short
+ * copy that count would still include the pages put here --
+ * confirm whether the count needs to be passed back (e.g. through
+ * a pointer) before those pages can be touched or put again.
+ */
+ for ( ; nr_pages > i + 1; nr_pages--)
+ put_page(wdata->pages[nr_pages - 1]);
return rc;
}
-static ssize_t
-cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset)
+static int
+cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
+ struct cifsFileInfo *open_file,
+ struct cifs_sb_info *cifs_sb, struct list_head *wdata_list)
{
+ int rc = 0;
+ size_t cur_len;
unsigned long nr_pages, i;
- size_t bytes, copied, len, cur_len;
- ssize_t total_written = 0;
- loff_t offset;
- struct cifsFileInfo *open_file;
- struct cifs_tcon *tcon;
- struct cifs_sb_info *cifs_sb;
- struct cifs_writedata *wdata, *tmp;
- struct list_head wdata_list;
- int rc;
+ struct cifs_writedata *wdata;
+ struct iov_iter saved_from;
+ loff_t saved_offset = offset;
pid_t pid;
-
- len = iov_iter_count(from);
- rc = generic_write_checks(file, poffset, &len, 0);
- if (rc)
- return rc;
-
- if (!len)
- return 0;
-
- iov_iter_truncate(from, len);
-
- INIT_LIST_HEAD(&wdata_list);
- cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
- open_file = file->private_data;
- tcon = tlink_tcon(open_file->tlink);
-
- if (!tcon->ses->server->ops->async_writev)
- return -ENOSYS;
-
- offset = *poffset;
+ struct TCP_Server_Info *server;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
pid = open_file->pid;
else
pid = current->tgid;
+ server = tlink_tcon(open_file->tlink)->ses->server;
+ memcpy(&saved_from, from, sizeof(struct iov_iter));
+
do {
- size_t save_len;
+ unsigned int wsize, credits;
+
+ rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize,
+ &wsize, &credits);
+ if (rc)
+ break;
- nr_pages = get_numpages(cifs_sb->wsize, len, &cur_len);
+ nr_pages = get_numpages(wsize, len, &cur_len);
wdata = cifs_writedata_alloc(nr_pages,
cifs_uncached_writev_complete);
if (!wdata) {
rc = -ENOMEM;
+ add_credits_and_wake_if(server, credits, 0);
break;
}
rc = cifs_write_allocate_pages(wdata->pages, nr_pages);
if (rc) {
kfree(wdata);
+ add_credits_and_wake_if(server, credits, 0);
break;
}
- save_len = cur_len;
- for (i = 0; i < nr_pages; i++) {
- bytes = min_t(size_t, cur_len, PAGE_SIZE);
- copied = copy_page_from_iter(wdata->pages[i], 0, bytes,
- from);
- cur_len -= copied;
- /*
- * If we didn't copy as much as we expected, then that
- * may mean we trod into an unmapped area. Stop copying
- * at that point. On the next pass through the big
- * loop, we'll likely end up getting a zero-length
- * write and bailing out of it.
- */
- if (copied < bytes)
- break;
- }
- cur_len = save_len - cur_len;
-
- /*
- * If we have no data to send, then that probably means that
- * the copy above failed altogether. That's most likely because
- * the address in the iovec was bogus. Set the rc to -EFAULT,
- * free anything we allocated and bail out.
- */
- if (!cur_len) {
+ rc = wdata_fill_from_iovec(wdata, from, &cur_len, nr_pages);
+ if (rc) {
for (i = 0; i < nr_pages; i++)
put_page(wdata->pages[i]);
kfree(wdata);
- rc = -EFAULT;
+ add_credits_and_wake_if(server, credits, 0);
break;
}
- /*
- * i + 1 now represents the number of pages we actually used in
- * the copy phase above. Bring nr_pages down to that, and free
- * any pages that we didn't use.
- */
- for ( ; nr_pages > i + 1; nr_pages--)
- put_page(wdata->pages[nr_pages - 1]);
-
wdata->sync_mode = WB_SYNC_ALL;
wdata->nr_pages = nr_pages;
wdata->offset = (__u64)offset;
@@@ -2523,71 -2489,18 +2523,71 @@@
wdata->bytes = cur_len;
wdata->pagesz = PAGE_SIZE;
wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
- rc = cifs_uncached_retry_writev(wdata);
+ wdata->credits = credits;
+
+ if (!wdata->cfile->invalidHandle ||
+ !cifs_reopen_file(wdata->cfile, false))
+ rc = server->ops->async_writev(wdata,
+ cifs_uncached_writedata_release);
if (rc) {
+ add_credits_and_wake_if(server, wdata->credits, 0);
kref_put(&wdata->refcount,
cifs_uncached_writedata_release);
+ if (rc == -EAGAIN) {
+ memcpy(from, &saved_from,
+ sizeof(struct iov_iter));
+ iov_iter_advance(from, offset - saved_offset);
+ continue;
+ }
break;
}
- list_add_tail(&wdata->list, &wdata_list);
+ list_add_tail(&wdata->list, wdata_list);
offset += cur_len;
len -= cur_len;
} while (len > 0);
+ return rc;
+}
+
+static ssize_t
+cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset)
+{
+ size_t len;
+ ssize_t total_written = 0;
+ loff_t offset;
+ struct cifsFileInfo *open_file;
+ struct cifs_tcon *tcon;
+ struct cifs_sb_info *cifs_sb;
+ struct cifs_writedata *wdata, *tmp;
+ struct list_head wdata_list;
+ struct iov_iter saved_from;
+ int rc;
+
+ len = iov_iter_count(from);
+ rc = generic_write_checks(file, poffset, &len, 0);
+ if (rc)
+ return rc;
+
+ if (!len)
+ return 0;
+
+ iov_iter_truncate(from, len);
+
+ INIT_LIST_HEAD(&wdata_list);
+ cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+ open_file = file->private_data;
+ tcon = tlink_tcon(open_file->tlink);
+
+ if (!tcon->ses->server->ops->async_writev)
+ return -ENOSYS;
+
+ offset = *poffset;
+ memcpy(&saved_from, from, sizeof(struct iov_iter));
+
+ rc = cifs_write_from_iter(*poffset, len, from, open_file, cifs_sb,
+ &wdata_list);
+
/*
* If at least one write was successfully sent, then discard any rc
* value from the later writes. If the other write succeeds, then
@@@ -2616,25 -2529,7 +2616,25 @@@ restart_loop
/* resend call if it's a retryable error */
if (rc == -EAGAIN) {
- rc = cifs_uncached_retry_writev(wdata);
+ struct list_head tmp_list;
+ struct iov_iter tmp_from;
+
+ INIT_LIST_HEAD(&tmp_list);
+ list_del_init(&wdata->list);
+
+ memcpy(&tmp_from, &saved_from,
+ sizeof(struct iov_iter));
+ iov_iter_advance(&tmp_from,
+ wdata->offset - *poffset);
+
+ rc = cifs_write_from_iter(wdata->offset,
+ wdata->bytes, &tmp_from,
+ open_file, cifs_sb, &tmp_list);
+
+ list_splice(&tmp_list, &wdata_list);
+
+ kref_put(&wdata->refcount,
+ cifs_uncached_writedata_release);
goto restart_loop;
}
}
@@@ -2827,6 -2722,26 +2827,6 @@@ cifs_uncached_readdata_release(struct k
cifs_readdata_release(refcount);
}
-static int
-cifs_retry_async_readv(struct cifs_readdata *rdata)
-{
- int rc;
- struct TCP_Server_Info *server;
-
- server = tlink_tcon(rdata->cfile->tlink)->ses->server;
-
- do {
- if (rdata->cfile->invalidHandle) {
- rc = cifs_reopen_file(rdata->cfile, true);
- if (rc != 0)
- continue;
- }
- rc = server->ops->async_readv(rdata);
- } while (rc == -EAGAIN);
-
- return rc;
-}
-
/**
* cifs_readdata_to_iov - copy data from pages in response to an iovec
* @rdata: the readdata response with list of pages holding data
@@@ -2839,7 -2754,7 +2839,7 @@@
static int
cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter)
{
- size_t remaining = rdata->bytes;
+ size_t remaining = rdata->got_bytes;
unsigned int i;
for (i = 0; i < rdata->nr_pages; i++) {
@@@ -2867,12 -2782,11 +2867,12 @@@ static in
cifs_uncached_read_into_pages(struct TCP_Server_Info *server,
struct cifs_readdata *rdata, unsigned int len)
{
- int total_read = 0, result = 0;
+ int result = 0;
unsigned int i;
unsigned int nr_pages = rdata->nr_pages;
struct kvec iov;
+ rdata->got_bytes = 0;
rdata->tailsz = PAGE_SIZE;
for (i = 0; i < nr_pages; i++) {
struct page *page = rdata->pages[i];
@@@ -2906,45 -2820,55 +2906,45 @@@
if (result < 0)
break;
- total_read += result;
+ rdata->got_bytes += result;
}
- return total_read > 0 ? total_read : result;
+ return rdata->got_bytes > 0 && result != -ECONNABORTED ?
+ rdata->got_bytes : result;
}
-ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
+static int
+cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
+ struct cifs_sb_info *cifs_sb, struct list_head *rdata_list)
{
- struct file *file = iocb->ki_filp;
- ssize_t rc;
- size_t len, cur_len;
- ssize_t total_read = 0;
- loff_t offset = iocb->ki_pos;
- unsigned int npages;
- struct cifs_sb_info *cifs_sb;
- struct cifs_tcon *tcon;
- struct cifsFileInfo *open_file;
- struct cifs_readdata *rdata, *tmp;
- struct list_head rdata_list;
+ struct cifs_readdata *rdata;
+ unsigned int npages, rsize, credits;
+ size_t cur_len;
+ int rc;
pid_t pid;
+ struct TCP_Server_Info *server;
- len = iov_iter_count(to);
- if (!len)
- return 0;
-
- INIT_LIST_HEAD(&rdata_list);
- cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
- open_file = file->private_data;
- tcon = tlink_tcon(open_file->tlink);
-
- if (!tcon->ses->server->ops->async_readv)
- return -ENOSYS;
+ server = tlink_tcon(open_file->tlink)->ses->server;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
pid = open_file->pid;
else
pid = current->tgid;
- if ((file->f_flags & O_ACCMODE) == O_WRONLY)
- cifs_dbg(FYI, "attempting read on write only file instance\n");
-
do {
- cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize);
+ rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
+ &rsize, &credits);
+ if (rc)
+ break;
+
+ cur_len = min_t(const size_t, len, rsize);
npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
/* allocate a readdata struct */
rdata = cifs_readdata_alloc(npages,
cifs_uncached_readv_complete);
if (!rdata) {
+ add_credits_and_wake_if(server, credits, 0);
rc = -ENOMEM;
break;
}
@@@ -2960,113 -2884,44 +2960,113 @@@
rdata->pid = pid;
rdata->pagesz = PAGE_SIZE;
rdata->read_into_pages = cifs_uncached_read_into_pages;
+ rdata->credits = credits;
- rc = cifs_retry_async_readv(rdata);
+ if (!rdata->cfile->invalidHandle ||
+ !cifs_reopen_file(rdata->cfile, true))
+ rc = server->ops->async_readv(rdata);
error:
if (rc) {
+ add_credits_and_wake_if(server, rdata->credits, 0);
kref_put(&rdata->refcount,
cifs_uncached_readdata_release);
+ if (rc == -EAGAIN)
+ continue;
break;
}
- list_add_tail(&rdata->list, &rdata_list);
+ list_add_tail(&rdata->list, rdata_list);
offset += cur_len;
len -= cur_len;
} while (len > 0);
+ return rc;
+}
+
+ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t rc;
+ size_t len;
+ ssize_t total_read = 0;
+ loff_t offset = iocb->ki_pos;
+ struct cifs_sb_info *cifs_sb;
+ struct cifs_tcon *tcon;
+ struct cifsFileInfo *open_file;
+ struct cifs_readdata *rdata, *tmp;
+ struct list_head rdata_list;
+
+ len = iov_iter_count(to);
+ if (!len)
+ return 0;
+
+ INIT_LIST_HEAD(&rdata_list);
+ cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+ open_file = file->private_data;
+ tcon = tlink_tcon(open_file->tlink);
+
+ if (!tcon->ses->server->ops->async_readv)
+ return -ENOSYS;
+
+ if ((file->f_flags & O_ACCMODE) == O_WRONLY)
+ cifs_dbg(FYI, "attempting read on write only file instance\n");
+
+ rc = cifs_send_async_read(offset, len, open_file, cifs_sb, &rdata_list);
+
/* if at least one read request send succeeded, then reset rc */
if (!list_empty(&rdata_list))
rc = 0;
len = iov_iter_count(to);
/* the loop below should proceed in the order of increasing offsets */
+again:
list_for_each_entry_safe(rdata, tmp, &rdata_list, list) {
- again:
if (!rc) {
/* FIXME: freezable sleep too? */
rc = wait_for_completion_killable(&rdata->done);
if (rc)
rc = -EINTR;
- else if (rdata->result) {
- rc = rdata->result;
+ else if (rdata->result == -EAGAIN) {
/* resend call if it's a retryable error */
- if (rc == -EAGAIN) {
- rc = cifs_retry_async_readv(rdata);
- goto again;
+ struct list_head tmp_list;
+ unsigned int got_bytes = rdata->got_bytes;
+
+ list_del_init(&rdata->list);
+ INIT_LIST_HEAD(&tmp_list);
+
+ /*
+ * Got a part of data and then reconnect has
+ * happened -- fill the buffer and continue
+ * reading.
+ */
+ if (got_bytes && got_bytes < rdata->bytes) {
+ rc = cifs_readdata_to_iov(rdata, to);
+ if (rc) {
+ kref_put(&rdata->refcount,
+ cifs_uncached_readdata_release);
+ continue;
+ }
}
- } else {
+
+ rc = cifs_send_async_read(
+ rdata->offset + got_bytes,
+ rdata->bytes - got_bytes,
+ rdata->cfile, cifs_sb,
+ &tmp_list);
+
+ list_splice(&tmp_list, &rdata_list);
+
+ kref_put(&rdata->refcount,
+ cifs_uncached_readdata_release);
+ goto again;
+ } else if (rdata->result)
+ rc = rdata->result;
+ else
rc = cifs_readdata_to_iov(rdata, to);
- }
+ /* if there was a short read -- discard anything left */
+ if (rdata->got_bytes && rdata->got_bytes < rdata->bytes)
+ rc = -ENODATA;
}
list_del_init(&rdata->list);
kref_put(&rdata->refcount, cifs_uncached_readdata_release);
@@@ -3175,19 -3030,18 +3175,19 @@@ cifs_read(struct file *file, char *read
for (total_read = 0, cur_offset = read_data; read_size > total_read;
total_read += bytes_read, cur_offset += bytes_read) {
- current_read_size = min_t(uint, read_size - total_read, rsize);
- /*
- * For windows me and 9x we do not want to request more than it
- * negotiated since it will refuse the read then.
- */
- if ((tcon->ses) && !(tcon->ses->capabilities &
+ do {
+ current_read_size = min_t(uint, read_size - total_read,
+ rsize);
+ /*
+ * For windows me and 9x we do not want to request more
+ * than it negotiated since it will refuse the read
+ * then.
+ */
+ if ((tcon->ses) && !(tcon->ses->capabilities &
tcon->ses->server->vals->cap_large_files)) {
- current_read_size = min_t(uint, current_read_size,
- CIFSMaxBufSize);
- }
- rc = -EAGAIN;
- while (rc == -EAGAIN) {
+ current_read_size = min_t(uint,
+ current_read_size, CIFSMaxBufSize);
+ }
if (open_file->invalidHandle) {
rc = cifs_reopen_file(open_file, true);
if (rc != 0)
@@@ -3200,8 -3054,7 +3200,8 @@@
rc = server->ops->sync_read(xid, open_file, &io_parms,
&bytes_read, &cur_offset,
&buf_type);
- }
+ } while (rc == -EAGAIN);
+
if (rc || (bytes_read == 0)) {
if (total_read) {
break;
@@@ -3280,30 -3133,25 +3280,30 @@@ int cifs_file_mmap(struct file *file, s
static void
cifs_readv_complete(struct work_struct *work)
{
- unsigned int i;
+ unsigned int i, got_bytes;
struct cifs_readdata *rdata = container_of(work,
struct cifs_readdata, work);
+ got_bytes = rdata->got_bytes;
for (i = 0; i < rdata->nr_pages; i++) {
struct page *page = rdata->pages[i];
lru_cache_add_file(page);
- if (rdata->result == 0) {
+ if (rdata->result == 0 ||
+ (rdata->result == -EAGAIN && got_bytes)) {
flush_dcache_page(page);
SetPageUptodate(page);
}
unlock_page(page);
- if (rdata->result == 0)
+ if (rdata->result == 0 ||
+ (rdata->result == -EAGAIN && got_bytes))
cifs_readpage_to_fscache(rdata->mapping->host, page);
+ got_bytes -= min_t(unsigned int, PAGE_CACHE_SIZE, got_bytes);
+
page_cache_release(page);
rdata->pages[i] = NULL;
}
@@@ -3314,7 -3162,7 +3314,7 @@@ static in
cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
struct cifs_readdata *rdata, unsigned int len)
{
- int total_read = 0, result = 0;
+ int result = 0;
unsigned int i;
u64 eof;
pgoff_t eof_index;
@@@ -3326,7 -3174,6 +3326,7 @@@
eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0;
cifs_dbg(FYI, "eof=%llu eof_index=%lu\n", eof, eof_index);
+ rdata->got_bytes = 0;
rdata->tailsz = PAGE_CACHE_SIZE;
for (i = 0; i < nr_pages; i++) {
struct page *page = rdata->pages[i];
@@@ -3381,70 -3228,10 +3381,70 @@@
if (result < 0)
break;
- total_read += result;
+ rdata->got_bytes += result;
+ }
+
+ return rdata->got_bytes > 0 && result != -ECONNABORTED ?
+ rdata->got_bytes : result;
+}
+/*
+ * readpages_get_pages - collect a run of consecutive pages for one read.
+ * @mapping: address space the pages are inserted into
+ * @page_list: list of candidate pages; consumed from its tail (->prev)
+ * @rsize: upper bound, in bytes, on the size of the request being built
+ * @tmplist: initialised here; receives the pages chosen for the request
+ * @nr_pages: out: number of pages moved to @tmplist
+ * @offset: out: byte offset of the first page (index << PAGE_CACHE_SHIFT)
+ * @bytes: out: total size of the run, a multiple of PAGE_CACHE_SIZE
+ *
+ * The tail page of @page_list is locked and added to the page cache; if
+ * that fails the page is unlocked again and the error is returned without
+ * touching the out parameters. Further pages are then taken while their
+ * index continues the run, the run still fits in @rsize and they can be
+ * added to the page cache; the first page that fails any of these checks
+ * ends the run and stays on @page_list.
+ *
+ * Returns 0 once at least the first page is on @tmplist, otherwise the
+ * add_to_page_cache_locked() error for the first page.
+ *
+ * NOTE(review): expected_index is an unsigned int, while page indexes
+ * elsewhere in this file are kept in pgoff_t -- confirm the comparison
+ * with page->index cannot be affected by truncation.
+ */
+static int
+readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
+ unsigned int rsize, struct list_head *tmplist,
+ unsigned int *nr_pages, loff_t *offset, unsigned int *bytes)
+{
+ struct page *page, *tpage;
+ unsigned int expected_index;
+ int rc;
+
+ INIT_LIST_HEAD(tmplist);
+
+ page = list_entry(page_list->prev, struct page, lru);
+
+ /*
+ * Lock the page and put it in the cache. Since no one else
+ * should have access to this page, we're safe to simply set
+ * PG_locked without checking it first.
+ */
+ __set_page_locked(page);
+ rc = add_to_page_cache_locked(page, mapping,
+ page->index, GFP_KERNEL);
+
+ /* give up if we can't stick it in the cache */
+ if (rc) {
+ __clear_page_locked(page);
+ return rc;
}
- return total_read > 0 ? total_read : result;
+ /* move first page to the tmplist */
+ *offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
+ *bytes = PAGE_CACHE_SIZE;
+ *nr_pages = 1;
+ list_move_tail(&page->lru, tmplist);
+
+ /* now try and add more pages onto the request */
+ expected_index = page->index + 1;
+ list_for_each_entry_safe_reverse(page, tpage, page_list, lru) {
+ /* discontinuity ? */
+ if (page->index != expected_index)
+ break;
+
+ /* would this page push the read over the rsize? */
+ if (*bytes + PAGE_CACHE_SIZE > rsize)
+ break;
+
+ __set_page_locked(page);
+ if (add_to_page_cache_locked(page, mapping, page->index,
+ GFP_KERNEL)) {
+ __clear_page_locked(page);
+ break;
+ }
+ list_move_tail(&page->lru, tmplist);
+ (*bytes) += PAGE_CACHE_SIZE;
+ expected_index++;
+ (*nr_pages)++;
+ }
+ return rc; /* rc is 0 here: a non-zero value was returned early above */
}
static int cifs_readpages(struct file *file, struct address_space *mapping,
@@@ -3454,10 -3241,19 +3454,10 @@@
struct list_head tmplist;
struct cifsFileInfo *open_file = file->private_data;
struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
- unsigned int rsize = cifs_sb->rsize;
+ struct TCP_Server_Info *server;
pid_t pid;
/*
- * Give up immediately if rsize is too small to read an entire page.
- * The VFS will fall back to readpage. We should never reach this
- * point however since we set ra_pages to 0 when the rsize is smaller
- * than a cache page.
- */
- if (unlikely(rsize < PAGE_CACHE_SIZE))
- return 0;
-
- /*
* Reads as many pages as possible from fscache. Returns -ENOBUFS
* immediately if the cookie is negative
*
@@@ -3475,7 -3271,7 +3475,7 @@@
pid = current->tgid;
rc = 0;
- INIT_LIST_HEAD(&tmplist);
+ server = tlink_tcon(open_file->tlink)->ses->server;
cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n",
__func__, file, mapping, num_pages);
@@@ -3492,35 -3288,58 +3492,35 @@@
* the rdata->pages, then we want them in increasing order.
*/
while (!list_empty(page_list)) {
- unsigned int i;
- unsigned int bytes = PAGE_CACHE_SIZE;
- unsigned int expected_index;
- unsigned int nr_pages = 1;
+ unsigned int i, nr_pages, bytes, rsize;
loff_t offset;
struct page *page, *tpage;
struct cifs_readdata *rdata;
+ unsigned credits;
- page = list_entry(page_list->prev, struct page, lru);
+ rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
+ &rsize, &credits);
+ if (rc)
+ break;
/*
- * Lock the page and put it in the cache. Since no one else
- * should have access to this page, we're safe to simply set
- * PG_locked without checking it first.
+ * Give up immediately if rsize is too small to read an entire
+ * page. The VFS will fall back to readpage. We should never
+ * reach this point however since we set ra_pages to 0 when the
+ * rsize is smaller than a cache page.
*/
- __set_page_locked(page);
- rc = add_to_page_cache_locked(page, mapping,
- page->index, GFP_KERNEL);
+ if (unlikely(rsize < PAGE_CACHE_SIZE)) {
+ add_credits_and_wake_if(server, credits, 0);
+ return 0;
+ }
- /* give up if we can't stick it in the cache */
+ rc = readpages_get_pages(mapping, page_list, rsize, &tmplist,
+ &nr_pages, &offset, &bytes);
if (rc) {
- __clear_page_locked(page);
+ add_credits_and_wake_if(server, credits, 0);
break;
}
- /* move first page to the tmplist */
- offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
- list_move_tail(&page->lru, &tmplist);
-
- /* now try and add more pages onto the request */
- expected_index = page->index + 1;
- list_for_each_entry_safe_reverse(page, tpage, page_list, lru) {
- /* discontinuity ? */
- if (page->index != expected_index)
- break;
-
- /* would this page push the read over the rsize? */
- if (bytes + PAGE_CACHE_SIZE > rsize)
- break;
-
- __set_page_locked(page);
- if (add_to_page_cache_locked(page, mapping,
- page->index, GFP_KERNEL)) {
- __clear_page_locked(page);
- break;
- }
- list_move_tail(&page->lru, &tmplist);
- bytes += PAGE_CACHE_SIZE;
- expected_index++;
- nr_pages++;
- }
-
rdata = cifs_readdata_alloc(nr_pages, cifs_readv_complete);
if (!rdata) {
/* best to give up if we're out of mem */
@@@ -3531,7 -3350,6 +3531,7 @@@
page_cache_release(page);
}
rc = -ENOMEM;
+ add_credits_and_wake_if(server, credits, 0);
break;
}
@@@ -3542,32 -3360,21 +3542,32 @@@
rdata->pid = pid;
rdata->pagesz = PAGE_CACHE_SIZE;
rdata->read_into_pages = cifs_readpages_read_into_pages;
+ rdata->credits = credits;
list_for_each_entry_safe(page, tpage, &tmplist, lru) {
list_del(&page->lru);
rdata->pages[rdata->nr_pages++] = page;
}
- rc = cifs_retry_async_readv(rdata);
- if (rc != 0) {
+ if (!rdata->cfile->invalidHandle ||
+ !cifs_reopen_file(rdata->cfile, true))
+ rc = server->ops->async_readv(rdata);
+ if (rc) {
+ add_credits_and_wake_if(server, rdata->credits, 0);
for (i = 0; i < rdata->nr_pages; i++) {
page = rdata->pages[i];
lru_cache_add_file(page);
unlock_page(page);
page_cache_release(page);
+ if (rc == -EAGAIN)
+ list_add_tail(&page->lru, &tmplist);
}
kref_put(&rdata->refcount, cifs_readdata_release);
+ if (rc == -EAGAIN) {
+ /* Re-add pages to the page_list and retry */
+ list_splice(&tmplist, page_list);
+ continue;
+ }
break;
}
@@@ -3811,6 -3618,13 +3811,6 @@@ static int cifs_launder_page(struct pag
return rc;
}
-static int
-cifs_pending_writers_wait(void *unused)
-{
- schedule();
- return 0;
-}
-
void cifs_oplock_break(struct work_struct *work)
{
struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
@@@ -3822,7 -3636,7 +3822,7 @@@
int rc = 0;
wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS,
- cifs_pending_writers_wait, TASK_UNINTERRUPTIBLE);
+ TASK_UNINTERRUPTIBLE);
server->ops->downgrade_oplock(server, cinode,
test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags));
diff --combined fs/cifs/sess.c
index 39ee326,27e6175..39b8507
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@@ -46,7 -46,7 +46,7 @@@ static __u32 cifs_ssetup_hdr(struct cif
CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4,
USHRT_MAX));
pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
- pSMB->req.VcNumber = __constant_cpu_to_le16(1);
+ pSMB->req.VcNumber = cpu_to_le16(1);
/* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
@@@ -520,559 -520,382 +520,559 @@@ select_sectype(struct TCP_Server_Info *
}
}
-int
-CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
- const struct nls_table *nls_cp)
+struct sess_data {
+ unsigned int xid;
+ struct cifs_ses *ses; /* session the setup request is built for */
+ struct nls_table *nls_cp;
+ void (*func)(struct sess_data *);
+ int result;
+
+ /*
+ * We will send the SMB in three pieces: a fixed-length beginning
+ * part, an optional SPNEGO blob (which can be zero length), and a
+ * last part which will include the strings and the rest of the bcc
+ * area. This allows us to avoid a large (17K) buffer allocation.
+ */
+ int buf0_type; /* buffer type of iov[0], as passed to free_rsp_buf() */
+ struct kvec iov[3]; /* the three pieces described above */
+};
+
+/*
+ * sess_alloc_buffer - allocate the request buffers for a session setup.
+ * @sess_data: session setup state; iov[0], iov[2] and buf0_type are filled in
+ * @wct: word count handed to small_smb_init_no_tc() for the request
+ *
+ * iov[0] receives the SMB_COM_SESSION_SETUP_ANDX request buffer obtained
+ * from small_smb_init_no_tc() and iov[2] a 2000-byte kmalloc() area for the
+ * strings. Both are released again by sess_free_buffer().
+ *
+ * Returns 0 on success, the small_smb_init_no_tc() error, or -ENOMEM when
+ * the string area cannot be allocated. On failure nothing stays allocated
+ * and iov[0] / buf0_type are reset.
+ */
+static int
+sess_alloc_buffer(struct sess_data *sess_data, int wct)
{
- int rc = 0;
- int wct;
+ int rc;
+ struct cifs_ses *ses = sess_data->ses;
struct smb_hdr *smb_buf;
- char *bcc_ptr;
- char *str_area;
- SESSION_SETUP_ANDX *pSMB;
- __u32 capabilities;
- __u16 count;
- int resp_buf_type;
- struct kvec iov[3];
- enum securityEnum type;
- __u16 action, bytes_remaining;
- struct key *spnego_key = NULL;
- __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
- u16 blob_len;
- char *ntlmsspblob = NULL;
- if (ses == NULL) {
- WARN(1, "%s: ses == NULL!", __func__);
- return -EINVAL;
- }
+ rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses,
+ (void **)&smb_buf);
- type = select_sectype(ses->server, ses->sectype);
- cifs_dbg(FYI, "sess setup type %d\n", type);
- if (type == Unspecified) {
- cifs_dbg(VFS,
- "Unable to select appropriate authentication method!");
- return -EINVAL;
+ if (rc)
+ return rc;
+
+ sess_data->iov[0].iov_base = (char *)smb_buf;
+ sess_data->iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4;
+ /*
+ * This variable will be used to clear the buffer
+ * allocated above in case of any error in the calling function.
+ */
+ sess_data->buf0_type = CIFS_SMALL_BUFFER;
+
+ /* 2000 big enough to fit max user, domain, NOS name etc. */
+ sess_data->iov[2].iov_base = kmalloc(2000, GFP_KERNEL);
+ if (!sess_data->iov[2].iov_base) {
+ rc = -ENOMEM;
+ goto out_free_smb_buf;
}
- if (type == RawNTLMSSP) {
- /* if memory allocation is successful, caller of this function
- * frees it.
- */
- ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL);
- if (!ses->ntlmssp)
- return -ENOMEM;
- ses->ntlmssp->sesskey_per_smbsess = false;
+ return 0;
+
+out_free_smb_buf:
+ /*
+ * smb_buf came from small_smb_init_no_tc(), not from kmalloc(), so it
+ * must not be handed to kfree(). Release it by buffer type, the same
+ * way sess_free_buffer() does (buf0_type is CIFS_SMALL_BUFFER here).
+ */
+ free_rsp_buf(sess_data->buf0_type, smb_buf);
+ sess_data->iov[0].iov_base = NULL;
+ sess_data->iov[0].iov_len = 0;
+ sess_data->buf0_type = CIFS_NO_BUFFER;
+ return rc;
+}
+
+/*
+ * Release what sess_alloc_buffer() set up: iov[0] according to
+ * buf0_type, and the string area in iov[2]. Safe to call more than
+ * once, and safe to call after a failed sess_alloc_buffer().
+ */
+static void
+sess_free_buffer(struct sess_data *sess_data)
+{
+ free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base);
+ sess_data->buf0_type = CIFS_NO_BUFFER;
+ kfree(sess_data->iov[2].iov_base);
+ /*
+ * Forget the freed string area. The raw NTLMSSP path frees the
+ * buffers after the negotiate phase and again after the authenticate
+ * phase; if the second sess_alloc_buffer() fails before allocating a
+ * new string area, the stale pointer must not be freed again.
+ */
+ sess_data->iov[2].iov_base = NULL;
+}
+
+static int
+sess_establish_session(struct sess_data *sess_data)
+{
+ struct cifs_ses *ses = sess_data->ses;
+
+ mutex_lock(&ses->server->srv_mutex);
+ if (!ses->server->session_estab) {
+ if (ses->server->sign) {
+ ses->server->session_key.response =
+ kmemdup(ses->auth_key.response,
+ ses->auth_key.len, GFP_KERNEL);
+ if (!ses->server->session_key.response) {
+ mutex_unlock(&ses->server->srv_mutex);
+ return -ENOMEM;
+ }
+ ses->server->session_key.len =
+ ses->auth_key.len;
+ }
+ ses->server->sequence_number = 0x2;
+ ses->server->session_estab = true;
}
+ mutex_unlock(&ses->server->srv_mutex);
-ssetup_ntlmssp_authenticate:
- if (phase == NtLmChallenge)
- phase = NtLmAuthenticate; /* if ntlmssp, now final phase */
+ cifs_dbg(FYI, "CIFS session established successfully\n");
+ spin_lock(&GlobalMid_Lock);
+ ses->status = CifsGood;
+ ses->need_reconnect = false;
+ spin_unlock(&GlobalMid_Lock);
- if (type == LANMAN) {
-#ifndef CONFIG_CIFS_WEAK_PW_HASH
- /* LANMAN and plaintext are less secure and off by default.
- So we make this explicitly be turned on in kconfig (in the
- build) and turned on at runtime (changed from the default)
- in proc/fs/cifs or via mount parm. Unfortunately this is
- needed for old Win (e.g. Win95), some obscure NAS and OS/2 */
- return -EOPNOTSUPP;
-#endif
- wct = 10; /* lanman 2 style sessionsetup */
- } else if ((type == NTLM) || (type == NTLMv2)) {
- /* For NTLMv2 failures eventually may need to retry NTLM */
- wct = 13; /* old style NTLM sessionsetup */
- } else /* same size: negotiate or auth, NTLMSSP or extended security */
- wct = 12;
+ return 0;
+}
- rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses,
- (void **)&smb_buf);
- if (rc)
- return rc;
+static int
+sess_sendreceive(struct sess_data *sess_data)
+{
+ int rc;
+ struct smb_hdr *smb_buf = (struct smb_hdr *) sess_data->iov[0].iov_base;
+ __u16 count;
- pSMB = (SESSION_SETUP_ANDX *)smb_buf;
+ count = sess_data->iov[1].iov_len + sess_data->iov[2].iov_len;
+ smb_buf->smb_buf_length =
+ cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count);
+ put_bcc(count, smb_buf);
+
+ rc = SendReceive2(sess_data->xid, sess_data->ses,
+ sess_data->iov, 3 /* num_iovecs */,
+ &sess_data->buf0_type,
+ CIFS_LOG_ERROR);
+
+ return rc;
+}
+/*
+ * LANMAN and plaintext are less secure and off by default.
+ * So we make this explicitly be turned on in kconfig (in the
+ * build) and turned on at runtime (changed from the default)
+ * in proc/fs/cifs or via mount parm. Unfortunately this is
+ * needed for old Win (e.g. Win95), some obscure NAS and OS/2
+ */
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+/*
+ * LANMAN session setup: build a wct = 10 SESSION_SETUP_ANDX request
+ * carrying the LANMAN password hash and ASCII strings, send it, check
+ * the response and mark the session established.
+ *
+ * The outcome is reported through sess_data->result; sess_data->func is
+ * set to NULL because this is a single step exchange. The request
+ * buffers are released before returning, on success and on error.
+ */
+static void
+sess_auth_lanman(struct sess_data *sess_data)
+{
+ int rc = 0;
+ struct smb_hdr *smb_buf;
+ SESSION_SETUP_ANDX *pSMB;
+ char *bcc_ptr;
+ struct cifs_ses *ses = sess_data->ses;
+ char lnm_session_key[CIFS_AUTH_RESP_SIZE];
+ __u32 capabilities;
+ __u16 bytes_remaining;
+
+ /* lanman 2 style sessionsetup */
+ /* wct = 10 */
+ rc = sess_alloc_buffer(sess_data, 10);
+ if (rc)
+ goto out;
+
+ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+ bcc_ptr = sess_data->iov[2].iov_base;
capabilities = cifs_ssetup_hdr(ses, pSMB);
- /* we will send the SMB in three pieces:
- a fixed length beginning part, an optional
- SPNEGO blob (which can be zero length), and a
- last part which will include the strings
- and rest of bcc area. This allows us to avoid
- a large buffer 17K allocation */
- iov[0].iov_base = (char *)pSMB;
- iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4;
-
- /* setting this here allows the code at the end of the function
- to free the request buffer if there's an error */
- resp_buf_type = CIFS_SMALL_BUFFER;
+ pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
- /* 2000 big enough to fit max user, domain, NOS name etc. */
- str_area = kmalloc(2000, GFP_KERNEL);
- if (str_area == NULL) {
- rc = -ENOMEM;
- goto ssetup_exit;
- }
- bcc_ptr = str_area;
+ /* no capabilities flags in old lanman negotiation */
+ pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
- iov[1].iov_base = NULL;
- iov[1].iov_len = 0;
+ /* Calculate hash with password and copy into bcc_ptr.
+ * Encryption Key (stored as in cryptkey) gets used if the
+ * security mode bit in Negotiate Protocol response states
+ * to use challenge/response method (i.e. Password bit is 1).
+ */
+ rc = calc_lanman_hash(ses->password, ses->server->cryptkey,
+ ses->server->sec_mode & SECMODE_PW_ENCRYPT ?
+ true : false, lnm_session_key);
+ /*
+ * Do not put lnm_session_key on the wire if the hash could not be
+ * computed: its contents are not known to be valid in that case.
+ */
+ if (rc)
+ goto out;
- if (type == LANMAN) {
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
- char lnm_session_key[CIFS_AUTH_RESP_SIZE];
+ memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
+ bcc_ptr += CIFS_AUTH_RESP_SIZE;
+
+ /*
+ * can not sign if LANMAN negotiated so no need
+ * to calculate signing key? but what if server
+ * changed to do higher than lanman dialect and
+ * we reconnected would we ever calc signing_key?
+ */
- pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
+ cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n");
+ /* Unicode not allowed for LANMAN dialects */
+ ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
- /* no capabilities flags in old lanman negotiation */
+ sess_data->iov[2].iov_len = (long) bcc_ptr -
+ (long) sess_data->iov[2].iov_base;
- pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
+ rc = sess_sendreceive(sess_data);
+ if (rc)
+ goto out;
- /* Calculate hash with password and copy into bcc_ptr.
- * Encryption Key (stored as in cryptkey) gets used if the
- * security mode bit in Negottiate Protocol response states
- * to use challenge/response method (i.e. Password bit is 1).
- */
+ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+ smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
- rc = calc_lanman_hash(ses->password, ses->server->cryptkey,
- ses->server->sec_mode & SECMODE_PW_ENCRYPT ?
- true : false, lnm_session_key);
+ /* lanman response has a word count of 3 */
+ if (smb_buf->WordCount != 3) {
+ rc = -EIO;
+ cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
+ goto out;
+ }
- memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
- bcc_ptr += CIFS_AUTH_RESP_SIZE;
+ if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
+ cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
+
+ ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
+ cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
- /* can not sign if LANMAN negotiated so no need
- to calculate signing key? but what if server
- changed to do higher than lanman dialect and
- we reconnected would we ever calc signing_key? */
+ bytes_remaining = get_bcc(smb_buf);
+ bcc_ptr = pByteArea(smb_buf);
- cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n");
- /* Unicode not allowed for LANMAN dialects */
- ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
+ /* BB check if Unicode and decode strings */
+ if (bytes_remaining == 0) {
+ /* no string area to decode, do nothing */
+ } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
+ /* unicode string area must be word-aligned */
+ if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
+ ++bcc_ptr;
+ --bytes_remaining;
+ }
+ decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
+ sess_data->nls_cp);
+ } else {
+ decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
+ sess_data->nls_cp);
+ }
+
+ rc = sess_establish_session(sess_data);
+out:
+ sess_data->result = rc;
+ sess_data->func = NULL;
+ sess_free_buffer(sess_data);
+}
+
+#else
+
+static void
+sess_auth_lanman(struct sess_data *sess_data)
+{
+ sess_data->result = -EOPNOTSUPP;
+ sess_data->func = NULL;
+}
#endif
- } else if (type == NTLM) {
- pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
- pSMB->req_no_secext.CaseInsensitivePasswordLength =
+
+static void
+sess_auth_ntlm(struct sess_data *sess_data)
+{
+ int rc = 0;
+ struct smb_hdr *smb_buf;
+ SESSION_SETUP_ANDX *pSMB;
+ char *bcc_ptr;
+ struct cifs_ses *ses = sess_data->ses;
+ __u32 capabilities;
+ __u16 bytes_remaining;
+
+ /* old style NTLM sessionsetup */
+ /* wct = 13 */
+ rc = sess_alloc_buffer(sess_data, 13);
+ if (rc)
+ goto out;
+
+ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+ bcc_ptr = sess_data->iov[2].iov_base;
+ capabilities = cifs_ssetup_hdr(ses, pSMB);
+
+ pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
+ pSMB->req_no_secext.CaseInsensitivePasswordLength =
cpu_to_le16(CIFS_AUTH_RESP_SIZE);
- pSMB->req_no_secext.CaseSensitivePasswordLength =
+ pSMB->req_no_secext.CaseSensitivePasswordLength =
cpu_to_le16(CIFS_AUTH_RESP_SIZE);
- /* calculate ntlm response and session key */
- rc = setup_ntlm_response(ses, nls_cp);
- if (rc) {
- cifs_dbg(VFS, "Error %d during NTLM authentication\n",
+ /* calculate ntlm response and session key */
+ rc = setup_ntlm_response(ses, sess_data->nls_cp);
+ if (rc) {
+ cifs_dbg(VFS, "Error %d during NTLM authentication\n",
rc);
- goto ssetup_exit;
- }
+ goto out;
+ }
- /* copy ntlm response */
- memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
- CIFS_AUTH_RESP_SIZE);
- bcc_ptr += CIFS_AUTH_RESP_SIZE;
- memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
- CIFS_AUTH_RESP_SIZE);
- bcc_ptr += CIFS_AUTH_RESP_SIZE;
-
- if (ses->capabilities & CAP_UNICODE) {
- /* unicode strings must be word aligned */
- if (iov[0].iov_len % 2) {
- *bcc_ptr = 0;
- bcc_ptr++;
- }
- unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
- } else
- ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
- } else if (type == NTLMv2) {
- pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
-
- /* LM2 password would be here if we supported it */
- pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
-
- /* calculate nlmv2 response and session key */
- rc = setup_ntlmv2_rsp(ses, nls_cp);
- if (rc) {
- cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n",
- rc);
- goto ssetup_exit;
+ /* copy ntlm response */
+ memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+ CIFS_AUTH_RESP_SIZE);
+ bcc_ptr += CIFS_AUTH_RESP_SIZE;
+ memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+ CIFS_AUTH_RESP_SIZE);
+ bcc_ptr += CIFS_AUTH_RESP_SIZE;
+
+ if (ses->capabilities & CAP_UNICODE) {
+ /* unicode strings must be word aligned */
+ if (sess_data->iov[0].iov_len % 2) {
+ *bcc_ptr = 0;
+ bcc_ptr++;
}
- memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
- ses->auth_key.len - CIFS_SESS_KEY_SIZE);
- bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
-
- /* set case sensitive password length after tilen may get
- * assigned, tilen is 0 otherwise.
- */
- pSMB->req_no_secext.CaseSensitivePasswordLength =
- cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+ unicode_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
+ } else {
+ ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
+ }
- if (ses->capabilities & CAP_UNICODE) {
- if (iov[0].iov_len % 2) {
- *bcc_ptr = 0;
- bcc_ptr++;
- }
- unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
- } else
- ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
- } else if (type == Kerberos) {
-#ifdef CONFIG_CIFS_UPCALL
- struct cifs_spnego_msg *msg;
- spnego_key = cifs_get_spnego_key(ses);
- if (IS_ERR(spnego_key)) {
- rc = PTR_ERR(spnego_key);
- spnego_key = NULL;
- goto ssetup_exit;
- }
+ sess_data->iov[2].iov_len = (long) bcc_ptr -
+ (long) sess_data->iov[2].iov_base;
- msg = spnego_key->payload.data;
- /* check version field to make sure that cifs.upcall is
- sending us a response in an expected form */
- if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
- cifs_dbg(VFS, "incorrect version of cifs.upcall "
- "expected %d but got %d)",
- CIFS_SPNEGO_UPCALL_VERSION, msg->version);
- rc = -EKEYREJECTED;
- goto ssetup_exit;
- }
+ rc = sess_sendreceive(sess_data);
+ if (rc)
+ goto out;
- ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len,
- GFP_KERNEL);
- if (!ses->auth_key.response) {
- cifs_dbg(VFS,
- "Kerberos can't allocate (%u bytes) memory",
- msg->sesskey_len);
- rc = -ENOMEM;
- goto ssetup_exit;
- }
- ses->auth_key.len = msg->sesskey_len;
-
- pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
- capabilities |= CAP_EXTENDED_SECURITY;
- pSMB->req.Capabilities = cpu_to_le32(capabilities);
- iov[1].iov_base = msg->data + msg->sesskey_len;
- iov[1].iov_len = msg->secblob_len;
- pSMB->req.SecurityBlobLength = cpu_to_le16(iov[1].iov_len);
-
- if (ses->capabilities & CAP_UNICODE) {
- /* unicode strings must be word aligned */
- if ((iov[0].iov_len + iov[1].iov_len) % 2) {
- *bcc_ptr = 0;
- bcc_ptr++;
- }
- unicode_oslm_strings(&bcc_ptr, nls_cp);
- unicode_domain_string(&bcc_ptr, ses, nls_cp);
- } else
- /* BB: is this right? */
- ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
-#else /* ! CONFIG_CIFS_UPCALL */
- cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n");
- rc = -ENOSYS;
- goto ssetup_exit;
-#endif /* CONFIG_CIFS_UPCALL */
- } else if (type == RawNTLMSSP) {
- if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
- cifs_dbg(VFS, "NTLMSSP requires Unicode support\n");
- rc = -ENOSYS;
- goto ssetup_exit;
- }
+ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+ smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
- cifs_dbg(FYI, "ntlmssp session setup phase %d\n", phase);
- pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
- capabilities |= CAP_EXTENDED_SECURITY;
- pSMB->req.Capabilities |= cpu_to_le32(capabilities);
- switch(phase) {
- case NtLmNegotiate:
- build_ntlmssp_negotiate_blob(
- pSMB->req.SecurityBlob, ses);
- iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
- iov[1].iov_base = pSMB->req.SecurityBlob;
- pSMB->req.SecurityBlobLength =
- cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
- break;
- case NtLmAuthenticate:
- /*
- * 5 is an empirical value, large enough to hold
- * authenticate message plus max 10 of av paris,
- * domain, user, workstation names, flags, etc.
- */
- ntlmsspblob = kzalloc(
- 5*sizeof(struct _AUTHENTICATE_MESSAGE),
- GFP_KERNEL);
- if (!ntlmsspblob) {
- rc = -ENOMEM;
- goto ssetup_exit;
- }
+ if (smb_buf->WordCount != 3) {
+ rc = -EIO;
+ cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
+ goto out;
+ }
- rc = build_ntlmssp_auth_blob(ntlmsspblob,
- &blob_len, ses, nls_cp);
- if (rc)
- goto ssetup_exit;
- iov[1].iov_len = blob_len;
- iov[1].iov_base = ntlmsspblob;
- pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len);
- /*
- * Make sure that we tell the server that we are using
- * the uid that it just gave us back on the response
- * (challenge)
- */
- smb_buf->Uid = ses->Suid;
- break;
- default:
- cifs_dbg(VFS, "invalid phase %d\n", phase);
- rc = -ENOSYS;
- goto ssetup_exit;
+ if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
+ cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
+
+ ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
+ cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
+
+ bytes_remaining = get_bcc(smb_buf);
+ bcc_ptr = pByteArea(smb_buf);
+
+ /* BB check if Unicode and decode strings */
+ if (bytes_remaining == 0) {
+ /* no string area to decode, do nothing */
+ } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
+ /* unicode string area must be word-aligned */
+ if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
+ ++bcc_ptr;
+ --bytes_remaining;
}
- /* unicode strings must be word aligned */
- if ((iov[0].iov_len + iov[1].iov_len) % 2) {
+ decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
+ sess_data->nls_cp);
+ } else {
+ decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
+ sess_data->nls_cp);
+ }
+
+ rc = sess_establish_session(sess_data);
+out:
+ sess_data->result = rc;
+ sess_data->func = NULL;
+ sess_free_buffer(sess_data);
+ kfree(ses->auth_key.response);
+ ses->auth_key.response = NULL;
+}
+
+static void
+sess_auth_ntlmv2(struct sess_data *sess_data)
+{
+ int rc = 0;
+ struct smb_hdr *smb_buf;
+ SESSION_SETUP_ANDX *pSMB;
+ char *bcc_ptr;
+ struct cifs_ses *ses = sess_data->ses;
+ __u32 capabilities;
+ __u16 bytes_remaining;
+
+ /* old style NTLM sessionsetup */
+ /* wct = 13 */
+ rc = sess_alloc_buffer(sess_data, 13);
+ if (rc)
+ goto out;
+
+ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+ bcc_ptr = sess_data->iov[2].iov_base;
+ capabilities = cifs_ssetup_hdr(ses, pSMB);
+
+ pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
+
+ /* LM2 password would be here if we supported it */
+ pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
+
+ /* calculate ntlmv2 response and session key */
+ rc = setup_ntlmv2_rsp(ses, sess_data->nls_cp);
+ if (rc) {
+ cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n", rc);
+ goto out;
+ }
+
+ memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+ ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+ bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
+
+ /* set case sensitive password length after tilen may get
+ * assigned, tilen is 0 otherwise.
+ */
+ pSMB->req_no_secext.CaseSensitivePasswordLength =
+ cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+
+ if (ses->capabilities & CAP_UNICODE) {
+ if (sess_data->iov[0].iov_len % 2) {
*bcc_ptr = 0;
bcc_ptr++;
}
- unicode_oslm_strings(&bcc_ptr, nls_cp);
+ unicode_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
} else {
- cifs_dbg(VFS, "secType %d not supported!\n", type);
- rc = -ENOSYS;
- goto ssetup_exit;
+ ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
}
- iov[2].iov_base = str_area;
- iov[2].iov_len = (long) bcc_ptr - (long) str_area;
- count = iov[1].iov_len + iov[2].iov_len;
- smb_buf->smb_buf_length =
- cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count);
+ sess_data->iov[2].iov_len = (long) bcc_ptr -
+ (long) sess_data->iov[2].iov_base;
- put_bcc(count, smb_buf);
+ rc = sess_sendreceive(sess_data);
+ if (rc)
+ goto out;
- rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type,
- CIFS_LOG_ERROR);
- /* SMB request buf freed in SendReceive2 */
+ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+ smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
+
+ if (smb_buf->WordCount != 3) {
+ rc = -EIO;
+ cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
+ goto out;
+ }
+
+ if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
+ cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
+
+ ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
+ cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
- pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
- smb_buf = (struct smb_hdr *)iov[0].iov_base;
+ bytes_remaining = get_bcc(smb_buf);
+ bcc_ptr = pByteArea(smb_buf);
- if ((type == RawNTLMSSP) && (resp_buf_type != CIFS_NO_BUFFER) &&
- (smb_buf->Status.CifsError ==
- cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) {
- if (phase != NtLmNegotiate) {
- cifs_dbg(VFS, "Unexpected more processing error\n");
- goto ssetup_exit;
+ /* BB check if Unicode and decode strings */
+ if (bytes_remaining == 0) {
+ /* no string area to decode, do nothing */
+ } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
+ /* unicode string area must be word-aligned */
+ if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
+ ++bcc_ptr;
+ --bytes_remaining;
}
- /* NTLMSSP Negotiate sent now processing challenge (response) */
- phase = NtLmChallenge; /* process ntlmssp challenge */
- rc = 0; /* MORE_PROC rc is not an error here, but expected */
+ decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
+ sess_data->nls_cp);
+ } else {
+ decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
+ sess_data->nls_cp);
}
+
+ rc = sess_establish_session(sess_data);
+out:
+ sess_data->result = rc;
+ sess_data->func = NULL;
+ sess_free_buffer(sess_data);
+ kfree(ses->auth_key.response);
+ ses->auth_key.response = NULL;
+}
+
+#ifdef CONFIG_CIFS_UPCALL
+static void
+sess_auth_kerberos(struct sess_data *sess_data)
+{
+ int rc = 0;
+ struct smb_hdr *smb_buf;
+ SESSION_SETUP_ANDX *pSMB;
+ char *bcc_ptr;
+ struct cifs_ses *ses = sess_data->ses;
+ __u32 capabilities;
+ __u16 bytes_remaining;
+ struct key *spnego_key = NULL;
+ struct cifs_spnego_msg *msg;
+ u16 blob_len;
+
+ /* extended security */
+ /* wct = 12 */
+ rc = sess_alloc_buffer(sess_data, 12);
if (rc)
- goto ssetup_exit;
+ goto out;
- if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) {
+ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+ bcc_ptr = sess_data->iov[2].iov_base;
+ capabilities = cifs_ssetup_hdr(ses, pSMB);
+
+ spnego_key = cifs_get_spnego_key(ses);
+ if (IS_ERR(spnego_key)) {
+ rc = PTR_ERR(spnego_key);
+ spnego_key = NULL;
+ goto out;
+ }
+
+ msg = spnego_key->payload.data;
+ /*
+ * check version field to make sure that cifs.upcall is
+ * sending us a response in an expected form
+ */
+ if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
+ cifs_dbg(VFS,
+ "incorrect version of cifs.upcall (expected %d but got %d)",
+ CIFS_SPNEGO_UPCALL_VERSION, msg->version);
+ rc = -EKEYREJECTED;
+ goto out_put_spnego_key;
+ }
+
+ ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len,
+ GFP_KERNEL);
+ if (!ses->auth_key.response) {
+ cifs_dbg(VFS, "Kerberos can't allocate (%u bytes) memory",
+ msg->sesskey_len);
+ rc = -ENOMEM;
+ goto out_put_spnego_key;
+ }
+ ses->auth_key.len = msg->sesskey_len;
+
+ pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
+ capabilities |= CAP_EXTENDED_SECURITY;
+ pSMB->req.Capabilities = cpu_to_le32(capabilities);
+ sess_data->iov[1].iov_base = msg->data + msg->sesskey_len;
+ sess_data->iov[1].iov_len = msg->secblob_len;
+ pSMB->req.SecurityBlobLength = cpu_to_le16(sess_data->iov[1].iov_len);
+
+ if (ses->capabilities & CAP_UNICODE) {
+ /* unicode strings must be word aligned */
+ if ((sess_data->iov[0].iov_len
+ + sess_data->iov[1].iov_len) % 2) {
+ *bcc_ptr = 0;
+ bcc_ptr++;
+ }
+ unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp);
+ unicode_domain_string(&bcc_ptr, ses, sess_data->nls_cp);
+ } else {
+ /* BB: is this right? */
+ ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
+ }
+
+ sess_data->iov[2].iov_len = (long) bcc_ptr -
+ (long) sess_data->iov[2].iov_base;
+
+ rc = sess_sendreceive(sess_data);
+ if (rc)
+ goto out_put_spnego_key;
+
+ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+ smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
+
+ if (smb_buf->WordCount != 4) {
rc = -EIO;
cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
- goto ssetup_exit;
+ goto out_put_spnego_key;
}
- action = le16_to_cpu(pSMB->resp.Action);
- if (action & GUEST_LOGIN)
+
+ if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
+
ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
- /* response can have either 3 or 4 word count - Samba sends 3 */
- /* and lanman response is 3 */
+
bytes_remaining = get_bcc(smb_buf);
bcc_ptr = pByteArea(smb_buf);
- if (smb_buf->WordCount == 4) {
- blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
- if (blob_len > bytes_remaining) {
- cifs_dbg(VFS, "bad security blob length %d\n",
- blob_len);
- rc = -EINVAL;
- goto ssetup_exit;
- }
- if (phase == NtLmChallenge) {
- rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses);
- /* now goto beginning for ntlmssp authenticate phase */
- if (rc)
- goto ssetup_exit;
- }
- bcc_ptr += blob_len;
- bytes_remaining -= blob_len;
+ blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
+ if (blob_len > bytes_remaining) {
+ cifs_dbg(VFS, "bad security blob length %d\n",
+ blob_len);
+ rc = -EINVAL;
+ goto out_put_spnego_key;
}
+ bcc_ptr += blob_len;
+ bytes_remaining -= blob_len;
/* BB check if Unicode and decode strings */
if (bytes_remaining == 0) {
@@@ -1083,371 -906,60 +1083,371 @@@
++bcc_ptr;
--bytes_remaining;
}
- decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp);
+ decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
+ sess_data->nls_cp);
} else {
- decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp);
+ decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
+ sess_data->nls_cp);
}
-ssetup_exit:
- if (spnego_key) {
- key_invalidate(spnego_key);
- key_put(spnego_key);
+ rc = sess_establish_session(sess_data);
+out_put_spnego_key:
+ key_invalidate(spnego_key);
+ key_put(spnego_key);
+out:
+ sess_data->result = rc;
+ sess_data->func = NULL;
+ sess_free_buffer(sess_data);
+ kfree(ses->auth_key.response);
+ ses->auth_key.response = NULL;
+}
+
+#else
+
+static void
+sess_auth_kerberos(struct sess_data *sess_data)
+{
+ cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n");
+ sess_data->result = -ENOSYS;
+ sess_data->func = NULL;
+}
+#endif /* ! CONFIG_CIFS_UPCALL */
+
+/*
+ * The required kvec buffers have to be allocated before calling this
+ * function.
+ */
+static int
+_sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data)
+{
+ struct smb_hdr *smb_buf;
+ SESSION_SETUP_ANDX *pSMB;
+ struct cifs_ses *ses = sess_data->ses;
+ __u32 capabilities;
+ char *bcc_ptr;
+
+ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+ smb_buf = (struct smb_hdr *)pSMB;
+
+ capabilities = cifs_ssetup_hdr(ses, pSMB);
+ if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
+ cifs_dbg(VFS, "NTLMSSP requires Unicode support\n");
+ return -ENOSYS;
}
- kfree(str_area);
- kfree(ntlmsspblob);
- ntlmsspblob = NULL;
- if (resp_buf_type == CIFS_SMALL_BUFFER) {
- cifs_dbg(FYI, "ssetup freeing small buf %p\n", iov[0].iov_base);
- cifs_small_buf_release(iov[0].iov_base);
- } else if (resp_buf_type == CIFS_LARGE_BUFFER)
- cifs_buf_release(iov[0].iov_base);
- /* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */
- if ((phase == NtLmChallenge) && (rc == 0))
- goto ssetup_ntlmssp_authenticate;
+ pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
+ capabilities |= CAP_EXTENDED_SECURITY;
+ pSMB->req.Capabilities |= cpu_to_le32(capabilities);
+
+ bcc_ptr = sess_data->iov[2].iov_base;
+ /* unicode strings must be word aligned */
+ if ((sess_data->iov[0].iov_len + sess_data->iov[1].iov_len) % 2) {
+ *bcc_ptr = 0;
+ bcc_ptr++;
+ }
+ unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp);
+
+ sess_data->iov[2].iov_len = (long) bcc_ptr -
+ (long) sess_data->iov[2].iov_base;
+
+ return 0;
+}
+
+static void
+sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data);
+
+static void
+sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data)
+{
+ int rc;
+ struct smb_hdr *smb_buf;
+ SESSION_SETUP_ANDX *pSMB;
+ struct cifs_ses *ses = sess_data->ses;
+ __u16 bytes_remaining;
+ char *bcc_ptr;
+ u16 blob_len;
+
+ cifs_dbg(FYI, "rawntlmssp session setup negotiate phase\n");
+
+ /*
+ * if memory allocation is successful, caller of this function
+ * frees it.
+ */
+ ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL);
+ if (!ses->ntlmssp) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ ses->ntlmssp->sesskey_per_smbsess = false;
+
+ /* wct = 12 */
+ rc = sess_alloc_buffer(sess_data, 12);
+ if (rc)
+ goto out;
+
+ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+
+ /* Build security blob before we assemble the request */
+ build_ntlmssp_negotiate_blob(pSMB->req.SecurityBlob, ses);
+ sess_data->iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
+ sess_data->iov[1].iov_base = pSMB->req.SecurityBlob;
+ pSMB->req.SecurityBlobLength = cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
+
+ rc = _sess_auth_rawntlmssp_assemble_req(sess_data);
+ if (rc)
+ goto out;
+
+ rc = sess_sendreceive(sess_data);
+
+ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+ smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
+
+ /* If true, rc here is expected and not an error */
+ if (sess_data->buf0_type != CIFS_NO_BUFFER &&
+ smb_buf->Status.CifsError ==
+ cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))
+ rc = 0;
+
+ if (rc)
+ goto out;
+
+ cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n");
+
+ if (smb_buf->WordCount != 4) {
+ rc = -EIO;
+ cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
+ goto out;
+ }
+
+ ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
+ cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
+
+ bytes_remaining = get_bcc(smb_buf);
+ bcc_ptr = pByteArea(smb_buf);
+
+ blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
+ if (blob_len > bytes_remaining) {
+ cifs_dbg(VFS, "bad security blob length %d\n",
+ blob_len);
+ rc = -EINVAL;
+ goto out;
+ }
+
+ rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses);
+out:
+ sess_free_buffer(sess_data);
if (!rc) {
- mutex_lock(&ses->server->srv_mutex);
- if (!ses->server->session_estab) {
- if (ses->server->sign) {
- ses->server->session_key.response =
- kmemdup(ses->auth_key.response,
- ses->auth_key.len, GFP_KERNEL);
- if (!ses->server->session_key.response) {
- rc = -ENOMEM;
- mutex_unlock(&ses->server->srv_mutex);
- goto keycp_exit;
- }
- ses->server->session_key.len =
- ses->auth_key.len;
- }
- ses->server->sequence_number = 0x2;
- ses->server->session_estab = true;
- }
- mutex_unlock(&ses->server->srv_mutex);
+ sess_data->func = sess_auth_rawntlmssp_authenticate;
+ return;
+ }
+
+ /* Else error. Cleanup */
+ kfree(ses->auth_key.response);
+ ses->auth_key.response = NULL;
+ kfree(ses->ntlmssp);
+ ses->ntlmssp = NULL;
+
+ sess_data->func = NULL;
+ sess_data->result = rc;
+}
- cifs_dbg(FYI, "CIFS session established successfully\n");
- spin_lock(&GlobalMid_Lock);
- ses->status = CifsGood;
- ses->need_reconnect = false;
- spin_unlock(&GlobalMid_Lock);
+static void
+sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data)
+{
+ int rc;
+ struct smb_hdr *smb_buf;
+ SESSION_SETUP_ANDX *pSMB;
+ struct cifs_ses *ses = sess_data->ses;
+ __u16 bytes_remaining;
+ char *bcc_ptr;
+ char *ntlmsspblob = NULL;
+ u16 blob_len;
+
+ cifs_dbg(FYI, "rawntlmssp session setup authenticate phase\n");
+
+ /* wct = 12 */
+ rc = sess_alloc_buffer(sess_data, 12);
+ if (rc)
+ goto out;
+
+ /* Build security blob before we assemble the request */
+ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+ smb_buf = (struct smb_hdr *)pSMB;
+ /*
+ * 5 is an empirical value, large enough to hold
+ * authenticate message plus max 10 of av pairs,
+ * domain, user, workstation names, flags, etc.
+ */
+ ntlmsspblob = kzalloc(5*sizeof(struct _AUTHENTICATE_MESSAGE),
+ GFP_KERNEL);
+ if (!ntlmsspblob) {
+ rc = -ENOMEM;
+ goto out;
}
-keycp_exit:
+ rc = build_ntlmssp_auth_blob(ntlmsspblob,
+ &blob_len, ses, sess_data->nls_cp);
+ if (rc)
+ goto out_free_ntlmsspblob;
+ sess_data->iov[1].iov_len = blob_len;
+ sess_data->iov[1].iov_base = ntlmsspblob;
+ pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len);
+ /*
+ * Make sure that we tell the server that we are using
+ * the uid that it just gave us back on the response
+ * (challenge)
+ */
+ smb_buf->Uid = ses->Suid;
+
+ rc = _sess_auth_rawntlmssp_assemble_req(sess_data);
+ if (rc)
+ goto out_free_ntlmsspblob;
+
+ rc = sess_sendreceive(sess_data);
+ if (rc)
+ goto out_free_ntlmsspblob;
+
+ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+ smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
+ if (smb_buf->WordCount != 4) {
+ rc = -EIO;
+ cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
+ goto out_free_ntlmsspblob;
+ }
+
+ if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
+ cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
+
+ bytes_remaining = get_bcc(smb_buf);
+ bcc_ptr = pByteArea(smb_buf);
+ blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
+ if (blob_len > bytes_remaining) {
+ cifs_dbg(VFS, "bad security blob length %d\n",
+ blob_len);
+ rc = -EINVAL;
+ goto out_free_ntlmsspblob;
+ }
+ bcc_ptr += blob_len;
+ bytes_remaining -= blob_len;
+
+
+ /* BB check if Unicode and decode strings */
+ if (bytes_remaining == 0) {
+ /* no string area to decode, do nothing */
+ } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
+ /* unicode string area must be word-aligned */
+ if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
+ ++bcc_ptr;
+ --bytes_remaining;
+ }
+ decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
+ sess_data->nls_cp);
+ } else {
+ decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
+ sess_data->nls_cp);
+ }
+
+out_free_ntlmsspblob:
+ kfree(ntlmsspblob);
+out:
+ sess_free_buffer(sess_data);
+
+ if (!rc)
+ rc = sess_establish_session(sess_data);
+
+ /* Cleanup */
kfree(ses->auth_key.response);
ses->auth_key.response = NULL;
kfree(ses->ntlmssp);
+ ses->ntlmssp = NULL;
+
+ sess_data->func = NULL;
+ sess_data->result = rc;
+}
+
+static int select_sec(struct cifs_ses *ses, struct sess_data *sess_data)
+{
+ int type;
+
+ type = select_sectype(ses->server, ses->sectype);
+ cifs_dbg(FYI, "sess setup type %d\n", type);
+ if (type == Unspecified) {
+ cifs_dbg(VFS,
+ "Unable to select appropriate authentication method!");
+ return -EINVAL;
+ }
+
+ switch (type) {
+ case LANMAN:
+ /* LANMAN and plaintext are less secure and off by default.
+ * So we make this explicitly be turned on in kconfig (in the
+ * build) and turned on at runtime (changed from the default)
+ * in proc/fs/cifs or via mount parm. Unfortunately this is
+ * needed for old Win (e.g. Win95), some obscure NAS and OS/2 */
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+ sess_data->func = sess_auth_lanman;
+ break;
+#else
+ return -EOPNOTSUPP;
+#endif
+ case NTLM:
+ sess_data->func = sess_auth_ntlm;
+ break;
+ case NTLMv2:
+ sess_data->func = sess_auth_ntlmv2;
+ break;
+ case Kerberos:
+#ifdef CONFIG_CIFS_UPCALL
+ sess_data->func = sess_auth_kerberos;
+ break;
+#else
+ cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n");
+ return -ENOSYS;
+ break;
+#endif /* CONFIG_CIFS_UPCALL */
+ case RawNTLMSSP:
+ sess_data->func = sess_auth_rawntlmssp_negotiate;
+ break;
+ default:
+ cifs_dbg(VFS, "secType %d not supported!\n", type);
+ return -ENOSYS;
+ }
+
+ return 0;
+}
+
+int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
+ const struct nls_table *nls_cp)
+{
+ int rc = 0;
+ struct sess_data *sess_data;
+
+ if (ses == NULL) {
+ WARN(1, "%s: ses == NULL!", __func__);
+ return -EINVAL;
+ }
+
+ sess_data = kzalloc(sizeof(struct sess_data), GFP_KERNEL);
+ if (!sess_data)
+ return -ENOMEM;
+
+ rc = select_sec(ses, sess_data);
+ if (rc)
+ goto out;
+
+ sess_data->xid = xid;
+ sess_data->ses = ses;
+ sess_data->buf0_type = CIFS_NO_BUFFER;
+ sess_data->nls_cp = (struct nls_table *) nls_cp;
+
+ while (sess_data->func)
+ sess_data->func(sess_data);
+
+ /* Store result before we free sess_data */
+ rc = sess_data->result;
+out:
+ kfree(sess_data);
return rc;
}
diff --combined fs/cifs/smb2ops.c
index 081529f,7f99a0f..59437c5
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@@ -112,53 -112,6 +112,53 @@@ smb2_get_credits(struct mid_q_entry *mi
return le16_to_cpu(((struct smb2_hdr *)mid->resp_buf)->CreditRequest);
}
+static int
+smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
+ unsigned int *num, unsigned int *credits)
+{
+ int rc = 0;
+ unsigned int scredits;
+
+ spin_lock(&server->req_lock);
+ while (1) {
+ if (server->credits <= 0) {
+ spin_unlock(&server->req_lock);
+ cifs_num_waiters_inc(server);
+ rc = wait_event_killable(server->request_q,
+ has_credits(server, &server->credits));
+ cifs_num_waiters_dec(server);
+ if (rc)
+ return rc;
+ spin_lock(&server->req_lock);
+ } else {
+ if (server->tcpStatus == CifsExiting) {
+ spin_unlock(&server->req_lock);
+ return -ENOENT;
+ }
+
+ scredits = server->credits;
+ /* can deadlock with reopen */
+ if (scredits == 1) {
+ *num = SMB2_MAX_BUFFER_SIZE;
+ *credits = 0;
+ break;
+ }
+
+ /* leave one credit for a possible reopen */
+ scredits--;
+ *num = min_t(unsigned int, size,
+ scredits * SMB2_MAX_BUFFER_SIZE);
+
+ *credits = DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE);
+ server->credits -= *credits;
+ server->in_flight++;
+ break;
+ }
+ }
+ spin_unlock(&server->req_lock);
+ return rc;
+}
+
static __u64
smb2_get_next_mid(struct TCP_Server_Info *server)
{
@@@ -229,6 -182,8 +229,6 @@@ smb2_negotiate_wsize(struct cifs_tcon *
/* start with specified wsize, or default */
wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE;
wsize = min_t(unsigned int, wsize, server->max_write);
- /* set it to the maximum buffer size value we can send with 1 credit */
- wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
return wsize;
}
@@@ -242,6 -197,8 +242,6 @@@ smb2_negotiate_rsize(struct cifs_tcon *
/* start with specified rsize, or default */
rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE;
rsize = min_t(unsigned int, rsize, server->max_read);
- /* set it to the maximum buffer size value we can send with 1 credit */
- rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
return rsize;
}
@@@ -590,7 -547,7 +590,7 @@@ smb2_clone_range(const unsigned int xid
goto cchunk_out;
/* For now array only one chunk long, will make more flexible later */
- pcchunk->ChunkCount = __constant_cpu_to_le32(1);
+ pcchunk->ChunkCount = cpu_to_le32(1);
pcchunk->Reserved = 0;
pcchunk->Reserved2 = 0;
@@@ -1147,13 -1104,6 +1147,13 @@@ smb3_parse_lease_buf(void *buf, unsigne
return le32_to_cpu(lc->lcontext.LeaseState);
}
+static unsigned int
+smb2_wp_retry_size(struct inode *inode)
+{
+ return min_t(unsigned int, CIFS_SB(inode->i_sb)->wsize,
+ SMB2_MAX_BUFFER_SIZE);
+}
+
struct smb_version_operations smb20_operations = {
.compare_fids = smb2_compare_fids,
.setup_request = smb2_setup_request,
@@@ -1163,7 -1113,6 +1163,7 @@@
.set_credits = smb2_set_credits,
.get_credits_field = smb2_get_credits_field,
.get_credits = smb2_get_credits,
+ .wait_mtu_credits = cifs_wait_mtu_credits,
.get_next_mid = smb2_get_next_mid,
.read_data_offset = smb2_read_data_offset,
.read_data_length = smb2_read_data_length,
@@@ -1228,7 -1177,6 +1228,7 @@@
.create_lease_buf = smb2_create_lease_buf,
.parse_lease_buf = smb2_parse_lease_buf,
.clone_range = smb2_clone_range,
+ .wp_retry_size = smb2_wp_retry_size,
};
struct smb_version_operations smb21_operations = {
@@@ -1240,7 -1188,6 +1240,7 @@@
.set_credits = smb2_set_credits,
.get_credits_field = smb2_get_credits_field,
.get_credits = smb2_get_credits,
+ .wait_mtu_credits = smb2_wait_mtu_credits,
.get_next_mid = smb2_get_next_mid,
.read_data_offset = smb2_read_data_offset,
.read_data_length = smb2_read_data_length,
@@@ -1305,7 -1252,6 +1305,7 @@@
.create_lease_buf = smb2_create_lease_buf,
.parse_lease_buf = smb2_parse_lease_buf,
.clone_range = smb2_clone_range,
+ .wp_retry_size = smb2_wp_retry_size,
};
struct smb_version_operations smb30_operations = {
@@@ -1317,7 -1263,6 +1317,7 @@@
.set_credits = smb2_set_credits,
.get_credits_field = smb2_get_credits_field,
.get_credits = smb2_get_credits,
+ .wait_mtu_credits = smb2_wait_mtu_credits,
.get_next_mid = smb2_get_next_mid,
.read_data_offset = smb2_read_data_offset,
.read_data_length = smb2_read_data_length,
@@@ -1385,7 -1330,6 +1385,7 @@@
.parse_lease_buf = smb3_parse_lease_buf,
.clone_range = smb2_clone_range,
.validate_negotiate = smb3_validate_negotiate,
+ .wp_retry_size = smb2_wp_retry_size,
};
struct smb_version_values smb20_values = {
diff --combined fs/cifs/smb2pdu.c
index 768cddb,a9b03c2..2057250
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@@ -245,6 -245,10 +245,6 @@@ smb2_reconnect(__le16 smb2_command, str
if (rc)
goto out;
atomic_inc(&tconInfoReconnectCount);
- /*
- * BB FIXME add code to check if wsize needs update due to negotiated
- * smb buffer size shrinking.
- */
out:
/*
* Check if handle based operation so we know whether we can continue
@@@ -305,6 -309,16 +305,6 @@@ small_smb2_init(__le16 smb2_command, st
return rc;
}
-static void
-free_rsp_buf(int resp_buftype, void *rsp)
-{
- if (resp_buftype == CIFS_SMALL_BUFFER)
- cifs_small_buf_release(rsp);
- else if (resp_buftype == CIFS_LARGE_BUFFER)
- cifs_buf_release(rsp);
-}
-
-
/*
*
* SMB2 Worker functions follow:
@@@ -1355,7 -1369,7 +1355,7 @@@ SMB2_set_compression(const unsigned in
char *ret_data = NULL;
fsctl_input.CompressionState =
- __constant_cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
+ cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
FSCTL_SET_COMPRESSION, true /* is_fsctl */,
@@@ -1724,18 -1738,12 +1724,18 @@@ smb2_readv_callback(struct mid_q_entry
rc);
}
/* FIXME: should this be counted toward the initiating task? */
- task_io_account_read(rdata->bytes);
- cifs_stats_bytes_read(tcon, rdata->bytes);
+ task_io_account_read(rdata->got_bytes);
+ cifs_stats_bytes_read(tcon, rdata->got_bytes);
break;
case MID_REQUEST_SUBMITTED:
case MID_RETRY_NEEDED:
rdata->result = -EAGAIN;
+ if (server->sign && rdata->got_bytes)
+ /* reset bytes number since we can not check a sign */
+ rdata->got_bytes = 0;
+ /* FIXME: should this be counted toward the initiating task? */
+ task_io_account_read(rdata->got_bytes);
+ cifs_stats_bytes_read(tcon, rdata->got_bytes);
break;
default:
if (rdata->result != -ENODATA)
@@@ -1754,12 -1762,11 +1754,12 @@@
int
smb2_async_readv(struct cifs_readdata *rdata)
{
- int rc;
+ int rc, flags = 0;
struct smb2_hdr *buf;
struct cifs_io_parms io_parms;
struct smb_rqst rqst = { .rq_iov = &rdata->iov,
.rq_nvec = 1 };
+ struct TCP_Server_Info *server;
cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n",
__func__, rdata->offset, rdata->bytes);
@@@ -1770,41 -1777,18 +1770,41 @@@
io_parms.persistent_fid = rdata->cfile->fid.persistent_fid;
io_parms.volatile_fid = rdata->cfile->fid.volatile_fid;
io_parms.pid = rdata->pid;
+
+ server = io_parms.tcon->ses->server;
+
rc = smb2_new_read_req(&rdata->iov, &io_parms, 0, 0);
- if (rc)
+ if (rc) {
+ if (rc == -EAGAIN && rdata->credits) {
+ /* credits was reseted by reconnect */
+ rdata->credits = 0;
+ /* reduce in_flight value since we won't send the req */
+ spin_lock(&server->req_lock);
+ server->in_flight--;
+ spin_unlock(&server->req_lock);
+ }
return rc;
+ }
buf = (struct smb2_hdr *)rdata->iov.iov_base;
/* 4 for rfc1002 length field */
rdata->iov.iov_len = get_rfc1002_length(rdata->iov.iov_base) + 4;
+ if (rdata->credits) {
+ buf->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes,
+ SMB2_MAX_BUFFER_SIZE));
+ spin_lock(&server->req_lock);
+ server->credits += rdata->credits -
+ le16_to_cpu(buf->CreditCharge);
+ spin_unlock(&server->req_lock);
+ wake_up(&server->request_q);
+ flags = CIFS_HAS_CREDITS;
+ }
+
kref_get(&rdata->refcount);
rc = cifs_call_async(io_parms.tcon->ses->server, &rqst,
cifs_readv_receive, smb2_readv_callback,
- rdata, 0);
+ rdata, flags);
if (rc) {
kref_put(&rdata->refcount, cifs_readdata_release);
cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE);
@@@ -1922,25 -1906,15 +1922,25 @@@ in
smb2_async_writev(struct cifs_writedata *wdata,
void (*release)(struct kref *kref))
{
- int rc = -EACCES;
+ int rc = -EACCES, flags = 0;
struct smb2_write_req *req = NULL;
struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+ struct TCP_Server_Info *server = tcon->ses->server;
struct kvec iov;
struct smb_rqst rqst;
rc = small_smb2_init(SMB2_WRITE, tcon, (void **) &req);
- if (rc)
+ if (rc) {
+ if (rc == -EAGAIN && wdata->credits) {
+ /* credits was reseted by reconnect */
+ wdata->credits = 0;
+ /* reduce in_flight value since we won't send the req */
+ spin_lock(&server->req_lock);
+ server->in_flight--;
+ spin_unlock(&server->req_lock);
+ }
goto async_writev_out;
+ }
req->hdr.ProcessId = cpu_to_le32(wdata->cfile->pid);
@@@ -1973,20 -1947,9 +1973,20 @@@
inc_rfc1001_len(&req->hdr, wdata->bytes - 1 /* Buffer */);
+ if (wdata->credits) {
+ req->hdr.CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes,
+ SMB2_MAX_BUFFER_SIZE));
+ spin_lock(&server->req_lock);
+ server->credits += wdata->credits -
+ le16_to_cpu(req->hdr.CreditCharge);
+ spin_unlock(&server->req_lock);
+ wake_up(&server->request_q);
+ flags = CIFS_HAS_CREDITS;
+ }
+
kref_get(&wdata->refcount);
- rc = cifs_call_async(tcon->ses->server, &rqst, NULL,
- smb2_writev_callback, wdata, 0);
+ rc = cifs_call_async(server, &rqst, NULL, smb2_writev_callback, wdata,
+ flags);
if (rc) {
kref_put(&wdata->refcount, release);
diff --combined fs/exec.c
index ab1f120,2ef2751..a2b42a9
--- a/fs/exec.c
+++ b/fs/exec.c
@@@ -368,10 -368,6 +368,6 @@@ static int bprm_mm_init(struct linux_bi
if (!mm)
goto err;
- err = init_new_context(current, mm);
- if (err)
- goto err;
-
err = __bprm_mm_init(bprm);
if (err)
goto err;
@@@ -1216,7 -1212,7 +1212,7 @@@ EXPORT_SYMBOL(install_exec_creds)
/*
* determine how safe it is to execute the proposed program
* - the caller must hold ->cred_guard_mutex to protect against
- * PTRACE_ATTACH
+ * PTRACE_ATTACH or seccomp thread-sync
*/
static void check_unsafe_exec(struct linux_binprm *bprm)
{
@@@ -1234,7 -1230,7 +1230,7 @@@
* This isn't strictly necessary, but it makes it harder for LSMs to
* mess up.
*/
- if (current->no_new_privs)
+ if (task_no_new_privs(current))
bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
t = p;
@@@ -1272,7 -1268,7 +1268,7 @@@ int prepare_binprm(struct linux_binprm
bprm->cred->egid = current_egid();
if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) &&
- !current->no_new_privs &&
+ !task_no_new_privs(current) &&
kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) &&
kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) {
/* Set-uid? */
diff --combined fs/fscache/main.c
index a31b83c,3248c15..b39d487
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@@ -67,7 -67,7 +67,7 @@@ static int fscache_max_active_sysctl(st
return ret;
}
- struct ctl_table fscache_sysctls[] = {
+ static struct ctl_table fscache_sysctls[] = {
{
.procname = "object_max_active",
.data = &fscache_object_max_active,
@@@ -87,7 -87,7 +87,7 @@@
{}
};
- struct ctl_table fscache_sysctls_root[] = {
+ static struct ctl_table fscache_sysctls_root[] = {
{
.procname = "fscache",
.mode = 0555,
@@@ -197,6 -197,24 +197,6 @@@ static void __exit fscache_exit(void
module_exit(fscache_exit);
/*
- * wait_on_bit() sleep function for uninterruptible waiting
- */
-int fscache_wait_bit(void *flags)
-{
- schedule();
- return 0;
-}
-
-/*
- * wait_on_bit() sleep function for interruptible waiting
- */
-int fscache_wait_bit_interruptible(void *flags)
-{
- schedule();
- return signal_pending(current);
-}
-
-/*
* wait_on_atomic_t() sleep function for uninterruptible waiting
*/
int fscache_wait_atomic_t(atomic_t *p)
diff --combined fs/namespace.c
index b10db3d,2a1447c..019ff81
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@@ -225,7 -225,6 +225,7 @@@ static struct mount *alloc_vfsmnt(cons
INIT_LIST_HEAD(&mnt->mnt_share);
INIT_LIST_HEAD(&mnt->mnt_slave_list);
INIT_LIST_HEAD(&mnt->mnt_slave);
+ INIT_LIST_HEAD(&mnt->mnt_mp_list);
#ifdef CONFIG_FSNOTIFY
INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
#endif
@@@ -668,45 -667,11 +668,45 @@@ struct vfsmount *lookup_mnt(struct pat
return m;
}
-static struct mountpoint *new_mountpoint(struct dentry *dentry)
+/*
+ * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
+ * current mount namespace.
+ *
+ * The common case is dentries are not mountpoints at all and that
+ * test is handled inline. For the slow case when we are actually
+ * dealing with a mountpoint of some kind, walk through all of the
+ * mounts in the current mount namespace and test to see if the dentry
+ * is a mountpoint.
+ *
+ * The mount_hashtable is not usable in the context because we
+ * need to identify all mounts that may be in the current mount
+ * namespace not just a mount that happens to have some specified
+ * parent mount.
+ */
+bool __is_local_mountpoint(struct dentry *dentry)
+{
+ struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+ struct mount *mnt;
+ bool is_covered = false;
+
+ if (!d_mountpoint(dentry))
+ goto out;
+
+ down_read(&namespace_sem);
+ list_for_each_entry(mnt, &ns->list, mnt_list) {
+ is_covered = (mnt->mnt_mountpoint == dentry);
+ if (is_covered)
+ break;
+ }
+ up_read(&namespace_sem);
+out:
+ return is_covered;
+}
+
+static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
{
struct hlist_head *chain = mp_hash(dentry);
struct mountpoint *mp;
- int ret;
hlist_for_each_entry(mp, chain, m_hash) {
if (mp->m_dentry == dentry) {
@@@ -717,14 -682,6 +717,14 @@@
return mp;
}
}
+ return NULL;
+}
+
+static struct mountpoint *new_mountpoint(struct dentry *dentry)
+{
+ struct hlist_head *chain = mp_hash(dentry);
+ struct mountpoint *mp;
+ int ret;
mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
if (!mp)
@@@ -739,7 -696,6 +739,7 @@@
mp->m_dentry = dentry;
mp->m_count = 1;
hlist_add_head(&mp->m_hash, chain);
+ INIT_LIST_HEAD(&mp->m_list);
return mp;
}
@@@ -747,7 -703,6 +747,7 @@@ static void put_mountpoint(struct mount
{
if (!--mp->m_count) {
struct dentry *dentry = mp->m_dentry;
+ BUG_ON(!list_empty(&mp->m_list));
spin_lock(&dentry->d_lock);
dentry->d_flags &= ~DCACHE_MOUNTED;
spin_unlock(&dentry->d_lock);
@@@ -794,7 -749,6 +794,7 @@@ static void detach_mnt(struct mount *mn
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
list_del_init(&mnt->mnt_child);
hlist_del_init_rcu(&mnt->mnt_hash);
+ list_del_init(&mnt->mnt_mp_list);
put_mountpoint(mnt->mnt_mp);
mnt->mnt_mp = NULL;
}
@@@ -811,7 -765,6 +811,7 @@@ void mnt_set_mountpoint(struct mount *m
child_mnt->mnt_mountpoint = dget(mp->m_dentry);
child_mnt->mnt_parent = mnt;
child_mnt->mnt_mp = mp;
+ list_add_tail(&child_mnt->mnt_mp_list, &mp->m_list);
}
/*
@@@ -845,7 -798,7 +845,7 @@@ static void commit_tree(struct mount *m
list_splice(&head, n->list.prev);
if (shadows)
- hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash);
+ hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
else
hlist_add_head_rcu(&mnt->mnt_hash,
m_hash(&parent->mnt, mnt->mnt_mountpoint));
@@@ -983,25 -936,9 +983,25 @@@ static struct mount *clone_mnt(struct m
return ERR_PTR(err);
}
+static void cleanup_mnt(struct mount *mnt)
+{
+ fsnotify_vfsmount_delete(&mnt->mnt);
+ dput(mnt->mnt.mnt_root);
+ deactivate_super(mnt->mnt.mnt_sb);
+ mnt_free_id(mnt);
+ complete(mnt->mnt_undone);
+ call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
+}
+
+static void cleanup_mnt_work(struct work_struct *work)
+{
+ cleanup_mnt(container_of(work, struct mount, mnt_cleanup_work));
+}
+
static void mntput_no_expire(struct mount *mnt)
{
-put_again:
+ struct completion undone;
+
rcu_read_lock();
mnt_add_count(mnt, -1);
if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
@@@ -1015,15 -952,12 +1015,15 @@@
return;
}
if (unlikely(mnt->mnt_pinned)) {
- mnt_add_count(mnt, mnt->mnt_pinned + 1);
+ init_completion(&undone);
+ mnt->mnt_undone = &undone;
+ mnt_add_count(mnt, mnt->mnt_pinned);
mnt->mnt_pinned = 0;
rcu_read_unlock();
unlock_mount_hash();
acct_auto_close_mnt(&mnt->mnt);
- goto put_again;
+ wait_for_completion(&undone);
+ return;
}
if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
rcu_read_unlock();
@@@ -1047,19 -981,11 +1047,19 @@@
* so mnt_get_writers() below is safe.
*/
WARN_ON(mnt_get_writers(mnt));
- fsnotify_vfsmount_delete(&mnt->mnt);
- dput(mnt->mnt.mnt_root);
- deactivate_super(mnt->mnt.mnt_sb);
- mnt_free_id(mnt);
- call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
+ /* The stack may be deep here, cleanup the mount on a work
+ * queue where the stack is guaranteed to be shallow.
+ */
+ init_completion(&undone);
+ if (!mnt->mnt_undone)
+ mnt->mnt_undone = &undone;
+ else
+ complete(&undone);
+
+ INIT_WORK(&mnt->mnt_cleanup_work, cleanup_mnt_work);
+ schedule_work(&mnt->mnt_cleanup_work);
+
+ wait_for_completion(&undone);
}
void mntput(struct vfsmount *mnt)
@@@ -1335,7 -1261,6 +1335,7 @@@ void umount_tree(struct mount *mnt, in
p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
list_del_init(&p->mnt_child);
if (mnt_has_parent(p)) {
+ list_del_init(&p->mnt_mp_list);
put_mountpoint(p->mnt_mp);
/* move the reference to mountpoint into ->mnt_ex_mountpoint */
p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint;
@@@ -1448,37 -1373,6 +1448,37 @@@ static int do_umount(struct mount *mnt
return retval;
}
+/*
+ * __detach_mounts - lazily unmount all mounts on the specified dentry
+ *
+ * During unlink, rmdir, and d_drop it is possible to loose the path
+ * to an existing mountpoint, and wind up leaking the mount.
+ * detach_mounts allows lazily unmounting those mounts instead of
+ * leaking them.
+ *
+ * The caller may hold dentry->d_inode->i_mutex.
+ */
+void __detach_mounts(struct dentry *dentry)
+{
+ struct mountpoint *mp;
+ struct mount *mnt;
+
+ namespace_lock();
+ mp = lookup_mountpoint(dentry);
+ if (!mp)
+ goto out_unlock;
+
+ lock_mount_hash();
+ while (!list_empty(&mp->m_list)) {
+ mnt = list_first_entry(&mp->m_list, struct mount, mnt_mp_list);
+ umount_tree(mnt, 2);
+ }
+ unlock_mount_hash();
+ put_mountpoint(mp);
+out_unlock:
+ namespace_unlock();
+}
+
/*
* Is the caller allowed to modify his namespace?
*/
@@@ -1828,9 -1722,7 +1828,9 @@@ retry
namespace_lock();
mnt = lookup_mnt(path);
if (likely(!mnt)) {
- struct mountpoint *mp = new_mountpoint(dentry);
+ struct mountpoint *mp = lookup_mountpoint(dentry);
+ if (!mp)
+ mp = new_mountpoint(dentry);
if (IS_ERR(mp)) {
namespace_unlock();
mutex_unlock(&dentry->d_inode->i_mutex);
diff --combined fs/proc/base.c
index e442784,043c83c..2105331
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@@ -105,7 -105,7 +105,7 @@@
*/
struct pid_entry {
- char *name;
+ const char *name;
int len;
umode_t mode;
const struct inode_operations *iop;
@@@ -130,10 -130,6 +130,6 @@@
{ .proc_get_link = get_link } )
#define REG(NAME, MODE, fops) \
NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
- #define INF(NAME, MODE, read) \
- NOD(NAME, (S_IFREG|(MODE)), \
- NULL, &proc_info_file_operations, \
- { .proc_read = read } )
#define ONE(NAME, MODE, show) \
NOD(NAME, (S_IFREG|(MODE)), \
NULL, &proc_single_file_operations, \
@@@ -200,27 -196,32 +196,32 @@@ static int proc_root_link(struct dentr
return result;
}
- static int proc_pid_cmdline(struct task_struct *task, char *buffer)
+ static int proc_pid_cmdline(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
{
- return get_cmdline(task, buffer, PAGE_SIZE);
+ /*
+ * Rely on struct seq_operations::show() being called once
+ * per internal buffer allocation. See single_open(), traverse().
+ */
+ BUG_ON(m->size < PAGE_SIZE);
+ m->count += get_cmdline(task, m->buf, PAGE_SIZE);
+ return 0;
}
- static int proc_pid_auxv(struct task_struct *task, char *buffer)
+ static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
{
struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ);
- int res = PTR_ERR(mm);
if (mm && !IS_ERR(mm)) {
unsigned int nwords = 0;
do {
nwords += 2;
} while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
- res = nwords * sizeof(mm->saved_auxv[0]);
- if (res > PAGE_SIZE)
- res = PAGE_SIZE;
- memcpy(buffer, mm->saved_auxv, res);
+ seq_write(m, mm->saved_auxv, nwords * sizeof(mm->saved_auxv[0]));
mmput(mm);
- }
- return res;
+ return 0;
+ } else
+ return PTR_ERR(mm);
}
@@@ -229,7 -230,8 +230,8 @@@
* Provides a wchan file via kallsyms in a proper one-value-per-file format.
* Returns the resolved symbol. If that fails, simply return the address.
*/
- static int proc_pid_wchan(struct task_struct *task, char *buffer)
+ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
{
unsigned long wchan;
char symname[KSYM_NAME_LEN];
@@@ -240,9 -242,9 +242,9 @@@
if (!ptrace_may_access(task, PTRACE_MODE_READ))
return 0;
else
- return sprintf(buffer, "%lu", wchan);
+ return seq_printf(m, "%lu", wchan);
else
- return sprintf(buffer, "%s", symname);
+ return seq_printf(m, "%s", symname);
}
#endif /* CONFIG_KALLSYMS */
@@@ -304,9 -306,10 +306,10 @@@ static int proc_pid_stack(struct seq_fi
/*
* Provides /proc/PID/schedstat
*/
- static int proc_pid_schedstat(struct task_struct *task, char *buffer)
+ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
{
- return sprintf(buffer, "%llu %llu %lu\n",
+ return seq_printf(m, "%llu %llu %lu\n",
(unsigned long long)task->se.sum_exec_runtime,
(unsigned long long)task->sched_info.run_delay,
task->sched_info.pcount);
@@@ -404,7 -407,8 +407,8 @@@ static const struct file_operations pro
};
#endif
- static int proc_oom_score(struct task_struct *task, char *buffer)
+ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
{
unsigned long totalpages = totalram_pages + total_swap_pages;
unsigned long points = 0;
@@@ -414,12 -418,12 +418,12 @@@
points = oom_badness(task, NULL, NULL, totalpages) *
1000 / totalpages;
read_unlock(&tasklist_lock);
- return sprintf(buffer, "%lu\n", points);
+ return seq_printf(m, "%lu\n", points);
}
struct limit_names {
- char *name;
- char *unit;
+ const char *name;
+ const char *unit;
};
static const struct limit_names lnames[RLIM_NLIMITS] = {
@@@ -442,12 -446,11 +446,11 @@@
};
/* Display limits for a process */
- static int proc_pid_limits(struct task_struct *task, char *buffer)
+ static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
{
unsigned int i;
- int count = 0;
unsigned long flags;
- char *bufptr = buffer;
struct rlimit rlim[RLIM_NLIMITS];
@@@ -459,35 -462,34 +462,34 @@@
/*
* print the file header
*/
- count += sprintf(&bufptr[count], "%-25s %-20s %-20s %-10s\n",
+ seq_printf(m, "%-25s %-20s %-20s %-10s\n",
"Limit", "Soft Limit", "Hard Limit",
"Units");
for (i = 0; i < RLIM_NLIMITS; i++) {
if (rlim[i].rlim_cur == RLIM_INFINITY)
- count += sprintf(&bufptr[count], "%-25s %-20s ",
+ seq_printf(m, "%-25s %-20s ",
lnames[i].name, "unlimited");
else
- count += sprintf(&bufptr[count], "%-25s %-20lu ",
+ seq_printf(m, "%-25s %-20lu ",
lnames[i].name, rlim[i].rlim_cur);
if (rlim[i].rlim_max == RLIM_INFINITY)
- count += sprintf(&bufptr[count], "%-20s ", "unlimited");
+ seq_printf(m, "%-20s ", "unlimited");
else
- count += sprintf(&bufptr[count], "%-20lu ",
- rlim[i].rlim_max);
+ seq_printf(m, "%-20lu ", rlim[i].rlim_max);
if (lnames[i].unit)
- count += sprintf(&bufptr[count], "%-10s\n",
- lnames[i].unit);
+ seq_printf(m, "%-10s\n", lnames[i].unit);
else
- count += sprintf(&bufptr[count], "\n");
+ seq_putc(m, '\n');
}
- return count;
+ return 0;
}
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
- static int proc_pid_syscall(struct task_struct *task, char *buffer)
+ static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
{
long nr;
unsigned long args[6], sp, pc;
@@@ -496,11 -498,11 +498,11 @@@
return res;
if (task_current_syscall(task, &nr, args, 6, &sp, &pc))
- res = sprintf(buffer, "running\n");
+ seq_puts(m, "running\n");
else if (nr < 0)
- res = sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
+ seq_printf(m, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
else
- res = sprintf(buffer,
+ seq_printf(m,
"%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n",
nr,
args[0], args[1], args[2], args[3], args[4], args[5],
@@@ -598,43 -600,6 +600,6 @@@ static const struct inode_operations pr
.setattr = proc_setattr,
};
- #define PROC_BLOCK_SIZE (3*1024) /* 4K page size but our output routines use some slack
for overruns */
-
- static ssize_t proc_info_read(struct file * file, char __user * buf,
- size_t count, loff_t *ppos)
- {
- struct inode * inode = file_inode(file);
- unsigned long page;
- ssize_t length;
- struct task_struct *task = get_proc_task(inode);
-
- length = -ESRCH;
- if (!task)
- goto out_no_task;
-
- if (count > PROC_BLOCK_SIZE)
- count = PROC_BLOCK_SIZE;
-
- length = -ENOMEM;
- if (!(page = __get_free_page(GFP_TEMPORARY)))
- goto out;
-
- length = PROC_I(inode)->op.proc_read(task, (char*)page);
-
- if (length >= 0)
- length = simple_read_from_buffer(buf, count, ppos, (char *)page, length);
- free_page(page);
- out:
- put_task_struct(task);
- out_no_task:
- return length;
- }
-
- static const struct file_operations proc_info_file_operations = {
- .read = proc_info_read,
- .llseek = generic_file_llseek,
- };
-
static int proc_single_show(struct seq_file *m, void *v)
{
struct inode *inode = m->private;
@@@ -1625,6 -1590,7 +1590,6 @@@ int pid_revalidate(struct dentry *dentr
put_task_struct(task);
return 1;
}
- d_drop(dentry);
return 0;
}
@@@ -1761,6 -1727,9 +1726,6 @@@ out
put_task_struct(task);
out_notask:
- if (status <= 0)
- d_drop(dentry);
-
return status;
}
@@@ -2052,7 -2021,7 +2017,7 @@@ static int show_timer(struct seq_file *
struct k_itimer *timer;
struct timers_private *tp = m->private;
int notify;
- static char *nstr[] = {
+ static const char * const nstr[] = {
[SIGEV_SIGNAL] = "signal",
[SIGEV_NONE] = "none",
[SIGEV_THREAD] = "thread",
@@@ -2388,7 -2357,7 +2353,7 @@@ static const struct file_operations pro
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
- static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
+ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole)
{
struct task_io_accounting acct = task->ioac;
unsigned long flags;
@@@ -2412,7 -2381,7 +2377,7 @@@
unlock_task_sighand(task, &flags);
}
- result = sprintf(buffer,
+ result = seq_printf(m,
"rchar: %llu\n"
"wchar: %llu\n"
"syscr: %llu\n"
@@@ -2432,20 -2401,22 +2397,22 @@@ out_unlock
return result;
}
- static int proc_tid_io_accounting(struct task_struct *task, char *buffer)
+ static int proc_tid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
{
- return do_io_accounting(task, buffer, 0);
+ return do_io_accounting(task, m, 0);
}
- static int proc_tgid_io_accounting(struct task_struct *task, char *buffer)
+ static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
{
- return do_io_accounting(task, buffer, 1);
+ return do_io_accounting(task, m, 1);
}
#endif /* CONFIG_TASK_IO_ACCOUNTING */
#ifdef CONFIG_USER_NS
static int proc_id_map_open(struct inode *inode, struct file *file,
- struct seq_operations *seq_ops)
+ const struct seq_operations *seq_ops)
{
struct user_namespace *ns = NULL;
struct task_struct *task;
@@@ -2553,10 -2524,10 +2520,10 @@@ static const struct pid_entry tgid_base
DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations,
proc_net_operations),
#endif
REG("environ", S_IRUSR, proc_environ_operations),
- INF("auxv", S_IRUSR, proc_pid_auxv),
+ ONE("auxv", S_IRUSR, proc_pid_auxv),
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUSR, proc_pid_personality),
- INF("limits", S_IRUGO, proc_pid_limits),
+ ONE("limits", S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
@@@ -2565,9 -2536,9 +2532,9 @@@
#endif
REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
- INF("syscall", S_IRUSR, proc_pid_syscall),
+ ONE("syscall", S_IRUSR, proc_pid_syscall),
#endif
- INF("cmdline", S_IRUGO, proc_pid_cmdline),
+ ONE("cmdline", S_IRUGO, proc_pid_cmdline),
ONE("stat", S_IRUGO, proc_tgid_stat),
ONE("statm", S_IRUGO, proc_pid_statm),
REG("maps", S_IRUGO, proc_pid_maps_operations),
@@@ -2590,13 -2561,13 +2557,13 @@@
DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations,
proc_attr_dir_operations),
#endif
#ifdef CONFIG_KALLSYMS
- INF("wchan", S_IRUGO, proc_pid_wchan),
+ ONE("wchan", S_IRUGO, proc_pid_wchan),
#endif
#ifdef CONFIG_STACKTRACE
ONE("stack", S_IRUSR, proc_pid_stack),
#endif
#ifdef CONFIG_SCHEDSTATS
- INF("schedstat", S_IRUGO, proc_pid_schedstat),
+ ONE("schedstat", S_IRUGO, proc_pid_schedstat),
#endif
#ifdef CONFIG_LATENCYTOP
REG("latency", S_IRUGO, proc_lstats_operations),
@@@ -2607,7 -2578,7 +2574,7 @@@
#ifdef CONFIG_CGROUPS
REG("cgroup", S_IRUGO, proc_cgroup_operations),
#endif
- INF("oom_score", S_IRUGO, proc_oom_score),
+ ONE("oom_score", S_IRUGO, proc_oom_score),
REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDITSYSCALL
@@@ -2621,10 -2592,10 +2588,10 @@@
REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
- INF("io", S_IRUSR, proc_tgid_io_accounting),
+ ONE("io", S_IRUSR, proc_tgid_io_accounting),
#endif
#ifdef CONFIG_HARDWALL
- INF("hardwall", S_IRUGO, proc_pid_hardwall),
+ ONE("hardwall", S_IRUGO, proc_pid_hardwall),
#endif
#ifdef CONFIG_USER_NS
REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
@@@ -2672,7 -2643,8 +2639,7 @@@ static void proc_flush_task_mnt(struct
/* no ->d_hash() rejects on procfs */
dentry = d_hash_and_lookup(mnt->mnt_root, &name);
if (dentry) {
- shrink_dcache_parent(dentry);
- d_drop(dentry);
+ d_invalidate(dentry);
dput(dentry);
}
@@@ -2692,7 -2664,8 +2659,7 @@@
name.len = snprintf(buf, sizeof(buf), "%d", pid);
dentry = d_hash_and_lookup(dir, &name);
if (dentry) {
- shrink_dcache_parent(dentry);
- d_drop(dentry);
+ d_invalidate(dentry);
dput(dentry);
}
@@@ -2774,12 -2747,12 +2741,12 @@@ out
struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int
flags)
{
- int result = 0;
+ int result = -ENOENT;
struct task_struct *task;
unsigned tgid;
struct pid_namespace *ns;
- tgid = name_to_int(dentry);
+ tgid = name_to_int(&dentry->d_name);
if (tgid == ~0U)
goto out;
@@@ -2890,18 -2863,18 +2857,18 @@@ static const struct pid_entry tid_base_
DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations,
proc_fdinfo_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations,
proc_ns_dir_operations),
REG("environ", S_IRUSR, proc_environ_operations),
- INF("auxv", S_IRUSR, proc_pid_auxv),
+ ONE("auxv", S_IRUSR, proc_pid_auxv),
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUSR, proc_pid_personality),
- INF("limits", S_IRUGO, proc_pid_limits),
+ ONE("limits", S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
- INF("syscall", S_IRUSR, proc_pid_syscall),
+ ONE("syscall", S_IRUSR, proc_pid_syscall),
#endif
- INF("cmdline", S_IRUGO, proc_pid_cmdline),
+ ONE("cmdline", S_IRUGO, proc_pid_cmdline),
ONE("stat", S_IRUGO, proc_tid_stat),
ONE("statm", S_IRUGO, proc_pid_statm),
REG("maps", S_IRUGO, proc_tid_maps_operations),
@@@ -2926,13 -2899,13 +2893,13 @@@
DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations,
proc_attr_dir_operations),
#endif
#ifdef CONFIG_KALLSYMS
- INF("wchan", S_IRUGO, proc_pid_wchan),
+ ONE("wchan", S_IRUGO, proc_pid_wchan),
#endif
#ifdef CONFIG_STACKTRACE
ONE("stack", S_IRUSR, proc_pid_stack),
#endif
#ifdef CONFIG_SCHEDSTATS
- INF("schedstat", S_IRUGO, proc_pid_schedstat),
+ ONE("schedstat", S_IRUGO, proc_pid_schedstat),
#endif
#ifdef CONFIG_LATENCYTOP
REG("latency", S_IRUGO, proc_lstats_operations),
@@@ -2943,7 -2916,7 +2910,7 @@@
#ifdef CONFIG_CGROUPS
REG("cgroup", S_IRUGO, proc_cgroup_operations),
#endif
- INF("oom_score", S_IRUGO, proc_oom_score),
+ ONE("oom_score", S_IRUGO, proc_oom_score),
REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDITSYSCALL
@@@ -2954,10 -2927,10 +2921,10 @@@
REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
- INF("io", S_IRUSR, proc_tid_io_accounting),
+ ONE("io", S_IRUSR, proc_tid_io_accounting),
#endif
#ifdef CONFIG_HARDWALL
- INF("hardwall", S_IRUGO, proc_pid_hardwall),
+ ONE("hardwall", S_IRUGO, proc_pid_hardwall),
#endif
#ifdef CONFIG_USER_NS
REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
@@@ -3027,7 -3000,7 +2994,7 @@@ static struct dentry *proc_task_lookup(
if (!leader)
goto out_no_task;
- tid = name_to_int(dentry);
+ tid = name_to_int(&dentry->d_name);
if (tid == ~0U)
goto out;
diff --combined fs/proc/fd.c
index eb82e9f,955bb55..e11d7c5
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@@ -129,6 -129,8 +129,6 @@@ static int tid_fd_revalidate(struct den
}
put_task_struct(task);
}
-
- d_drop(dentry);
return 0;
}
@@@ -204,7 -206,7 +204,7 @@@ static struct dentry *proc_lookupfd_com
{
struct task_struct *task = get_proc_task(dir);
int result = -ENOENT;
- unsigned fd = name_to_int(dentry);
+ unsigned fd = name_to_int(&dentry->d_name);
if (!task)
goto out_no_task;
diff --combined include/linux/fs.h
index 2daccaf,8b4a021..1ab6c69
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@@ -833,7 -833,7 +833,7 @@@ static inline struct file *get_file(str
*
* Lockd stuffs a "host" pointer into this.
*/
-typedef struct files_struct *fl_owner_t;
+typedef void *fl_owner_t;
struct file_lock_operations {
void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
@@@ -2688,7 -2688,7 +2688,7 @@@ static const struct file_operations __f
.read = simple_attr_read, \
.write = simple_attr_write, \
.llseek = generic_file_llseek, \
- };
+ }
static inline __printf(1, 2)
void __simple_attr_check_format(const char *fmt, ...)
diff --combined include/linux/kernel.h
index a9e2268,44a498d..e989204
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@@ -470,6 -470,7 +470,7 @@@ extern enum system_states
#define TAINT_FIRMWARE_WORKAROUND 11
#define TAINT_OOT_MODULE 12
#define TAINT_UNSIGNED_MODULE 13
+ #define TAINT_SOFTLOCKUP 14
extern const char hex_asc[];
#define hex_asc_lo(x) hex_asc[((x) & 0x0f)]
@@@ -493,15 -494,10 +494,10 @@@ static inline char *hex_byte_pack_upper
return buf;
}
extern int hex_to_bin(char ch);
extern int __must_check hex2bin(u8 *dst, const char *src, size_t count);
-int mac_pton(const char *s, u8 *mac);
+bool mac_pton(const char *s, u8 *mac);
/*
* General tracing related utility functions - trace_printk(),
@@@ -719,23 -715,8 +715,8 @@@ static inline void ftrace_dump(enum ftr
(void) (&_max1 == &_max2); \
_max1 > _max2 ? _max1 : _max2; })
- #define min3(x, y, z) ({ \
- typeof(x) _min1 = (x); \
- typeof(y) _min2 = (y); \
- typeof(z) _min3 = (z); \
- (void) (&_min1 == &_min2); \
- (void) (&_min1 == &_min3); \
- _min1 < _min2 ? (_min1 < _min3 ? _min1 : _min3) : \
- (_min2 < _min3 ? _min2 : _min3); })
-
- #define max3(x, y, z) ({ \
- typeof(x) _max1 = (x); \
- typeof(y) _max2 = (y); \
- typeof(z) _max3 = (z); \
- (void) (&_max1 == &_max2); \
- (void) (&_max1 == &_max3); \
- _max1 > _max2 ? (_max1 > _max3 ? _max1 : _max3) : \
- (_max2 > _max3 ? _max2 : _max3); })
+ #define min3(x, y, z) min((typeof(x))min(x, y), z)
+ #define max3(x, y, z) max((typeof(x))max(x, y), z)
/**
* min_not_zero - return the minimum that is _not_ zero, unless both are zero
@@@ -750,20 -731,13 +731,13 @@@
/**
* clamp - return a value clamped to a given range with strict typechecking
* @val: current value
- * @min: minimum allowable value
- * @max: maximum allowable value
+ * @lo: lowest allowable value
+ * @hi: highest allowable value
*
* This macro does strict typechecking of min/max to make sure they are of the
* same type as val. See the unnecessary pointer comparisons.
*/
- #define clamp(val, min, max) ({ \
- typeof(val) __val = (val); \
- typeof(min) __min = (min); \
- typeof(max) __max = (max); \
- (void) (&__val == &__min); \
- (void) (&__val == &__max); \
- __val = __val < __min ? __min: __val; \
- __val > __max ? __max: __val; })
+ #define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)
/*
* ..and if you can't take the strict
diff --combined include/linux/scatterlist.h
index f4ec8bb,4b152c8..ed8f9e7
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@@ -136,7 -136,7 +136,7 @@@ static inline void sg_set_buf(struct sc
static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents,
struct scatterlist *sgl)
{
- #ifndef ARCH_HAS_SG_CHAIN
+ #ifndef CONFIG_ARCH_HAS_SG_CHAIN
BUG();
#endif
@@@ -229,10 -229,10 +229,10 @@@ void sg_init_one(struct scatterlist *,
typedef struct scatterlist *(sg_alloc_fn)(unsigned int, gfp_t);
typedef void (sg_free_fn)(struct scatterlist *, unsigned int);
-void __sg_free_table(struct sg_table *, unsigned int, sg_free_fn *);
+void __sg_free_table(struct sg_table *, unsigned int, bool, sg_free_fn *);
void sg_free_table(struct sg_table *);
-int __sg_alloc_table(struct sg_table *, unsigned int, unsigned int, gfp_t,
- sg_alloc_fn *);
+int __sg_alloc_table(struct sg_table *, unsigned int, unsigned int,
+ struct scatterlist *, gfp_t, sg_alloc_fn *);
int sg_alloc_table(struct sg_table *, unsigned int, gfp_t);
int sg_alloc_table_from_pages(struct sg_table *sgt,
struct page **pages, unsigned int n_pages,
diff --combined include/linux/sched.h
index fa964cf,b9d5364..89f531e
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -33,6 -33,7 +33,7 @@@ struct sched_param
#include <linux/smp.h>
#include <linux/sem.h>
+ #include <linux/shm.h>
#include <linux/signal.h>
#include <linux/compiler.h>
#include <linux/completion.h>
@@@ -1270,6 -1271,9 +1271,6 @@@ struct task_struct
#ifdef CONFIG_TREE_PREEMPT_RCU
struct rcu_node *rcu_blocked_node;
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
-#ifdef CONFIG_RCU_BOOST
- struct rt_mutex *rcu_boost_mutex;
-#endif /* #ifdef CONFIG_RCU_BOOST */
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info sched_info;
@@@ -1304,12 -1308,13 +1305,12 @@@
* execve */
unsigned in_iowait:1;
- /* task may not gain privileges */
- unsigned no_new_privs:1;
-
/* Revert to default priority/policy when forking */
unsigned sched_reset_on_fork:1;
unsigned sched_contributes_to_load:1;
+ unsigned long atomic_flags; /* Flags needing atomic access. */
+
pid_t pid;
pid_t tgid;
@@@ -1385,6 -1390,7 +1386,7 @@@
#ifdef CONFIG_SYSVIPC
/* ipc stuff */
struct sysv_sem sysvsem;
+ struct sysv_shm sysvshm;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
/* hung task detection */
@@@ -1436,6 -1442,8 +1438,6 @@@
struct rb_node *pi_waiters_leftmost;
/* Deadlock detection and priority inheritance handling */
struct rt_mutex_waiter *pi_blocked_on;
- /* Top pi_waiters task */
- struct task_struct *pi_top_task;
#endif
#ifdef CONFIG_DEBUG_MUTEXES
@@@ -1628,12 -1636,6 +1630,6 @@@
unsigned long trace_recursion;
#endif /* CONFIG_TRACING */
#ifdef CONFIG_MEMCG /* memcg uses this to do batch job */
- struct memcg_batch_info {
- int do_batch; /* incremented when batch uncharge started */
- struct mem_cgroup *memcg; /* target memcg of uncharge */
- unsigned long nr_pages; /* uncharged usage */
- unsigned long memsw_nr_pages; /* uncharged mem+swap usage */
- } memcg_batch;
unsigned int memcg_kmem_skip_account;
struct memcg_oom_info {
struct mem_cgroup *memcg;
@@@ -1961,19 -1963,6 +1957,19 @@@ static inline void memalloc_noio_restor
current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags;
}
+/* Per-process atomic flags. */
+#define PFA_NO_NEW_PRIVS 0x00000001 /* May not gain new privileges. */
+
+static inline bool task_no_new_privs(struct task_struct *p)
+{
+ return test_bit(PFA_NO_NEW_PRIVS, &p->atomic_flags);
+}
+
+static inline void task_set_no_new_privs(struct task_struct *p)
+{
+ set_bit(PFA_NO_NEW_PRIVS, &p->atomic_flags);
+}
+
/*
* task->jobctl flags
*/
@@@ -2016,6 -2005,9 +2012,6 @@@ static inline void rcu_copy_process(str
#ifdef CONFIG_TREE_PREEMPT_RCU
p->rcu_blocked_node = NULL;
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
-#ifdef CONFIG_RCU_BOOST
- p->rcu_boost_mutex = NULL;
-#endif /* #ifdef CONFIG_RCU_BOOST */
INIT_LIST_HEAD(&p->rcu_node_entry);
}
@@@ -2364,10 -2356,8 +2360,10 @@@ static inline int on_sig_stack(unsigne
static inline int sas_ss_flags(unsigned long sp)
{
- return (current->sas_ss_size == 0 ? SS_DISABLE
- : on_sig_stack(sp) ? SS_ONSTACK : 0);
+ if (!current->sas_ss_size)
+ return SS_DISABLE;
+
+ return on_sig_stack(sp) ? SS_ONSTACK : 0;
}
static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig)
@@@ -2794,7 -2784,7 +2790,7 @@@ static inline bool __must_check current
/*
* Polling state must be visible before we test NEED_RESCHED,
- * paired by resched_task()
+ * paired by resched_curr()
*/
smp_mb__after_atomic();
@@@ -2812,7 -2802,7 +2808,7 @@@ static inline bool __must_check current
/*
* Polling state must be visible before we test NEED_RESCHED,
- * paired by resched_task()
+ * paired by resched_curr()
*/
smp_mb__after_atomic();
@@@ -2844,7 -2834,7 +2840,7 @@@ static inline void current_clr_polling(
* TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also
* fold.
*/
- smp_mb(); /* paired with resched_task() */
+ smp_mb(); /* paired with resched_curr() */
preempt_fold_need_resched();
}
@@@ -2969,15 -2959,10 +2965,10 @@@ static inline void inc_syscw(struct tas
#ifdef CONFIG_MEMCG
extern void mm_update_next_owner(struct mm_struct *mm);
- extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
#else
static inline void mm_update_next_owner(struct mm_struct *mm)
{
}
-
- static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
- {
- }
#endif /* CONFIG_MEMCG */
static inline unsigned long task_rlimit(const struct task_struct *tsk,
diff --combined include/scsi/scsi.h
index e6df23c,d34cf2d..261e708
--- a/include/scsi/scsi.h
+++ b/include/scsi/scsi.h
@@@ -31,7 -31,7 +31,7 @@@ enum scsi_timeouts
* Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit
* is totally arbitrary, a setting of 2048 will get you at least 8mb ios.
*/
- #ifdef ARCH_HAS_SG_CHAIN
+ #ifdef CONFIG_ARCH_HAS_SG_CHAIN
#define SCSI_MAX_SG_CHAIN_SEGMENTS 2048
#else
#define SCSI_MAX_SG_CHAIN_SEGMENTS SCSI_MAX_SG_SEGMENTS
@@@ -332,7 -332,6 +332,7 @@@ static inline int scsi_status_is_good(i
#define TYPE_ENCLOSURE 0x0d /* Enclosure Services Device */
#define TYPE_RBC 0x0e
#define TYPE_OSD 0x11
+#define TYPE_ZBC 0x14
#define TYPE_NO_LUN 0x7f
/* SCSI protocols; these are taken from SPC-3 section 7.5 */
@@@ -386,7 -385,7 +386,7 @@@ struct scsi_lun
#define SCSI_W_LUN_ACCESS_CONTROL (SCSI_W_LUN_BASE + 2)
#define SCSI_W_LUN_TARGET_LOG_PAGE (SCSI_W_LUN_BASE + 3)
-static inline int scsi_is_wlun(unsigned int lun)
+static inline int scsi_is_wlun(u64 lun)
{
return (lun & 0xff00) == SCSI_W_LUN_BASE;
}
diff --combined init/Kconfig
index 85fb985,77dc4cb..d3ef635
--- a/init/Kconfig
+++ b/init/Kconfig
@@@ -505,7 -505,7 +505,7 @@@ config PREEMPT_RC
def_bool TREE_PREEMPT_RCU
help
This option enables preemptible-RCU code that is common between
- the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations.
+ TREE_PREEMPT_RCU and, in the old days, TINY_PREEMPT_RCU.
config RCU_STALL_COMMON
def_bool ( TREE_RCU || TREE_PREEMPT_RCU || RCU_TRACE )
@@@ -737,7 -737,7 +737,7 @@@ choic
config RCU_NOCB_CPU_NONE
bool "No build_forced no-CBs CPUs"
- depends on RCU_NOCB_CPU && !NO_HZ_FULL
+ depends on RCU_NOCB_CPU && !NO_HZ_FULL_ALL
help
This option does not force any of the CPUs to be no-CBs CPUs.
Only CPUs designated by the rcu_nocbs= boot parameter will be
@@@ -751,7 -751,7 +751,7 @@@
config RCU_NOCB_CPU_ZERO
bool "CPU 0 is a build_forced no-CBs CPU"
- depends on RCU_NOCB_CPU && !NO_HZ_FULL
+ depends on RCU_NOCB_CPU && !NO_HZ_FULL_ALL
help
This option forces CPU 0 to be a no-CBs CPU, so that its RCU
callbacks are invoked by a per-CPU kthread whose name begins
@@@ -807,15 -807,53 +807,53 @@@ config LOG_BUF_SHIF
range 12 21
default 17
help
- Select kernel log buffer size as a power of 2.
+ Select the minimal kernel log buffer size as a power of 2.
+ The final size is affected by LOG_CPU_MAX_BUF_SHIFT config
+ parameter, see below. Any higher size also might be forced
+ by "log_buf_len" boot parameter.
+
Examples:
- 17 => 128 KB
+ 17 => 128 KB
16 => 64 KB
- 15 => 32 KB
- 14 => 16 KB
+ 15 => 32 KB
+ 14 => 16 KB
13 => 8 KB
12 => 4 KB
+ config LOG_CPU_MAX_BUF_SHIFT
+ int "CPU kernel log buffer size contribution (13 => 8 KB, 17 => 128KB)"
+ range 0 21
+ default 12 if !BASE_SMALL
+ default 0 if BASE_SMALL
+ help
+ This option allows to increase the default ring buffer size
+ according to the number of CPUs. The value defines the contribution
+ of each CPU as a power of 2. The used space is typically only few
+ lines however it might be much more when problems are reported,
+ e.g. backtraces.
+
+ The increased size means that a new buffer has to be allocated and
+ the original static one is unused. It makes sense only on systems
+ with more CPUs. Therefore this value is used only when the sum of
+ contributions is greater than the half of the default kernel ring
+ buffer as defined by LOG_BUF_SHIFT. The default values are set
+ so that more than 64 CPUs are needed to trigger the allocation.
+
+ Also this option is ignored when "log_buf_len" kernel parameter is
+ used as it forces an exact (power of two) size of the ring buffer.
+
+ The number of possible CPUs is used for this computation ignoring
+ hotplugging making the computation optimal for the worst case
+ scenario while allowing a simple algorithm to be used from bootup.
+
+ Examples shift values and their meaning:
+ 17 => 128 KB for each CPU
+ 16 => 64 KB for each CPU
+ 15 => 32 KB for each CPU
+ 14 => 16 KB for each CPU
+ 13 => 8 KB for each CPU
+ 12 => 4 KB for each CPU
+
#
# Architectures with an unreliable sched_clock() should select this:
#
@@@ -1264,77 -1302,6 +1302,77 @@@ config CC_OPTIMIZE_FOR_SIZ
If unsure, say N.
+config LTO_MENU
+ bool "Enable gcc link time optimization (LTO)"
+ # Only tested on X86 for now. For other architectures you likely
+ # have to fix some things first, like adding asmlinkages etc.
+ depends on X86
+ # lto does not support excluding flags for specific files
+ # right now. Can be removed if that is fixed.
+ depends on !FUNCTION_TRACER
+ help
+ With this option gcc will do whole program optimizations for
+ the whole kernel and module. This increases compile time, but can
+ lead to better code. It allows gcc to inline functions between
+ different files and do other optimization. It might also trigger
+ bugs due to more aggressive optimization. It allows gcc to drop unused
+ code. On smaller monolithic kernel configurations
+ it usually leads to smaller kernels, especially when modules
+ are disabled.
+
+ With this option gcc will also do some global checking over
+ different source files. It also disables a number of kernel
+ features.
+
+ This option is recommended for release builds. With LTO
+ the kernel always has to be re-optimized (but not re-parsed)
+ on each build.
+
+ This requires a gcc 4.8 or later compiler and
+ Linux binutils 2.21.51.0.3 or later. gcc 4.9 builds significantly
+ faster than 4.8. It does not currently work with a FSF release of
+ binutils or with the gold linker.
+
+ On larger configurations this may need more than 4GB of RAM.
+ It will likely not work on those with a 32bit compiler.
+
+ When the toolchain support is not available this will (hopefully)
+ be automatically disabled.
+
+ For more information see Documentation/lto-build
+
+config LTO_DISABLE
+ bool "Disable LTO again"
+ depends on LTO_MENU
+ default n
+ help
+ This option is merely here so that allyesconfig or allmodconfig do
+ not enable LTO. If you want to actually use LTO do not enable.
+
+config LTO
+ bool
+ default y
+ depends on LTO_MENU && !LTO_DISABLE
+
+config LTO_DEBUG
+ bool "Enable LTO compile time debugging"
+ depends on LTO
+ help
+ Enable LTO debugging in the compiler. The compiler dumps
+ some log files that make it easier to figure out LTO
+ behavior. The log files also allow to reconstruct
+ the global inlining and a global callgraph.
+ They however add some (single threaded) cost to the
+ compilation. When in doubt do not enable.
+
+config LTO_CP_CLONE
+ bool "Allow aggressive cloning for function specialization"
+ depends on LTO
+ help
+ Allow the compiler to clone and specialize functions for specific
+ arguments when it determines these arguments are very commonly
+ called. Experimental. Will increase text size.
+
config SYSCTL
bool
@@@ -1834,8 -1801,6 +1872,8 @@@ config MODULE_FORCE_UNLOA
config MODVERSIONS
bool "Module versioning support"
+ # LTO should work with gcc 4.9
+ depends on !LTO
help
Usually, you have to use modules compiled with your kernel.
Saying Y here makes it sometimes possible to use modules
diff --combined kernel/acct.c
index 3cec8c4,1bfdda0..98c4a20
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@@ -93,7 -93,6 +93,7 @@@ struct bsd_acct_struct
static DEFINE_SPINLOCK(acct_lock);
static LIST_HEAD(acct_list);
+static LIST_HEAD(acct_close_list);
/*
* Check the amount of free space and suspend/resume accordingly.
@@@ -142,12 -141,12 +142,12 @@@ static int check_free_space(struct bsd_
if (acct->active) {
if (act < 0) {
acct->active = 0;
- printk(KERN_INFO "Process accounting paused\n");
+ pr_info("Process accounting paused\n");
}
} else {
if (act > 0) {
acct->active = 1;
- printk(KERN_INFO "Process accounting resumed\n");
+ pr_info("Process accounting resumed\n");
}
}
@@@ -262,6 -261,7 +262,7 @@@ SYSCALL_DEFINE1(acct, const char __use
if (name) {
struct filename *tmp = getname(name);
+
if (IS_ERR(tmp))
return PTR_ERR(tmp);
error = acct_on(tmp);
@@@ -281,20 -281,6 +282,20 @@@
return error;
}
+static void acct_close_mnts(struct work_struct *unused)
+{
+ struct bsd_acct_struct *acct;
+
+ spin_lock(&acct_lock);
+restart:
+ list_for_each_entry(acct, &acct_close_list, list) {
+ acct_file_reopen(acct, NULL, NULL);
+ goto restart;
+ }
+ spin_unlock(&acct_lock);
+}
+static DECLARE_WORK(acct_close_work, acct_close_mnts);
+
/**
* acct_auto_close - turn off a filesystem's accounting if it is on
* @m: vfsmount being shut down
@@@ -304,15 -290,15 +305,15 @@@
*/
void acct_auto_close_mnt(struct vfsmount *m)
{
- struct bsd_acct_struct *acct;
+ struct bsd_acct_struct *acct, *tmp;
spin_lock(&acct_lock);
-restart:
- list_for_each_entry(acct, &acct_list, list)
+ list_for_each_entry_safe(acct, tmp, &acct_list, list) {
if (acct->file && acct->file->f_path.mnt == m) {
- acct_file_reopen(acct, NULL, NULL);
- goto restart;
+ list_move_tail(&acct->list, &acct_close_list);
+ schedule_work(&acct_close_work);
}
+ }
spin_unlock(&acct_lock);
}
@@@ -391,7 -377,7 +392,7 @@@ static comp_t encode_comp_t(unsigned lo
return exp;
}
- #if ACCT_VERSION==1 || ACCT_VERSION==2
+ #if ACCT_VERSION == 1 || ACCT_VERSION == 2
/*
* encode an u64 into a comp2_t (24 bits)
*
@@@ -404,7 -390,7 +405,7 @@@
#define MANTSIZE2 20 /* 20 bit mantissa. */
#define EXPSIZE2 5 /* 5 bit base 2 exponent. */
#define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */
- #define MAXEXP2 ((1 <<EXPSIZE2) - 1) /* Maximum exponent. */
+ #define MAXEXP2 ((1 << EXPSIZE2) - 1) /* Maximum exponent. */
static comp2_t encode_comp2_t(u64 value)
{
@@@ -435,7 -421,7 +436,7 @@@
}
#endif
- #if ACCT_VERSION==3
+ #if ACCT_VERSION == 3
/*
* encode an u64 into a 32 bit IEEE float
*/
@@@ -444,8 -430,9 +445,9 @@@ static u32 encode_float(u64 value
unsigned exp = 190;
unsigned u;
- if (value==0) return 0;
- while ((s64)value > 0){
+ if (value == 0)
+ return 0;
+ while ((s64)value > 0) {
value <<= 1;
exp--;
}
@@@ -499,22 -486,23 +501,23 @@@ static void do_acct_process(struct bsd_
strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
/* calculate run_time in nsec*/
- do_posix_clock_monotonic_gettime(&uptime);
+ ktime_get_ts(&uptime);
run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC
+ current->group_leader->start_time.tv_nsec;
/* convert nsec -> AHZ */
elapsed = nsec_to_AHZ(run_time);
- #if ACCT_VERSION==3
+ #if ACCT_VERSION == 3
ac.ac_etime = encode_float(elapsed);
#else
ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
- (unsigned long) elapsed : (unsigned long) -1l);
+ (unsigned long) elapsed : (unsigned long) -1l);
#endif
- #if ACCT_VERSION==1 || ACCT_VERSION==2
+ #if ACCT_VERSION == 1 || ACCT_VERSION == 2
{
/* new enlarged etime field */
comp2_t etime = encode_comp2_t(elapsed);
+
ac.ac_etime_hi = etime >> 16;
ac.ac_etime_lo = (u16) etime;
}
@@@ -524,15 -512,15 +527,15 @@@
/* we really need to bite the bullet and change layout */
ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
- #if ACCT_VERSION==2
+ #if ACCT_VERSION == 2
ac.ac_ahz = AHZ;
#endif
- #if ACCT_VERSION==1 || ACCT_VERSION==2
+ #if ACCT_VERSION == 1 || ACCT_VERSION == 2
/* backward-compatible 16 bit fields */
ac.ac_uid16 = ac.ac_uid;
ac.ac_gid16 = ac.ac_gid;
#endif
- #if ACCT_VERSION==3
+ #if ACCT_VERSION == 3
ac.ac_pid = task_tgid_nr_ns(current, ns);
rcu_read_lock();
ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
@@@ -593,6 -581,7 +596,7 @@@ void acct_collect(long exitcode, int gr
if (group_dead && current->mm) {
struct vm_area_struct *vma;
+
down_read(&current->mm->mmap_sem);
vma = current->mm->mmap;
while (vma) {
diff --combined kernel/fork.c
index 7657301,735ea98..38dcf83
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@@ -315,15 -315,6 +315,15 @@@ static struct task_struct *dup_task_str
goto free_ti;
tsk->stack = ti;
+#ifdef CONFIG_SECCOMP
+ /*
+ * We must handle setting up seccomp filters once we're under
+ * the sighand lock in case orig has changed between now and
+ * then. Until then, filter must be NULL to avoid messing up
+ * the usage counts on the error path calling free_task.
+ */
+ tsk->seccomp.filter = NULL;
+#endif
setup_thread_stack(tsk, orig);
clear_user_return_notifier(tsk);
@@@ -374,12 -365,11 +374,11 @@@ static int dup_mmap(struct mm_struct *m
*/
down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
- mm->locked_vm = 0;
- mm->mmap = NULL;
- mm->vmacache_seqnum = 0;
- mm->map_count = 0;
- cpumask_clear(mm_cpumask(mm));
- mm->mm_rb = RB_ROOT;
+ mm->total_vm = oldmm->total_vm;
+ mm->shared_vm = oldmm->shared_vm;
+ mm->exec_vm = oldmm->exec_vm;
+ mm->stack_vm = oldmm->stack_vm;
+
rb_link = &mm->mm_rb.rb_node;
rb_parent = NULL;
pprev = &mm->mmap;
@@@ -536,19 -526,37 +535,37 @@@ static void mm_init_aio(struct mm_struc
#endif
}
+ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
+ {
+ #ifdef CONFIG_MEMCG
+ mm->owner = p;
+ #endif
+ }
+
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
{
+ mm->mmap = NULL;
+ mm->mm_rb = RB_ROOT;
+ mm->vmacache_seqnum = 0;
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
init_rwsem(&mm->mmap_sem);
INIT_LIST_HEAD(&mm->mmlist);
mm->core_state = NULL;
atomic_long_set(&mm->nr_ptes, 0);
+ mm->map_count = 0;
+ mm->locked_vm = 0;
+ mm->pinned_vm = 0;
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
spin_lock_init(&mm->page_table_lock);
+ mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
+ mmu_notifier_mm_init(mm);
clear_tlb_flush_pending(mm);
+ #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+ mm->pmd_huge_pte = NULL;
+ #endif
if (current->mm) {
mm->flags = current->mm->flags & MMF_INIT_MASK;
@@@ -558,11 -566,17 +575,17 @@@
mm->def_flags = 0;
}
- if (likely(!mm_alloc_pgd(mm))) {
- mmu_notifier_mm_init(mm);
- return mm;
- }
+ if (mm_alloc_pgd(mm))
+ goto fail_nopgd;
+
+ if (init_new_context(p, mm))
+ goto fail_nocontext;
+
+ return mm;
+ fail_nocontext:
+ mm_free_pgd(mm);
+ fail_nopgd:
free_mm(mm);
return NULL;
}
@@@ -596,7 -610,6 +619,6 @@@ struct mm_struct *mm_alloc(void
return NULL;
memset(mm, 0, sizeof(*mm));
- mm_init_cpumask(mm);
return mm_init(mm, current);
}
@@@ -828,17 -841,10 +850,10 @@@ static struct mm_struct *dup_mm(struct
goto fail_nomem;
memcpy(mm, oldmm, sizeof(*mm));
- mm_init_cpumask(mm);
if (!mm_init(mm, tsk))
goto fail_nomem;
- if (init_new_context(tsk, mm))
- goto fail_nocontext;
-
dup_mm_exe_file(oldmm, mm);
err = dup_mmap(mm, oldmm);
@@@ -860,15 -866,6 +875,6 @@@ free_pt
fail_nomem:
return NULL;
-
- fail_nocontext:
- /*
- * If init_new_context() failed, we cannot use mmput() to free the mm
- * because it calls destroy_context()
- */
- mm_free_pgd(mm);
- free_mm(mm);
- return NULL;
}
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
@@@ -1090,39 -1087,6 +1096,39 @@@ static int copy_signal(unsigned long cl
return 0;
}
+static void copy_seccomp(struct task_struct *p)
+{
+#ifdef CONFIG_SECCOMP
+ /*
+ * Must be called with sighand->lock held, which is common to
+ * all threads in the group. Holding cred_guard_mutex is not
+ * needed because this new task is not yet running and cannot
+ * be racing exec.
+ */
+ BUG_ON(!spin_is_locked(&current->sighand->siglock));
+
+ /* Ref-count the new filter user, and assign it. */
+ get_seccomp_filter(current);
+ p->seccomp = current->seccomp;
+
+ /*
+ * Explicitly enable no_new_privs here in case it got set
+ * between the task_struct being duplicated and holding the
+ * sighand lock. The seccomp state and nnp must be in sync.
+ */
+ if (task_no_new_privs(current))
+ task_set_no_new_privs(p);
+
+ /*
+ * If the parent gained a seccomp mode after copying thread
+ * flags and before we held the sighand lock, we have
+ * to manually enable the seccomp thread flag here.
+ */
+ if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
+ set_tsk_thread_flag(p, TIF_SECCOMP);
+#endif
+}
+
SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
{
current->clear_child_tid = tidptr;
@@@ -1137,16 -1101,10 +1143,9 @@@ static void rt_mutex_init_task(struct t
p->pi_waiters = RB_ROOT;
p->pi_waiters_leftmost = NULL;
p->pi_blocked_on = NULL;
#endif
}
- #ifdef CONFIG_MEMCG
- void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
- {
- mm->owner = p;
- }
- #endif /* CONFIG_MEMCG */
-
/*
* Initialize POSIX timer handling for a single task.
*/
@@@ -1237,6 -1195,7 +1236,6 @@@ static struct task_struct *copy_process
goto fork_out;
ftrace_graph_init_task(p);
- get_seccomp_filter(p);
rt_mutex_init_task(p);
@@@ -1302,7 -1261,7 +1301,7 @@@
posix_cpu_timers_init(p);
- do_posix_clock_monotonic_gettime(&p->start_time);
+ ktime_get_ts(&p->start_time);
p->real_start_time = p->start_time;
monotonic_to_bootbased(&p->real_start_time);
p->io_context = NULL;
@@@ -1347,10 -1306,6 +1346,6 @@@
#ifdef CONFIG_DEBUG_MUTEXES
p->blocked_on = NULL; /* not blocked yet */
#endif
- #ifdef CONFIG_MEMCG
- p->memcg_batch.do_batch = 0;
- p->memcg_batch.memcg = NULL;
- #endif
#ifdef CONFIG_BCACHE
p->sequential_io = 0;
p->sequential_io_avg = 0;
@@@ -1368,6 -1323,7 +1363,7 @@@
if (retval)
goto bad_fork_cleanup_policy;
/* copy all the process information */
+ shm_init_task(p);
retval = copy_semundo(clone_flags, p);
if (retval)
goto bad_fork_cleanup_audit;
@@@ -1477,12 -1433,6 +1473,12 @@@
spin_lock(&current->sighand->siglock);
/*
+ * Copy seccomp details explicitly here, in case they were changed
+ * before holding sighand lock.
+ */
+ copy_seccomp(p);
+
+ /*
* Process group and session signals need to be delivered to just the
* parent before the fork or both the parent and the child after the
* fork. Restart if a signal comes in before we add the new process to
@@@ -1919,6 -1869,11 +1915,11 @@@ SYSCALL_DEFINE1(unshare, unsigned long
*/
exit_sem(current);
}
+ if (unshare_flags & CLONE_NEWIPC) {
+ /* Orphan segments in old ns (see sem above). */
+ exit_shm(current);
+ shm_init_task(current);
+ }
if (new_nsproxy)
switch_task_namespaces(current, new_nsproxy);
diff --combined lib/Kconfig
index a8a775730,fdf90f3..2accc79
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@@ -177,6 -177,13 +177,13 @@@ config CRC
when they need to do cyclic redundancy check according CRC8
algorithm. Module will be called crc8.
+ config CRC64_ECMA
+ tristate "CRC64 ECMA function"
+ help
+ This option provides CRC64 ECMA function. Drivers may select this
+ when they need to do cyclic redundancy check according to the CRC64
+ ECMA algorithm.
+
config AUDIT_GENERIC
bool
depends on AUDIT && !AUDIT_ARCH
@@@ -396,6 -403,39 +403,39 @@@ config CPU_RMA
config DQL
bool
+ config GLOB
+ bool
+ # This actually supports modular compilation, but the module overhead
+ # is ridiculous for the amount of code involved. Until an out-of-tree
+ # driver asks for it, we'll just link it directly it into the kernel
+ # when required. Since we're ignoring out-of-tree users, there's also
+ # no need bother prompting for a manual decision:
+ # prompt "glob_match() function"
+ help
+ This option provides a glob_match function for performing
+ simple text pattern matching. It originated in the ATA code
+ to blacklist particular drive models, but other device drivers
+ may need similar functionality.
+
+ All drivers in the Linux kernel tree that require this function
+ should automatically select this option. Say N unless you
+ are compiling an out-of tree driver which tells you that it
+ depends on this.
+
+ config GLOB_SELFTEST
+ bool "glob self-test on init"
+ default n
+ depends on GLOB
+ help
+ This option enables a simple self-test of the glob_match
+ function on startup. It is primarily useful for people
+ working on the code to ensure they haven't introduced any
+ regressions.
+
+ It only adds a little bit of code and slows kernel boot (or
+ module load) by a small amount, so you're welcome to play with
+ it, but you probably don't need it.
+
#
# Netlink attribute parsing support is select'ed if needed
#
@@@ -451,8 -491,7 +491,8 @@@ config MPILI
config SIGNATURE
tristate
- depends on KEYS && CRYPTO
+ depends on KEYS
+ select CRYPTO
select CRYPTO_SHA1
select MPILIB
help
@@@ -475,4 -514,11 +515,11 @@@ config UCS2_STRIN
source "lib/fonts/Kconfig"
+ #
+ # sg chaining option
+ #
+
+ config ARCH_HAS_SG_CHAIN
+ def_bool n
+
endmenu
diff --combined lib/Kconfig.debug
index 066936a,fd939e1..ff15fb6
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@@ -15,7 -15,7 +15,7 @@@ config PRINTK_TIM
The behavior is also controlled by the kernel command line
parameter printk.time=1. See Documentation/kernel-parameters.txt
- config DEFAULT_MESSAGE_LOGLEVEL
+ config MESSAGE_LOGLEVEL_DEFAULT
int "Default message log level (1-7)"
range 1 7
default "4"
@@@ -180,7 -180,7 +180,7 @@@ config STRIP_ASM_SYM
config READABLE_ASM
bool "Generate readable assembler code"
- depends on DEBUG_KERNEL
+ depends on DEBUG_KERNEL && !LTO
help
Disable some compiler optimizations that tend to generate human unreadable
assembler output. This may make the kernel slightly slower, but it helps
@@@ -835,7 -835,7 +835,7 @@@ config DEBUG_RT_MUTEXE
config RT_MUTEX_TESTER
bool "Built-in scriptable tester for rt-mutexes"
- depends on DEBUG_KERNEL && RT_MUTEXES
+ depends on DEBUG_KERNEL && RT_MUTEXES && BROKEN
help
This option enables a rt-mutex tester.
@@@ -1131,6 -1131,20 +1131,6 @@@ config PROVE_RCU_REPEATEDL
Say N if you are unsure.
-config PROVE_RCU_DELAY
- bool "RCU debugging: preemptible RCU race provocation"
- depends on DEBUG_KERNEL && PREEMPT_RCU
- default n
- help
- There is a class of races that involve an unlikely preemption
- of __rcu_read_unlock() just after ->rcu_read_lock_nesting has
- been set to INT_MIN. This feature inserts a delay at that
- point to increase the probability of these races.
-
- Say Y to increase probability of preemption of __rcu_read_unlock().
-
- Say N if you are unsure.
-
config SPARSE_RCU_POINTER
bool "RCU debugging: sparse-based checks for pointer usage"
default n
@@@ -1635,19 -1649,6 +1635,19 @@@ config TEST_BP
If unsure, say N.
+config TEST_FIRMWARE
+ tristate "Test firmware loading via userspace interface"
+ default n
+ depends on FW_LOADER
+ help
+ This builds the "test_firmware" module that creates a userspace
+ interface for testing firmware loading. This can be used to
+ control the triggering of firmware loading without needing an
+ actual firmware-using device. The contents can be rechecked by
+ userspace.
+
+ If unsure, say N.
+
source "samples/Kconfig"
source "lib/Kconfig.kgdb"
diff --combined lib/Makefile
index 230b4b1,e48067c..44dbcee
--- a/lib/Makefile
+++ b/lib/Makefile
@@@ -34,7 -34,6 +34,7 @@@ obj-$(CONFIG_TEST_KSTRTOX) += test-kstr
obj-$(CONFIG_TEST_MODULE) += test_module.o
obj-$(CONFIG_TEST_USER_COPY) += test_user_copy.o
obj-$(CONFIG_TEST_BPF) += test_bpf.o
+obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o
ifeq ($(CONFIG_DEBUG_KOBJECT),y)
CFLAGS_kobject.o += -DDEBUG
@@@ -72,6 -71,7 +72,7 @@@ obj-$(CONFIG_CRC32) += crc32.
obj-$(CONFIG_CRC7) += crc7.o
obj-$(CONFIG_LIBCRC32C) += libcrc32c.o
obj-$(CONFIG_CRC8) += crc8.o
+ obj-$(CONFIG_CRC64_ECMA) += crc64_ecma.o
obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/
@@@ -137,6 -137,8 +138,8 @@@ obj-$(CONFIG_CORDIC) += cordic.
obj-$(CONFIG_DQL) += dynamic_queue_limits.o
+ obj-$(CONFIG_GLOB) += glob.o
+
obj-$(CONFIG_MPILIB) += mpi/
obj-$(CONFIG_SIGNATURE) += digsig.o
diff --combined lib/scatterlist.c
index b4415fc,4251cbd..9cdf62f
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@@ -73,7 -73,7 +73,7 @@@ EXPORT_SYMBOL(sg_nents)
**/
struct scatterlist *sg_last(struct scatterlist *sgl, unsigned int nents)
{
- #ifndef ARCH_HAS_SG_CHAIN
+ #ifndef CONFIG_ARCH_HAS_SG_CHAIN
struct scatterlist *ret = &sgl[nents - 1];
#else
struct scatterlist *sg, *ret = NULL;
@@@ -165,7 -165,6 +165,7 @@@ static void sg_kfree(struct scatterlis
* __sg_free_table - Free a previously mapped sg table
* @table: The sg table header to use
* @max_ents: The maximum number of entries per single scatterlist
+ * @skip_first_chunk: don't free the (preallocated) first scatterlist chunk
* @free_fn: Free function
*
* Description:
@@@ -175,7 -174,7 +175,7 @@@
*
**/
void __sg_free_table(struct sg_table *table, unsigned int max_ents,
- sg_free_fn *free_fn)
+ bool skip_first_chunk, sg_free_fn *free_fn)
{
struct scatterlist *sgl, *next;
@@@ -203,10 -202,7 +203,10 @@@
}
table->orig_nents -= sg_size;
- free_fn(sgl, alloc_size);
+ if (!skip_first_chunk) {
+ free_fn(sgl, alloc_size);
+ skip_first_chunk = false;
+ }
sgl = next;
}
@@@ -221,7 -217,7 +221,7 @@@ EXPORT_SYMBOL(__sg_free_table)
**/
void sg_free_table(struct sg_table *table)
{
- __sg_free_table(table, SG_MAX_SINGLE_ALLOC, sg_kfree);
+ __sg_free_table(table, SG_MAX_SINGLE_ALLOC, false, sg_kfree);
}
EXPORT_SYMBOL(sg_free_table);
@@@ -245,8 -241,8 +245,8 @@@
*
**/
int __sg_alloc_table(struct sg_table *table, unsigned int nents,
- unsigned int max_ents, gfp_t gfp_mask,
- sg_alloc_fn *alloc_fn)
+ unsigned int max_ents, struct scatterlist *first_chunk,
+ gfp_t gfp_mask, sg_alloc_fn *alloc_fn)
{
struct scatterlist *sg, *prv;
unsigned int left;
@@@ -255,7 -251,7 +255,7 @@@
if (nents == 0)
return -EINVAL;
- #ifndef ARCH_HAS_SG_CHAIN
+ #ifndef CONFIG_ARCH_HAS_SG_CHAIN
if (WARN_ON_ONCE(nents > max_ents))
return -EINVAL;
#endif
@@@ -273,12 -269,7 +273,12 @@@
left -= sg_size;
- sg = alloc_fn(alloc_size, gfp_mask);
+ if (first_chunk) {
+ sg = first_chunk;
+ first_chunk = NULL;
+ } else {
+ sg = alloc_fn(alloc_size, gfp_mask);
+ }
if (unlikely(!sg)) {
/*
* Adjust entry count to reflect that the last
@@@ -333,9 -324,9 +333,9 @@@ int sg_alloc_table(struct sg_table *tab
int ret;
ret = __sg_alloc_table(table, nents, SG_MAX_SINGLE_ALLOC,
- gfp_mask, sg_kmalloc);
+ NULL, gfp_mask, sg_kmalloc);
if (unlikely(ret))
- __sg_free_table(table, SG_MAX_SINGLE_ALLOC, sg_kfree);
+ __sg_free_table(table, SG_MAX_SINGLE_ALLOC, false, sg_kfree);
return ret;
}
diff --combined mm/filemap.c
index d175917,fb74fb8..367ea2c
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@@ -31,6 -31,7 +31,7 @@@
#include <linux/security.h>
#include <linux/cpuset.h>
#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
+ #include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/cleancache.h>
#include <linux/rmap.h>
@@@ -233,7 -234,6 +234,6 @@@ void delete_from_page_cache(struct pag
spin_lock_irq(&mapping->tree_lock);
__delete_from_page_cache(page, NULL);
spin_unlock_irq(&mapping->tree_lock);
- mem_cgroup_uncharge_cache_page(page);
if (freepage)
freepage(page);
@@@ -241,6 -241,18 +241,6 @@@
}
EXPORT_SYMBOL(delete_from_page_cache);
-static int sleep_on_page(void *word)
-{
- io_schedule();
- return 0;
-}
-
-static int sleep_on_page_killable(void *word)
-{
- sleep_on_page(word);
- return fatal_signal_pending(current) ? -EINTR : 0;
-}
-
static int filemap_check_errors(struct address_space *mapping)
{
int ret = 0;
@@@ -489,8 -501,7 +489,7 @@@ int replace_page_cache_page(struct pag
if (PageSwapBacked(new))
__inc_zone_page_state(new, NR_SHMEM);
spin_unlock_irq(&mapping->tree_lock);
- /* mem_cgroup codes must not be called under tree_lock */
- mem_cgroup_replace_page_cache(old, new);
+ mem_cgroup_migrate(old, new, true);
radix_tree_preload_end();
if (freepage)
freepage(old);
@@@ -548,19 -559,24 +547,24 @@@ static int __add_to_page_cache_locked(s
pgoff_t offset, gfp_t gfp_mask,
void **shadowp)
{
+ int huge = PageHuge(page);
+ struct mem_cgroup *memcg;
int error;
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapBacked(page), page);
- error = mem_cgroup_charge_file(page, current->mm,
- gfp_mask & GFP_RECLAIM_MASK);
- if (error)
- return error;
+ if (!huge) {
+ error = mem_cgroup_try_charge(page, current->mm,
+ gfp_mask, &memcg);
+ if (error)
+ return error;
+ }
error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
if (error) {
- mem_cgroup_uncharge_cache_page(page);
+ if (!huge)
+ mem_cgroup_cancel_charge(page, memcg);
return error;
}
@@@ -575,13 -591,16 +579,16 @@@
goto err_insert;
__inc_zone_page_state(page, NR_FILE_PAGES);
spin_unlock_irq(&mapping->tree_lock);
+ if (!huge)
+ mem_cgroup_commit_charge(page, memcg, false);
trace_mm_filemap_add_to_page_cache(page);
return 0;
err_insert:
page->mapping = NULL;
/* Leave page->index set: truncation relies upon it */
spin_unlock_irq(&mapping->tree_lock);
- mem_cgroup_uncharge_cache_page(page);
+ if (!huge)
+ mem_cgroup_cancel_charge(page, memcg);
page_cache_release(page);
return error;
}
@@@ -680,7 -699,7 +687,7 @@@ void wait_on_page_bit(struct page *page
DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
if (test_bit(bit_nr, &page->flags))
- __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page,
+ __wait_on_bit(page_waitqueue(page), &wait, bit_wait_io,
TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_on_page_bit);
@@@ -693,7 -712,7 +700,7 @@@ int wait_on_page_bit_killable(struct pa
return 0;
return __wait_on_bit(page_waitqueue(page), &wait,
- sleep_on_page_killable, TASK_KILLABLE);
+ bit_wait_io, TASK_KILLABLE);
}
/**
@@@ -794,7 -813,7 +801,7 @@@ void __lock_page(struct page *page
{
DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
- __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page,
+ __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io,
TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_page);
@@@ -804,10 -823,21 +811,21 @@@ int __lock_page_killable(struct page *p
DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
return __wait_on_bit_lock(page_waitqueue(page), &wait,
- sleep_on_page_killable, TASK_KILLABLE);
+ bit_wait_io, TASK_KILLABLE);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);
+ /*
+ * Return values:
+ * 1 - page is locked; mmap_sem is still held.
+ * 0 - page is not locked.
+ * mmap_sem has been released (up_read()), unless flags had both
+ * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
+ * which case mmap_sem is still held.
+ *
+ * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1
+ * with the page locked and the mmap_sem unperturbed.
+ */
int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
unsigned int flags)
{
@@@ -1088,9 -1118,9 +1106,9 @@@ no_page
if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
fgp_flags |= FGP_LOCK;
- /* Init accessed so avoit atomic mark_page_accessed later */
+ /* Init accessed so avoid atomic mark_page_accessed later */
if (fgp_flags & FGP_ACCESSED)
- init_page_accessed(page);
+ __SetPageReferenced(page);
err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask);
if (unlikely(err)) {
@@@ -1824,6 -1854,18 +1842,18 @@@ static void do_async_mmap_readahead(str
* The goto's are kind of ugly, but this streamlines the normal case of having
* it in the page cache, and handles the special cases reasonably without
* having a lot of duplicated code.
+ *
+ * vma->vm_mm->mmap_sem must be held on entry.
+ *
+ * If our return value has VM_FAULT_RETRY set, it's because
+ * lock_page_or_retry() returned 0.
+ * The mmap_sem has usually been released in this case.
+ * See __lock_page_or_retry() for the exception.
+ *
+ * If our return value does not have VM_FAULT_RETRY set, the mmap_sem
+ * has not been released.
+ *
+ * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
*/
int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
diff --combined mm/memcontrol.c
index 45c10c6,d44bf3e..6f81411
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@@ -648,10 -648,8 +648,8 @@@ EXPORT_SYMBOL(memcg_kmem_enabled_key)
static void disarm_kmem_keys(struct mem_cgroup *memcg)
{
- if (memcg_kmem_is_active(memcg)) {
+ if (memcg_kmem_is_active(memcg))
static_key_slow_dec(&memcg_kmem_enabled_key);
- ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
- }
/*
* This check can't live in kmem destruction function,
* since the charges will outlive the cgroup
@@@ -754,9 -752,11 +752,11 @@@ static void __mem_cgroup_remove_exceede
static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
struct mem_cgroup_tree_per_zone *mctz)
{
- spin_lock(&mctz->lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&mctz->lock, flags);
__mem_cgroup_remove_exceeded(mz, mctz);
- spin_unlock(&mctz->lock);
+ spin_unlock_irqrestore(&mctz->lock, flags);
}
@@@ -779,7 -779,9 +779,9 @@@ static void mem_cgroup_update_tree(stru
* mem is over its softlimit.
*/
if (excess || mz->on_tree) {
- spin_lock(&mctz->lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&mctz->lock, flags);
/* if on-tree, remove it */
if (mz->on_tree)
__mem_cgroup_remove_exceeded(mz, mctz);
@@@ -788,7 -790,7 +790,7 @@@
* If excess is 0, no tree ops.
*/
__mem_cgroup_insert_exceeded(mz, mctz, excess);
- spin_unlock(&mctz->lock);
+ spin_unlock_irqrestore(&mctz->lock, flags);
}
}
}
@@@ -839,9 -841,9 +841,9 @@@ mem_cgroup_largest_soft_limit_node(stru
{
struct mem_cgroup_per_zone *mz;
- spin_lock(&mctz->lock);
+ spin_lock_irq(&mctz->lock);
mz = __mem_cgroup_largest_soft_limit_node(mctz);
- spin_unlock(&mctz->lock);
+ spin_unlock_irq(&mctz->lock);
return mz;
}
@@@ -882,13 -884,6 +884,6 @@@ static long mem_cgroup_read_stat(struc
return val;
}
- static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
- bool charge)
- {
- int val = (charge) ? 1 : -1;
- this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
- }
-
static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
enum mem_cgroup_events_index idx)
{
@@@ -909,13 -904,13 +904,13 @@@
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
struct page *page,
- bool anon, int nr_pages)
+ int nr_pages)
{
/*
* Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
* counted as CACHE even if it's on ANON LRU.
*/
- if (anon)
+ if (PageAnon(page))
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
nr_pages);
else
@@@ -1013,7 -1008,6 +1008,6 @@@ static bool mem_cgroup_event_ratelimit(
*/
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
- preempt_disable();
/* threshold event is triggered in finer grain than soft limit */
if (unlikely(mem_cgroup_event_ratelimit(memcg,
MEM_CGROUP_TARGET_THRESH))) {
@@@ -1026,8 -1020,6 +1020,6 @@@
do_numainfo = mem_cgroup_event_ratelimit(memcg,
MEM_CGROUP_TARGET_NUMAINFO);
#endif
- preempt_enable();
-
mem_cgroup_threshold(memcg);
if (unlikely(do_softlimit))
mem_cgroup_update_tree(memcg, page);
@@@ -1035,8 -1027,7 +1027,7 @@@
if (unlikely(do_numainfo))
atomic_inc(&memcg->numainfo_events);
#endif
- } else
- preempt_enable();
+ }
}
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
@@@ -1347,20 -1338,6 +1338,6 @@@ out
return lruvec;
}
- /*
- * Following LRU functions are allowed to be used without PCG_LOCK.
- * Operations are called by routine of global LRU independently from memcg.
- * What we have to take care of here is validness of pc->mem_cgroup.
- *
- * Changes to pc->mem_cgroup happens when
- * 1. charge
- * 2. moving account
- * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
- * It is added to LRU before charge.
- * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
- * When moving account, the page is not on LRU. It's isolated.
- */
-
/**
* mem_cgroup_page_lruvec - return lruvec for adding an lru page
* @page: the page
@@@ -2261,22 -2238,14 +2238,14 @@@ cleanup
*
* Notes: Race condition
*
- * We usually use lock_page_cgroup() for accessing page_cgroup member but
- * it tends to be costly. But considering some conditions, we doesn't need
- * to do so _always_.
+ * Charging occurs during page instantiation, while the page is
+ * unmapped and locked in page migration, or while the page table is
+ * locked in THP migration. No race is possible.
*
- * Considering "charge", lock_page_cgroup() is not required because all
- * file-stat operations happen after a page is attached to radix-tree. There
- * are no race with "charge".
+ * Uncharge happens to pages with zero references, no race possible.
*
- * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
- * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
- * if there are race with "uncharge". Statistics itself is properly handled
- * by flags.
- *
- * Considering "move", this is an only case we see a race. To make the race
- * small, we check memcg->moving_account and detect there are possibility
- * of race or not. If there is, we take a lock.
+ * Charge moving between groups is protected by checking mm->moving
+ * account and taking the move_lock in the slowpath.
*/
void __mem_cgroup_begin_update_page_stat(struct page *page,
@@@ -2551,55 -2520,63 +2520,63 @@@ static int memcg_cpu_hotplug_callback(s
return NOTIFY_OK;
}
-
- /* See mem_cgroup_try_charge() for details */
- enum {
- CHARGE_OK, /* success */
- CHARGE_RETRY, /* need to retry but retry is not bad */
- CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
- CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */
- };
-
- static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
- unsigned int nr_pages, unsigned int min_pages,
- bool invoke_oom)
+ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
+ unsigned int nr_pages)
{
- unsigned long csize = nr_pages * PAGE_SIZE;
+ unsigned int batch = max(CHARGE_BATCH, nr_pages);
+ int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct mem_cgroup *mem_over_limit;
struct res_counter *fail_res;
+ unsigned long nr_reclaimed;
unsigned long flags = 0;
- int ret;
+ unsigned long long size;
+ int ret = 0;
- ret = res_counter_charge(&memcg->res, csize, &fail_res);
+ retry:
+ if (consume_stock(memcg, nr_pages))
+ goto done;
- if (likely(!ret)) {
+ size = batch * PAGE_SIZE;
+ if (!res_counter_charge(&memcg->res, size, &fail_res)) {
if (!do_swap_account)
- return CHARGE_OK;
- ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
- if (likely(!ret))
- return CHARGE_OK;
-
- res_counter_uncharge(&memcg->res, csize);
+ goto done_restock;
+ if (!res_counter_charge(&memcg->memsw, size, &fail_res))
+ goto done_restock;
+ res_counter_uncharge(&memcg->res, size);
mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
flags |= MEM_CGROUP_RECLAIM_NOSWAP;
} else
mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+
+ if (batch > nr_pages) {
+ batch = nr_pages;
+ goto retry;
+ }
+
/*
- * Never reclaim on behalf of optional batching, retry with a
- * single page instead.
+ * Unlike in global OOM situations, memcg is not in a physical
+ * memory shortage. Allow dying and OOM-killed tasks to
+ * bypass the last charges so that they can exit quickly and
+ * free their memory.
*/
- if (nr_pages > min_pages)
- return CHARGE_RETRY;
+ if (unlikely(test_thread_flag(TIF_MEMDIE) ||
+ fatal_signal_pending(current) ||
+ current->flags & PF_EXITING))
+ goto bypass;
+
+ if (unlikely(task_in_memcg_oom(current)))
+ goto nomem;
if (!(gfp_mask & __GFP_WAIT))
- return CHARGE_WOULDBLOCK;
+ goto nomem;
- if (gfp_mask & __GFP_NORETRY)
- return CHARGE_NOMEM;
+ nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
+
+ if (mem_cgroup_margin(mem_over_limit) >= batch)
+ goto retry;
- ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
- if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
- return CHARGE_RETRY;
+ if (gfp_mask & __GFP_NORETRY)
+ goto nomem;
/*
* Even though the limit is exceeded at this point, reclaim
* may have been able to free some pages. Retry the charge
@@@ -2609,142 -2586,47 +2586,47 @@@
* unlikely to succeed so close to the limit, and we fall back
* to regular pages anyway in case of failure.
*/
- if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
- return CHARGE_RETRY;
-
+ if (nr_reclaimed && batch <= (1 << PAGE_ALLOC_COSTLY_ORDER))
+ goto retry;
/*
* At task move, charge accounts can be doubly counted. So, it's
* better to wait until the end of task_move if something is going on.
*/
if (mem_cgroup_wait_acct_move(mem_over_limit))
- return CHARGE_RETRY;
-
- if (invoke_oom)
- mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
-
- return CHARGE_NOMEM;
- }
-
- /**
- * mem_cgroup_try_charge - try charging a memcg
- * @memcg: memcg to charge
- * @nr_pages: number of pages to charge
- * @oom: trigger OOM if reclaim fails
- *
- * Returns 0 if @memcg was charged successfully, -EINTR if the charge
- * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed.
- */
- static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
- gfp_t gfp_mask,
- unsigned int nr_pages,
- bool oom)
- {
- unsigned int batch = max(CHARGE_BATCH, nr_pages);
- int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
- int ret;
-
- if (mem_cgroup_is_root(memcg))
- goto done;
- /*
- * Unlike in global OOM situations, memcg is not in a physical
- * memory shortage. Allow dying and OOM-killed tasks to
- * bypass the last charges so that they can exit quickly and
- * free their memory.
- */
- if (unlikely(test_thread_flag(TIF_MEMDIE) ||
- fatal_signal_pending(current) ||
- current->flags & PF_EXITING))
- goto bypass;
+ goto retry;
- if (unlikely(task_in_memcg_oom(current)))
- goto nomem;
+ if (nr_retries--)
+ goto retry;
if (gfp_mask & __GFP_NOFAIL)
- oom = false;
- again:
- if (consume_stock(memcg, nr_pages))
- goto done;
-
- do {
- bool invoke_oom = oom && !nr_oom_retries;
-
- /* If killed, bypass charge */
- if (fatal_signal_pending(current))
- goto bypass;
+ goto bypass;
- ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
- nr_pages, invoke_oom);
- switch (ret) {
- case CHARGE_OK:
- break;
- case CHARGE_RETRY: /* not in OOM situation but retry */
- batch = nr_pages;
- goto again;
- case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
- goto nomem;
- case CHARGE_NOMEM: /* OOM routine works */
- if (!oom || invoke_oom)
- goto nomem;
- nr_oom_retries--;
- break;
- }
- } while (ret != CHARGE_OK);
+ if (fatal_signal_pending(current))
+ goto bypass;
- if (batch > nr_pages)
- refill_stock(memcg, batch - nr_pages);
- done:
- return 0;
+ mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(batch));
nomem:
if (!(gfp_mask & __GFP_NOFAIL))
return -ENOMEM;
bypass:
- return -EINTR;
- }
-
- /**
- * mem_cgroup_try_charge_mm - try charging a mm
- * @mm: mm_struct to charge
- * @nr_pages: number of pages to charge
- * @oom: trigger OOM if reclaim fails
- *
- * Returns the charged mem_cgroup associated with the given mm_struct or
- * NULL the charge failed.
- */
- static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm,
- gfp_t gfp_mask,
- unsigned int nr_pages,
- bool oom)
-
- {
- struct mem_cgroup *memcg;
- int ret;
-
- memcg = get_mem_cgroup_from_mm(mm);
- ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom);
- css_put(&memcg->css);
- if (ret == -EINTR)
- memcg = root_mem_cgroup;
- else if (ret)
- memcg = NULL;
+ memcg = root_mem_cgroup;
+ ret = -EINTR;
+ goto retry;
- return memcg;
+ done_restock:
+ if (batch > nr_pages)
+ refill_stock(memcg, batch - nr_pages);
+ done:
+ return ret;
}
- /*
- * Somemtimes we have to undo a charge we got by try_charge().
- * This function is for that and do uncharge, put css's refcnt.
- * gotten by try_charge().
- */
- static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
- unsigned int nr_pages)
+ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
{
- if (!mem_cgroup_is_root(memcg)) {
- unsigned long bytes = nr_pages * PAGE_SIZE;
+ unsigned long bytes = nr_pages * PAGE_SIZE;
- res_counter_uncharge(&memcg->res, bytes);
- if (do_swap_account)
- res_counter_uncharge(&memcg->memsw, bytes);
- }
+ res_counter_uncharge(&memcg->res, bytes);
+ if (do_swap_account)
+ res_counter_uncharge(&memcg->memsw, bytes);
}
/*
@@@ -2756,9 -2638,6 +2638,6 @@@ static void __mem_cgroup_cancel_local_c
{
unsigned long bytes = nr_pages * PAGE_SIZE;
- if (mem_cgroup_is_root(memcg))
- return;
-
res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
if (do_swap_account)
res_counter_uncharge_until(&memcg->memsw,
@@@ -2779,6 -2658,16 +2658,16 @@@ static struct mem_cgroup *mem_cgroup_lo
return mem_cgroup_from_id(id);
}
+ /*
+ * try_get_mem_cgroup_from_page - look up page's memcg association
+ * @page: the page
+ *
+ * Look up, get a css reference, and return the memcg that owns @page.
+ *
+ * The page must be locked to prevent racing with swap-in and page
+ * cache charges. If coming from an unlocked page table, the caller
+ * must ensure the page is on the LRU or this can race with charging.
+ */
struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
{
struct mem_cgroup *memcg = NULL;
@@@ -2789,7 -2678,6 +2678,6 @@@
VM_BUG_ON_PAGE(!PageLocked(page), page);
pc = lookup_page_cgroup(page);
- lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
memcg = pc->mem_cgroup;
if (memcg && !css_tryget_online(&memcg->css))
@@@ -2803,23 -2691,46 +2691,46 @@@
memcg = NULL;
rcu_read_unlock();
}
return memcg;
}
- static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
- struct page *page,
- unsigned int nr_pages,
- enum charge_type ctype,
- bool lrucare)
+ static void lock_page_lru(struct page *page, int *isolated)
+ {
+ struct zone *zone = page_zone(page);
+
+ spin_lock_irq(&zone->lru_lock);
+ if (PageLRU(page)) {
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_page_lruvec(page, zone);
+ ClearPageLRU(page);
+ del_page_from_lru_list(page, lruvec, page_lru(page));
+ *isolated = 1;
+ } else
+ *isolated = 0;
+ }
+
+ static void unlock_page_lru(struct page *page, int isolated)
+ {
+ struct zone *zone = page_zone(page);
+
+ if (isolated) {
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_page_lruvec(page, zone);
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ SetPageLRU(page);
+ add_page_to_lru_list(page, lruvec, page_lru(page));
+ }
+ spin_unlock_irq(&zone->lru_lock);
+ }
+
+ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
+ unsigned int nr_pages, bool lrucare)
{
struct page_cgroup *pc = lookup_page_cgroup(page);
- struct zone *uninitialized_var(zone);
- struct lruvec *lruvec;
- bool was_on_lru = false;
- bool anon;
+ int isolated;
- lock_page_cgroup(pc);
VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
/*
* we don't need page_cgroup_lock about tail pages, becase they are not
@@@ -2830,52 -2741,38 +2741,38 @@@
* In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
* may already be on some other mem_cgroup's LRU. Take care of it.
*/
- if (lrucare) {
- zone = page_zone(page);
- spin_lock_irq(&zone->lru_lock);
- if (PageLRU(page)) {
- lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
- ClearPageLRU(page);
- del_page_from_lru_list(page, lruvec, page_lru(page));
- was_on_lru = true;
- }
- }
+ if (lrucare)
+ lock_page_lru(page, &isolated);
- pc->mem_cgroup = memcg;
/*
- * We access a page_cgroup asynchronously without lock_page_cgroup().
- * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
- * is accessed after testing USED bit. To make pc->mem_cgroup visible
- * before USED bit, we need memory barrier here.
- * See mem_cgroup_add_lru_list(), etc.
+ * Nobody should be changing or seriously looking at
+ * pc->mem_cgroup and pc->flags at this point:
+ *
+ * - the page is uncharged
+ *
+ * - the page is off-LRU
+ *
+ * - an anonymous fault has exclusive page access, except for
+ * a locked page table
+ *
+ * - a page cache insertion, a swapin fault, or a migration
+ * have the page locked
*/
- smp_wmb();
- SetPageCgroupUsed(pc);
-
- if (lrucare) {
- if (was_on_lru) {
- lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
- VM_BUG_ON_PAGE(PageLRU(page), page);
- SetPageLRU(page);
- add_page_to_lru_list(page, lruvec, page_lru(page));
- }
- spin_unlock_irq(&zone->lru_lock);
- }
-
- if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
- anon = true;
- else
- anon = false;
+ pc->mem_cgroup = memcg;
+ pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0);
- mem_cgroup_charge_statistics(memcg, page, anon, nr_pages);
- unlock_page_cgroup(pc);
+ if (lrucare)
+ unlock_page_lru(page, isolated);
+ local_irq_disable();
+ mem_cgroup_charge_statistics(memcg, page, nr_pages);
/*
* "charge_statistics" updated event counter. Then, check it.
* Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
* if they exceeds softlimit.
*/
memcg_check_events(memcg, page);
+ local_irq_enable();
}
static DEFINE_MUTEX(set_limit_mutex);
@@@ -2896,16 -2793,13 +2793,13 @@@ static inline bool memcg_can_account_km
}
/*
- * This is a bit cumbersome, but it is rarely used and avoids a backpointer
- * in the memcg_cache_params struct.
+ * helper for acessing a memcg's index. It will be used as an index in the
+ * child cache array in kmem_cache, and also to derive its name. This function
+ * will return -1 when this is not a kmem-limited memcg.
*/
- static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
+ static inline int memcg_cache_id(struct mem_cgroup *memcg)
{
- struct kmem_cache *cachep;
-
- VM_BUG_ON(p->is_root_cache);
- cachep = p->root_cache;
- return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
+ return memcg ? memcg->kmemcg_id : -1;
}
#ifdef CONFIG_SLABINFO
@@@ -2921,7 -2815,7 +2815,7 @@@ static int mem_cgroup_slabinfo_read(str
mutex_lock(&memcg_slab_mutex);
list_for_each_entry(params, &memcg->memcg_slab_caches, list)
- cache_show(memcg_params_to_cache(params), m);
+ cache_show(params->cachep, m);
mutex_unlock(&memcg_slab_mutex);
return 0;
@@@ -2937,22 -2831,21 +2831,21 @@@ static int memcg_charge_kmem(struct mem
if (ret)
return ret;
- ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT,
- oom_gfp_allowed(gfp));
+ ret = try_charge(memcg, gfp, size >> PAGE_SHIFT);
if (ret == -EINTR) {
/*
- * mem_cgroup_try_charge() chosed to bypass to root due to
- * OOM kill or fatal signal. Since our only options are to
- * either fail the allocation or charge it to this cgroup, do
- * it as a temporary condition. But we can't fail. From a
- * kmem/slab perspective, the cache has already been selected,
- * by mem_cgroup_kmem_get_cache(), so it is too late to change
+ * try_charge() chose to bypass to root due to OOM kill or
+ * fatal signal. Since our only options are to either fail
+ * the allocation or charge it to this cgroup, do it as a
+ * temporary condition. But we can't fail. From a kmem/slab
+ * perspective, the cache has already been selected, by
+ * mem_cgroup_kmem_get_cache(), so it is too late to change
* our minds.
*
* This condition will only trigger if the task entered
- * memcg_charge_kmem in a sane state, but was OOM-killed during
- * mem_cgroup_try_charge() above. Tasks that were already
- * dying when the allocation triggers should have been already
+ * memcg_charge_kmem in a sane state, but was OOM-killed
+ * during try_charge() above. Tasks that were already dying
+ * when the allocation triggers should have been already
* directed to the root cgroup in memcontrol.h
*/
res_counter_charge_nofail(&memcg->res, size, &fail_res);
@@@ -2988,16 -2881,6 +2881,6 @@@ static void memcg_uncharge_kmem(struct
css_put(&memcg->css);
}
- /*
- * helper for acessing a memcg's index. It will be used as an index in the
- * child cache array in kmem_cache, and also to derive its name. This function
- * will return -1 when this is not a kmem-limited memcg.
- */
- int memcg_cache_id(struct mem_cgroup *memcg)
- {
- return memcg ? memcg->kmemcg_id : -1;
- }
-
static size_t memcg_caches_array_size(int num_groups)
{
ssize_t size;
@@@ -3043,6 -2926,10 +2926,10 @@@ int memcg_update_cache_size(struct kmem
return -ENOMEM;
new_params->is_root_cache = true;
+ INIT_LIST_HEAD(&new_params->children);
+ if (cur_params)
+ list_replace(&cur_params->children,
+ &new_params->children);
/*
* There is the chance it will be bigger than
@@@ -3095,11 -2982,14 +2982,14 @@@ int memcg_alloc_cache_params(struct mem
return -ENOMEM;
if (memcg) {
+ s->memcg_params->cachep = s;
s->memcg_params->memcg = memcg;
s->memcg_params->root_cache = root_cache;
css_get(&memcg->css);
- } else
+ } else {
s->memcg_params->is_root_cache = true;
+ INIT_LIST_HEAD(&s->memcg_params->children);
+ }
return 0;
}
@@@ -3119,11 -3009,18 +3009,18 @@@ static void memcg_register_cache(struc
static char memcg_name_buf[NAME_MAX + 1]; /* protected by
memcg_slab_mutex */
struct kmem_cache *cachep;
+ char *cache_name;
int id;
lockdep_assert_held(&memcg_slab_mutex);
id = memcg_cache_id(memcg);
+ /*
+ * The cgroup was taken offline while the create work was pending,
+ * nothing to do then.
+ */
+ if (id < 0)
+ return;
/*
* Since per-memcg caches are created asynchronously on first
@@@ -3134,14 -3031,22 +3031,22 @@@
return;
cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1);
- cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf);
+
+ cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
+ mem_cgroup_id(memcg), memcg_name_buf);
+ if (!cache_name)
+ return;
+
+ cachep = memcg_create_kmem_cache(memcg, root_cache, cache_name);
/*
* If we could not create a memcg cache, do not complain, because
* that's not critical at all as we can always proceed with the root
* cache.
*/
- if (!cachep)
+ if (!cachep) {
+ kfree(cache_name);
return;
+ }
list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
@@@ -3170,8 -3075,17 +3075,17 @@@ static void memcg_unregister_cache(stru
memcg = cachep->memcg_params->memcg;
id = memcg_cache_id(memcg);
- BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep);
- root_cache->memcg_params->memcg_caches[id] = NULL;
+ /*
+ * This function can be called both after and before css offline. If
+ * it's called before css offline, which happens on the root cache
+ * destruction, we should clear the slot corresponding to the cache in
+ * memcg_caches array. Otherwise the slot must have already been
+ * cleared in memcg_unregister_all_caches.
+ */
+ if (id >= 0) {
+ BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep);
+ root_cache->memcg_params->memcg_caches[id] = NULL;
+ }
list_del(&cachep->memcg_params->list);
@@@ -3209,42 -3123,41 +3123,41 @@@ static inline void memcg_resume_kmem_ac
current->memcg_kmem_skip_account--;
}
- int __memcg_cleanup_cache_params(struct kmem_cache *s)
+ void __memcg_cleanup_cache_params(struct kmem_cache *s)
{
- struct kmem_cache *c;
- int i, failed = 0;
+ struct memcg_cache_params *params, *tmp;
mutex_lock(&memcg_slab_mutex);
- for_each_memcg_cache_index(i) {
- c = cache_from_memcg_idx(s, i);
- if (!c)
- continue;
-
- memcg_unregister_cache(c);
-
- if (cache_from_memcg_idx(s, i))
- failed++;
- }
+ list_for_each_entry_safe(params, tmp,
+ &s->memcg_params->children, siblings)
+ memcg_unregister_cache(params->cachep);
mutex_unlock(&memcg_slab_mutex);
- return failed;
}
static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
{
- struct kmem_cache *cachep;
struct memcg_cache_params *params, *tmp;
+ int id = memcg_cache_id(memcg);
if (!memcg_kmem_is_active(memcg))
return;
mutex_lock(&memcg_slab_mutex);
+ memcg->kmemcg_id = -1;
list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
- cachep = memcg_params_to_cache(params);
+ struct kmem_cache *cachep = params->cachep;
+ struct kmem_cache *root_cache = params->root_cache;
+
+ BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep);
+ root_cache->memcg_params->memcg_caches[id] = NULL;
+
kmem_cache_shrink(cachep);
if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
memcg_unregister_cache(cachep);
}
mutex_unlock(&memcg_slab_mutex);
+
+ ida_simple_remove(&kmem_limited_groups, id);
}
struct memcg_register_cache_work {
@@@ -3343,6 -3256,7 +3256,7 @@@ struct kmem_cache *__memcg_kmem_get_cac
{
struct mem_cgroup *memcg;
struct kmem_cache *memcg_cachep;
+ int id;
VM_BUG_ON(!cachep->memcg_params);
VM_BUG_ON(!cachep->memcg_params->is_root_cache);
@@@ -3356,7 -3270,15 +3270,15 @@@
if (!memcg_can_account_kmem(memcg))
goto out;
- memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
+ id = memcg_cache_id(memcg);
+ /*
+ * This can happen if current was migrated to another cgroup and this
+ * cgroup was taken offline after we issued mem_cgroup_from_task above.
+ */
+ if (unlikely(id < 0))
+ goto out;
+
+ memcg_cachep = cache_from_memcg_idx(cachep, id);
if (likely(memcg_cachep)) {
cachep = memcg_cachep;
goto out;
@@@ -3463,12 -3385,13 +3385,13 @@@ void __memcg_kmem_commit_charge(struct
memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
return;
}
-
+ /*
+ * The page is freshly allocated and not visible to any
+ * outside callers yet. Set up pc non-atomically.
+ */
pc = lookup_page_cgroup(page);
- lock_page_cgroup(pc);
pc->mem_cgroup = memcg;
- SetPageCgroupUsed(pc);
- unlock_page_cgroup(pc);
+ pc->flags = PCG_USED;
}
void __memcg_kmem_uncharge_pages(struct page *page, int order)
@@@ -3478,19 -3401,11 +3401,11 @@@
pc = lookup_page_cgroup(page);
if (!PageCgroupUsed(pc))
return;
- lock_page_cgroup(pc);
- if (PageCgroupUsed(pc)) {
- memcg = pc->mem_cgroup;
- ClearPageCgroupUsed(pc);
- }
- unlock_page_cgroup(pc);
+ memcg = pc->mem_cgroup;
+ pc->flags = 0;
/*
* We trust that only if there is a memcg associated with the page, it
@@@ -3510,7 -3425,6 +3425,6 @@@ static inline void memcg_unregister_all
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
/*
* Because tail pages are not marked as "used", set it. We're under
* zone->lru_lock, 'splitting on pmd' and compound_lock.
@@@ -3531,8 -3445,7 +3445,7 @@@ void mem_cgroup_split_huge_fixup(struc
for (i = 1; i < HPAGE_PMD_NR; i++) {
pc = head_pc + i;
pc->mem_cgroup = memcg;
- smp_wmb();/* see __commit_charge() */
- pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
+ pc->flags = head_pc->flags;
}
__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
HPAGE_PMD_NR);
@@@ -3562,7 -3475,6 +3475,6 @@@ static int mem_cgroup_move_account(stru
{
unsigned long flags;
int ret;
- bool anon = PageAnon(page);
VM_BUG_ON(from == to);
VM_BUG_ON_PAGE(PageLRU(page), page);
@@@ -3576,15 -3488,21 +3488,21 @@@
if (nr_pages > 1 && !PageTransHuge(page))
goto out;
- lock_page_cgroup(pc);
+ /*
+ * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup
+ * of its source page while we change it: page migration takes
+ * both pages off the LRU, but page cache replacement doesn't.
+ */
+ if (!trylock_page(page))
+ goto out;
ret = -EINVAL;
if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
- goto unlock;
+ goto out_unlock;
move_lock_mem_cgroup(from, &flags);
- if (!anon && page_mapped(page)) {
+ if (!PageAnon(page) && page_mapped(page)) {
__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
nr_pages);
__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
@@@ -3598,20 -3516,25 +3516,25 @@@
nr_pages);
}
- mem_cgroup_charge_statistics(from, page, anon, -nr_pages);
+ /*
+ * It is safe to change pc->mem_cgroup here because the page
+ * is referenced, charged, and isolated - we can't race with
+ * uncharging, charging, migration, or LRU putback.
+ */
/* caller should have done css_get */
pc->mem_cgroup = to;
- mem_cgroup_charge_statistics(to, page, anon, nr_pages);
move_unlock_mem_cgroup(from, &flags);
ret = 0;
- unlock:
- unlock_page_cgroup(pc);
- /*
- * check events
- */
+
+ local_irq_disable();
+ mem_cgroup_charge_statistics(to, page, nr_pages);
memcg_check_events(to, page);
+ mem_cgroup_charge_statistics(from, page, -nr_pages);
memcg_check_events(from, page);
+ local_irq_enable();
+ out_unlock:
+ unlock_page(page);
out:
return ret;
}
@@@ -3682,483 -3605,39 +3605,39 @@@ out
return ret;
}
- int mem_cgroup_charge_anon(struct page *page,
- struct mm_struct *mm, gfp_t gfp_mask)
+ #ifdef CONFIG_MEMCG_SWAP
+ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
+ bool charge)
{
- unsigned int nr_pages = 1;
- struct mem_cgroup *memcg;
- bool oom = true;
+ int val = (charge) ? 1 : -1;
+ this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
+ }
- if (mem_cgroup_disabled())
- return 0;
+ /**
+ * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
+ * @entry: swap entry to be moved
+ * @from: mem_cgroup which the entry is moved from
+ * @to: mem_cgroup which the entry is moved to
+ *
+ * It succeeds only when the swap_cgroup's record for this entry is the same
+ * as the mem_cgroup's id of @from.
+ *
+ * Returns 0 on success, -EINVAL on failure.
+ *
+ * The caller must have charged to @to, IOW, called res_counter_charge() about
+ * both res and memsw, and called css_get().
+ */
+ static int mem_cgroup_move_swap_account(swp_entry_t entry,
+ struct mem_cgroup *from, struct mem_cgroup *to)
+ {
+ unsigned short old_id, new_id;
- VM_BUG_ON_PAGE(page_mapped(page), page);
- VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
- VM_BUG_ON(!mm);
+ old_id = mem_cgroup_id(from);
+ new_id = mem_cgroup_id(to);
- if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- /*
- * Never OOM-kill a process for a huge page. The
- * fault handler will fall back to regular pages.
- */
- oom = false;
- }
-
- memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom);
- if (!memcg)
- return -ENOMEM;
- __mem_cgroup_commit_charge(memcg, page, nr_pages,
- MEM_CGROUP_CHARGE_TYPE_ANON, false);
- return 0;
- }
-
- /*
- * While swap-in, try_charge -> commit or cancel, the page is locked.
- * And when try_charge() successfully returns, one refcnt to memcg without
- * struct page_cgroup is acquired. This refcnt will be consumed by
- * "commit()" or removed by "cancel()"
- */
- static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
- struct page *page,
- gfp_t mask,
- struct mem_cgroup **memcgp)
- {
- struct mem_cgroup *memcg = NULL;
- struct page_cgroup *pc;
- int ret;
-
- pc = lookup_page_cgroup(page);
- /*
- * Every swap fault against a single page tries to charge the
- * page, bail as early as possible. shmem_unuse() encounters
- * already charged pages, too. The USED bit is protected by
- * the page lock, which serializes swap cache removal, which
- * in turn serializes uncharging.
- */
- if (PageCgroupUsed(pc))
- goto out;
- if (do_swap_account)
- memcg = try_get_mem_cgroup_from_page(page);
- if (!memcg)
- memcg = get_mem_cgroup_from_mm(mm);
- ret = mem_cgroup_try_charge(memcg, mask, 1, true);
- css_put(&memcg->css);
- if (ret == -EINTR)
- memcg = root_mem_cgroup;
- else if (ret)
- return ret;
- out:
- *memcgp = memcg;
- return 0;
- }
-
- int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
- gfp_t gfp_mask, struct mem_cgroup **memcgp)
- {
- if (mem_cgroup_disabled()) {
- *memcgp = NULL;
- return 0;
- }
- /*
- * A racing thread's fault, or swapoff, may have already
- * updated the pte, and even removed page from swap cache: in
- * those cases unuse_pte()'s pte_same() test will fail; but
- * there's also a KSM case which does need to charge the page.
- */
- if (!PageSwapCache(page)) {
- struct mem_cgroup *memcg;
-
- memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
- if (!memcg)
- return -ENOMEM;
- *memcgp = memcg;
- return 0;
- }
- return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
- }
-
- void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
- {
- if (mem_cgroup_disabled())
- return;
- if (!memcg)
- return;
- __mem_cgroup_cancel_charge(memcg, 1);
- }
-
- static void
- __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
- enum charge_type ctype)
- {
- if (mem_cgroup_disabled())
- return;
- if (!memcg)
- return;
-
- __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
- /*
- * Now swap is on-memory. This means this page may be
- * counted both as mem and swap....double count.
- * Fix it by uncharging from memsw. Basically, this SwapCache is stable
- * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
- * may call delete_from_swap_cache() before reach here.
- */
- if (do_swap_account && PageSwapCache(page)) {
- swp_entry_t ent = {.val = page_private(page)};
- mem_cgroup_uncharge_swap(ent);
- }
- }
-
- void mem_cgroup_commit_charge_swapin(struct page *page,
- struct mem_cgroup *memcg)
- {
- __mem_cgroup_commit_charge_swapin(page, memcg,
- MEM_CGROUP_CHARGE_TYPE_ANON);
- }
-
- int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask)
- {
- enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
- struct mem_cgroup *memcg;
- int ret;
-
- if (mem_cgroup_disabled())
- return 0;
- if (PageCompound(page))
- return 0;
-
- if (PageSwapCache(page)) { /* shmem */
- ret = __mem_cgroup_try_charge_swapin(mm, page,
- gfp_mask, &memcg);
- if (ret)
- return ret;
- __mem_cgroup_commit_charge_swapin(page, memcg, type);
- return 0;
- }
-
- memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
- if (!memcg)
- return -ENOMEM;
- __mem_cgroup_commit_charge(memcg, page, 1, type, false);
- return 0;
- }
-
- static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
- unsigned int nr_pages,
- const enum charge_type ctype)
- {
- struct memcg_batch_info *batch = NULL;
- bool uncharge_memsw = true;
-
- /* If swapout, usage of swap doesn't decrease */
- if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
- uncharge_memsw = false;
-
- batch = &current->memcg_batch;
- /*
- * In usual, we do css_get() when we remember memcg pointer.
- * But in this case, we keep res->usage until end of a series of
- * uncharges. Then, it's ok to ignore memcg's refcnt.
- */
- if (!batch->memcg)
- batch->memcg = memcg;
- /*
- * do_batch > 0 when unmapping pages or inode invalidate/truncate.
- * In those cases, all pages freed continuously can be expected to be in
- * the same cgroup and we have chance to coalesce uncharges.
- * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
- * because we want to do uncharge as soon as possible.
- */
-
- if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
- goto direct_uncharge;
-
- if (nr_pages > 1)
- goto direct_uncharge;
-
- /*
- * In typical case, batch->memcg == mem. This means we can
- * merge a series of uncharges to an uncharge of res_counter.
- * If not, we uncharge res_counter ony by one.
- */
- if (batch->memcg != memcg)
- goto direct_uncharge;
- /* remember freed charge and uncharge it later */
- batch->nr_pages++;
- if (uncharge_memsw)
- batch->memsw_nr_pages++;
- return;
- direct_uncharge:
- res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
- if (uncharge_memsw)
- res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
- if (unlikely(batch->memcg != memcg))
- memcg_oom_recover(memcg);
- }
-
- /*
- * uncharge if !page_mapped(page)
- */
- static struct mem_cgroup *
- __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
- bool end_migration)
- {
- struct mem_cgroup *memcg = NULL;
- unsigned int nr_pages = 1;
- struct page_cgroup *pc;
- bool anon;
-
- if (mem_cgroup_disabled())
- return NULL;
-
- if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- }
- /*
- * Check if our page_cgroup is valid
- */
- pc = lookup_page_cgroup(page);
- if (unlikely(!PageCgroupUsed(pc)))
- return NULL;
-
- lock_page_cgroup(pc);
-
- memcg = pc->mem_cgroup;
-
- if (!PageCgroupUsed(pc))
- goto unlock_out;
-
- anon = PageAnon(page);
-
- switch (ctype) {
- case MEM_CGROUP_CHARGE_TYPE_ANON:
- /*
- * Generally PageAnon tells if it's the anon statistics to be
- * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
- * used before page reached the stage of being marked PageAnon.
- */
- anon = true;
- /* fallthrough */
- case MEM_CGROUP_CHARGE_TYPE_DROP:
- /* See mem_cgroup_prepare_migration() */
- if (page_mapped(page))
- goto unlock_out;
- /*
- * Pages under migration may not be uncharged. But
- * end_migration() /must/ be the one uncharging the
- * unused post-migration page and so it has to call
- * here with the migration bit still set. See the
- * res_counter handling below.
- */
- if (!end_migration && PageCgroupMigration(pc))
- goto unlock_out;
- break;
- case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
- if (!PageAnon(page)) { /* Shared memory */
- if (page->mapping && !page_is_file_cache(page))
- goto unlock_out;
- } else if (page_mapped(page)) /* Anon */
- goto unlock_out;
- break;
- default:
- break;
- }
-
- mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages);
-
- ClearPageCgroupUsed(pc);
- /*
- * pc->mem_cgroup is not cleared here. It will be accessed when it's
- * freed from LRU. This is safe because uncharged page is expected not
- * to be reused (freed soon). Exception is SwapCache, it's handled by
- * special functions.
- */
-
- unlock_page_cgroup(pc);
- /*
- * even after unlock, we have memcg->res.usage here and this memcg
- * will never be freed, so it's safe to call css_get().
- */
- memcg_check_events(memcg, page);
- if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
- mem_cgroup_swap_statistics(memcg, true);
- css_get(&memcg->css);
- }
- /*
- * Migration does not charge the res_counter for the
- * replacement page, so leave it alone when phasing out the
- * page that is unused after the migration.
- */
- if (!end_migration && !mem_cgroup_is_root(memcg))
- mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
-
- return memcg;
-
- unlock_out:
- unlock_page_cgroup(pc);
- return NULL;
- }
-
- void mem_cgroup_uncharge_page(struct page *page)
- {
- /* early check. */
- if (page_mapped(page))
- return;
- VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
- /*
- * If the page is in swap cache, uncharge should be deferred
- * to the swap path, which also properly accounts swap usage
- * and handles memcg lifetime.
- *
- * Note that this check is not stable and reclaim may add the
- * page to swap cache at any time after this. However, if the
- * page is not in swap cache by the time page->mapcount hits
- * 0, there won't be any page table references to the swap
- * slot, and reclaim will free it and not actually write the
- * page to disk.
- */
- if (PageSwapCache(page))
- return;
- __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
- }
-
- void mem_cgroup_uncharge_cache_page(struct page *page)
- {
- VM_BUG_ON_PAGE(page_mapped(page), page);
- VM_BUG_ON_PAGE(page->mapping, page);
- __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
- }
-
- /*
- * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate.
- * In that cases, pages are freed continuously and we can expect pages
- * are in the same memcg. All these calls itself limits the number of
- * pages freed at once, then uncharge_start/end() is called properly.
- * This may be called prural(2) times in a context,
- */
-
- void mem_cgroup_uncharge_start(void)
- {
- current->memcg_batch.do_batch++;
- /* We can do nest. */
- if (current->memcg_batch.do_batch == 1) {
- current->memcg_batch.memcg = NULL;
- current->memcg_batch.nr_pages = 0;
- current->memcg_batch.memsw_nr_pages = 0;
- }
- }
-
- void mem_cgroup_uncharge_end(void)
- {
- struct memcg_batch_info *batch = &current->memcg_batch;
-
- if (!batch->do_batch)
- return;
-
- batch->do_batch--;
- if (batch->do_batch) /* If stacked, do nothing. */
- return;
-
- if (!batch->memcg)
- return;
- /*
- * This "batch->memcg" is valid without any css_get/put etc...
- * bacause we hide charges behind us.
- */
- if (batch->nr_pages)
- res_counter_uncharge(&batch->memcg->res,
- batch->nr_pages * PAGE_SIZE);
- if (batch->memsw_nr_pages)
- res_counter_uncharge(&batch->memcg->memsw,
- batch->memsw_nr_pages * PAGE_SIZE);
- memcg_oom_recover(batch->memcg);
- /* forget this pointer (for sanity check) */
- batch->memcg = NULL;
- }
-
- #ifdef CONFIG_SWAP
- /*
- * called after __delete_from_swap_cache() and drop "page" account.
- * memcg information is recorded to swap_cgroup of "ent"
- */
- void
- mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
- {
- struct mem_cgroup *memcg;
- int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
-
- if (!swapout) /* this was a swap cache but the swap is unused ! */
- ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
-
- memcg = __mem_cgroup_uncharge_common(page, ctype, false);
-
- /*
- * record memcg information, if swapout && memcg != NULL,
- * css_get() was called in uncharge().
- */
- if (do_swap_account && swapout && memcg)
- swap_cgroup_record(ent, mem_cgroup_id(memcg));
- }
- #endif
-
- #ifdef CONFIG_MEMCG_SWAP
- /*
- * called from swap_entry_free(). remove record in swap_cgroup and
- * uncharge "memsw" account.
- */
- void mem_cgroup_uncharge_swap(swp_entry_t ent)
- {
- struct mem_cgroup *memcg;
- unsigned short id;
-
- if (!do_swap_account)
- return;
-
- id = swap_cgroup_record(ent, 0);
- rcu_read_lock();
- memcg = mem_cgroup_lookup(id);
- if (memcg) {
- /*
- * We uncharge this because swap is freed. This memcg can
- * be obsolete one. We avoid calling css_tryget_online().
- */
- if (!mem_cgroup_is_root(memcg))
- res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
- mem_cgroup_swap_statistics(memcg, false);
- css_put(&memcg->css);
- }
- rcu_read_unlock();
- }
-
- /**
- * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
- * @entry: swap entry to be moved
- * @from: mem_cgroup which the entry is moved from
- * @to: mem_cgroup which the entry is moved to
- *
- * It succeeds only when the swap_cgroup's record for this entry is the same
- * as the mem_cgroup's id of @from.
- *
- * Returns 0 on success, -EINVAL on failure.
- *
- * The caller must have charged to @to, IOW, called res_counter_charge() about
- * both res and memsw, and called css_get().
- */
- static int mem_cgroup_move_swap_account(swp_entry_t entry,
- struct mem_cgroup *from, struct mem_cgroup *to)
- {
- unsigned short old_id, new_id;
-
- old_id = mem_cgroup_id(from);
- new_id = mem_cgroup_id(to);
-
- if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
- mem_cgroup_swap_statistics(from, false);
- mem_cgroup_swap_statistics(to, true);
+ if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
+ mem_cgroup_swap_statistics(from, false);
+ mem_cgroup_swap_statistics(to, true);
/*
* This function is only called from task migration context now.
* It postpones res_counter and refcount handling till the end
@@@ -4183,175 -3662,6 +3662,6 @@@ static inline int mem_cgroup_move_swap_
}
#endif
- /*
- * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
- * page belongs to.
- */
- void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
- struct mem_cgroup **memcgp)
- {
- struct mem_cgroup *memcg = NULL;
- unsigned int nr_pages = 1;
- struct page_cgroup *pc;
- enum charge_type ctype;
-
- *memcgp = NULL;
-
- if (mem_cgroup_disabled())
- return;
-
- if (PageTransHuge(page))
- nr_pages <<= compound_order(page);
-
- pc = lookup_page_cgroup(page);
- lock_page_cgroup(pc);
- if (PageCgroupUsed(pc)) {
- memcg = pc->mem_cgroup;
- css_get(&memcg->css);
- /*
- * At migrating an anonymous page, its mapcount goes down
- * to 0 and uncharge() will be called. But, even if it's fully
- * unmapped, migration may fail and this page has to be
- * charged again. We set MIGRATION flag here and delay uncharge
- * until end_migration() is called
- *
- * Corner Case Thinking
- * A)
- * When the old page was mapped as Anon and it's unmap-and-freed
- * while migration was ongoing.
- * If unmap finds the old page, uncharge() of it will be delayed
- * until end_migration(). If unmap finds a new page, it's
- * uncharged when it make mapcount to be 1->0. If unmap code
- * finds swap_migration_entry, the new page will not be mapped
- * and end_migration() will find it(mapcount==0).
- *
- * B)
- * When the old page was mapped but migraion fails, the kernel
- * remaps it. A charge for it is kept by MIGRATION flag even
- * if mapcount goes down to 0. We can do remap successfully
- * without charging it again.
- *
- * C)
- * The "old" page is under lock_page() until the end of
- * migration, so, the old page itself will not be swapped-out.
- * If the new page is swapped out before end_migraton, our
- * hook to usual swap-out path will catch the event.
- */
- if (PageAnon(page))
- SetPageCgroupMigration(pc);
- }
- unlock_page_cgroup(pc);
- /*
- * If the page is not charged at this point,
- * we return here.
- */
- if (!memcg)
- return;
-
- *memcgp = memcg;
- /*
- * We charge new page before it's used/mapped. So, even if unlock_page()
- * is called before end_migration, we can catch all events on this new
- * page. In the case new page is migrated but not remapped, new page's
- * mapcount will be finally 0 and we call uncharge in end_migration().
- */
- if (PageAnon(page))
- ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
- else
- ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
- /*
- * The page is committed to the memcg, but it's not actually
- * charged to the res_counter since we plan on replacing the
- * old one and only one page is going to be left afterwards.
- */
- __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
- }
-
- /* remove redundant charge if migration failed*/
- void mem_cgroup_end_migration(struct mem_cgroup *memcg,
- struct page *oldpage, struct page *newpage, bool migration_ok)
- {
- struct page *used, *unused;
- struct page_cgroup *pc;
- bool anon;
-
- if (!memcg)
- return;
-
- if (!migration_ok) {
- used = oldpage;
- unused = newpage;
- } else {
- used = newpage;
- unused = oldpage;
- }
- anon = PageAnon(used);
- __mem_cgroup_uncharge_common(unused,
- anon ? MEM_CGROUP_CHARGE_TYPE_ANON
- : MEM_CGROUP_CHARGE_TYPE_CACHE,
- true);
- css_put(&memcg->css);
- /*
- * We disallowed uncharge of pages under migration because mapcount
- * of the page goes down to zero, temporarly.
- * Clear the flag and check the page should be charged.
- */
- pc = lookup_page_cgroup(oldpage);
- lock_page_cgroup(pc);
- ClearPageCgroupMigration(pc);
- unlock_page_cgroup(pc);
-
- /*
- * If a page is a file cache, radix-tree replacement is very atomic
- * and we can skip this check. When it was an Anon page, its mapcount
- * goes down to 0. But because we added MIGRATION flage, it's not
- * uncharged yet. There are several case but page->mapcount check
- * and USED bit check in mem_cgroup_uncharge_page() will do enough
- * check. (see prepare_charge() also)
- */
- if (anon)
- mem_cgroup_uncharge_page(used);
- }
-
- /*
- * At replace page cache, newpage is not under any memcg but it's on
- * LRU. So, this function doesn't touch res_counter but handles LRU
- * in correct way. Both pages are locked so we cannot race with uncharge.
- */
- void mem_cgroup_replace_page_cache(struct page *oldpage,
- struct page *newpage)
- {
- struct mem_cgroup *memcg = NULL;
- struct page_cgroup *pc;
- enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
-
- if (mem_cgroup_disabled())
- return;
-
- pc = lookup_page_cgroup(oldpage);
- /* fix accounting on old pages */
- lock_page_cgroup(pc);
- if (PageCgroupUsed(pc)) {
- memcg = pc->mem_cgroup;
- mem_cgroup_charge_statistics(memcg, oldpage, false, -1);
- ClearPageCgroupUsed(pc);
- }
- unlock_page_cgroup(pc);
-
- /*
- * When called from shmem_replace_page(), in some cases the
- * oldpage has already been charged, and in some cases not.
- */
- if (!memcg)
- return;
- /*
- * Even if newpage->mapping was NULL before starting replacement,
- * the newpage may be on LRU(or pagevec for LRU) already. We lock
- * LRU while we overwrite pc->mem_cgroup.
- */
- __mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
- }
-
#ifdef CONFIG_DEBUG_VM
static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
{
@@@ -4550,7 -3860,7 +3860,7 @@@ unsigned long mem_cgroup_soft_limit_rec
gfp_mask, &nr_scanned);
nr_reclaimed += reclaimed;
*total_scanned += nr_scanned;
- spin_lock(&mctz->lock);
+ spin_lock_irq(&mctz->lock);
/*
* If we failed to reclaim anything from this memory cgroup
@@@ -4590,7 -3900,7 +3900,7 @@@
*/
/* If excess == 0, no tree ops */
__mem_cgroup_insert_exceeded(mz, mctz, excess);
- spin_unlock(&mctz->lock);
+ spin_unlock_irq(&mctz->lock);
css_put(&mz->memcg->css);
loop++;
/*
@@@ -4809,86 -4119,32 +4119,32 @@@ static int mem_cgroup_hierarchy_write(s
else
retval = -EBUSY;
} else
- retval = -EINVAL;
-
- out:
- mutex_unlock(&memcg_create_mutex);
-
- return retval;
- }
-
-
- static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
- enum mem_cgroup_stat_index idx)
- {
- struct mem_cgroup *iter;
- long val = 0;
-
- /* Per-cpu values can be negative, use a signed accumulator */
- for_each_mem_cgroup_tree(iter, memcg)
- val += mem_cgroup_read_stat(iter, idx);
-
- if (val < 0) /* race ? */
- val = 0;
- return val;
- }
-
- static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
- {
- u64 val;
-
- if (!mem_cgroup_is_root(memcg)) {
- if (!swap)
- return res_counter_read_u64(&memcg->res, RES_USAGE);
- else
- return res_counter_read_u64(&memcg->memsw, RES_USAGE);
- }
-
- /*
- * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
- * as well as in MEM_CGROUP_STAT_RSS_HUGE.
- */
- val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
- val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
+ retval = -EINVAL;
- if (swap)
- val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
+ out:
+ mutex_unlock(&memcg_create_mutex);
- return val << PAGE_SHIFT;
+ return retval;
}
static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
- struct cftype *cft)
+ struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- u64 val;
- int name;
- enum res_type type;
-
- type = MEMFILE_TYPE(cft->private);
- name = MEMFILE_ATTR(cft->private);
+ enum res_type type = MEMFILE_TYPE(cft->private);
+ int name = MEMFILE_ATTR(cft->private);
switch (type) {
case _MEM:
- if (name == RES_USAGE)
- val = mem_cgroup_usage(memcg, false);
- else
- val = res_counter_read_u64(&memcg->res, name);
- break;
+ return res_counter_read_u64(&memcg->res, name);
case _MEMSWAP:
- if (name == RES_USAGE)
- val = mem_cgroup_usage(memcg, true);
- else
- val = res_counter_read_u64(&memcg->memsw, name);
- break;
+ return res_counter_read_u64(&memcg->memsw, name);
case _KMEM:
- val = res_counter_read_u64(&memcg->kmem, name);
+ return res_counter_read_u64(&memcg->kmem, name);
break;
default:
BUG();
}
-
- return val;
}
#ifdef CONFIG_MEMCG_KMEM
@@@ -5350,7 -4606,10 +4606,10 @@@ static void __mem_cgroup_threshold(stru
if (!t)
goto unlock;
- usage = mem_cgroup_usage(memcg, swap);
+ if (!swap)
+ usage = res_counter_read_u64(&memcg->res, RES_USAGE);
+ else
+ usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
/*
* current_threshold points to threshold just below or equal to usage.
@@@ -5442,15 -4701,15 +4701,15 @@@ static int __mem_cgroup_usage_register_
mutex_lock(&memcg->thresholds_lock);
- if (type == _MEM)
+ if (type == _MEM) {
thresholds = &memcg->thresholds;
- else if (type == _MEMSWAP)
+ usage = res_counter_read_u64(&memcg->res, RES_USAGE);
+ } else if (type == _MEMSWAP) {
thresholds = &memcg->memsw_thresholds;
- else
+ usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+ } else
BUG();
- usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
-
/* Check if a threshold crossed before adding a new one */
if (thresholds->primary)
__mem_cgroup_threshold(memcg, type == _MEMSWAP);
@@@ -5530,18 -4789,19 +4789,19 @@@ static void __mem_cgroup_usage_unregist
int i, j, size;
mutex_lock(&memcg->thresholds_lock);
- if (type == _MEM)
+
+ if (type == _MEM) {
thresholds = &memcg->thresholds;
- else if (type == _MEMSWAP)
+ usage = res_counter_read_u64(&memcg->res, RES_USAGE);
+ } else if (type == _MEMSWAP) {
thresholds = &memcg->memsw_thresholds;
- else
+ usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+ } else
BUG();
if (!thresholds->primary)
goto unlock;
- usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
-
/* Check if a threshold crossed before removing */
__mem_cgroup_threshold(memcg, type == _MEMSWAP);
@@@ -6003,6 -5263,7 +5263,6 @@@ static struct cftype mem_cgroup_files[
},
{
.name = "use_hierarchy",
- .flags = CFTYPE_INSANE,
.write_u64 = mem_cgroup_hierarchy_write,
.read_u64 = mem_cgroup_hierarchy_read,
},
@@@ -6295,9 -5556,9 +5555,9 @@@ mem_cgroup_css_online(struct cgroup_sub
* core guarantees its existence.
*/
} else {
- res_counter_init(&memcg->res, NULL);
- res_counter_init(&memcg->memsw, NULL);
- res_counter_init(&memcg->kmem, NULL);
+ res_counter_init(&memcg->res, &root_mem_cgroup->res);
+ res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw);
+ res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
/*
* Deeper hierachy with use_hierarchy == false doesn't make
* much sense so let cgroup subsystem know about this
@@@ -6406,80 -5667,40 +5666,63 @@@ static void mem_cgroup_css_free(struct
__mem_cgroup_free(memcg);
}
+/**
+ * mem_cgroup_css_reset - reset the states of a mem_cgroup
+ * @css: the target css
+ *
+ * Reset the states of the mem_cgroup associated with @css. This is
+ * invoked when the userland requests disabling on the default hierarchy
+ * but the memcg is pinned through dependency. The memcg should stop
+ * applying policies and should revert to the vanilla state as it may be
+ * made visible again.
+ *
+ * The current implementation only resets the essential configurations.
+ * This needs to be expanded to cover all the visible parts.
+ */
+static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ mem_cgroup_resize_limit(memcg, ULLONG_MAX);
+ mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX);
+ memcg_update_kmem_limit(memcg, ULLONG_MAX);
+ res_counter_set_soft_limit(&memcg->res, ULLONG_MAX);
+}
+
#ifdef CONFIG_MMU
/* Handlers for move charge at task migration. */
- #define PRECHARGE_COUNT_AT_ONCE 256
static int mem_cgroup_do_precharge(unsigned long count)
{
- int ret = 0;
- int batch_count = PRECHARGE_COUNT_AT_ONCE;
- struct mem_cgroup *memcg = mc.to;
+ int ret;
- if (mem_cgroup_is_root(memcg)) {
+ /* Try a single bulk charge without reclaim first */
+ ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count);
+ if (!ret) {
mc.precharge += count;
- /* we don't need css_get for root */
return ret;
}
- /* try to charge at once */
- if (count > 1) {
- struct res_counter *dummy;
- /*
- * "memcg" cannot be under rmdir() because we've already checked
- * by cgroup_lock_live_cgroup() that it is not removed and we
- * are still under the same cgroup_mutex. So we can postpone
- * css_get().
- */
- if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
- goto one_by_one;
- if (do_swap_account && res_counter_charge(&memcg->memsw,
- PAGE_SIZE * count, &dummy)) {
- res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
- goto one_by_one;
- }
- mc.precharge += count;
+ if (ret == -EINTR) {
+ cancel_charge(root_mem_cgroup, count);
return ret;
}
- one_by_one:
- /* fall back to one by one charge */
+
+ /* Try charges one by one with reclaim */
while (count--) {
- if (signal_pending(current)) {
- ret = -EINTR;
- break;
- }
- if (!batch_count--) {
- batch_count = PRECHARGE_COUNT_AT_ONCE;
- cond_resched();
- }
- ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false);
+ ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
+ /*
+ * In case of failure, any residual charges against
+ * mc.to will be dropped by mem_cgroup_clear_mc()
+ * later on. However, cancel any charges that are
+ * bypassed to root right away or they'll be lost.
+ */
+ if (ret == -EINTR)
+ cancel_charge(root_mem_cgroup, 1);
if (ret)
- /* mem_cgroup_clear_mc() will do uncharge later */
return ret;
mc.precharge++;
+ cond_resched();
}
- return ret;
+ return 0;
}
/**
@@@ -6615,9 -5836,9 +5858,9 @@@ static enum mc_target_type get_mctgt_ty
if (page) {
pc = lookup_page_cgroup(page);
/*
- * Do only loose check w/o page_cgroup lock.
- * mem_cgroup_move_account() checks the pc is valid or not under
- * the lock.
+ * Do only loose check w/o serialization.
+ * mem_cgroup_move_account() checks the pc is valid or
+ * not under LRU exclusion.
*/
if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
@@@ -6742,7 -5963,7 +5985,7 @@@ static void __mem_cgroup_clear_mc(void
/* we must uncharge all the leftover precharges from mc.to */
if (mc.precharge) {
- __mem_cgroup_cancel_charge(mc.to, mc.precharge);
+ cancel_charge(mc.to, mc.precharge);
mc.precharge = 0;
}
/*
@@@ -6750,27 -5971,24 +5993,24 @@@
* we must uncharge here.
*/
if (mc.moved_charge) {
- __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
+ cancel_charge(mc.from, mc.moved_charge);
mc.moved_charge = 0;
}
/* we must fixup refcnts and charges */
if (mc.moved_swap) {
/* uncharge swap account from the old cgroup */
- if (!mem_cgroup_is_root(mc.from))
- res_counter_uncharge(&mc.from->memsw,
- PAGE_SIZE * mc.moved_swap);
+ res_counter_uncharge(&mc.from->memsw,
+ PAGE_SIZE * mc.moved_swap);
for (i = 0; i < mc.moved_swap; i++)
css_put(&mc.from->css);
- if (!mem_cgroup_is_root(mc.to)) {
- /*
- * we charged both to->res and to->memsw, so we should
- * uncharge to->res.
- */
- res_counter_uncharge(&mc.to->res,
- PAGE_SIZE * mc.moved_swap);
- }
+ /*
+ * we charged both to->res and to->memsw, so we should
+ * uncharge to->res.
+ */
+ res_counter_uncharge(&mc.to->res,
+ PAGE_SIZE * mc.moved_swap);
/* we've already done css_get(mc.to) */
mc.moved_swap = 0;
}
@@@ -7023,17 -6241,16 +6263,17 @@@ static void mem_cgroup_move_task(struc
/*
* Cgroup retains root cgroups across [un]mount cycles making it necessary
- * to verify sane_behavior flag on each mount attempt.
+ * to verify whether we're attached to the default hierarchy on each mount
+ * attempt.
*/
static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
{
/*
- * use_hierarchy is forced with sane_behavior. cgroup core
+ * use_hierarchy is forced on the default hierarchy. cgroup core
* guarantees that @root doesn't have any children, so turning it
* on for the root memcg is enough.
*/
- if (cgroup_sane_behavior(root_css->cgroup))
+ if (cgroup_on_dfl(root_css->cgroup))
mem_cgroup_from_css(root_css)->use_hierarchy = true;
}
@@@ -7042,12 -6259,11 +6282,12 @@@ struct cgroup_subsys memory_cgrp_subsy
.css_online = mem_cgroup_css_online,
.css_offline = mem_cgroup_css_offline,
.css_free = mem_cgroup_css_free,
+ .css_reset = mem_cgroup_css_reset,
.can_attach = mem_cgroup_can_attach,
.cancel_attach = mem_cgroup_cancel_attach,
.attach = mem_cgroup_move_task,
.bind = mem_cgroup_bind,
- .base_cftypes = mem_cgroup_files,
+ .legacy_cftypes = mem_cgroup_files,
.early_init = 0,
};
@@@ -7064,8 -6280,7 +6304,8 @@@ __setup("swapaccount=", enable_swap_acc
static void __init memsw_file_init(void)
{
- WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files));
+ WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
+ memsw_cgroup_files));
}
static void __init enable_swap_cgroup(void)
@@@ -7082,6 -6297,397 +6322,397 @@@ static void __init enable_swap_cgroup(v
}
#endif
+ #ifdef CONFIG_MEMCG_SWAP
+ /**
+ * mem_cgroup_swapout - transfer a memsw charge to swap
+ * @page: page whose memsw charge to transfer
+ * @entry: swap entry to move the charge to
+ *
+ * Transfer the memsw charge of @page to @entry.
+ */
+ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
+ {
+ struct page_cgroup *pc;
+ unsigned short oldid;
+
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_PAGE(page_count(page), page);
+
+ if (!do_swap_account)
+ return;
+
+ pc = lookup_page_cgroup(page);
+
+ /* Readahead page, never charged */
+ if (!PageCgroupUsed(pc))
+ return;
+
+ VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page);
+
+ oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup));
+ VM_BUG_ON_PAGE(oldid, page);
+
+ pc->flags &= ~PCG_MEMSW;
+ css_get(&pc->mem_cgroup->css);
+ mem_cgroup_swap_statistics(pc->mem_cgroup, true);
+ }
+
+ /**
+ * mem_cgroup_uncharge_swap - uncharge a swap entry
+ * @entry: swap entry to uncharge
+ *
+ * Drop the memsw charge associated with @entry.
+ */
+ void mem_cgroup_uncharge_swap(swp_entry_t entry)
+ {
+ struct mem_cgroup *memcg;
+ unsigned short id;
+
+ if (!do_swap_account)
+ return;
+
+ id = swap_cgroup_record(entry, 0);
+ rcu_read_lock();
+ memcg = mem_cgroup_lookup(id);
+ if (memcg) {
+ res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+ mem_cgroup_swap_statistics(memcg, false);
+ css_put(&memcg->css);
+ }
+ rcu_read_unlock();
+ }
+ #endif
+
+ /**
+ * mem_cgroup_try_charge - try charging a page
+ * @page: page to charge
+ * @mm: mm context of the victim
+ * @gfp_mask: reclaim mode
+ * @memcgp: charged memcg return
+ *
+ * Try to charge @page to the memcg that @mm belongs to, reclaiming
+ * pages according to @gfp_mask if necessary.
+ *
+ * Returns 0 on success, with *@memcgp pointing to the charged memcg.
+ * Otherwise, an error code is returned.
+ *
+ * After page->mapping has been set up, the caller must finalize the
+ * charge with mem_cgroup_commit_charge(). Or abort the transaction
+ * with mem_cgroup_cancel_charge() in case page instantiation fails.
+ */
+ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask, struct mem_cgroup **memcgp)
+ {
+ struct mem_cgroup *memcg = NULL;
+ unsigned int nr_pages = 1;
+ int ret = 0;
+
+ if (mem_cgroup_disabled())
+ goto out;
+
+ if (PageSwapCache(page)) {
+ struct page_cgroup *pc = lookup_page_cgroup(page);
+ /*
+ * Every swap fault against a single page tries to charge the
+ * page, bail as early as possible. shmem_unuse() encounters
+ * already charged pages, too. The USED bit is protected by
+ * the page lock, which serializes swap cache removal, which
+ * in turn serializes uncharging.
+ */
+ if (PageCgroupUsed(pc))
+ goto out;
+ }
+
+ if (PageTransHuge(page)) {
+ nr_pages <<= compound_order(page);
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ }
+
+ if (do_swap_account && PageSwapCache(page))
+ memcg = try_get_mem_cgroup_from_page(page);
+ if (!memcg)
+ memcg = get_mem_cgroup_from_mm(mm);
+
+ ret = try_charge(memcg, gfp_mask, nr_pages);
+
+ css_put(&memcg->css);
+
+ if (ret == -EINTR) {
+ memcg = root_mem_cgroup;
+ ret = 0;
+ }
+ out:
+ *memcgp = memcg;
+ return ret;
+ }
+
+ /**
+ * mem_cgroup_commit_charge - commit a page charge
+ * @page: page to charge
+ * @memcg: memcg to charge the page to
+ * @lrucare: page might be on LRU already
+ *
+ * Finalize a charge transaction started by mem_cgroup_try_charge(),
+ * after page->mapping has been set up. This must happen atomically
+ * as part of the page instantiation, i.e. under the page table lock
+ * for anonymous pages, under the page lock for page and swap cache.
+ *
+ * In addition, the page must not be on the LRU during the commit, to
+ * prevent racing with task migration. If it might be, use @lrucare.
+ *
+ * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
+ */
+ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
+ bool lrucare)
+ {
+ unsigned int nr_pages = 1;
+
+ VM_BUG_ON_PAGE(!page->mapping, page);
+ VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
+
+ if (mem_cgroup_disabled())
+ return;
+ /*
+ * Swap faults will attempt to charge the same page multiple
+ * times. But reuse_swap_page() might have removed the page
+ * from swapcache already, so we can't check PageSwapCache().
+ */
+ if (!memcg)
+ return;
+
+ if (PageTransHuge(page)) {
+ nr_pages <<= compound_order(page);
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ }
+
+ commit_charge(page, memcg, nr_pages, lrucare);
+
+ if (do_swap_account && PageSwapCache(page)) {
+ swp_entry_t entry = { .val = page_private(page) };
+ /*
+ * The swap entry might not get freed for a long time,
+ * let's not wait for it. The page already received a
+ * memory+swap charge, drop the swap entry duplicate.
+ */
+ mem_cgroup_uncharge_swap(entry);
+ }
+ }
+
+ /**
+ * mem_cgroup_cancel_charge - cancel a page charge
+ * @page: page to charge
+ * @memcg: memcg to charge the page to
+ *
+ * Cancel a charge transaction started by mem_cgroup_try_charge().
+ */
+ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
+ {
+ unsigned int nr_pages = 1;
+
+ if (mem_cgroup_disabled())
+ return;
+ /*
+ * Swap faults will attempt to charge the same page multiple
+ * times. But reuse_swap_page() might have removed the page
+ * from swapcache already, so we can't check PageSwapCache().
+ */
+ if (!memcg)
+ return;
+
+ if (PageTransHuge(page)) {
+ nr_pages <<= compound_order(page);
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ }
+
+ cancel_charge(memcg, nr_pages);
+ }
+
+ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
+ unsigned long nr_mem, unsigned long nr_memsw,
+ unsigned long nr_anon, unsigned long nr_file,
+ unsigned long nr_huge, struct page *dummy_page)
+ {
+ unsigned long flags;
+
+ if (nr_mem)
+ res_counter_uncharge(&memcg->res, nr_mem * PAGE_SIZE);
+ if (nr_memsw)
+ res_counter_uncharge(&memcg->memsw, nr_memsw * PAGE_SIZE);
+
+ memcg_oom_recover(memcg);
+
+ local_irq_save(flags);
+ __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
+ __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
+ __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
+ __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
+ __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file);
+ memcg_check_events(memcg, dummy_page);
+ local_irq_restore(flags);
+ }
+
+ static void uncharge_list(struct list_head *page_list)
+ {
+ struct mem_cgroup *memcg = NULL;
+ unsigned long nr_memsw = 0;
+ unsigned long nr_anon = 0;
+ unsigned long nr_file = 0;
+ unsigned long nr_huge = 0;
+ unsigned long pgpgout = 0;
+ unsigned long nr_mem = 0;
+ struct list_head *next;
+ struct page *page;
+
+ next = page_list->next;
+ do {
+ unsigned int nr_pages = 1;
+ struct page_cgroup *pc;
+
+ page = list_entry(next, struct page, lru);
+ next = page->lru.next;
+
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_PAGE(page_count(page), page);
+
+ pc = lookup_page_cgroup(page);
+ if (!PageCgroupUsed(pc))
+ continue;
+
+ /*
+ * Nobody should be changing or seriously looking at
+ * pc->mem_cgroup and pc->flags at this point, we have
+ * fully exclusive access to the page.
+ */
+
+ if (memcg != pc->mem_cgroup) {
+ if (memcg) {
+ uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw,
+ nr_anon, nr_file, nr_huge, page);
+ pgpgout = nr_mem = nr_memsw = 0;
+ nr_anon = nr_file = nr_huge = 0;
+ }
+ memcg = pc->mem_cgroup;
+ }
+
+ if (PageTransHuge(page)) {
+ nr_pages <<= compound_order(page);
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ nr_huge += nr_pages;
+ }
+
+ if (PageAnon(page))
+ nr_anon += nr_pages;
+ else
+ nr_file += nr_pages;
+
+ if (pc->flags & PCG_MEM)
+ nr_mem += nr_pages;
+ if (pc->flags & PCG_MEMSW)
+ nr_memsw += nr_pages;
+ pc->flags = 0;
+
+ pgpgout++;
+ } while (next != page_list);
+
+ if (memcg)
+ uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw,
+ nr_anon, nr_file, nr_huge, page);
+ }
+
+ /**
+ * mem_cgroup_uncharge - uncharge a page
+ * @page: page to uncharge
+ *
+ * Uncharge a page previously charged with mem_cgroup_try_charge() and
+ * mem_cgroup_commit_charge().
+ */
+ void mem_cgroup_uncharge(struct page *page)
+ {
+ struct page_cgroup *pc;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ /* Don't touch page->lru of any random page, pre-check: */
+ pc = lookup_page_cgroup(page);
+ if (!PageCgroupUsed(pc))
+ return;
+
+ INIT_LIST_HEAD(&page->lru);
+ uncharge_list(&page->lru);
+ }
+
+ /**
+ * mem_cgroup_uncharge_list - uncharge a list of page
+ * @page_list: list of pages to uncharge
+ *
+ * Uncharge a list of pages previously charged with
+ * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
+ */
+ void mem_cgroup_uncharge_list(struct list_head *page_list)
+ {
+ if (mem_cgroup_disabled())
+ return;
+
+ if (!list_empty(page_list))
+ uncharge_list(page_list);
+ }
+
+ /**
+ * mem_cgroup_migrate - migrate a charge to another page
+ * @oldpage: currently charged page
+ * @newpage: page to transfer the charge to
+ * @lrucare: both pages might be on the LRU already
+ *
+ * Migrate the charge from @oldpage to @newpage.
+ *
+ * Both pages must be locked, @newpage->mapping must be set up.
+ */
+ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
+ bool lrucare)
+ {
+ unsigned int nr_pages = 1;
+ struct page_cgroup *pc;
+ int isolated;
+
+ VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
+ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
+ VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage);
+ VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage);
+ VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
+
+ if (mem_cgroup_disabled())
+ return;
+
+ pc = lookup_page_cgroup(oldpage);
+ if (!PageCgroupUsed(pc))
+ return;
+
+ VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage);
+ VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage);
+
+ if (PageTransHuge(oldpage)) {
+ nr_pages <<= compound_order(oldpage);
+ VM_BUG_ON_PAGE(!PageTransHuge(oldpage), oldpage);
+ VM_BUG_ON_PAGE(!PageTransHuge(newpage), newpage);
+ }
+
+ if (lrucare)
+ lock_page_lru(oldpage, &isolated);
+
+ pc->flags = 0;
+
+ if (lrucare)
+ unlock_page_lru(oldpage, isolated);
+
+ local_irq_disable();
+ mem_cgroup_charge_statistics(pc->mem_cgroup, oldpage, -nr_pages);
+ memcg_check_events(pc->mem_cgroup, oldpage);
+ local_irq_enable();
+
+ commit_charge(newpage, pc->mem_cgroup, nr_pages, lrucare);
+ }
+
/*
* subsys_initcall() for memory controller.
*
diff --combined mm/migrate.c
index be6dbf9,7f5a424..327b5c6
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@@ -778,11 -778,14 +778,14 @@@ static int move_to_new_page(struct pag
rc = fallback_migrate_page(mapping, newpage, page, mode);
if (rc != MIGRATEPAGE_SUCCESS) {
- newpage->mapping = NULL;
+ if (!PageAnon(newpage))
+ newpage->mapping = NULL;
} else {
+ mem_cgroup_migrate(page, newpage, false);
if (remap_swapcache)
remove_migration_ptes(page, newpage);
- page->mapping = NULL;
+ if (!PageAnon(page))
+ page->mapping = NULL;
}
unlock_page(newpage);
@@@ -795,7 -798,6 +798,6 @@@ static int __unmap_and_move(struct pag
{
int rc = -EAGAIN;
int remap_swapcache = 1;
- struct mem_cgroup *mem;
struct anon_vma *anon_vma = NULL;
if (!trylock_page(page)) {
@@@ -821,9 -823,6 +823,6 @@@
lock_page(page);
}
- /* charge against new page */
- mem_cgroup_prepare_migration(page, newpage, &mem);
-
if (PageWriteback(page)) {
/*
* Only in the case of a full synchronous migration is it
@@@ -833,10 -832,10 +832,10 @@@
*/
if (mode != MIGRATE_SYNC) {
rc = -EBUSY;
- goto uncharge;
+ goto out_unlock;
}
if (!force)
- goto uncharge;
+ goto out_unlock;
wait_on_page_writeback(page);
}
/*
@@@ -872,7 -871,7 +871,7 @@@
*/
remap_swapcache = 0;
} else {
- goto uncharge;
+ goto out_unlock;
}
}
@@@ -885,7 -884,7 +884,7 @@@
* the page migration right away (proteced by page lock).
*/
rc = balloon_page_migrate(newpage, page, mode);
- goto uncharge;
+ goto out_unlock;
}
/*
@@@ -904,7 -903,7 +903,7 @@@
VM_BUG_ON_PAGE(PageAnon(page), page);
if (page_has_private(page)) {
try_to_free_buffers(page);
- goto uncharge;
+ goto out_unlock;
}
goto skip_unmap;
}
@@@ -923,10 -922,7 +922,7 @@@ skip_unmap
if (anon_vma)
put_anon_vma(anon_vma);
- uncharge:
- mem_cgroup_end_migration(mem, page, newpage,
- (rc == MIGRATEPAGE_SUCCESS ||
- rc == MIGRATEPAGE_BALLOON_SUCCESS));
+ out_unlock:
unlock_page(page);
out:
return rc;
@@@ -988,10 -984,9 +984,10 @@@ out
* it. Otherwise, putback_lru_page() will drop the reference grabbed
* during isolation.
*/
- if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+ if (rc != MIGRATEPAGE_SUCCESS && put_new_page) {
+ ClearPageSwapBacked(newpage);
put_new_page(newpage, private);
- else
+ } else
putback_lru_page(newpage);
if (result) {
@@@ -1786,7 -1781,6 +1782,6 @@@ int migrate_misplaced_transhuge_page(st
pg_data_t *pgdat = NODE_DATA(node);
int isolated = 0;
struct page *new_page = NULL;
- struct mem_cgroup *memcg = NULL;
int page_lru = page_is_file_cache(page);
unsigned long mmun_start = address & HPAGE_PMD_MASK;
unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
@@@ -1852,15 -1846,6 +1847,6 @@@ fail_putback
goto out_unlock;
}
- /*
- * Traditional migration needs to prepare the memcg charge
- * transaction early to prevent the old page from being
- * uncharged when installing migration entries. Here we can
- * save the potential rollback and start the charge transfer
- * only when migration is already known to end successfully.
- */
- mem_cgroup_prepare_migration(page, new_page, &memcg);
-
orig_entry = *pmd;
entry = mk_pmd(new_page, vma->vm_page_prot);
entry = pmd_mkhuge(entry);
@@@ -1888,14 -1873,10 +1874,10 @@@
goto fail_putback;
}
+ mem_cgroup_migrate(page, new_page, false);
+
page_remove_rmap(page);
- /*
- * Finish the charge transaction under the page table lock to
- * prevent split_huge_page() from dividing up the charge
- * before it's fully transferred to the new page.
- */
- mem_cgroup_end_migration(memcg, page, new_page, true);
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
diff --combined mm/shmem.c
index 0f01800,b16d3e7..5909f29
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@@ -149,6 -149,19 +149,19 @@@ static inline void shmem_unacct_size(un
vm_unacct_memory(VM_ACCT(size));
}
+ static inline int shmem_reacct_size(unsigned long flags,
+ loff_t oldsize, loff_t newsize)
+ {
+ if (!(flags & VM_NORESERVE)) {
+ if (VM_ACCT(newsize) > VM_ACCT(oldsize))
+ return security_vm_enough_memory_mm(current->mm,
+ VM_ACCT(newsize) - VM_ACCT(oldsize));
+ else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
+ vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
+ }
+ return 0;
+ }
+
/*
* ... whereas tmpfs objects are accounted incrementally as
* pages are allocated, in order to allow huge sparse files.
@@@ -280,7 -293,7 +293,7 @@@ static bool shmem_confirm_swap(struct a
*/
static int shmem_add_to_page_cache(struct page *page,
struct address_space *mapping,
- pgoff_t index, gfp_t gfp, void *expected)
+ pgoff_t index, void *expected)
{
int error;
@@@ -406,7 -419,6 +419,6 @@@ static void shmem_undo_range(struct ino
pvec.pages, indices);
if (!pvec.nr)
break;
- mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
@@@ -434,7 -446,6 +446,6 @@@
}
pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
- mem_cgroup_uncharge_end();
cond_resched();
index++;
}
@@@ -482,7 -493,6 +493,6 @@@
index = start;
continue;
}
- mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
@@@ -518,7 -528,6 +528,6 @@@
}
pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
- mem_cgroup_uncharge_end();
index++;
}
@@@ -549,6 -558,10 +558,10 @@@ static int shmem_setattr(struct dentry
loff_t newsize = attr->ia_size;
if (newsize != oldsize) {
+ error = shmem_reacct_size(SHMEM_I(inode)->flags,
+ oldsize, newsize);
+ if (error)
+ return error;
i_size_write(inode, newsize);
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
}
@@@ -604,7 -617,7 +617,7 @@@ static int shmem_unuse_inode(struct shm
radswap = swp_to_radix_entry(swap);
index = radix_tree_locate_item(&mapping->page_tree, radswap);
if (index == -1)
- return 0;
+ return -EAGAIN; /* tell shmem_unuse we found nothing */
/*
* Move _head_ to start search for next from here.
@@@ -649,7 -662,7 +662,7 @@@
*/
if (!error)
error = shmem_add_to_page_cache(*pagep, mapping, index,
- GFP_NOWAIT, radswap);
+ radswap);
if (error != -ENOMEM) {
/*
* Truncation and eviction use free_swap_and_cache(), which
@@@ -663,7 -676,6 +676,6 @@@
spin_unlock(&info->lock);
swap_free(swap);
}
- error = 1; /* not an error, but entry was found */
}
return error;
}
@@@ -675,7 -687,7 +687,7 @@@ int shmem_unuse(swp_entry_t swap, struc
{
struct list_head *this, *next;
struct shmem_inode_info *info;
- int found = 0;
+ struct mem_cgroup *memcg;
int error = 0;
/*
@@@ -690,26 -702,32 +702,32 @@@
* the shmem_swaplist_mutex which might hold up shmem_writepage().
* Charged back to the user (not to caller) when swap account is used.
*/
- error = mem_cgroup_charge_file(page, current->mm, GFP_KERNEL);
+ error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg);
if (error)
goto out;
/* No radix_tree_preload: swap entry keeps a place for page in tree */
+ error = -EAGAIN;
mutex_lock(&shmem_swaplist_mutex);
list_for_each_safe(this, next, &shmem_swaplist) {
info = list_entry(this, struct shmem_inode_info, swaplist);
if (info->swapped)
- found = shmem_unuse_inode(info, swap, &page);
+ error = shmem_unuse_inode(info, swap, &page);
else
list_del_init(&info->swaplist);
cond_resched();
- if (found)
+ if (error != -EAGAIN)
break;
+ /* found nothing in this: move on to search the next */
}
mutex_unlock(&shmem_swaplist_mutex);
- if (found < 0)
- error = found;
+ if (error) {
+ if (error != -ENOMEM)
+ error = 0;
+ mem_cgroup_cancel_charge(page, memcg);
+ } else
+ mem_cgroup_commit_charge(page, memcg, true);
out:
unlock_page(page);
page_cache_release(page);
@@@ -813,7 -831,7 +831,7 @@@ static int shmem_writepage(struct page
}
mutex_unlock(&shmem_swaplist_mutex);
- swapcache_free(swap, NULL);
+ swapcache_free(swap);
redirty:
set_page_dirty(page);
if (wbc->for_reclaim)
@@@ -986,7 -1004,7 +1004,7 @@@ static int shmem_replace_page(struct pa
*/
oldpage = newpage;
} else {
- mem_cgroup_replace_page_cache(oldpage, newpage);
+ mem_cgroup_migrate(oldpage, newpage, false);
lru_cache_add_anon(newpage);
*pagep = newpage;
}
@@@ -1013,6 -1031,7 +1031,7 @@@ static int shmem_getpage_gfp(struct ino
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info;
struct shmem_sb_info *sbinfo;
+ struct mem_cgroup *memcg;
struct page *page;
swp_entry_t swap;
int error;
@@@ -1091,11 -1110,10 +1110,10 @@@ repeat
goto failed;
}
- error = mem_cgroup_charge_file(page, current->mm,
- gfp & GFP_RECLAIM_MASK);
+ error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
if (!error) {
error = shmem_add_to_page_cache(page, mapping, index,
- gfp, swp_to_radix_entry(swap));
+ swp_to_radix_entry(swap));
/*
* We already confirmed swap under page lock, and make
* no memory allocation here, so usually no possibility
@@@ -1108,12 -1126,16 +1126,16 @@@
* Reset swap.val? No, leave it so "failed" goes back to
* "repeat": reading a hole and writing should succeed.
*/
- if (error)
+ if (error) {
+ mem_cgroup_cancel_charge(page, memcg);
delete_from_swap_cache(page);
+ }
}
if (error)
goto failed;
+ mem_cgroup_commit_charge(page, memcg, true);
+
spin_lock(&info->lock);
info->swapped--;
shmem_recalc_inode(inode);
@@@ -1149,22 -1171,22 +1171,22 @@@
__SetPageSwapBacked(page);
__set_page_locked(page);
if (sgp == SGP_WRITE)
- init_page_accessed(page);
+ __SetPageReferenced(page);
- error = mem_cgroup_charge_file(page, current->mm,
- gfp & GFP_RECLAIM_MASK);
+ error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
if (error)
goto decused;
error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
if (!error) {
error = shmem_add_to_page_cache(page, mapping, index,
- gfp, NULL);
+ NULL);
radix_tree_preload_end();
}
if (error) {
- mem_cgroup_uncharge_cache_page(page);
+ mem_cgroup_cancel_charge(page, memcg);
goto decused;
}
+ mem_cgroup_commit_charge(page, memcg, false);
lru_cache_add_anon(page);
spin_lock(&info->lock);
@@@ -1289,7 -1311,7 +1311,7 @@@ static int shmem_fault(struct vm_area_s
shmem_falloc_waitq = shmem_falloc->waitq;
prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
- TASK_KILLABLE);
+ TASK_UNINTERRUPTIBLE);
spin_unlock(&inode->i_lock);
schedule();
@@@ -2048,45 -2070,17 +2070,45 @@@ static int shmem_rmdir(struct inode *di
return shmem_unlink(dir, dentry);
}
+static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
+{
+ bool old_is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
+ bool new_is_dir = S_ISDIR(new_dentry->d_inode->i_mode);
+
+ if (old_dir != new_dir && old_is_dir != new_is_dir) {
+ if (old_is_dir) {
+ drop_nlink(old_dir);
+ inc_nlink(new_dir);
+ } else {
+ drop_nlink(new_dir);
+ inc_nlink(old_dir);
+ }
+ }
+ old_dir->i_ctime = old_dir->i_mtime =
+ new_dir->i_ctime = new_dir->i_mtime =
+ old_dentry->d_inode->i_ctime =
+ new_dentry->d_inode->i_ctime = CURRENT_TIME;
+
+ return 0;
+}
+
/*
* The VFS layer already does all the dentry stuff for rename,
* we just have to decrement the usage count for the target if
* it exists so that the VFS layer correctly free's it when it
* gets overwritten.
*/
-static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
+static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags)
{
struct inode *inode = old_dentry->d_inode;
int they_are_dirs = S_ISDIR(inode->i_mode);
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+ return -EINVAL;
+
+ if (flags & RENAME_EXCHANGE)
+ return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry);
+
if (!simple_empty(new_dentry))
return -ENOTEMPTY;
@@@ -2769,7 -2763,7 +2791,7 @@@ static const struct inode_operations sh
.mkdir = shmem_mkdir,
.rmdir = shmem_rmdir,
.mknod = shmem_mknod,
- .rename = shmem_rename,
+ .rename2 = shmem_rename2,
.tmpfile = shmem_tmpfile,
#endif
#ifdef CONFIG_TMPFS_XATTR
@@@ -2960,16 -2954,16 +2982,16 @@@ static struct file *__shmem_file_setup(
this.len = strlen(name);
this.hash = 0; /* will go */
sb = shm_mnt->mnt_sb;
+ path.mnt = mntget(shm_mnt);
path.dentry = d_alloc_pseudo(sb, &this);
if (!path.dentry)
goto put_memory;
d_set_d_op(path.dentry, &anon_ops);
- path.mnt = mntget(shm_mnt);
res = ERR_PTR(-ENOSPC);
inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
if (!inode)
- goto put_dentry;
+ goto put_memory;
inode->i_flags |= i_flags;
d_instantiate(path.dentry, inode);
@@@ -2977,19 -2971,19 +2999,19 @@@
clear_nlink(inode); /* It is unlinked */
res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
if (IS_ERR(res))
- goto put_dentry;
+ goto put_path;
res = alloc_file(&path, FMODE_WRITE | FMODE_READ,
&shmem_file_operations);
if (IS_ERR(res))
- goto put_dentry;
+ goto put_path;
return res;
- put_dentry:
- path_put(&path);
put_memory:
shmem_unacct_size(flags, size);
+ put_path:
+ path_put(&path);
return res;
}
diff --combined mm/slab_common.c
index d31c4ba,8b711f5..d80ec43
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@@ -19,6 -19,8 +19,8 @@@
#include <asm/tlbflush.h>
#include <asm/page.h>
#include <linux/memcontrol.h>
+
+ #define CREATE_TRACE_POINTS
#include <trace/events/kmem.h>
#include "slab.h"
@@@ -55,7 -57,7 +57,7 @@@ static int kmem_cache_sanity_check(cons
continue;
}
-#if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON)
+#if !defined(CONFIG_SLUB)
if (!strcmp(s->name, name)) {
pr_err("%s (%s): Cache name already exists.\n",
__func__, name);
@@@ -264,7 -266,7 +266,7 @@@ EXPORT_SYMBOL(kmem_cache_create)
* memcg_create_kmem_cache - Create a cache for a memory cgroup.
* @memcg: The memory cgroup the new cache is for.
* @root_cache: The parent of the new cache.
- * @memcg_name: The name of the memory cgroup (used for naming the new cache).
+ * @cache_name: The string to be used as the new cache name.
*
* This function attempts to create a kmem cache that will serve allocation
* requests going from @memcg to @root_cache. The new cache inherits properties
@@@ -272,31 -274,25 +274,25 @@@
*/
struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
struct kmem_cache *root_cache,
- const char *memcg_name)
+ char *cache_name)
{
struct kmem_cache *s = NULL;
- char *cache_name;
get_online_cpus();
get_online_mems();
mutex_lock(&slab_mutex);
- cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
- memcg_cache_id(memcg), memcg_name);
- if (!cache_name)
- goto out_unlock;
-
s = do_kmem_cache_create(cache_name, root_cache->object_size,
root_cache->size, root_cache->align,
root_cache->flags, root_cache->ctor,
memcg, root_cache);
- if (IS_ERR(s)) {
- kfree(cache_name);
+ if (!IS_ERR(s))
+ list_add(&s->memcg_params->siblings,
+ &root_cache->memcg_params->children);
+ else
s = NULL;
- }
- out_unlock:
mutex_unlock(&slab_mutex);
put_online_mems();
@@@ -307,17 -303,15 +303,15 @@@
static int memcg_cleanup_cache_params(struct kmem_cache *s)
{
- int rc;
-
if (!s->memcg_params ||
!s->memcg_params->is_root_cache)
return 0;
mutex_unlock(&slab_mutex);
- rc = __memcg_cleanup_cache_params(s);
+ __memcg_cleanup_cache_params(s);
mutex_lock(&slab_mutex);
- return rc;
+ return !list_empty(&s->memcg_params->children);
}
#else
static int memcg_cleanup_cache_params(struct kmem_cache *s)
@@@ -354,6 -348,10 +348,10 @@@ void kmem_cache_destroy(struct kmem_cac
}
list_del(&s->list);
+ #ifdef CONFIG_MEMCG_KMEM
+ if (!is_root_cache(s))
+ list_del(&s->memcg_params->siblings);
+ #endif
mutex_unlock(&slab_mutex);
if (s->flags & SLAB_DESTROY_BY_RCU)
@@@ -692,20 -690,17 +690,17 @@@ void slab_stop(struct seq_file *m, voi
static void
memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
{
- struct kmem_cache *c;
+ #ifdef CONFIG_MEMCG_KMEM
+ struct memcg_cache_params *params;
struct slabinfo sinfo;
- int i;
- if (!is_root_cache(s))
+ if (!s->memcg_params ||
+ !s->memcg_params->is_root_cache)
return;
- for_each_memcg_cache_index(i) {
- c = cache_from_memcg_idx(s, i);
- if (!c)
- continue;
-
+ list_for_each_entry(params, &s->memcg_params->children, siblings) {
memset(&sinfo, 0, sizeof(sinfo));
- get_slabinfo(c, &sinfo);
+ get_slabinfo(params->cachep, &sinfo);
info->active_slabs += sinfo.active_slabs;
info->num_slabs += sinfo.num_slabs;
@@@ -713,6 -708,7 +708,7 @@@
info->active_objs += sinfo.active_objs;
info->num_objs += sinfo.num_objs;
}
+ #endif
}
int cache_show(struct kmem_cache *s, struct seq_file *m)
@@@ -787,3 -783,102 +783,102 @@@ static int __init slab_proc_init(void
}
module_init(slab_proc_init);
#endif /* CONFIG_SLABINFO */
+
+ static __always_inline void *__do_krealloc(const void *p, size_t new_size,
+ gfp_t flags)
+ {
+ void *ret;
+ size_t ks = 0;
+
+ if (p)
+ ks = ksize(p);
+
+ if (ks >= new_size)
+ return (void *)p;
+
+ ret = kmalloc_track_caller(new_size, flags);
+ if (ret && p)
+ memcpy(ret, p, ks);
+
+ return ret;
+ }
+
+ /**
+ * __krealloc - like krealloc() but don't free @p.
+ * @p: object to reallocate memory for.
+ * @new_size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate.
+ *
+ * This function is like krealloc() except it never frees the originally
+ * allocated buffer. Use this if you don't want to free the buffer immediately
+ * like, for example, with RCU.
+ */
+ void *__krealloc(const void *p, size_t new_size, gfp_t flags)
+ {
+ if (unlikely(!new_size))
+ return ZERO_SIZE_PTR;
+
+ return __do_krealloc(p, new_size, flags);
+
+ }
+ EXPORT_SYMBOL(__krealloc);
+
+ /**
+ * krealloc - reallocate memory. The contents will remain unchanged.
+ * @p: object to reallocate memory for.
+ * @new_size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate.
+ *
+ * The contents of the object pointed to are preserved up to the
+ * lesser of the new and old sizes. If @p is %NULL, krealloc()
+ * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
+ * %NULL pointer, the object pointed to is freed.
+ */
+ void *krealloc(const void *p, size_t new_size, gfp_t flags)
+ {
+ void *ret;
+
+ if (unlikely(!new_size)) {
+ kfree(p);
+ return ZERO_SIZE_PTR;
+ }
+
+ ret = __do_krealloc(p, new_size, flags);
+ if (ret && p != ret)
+ kfree(p);
+
+ return ret;
+ }
+ EXPORT_SYMBOL(krealloc);
+
+ /**
+ * kzfree - like kfree but zero memory
+ * @p: object to free memory of
+ *
+ * The memory of the object @p points to is zeroed before freed.
+ * If @p is %NULL, kzfree() does nothing.
+ *
+ * Note: this function zeroes the whole allocated buffer which can be a good
+ * deal bigger than the requested buffer size passed to kmalloc(). So be
+ * careful when using this function in performance sensitive code.
+ */
+ void kzfree(const void *p)
+ {
+ size_t ks;
+ void *mem = (void *)p;
+
+ if (unlikely(ZERO_OR_NULL_PTR(mem)))
+ return;
+ ks = ksize(mem);
+ memset(mem, 0, ks);
+ kfree(mem);
+ }
+ EXPORT_SYMBOL(kzfree);
+
+ /* Tracepoints definitions. */
+ EXPORT_TRACEPOINT_SYMBOL(kmalloc);
+ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
+ EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
+ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node);
+ EXPORT_TRACEPOINT_SYMBOL(kfree);
+ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
diff --combined net/bridge/br_multicast.c
index b4845f4,d9c4f57..7751c92
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@@ -1174,7 -1174,7 +1174,7 @@@ static void br_multicast_add_router(str
}
if (slot)
- hlist_add_after_rcu(slot, &port->rlist);
+ hlist_add_behind_rcu(&port->rlist, slot);
else
hlist_add_head_rcu(&port->rlist, &br->router_list);
}
@@@ -2216,43 -2216,6 +2216,43 @@@ unlock
EXPORT_SYMBOL_GPL(br_multicast_list_adjacent);
/**
+ * br_multicast_has_querier_anywhere - Checks for a querier on a bridge
+ * @dev: The bridge port providing the bridge on which to check for a querier
+ * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6
+ *
+ * Checks whether the given interface has a bridge on top and if so returns
+ * true if a valid querier exists anywhere on the bridged link layer.
+ * Otherwise returns false.
+ */
+bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto)
+{
+ struct net_bridge *br;
+ struct net_bridge_port *port;
+ struct ethhdr eth;
+ bool ret = false;
+
+ rcu_read_lock();
+ if (!br_port_exists(dev))
+ goto unlock;
+
+ port = br_port_get_rcu(dev);
+ if (!port || !port->br)
+ goto unlock;
+
+ br = port->br;
+
+ memset(&eth, 0, sizeof(eth));
+ eth.h_proto = htons(proto);
+
+ ret = br_multicast_querier_exists(br, &eth);
+
+unlock:
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(br_multicast_has_querier_anywhere);
+
+/**
* br_multicast_has_querier_adjacent - Checks for a querier behind a bridge port
* @dev: The bridge port adjacent to which to check for a querier
* @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6
diff --combined net/xfrm/xfrm_policy.c
index 0525d78,92cb08d..beeed60
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@@ -389,7 -389,7 +389,7 @@@ redo
if (h != h0)
continue;
hlist_del(&pol->bydst);
- hlist_add_after(entry0, &pol->bydst);
+ hlist_add_behind(&pol->bydst, entry0);
}
entry0 = &pol->bydst;
}
@@@ -654,7 -654,7 +654,7 @@@ int xfrm_policy_insert(int dir, struct
break;
}
if (newpos)
- hlist_add_after(newpos, &policy->bydst);
+ hlist_add_behind(&policy->bydst, newpos);
else
hlist_add_head(&policy->bydst, chain);
xfrm_pol_hold(policy);
@@@ -2097,8 -2097,6 +2097,8 @@@ struct dst_entry *xfrm_lookup(struct ne
goto no_transform;
}
+ dst_hold(&xdst->u.dst);
+ xdst->u.dst.flags |= DST_NOCACHE;
route = xdst->route;
}
}
--
LinuxNextTracking