The following commit has been merged in the master branch:
commit db835e5c572c05a44a289832f2adceca384bbfeb
Merge: 831a89659673adee7e0558eef2ae5ac88bbb7d14 76e1488440013a0d737fbb9d1f8efe226138f7f0
Author: Stephen Rothwell <sfr@canb.auug.org.au>
Date:   Tue Jul 29 20:12:13 2014 +1000
Merge branch 'akpm-current/current'
Conflicts:
	arch/arm64/Kconfig
	mm/shmem.c
diff --combined Documentation/devicetree/bindings/i2c/trivial-devices.txt index 37803eb,c75046a..6af570e --- a/Documentation/devicetree/bindings/i2c/trivial-devices.txt +++ b/Documentation/devicetree/bindings/i2c/trivial-devices.txt @@@ -50,7 -50,6 +50,7 @@@ epson,rx8581 I2C-BUS INTERFACE REAL TI fsl,mag3110 MAG3110: Xtrinsic High Accuracy, 3D Magnetometer fsl,mc13892 MC13892: Power Management Integrated Circuit (PMIC) for i.MX35/51 fsl,mma8450 MMA8450Q: Xtrinsic Low-power, 3-axis Xtrinsic Accelerometer +fsl,mma8452 MMA8452Q: 3-axis 12-bit / 8-bit Digital Accelerometer fsl,mpr121 MPR121: Proximity Capacitive Touch Sensor Controller fsl,sgtl5000 SGTL5000: Ultra Low-Power Audio Codec gmt,g751 G751: Digital Temperature Sensor and Thermal Watchdog with Two-Wire Interface @@@ -70,6 -69,7 +70,7 @@@ nuvoton,npct501 i2c trusted platform m nxp,pca9556 Octal SMBus and I2C registered interface nxp,pca9557 8-bit I2C-bus and SMBus I/O port with reset nxp,pcf8563 Real-time clock/calendar + nxp,pcf85063 Tiny Real-Time Clock ovti,ov5642 OV5642: Color CMOS QSXGA (5-megapixel) Image Sensor with OmniBSI and Embedded TrueFocus pericom,pt7c4338 Real-time Clock Module plx,pex8648 48-Lane, 12-Port PCI Express Gen 2 (5.0 GT/s) Switch @@@ -84,6 -84,5 +85,6 @@@ stm,m41t80 M41T80 - SERIAL ACCESS RTC taos,tsl2550 Ambient Light Sensor with SMBUS/Two Wire Serial Interface ti,tsc2003 I2C Touch-Screen Controller ti,tmp102 Low Power Digital Temperature Sensor with SMBUS/Two Wire Serial Interface +ti,tmp103 Low Power Digital Temperature Sensor with SMBUS/Two Wire Serial Interface ti,tmp275 Digital Temperature Sensor winbond,wpct301 i2c trusted platform module (TPM) diff --combined Documentation/kernel-parameters.txt index d2fc335,6824f37..f1d8047 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@@ -566,11 -566,6 +566,11 @@@ bytes respectively. Such letter suffixe possible to determine what the correct size should be. This option provides an override for these situations.
+ ca_keys= [KEYS] This parameter identifies a specific key(s) on + the system trusted keyring to be used for certificate + trust validation. + format: { id:<keyid> | builtin } + ccw_timeout_log [S390] See Documentation/s390/CommonIO for details.
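For illustration, the new ca_keys= entry above would appear on the kernel command line roughly as follows (the key ID is a placeholder for this sketch, not a real key on the trusted keyring):

	ca_keys=id:0x1234abcd
	ca_keys=builtin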
@@@ -1102,12 -1097,6 +1102,12 @@@ that can be changed at run time by the set_graph_function file in the debugfs tracing directory.
+ ftrace_graph_notrace=[function-list] + [FTRACE] Do not trace from the functions specified in + function-list. This list is a comma separated list of + functions that can be changed at run time by the + set_graph_notrace file in the debugfs tracing directory. + gamecon.map[2|3]= [HW,JOY] Multisystem joystick and NES/SNES/PSX pad support via parallel port (up to 5 devices per port) @@@ -1324,23 -1313,6 +1324,23 @@@ Formats: { "ima" | "ima-ng" } Default: "ima-ng"
+ ima.ahash_minsize= [IMA] Minimum file size for asynchronous hash usage + Format: <min_file_size> + Set the minimal file size for using asynchronous hash. + If left unspecified, ahash usage is disabled. + + ahash performance varies for different data sizes on + different crypto accelerators. This option can be used + to achieve the best performance for a particular HW. + + ima.ahash_bufsize= [IMA] Asynchronous hash buffer size + Format: <bufsize> + Set hashing buffer size. Default: 4k. + + ahash performance varies for different chunk sizes on + different crypto accelerators. This option can be used + to achieve best performance for particular HW. + init= [KNL] Format: <full_path> Run specified binary instead of /sbin/init as init @@@ -1444,6 -1416,10 +1444,6 @@@ ip= [IP_PNP] See Documentation/filesystems/nfs/nfsroot.txt.
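As a usage sketch for the two ima.ahash_* options documented above (the numbers are illustrative byte counts, not recommendations), enabling asynchronous hashing for files of 256 KiB and larger with a 32 KiB hashing buffer might look like:

	ima.ahash_minsize=262144 ima.ahash_bufsize=32768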
- ip2= [HW] Set IO/IRQ pairs for up to 4 IntelliPort boards - See comment before ip2_setup() in - drivers/char/ip2/ip2base.c. - irqfixup [HW] When an interrupt is not handled search all handlers for it. Intended to get systems with badly broken @@@ -1716,8 -1692,12 +1716,12 @@@ 7 (KERN_DEBUG) debug-level messages
log_buf_len=n[KMG] Sets the size of the printk ring buffer, - in bytes. n must be a power of two. The default - size is set in the kernel config file. + in bytes. n must be a power of two and greater + than the minimal size. The minimal size is defined + by LOG_BUF_SHIFT kernel config parameter. There is + also CONFIG_LOG_CPU_MAX_BUF_SHIFT config parameter + that allows to increase the default size depending on + the number of CPUs. See init/Kconfig for more details.
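A hedged example of the log_buf_len= override described above, using the [KMG] suffix and respecting the power-of-two requirement (a 4 MiB ring buffer):

	log_buf_len=4M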
logo.nologo [FB] Disables display of the built-in Linux logo. This may be used to provide more screen space for @@@ -2190,21 -2170,6 +2194,21 @@@ and restore using xsave. The kernel will fallback to enabling legacy floating-point and sse state.
+ noxsaveopt [X86] Disables xsaveopt used in saving x86 extended + register states. The kernel will fall back to use + xsave to save the states. By using this parameter, + performance of saving the states is degraded because + xsave doesn't support modified optimization while + xsaveopt supports it on xsaveopt enabled systems. + + noxsaves [X86] Disables xsaves and xrstors used in saving and + restoring x86 extended register state in compacted + form of xsave area. The kernel will fall back to use + xsaveopt and xrstor to save and restore the states + in standard form of xsave area. By using this + parameter, xsave area per process might occupy more + memory on xsaves enabled systems. + eagerfpu= [X86] on enable eager fpu restore off disable eager fpu restore @@@ -2846,13 -2811,6 +2850,13 @@@ quiescent states. Units are jiffies, minimum value is one, and maximum value is HZ.
+ rcutree.rcu_nocb_leader_stride= [KNL] + Set the number of NOCB kthread groups, which + defaults to the square root of the number of + CPUs. Larger numbers reduces the wakeup overhead + on the per-CPU grace-period kthreads, but increases + that same overhead on each group's leader. + rcutree.qhimark= [KNL] Set threshold of queued RCU callbacks beyond which batch limiting is disabled. @@@ -3069,13 -3027,6 +3073,13 @@@
S [KNL] Run init in single mode
+ s390_iommu= [HW,S390] + Set s390 IOTLB flushing mode + strict + With strict flushing every unmap operation will result in + an IOTLB flush. Default is lazy flushing before reuse, + which is faster. + sa1100ir [NET] See drivers/net/irda/sa1100_ir.c.
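As a usage sketch for the s390_iommu= option documented above ("strict" is the only documented value; lazy flushing before reuse remains the default when the option is omitted):

	s390_iommu=strict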
@@@ -3750,10 -3701,6 +3754,10 @@@ Disables the ticketlock slowpath using Xen PV optimizations.
+ xen_nopv [X86] + Disables the PV optimizations forcing the HVM guest to + run as generic HVM guest with no PV drivers. + xirc2ps_cs= [NET,PCMCIA] Format: <irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]] diff --combined Makefile index 5def5e8,a4b34fe..c837e9a --- a/Makefile +++ b/Makefile @@@ -1,7 -1,7 +1,7 @@@ VERSION = 3 PATCHLEVEL = 16 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = -rc7 NAME = Shuffling Zombie Juror
# *DOCUMENTATION* @@@ -360,14 -360,9 +360,14 @@@ include $(srctree)/scripts/Kbuild.inclu # Make variables (CC, etc...) AS = $(CROSS_COMPILE)as LD = $(CROSS_COMPILE)ld +LDFINAL = $(LD) CC = $(CROSS_COMPILE)gcc CPP = $(CC) -E +ifdef CONFIG_LTO +AR = $(CROSS_COMPILE)gcc-ar +else AR = $(CROSS_COMPILE)ar +endif NM = $(CROSS_COMPILE)nm STRIP = $(CROSS_COMPILE)strip OBJCOPY = $(CROSS_COMPILE)objcopy @@@ -377,7 -372,6 +377,7 @@@ GENKSYMS = scripts/genksyms/genksym INSTALLKERNEL := installkernel DEPMOD = /sbin/depmod PERL = perl +PYTHON = python CHECK = sparse
CHECKFLAGS := -D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ \ @@@ -427,8 -421,8 +427,8 @@@ KERNELVERSION = $(VERSION)$(if $(PATCHL
export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION export ARCH SRCARCH CONFIG_SHELL HOSTCC HOSTCFLAGS CROSS_COMPILE AS LD CC -export CPP AR NM STRIP OBJCOPY OBJDUMP -export MAKE AWK GENKSYMS INSTALLKERNEL PERL UTS_MACHINE +export CPP AR NM STRIP OBJCOPY OBJDUMP LDFINAL +export MAKE AWK GENKSYMS INSTALLKERNEL PERL PYTHON UTS_MACHINE export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS
export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS @@@ -438,17 -432,6 +438,17 @@@ export KBUILD_AFLAGS_MODULE KBUILD_CFLA export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL export KBUILD_ARFLAGS
+ifdef CONFIG_LTO +# LTO gcc creates a lot of files in TMPDIR, and with /tmp as tmpfs +# it's easy to drive the machine OOM. Use the object directory +# instead. +ifndef TMPDIR +TMPDIR ?= $(objtree) +export TMPDIR +$(info setting TMPDIR=$(objtree) for LTO build) +endif +endif + # When compiling out-of-tree modules, put MODVERDIR in the module # tree rather than in the kernel tree. The kernel tree might # even be read-only. @@@ -638,6 -621,9 +638,9 @@@ els KBUILD_CFLAGS += -O2 endif
+ # Tell gcc to never replace conditional load with a non-conditional one + KBUILD_CFLAGS += $(call cc-option,--param=allow-store-data-races=0) + ifdef CONFIG_READABLE_ASM # Disable optimizations that make assembler listings hard to read. # reorder blocks reorders the control in the function @@@ -653,6 -639,22 +656,22 @@@ KBUILD_CFLAGS += $(call cc-option,-Wfra endif
# Handle stack protector mode. + # + # Since kbuild can potentially perform two passes (first with the old + # .config values and then with updated .config values), we cannot error out + # if a desired compiler option is unsupported. If we were to error, kbuild + # could never get to the second pass and actually notice that we changed + # the option to something that was supported. + # + # Additionally, we don't want to fallback and/or silently change which compiler + # flags will be used, since that leads to producing kernels with different + # security feature characteristics depending on the compiler used. ("But I + # selected CC_STACKPROTECTOR_STRONG! Why did it build with _REGULAR?!") + # + # The middle ground is to warn here so that the failed option is obvious, but + # to let the build fail with bad compiler flags so that we can't produce a + # kernel when there is a CONFIG and compiler mismatch. + # ifdef CONFIG_CC_STACKPROTECTOR_REGULAR stackp-flag := -fstack-protector ifeq ($(call cc-option, $(stackp-flag)),) @@@ -705,8 -707,6 +724,8 @@@ KBUILD_CFLAGS += -fomit-frame-pointe endif endif
+KBUILD_CFLAGS += $(call cc-option, -fno-var-tracking-assignments) + ifdef CONFIG_DEBUG_INFO KBUILD_CFLAGS += -g KBUILD_AFLAGS += -Wa,-gdwarf-2 @@@ -770,7 -770,6 +789,7 @@@ ifeq ($(shell $(CONFIG_SHELL) $(srctree endif
include $(srctree)/scripts/Makefile.extrawarn +include ${srctree}/scripts/Makefile.lto
# Add user supplied CPPFLAGS, AFLAGS and CFLAGS as the last assignments KBUILD_CPPFLAGS += $(KCPPFLAGS) @@@ -1240,9 -1239,9 +1259,9 @@@ help @echo ' tags/TAGS - Generate tags file for editors' @echo ' cscope - Generate cscope index' @echo ' gtags - Generate GNU GLOBAL index' - @echo ' kernelrelease - Output the release version string' - @echo ' kernelversion - Output the version stored in Makefile' - @echo ' image_name - Output the image name' + @echo ' kernelrelease - Output the release version string (use with make -s)' + @echo ' kernelversion - Output the version stored in Makefile (use with make -s)' + @echo ' image_name - Output the image name (use with make -s)' @echo ' headers_install - Install sanitised kernel headers to INSTALL_HDR_PATH'; \ echo ' (default: $(INSTALL_HDR_PATH))'; \ echo '' diff --combined arch/arm/Kconfig index 05e2e94,551e526..9c45bc5 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@@ -84,6 -84,7 +84,7 @@@ config AR http://www.arm.linux.org.uk/.
config ARM_HAS_SG_CHAIN + select ARCH_HAS_SG_CHAIN bool
config NEED_SG_DMA_LENGTH @@@ -240,6 -241,13 +241,6 @@@ config ARM_PATCH_PHYS_VIR this feature (eg, building a kernel for a single machine) and you need to shrink the kernel to the minimal size.
-config NEED_MACH_GPIO_H - bool - help - Select this when mach/gpio.h is required to provide special - definitions for this platform. The need for mach/gpio.h should - be avoided when possible. - config NEED_MACH_IO_H bool help @@@ -256,22 -264,8 +257,22 @@@ config NEED_MACH_MEMORY_
config PHYS_OFFSET hex "Physical address of main memory" if MMU - depends on !ARM_PATCH_PHYS_VIRT && !NEED_MACH_MEMORY_H + depends on !ARM_PATCH_PHYS_VIRT default DRAM_BASE if !MMU + default 0x00000000 if ARCH_EBSA110 || \ + EP93XX_SDCE3_SYNC_PHYS_OFFSET || \ + ARCH_FOOTBRIDGE || \ + ARCH_INTEGRATOR || \ + ARCH_IOP13XX || \ + ARCH_KS8695 || \ + (ARCH_REALVIEW && !REALVIEW_HIGH_PHYS_OFFSET) + default 0x10000000 if ARCH_OMAP1 || ARCH_RPC + default 0x20000000 if ARCH_S5PV210 + default 0x70000000 if REALVIEW_HIGH_PHYS_OFFSET + default 0xc0000000 if EP93XX_SDCE0_PHYS_OFFSET || ARCH_SA1100 + default 0xd0000000 if EP93XX_SDCE1_PHYS_OFFSET + default 0xe0000000 if EP93XX_SDCE2_PHYS_OFFSET + default 0xf0000000 if EP93XX_SDCE3_ASYNC_PHYS_OFFSET help Please provide the physical address corresponding to the location of main memory in your system. @@@ -320,7 -314,7 +321,7 @@@ config ARCH_MULTIPLATFOR config ARCH_INTEGRATOR bool "ARM Ltd. Integrator family" select ARM_AMBA - select ARM_PATCH_PHYS_VIRT + select ARM_PATCH_PHYS_VIRT if MMU select AUTO_ZRELADDR select COMMON_CLK select COMMON_CLK_VERSATILE @@@ -328,6 -322,7 +329,6 @@@ select HAVE_TCM select ICST select MULTI_IRQ_HANDLER - select NEED_MACH_MEMORY_H select PLAT_VERSATILE select SPARSE_IRQ select USE_OF @@@ -347,6 -342,7 +348,6 @@@ config ARCH_REALVIE select ICST select NEED_MACH_MEMORY_H select PLAT_VERSATILE - select PLAT_VERSATILE_CLCD help This enables support for ARM Ltd RealView boards.
@@@ -361,6 -357,7 +362,6 @@@ config ARCH_VERSATIL select HAVE_MACH_CLKDEV select ICST select PLAT_VERSATILE - select PLAT_VERSATILE_CLCD select PLAT_VERSATILE_CLOCK select VERSATILE_FPGA_IRQ help @@@ -440,6 -437,7 +441,6 @@@ config ARCH_EP93X select ARM_VIC select CLKDEV_LOOKUP select CPU_ARM920T - select NEED_MACH_MEMORY_H help This enables support for the Cirrus EP93xx series of CPUs.
@@@ -532,6 -530,21 +533,6 @@@ config ARCH_DOV help Support for the Marvell Dove SoC 88AP510
-config ARCH_KIRKWOOD - bool "Marvell Kirkwood" - select ARCH_REQUIRE_GPIOLIB - select CPU_FEROCEON - select GENERIC_CLOCKEVENTS - select MVEBU_MBUS - select PCI - select PCI_QUIRKS - select PINCTRL - select PINCTRL_KIRKWOOD - select PLAT_ORION_LEGACY - help - Support for the following Marvell Kirkwood series SoCs: - 88F6180, 88F6192 and 88F6281. - config ARCH_MV78XX0 bool "Marvell MV78xx0" select ARCH_REQUIRE_GPIOLIB @@@ -623,7 -636,6 +624,7 @@@ config ARCH_PX select AUTO_ZRELADDR select CLKDEV_LOOKUP select CLKSRC_MMIO + select CLKSRC_OF select GENERIC_CLOCKEVENTS select GPIO_PXA select HAVE_IDE @@@ -648,7 -660,7 +649,7 @@@ config ARCH_MS config ARCH_SHMOBILE_LEGACY bool "Renesas ARM SoCs (non-multiplatform)" select ARCH_SHMOBILE - select ARM_PATCH_PHYS_VIRT + select ARM_PATCH_PHYS_VIRT if MMU select CLKDEV_LOOKUP select GENERIC_CLOCKEVENTS select HAVE_ARM_SCU if SMP @@@ -748,6 -760,61 +749,6 @@@ config ARCH_S3C64X help Samsung S3C64XX series based systems
-config ARCH_S5P64X0 - bool "Samsung S5P6440 S5P6450" - select ATAGS - select CLKDEV_LOOKUP - select CLKSRC_SAMSUNG_PWM - select CPU_V6 - select GENERIC_CLOCKEVENTS - select GPIO_SAMSUNG - select HAVE_S3C2410_I2C if I2C - select HAVE_S3C2410_WATCHDOG if WATCHDOG - select HAVE_S3C_RTC if RTC_CLASS - select NEED_MACH_GPIO_H - select SAMSUNG_ATAGS - select SAMSUNG_WDT_RESET - help - Samsung S5P64X0 CPU based systems, such as the Samsung SMDK6440, - SMDK6450. - -config ARCH_S5PC100 - bool "Samsung S5PC100" - select ARCH_REQUIRE_GPIOLIB - select ATAGS - select CLKDEV_LOOKUP - select CLKSRC_SAMSUNG_PWM - select CPU_V7 - select GENERIC_CLOCKEVENTS - select GPIO_SAMSUNG - select HAVE_S3C2410_I2C if I2C - select HAVE_S3C2410_WATCHDOG if WATCHDOG - select HAVE_S3C_RTC if RTC_CLASS - select NEED_MACH_GPIO_H - select SAMSUNG_ATAGS - select SAMSUNG_WDT_RESET - help - Samsung S5PC100 series based systems - -config ARCH_S5PV210 - bool "Samsung S5PV210/S5PC110" - select ARCH_HAS_HOLES_MEMORYMODEL - select ARCH_SPARSEMEM_ENABLE - select ATAGS - select CLKDEV_LOOKUP - select CLKSRC_SAMSUNG_PWM - select CPU_V7 - select GENERIC_CLOCKEVENTS - select GPIO_SAMSUNG - select HAVE_S3C2410_I2C if I2C - select HAVE_S3C2410_WATCHDOG if WATCHDOG - select HAVE_S3C_RTC if RTC_CLASS - select NEED_MACH_GPIO_H - select NEED_MACH_MEMORY_H - select SAMSUNG_ATAGS - help - Samsung S5PV210/S5PC110 series based systems - config ARCH_DAVINCI bool "TI DaVinci" select ARCH_HAS_HOLES_MEMORYMODEL @@@ -886,6 -953,8 +887,6 @@@ source "arch/arm/mach-ixp4xx/Kconfig
source "arch/arm/mach-keystone/Kconfig"
-source "arch/arm/mach-kirkwood/Kconfig" - source "arch/arm/mach-ks8695/Kconfig"
source "arch/arm/mach-msm/Kconfig" @@@ -896,8 -965,6 +897,8 @@@ source "arch/arm/mach-mv78xx0/Kconfig
source "arch/arm/mach-imx/Kconfig"
+source "arch/arm/mach-mediatek/Kconfig" + source "arch/arm/mach-mxs/Kconfig"
source "arch/arm/mach-netx/Kconfig" @@@ -939,6 -1006,10 +940,6 @@@ source "arch/arm/mach-s3c24xx/Kconfig
source "arch/arm/mach-s3c64xx/Kconfig"
-source "arch/arm/mach-s5p64x0/Kconfig" - -source "arch/arm/mach-s5pc100/Kconfig" - source "arch/arm/mach-s5pv210/Kconfig"
source "arch/arm/mach-exynos/Kconfig" @@@ -1485,12 -1556,10 +1486,12 @@@ config ARM_PSC config ARCH_NR_GPIO int default 1024 if ARCH_SHMOBILE || ARCH_TEGRA - default 512 if ARCH_EXYNOS || ARCH_KEYSTONE || SOC_OMAP5 || SOC_DRA7XX || ARCH_S3C24XX || ARCH_S3C64XX + default 512 if ARCH_EXYNOS || ARCH_KEYSTONE || SOC_OMAP5 || \ + SOC_DRA7XX || ARCH_S3C24XX || ARCH_S3C64XX || ARCH_S5PV210 default 416 if ARCH_SUNXI default 392 if ARCH_U8500 default 352 if ARCH_VT8500 + default 288 if ARCH_ROCKCHIP default 264 if MACH_H4700 default 0 help @@@ -1502,7 -1571,7 +1503,7 @@@ source kernel/Kconfig.preemp
config HZ_FIXED int - default 200 if ARCH_EBSA110 || ARCH_S3C24XX || ARCH_S5P64X0 || \ + default 200 if ARCH_EBSA110 || ARCH_S3C24XX || \ ARCH_S5PV210 || ARCH_EXYNOS4 default AT91_TIMER_HZ if ARCH_AT91 default SHMOBILE_TIMER_HZ if ARCH_SHMOBILE_LEGACY @@@ -2127,6 -2196,7 +2128,6 @@@ menu "Power management options source "kernel/power/Kconfig"
config ARCH_SUSPEND_POSSIBLE - depends on !ARCH_S5PC100 depends on CPU_ARM920T || CPU_ARM926T || CPU_FEROCEON || CPU_SA1100 || \ CPU_V6 || CPU_V6K || CPU_V7 || CPU_V7M || CPU_XSC3 || CPU_XSCALE || CPU_MOHAWK def_bool y diff --combined arch/arm/mm/dma-mapping.c index 1f88db0,3116880..7a996aa --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@@ -26,6 -26,7 +26,7 @@@ #include <linux/io.h> #include <linux/vmalloc.h> #include <linux/sizes.h> + #include <linux/cma.h>
#include <asm/memory.h> #include <asm/highmem.h> @@@ -461,21 -462,12 +462,21 @@@ void __init dma_contiguous_remap(void map.type = MT_MEMORY_DMA_READY;
/* - * Clear previous low-memory mapping + * Clear previous low-memory mapping to ensure that the + * TLB does not see any conflicting entries, then flush + * the TLB of the old entries before creating new mappings. + * + * This ensures that any speculatively loaded TLB entries + * (even though they may be rare) can not cause any problems, + * and ensures that this code is architecturally compliant. */ for (addr = __phys_to_virt(start); addr < __phys_to_virt(end); addr += PMD_SIZE) pmd_clear(pmd_off_k(addr));
+ flush_tlb_kernel_range(__phys_to_virt(start), + __phys_to_virt(end)); + iotable_init(&map, 1); } } diff --combined arch/arm64/Kconfig index 555ad3c,7bc7b74..4e40949 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@@ -1,6 -1,8 +1,7 @@@ config ARM64 def_bool y select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE - select ARCH_HAS_OPP + select ARCH_HAS_SG_CHAIN select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_USE_CMPXCHG_LOCKREF select ARCH_SUPPORTS_ATOMIC_RMW @@@ -10,9 -12,6 +11,9 @@@ select ARM_AMBA select ARM_ARCH_TIMER select ARM_GIC + select ARM_GIC_V2M if (PCI && PCI_MSI) + select ARM_GIC_V3 + select AUDIT_ARCH_COMPAT_GENERIC select BUILDTIME_EXTABLE_SORT select CLONE_BACKWARDS select COMMON_CLK @@@ -31,12 -30,10 +32,12 @@@ select GENERIC_STRNLEN_USER select GENERIC_TIME_VSYSCALL select HARDIRQS_SW_RESEND + select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK select HAVE_C_RECORDMCOUNT + select HAVE_CC_STACKPROTECTOR select HAVE_DEBUG_BUGVERBOSE select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_API_DEBUG @@@ -67,7 -64,6 +68,7 @@@ select RTC_LIB select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE + select HAVE_CONTEXT_TRACKING help ARM 64-bit (AArch64) Linux support.
@@@ -160,63 -156,14 +161,63 @@@ endmen
menu "Kernel Features"
+choice + prompt "Page size" + default ARM64_4K_PAGES + help + Page size (translation granule) configuration. + +config ARM64_4K_PAGES + bool "4KB" + help + This feature enables 4KB pages support. + config ARM64_64K_PAGES - bool "Enable 64KB pages support" + bool "64KB" help This feature enables 64KB pages support (4KB by default) allowing only two levels of page tables and faster TLB look-up. AArch32 emulation is not available when this feature is enabled.
+endchoice + +choice + prompt "Virtual address space size" + default ARM64_VA_BITS_39 if ARM64_4K_PAGES + default ARM64_VA_BITS_42 if ARM64_64K_PAGES + help + Allows choosing one of multiple possible virtual address + space sizes. The level of translation table is determined by + a combination of page size and virtual address space size. + +config ARM64_VA_BITS_39 + bool "39-bit" + depends on ARM64_4K_PAGES + +config ARM64_VA_BITS_42 + bool "42-bit" + depends on ARM64_64K_PAGES + +config ARM64_VA_BITS_48 + bool "48-bit" + depends on BROKEN + +endchoice + +config ARM64_VA_BITS + int + default 39 if ARM64_VA_BITS_39 + default 42 if ARM64_VA_BITS_42 + default 48 if ARM64_VA_BITS_48 + +config ARM64_PGTABLE_LEVELS + int + default 2 if ARM64_64K_PAGES && ARM64_VA_BITS_42 + default 3 if ARM64_64K_PAGES && ARM64_VA_BITS_48 + default 3 if ARM64_4K_PAGES && ARM64_VA_BITS_39 + default 4 if ARM64_4K_PAGES && ARM64_VA_BITS_48 + config CPU_BIG_ENDIAN bool "Build big-endian kernel" help @@@ -362,17 -309,6 +363,17 @@@ config EF allow the kernel to be booted as an EFI application. This is only useful on systems that have UEFI firmware.
+config DMI + bool "Enable support for SMBIOS (DMI) tables" + depends on EFI + default y + help + This enables SMBIOS/DMI feature for systems. + + This option is only useful on systems that have UEFI firmware. + However, even with this option, the resultant kernel should + continue to boot on existing non-UEFI platforms. + endmenu
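A quick worked check of the ARM64_PGTABLE_LEVELS defaults introduced earlier in this file (my own arithmetic, not part of the patch): a 4KB granule leaves a 12-bit page offset and 512-entry tables (9 bits resolved per level), while a 64KB granule leaves a 16-bit offset and 8192-entry tables (13 bits per level), so

	39-bit VA, 4KB pages : (39 - 12) / 9        = 3 levels
	42-bit VA, 64KB pages: (42 - 16) / 13       = 2 levels
	48-bit VA, 4KB pages : (48 - 12) / 9        = 4 levels
	48-bit VA, 64KB pages: ceil((48 - 16) / 13) = 3 levels

which matches the four defaults in the new ARM64_PGTABLE_LEVELS entry.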
menu "Userspace binary formats" diff --combined arch/ia64/Kconfig index 44a6915,56986a0..c84c88b --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@@ -10,7 -10,6 +10,7 @@@ config IA6 select ARCH_MIGHT_HAVE_PC_SERIO select PCI if (!IA64_HP_SIM) select ACPI if (!IA64_HP_SIM) + select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select PM if (!IA64_HP_SIM) select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE @@@ -28,6 -27,7 +28,7 @@@ select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP select HAVE_VIRT_CPU_ACCOUNTING + select ARCH_HAS_SG_CHAIN select VIRT_TO_BUS select ARCH_DISCARD_MEMBLOCK select GENERIC_IRQ_PROBE diff --combined arch/powerpc/kvm/book3s_64_mmu_hv.c index 09a47ae,a01744f..ad463f8 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@@ -37,8 -37,6 +37,6 @@@ #include <asm/ppc-opcode.h> #include <asm/cputable.h>
- #include "book3s_hv_cma.h" - /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ #define MAX_LPID_970 63
@@@ -64,10 -62,10 +62,10 @@@ long kvmppc_alloc_hpt(struct kvm *kvm, }
kvm->arch.hpt_cma_alloc = 0; page = kvm_alloc_hpt(1 << (order - PAGE_SHIFT)); if (page) { hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); + memset((void *)hpt, 0, (1 << order)); kvm->arch.hpt_cma_alloc = 1; }
@@@ -450,7 -448,7 +448,7 @@@ static int kvmppc_mmu_book3s_64_hv_xlat unsigned long slb_v; unsigned long pp, key; unsigned long v, gr; - unsigned long *hptep; + __be64 *hptep; int index; int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);
@@@ -473,13 -471,13 +471,13 @@@ preempt_enable(); return -ENOENT; } - hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); - v = hptep[0] & ~HPTE_V_HVLOCK; + hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); + v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; gr = kvm->arch.revmap[index].guest_rpte;
/* Unlock the HPTE */ asm volatile("lwsync" : : : "memory"); - hptep[0] = v; + hptep[0] = cpu_to_be64(v); preempt_enable();
gpte->eaddr = eaddr; @@@ -583,8 -581,7 +581,8 @@@ int kvmppc_book3s_hv_page_fault(struct unsigned long ea, unsigned long dsisr) { struct kvm *kvm = vcpu->kvm; - unsigned long *hptep, hpte[3], r; + unsigned long hpte[3], r; + __be64 *hptep; unsigned long mmu_seq, psize, pte_size; unsigned long gpa_base, gfn_base; unsigned long gpa, gfn, hva, pfn; @@@ -607,16 -604,16 +605,16 @@@ if (ea != vcpu->arch.pgfault_addr) return RESUME_GUEST; index = vcpu->arch.pgfault_index; - hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); + hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); rev = &kvm->arch.revmap[index]; preempt_disable(); while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) cpu_relax(); - hpte[0] = hptep[0] & ~HPTE_V_HVLOCK; - hpte[1] = hptep[1]; + hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; + hpte[1] = be64_to_cpu(hptep[1]); hpte[2] = r = rev->guest_rpte; asm volatile("lwsync" : : : "memory"); - hptep[0] = hpte[0]; + hptep[0] = cpu_to_be64(hpte[0]); preempt_enable();
if (hpte[0] != vcpu->arch.pgfault_hpte[0] || @@@ -732,9 -729,8 +730,9 @@@ preempt_disable(); while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) cpu_relax(); - if ((hptep[0] & ~HPTE_V_HVLOCK) != hpte[0] || hptep[1] != hpte[1] || - rev->guest_rpte != hpte[2]) + if ((be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK) != hpte[0] || + be64_to_cpu(hptep[1]) != hpte[1] || + rev->guest_rpte != hpte[2]) /* HPTE has been changed under us; let the guest retry */ goto out_unlock; hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; @@@ -754,20 -750,20 +752,20 @@@ rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT; r &= rcbits | ~(HPTE_R_R | HPTE_R_C);
- if (hptep[0] & HPTE_V_VALID) { + if (be64_to_cpu(hptep[0]) & HPTE_V_VALID) { /* HPTE was previously valid, so we need to invalidate it */ unlock_rmap(rmap); - hptep[0] |= HPTE_V_ABSENT; + hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); kvmppc_invalidate_hpte(kvm, hptep, index); /* don't lose previous R and C bits */ - r |= hptep[1] & (HPTE_R_R | HPTE_R_C); + r |= be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); } else { kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0); }
- hptep[1] = r; + hptep[1] = cpu_to_be64(r); eieio(); - hptep[0] = hpte[0]; + hptep[0] = cpu_to_be64(hpte[0]); asm volatile("ptesync" : : : "memory"); preempt_enable(); if (page && hpte_is_writable(r)) @@@ -786,7 -782,7 +784,7 @@@ return ret;
out_unlock: - hptep[0] &= ~HPTE_V_HVLOCK; + hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); preempt_enable(); goto out_put; } @@@ -862,7 -858,7 +860,7 @@@ static int kvm_unmap_rmapp(struct kvm * { struct revmap_entry *rev = kvm->arch.revmap; unsigned long h, i, j; - unsigned long *hptep; + __be64 *hptep; unsigned long ptel, psize, rcbits;
for (;;) { @@@ -878,11 -874,11 +876,11 @@@ * rmap chain lock. */ i = *rmapp & KVMPPC_RMAP_INDEX; - hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); + hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { /* unlock rmap before spinning on the HPTE lock */ unlock_rmap(rmapp); - while (hptep[0] & HPTE_V_HVLOCK) + while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK) cpu_relax(); continue; } @@@ -901,14 -897,14 +899,14 @@@
/* Now check and modify the HPTE */ ptel = rev[i].guest_rpte; - psize = hpte_page_size(hptep[0], ptel); - if ((hptep[0] & HPTE_V_VALID) && + psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel); + if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && hpte_rpn(ptel, psize) == gfn) { if (kvm->arch.using_mmu_notifiers) - hptep[0] |= HPTE_V_ABSENT; + hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); kvmppc_invalidate_hpte(kvm, hptep, i); /* Harvest R and C */ - rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C); + rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; if (rcbits & ~rev[i].guest_rpte) { rev[i].guest_rpte = ptel | rcbits; @@@ -916,7 -912,7 +914,7 @@@ } } unlock_rmap(rmapp); - hptep[0] &= ~HPTE_V_HVLOCK; + hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); } return 0; } @@@ -963,7 -959,7 +961,7 @@@ static int kvm_age_rmapp(struct kvm *kv { struct revmap_entry *rev = kvm->arch.revmap; unsigned long head, i, j; - unsigned long *hptep; + __be64 *hptep; int ret = 0;
retry: @@@ -979,24 -975,23 +977,24 @@@
i = head = *rmapp & KVMPPC_RMAP_INDEX; do { - hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); + hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); j = rev[i].forw;
/* If this HPTE isn't referenced, ignore it */ - if (!(hptep[1] & HPTE_R_R)) + if (!(be64_to_cpu(hptep[1]) & HPTE_R_R)) continue;
if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { /* unlock rmap before spinning on the HPTE lock */ unlock_rmap(rmapp); - while (hptep[0] & HPTE_V_HVLOCK) + while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK) cpu_relax(); goto retry; }
/* Now check and modify the HPTE */ - if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_R)) { + if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && + (be64_to_cpu(hptep[1]) & HPTE_R_R)) { kvmppc_clear_ref_hpte(kvm, hptep, i); if (!(rev[i].guest_rpte & HPTE_R_R)) { rev[i].guest_rpte |= HPTE_R_R; @@@ -1004,7 -999,7 +1002,7 @@@ } ret = 1; } - hptep[0] &= ~HPTE_V_HVLOCK; + hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); } while ((i = j) != head);
unlock_rmap(rmapp); @@@ -1038,7 -1033,7 +1036,7 @@@ static int kvm_test_age_rmapp(struct kv do { hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4)); j = rev[i].forw; - if (hp[1] & HPTE_R_R) + if (be64_to_cpu(hp[1]) & HPTE_R_R) goto out; } while ((i = j) != head); } @@@ -1078,7 -1073,7 +1076,7 @@@ static int kvm_test_clear_dirty_npages( unsigned long head, i, j; unsigned long n; unsigned long v, r; - unsigned long *hptep; + __be64 *hptep; int npages_dirty = 0;
retry: @@@ -1094,8 -1089,7 +1092,8 @@@
i = head = *rmapp & KVMPPC_RMAP_INDEX; do { - hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); + unsigned long hptep1; + hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); j = rev[i].forw;
/* @@@ -1112,30 -1106,29 +1110,30 @@@ * Otherwise we need to do the tlbie even if C==0 in * order to pick up any delayed writeback of C. */ - if (!(hptep[1] & HPTE_R_C) && - (!hpte_is_writable(hptep[1]) || vcpus_running(kvm))) + hptep1 = be64_to_cpu(hptep[1]); + if (!(hptep1 & HPTE_R_C) && + (!hpte_is_writable(hptep1) || vcpus_running(kvm))) continue;
if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { /* unlock rmap before spinning on the HPTE lock */ unlock_rmap(rmapp); - while (hptep[0] & HPTE_V_HVLOCK) + while (hptep[0] & cpu_to_be64(HPTE_V_HVLOCK)) cpu_relax(); goto retry; }
/* Now check and modify the HPTE */ - if (!(hptep[0] & HPTE_V_VALID)) + if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) continue;
/* need to make it temporarily absent so C is stable */ - hptep[0] |= HPTE_V_ABSENT; + hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); kvmppc_invalidate_hpte(kvm, hptep, i); - v = hptep[0]; - r = hptep[1]; + v = be64_to_cpu(hptep[0]); + r = be64_to_cpu(hptep[1]); if (r & HPTE_R_C) { - hptep[1] = r & ~HPTE_R_C; + hptep[1] = cpu_to_be64(r & ~HPTE_R_C); if (!(rev[i].guest_rpte & HPTE_R_C)) { rev[i].guest_rpte |= HPTE_R_C; note_hpte_modification(kvm, &rev[i]); @@@ -1148,7 -1141,7 +1146,7 @@@ } v &= ~(HPTE_V_ABSENT | HPTE_V_HVLOCK); v |= HPTE_V_VALID; - hptep[0] = v; + hptep[0] = cpu_to_be64(v); } while ((i = j) != head);
unlock_rmap(rmapp); @@@ -1312,7 -1305,7 +1310,7 @@@ struct kvm_htab_ctx * Returns 1 if this HPT entry has been modified or has pending * R/C bit changes. */ -static int hpte_dirty(struct revmap_entry *revp, unsigned long *hptp) +static int hpte_dirty(struct revmap_entry *revp, __be64 *hptp) { unsigned long rcbits_unset;
@@@ -1321,14 -1314,13 +1319,14 @@@
/* Also need to consider changes in reference and changed bits */ rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); - if ((hptp[0] & HPTE_V_VALID) && (hptp[1] & rcbits_unset)) + if ((be64_to_cpu(hptp[0]) & HPTE_V_VALID) && + (be64_to_cpu(hptp[1]) & rcbits_unset)) return 1;
return 0; }
-static long record_hpte(unsigned long flags, unsigned long *hptp, +static long record_hpte(unsigned long flags, __be64 *hptp, unsigned long *hpte, struct revmap_entry *revp, int want_valid, int first_pass) { @@@ -1343,10 -1335,10 +1341,10 @@@ return 0;
valid = 0; - if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) { + if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) { valid = 1; if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && - !(hptp[0] & HPTE_V_BOLTED)) + !(be64_to_cpu(hptp[0]) & HPTE_V_BOLTED)) valid = 0; } if (valid != want_valid) @@@ -1358,7 -1350,7 +1356,7 @@@ preempt_disable(); while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) cpu_relax(); - v = hptp[0]; + v = be64_to_cpu(hptp[0]);
/* re-evaluate valid and dirty from synchronized HPTE value */ valid = !!(v & HPTE_V_VALID); @@@ -1366,9 -1358,9 +1364,9 @@@
/* Harvest R and C into guest view if necessary */ rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); - if (valid && (rcbits_unset & hptp[1])) { - revp->guest_rpte |= (hptp[1] & (HPTE_R_R | HPTE_R_C)) | - HPTE_GR_MODIFIED; + if (valid && (rcbits_unset & be64_to_cpu(hptp[1]))) { + revp->guest_rpte |= (be64_to_cpu(hptp[1]) & + (HPTE_R_R | HPTE_R_C)) | HPTE_GR_MODIFIED; dirty = 1; }
@@@ -1387,13 -1379,13 +1385,13 @@@ revp->guest_rpte = r; } asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); - hptp[0] &= ~HPTE_V_HVLOCK; + hptp[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); preempt_enable(); if (!(valid == want_valid && (first_pass || dirty))) ok = 0; } - hpte[0] = v; - hpte[1] = r; + hpte[0] = cpu_to_be64(v); + hpte[1] = cpu_to_be64(r); return ok; }
@@@ -1403,7 -1395,7 +1401,7 @@@ static ssize_t kvm_htab_read(struct fil struct kvm_htab_ctx *ctx = file->private_data; struct kvm *kvm = ctx->kvm; struct kvm_get_htab_header hdr; - unsigned long *hptp; + __be64 *hptp; struct revmap_entry *revp; unsigned long i, nb, nw; unsigned long __user *lbuf; @@@ -1419,7 -1411,7 +1417,7 @@@ flags = ctx->flags;
i = ctx->index; - hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); + hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); revp = kvm->arch.revmap + i; lbuf = (unsigned long __user *)buf;
@@@ -1503,7 -1495,7 +1501,7 @@@ static ssize_t kvm_htab_write(struct fi unsigned long i, j; unsigned long v, r; unsigned long __user *lbuf; - unsigned long *hptp; + __be64 *hptp; unsigned long tmp[2]; ssize_t nb; long int err, ret; @@@ -1545,7 -1537,7 +1543,7 @@@ i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte) break;
- hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); + hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); lbuf = (unsigned long __user *)buf; for (j = 0; j < hdr.n_valid; ++j) { err = -EFAULT; @@@ -1557,7 -1549,7 +1555,7 @@@ lbuf += 2; nb += HPTE_SIZE;
- if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) + if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) kvmppc_do_h_remove(kvm, 0, i, 0, tmp); err = -EIO; ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r, @@@ -1583,7 -1575,7 +1581,7 @@@ }
for (j = 0; j < hdr.n_invalid; ++j) { - if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) + if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) kvmppc_do_h_remove(kvm, 0, i, 0, tmp); ++i; hptp += 2; diff --combined arch/powerpc/kvm/book3s_hv_builtin.c index 3b41447,6cf498a..329d7fd --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@@ -16,12 -16,14 +16,14 @@@ #include <linux/init.h> #include <linux/memblock.h> #include <linux/sizes.h> + #include <linux/cma.h>
#include <asm/cputable.h> #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h>
- #include "book3s_hv_cma.h" + #define KVM_CMA_CHUNK_ORDER 18 + /* * Hash page table alignment on newer cpus(CPU_FTR_ARCH_206) * should be power of 2. @@@ -43,6 -45,8 +45,8 @@@ static unsigned long kvm_cma_resv_rati unsigned long kvm_rma_pages = (1 << 27) >> PAGE_SHIFT; /* 128MB */ EXPORT_SYMBOL_GPL(kvm_rma_pages);
+ static struct cma *kvm_cma; + /* Work out RMLS (real mode limit selector) field value for a given RMA size. Assumes POWER7 or PPC970. */ static inline int lpcr_rmls(unsigned long rma_size) @@@ -97,7 -101,7 +101,7 @@@ struct kvm_rma_info *kvm_alloc_rma( ri = kmalloc(sizeof(struct kvm_rma_info), GFP_KERNEL); if (!ri) return NULL; - page = kvm_alloc_cma(kvm_rma_pages, kvm_rma_pages); + page = cma_alloc(kvm_cma, kvm_rma_pages, get_order(kvm_rma_pages)); if (!page) goto err_out; atomic_set(&ri->use_count, 1); @@@ -112,7 -116,7 +116,7 @@@ EXPORT_SYMBOL_GPL(kvm_alloc_rma) void kvm_release_rma(struct kvm_rma_info *ri) { if (atomic_dec_and_test(&ri->use_count)) { - kvm_release_cma(pfn_to_page(ri->base_pfn), kvm_rma_pages); + cma_release(kvm_cma, pfn_to_page(ri->base_pfn), kvm_rma_pages); kfree(ri); } } @@@ -131,16 -135,18 +135,18 @@@ struct page *kvm_alloc_hpt(unsigned lon { unsigned long align_pages = HPT_ALIGN_PAGES;
+ VM_BUG_ON(get_order(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); + /* Old CPUs require HPT aligned on a multiple of its size */ if (!cpu_has_feature(CPU_FTR_ARCH_206)) align_pages = nr_pages; - return kvm_alloc_cma(nr_pages, align_pages); + return cma_alloc(kvm_cma, nr_pages, get_order(align_pages)); } EXPORT_SYMBOL_GPL(kvm_alloc_hpt);
void kvm_release_hpt(struct page *page, unsigned long nr_pages) { - kvm_release_cma(page, nr_pages); + cma_release(kvm_cma, page, nr_pages); } EXPORT_SYMBOL_GPL(kvm_release_hpt);
@@@ -179,7 -185,8 +185,8 @@@ void __init kvm_cma_reserve(void align_size = HPT_ALIGN_PAGES << PAGE_SHIFT;
align_size = max(kvm_rma_pages << PAGE_SHIFT, align_size); - kvm_cma_declare_contiguous(selected_size, align_size); + cma_declare_contiguous(0, selected_size, 0, align_size, + KVM_CMA_CHUNK_ORDER - PAGE_SHIFT, false, &kvm_cma); } }
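This file's conversion drops the KVM-private CMA wrappers in favour of the generic CMA API from <linux/cma.h>. A minimal sketch of the reserve/alloc/release pattern as used in the hunks above (variable names and sizes are illustrative, taken from the surrounding code rather than a complete driver):

	/* Hedged sketch mirroring the calls above. */
	static struct cma *example_cma;

	/* Early boot: carve out a contiguous area; base/limit of 0 let CMA choose. */
	cma_declare_contiguous(0, selected_size, 0, align_size,
			       KVM_CMA_CHUNK_ORDER - PAGE_SHIFT, false, &example_cma);

	/* Runtime: allocate and later return page-aligned chunks from that area. */
	page = cma_alloc(example_cma, nr_pages, get_order(align_pages));
	cma_release(example_cma, page, nr_pages);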
@@@ -212,16 -219,3 +219,16 @@@ bool kvm_hv_mode_active(void { return atomic_read(&hv_vm_count) != 0; } + +extern int hcall_real_table[], hcall_real_table_end[]; + +int kvmppc_hcall_impl_hv_realmode(unsigned long cmd) +{ + cmd /= 4; + if (cmd < hcall_real_table_end - hcall_real_table && + hcall_real_table[cmd]) + return 1; + + return 0; +} +EXPORT_SYMBOL_GPL(kvmppc_hcall_impl_hv_realmode); diff --combined arch/s390/Kconfig index f5af5f6,d12d40e..3c94ef3 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@@ -116,6 -116,7 +116,6 @@@ config S39 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER - select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_GZIP @@@ -145,6 -146,7 +145,7 @@@ select TTY select VIRT_CPU_ACCOUNTING select VIRT_TO_BUS + select ARCH_HAS_SG_CHAIN
config SCHED_OMIT_FRAME_POINTER def_bool y diff --combined arch/sparc/Kconfig index 4692c90,bff3192..a537816 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@@ -42,6 -42,7 +42,7 @@@ config SPAR select MODULES_USE_ELF_RELA select ODD_RT_SIGACTION select OLD_SIGSUSPEND + select ARCH_HAS_SG_CHAIN
config SPARC32 def_bool !64BIT @@@ -55,6 -56,7 +56,6 @@@ config SPARC6 select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_GRAPH_FP_TEST - select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_KRETPROBES select HAVE_KPROBES select HAVE_RCU_TABLE_FREE if SMP diff --combined arch/x86/Kconfig index 503f35c,2ae952c..273d20d --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@@ -21,7 -21,6 +21,7 @@@ config X86_6 ### Arch settings config X86 def_bool y + select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO @@@ -55,6 -54,7 +55,6 @@@ select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_GRAPH_FP_TEST - select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_SYSCALL_TRACEPOINTS select SYSCTL_EXCEPTION_TRACE select HAVE_KVM @@@ -96,6 -96,7 +96,7 @@@ select IRQ_FORCED_THREADING select HAVE_BPF_JIT if X86_64 select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select ARCH_HAS_SG_CHAIN select CLKEVT_I8253 select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_IOMAP @@@ -132,7 -133,6 +133,7 @@@ select GENERIC_CPU_AUTOPROBE select HAVE_ARCH_AUDITSYSCALL select ARCH_SUPPORTS_ATOMIC_RMW + select ACPI_LEGACY_TABLES_LOOKUP if ACPI
config INSTRUCTION_DECODER def_bool y @@@ -431,7 -431,6 +432,7 @@@ config X86_INTEL_C bool "CE4100 TV platform" depends on PCI depends on PCI_GODIRECT + depends on X86_IO_APIC depends on X86_32 depends on X86_EXTENDED_PLATFORM select X86_REBOOTFIXUPS @@@ -539,7 -538,7 +540,7 @@@ config X86_32_IRI
config SCHED_OMIT_FRAME_POINTER def_bool y - prompt "Single-depth WCHAN output" + prompt "Single-depth WCHAN output" if !LTO && !FRAME_POINTER depends on X86 ---help--- Calculate simpler /proc/<PID>/wchan values. If this option @@@ -838,7 -837,6 +839,7 @@@ config X86_IO_API def_bool y depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC || PCI_MSI select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ + select IRQ_DOMAIN
config X86_REROUTE_FOR_BROKEN_BOOT_IRQS bool "Reroute for broken boot IRQs" diff --combined arch/x86/mm/fault.c index 1dbade8,d30b78b..d393ac6 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@@ -350,7 -350,7 +350,7 @@@ out
void vmalloc_sync_all(void) { - sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); + sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END, 0); }
/* @@@ -577,8 -577,6 +577,8 @@@ static int is_f00f_bug(struct pt_regs *
static const char nx_warning[] = KERN_CRIT "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; +static const char smep_warning[] = KERN_CRIT +"unable to execute userspace code (SMEP?) (uid: %d)\n";
static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, @@@ -599,10 -597,6 +599,10 @@@
if (pte && pte_present(*pte) && !pte_exec(*pte)) printk(nx_warning, from_kuid(&init_user_ns, current_uid())); + if (pte && pte_present(*pte) && pte_exec(*pte) && + (pgd_flags(*pgd) & _PAGE_USER) && + (read_cr4() & X86_CR4_SMEP)) + printk(smep_warning, from_kuid(&init_user_ns, current_uid())); }
printk(KERN_ALERT "BUG: unable to handle kernel "); @@@ -1218,7 -1212,8 +1218,8 @@@ good_area /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo - * the fault: + * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if + * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked. */ fault = handle_mm_fault(mm, vma, address, flags);
diff --combined block/bio-integrity.c index bc423f7b,56754c4..38c8ac2 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@@ -70,10 -70,8 +70,10 @@@ struct bio_integrity_payload *bio_integ bs->bvec_integrity_pool); if (!bip->bip_vec) goto err; + bip->bip_max_vcnt = bvec_nr_vecs(idx); } else { bip->bip_vec = bip->bip_inline_vecs; + bip->bip_max_vcnt = inline_vecs; }
bip->bip_slab = idx; @@@ -116,6 -114,14 +116,6 @@@ void bio_integrity_free(struct bio *bio } EXPORT_SYMBOL(bio_integrity_free);
-static inline unsigned int bip_integrity_vecs(struct bio_integrity_payload *bip) -{ - if (bip->bip_slab == BIO_POOL_NONE) - return BIP_INLINE_VECS; - - return bvec_nr_vecs(bip->bip_slab); -} - /** * bio_integrity_add_page - Attach integrity metadata * @bio: bio to update @@@ -131,7 -137,7 +131,7 @@@ int bio_integrity_add_page(struct bio * struct bio_integrity_payload *bip = bio->bi_integrity; struct bio_vec *iv;
- if (bip->bip_vcnt >= bip_integrity_vecs(bip)) { + if (bip->bip_vcnt >= bip->bip_max_vcnt) { printk(KERN_ERR "%s: bip_vec full\n", __func__); return 0; } @@@ -646,6 -652,4 +646,4 @@@ void __init bio_integrity_init(void sizeof(struct bio_integrity_payload) + sizeof(struct bio_vec) * BIP_INLINE_VECS, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - if (!bip_slab) - panic("Failed to create slab\n"); } diff --combined drivers/ata/Kconfig index e65d400,b0d5b5a..e1b9278 --- a/drivers/ata/Kconfig +++ b/drivers/ata/Kconfig @@@ -16,6 -16,7 +16,7 @@@ menuconfig AT depends on BLOCK depends on !(M32R || M68K || S390) || BROKEN select SCSI + select GLOB ---help--- If you want to use an ATA hard disk, ATA tape drive, ATA CD-ROM or any other ATA device under Linux, say Y and make sure that you know @@@ -141,15 -142,6 +142,15 @@@ config AHCI_SUNX
If unsure, say N.
+config AHCI_TEGRA + tristate "NVIDIA Tegra124 AHCI SATA support" + depends on ARCH_TEGRA + help + This option enables support for the NVIDIA Tegra124 SoC's + onboard AHCI SATA. + + If unsure, say N. + config AHCI_XGENE tristate "APM X-Gene 6.0Gbps AHCI SATA host controller support" depends on PHY_XGENE diff --combined drivers/ata/libata-core.c index 677c0c1,259d879..dbdc5d3 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@@ -59,6 -59,7 +59,7 @@@ #include <linux/async.h> #include <linux/log2.h> #include <linux/slab.h> + #include <linux/glob.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_host.h> @@@ -4250,73 -4251,6 +4251,6 @@@ static const struct ata_blacklist_entr { } };
- /** - * glob_match - match a text string against a glob-style pattern - * @text: the string to be examined - * @pattern: the glob-style pattern to be matched against - * - * Either/both of text and pattern can be empty strings. - * - * Match text against a glob-style pattern, with wildcards and simple sets: - * - * ? matches any single character. - * * matches any run of characters. - * [xyz] matches a single character from the set: x, y, or z. - * [a-d] matches a single character from the range: a, b, c, or d. - * [a-d0-9] matches a single character from either range. - * - * The special characters ?, [, -, or *, can be matched using a set, eg. [*] - * Behaviour with malformed patterns is undefined, though generally reasonable. - * - * Sample patterns: "SD1?", "SD1[0-5]", "*R0", "SD*1?[012]*xx" - * - * This function uses one level of recursion per '*' in pattern. - * Since it calls _nothing_ else, and has _no_ explicit local variables, - * this will not cause stack problems for any reasonable use here. - * - * RETURNS: - * 0 on match, 1 otherwise. - */ - static int glob_match (const char *text, const char *pattern) - { - do { - /* Match single character or a '?' wildcard */ - if (*text == *pattern || *pattern == '?') { - if (!*pattern++) - return 0; /* End of both strings: match */ - } else { - /* Match single char against a '[' bracketed ']' pattern set */ - if (!*text || *pattern != '[') - break; /* Not a pattern set */ - while (*++pattern && *pattern != ']' && *text != *pattern) { - if (*pattern == '-' && *(pattern - 1) != '[') - if (*text > *(pattern - 1) && *text < *(pattern + 1)) { - ++pattern; - break; - } - } - if (!*pattern || *pattern == ']') - return 1; /* No match */ - while (*pattern && *pattern++ != ']'); - } - } while (*++text && *pattern); - - /* Match any run of chars against a '*' wildcard */ - if (*pattern == '*') { - if (!*++pattern) - return 0; /* Match: avoid recursion at end of pattern */ - /* Loop to handle additional pattern chars after the wildcard */ - while (*text) { - if (glob_match(text, pattern) == 0) - return 0; /* Remainder matched */ - ++text; /* Absorb (match) this char and try again */ - } - } - if (!*text && !*pattern) - return 0; /* End of both strings: match */ - return 1; /* No match */ - } - static unsigned long ata_dev_blacklisted(const struct ata_device *dev) { unsigned char model_num[ATA_ID_PROD_LEN + 1]; @@@ -4327,10 -4261,10 +4261,10 @@@ ata_id_c_string(dev->id, model_rev, ATA_ID_FW_REV, sizeof(model_rev));
while (ad->model_num) { - if (!glob_match(model_num, ad->model_num)) { + if (glob_match(model_num, ad->model_num)) { if (ad->model_rev == NULL) return ad->horkage; - if (!glob_match(model_rev, ad->model_rev)) + if (glob_match(model_rev, ad->model_rev)) return ad->horkage; } ad++; @@@ -4798,8 -4732,9 +4732,8 @@@ void swap_buf_le16(u16 *buf, unsigned i static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap) { struct ata_queued_cmd *qc = NULL; - unsigned int i, tag, max_queue; - - max_queue = ap->scsi_host->can_queue; + unsigned int max_queue = ap->host->n_tags; + unsigned int i, tag;
/* no command while frozen */ if (unlikely(ap->pflags & ATA_PFLAG_FROZEN)) @@@ -6093,7 -6028,6 +6027,7 @@@ void ata_host_init(struct ata_host *hos { spin_lock_init(&host->lock); mutex_init(&host->eh_mutex); + host->n_tags = ATA_MAX_QUEUE - 1; host->dev = dev; host->ops = ops; } @@@ -6175,7 -6109,15 +6109,7 @@@ int ata_host_register(struct ata_host * { int i, rc;
- /* - * The max queue supported by hardware must not be greater than - * ATA_MAX_QUEUE. - */ - if (sht->can_queue > ATA_MAX_QUEUE) { - dev_err(host->dev, "BUG: the hardware max queue is too large\n"); - WARN_ON(1); - return -EINVAL; - } + host->n_tags = clamp(sht->can_queue, 1, ATA_MAX_QUEUE - 1);
/* host must have been started */ if (!(host->flags & ATA_HOST_STARTED)) { diff --combined drivers/base/Kconfig index 88500fe,9d5fed1..4e7f0ff --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@@ -149,21 -149,15 +149,21 @@@ config EXTRA_FIRMWARE_DI some other directory containing the firmware files.
config FW_LOADER_USER_HELPER + bool + +config FW_LOADER_USER_HELPER_FALLBACK bool "Fallback user-helper invocation for firmware loading" depends on FW_LOADER - default y + select FW_LOADER_USER_HELPER help This option enables / disables the invocation of user-helper (e.g. udev) for loading firmware files as a fallback after the direct file loading in kernel fails. The user-mode helper is no longer required unless you have a special firmware file that - resides in a non-standard path. + resides in a non-standard path. Moreover, the udev support has + been deprecated upstream. + + If you are unsure about this, say N here.
config DEBUG_DRIVER bool "Driver Core verbose debug messages" @@@ -214,15 -208,6 +214,15 @@@ config DMA_SHARED_BUFFE APIs extension; the file's descriptor can then be passed on to other driver.
+config FENCE_TRACE + bool "Enable verbose FENCE_TRACE messages" + depends on DMA_SHARED_BUFFER + help + Enable the FENCE_TRACE printks. This will add extra + spam to the console log, but will make it easier to diagnose + lockup related problems for dma-buffers shared across multiple + devices. + config DMA_CMA bool "DMA Contiguous Memory Allocator" depends on HAVE_DMA_CONTIGUOUS && CMA @@@ -289,16 -274,6 +289,6 @@@ config CMA_ALIGNMEN
If unsure, leave the default value "8".
- config CMA_AREAS - int "Maximum count of the CMA device-private areas" - default 7 - help - CMA allows to create CMA areas for particular devices. This parameter - sets the maximum number of such device private CMA areas in the - system. - - If unsure, leave the default value "7". - endif
endmenu diff --combined drivers/input/input.c index 29ca0bb,3b9284b..236bc56 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@@ -257,10 -257,9 +257,10 @@@ static int input_handle_abs_event(struc }
static int input_get_disposition(struct input_dev *dev, - unsigned int type, unsigned int code, int value) + unsigned int type, unsigned int code, int *pval) { int disposition = INPUT_IGNORE_EVENT; + int value = *pval;
switch (type) {
@@@ -358,7 -357,6 +358,7 @@@ break; }
+ *pval = value; return disposition; }
@@@ -367,7 -365,7 +367,7 @@@ static void input_handle_event(struct i { int disposition;
- disposition = input_get_disposition(dev, type, code, value); + disposition = input_get_disposition(dev, type, code, &value);
if ((disposition & INPUT_PASS_TO_DEVICE) && dev->event) dev->event(dev, type, code, value); @@@ -710,6 -708,9 +710,9 @@@ static void input_disconnect_device(str handle->open = 0;
spin_unlock_irq(&dev->event_lock); + + if (is_event_supported(EV_LED, dev->evbit, EV_MAX)) + input_led_disconnect(dev); }
/** @@@ -2136,6 -2137,9 +2139,9 @@@ int input_register_device(struct input_
list_add_tail(&dev->node, &input_dev_list);
+ if (is_event_supported(EV_LED, dev->evbit, EV_MAX)) + input_led_connect(dev); + list_for_each_entry(handler, &input_handler_list, node) input_attach_handler(dev, handler);
diff --combined drivers/leds/Kconfig index 8c96e2d,6784c17..f6e32ba --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@@ -11,9 -11,6 +11,6 @@@ menuconfig NEW_LED Say Y to enable Linux LED support. This allows control of supported LEDs from both userspace and optionally, by kernel events (triggers).
- This is not related to standard keyboard LEDs which are controlled - via the input system. - if NEW_LEDS
config LEDS_CLASS @@@ -32,6 -29,14 +29,6 @@@ config LEDS_88PM860 This option enables support for on-chip LED drivers found on Marvell Semiconductor 88PM8606 PMIC.
-config LEDS_ATMEL_PWM - tristate "LED Support using Atmel PWM outputs" - depends on LEDS_CLASS - depends on ATMEL_PWM - help - This option enables support for LEDs driven using outputs - of the dedicated PWM controller found on newer Atmel SOCs. - config LEDS_LM3530 tristate "LCD Backlight driver for LM3530" depends on LEDS_CLASS @@@ -135,13 -140,6 +132,13 @@@ config LEDS_SUNFIR This option enables support for the Left, Middle, and Right LEDs on the I/O and CPU boards of SunFire UltraSPARC servers.
+config LEDS_IPAQ_MICRO + tristate "LED Support for the Compaq iPAQ h3xxx" + depends on MFD_IPAQ_MICRO + help + Choose this option if you want to use the notification LED on + Compaq/HP iPAQ h3100 and h3600. + config LEDS_HP6XX tristate "LED Support for the HP Jornada 6xx" depends on LEDS_CLASS diff --combined drivers/net/ethernet/intel/i40e/i40e_ethtool.c index 9c93ff2,c57b085..ae3f105 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@@ -215,135 -215,52 +215,135 @@@ static int i40e_get_settings(struct net /* hardware is either in 40G mode or 10G mode * NOTE: this section initializes supported and advertising */ + if (!link_up) { + /* link is down and the driver needs to fall back on + * device ID to determine what kinds of info to display, + * it's mostly a guess that may change when link is up + */ + switch (hw->device_id) { + case I40E_DEV_ID_QSFP_A: + case I40E_DEV_ID_QSFP_B: + case I40E_DEV_ID_QSFP_C: + /* pluggable QSFP */ + ecmd->supported = SUPPORTED_40000baseSR4_Full | + SUPPORTED_40000baseCR4_Full | + SUPPORTED_40000baseLR4_Full; + ecmd->advertising = ADVERTISED_40000baseSR4_Full | + ADVERTISED_40000baseCR4_Full | + ADVERTISED_40000baseLR4_Full; + break; + case I40E_DEV_ID_KX_B: + /* backplane 40G */ + ecmd->supported = SUPPORTED_40000baseKR4_Full; + ecmd->advertising = ADVERTISED_40000baseKR4_Full; + break; + case I40E_DEV_ID_KX_C: + /* backplane 10G */ + ecmd->supported = SUPPORTED_10000baseKR_Full; + ecmd->advertising = ADVERTISED_10000baseKR_Full; + break; + default: + /* all the rest are 10G/1G */ + ecmd->supported = SUPPORTED_10000baseT_Full | + SUPPORTED_1000baseT_Full; + ecmd->advertising = ADVERTISED_10000baseT_Full | + ADVERTISED_1000baseT_Full; + break; + } + + /* skip phy_type use as it is zero when link is down */ + goto no_valid_phy_type; + } + switch (hw_link_info->phy_type) { case I40E_PHY_TYPE_40GBASE_CR4: case I40E_PHY_TYPE_40GBASE_CR4_CU: - ecmd->supported = SUPPORTED_40000baseCR4_Full; - ecmd->advertising = ADVERTISED_40000baseCR4_Full; + ecmd->supported = SUPPORTED_Autoneg | + SUPPORTED_40000baseCR4_Full; + ecmd->advertising = ADVERTISED_Autoneg | + ADVERTISED_40000baseCR4_Full; break; case I40E_PHY_TYPE_40GBASE_KR4: - ecmd->supported = SUPPORTED_40000baseKR4_Full; - ecmd->advertising = ADVERTISED_40000baseKR4_Full; + ecmd->supported = SUPPORTED_Autoneg | + SUPPORTED_40000baseKR4_Full; + ecmd->advertising = ADVERTISED_Autoneg | + ADVERTISED_40000baseKR4_Full; break; case I40E_PHY_TYPE_40GBASE_SR4: + case I40E_PHY_TYPE_XLPPI: + case I40E_PHY_TYPE_XLAUI: ecmd->supported = SUPPORTED_40000baseSR4_Full; break; case I40E_PHY_TYPE_40GBASE_LR4: ecmd->supported = SUPPORTED_40000baseLR4_Full; - ecmd->advertising = ADVERTISED_40000baseLR4_Full; break; case I40E_PHY_TYPE_10GBASE_KX4: - ecmd->supported = SUPPORTED_10000baseKX4_Full; - ecmd->advertising = ADVERTISED_10000baseKX4_Full; + ecmd->supported = SUPPORTED_Autoneg | + SUPPORTED_10000baseKX4_Full; + ecmd->advertising = ADVERTISED_Autoneg | + ADVERTISED_10000baseKX4_Full; break; case I40E_PHY_TYPE_10GBASE_KR: - ecmd->supported = SUPPORTED_10000baseKR_Full; - ecmd->advertising = ADVERTISED_10000baseKR_Full; + ecmd->supported = SUPPORTED_Autoneg | + SUPPORTED_10000baseKR_Full; + ecmd->advertising = ADVERTISED_Autoneg | + ADVERTISED_10000baseKR_Full; break; - default: - if (i40e_is_40G_device(hw->device_id)) { - ecmd->supported = SUPPORTED_40000baseSR4_Full; - ecmd->advertising = ADVERTISED_40000baseSR4_Full; - } else { - ecmd->supported = 
SUPPORTED_10000baseT_Full; - ecmd->advertising = ADVERTISED_10000baseT_Full; - } + case I40E_PHY_TYPE_10GBASE_SR: + case I40E_PHY_TYPE_10GBASE_LR: + ecmd->supported = SUPPORTED_10000baseT_Full; + break; + case I40E_PHY_TYPE_10GBASE_CR1_CU: + case I40E_PHY_TYPE_10GBASE_CR1: + case I40E_PHY_TYPE_10GBASE_T: + ecmd->supported = SUPPORTED_Autoneg | + SUPPORTED_10000baseT_Full; + ecmd->advertising = ADVERTISED_Autoneg | + ADVERTISED_10000baseT_Full; + break; + case I40E_PHY_TYPE_XAUI: + case I40E_PHY_TYPE_XFI: + case I40E_PHY_TYPE_SFI: + case I40E_PHY_TYPE_10GBASE_SFPP_CU: + ecmd->supported = SUPPORTED_10000baseT_Full; + break; + case I40E_PHY_TYPE_1000BASE_KX: + case I40E_PHY_TYPE_1000BASE_T: + ecmd->supported = SUPPORTED_Autoneg | + SUPPORTED_1000baseT_Full; + ecmd->advertising = ADVERTISED_Autoneg | + ADVERTISED_1000baseT_Full; + break; + case I40E_PHY_TYPE_100BASE_TX: + ecmd->supported = SUPPORTED_Autoneg | + SUPPORTED_100baseT_Full; + ecmd->advertising = ADVERTISED_Autoneg | + ADVERTISED_100baseT_Full; + break; + case I40E_PHY_TYPE_SGMII: + ecmd->supported = SUPPORTED_Autoneg | + SUPPORTED_1000baseT_Full | + SUPPORTED_100baseT_Full; + ecmd->advertising = ADVERTISED_Autoneg | + ADVERTISED_1000baseT_Full | + ADVERTISED_100baseT_Full; break; + default: + /* if we got here and link is up something bad is afoot */ + WARN_ON(link_up); }
- ecmd->supported |= SUPPORTED_Autoneg; - ecmd->advertising |= ADVERTISED_Autoneg; +no_valid_phy_type: + /* this is if autoneg is enabled or disabled */ ecmd->autoneg = ((hw_link_info->an_info & I40E_AQ_AN_COMPLETED) ? AUTONEG_ENABLE : AUTONEG_DISABLE);
switch (hw->phy.media_type) { case I40E_MEDIA_TYPE_BACKPLANE: - ecmd->supported |= SUPPORTED_Backplane; - ecmd->advertising |= ADVERTISED_Backplane; + ecmd->supported |= SUPPORTED_Autoneg | + SUPPORTED_Backplane; + ecmd->advertising |= ADVERTISED_Autoneg | + ADVERTISED_Backplane; ecmd->port = PORT_NONE; break; case I40E_MEDIA_TYPE_BASET: @@@ -359,6 -276,7 +359,6 @@@ break; case I40E_MEDIA_TYPE_FIBER: ecmd->supported |= SUPPORTED_FIBRE; - ecmd->advertising |= ADVERTISED_FIBRE; ecmd->port = PORT_FIBRE; break; case I40E_MEDIA_TYPE_UNKNOWN: @@@ -369,25 -287,6 +369,25 @@@
ecmd->transceiver = XCVR_EXTERNAL;
+ ecmd->supported |= SUPPORTED_Pause; + + switch (hw->fc.current_mode) { + case I40E_FC_FULL: + ecmd->advertising |= ADVERTISED_Pause; + break; + case I40E_FC_TX_PAUSE: + ecmd->advertising |= ADVERTISED_Asym_Pause; + break; + case I40E_FC_RX_PAUSE: + ecmd->advertising |= (ADVERTISED_Pause | + ADVERTISED_Asym_Pause); + break; + default: + ecmd->advertising &= ~(ADVERTISED_Pause | + ADVERTISED_Asym_Pause); + break; + } + if (link_up) { switch (link_speed) { case I40E_LINK_SPEED_40GB: @@@ -397,9 -296,6 +397,9 @@@ case I40E_LINK_SPEED_10GB: ethtool_cmd_speed_set(ecmd, SPEED_10000); break; + case I40E_LINK_SPEED_1GB: + ethtool_cmd_speed_set(ecmd, SPEED_1000); + break; default: break; } @@@ -413,182 -309,6 +413,182 @@@ }
/** + * i40e_set_settings - Set Speed and Duplex + * @netdev: network interface device structure + * @ecmd: ethtool command + * + * Set speed/duplex per media_types advertised/forced + **/ +static int i40e_set_settings(struct net_device *netdev, + struct ethtool_cmd *ecmd) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_aq_get_phy_abilities_resp abilities; + struct i40e_aq_set_phy_config config; + struct i40e_pf *pf = np->vsi->back; + struct i40e_vsi *vsi = np->vsi; + struct i40e_hw *hw = &pf->hw; + struct ethtool_cmd safe_ecmd; + i40e_status status = 0; + bool change = false; + int err = 0; + u8 autoneg; + u32 advertise; + + if (vsi != pf->vsi[pf->lan_vsi]) + return -EOPNOTSUPP; + + if (hw->phy.media_type != I40E_MEDIA_TYPE_BASET && + hw->phy.media_type != I40E_MEDIA_TYPE_FIBER && + hw->phy.media_type != I40E_MEDIA_TYPE_BACKPLANE) + return -EOPNOTSUPP; + + /* get our own copy of the bits to check against */ + memset(&safe_ecmd, 0, sizeof(struct ethtool_cmd)); + i40e_get_settings(netdev, &safe_ecmd); + + /* save autoneg and speed out of ecmd */ + autoneg = ecmd->autoneg; + advertise = ecmd->advertising; + + /* set autoneg and speed back to what they currently are */ + ecmd->autoneg = safe_ecmd.autoneg; + ecmd->advertising = safe_ecmd.advertising; + + ecmd->cmd = safe_ecmd.cmd; + /* If ecmd and safe_ecmd are not the same now, then they are + * trying to set something that we do not support + */ + if (memcmp(ecmd, &safe_ecmd, sizeof(struct ethtool_cmd))) + return -EOPNOTSUPP; + + while (test_bit(__I40E_CONFIG_BUSY, &vsi->state)) + usleep_range(1000, 2000); + + /* Get the current phy config */ + status = i40e_aq_get_phy_capabilities(hw, false, false, &abilities, + NULL); + if (status) + return -EAGAIN; + + /* Copy link_speed and abilities to config in case they are not + * set below + */ + memset(&config, 0, sizeof(struct i40e_aq_set_phy_config)); + config.link_speed = abilities.link_speed; + config.abilities = abilities.abilities; + + /* Check autoneg */ + if (autoneg == AUTONEG_ENABLE) { + /* If autoneg is not supported, return error */ + if (!(safe_ecmd.supported & SUPPORTED_Autoneg)) { + netdev_info(netdev, "Autoneg not supported on this phy\n"); + return -EINVAL; + } + /* If autoneg was not already enabled */ + if (!(hw->phy.link_info.an_info & I40E_AQ_AN_COMPLETED)) { + config.abilities = abilities.abilities | + I40E_AQ_PHY_ENABLE_AN; + change = true; + } + } else { + /* If autoneg is supported 10GBASE_T is the only phy that + * can disable it, so otherwise return error + */ + if (safe_ecmd.supported & SUPPORTED_Autoneg && + hw->phy.link_info.phy_type != I40E_PHY_TYPE_10GBASE_T) { + netdev_info(netdev, "Autoneg cannot be disabled on this phy\n"); + return -EINVAL; + } + /* If autoneg is currently enabled */ + if (hw->phy.link_info.an_info & I40E_AQ_AN_COMPLETED) { + config.abilities = abilities.abilities | + ~I40E_AQ_PHY_ENABLE_AN; + change = true; + } + } + + if (advertise & ~safe_ecmd.supported) + return -EINVAL; + + if (advertise & ADVERTISED_100baseT_Full) + if (!(abilities.link_speed & I40E_LINK_SPEED_100MB)) { + config.link_speed |= I40E_LINK_SPEED_100MB; + change = true; + } + if (advertise & ADVERTISED_1000baseT_Full || + advertise & ADVERTISED_1000baseKX_Full) + if (!(abilities.link_speed & I40E_LINK_SPEED_1GB)) { + config.link_speed |= I40E_LINK_SPEED_1GB; + change = true; + } + if (advertise & ADVERTISED_10000baseT_Full || + advertise & ADVERTISED_10000baseKX4_Full || + advertise & ADVERTISED_10000baseKR_Full) + if (!(abilities.link_speed & 
I40E_LINK_SPEED_10GB)) { + config.link_speed |= I40E_LINK_SPEED_10GB; + change = true; + } + if (advertise & ADVERTISED_40000baseKR4_Full || + advertise & ADVERTISED_40000baseCR4_Full || + advertise & ADVERTISED_40000baseSR4_Full || + advertise & ADVERTISED_40000baseLR4_Full) + if (!(abilities.link_speed & I40E_LINK_SPEED_40GB)) { + config.link_speed |= I40E_LINK_SPEED_40GB; + change = true; + } + + if (change) { + /* copy over the rest of the abilities */ + config.phy_type = abilities.phy_type; + config.eee_capability = abilities.eee_capability; + config.eeer = abilities.eeer_val; + config.low_power_ctrl = abilities.d3_lpan; + + /* If link is up set link and an so changes take effect */ + if (hw->phy.link_info.link_info & I40E_AQ_LINK_UP) + config.abilities |= I40E_AQ_PHY_ENABLE_ATOMIC_LINK; + + /* make the aq call */ + status = i40e_aq_set_phy_config(hw, &config, NULL); + if (status) { + netdev_info(netdev, "Set phy config failed with error %d.\n", + status); + return -EAGAIN; + } + + status = i40e_update_link_info(hw, true); + if (status) + netdev_info(netdev, "Updating link info failed with error %d\n", + status); + + } else { + netdev_info(netdev, "Nothing changed, exiting without setting anything.\n"); + } + + return err; +} + +static int i40e_nway_reset(struct net_device *netdev) +{ + /* restart autonegotiation */ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_pf *pf = np->vsi->back; + struct i40e_hw *hw = &pf->hw; + bool link_up = hw->phy.link_info.link_info & I40E_AQ_LINK_UP; + i40e_status ret = 0; + + ret = i40e_aq_set_link_restart_an(hw, link_up, NULL); + if (ret) { + netdev_info(netdev, "link restart failed, aq_err=%d\n", + pf->hw.aq.asq_last_status); + return -EIO; + } + + return 0; +} + +/** * i40e_get_pauseparam - Get Flow Control status * Return tx/rx-pause status **/ @@@ -614,85 -334,6 +614,85 @@@ static void i40e_get_pauseparam(struct } }
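As an aside on the flow-control hunks here: the body of i40e_get_pauseparam is largely elided above, but it is simply the inverse of the rx/tx-to-mode mapping in the new i40e_set_pauseparam below. A minimal kernel-style sketch of that inverse (example_report_pause and its fc_mode parameter are invented for illustration, not driver code):

/* illustrative only: current flow-control mode -> ethtool pause flags */
static void example_report_pause(int fc_mode, struct ethtool_pauseparam *pause)
{
        pause->rx_pause = (fc_mode == I40E_FC_FULL || fc_mode == I40E_FC_RX_PAUSE);
        pause->tx_pause = (fc_mode == I40E_FC_FULL || fc_mode == I40E_FC_TX_PAUSE);
}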
+/** + * i40e_set_pauseparam - Set Flow Control parameter + * @netdev: network interface device structure + * @pause: return tx/rx flow control status + **/ +static int i40e_set_pauseparam(struct net_device *netdev, + struct ethtool_pauseparam *pause) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_pf *pf = np->vsi->back; + struct i40e_vsi *vsi = np->vsi; + struct i40e_hw *hw = &pf->hw; + struct i40e_link_status *hw_link_info = &hw->phy.link_info; + bool link_up = hw_link_info->link_info & I40E_AQ_LINK_UP; + i40e_status status; + u8 aq_failures; + int err = 0; + + if (vsi != pf->vsi[pf->lan_vsi]) + return -EOPNOTSUPP; + + if (pause->autoneg != ((hw_link_info->an_info & I40E_AQ_AN_COMPLETED) ? + AUTONEG_ENABLE : AUTONEG_DISABLE)) { + netdev_info(netdev, "To change autoneg please use: ethtool -s <dev> autoneg <on|off>\n"); + return -EOPNOTSUPP; + } + + /* If we have link and don't have autoneg */ + if (!test_bit(__I40E_DOWN, &pf->state) && + !(hw_link_info->an_info & I40E_AQ_AN_COMPLETED)) { + /* Send message that it might not necessarily work*/ + netdev_info(netdev, "Autoneg did not complete so changing settings may not result in an actual change.\n"); + } + + if (hw->fc.current_mode == I40E_FC_PFC) { + netdev_info(netdev, "Priority flow control enabled. Cannot set link flow control.\n"); + return -EOPNOTSUPP; + } + + if (pause->rx_pause && pause->tx_pause) + hw->fc.requested_mode = I40E_FC_FULL; + else if (pause->rx_pause && !pause->tx_pause) + hw->fc.requested_mode = I40E_FC_RX_PAUSE; + else if (!pause->rx_pause && pause->tx_pause) + hw->fc.requested_mode = I40E_FC_TX_PAUSE; + else if (!pause->rx_pause && !pause->tx_pause) + hw->fc.requested_mode = I40E_FC_NONE; + else + return -EINVAL; + + /* Set the fc mode and only restart an if link is up*/ + status = i40e_set_fc(hw, &aq_failures, link_up); + + if (aq_failures & I40E_SET_FC_AQ_FAIL_GET) { + netdev_info(netdev, "Set fc failed on the get_phy_capabilities call with error %d and status %d\n", + status, hw->aq.asq_last_status); + err = -EAGAIN; + } + if (aq_failures & I40E_SET_FC_AQ_FAIL_SET) { + netdev_info(netdev, "Set fc failed on the set_phy_config call with error %d and status %d\n", + status, hw->aq.asq_last_status); + err = -EAGAIN; + } + if (aq_failures & I40E_SET_FC_AQ_FAIL_UPDATE) { + netdev_info(netdev, "Set fc failed on the update_link_info call with error %d and status %d\n", + status, hw->aq.asq_last_status); + err = -EAGAIN; + } + + if (!test_bit(__I40E_DOWN, &pf->state)) { + /* Give it a little more time to try to come back */ + msleep(75); + if (!test_bit(__I40E_DOWN, &pf->state)) + return i40e_nway_reset(netdev); + } + + return err; +} + static u32 i40e_get_msglevel(struct net_device *netdev) { struct i40e_netdev_priv *np = netdev_priv(netdev); @@@ -763,33 -404,10 +763,33 @@@ static int i40e_get_eeprom(struct net_d u8 *eeprom_buff; u16 i, sectors; bool last; + u32 magic; + #define I40E_NVM_SECTOR_SIZE 4096 if (eeprom->len == 0) return -EINVAL;
+ /* check for NVMUpdate access method */ + magic = hw->vendor_id | (hw->device_id << 16); + if (eeprom->magic && eeprom->magic != magic) { + int errno; + + /* make sure it is the right magic for NVMUpdate */ + if ((eeprom->magic >> 16) != hw->device_id) + return -EINVAL; + + ret_val = i40e_nvmupd_command(hw, + (struct i40e_nvm_access *)eeprom, + bytes, &errno); + if (ret_val) + dev_info(&pf->pdev->dev, + "NVMUpdate read failed err=%d status=0x%x\n", + ret_val, hw->aq.asq_last_status); + + return errno; + } + + /* normal ethtool get_eeprom support */ eeprom->magic = hw->vendor_id | (hw->device_id << 16);
eeprom_buff = kzalloc(eeprom->len, GFP_KERNEL); @@@ -816,7 -434,7 +816,7 @@@ ret_val = i40e_aq_read_nvm(hw, 0x0, eeprom->offset + (I40E_NVM_SECTOR_SIZE * i), len, - eeprom_buff + (I40E_NVM_SECTOR_SIZE * i), + (u8 *)eeprom_buff + (I40E_NVM_SECTOR_SIZE * i), last, NULL); if (ret_val) { dev_info(&pf->pdev->dev, @@@ -828,7 -446,7 +828,7 @@@
release_nvm: i40e_release_nvm(hw); - memcpy(bytes, eeprom_buff, eeprom->len); + memcpy(bytes, (u8 *)eeprom_buff, eeprom->len); free_buff: kfree(eeprom_buff); return ret_val; @@@ -848,39 -466,6 +848,39 @@@ static int i40e_get_eeprom_len(struct n return val; }
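A note on the NVMUpdate plumbing in the get_eeprom hunk above and the new i40e_set_eeprom below: both paths key off the ethtool "magic" word, which packs the PCI vendor ID in the low 16 bits and the device ID in the high 16 bits. A standalone sketch of how that word is composed (eeprom_magic and the 0x8086/0x1572 IDs are examples for illustration only):

#include <stdint.h>
#include <stdio.h>

static uint32_t eeprom_magic(uint16_t vendor_id, uint16_t device_id)
{
        /* matches hw->vendor_id | (hw->device_id << 16) in the driver */
        return (uint32_t)vendor_id | ((uint32_t)device_id << 16);
}

int main(void)
{
        uint32_t magic = eeprom_magic(0x8086, 0x1572);

        /* a request carrying exactly this value is a normal ethtool access;
         * a different non-zero magic whose high 16 bits still match the
         * device ID is routed to i40e_nvmupd_command() instead
         */
        printf("magic=0x%08x device=0x%04x\n",
               (unsigned)magic, (unsigned)(magic >> 16));
        return 0;
}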
+static int i40e_set_eeprom(struct net_device *netdev, + struct ethtool_eeprom *eeprom, u8 *bytes) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_hw *hw = &np->vsi->back->hw; + struct i40e_pf *pf = np->vsi->back; + int ret_val = 0; + int errno; + u32 magic; + + /* normal ethtool set_eeprom is not supported */ + magic = hw->vendor_id | (hw->device_id << 16); + if (eeprom->magic == magic) + return -EOPNOTSUPP; + + /* check for NVMUpdate access method */ + if (!eeprom->magic || (eeprom->magic >> 16) != hw->device_id) + return -EINVAL; + + if (test_bit(__I40E_RESET_RECOVERY_PENDING, &pf->state) || + test_bit(__I40E_RESET_INTR_RECEIVED, &pf->state)) + return -EBUSY; + + ret_val = i40e_nvmupd_command(hw, (struct i40e_nvm_access *)eeprom, + bytes, &errno); + if (ret_val) + dev_info(&pf->pdev->dev, + "NVMUpdate write failed err=%d status=0x%x\n", + ret_val, hw->aq.asq_last_status); + + return errno; +} + static void i40e_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) { @@@ -1436,6 -1021,24 +1436,6 @@@ static int i40e_set_wol(struct net_devi return 0; }
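The removal just below is not a loss of functionality: i40e_nway_reset was reintroduced earlier in this file (next to i40e_set_settings) because the underlying AQ call grew an argument saying whether the link should be brought back up after the autoneg restart. Side by side, from the two hunks (fragment only, not compilable on its own; hw, link_up and ret come from the surrounding functions):

/* old call, as removed below */
ret = i40e_aq_set_link_restart_an(hw, NULL);

/* new call, as added above -- link_up preserves the current link state */
ret = i40e_aq_set_link_restart_an(hw, link_up, NULL);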
-static int i40e_nway_reset(struct net_device *netdev) -{ - /* restart autonegotiation */ - struct i40e_netdev_priv *np = netdev_priv(netdev); - struct i40e_pf *pf = np->vsi->back; - struct i40e_hw *hw = &pf->hw; - i40e_status ret = 0; - - ret = i40e_aq_set_link_restart_an(hw, NULL); - if (ret) { - netdev_info(netdev, "link restart failed, aq_err=%d\n", - pf->hw.aq.asq_last_status); - return -EIO; - } - - return 0; -} - static int i40e_set_phys_id(struct net_device *netdev, enum ethtool_phys_id_state state) { @@@ -1502,36 -1105,17 +1502,36 @@@ static int i40e_set_coalesce(struct net if (ec->tx_max_coalesced_frames_irq || ec->rx_max_coalesced_frames_irq) vsi->work_limit = ec->tx_max_coalesced_frames_irq;
+ vector = vsi->base_vector; if ((ec->rx_coalesce_usecs >= (I40E_MIN_ITR << 1)) && - (ec->rx_coalesce_usecs <= (I40E_MAX_ITR << 1))) + (ec->rx_coalesce_usecs <= (I40E_MAX_ITR << 1))) { vsi->rx_itr_setting = ec->rx_coalesce_usecs; - else + } else if (ec->rx_coalesce_usecs == 0) { + vsi->rx_itr_setting = ec->rx_coalesce_usecs; + i40e_irq_dynamic_disable(vsi, vector); + if (ec->use_adaptive_rx_coalesce) + netif_info(pf, drv, netdev, + "Rx-usecs=0, need to disable adaptive-Rx for a complete disable\n"); + } else { + netif_info(pf, drv, netdev, + "Invalid value, Rx-usecs range is 0, 8-8160\n"); return -EINVAL; + }
if ((ec->tx_coalesce_usecs >= (I40E_MIN_ITR << 1)) && - (ec->tx_coalesce_usecs <= (I40E_MAX_ITR << 1))) + (ec->tx_coalesce_usecs <= (I40E_MAX_ITR << 1))) { vsi->tx_itr_setting = ec->tx_coalesce_usecs; - else + } else if (ec->tx_coalesce_usecs == 0) { + vsi->tx_itr_setting = ec->tx_coalesce_usecs; + i40e_irq_dynamic_disable(vsi, vector); + if (ec->use_adaptive_tx_coalesce) + netif_info(pf, drv, netdev, + "Tx-usecs=0, need to disable adaptive-Tx for a complete disable\n"); + } else { + netif_info(pf, drv, netdev, + "Invalid value, Tx-usecs range is 0, 8-8160\n"); return -EINVAL; + }
if (ec->use_adaptive_rx_coalesce) vsi->rx_itr_setting |= I40E_ITR_DYNAMIC; @@@ -1543,6 -1127,7 +1543,6 @@@ else vsi->tx_itr_setting &= ~I40E_ITR_DYNAMIC;
- vector = vsi->base_vector; for (i = 0; i < vsi->num_q_vectors; i++, vector++) { q_vector = vsi->q_vectors[i]; q_vector->rx.itr = ITR_TO_REG(vsi->rx_itr_setting); @@@ -1913,7 -1498,7 +1913,7 @@@ static int i40e_update_ethtool_fdir_ent
/* add filter to the list */ if (parent) - hlist_add_after(&parent->fdir_node, &input->fdir_node); + hlist_add_behind(&input->fdir_node, &parent->fdir_node); else hlist_add_head(&input->fdir_node, &pf->fdir_filter_list); @@@ -2146,7 -1731,6 +2146,7 @@@ static int i40e_set_channels(struct net
static const struct ethtool_ops i40e_ethtool_ops = { .get_settings = i40e_get_settings, + .set_settings = i40e_set_settings, .get_drvinfo = i40e_get_drvinfo, .get_regs_len = i40e_get_regs_len, .get_regs = i40e_get_regs, @@@ -2154,13 -1738,11 +2154,13 @@@ .get_link = ethtool_op_get_link, .get_wol = i40e_get_wol, .set_wol = i40e_set_wol, + .set_eeprom = i40e_set_eeprom, .get_eeprom_len = i40e_get_eeprom_len, .get_eeprom = i40e_get_eeprom, .get_ringparam = i40e_get_ringparam, .set_ringparam = i40e_set_ringparam, .get_pauseparam = i40e_get_pauseparam, + .set_pauseparam = i40e_set_pauseparam, .get_msglevel = i40e_get_msglevel, .set_msglevel = i40e_set_msglevel, .get_rxnfc = i40e_get_rxnfc, diff --combined drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index 94a1c07,a6e5bcc..e4100b5 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@@ -1408,6 -1408,7 +1408,6 @@@ static int ixgbe_reg_test(struct ixgbe_ default: *data = 1; return 1; - break; }
/* @@@ -2517,7 -2518,7 +2517,7 @@@ static int ixgbe_update_ethtool_fdir_en
/* add filter to the list */ if (parent) - hlist_add_after(&parent->fdir_node, &input->fdir_node); + hlist_add_behind(&input->fdir_node, &parent->fdir_node); else hlist_add_head(&input->fdir_node, &adapter->fdir_filter_list); @@@ -2865,6 -2866,7 +2865,6 @@@ static int ixgbe_get_ts_info(struct net break; default: return ethtool_op_get_ts_info(dev, info); - break; } return 0; } diff --combined drivers/staging/android/binder.c index 02b0379,0ca9785..4f34dc0 --- a/drivers/staging/android/binder.c +++ b/drivers/staging/android/binder.c @@@ -454,8 -454,9 +454,8 @@@ static size_t binder_buffer_size(struc { if (list_is_last(&buffer->entry, &proc->buffers)) return proc->buffer + proc->buffer_size - (void *)buffer->data; - else - return (size_t)list_entry(buffer->entry.next, - struct binder_buffer, entry) - (size_t)buffer->data; + return (size_t)list_entry(buffer->entry.next, + struct binder_buffer, entry) - (size_t)buffer->data; }
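Several hunks in this merge (the i40e and ixgbe fdir code above, and the lustre hash code further down) switch from hlist_add_after() to hlist_add_behind(). Besides the rename, the argument order is reversed, which is easy to miss when resolving conflicts; the fdir hunks show it directly (fragment only, taken verbatim from the diffs above):

/* old API: existing node first, new node to insert second */
hlist_add_after(&parent->fdir_node, &input->fdir_node);

/* new API: node to insert first, the node it goes behind second */
hlist_add_behind(&input->fdir_node, &parent->fdir_node);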
static void binder_insert_free_buffer(struct binder_proc *proc, @@@ -585,7 -586,6 +585,6 @@@ static int binder_update_page_range(str
for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) { int ret; - struct page **page_array_ptr;
page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE];
@@@ -598,8 -598,7 +597,7 @@@ } tmp_area.addr = page_addr; tmp_area.size = PAGE_SIZE + PAGE_SIZE /* guard page? */; - page_array_ptr = page; - ret = map_vm_area(&tmp_area, PAGE_KERNEL, &page_array_ptr); + ret = map_vm_area(&tmp_area, PAGE_KERNEL, page); if (ret) { pr_err("%d: binder_alloc_buf failed to map page at %p in kernel\n", proc->pid, page_addr); @@@ -1185,7 -1184,6 +1183,7 @@@ static void binder_send_failed_reply(st uint32_t error_code) { struct binder_thread *target_thread; + struct binder_transaction *next;
BUG_ON(t->flags & TF_ONE_WAY); while (1) { @@@ -1213,23 -1211,24 +1211,23 @@@ target_thread->return_error); } return; - } else { - struct binder_transaction *next = t->from_parent; + } + next = t->from_parent;
- binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, - "send failed reply for transaction %d, target dead\n", - t->debug_id); + binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, + "send failed reply for transaction %d, target dead\n", + t->debug_id);
- binder_pop_transaction(target_thread, t); - if (next == NULL) { - binder_debug(BINDER_DEBUG_DEAD_BINDER, - "reply failed, no target thread at root\n"); - return; - } - t = next; + binder_pop_transaction(target_thread, t); + if (next == NULL) { binder_debug(BINDER_DEBUG_DEAD_BINDER, - "reply failed, no target thread -- retry %d\n", - t->debug_id); + "reply failed, no target thread at root\n"); + return; } + t = next; + binder_debug(BINDER_DEBUG_DEAD_BINDER, + "reply failed, no target thread -- retry %d\n", + t->debug_id); } }
@@@ -2593,106 -2592,6 +2591,106 @@@ static unsigned int binder_poll(struct return 0; }
+static int binder_ioctl_write_read(struct file *filp, + unsigned int cmd, unsigned long arg, + struct binder_thread *thread) +{ + int ret = 0; + struct binder_proc *proc = filp->private_data; + unsigned int size = _IOC_SIZE(cmd); + void __user *ubuf = (void __user *)arg; + struct binder_write_read bwr; + + if (size != sizeof(struct binder_write_read)) { + ret = -EINVAL; + goto out; + } + if (copy_from_user(&bwr, ubuf, sizeof(bwr))) { + ret = -EFAULT; + goto out; + } + binder_debug(BINDER_DEBUG_READ_WRITE, + "%d:%d write %lld at %016llx, read %lld at %016llx\n", + proc->pid, thread->pid, + (u64)bwr.write_size, (u64)bwr.write_buffer, + (u64)bwr.read_size, (u64)bwr.read_buffer); + + if (bwr.write_size > 0) { + ret = binder_thread_write(proc, thread, + bwr.write_buffer, + bwr.write_size, + &bwr.write_consumed); + trace_binder_write_done(ret); + if (ret < 0) { + bwr.read_consumed = 0; + if (copy_to_user(ubuf, &bwr, sizeof(bwr))) + ret = -EFAULT; + goto out; + } + } + if (bwr.read_size > 0) { + ret = binder_thread_read(proc, thread, bwr.read_buffer, + bwr.read_size, + &bwr.read_consumed, + filp->f_flags & O_NONBLOCK); + trace_binder_read_done(ret); + if (!list_empty(&proc->todo)) + wake_up_interruptible(&proc->wait); + if (ret < 0) { + if (copy_to_user(ubuf, &bwr, sizeof(bwr))) + ret = -EFAULT; + goto out; + } + } + binder_debug(BINDER_DEBUG_READ_WRITE, + "%d:%d wrote %lld of %lld, read return %lld of %lld\n", + proc->pid, thread->pid, + (u64)bwr.write_consumed, (u64)bwr.write_size, + (u64)bwr.read_consumed, (u64)bwr.read_size); + if (copy_to_user(ubuf, &bwr, sizeof(bwr))) { + ret = -EFAULT; + goto out; + } +out: + return ret; +} + +static int binder_ioctl_set_ctx_mgr(struct file *filp) +{ + int ret = 0; + struct binder_proc *proc = filp->private_data; + kuid_t curr_euid = current_euid(); + + if (binder_context_mgr_node != NULL) { + pr_err("BINDER_SET_CONTEXT_MGR already set\n"); + ret = -EBUSY; + goto out; + } + if (uid_valid(binder_context_mgr_uid)) { + if (!uid_eq(binder_context_mgr_uid, curr_euid)) { + pr_err("BINDER_SET_CONTEXT_MGR bad uid %d != %d\n", + from_kuid(&init_user_ns, curr_euid), + from_kuid(&init_user_ns, + binder_context_mgr_uid)); + ret = -EPERM; + goto out; + } + } else { + binder_context_mgr_uid = curr_euid; + } + binder_context_mgr_node = binder_new_node(proc, 0, 0); + if (binder_context_mgr_node == NULL) { + ret = -ENOMEM; + goto out; + } + binder_context_mgr_node->local_weak_refs++; + binder_context_mgr_node->local_strong_refs++; + binder_context_mgr_node->has_strong_ref = 1; + binder_context_mgr_node->has_weak_ref = 1; +out: + return ret; +} + static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int ret; @@@ -2700,9 -2599,9 +2698,9 @@@ struct binder_thread *thread; unsigned int size = _IOC_SIZE(cmd); void __user *ubuf = (void __user *)arg; - kuid_t curr_euid = current_euid();
- /*pr_info("binder_ioctl: %d:%d %x %lx\n", proc->pid, current->pid, cmd, arg);*/ + /*pr_info("binder_ioctl: %d:%d %x %lx\n", + proc->pid, current->pid, cmd, arg);*/
trace_binder_ioctl(cmd, arg);
@@@ -2718,11 -2617,61 +2716,11 @@@ }
switch (cmd) { - case BINDER_WRITE_READ: { - struct binder_write_read bwr; - - if (size != sizeof(struct binder_write_read)) { - ret = -EINVAL; + case BINDER_WRITE_READ: + ret = binder_ioctl_write_read(filp, cmd, arg, thread); + if (ret) goto err; - } - if (copy_from_user(&bwr, ubuf, sizeof(bwr))) { - ret = -EFAULT; - goto err; - } - binder_debug(BINDER_DEBUG_READ_WRITE, - "%d:%d write %lld at %016llx, read %lld at %016llx\n", - proc->pid, thread->pid, - (u64)bwr.write_size, (u64)bwr.write_buffer, - (u64)bwr.read_size, (u64)bwr.read_buffer); - - if (bwr.write_size > 0) { - ret = binder_thread_write(proc, thread, - bwr.write_buffer, - bwr.write_size, - &bwr.write_consumed); - trace_binder_write_done(ret); - if (ret < 0) { - bwr.read_consumed = 0; - if (copy_to_user(ubuf, &bwr, sizeof(bwr))) - ret = -EFAULT; - goto err; - } - } - if (bwr.read_size > 0) { - ret = binder_thread_read(proc, thread, bwr.read_buffer, - bwr.read_size, - &bwr.read_consumed, - filp->f_flags & O_NONBLOCK); - trace_binder_read_done(ret); - if (!list_empty(&proc->todo)) - wake_up_interruptible(&proc->wait); - if (ret < 0) { - if (copy_to_user(ubuf, &bwr, sizeof(bwr))) - ret = -EFAULT; - goto err; - } - } - binder_debug(BINDER_DEBUG_READ_WRITE, - "%d:%d wrote %lld of %lld, read return %lld of %lld\n", - proc->pid, thread->pid, - (u64)bwr.write_consumed, (u64)bwr.write_size, - (u64)bwr.read_consumed, (u64)bwr.read_size); - if (copy_to_user(ubuf, &bwr, sizeof(bwr))) { - ret = -EFAULT; - goto err; - } break; - } case BINDER_SET_MAX_THREADS: if (copy_from_user(&proc->max_threads, ubuf, sizeof(proc->max_threads))) { ret = -EINVAL; @@@ -2730,9 -2679,31 +2728,9 @@@ } break; case BINDER_SET_CONTEXT_MGR: - if (binder_context_mgr_node != NULL) { - pr_err("BINDER_SET_CONTEXT_MGR already set\n"); - ret = -EBUSY; - goto err; - } - if (uid_valid(binder_context_mgr_uid)) { - if (!uid_eq(binder_context_mgr_uid, curr_euid)) { - pr_err("BINDER_SET_CONTEXT_MGR bad uid %d != %d\n", - from_kuid(&init_user_ns, curr_euid), - from_kuid(&init_user_ns, binder_context_mgr_uid)); - ret = -EPERM; - goto err; - } - } else { - binder_context_mgr_uid = curr_euid; - } - binder_context_mgr_node = binder_new_node(proc, 0, 0); - if (binder_context_mgr_node == NULL) { - ret = -ENOMEM; + ret = binder_ioctl_set_ctx_mgr(filp); + if (ret) goto err; - } - binder_context_mgr_node->local_weak_refs++; - binder_context_mgr_node->local_strong_refs++; - binder_context_mgr_node->has_strong_ref = 1; - binder_context_mgr_node->has_weak_ref = 1; break; case BINDER_THREAD_EXIT: binder_debug(BINDER_DEBUG_THREADS, "%d:%d exit\n", @@@ -2796,15 -2767,9 +2794,15 @@@ static void binder_vma_close(struct vm_ binder_defer_work(proc, BINDER_DEFERRED_PUT_FILES); }
+static int binder_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + static struct vm_operations_struct binder_vm_ops = { .open = binder_vma_open, .close = binder_vma_close, + .fault = binder_vm_fault, };
static int binder_mmap(struct file *filp, struct vm_area_struct *vma) diff --combined drivers/staging/lustre/lustre/libcfs/hash.c index 5dde794,6db7391..8ef1deb --- a/drivers/staging/lustre/lustre/libcfs/hash.c +++ b/drivers/staging/lustre/lustre/libcfs/hash.c @@@ -107,7 -107,7 +107,7 @@@ * table. Also, user can break the iteration by return 1 in callback. */
-#include <linux/libcfs/libcfs.h> +#include "../../include/linux/libcfs/libcfs.h" #include <linux/seq_file.h>
#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 @@@ -351,7 -351,7 +351,7 @@@ cfs_hash_dh_hnode_add(struct cfs_hash * cfs_hash_dhead_t, dh_head);
if (dh->dh_tail != NULL) /* not empty */ - hlist_add_after(dh->dh_tail, hnode); + hlist_add_behind(hnode, dh->dh_tail); else /* empty list */ hlist_add_head(hnode, &dh->dh_head); dh->dh_tail = hnode; @@@ -406,7 -406,7 +406,7 @@@ cfs_hash_dd_hnode_add(struct cfs_hash * cfs_hash_dhead_dep_t, dd_head);
if (dh->dd_tail != NULL) /* not empty */ - hlist_add_after(dh->dd_tail, hnode); + hlist_add_behind(hnode, dh->dd_tail); else /* empty list */ hlist_add_head(hnode, &dh->dd_head); dh->dd_tail = hnode; diff --combined drivers/video/backlight/backlight.c index bddc8b1,19b170d..0ce8823 --- a/drivers/video/backlight/backlight.c +++ b/drivers/video/backlight/backlight.c @@@ -190,8 -190,6 +190,6 @@@ static ssize_t brightness_store(struct } mutex_unlock(&bd->ops_lock);
- backlight_generate_event(bd, BACKLIGHT_UPDATE_SYSFS); - return rc; } static DEVICE_ATTR_RW(brightness); @@@ -223,8 -221,6 +221,8 @@@ static ssize_t actual_brightness_show(s mutex_lock(&bd->ops_lock); if (bd->ops && bd->ops->get_brightness) rc = sprintf(buf, "%d\n", bd->ops->get_brightness(bd)); + else + rc = sprintf(buf, "%d\n", bd->props.brightness); mutex_unlock(&bd->ops_lock);
return rc; diff --combined fs/cifs/cifssmb.c index 7d4361f,c3dc52e..692d79f --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@@ -196,6 -196,10 +196,6 @@@ cifs_reconnect_tcon(struct cifs_tcon *t if (rc) goto out;
- /* - * FIXME: check if wsize needs updated due to negotiated smb buffer - * size shrinking - */ atomic_inc(&tconInfoReconnectCount);
/* tell server Unix caps we support */ @@@ -1513,6 -1517,7 +1513,6 @@@ cifs_readv_receive(struct TCP_Server_In return length;
server->total_read += length; - rdata->bytes = length;
cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n", server->total_read, buflen, data_len); @@@ -1555,18 -1560,12 +1555,18 @@@ cifs_readv_callback(struct mid_q_entry rc); } /* FIXME: should this be counted toward the initiating task? */ - task_io_account_read(rdata->bytes); - cifs_stats_bytes_read(tcon, rdata->bytes); + task_io_account_read(rdata->got_bytes); + cifs_stats_bytes_read(tcon, rdata->got_bytes); break; case MID_REQUEST_SUBMITTED: case MID_RETRY_NEEDED: rdata->result = -EAGAIN; + if (server->sign && rdata->got_bytes) + /* reset bytes number since we can not check a sign */ + rdata->got_bytes = 0; + /* FIXME: should this be counted toward the initiating task? */ + task_io_account_read(rdata->got_bytes); + cifs_stats_bytes_read(tcon, rdata->got_bytes); break; default: rdata->result = -EIO; @@@ -1735,7 -1734,10 +1735,7 @@@ CIFSSMBRead(const unsigned int xid, str
/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ if (*buf) { - if (resp_buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(iov[0].iov_base); - else if (resp_buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base); + free_rsp_buf(resp_buf_type, iov[0].iov_base); } else if (resp_buf_type != CIFS_NO_BUFFER) { /* return buffer to caller to free */ *buf = iov[0].iov_base; @@@ -1900,79 -1902,27 +1900,79 @@@ cifs_writev_requeue(struct cifs_writeda int i, rc; struct inode *inode = wdata->cfile->dentry->d_inode; struct TCP_Server_Info *server; + unsigned int rest_len;
- for (i = 0; i < wdata->nr_pages; i++) { - lock_page(wdata->pages[i]); - clear_page_dirty_for_io(wdata->pages[i]); - } - + server = tlink_tcon(wdata->cfile->tlink)->ses->server; + i = 0; + rest_len = wdata->bytes; do { - server = tlink_tcon(wdata->cfile->tlink)->ses->server; - rc = server->ops->async_writev(wdata, cifs_writedata_release); - } while (rc == -EAGAIN); + struct cifs_writedata *wdata2; + unsigned int j, nr_pages, wsize, tailsz, cur_len; + + wsize = server->ops->wp_retry_size(inode); + if (wsize < rest_len) { + nr_pages = wsize / PAGE_CACHE_SIZE; + if (!nr_pages) { + rc = -ENOTSUPP; + break; + } + cur_len = nr_pages * PAGE_CACHE_SIZE; + tailsz = PAGE_CACHE_SIZE; + } else { + nr_pages = DIV_ROUND_UP(rest_len, PAGE_CACHE_SIZE); + cur_len = rest_len; + tailsz = rest_len - (nr_pages - 1) * PAGE_CACHE_SIZE; + }
- for (i = 0; i < wdata->nr_pages; i++) { - unlock_page(wdata->pages[i]); - if (rc != 0) { - SetPageError(wdata->pages[i]); - end_page_writeback(wdata->pages[i]); - page_cache_release(wdata->pages[i]); + wdata2 = cifs_writedata_alloc(nr_pages, cifs_writev_complete); + if (!wdata2) { + rc = -ENOMEM; + break; } - }
- mapping_set_error(inode->i_mapping, rc); + for (j = 0; j < nr_pages; j++) { + wdata2->pages[j] = wdata->pages[i + j]; + lock_page(wdata2->pages[j]); + clear_page_dirty_for_io(wdata2->pages[j]); + } + + wdata2->sync_mode = wdata->sync_mode; + wdata2->nr_pages = nr_pages; + wdata2->offset = page_offset(wdata2->pages[0]); + wdata2->pagesz = PAGE_CACHE_SIZE; + wdata2->tailsz = tailsz; + wdata2->bytes = cur_len; + + wdata2->cfile = find_writable_file(CIFS_I(inode), false); + if (!wdata2->cfile) { + cifs_dbg(VFS, "No writable handles for inode\n"); + rc = -EBADF; + break; + } + wdata2->pid = wdata2->cfile->pid; + rc = server->ops->async_writev(wdata2, cifs_writedata_release); + + for (j = 0; j < nr_pages; j++) { + unlock_page(wdata2->pages[j]); + if (rc != 0 && rc != -EAGAIN) { + SetPageError(wdata2->pages[j]); + end_page_writeback(wdata2->pages[j]); + page_cache_release(wdata2->pages[j]); + } + } + + if (rc) { + kref_put(&wdata2->refcount, cifs_writedata_release); + if (rc == -EAGAIN) + continue; + mapping_set_error(inode->i_mapping, rc); + break; + } + + rest_len -= cur_len; + i += nr_pages; + } while (i < wdata->nr_pages); + kref_put(&wdata->refcount, cifs_writedata_release); }
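The rewritten cifs_writev_requeue above no longer resends the whole failed request in one go; it re-chunks it so each retry stays within the server's ->wp_retry_size() limit. A standalone sketch of just the page-count/length/tail-size arithmetic, with EXAMPLE_PAGE_SIZE standing in for PAGE_CACHE_SIZE and split_chunk invented for illustration:

#include <stdio.h>

#define EXAMPLE_PAGE_SIZE 4096u

static void split_chunk(unsigned int rest_len, unsigned int wsize,
                        unsigned int *nr_pages, unsigned int *cur_len,
                        unsigned int *tailsz)
{
        if (wsize < rest_len) {
                /* limited by what the server will accept per request */
                *nr_pages = wsize / EXAMPLE_PAGE_SIZE;
                *cur_len = *nr_pages * EXAMPLE_PAGE_SIZE;
                *tailsz = EXAMPLE_PAGE_SIZE;
        } else {
                /* remainder fits in one request; last page may be partial */
                *nr_pages = (rest_len + EXAMPLE_PAGE_SIZE - 1) / EXAMPLE_PAGE_SIZE;
                *cur_len = rest_len;
                *tailsz = rest_len - (*nr_pages - 1) * EXAMPLE_PAGE_SIZE;
        }
}

int main(void)
{
        unsigned int nr, len, tail;

        /* 5 full pages plus 100 bytes left, server takes 3 pages at a time */
        split_chunk(5 * EXAMPLE_PAGE_SIZE + 100, 3 * EXAMPLE_PAGE_SIZE,
                    &nr, &len, &tail);
        printf("pages=%u len=%u tail=%u\n", nr, len, tail); /* 3 12288 4096 */
        return 0;
}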
@@@ -2253,7 -2203,10 +2253,7 @@@ CIFSSMBWrite2(const unsigned int xid, s }
/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ - if (resp_buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(iov[0].iov_base); - else if (resp_buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base); + free_rsp_buf(resp_buf_type, iov[0].iov_base);
/* Note: On -EAGAIN error only caller can retry on handle based calls since file handle passed in no longer valid */ @@@ -2477,14 -2430,14 +2477,14 @@@ CIFSSMBPosixLock(const unsigned int xid } parm_data = (struct cifs_posix_lock *) ((char *)&pSMBr->hdr.Protocol + data_offset); - if (parm_data->lock_type == __constant_cpu_to_le16(CIFS_UNLCK)) + if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK)) pLockData->fl_type = F_UNLCK; else { if (parm_data->lock_type == - __constant_cpu_to_le16(CIFS_RDLCK)) + cpu_to_le16(CIFS_RDLCK)) pLockData->fl_type = F_RDLCK; else if (parm_data->lock_type == - __constant_cpu_to_le16(CIFS_WRLCK)) + cpu_to_le16(CIFS_WRLCK)) pLockData->fl_type = F_WRLCK;
pLockData->fl_start = le64_to_cpu(parm_data->start); @@@ -2498,7 -2451,10 +2498,7 @@@ plk_err_exit if (pSMB) cifs_small_buf_release(pSMB);
- if (resp_buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(iov[0].iov_base); - else if (resp_buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base); + free_rsp_buf(resp_buf_type, iov[0].iov_base);
/* Note: On -EAGAIN error only caller can retry on handle based calls since file handle passed in no longer valid */ @@@ -3276,25 -3232,25 +3276,25 @@@ CIFSSMB_set_compression(const unsigned pSMB->compression_state = cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
pSMB->TotalParameterCount = 0; - pSMB->TotalDataCount = __constant_cpu_to_le32(2); + pSMB->TotalDataCount = cpu_to_le32(2); pSMB->MaxParameterCount = 0; pSMB->MaxDataCount = 0; pSMB->MaxSetupCount = 4; pSMB->Reserved = 0; pSMB->ParameterOffset = 0; - pSMB->DataCount = __constant_cpu_to_le32(2); + pSMB->DataCount = cpu_to_le32(2); pSMB->DataOffset = cpu_to_le32(offsetof(struct smb_com_transaction_compr_ioctl_req, compression_state) - 4); /* 84 */ pSMB->SetupCount = 4; - pSMB->SubCommand = __constant_cpu_to_le16(NT_TRANSACT_IOCTL); + pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL); pSMB->ParameterCount = 0; - pSMB->FunctionCode = __constant_cpu_to_le32(FSCTL_SET_COMPRESSION); + pSMB->FunctionCode = cpu_to_le32(FSCTL_SET_COMPRESSION); pSMB->IsFsctl = 1; /* FSCTL */ pSMB->IsRootFlag = 0; pSMB->Fid = fid; /* file handle always le */ /* 3 byte pad, followed by 2 byte compress state */ - pSMB->ByteCount = __constant_cpu_to_le16(5); + pSMB->ByteCount = cpu_to_le16(5); inc_rfc1001_len(pSMB, 5);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@@ -3430,10 -3386,10 +3430,10 @@@ static __u16 ACL_to_cifs_posix(char *pa cifs_acl->version = cpu_to_le16(1); if (acl_type == ACL_TYPE_ACCESS) { cifs_acl->access_entry_count = cpu_to_le16(count); - cifs_acl->default_entry_count = __constant_cpu_to_le16(0xFFFF); + cifs_acl->default_entry_count = cpu_to_le16(0xFFFF); } else if (acl_type == ACL_TYPE_DEFAULT) { cifs_acl->default_entry_count = cpu_to_le16(count); - cifs_acl->access_entry_count = __constant_cpu_to_le16(0xFFFF); + cifs_acl->access_entry_count = cpu_to_le16(0xFFFF); } else { cifs_dbg(FYI, "unknown ACL type %d\n", acl_type); return 0; @@@ -3882,7 -3838,10 +3882,7 @@@ CIFSSMBGetCIFSACL(const unsigned int xi } } qsec_out: - if (buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(iov[0].iov_base); - else if (buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base); + free_rsp_buf(buf_type, iov[0].iov_base); /* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ return rc; } diff --combined fs/cifs/file.c index 01a6339,3c1967c..03558d4 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@@ -1058,7 -1058,7 +1058,7 @@@ cifs_push_mandatory_locks(struct cifsFi
max_num = (max_buf - sizeof(struct smb_hdr)) / sizeof(LOCKING_ANDX_RANGE); - buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); + buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); if (!buf) { free_xid(xid); return -ENOMEM; @@@ -1393,7 -1393,7 +1393,7 @@@ cifs_unlock_range(struct cifsFileInfo *
max_num = (max_buf - sizeof(struct smb_hdr)) / sizeof(LOCKING_ANDX_RANGE); - buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); + buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); if (!buf) return -ENOMEM;
@@@ -1670,8 -1670,8 +1670,8 @@@ cifs_write(struct cifsFileInfo *open_fi break; }
- len = min((size_t)cifs_sb->wsize, - write_size - total_written); + len = min(server->ops->wp_retry_size(dentry->d_inode), + (unsigned int)write_size - total_written); /* iov[0] is reserved for smb header */ iov[1].iov_base = (char *)write_data + total_written; iov[1].iov_len = len; @@@ -1878,178 -1878,15 +1878,178 @@@ static int cifs_partialpagewrite(struc return rc; }
+static struct cifs_writedata * +wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping, + pgoff_t end, pgoff_t *index, + unsigned int *found_pages) +{ + unsigned int nr_pages; + struct page **pages; + struct cifs_writedata *wdata; + + wdata = cifs_writedata_alloc((unsigned int)tofind, + cifs_writev_complete); + if (!wdata) + return NULL; + + /* + * find_get_pages_tag seems to return a max of 256 on each + * iteration, so we must call it several times in order to + * fill the array or the wsize is effectively limited to + * 256 * PAGE_CACHE_SIZE. + */ + *found_pages = 0; + pages = wdata->pages; + do { + nr_pages = find_get_pages_tag(mapping, index, + PAGECACHE_TAG_DIRTY, tofind, + pages); + *found_pages += nr_pages; + tofind -= nr_pages; + pages += nr_pages; + } while (nr_pages && tofind && *index <= end); + + return wdata; +} + +static unsigned int +wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages, + struct address_space *mapping, + struct writeback_control *wbc, + pgoff_t end, pgoff_t *index, pgoff_t *next, bool *done) +{ + unsigned int nr_pages = 0, i; + struct page *page; + + for (i = 0; i < found_pages; i++) { + page = wdata->pages[i]; + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or + * invalidated (changing page->mapping to NULL), or even + * swizzled back from swapper_space to tmpfs file + * mapping + */ + + if (nr_pages == 0) + lock_page(page); + else if (!trylock_page(page)) + break; + + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + break; + } + + if (!wbc->range_cyclic && page->index > end) { + *done = true; + unlock_page(page); + break; + } + + if (*next && (page->index != *next)) { + /* Not next consecutive page */ + unlock_page(page); + break; + } + + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + + if (PageWriteback(page) || + !clear_page_dirty_for_io(page)) { + unlock_page(page); + break; + } + + /* + * This actually clears the dirty bit in the radix tree. + * See cifs_writepage() for more commentary. 
+ */ + set_page_writeback(page); + if (page_offset(page) >= i_size_read(mapping->host)) { + *done = true; + unlock_page(page); + end_page_writeback(page); + break; + } + + wdata->pages[i] = page; + *next = page->index + 1; + ++nr_pages; + } + + /* reset index to refind any pages skipped */ + if (nr_pages == 0) + *index = wdata->pages[0]->index + 1; + + /* put any pages we aren't going to use */ + for (i = nr_pages; i < found_pages; i++) { + page_cache_release(wdata->pages[i]); + wdata->pages[i] = NULL; + } + + return nr_pages; +} + +static int +wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages, + struct address_space *mapping, struct writeback_control *wbc) +{ + int rc = 0; + struct TCP_Server_Info *server; + unsigned int i; + + wdata->sync_mode = wbc->sync_mode; + wdata->nr_pages = nr_pages; + wdata->offset = page_offset(wdata->pages[0]); + wdata->pagesz = PAGE_CACHE_SIZE; + wdata->tailsz = min(i_size_read(mapping->host) - + page_offset(wdata->pages[nr_pages - 1]), + (loff_t)PAGE_CACHE_SIZE); + wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + wdata->tailsz; + + if (wdata->cfile != NULL) + cifsFileInfo_put(wdata->cfile); + wdata->cfile = find_writable_file(CIFS_I(mapping->host), false); + if (!wdata->cfile) { + cifs_dbg(VFS, "No writable handles for inode\n"); + rc = -EBADF; + } else { + wdata->pid = wdata->cfile->pid; + server = tlink_tcon(wdata->cfile->tlink)->ses->server; + rc = server->ops->async_writev(wdata, cifs_writedata_release); + } + + for (i = 0; i < nr_pages; ++i) + unlock_page(wdata->pages[i]); + + if (!rc) + return rc; + + /* send failure -- clean up the mess */ + for (i = 0; i < nr_pages; ++i) { + if (rc == -EAGAIN) + redirty_page_for_writepage(wbc, wdata->pages[i]); + else + SetPageError(wdata->pages[i]); + end_page_writeback(wdata->pages[i]); + page_cache_release(wdata->pages[i]); + } + if (rc != -EAGAIN) + mapping_set_error(mapping, rc); + + return rc; +} + static int cifs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct cifs_sb_info *cifs_sb = CIFS_SB(mapping->host->i_sb); + struct TCP_Server_Info *server; bool done = false, scanned = false, range_whole = false; pgoff_t end, index; struct cifs_writedata *wdata; - struct TCP_Server_Info *server; - struct page *page; int rc = 0;
/* @@@ -2069,55 -1906,165 +2069,55 @@@ range_whole = true; scanned = true; } + server = cifs_sb_master_tcon(cifs_sb)->ses->server; retry: while (!done && index <= end) { - unsigned int i, nr_pages, found_pages; - pgoff_t next = 0, tofind; - struct page **pages; + unsigned int nr_pages, found_pages, wsize, credits; + pgoff_t next = 0, tofind, saved_index = index;
- tofind = min((cifs_sb->wsize / PAGE_CACHE_SIZE) - 1, - end - index) + 1; + rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize, + &wsize, &credits); + if (rc) + break;
- wdata = cifs_writedata_alloc((unsigned int)tofind, - cifs_writev_complete); + tofind = min((wsize / PAGE_CACHE_SIZE) - 1, end - index) + 1; + + wdata = wdata_alloc_and_fillpages(tofind, mapping, end, &index, + &found_pages); if (!wdata) { rc = -ENOMEM; + add_credits_and_wake_if(server, credits, 0); break; }
- /* - * find_get_pages_tag seems to return a max of 256 on each - * iteration, so we must call it several times in order to - * fill the array or the wsize is effectively limited to - * 256 * PAGE_CACHE_SIZE. - */ - found_pages = 0; - pages = wdata->pages; - do { - nr_pages = find_get_pages_tag(mapping, &index, - PAGECACHE_TAG_DIRTY, - tofind, pages); - found_pages += nr_pages; - tofind -= nr_pages; - pages += nr_pages; - } while (nr_pages && tofind && index <= end); - if (found_pages == 0) { kref_put(&wdata->refcount, cifs_writedata_release); + add_credits_and_wake_if(server, credits, 0); break; }
- nr_pages = 0; - for (i = 0; i < found_pages; i++) { - page = wdata->pages[i]; - /* - * At this point we hold neither mapping->tree_lock nor - * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even - * swizzled back from swapper_space to tmpfs file - * mapping - */ - - if (nr_pages == 0) - lock_page(page); - else if (!trylock_page(page)) - break; - - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - break; - } - - if (!wbc->range_cyclic && page->index > end) { - done = true; - unlock_page(page); - break; - } - - if (next && (page->index != next)) { - /* Not next consecutive page */ - unlock_page(page); - break; - } - - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - - if (PageWriteback(page) || - !clear_page_dirty_for_io(page)) { - unlock_page(page); - break; - } - - /* - * This actually clears the dirty bit in the radix tree. - * See cifs_writepage() for more commentary. - */ - set_page_writeback(page); - - if (page_offset(page) >= i_size_read(mapping->host)) { - done = true; - unlock_page(page); - end_page_writeback(page); - break; - } - - wdata->pages[i] = page; - next = page->index + 1; - ++nr_pages; - } - - /* reset index to refind any pages skipped */ - if (nr_pages == 0) - index = wdata->pages[0]->index + 1; - - /* put any pages we aren't going to use */ - for (i = nr_pages; i < found_pages; i++) { - page_cache_release(wdata->pages[i]); - wdata->pages[i] = NULL; - } + nr_pages = wdata_prepare_pages(wdata, found_pages, mapping, wbc, + end, &index, &next, &done);
/* nothing to write? */ if (nr_pages == 0) { kref_put(&wdata->refcount, cifs_writedata_release); + add_credits_and_wake_if(server, credits, 0); continue; }
- wdata->sync_mode = wbc->sync_mode; - wdata->nr_pages = nr_pages; - wdata->offset = page_offset(wdata->pages[0]); - wdata->pagesz = PAGE_CACHE_SIZE; - wdata->tailsz = - min(i_size_read(mapping->host) - - page_offset(wdata->pages[nr_pages - 1]), - (loff_t)PAGE_CACHE_SIZE); - wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + - wdata->tailsz; + wdata->credits = credits;
- do { - if (wdata->cfile != NULL) - cifsFileInfo_put(wdata->cfile); - wdata->cfile = find_writable_file(CIFS_I(mapping->host), - false); - if (!wdata->cfile) { - cifs_dbg(VFS, "No writable handles for inode\n"); - rc = -EBADF; - break; - } - wdata->pid = wdata->cfile->pid; - server = tlink_tcon(wdata->cfile->tlink)->ses->server; - rc = server->ops->async_writev(wdata, - cifs_writedata_release); - } while (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN); + rc = wdata_send_pages(wdata, nr_pages, mapping, wbc); + if (rc) + add_credits_and_wake_if(server, wdata->credits, 0);
- for (i = 0; i < nr_pages; ++i) - unlock_page(wdata->pages[i]); + kref_put(&wdata->refcount, cifs_writedata_release);
- /* send failure -- clean up the mess */ - if (rc != 0) { - for (i = 0; i < nr_pages; ++i) { - if (rc == -EAGAIN) - redirty_page_for_writepage(wbc, - wdata->pages[i]); - else - SetPageError(wdata->pages[i]); - end_page_writeback(wdata->pages[i]); - page_cache_release(wdata->pages[i]); - } - if (rc != -EAGAIN) - mapping_set_error(mapping, rc); + if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN) { + index = saved_index; + continue; } - kref_put(&wdata->refcount, cifs_writedata_release);
wbc->nr_to_write -= nr_pages; if (wbc->nr_to_write <= 0) @@@ -2415,106 -2362,125 +2415,106 @@@ cifs_uncached_writev_complete(struct wo kref_put(&wdata->refcount, cifs_uncached_writedata_release); }
-/* attempt to send write to server, retry on any -EAGAIN errors */ static int -cifs_uncached_retry_writev(struct cifs_writedata *wdata) +wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from, + size_t *len, unsigned long nr_pages) { - int rc; - struct TCP_Server_Info *server; + int rc = 0; + size_t save_len, copied, bytes, cur_len = *len; + unsigned long i;
- server = tlink_tcon(wdata->cfile->tlink)->ses->server; + save_len = cur_len; + for (i = 0; i < nr_pages; i++) { + bytes = min_t(const size_t, cur_len, PAGE_SIZE); + copied = copy_page_from_iter(wdata->pages[i], 0, bytes, from); + cur_len -= copied; + /* + * If we didn't copy as much as we expected, then that + * may mean we trod into an unmapped area. Stop copying + * at that point. On the next pass through the big + * loop, we'll likely end up getting a zero-length + * write and bailing out of it. + */ + if (copied < bytes) + break; + } + cur_len = save_len - cur_len; + *len = cur_len;
- do { - if (wdata->cfile->invalidHandle) { - rc = cifs_reopen_file(wdata->cfile, false); - if (rc != 0) - continue; - } - rc = server->ops->async_writev(wdata, - cifs_uncached_writedata_release); - } while (rc == -EAGAIN); + /* + * If we have no data to send, then that probably means that + * the copy above failed altogether. That's most likely because + * the address in the iovec was bogus. Return -EFAULT and let + * the caller free anything we allocated and bail out. + */ + if (!cur_len) + return -EFAULT;
+ /* + * i + 1 now represents the number of pages we actually used in + * the copy phase above. Bring nr_pages down to that, and free + * any pages that we didn't use. + */ + for ( ; nr_pages > i + 1; nr_pages--) + put_page(wdata->pages[nr_pages - 1]); return rc; }
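The cifs changes in this merge share one more pattern worth calling out: every request path that used to size itself directly from cifs_sb->wsize/rsize (cifs_writepages above, cifs_write_from_iter just below, and the uncached read path further down) now first asks the server for credits and returns them with add_credits_and_wake_if() on any early failure. A skeletal fragment of the write-side shape (not compilable on its own; all names are taken from the surrounding hunks):

unsigned int wsize, credits;
int rc;

rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize, &wsize, &credits);
if (rc)
        return rc;

wdata = cifs_writedata_alloc(nr_pages, cifs_uncached_writev_complete);
if (!wdata) {
        /* nothing was sent, so hand the credits straight back */
        add_credits_and_wake_if(server, credits, 0);
        return -ENOMEM;
}

/* otherwise the credits travel with the request until it completes */
wdata->credits = credits;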
-static ssize_t -cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset) +static int +cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, + struct cifsFileInfo *open_file, + struct cifs_sb_info *cifs_sb, struct list_head *wdata_list) { + int rc = 0; + size_t cur_len; unsigned long nr_pages, i; - size_t bytes, copied, len, cur_len; - ssize_t total_written = 0; - loff_t offset; - struct cifsFileInfo *open_file; - struct cifs_tcon *tcon; - struct cifs_sb_info *cifs_sb; - struct cifs_writedata *wdata, *tmp; - struct list_head wdata_list; - int rc; + struct cifs_writedata *wdata; + struct iov_iter saved_from; + loff_t saved_offset = offset; pid_t pid; - - len = iov_iter_count(from); - rc = generic_write_checks(file, poffset, &len, 0); - if (rc) - return rc; - - if (!len) - return 0; - - iov_iter_truncate(from, len); - - INIT_LIST_HEAD(&wdata_list); - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - open_file = file->private_data; - tcon = tlink_tcon(open_file->tlink); - - if (!tcon->ses->server->ops->async_writev) - return -ENOSYS; - - offset = *poffset; + struct TCP_Server_Info *server;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; else pid = current->tgid;
+ server = tlink_tcon(open_file->tlink)->ses->server; + memcpy(&saved_from, from, sizeof(struct iov_iter)); + do { - size_t save_len; + unsigned int wsize, credits; + + rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize, + &wsize, &credits); + if (rc) + break;
- nr_pages = get_numpages(cifs_sb->wsize, len, &cur_len); + nr_pages = get_numpages(wsize, len, &cur_len); wdata = cifs_writedata_alloc(nr_pages, cifs_uncached_writev_complete); if (!wdata) { rc = -ENOMEM; + add_credits_and_wake_if(server, credits, 0); break; }
rc = cifs_write_allocate_pages(wdata->pages, nr_pages); if (rc) { kfree(wdata); + add_credits_and_wake_if(server, credits, 0); break; }
- save_len = cur_len; - for (i = 0; i < nr_pages; i++) { - bytes = min_t(size_t, cur_len, PAGE_SIZE); - copied = copy_page_from_iter(wdata->pages[i], 0, bytes, - from); - cur_len -= copied; - /* - * If we didn't copy as much as we expected, then that - * may mean we trod into an unmapped area. Stop copying - * at that point. On the next pass through the big - * loop, we'll likely end up getting a zero-length - * write and bailing out of it. - */ - if (copied < bytes) - break; - } - cur_len = save_len - cur_len; - - /* - * If we have no data to send, then that probably means that - * the copy above failed altogether. That's most likely because - * the address in the iovec was bogus. Set the rc to -EFAULT, - * free anything we allocated and bail out. - */ - if (!cur_len) { + rc = wdata_fill_from_iovec(wdata, from, &cur_len, nr_pages); + if (rc) { for (i = 0; i < nr_pages; i++) put_page(wdata->pages[i]); kfree(wdata); - rc = -EFAULT; + add_credits_and_wake_if(server, credits, 0); break; }
- /* - * i + 1 now represents the number of pages we actually used in - * the copy phase above. Bring nr_pages down to that, and free - * any pages that we didn't use. - */ - for ( ; nr_pages > i + 1; nr_pages--) - put_page(wdata->pages[nr_pages - 1]); - wdata->sync_mode = WB_SYNC_ALL; wdata->nr_pages = nr_pages; wdata->offset = (__u64)offset; @@@ -2523,71 -2489,18 +2523,71 @@@ wdata->bytes = cur_len; wdata->pagesz = PAGE_SIZE; wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE); - rc = cifs_uncached_retry_writev(wdata); + wdata->credits = credits; + + if (!wdata->cfile->invalidHandle || + !cifs_reopen_file(wdata->cfile, false)) + rc = server->ops->async_writev(wdata, + cifs_uncached_writedata_release); if (rc) { + add_credits_and_wake_if(server, wdata->credits, 0); kref_put(&wdata->refcount, cifs_uncached_writedata_release); + if (rc == -EAGAIN) { + memcpy(from, &saved_from, + sizeof(struct iov_iter)); + iov_iter_advance(from, offset - saved_offset); + continue; + } break; }
- list_add_tail(&wdata->list, &wdata_list); + list_add_tail(&wdata->list, wdata_list); offset += cur_len; len -= cur_len; } while (len > 0);
+ return rc; +} + +static ssize_t +cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset) +{ + size_t len; + ssize_t total_written = 0; + loff_t offset; + struct cifsFileInfo *open_file; + struct cifs_tcon *tcon; + struct cifs_sb_info *cifs_sb; + struct cifs_writedata *wdata, *tmp; + struct list_head wdata_list; + struct iov_iter saved_from; + int rc; + + len = iov_iter_count(from); + rc = generic_write_checks(file, poffset, &len, 0); + if (rc) + return rc; + + if (!len) + return 0; + + iov_iter_truncate(from, len); + + INIT_LIST_HEAD(&wdata_list); + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + open_file = file->private_data; + tcon = tlink_tcon(open_file->tlink); + + if (!tcon->ses->server->ops->async_writev) + return -ENOSYS; + + offset = *poffset; + memcpy(&saved_from, from, sizeof(struct iov_iter)); + + rc = cifs_write_from_iter(*poffset, len, from, open_file, cifs_sb, + &wdata_list); + /* * If at least one write was successfully sent, then discard any rc * value from the later writes. If the other write succeeds, then @@@ -2616,25 -2529,7 +2616,25 @@@ restart_loop
/* resend call if it's a retryable error */ if (rc == -EAGAIN) { - rc = cifs_uncached_retry_writev(wdata); + struct list_head tmp_list; + struct iov_iter tmp_from; + + INIT_LIST_HEAD(&tmp_list); + list_del_init(&wdata->list); + + memcpy(&tmp_from, &saved_from, + sizeof(struct iov_iter)); + iov_iter_advance(&tmp_from, + wdata->offset - *poffset); + + rc = cifs_write_from_iter(wdata->offset, + wdata->bytes, &tmp_from, + open_file, cifs_sb, &tmp_list); + + list_splice(&tmp_list, &wdata_list); + + kref_put(&wdata->refcount, + cifs_uncached_writedata_release); goto restart_loop; } } @@@ -2827,6 -2722,26 +2827,6 @@@ cifs_uncached_readdata_release(struct k cifs_readdata_release(refcount); }
-static int -cifs_retry_async_readv(struct cifs_readdata *rdata) -{ - int rc; - struct TCP_Server_Info *server; - - server = tlink_tcon(rdata->cfile->tlink)->ses->server; - - do { - if (rdata->cfile->invalidHandle) { - rc = cifs_reopen_file(rdata->cfile, true); - if (rc != 0) - continue; - } - rc = server->ops->async_readv(rdata); - } while (rc == -EAGAIN); - - return rc; -} - /** * cifs_readdata_to_iov - copy data from pages in response to an iovec * @rdata: the readdata response with list of pages holding data @@@ -2839,7 -2754,7 +2839,7 @@@ static int cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) { - size_t remaining = rdata->bytes; + size_t remaining = rdata->got_bytes; unsigned int i;
for (i = 0; i < rdata->nr_pages; i++) { @@@ -2867,12 -2782,11 +2867,12 @@@ static in cifs_uncached_read_into_pages(struct TCP_Server_Info *server, struct cifs_readdata *rdata, unsigned int len) { - int total_read = 0, result = 0; + int result = 0; unsigned int i; unsigned int nr_pages = rdata->nr_pages; struct kvec iov;
+ rdata->got_bytes = 0; rdata->tailsz = PAGE_SIZE; for (i = 0; i < nr_pages; i++) { struct page *page = rdata->pages[i]; @@@ -2906,45 -2820,55 +2906,45 @@@ if (result < 0) break;
- total_read += result; + rdata->got_bytes += result; }
- return total_read > 0 ? total_read : result; + return rdata->got_bytes > 0 && result != -ECONNABORTED ? + rdata->got_bytes : result; }
-ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) +static int +cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, + struct cifs_sb_info *cifs_sb, struct list_head *rdata_list) { - struct file *file = iocb->ki_filp; - ssize_t rc; - size_t len, cur_len; - ssize_t total_read = 0; - loff_t offset = iocb->ki_pos; - unsigned int npages; - struct cifs_sb_info *cifs_sb; - struct cifs_tcon *tcon; - struct cifsFileInfo *open_file; - struct cifs_readdata *rdata, *tmp; - struct list_head rdata_list; + struct cifs_readdata *rdata; + unsigned int npages, rsize, credits; + size_t cur_len; + int rc; pid_t pid; + struct TCP_Server_Info *server;
- len = iov_iter_count(to); - if (!len) - return 0; - - INIT_LIST_HEAD(&rdata_list); - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - open_file = file->private_data; - tcon = tlink_tcon(open_file->tlink); - - if (!tcon->ses->server->ops->async_readv) - return -ENOSYS; + server = tlink_tcon(open_file->tlink)->ses->server;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; else pid = current->tgid;
- if ((file->f_flags & O_ACCMODE) == O_WRONLY) - cifs_dbg(FYI, "attempting read on write only file instance\n"); - do { - cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize); + rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize, + &rsize, &credits); + if (rc) + break; + + cur_len = min_t(const size_t, len, rsize); npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
/* allocate a readdata struct */ rdata = cifs_readdata_alloc(npages, cifs_uncached_readv_complete); if (!rdata) { + add_credits_and_wake_if(server, credits, 0); rc = -ENOMEM; break; } @@@ -2960,113 -2884,44 +2960,113 @@@ rdata->pid = pid; rdata->pagesz = PAGE_SIZE; rdata->read_into_pages = cifs_uncached_read_into_pages; + rdata->credits = credits;
- rc = cifs_retry_async_readv(rdata); + if (!rdata->cfile->invalidHandle || + !cifs_reopen_file(rdata->cfile, true)) + rc = server->ops->async_readv(rdata); error: if (rc) { + add_credits_and_wake_if(server, rdata->credits, 0); kref_put(&rdata->refcount, cifs_uncached_readdata_release); + if (rc == -EAGAIN) + continue; break; }
- list_add_tail(&rdata->list, &rdata_list); + list_add_tail(&rdata->list, rdata_list); offset += cur_len; len -= cur_len; } while (len > 0);
+ return rc; +} + +ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + ssize_t rc; + size_t len; + ssize_t total_read = 0; + loff_t offset = iocb->ki_pos; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + struct cifsFileInfo *open_file; + struct cifs_readdata *rdata, *tmp; + struct list_head rdata_list; + + len = iov_iter_count(to); + if (!len) + return 0; + + INIT_LIST_HEAD(&rdata_list); + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + open_file = file->private_data; + tcon = tlink_tcon(open_file->tlink); + + if (!tcon->ses->server->ops->async_readv) + return -ENOSYS; + + if ((file->f_flags & O_ACCMODE) == O_WRONLY) + cifs_dbg(FYI, "attempting read on write only file instance\n"); + + rc = cifs_send_async_read(offset, len, open_file, cifs_sb, &rdata_list); + /* if at least one read request send succeeded, then reset rc */ if (!list_empty(&rdata_list)) rc = 0;
len = iov_iter_count(to); /* the loop below should proceed in the order of increasing offsets */ +again: list_for_each_entry_safe(rdata, tmp, &rdata_list, list) { - again: if (!rc) { /* FIXME: freezable sleep too? */ rc = wait_for_completion_killable(&rdata->done); if (rc) rc = -EINTR; - else if (rdata->result) { - rc = rdata->result; + else if (rdata->result == -EAGAIN) { /* resend call if it's a retryable error */ - if (rc == -EAGAIN) { - rc = cifs_retry_async_readv(rdata); - goto again; + struct list_head tmp_list; + unsigned int got_bytes = rdata->got_bytes; + + list_del_init(&rdata->list); + INIT_LIST_HEAD(&tmp_list); + + /* + * Got a part of data and then reconnect has + * happened -- fill the buffer and continue + * reading. + */ + if (got_bytes && got_bytes < rdata->bytes) { + rc = cifs_readdata_to_iov(rdata, to); + if (rc) { + kref_put(&rdata->refcount, + cifs_uncached_readdata_release); + continue; + } } - } else { + + rc = cifs_send_async_read( + rdata->offset + got_bytes, + rdata->bytes - got_bytes, + rdata->cfile, cifs_sb, + &tmp_list); + + list_splice(&tmp_list, &rdata_list); + + kref_put(&rdata->refcount, + cifs_uncached_readdata_release); + goto again; + } else if (rdata->result) + rc = rdata->result; + else rc = cifs_readdata_to_iov(rdata, to); - }
+ /* if there was a short read -- discard anything left */ + if (rdata->got_bytes && rdata->got_bytes < rdata->bytes) + rc = -ENODATA; } list_del_init(&rdata->list); kref_put(&rdata->refcount, cifs_uncached_readdata_release); @@@ -3175,19 -3030,18 +3175,19 @@@ cifs_read(struct file *file, char *read
for (total_read = 0, cur_offset = read_data; read_size > total_read; total_read += bytes_read, cur_offset += bytes_read) { - current_read_size = min_t(uint, read_size - total_read, rsize); - /* - * For windows me and 9x we do not want to request more than it - * negotiated since it will refuse the read then. - */ - if ((tcon->ses) && !(tcon->ses->capabilities & + do { + current_read_size = min_t(uint, read_size - total_read, + rsize); + /* + * For windows me and 9x we do not want to request more + * than it negotiated since it will refuse the read + * then. + */ + if ((tcon->ses) && !(tcon->ses->capabilities & tcon->ses->server->vals->cap_large_files)) { - current_read_size = min_t(uint, current_read_size, - CIFSMaxBufSize); - } - rc = -EAGAIN; - while (rc == -EAGAIN) { + current_read_size = min_t(uint, + current_read_size, CIFSMaxBufSize); + } if (open_file->invalidHandle) { rc = cifs_reopen_file(open_file, true); if (rc != 0) @@@ -3200,8 -3054,7 +3200,8 @@@ rc = server->ops->sync_read(xid, open_file, &io_parms, &bytes_read, &cur_offset, &buf_type); - } + } while (rc == -EAGAIN); + if (rc || (bytes_read == 0)) { if (total_read) { break; @@@ -3280,30 -3133,25 +3280,30 @@@ int cifs_file_mmap(struct file *file, s static void cifs_readv_complete(struct work_struct *work) { - unsigned int i; + unsigned int i, got_bytes; struct cifs_readdata *rdata = container_of(work, struct cifs_readdata, work);
+ got_bytes = rdata->got_bytes; for (i = 0; i < rdata->nr_pages; i++) { struct page *page = rdata->pages[i];
lru_cache_add_file(page);
- if (rdata->result == 0) { + if (rdata->result == 0 || + (rdata->result == -EAGAIN && got_bytes)) { flush_dcache_page(page); SetPageUptodate(page); }
unlock_page(page);
- if (rdata->result == 0) + if (rdata->result == 0 || + (rdata->result == -EAGAIN && got_bytes)) cifs_readpage_to_fscache(rdata->mapping->host, page);
+ got_bytes -= min_t(unsigned int, PAGE_CACHE_SIZE, got_bytes); + page_cache_release(page); rdata->pages[i] = NULL; } @@@ -3314,7 -3162,7 +3314,7 @@@ static in cifs_readpages_read_into_pages(struct TCP_Server_Info *server, struct cifs_readdata *rdata, unsigned int len) { - int total_read = 0, result = 0; + int result = 0; unsigned int i; u64 eof; pgoff_t eof_index; @@@ -3326,7 -3174,6 +3326,7 @@@ eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0; cifs_dbg(FYI, "eof=%llu eof_index=%lu\n", eof, eof_index);
+ rdata->got_bytes = 0; rdata->tailsz = PAGE_CACHE_SIZE; for (i = 0; i < nr_pages; i++) { struct page *page = rdata->pages[i]; @@@ -3381,70 -3228,10 +3381,70 @@@ if (result < 0) break;
- total_read += result; + rdata->got_bytes += result; + } + + return rdata->got_bytes > 0 && result != -ECONNABORTED ? + rdata->got_bytes : result; +} + +static int +readpages_get_pages(struct address_space *mapping, struct list_head *page_list, + unsigned int rsize, struct list_head *tmplist, + unsigned int *nr_pages, loff_t *offset, unsigned int *bytes) +{ + struct page *page, *tpage; + unsigned int expected_index; + int rc; + + INIT_LIST_HEAD(tmplist); + + page = list_entry(page_list->prev, struct page, lru); + + /* + * Lock the page and put it in the cache. Since no one else + * should have access to this page, we're safe to simply set + * PG_locked without checking it first. + */ + __set_page_locked(page); + rc = add_to_page_cache_locked(page, mapping, + page->index, GFP_KERNEL); + + /* give up if we can't stick it in the cache */ + if (rc) { + __clear_page_locked(page); + return rc; }
- return total_read > 0 ? total_read : result; + /* move first page to the tmplist */ + *offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + *bytes = PAGE_CACHE_SIZE; + *nr_pages = 1; + list_move_tail(&page->lru, tmplist); + + /* now try and add more pages onto the request */ + expected_index = page->index + 1; + list_for_each_entry_safe_reverse(page, tpage, page_list, lru) { + /* discontinuity ? */ + if (page->index != expected_index) + break; + + /* would this page push the read over the rsize? */ + if (*bytes + PAGE_CACHE_SIZE > rsize) + break; + + __set_page_locked(page); + if (add_to_page_cache_locked(page, mapping, page->index, + GFP_KERNEL)) { + __clear_page_locked(page); + break; + } + list_move_tail(&page->lru, tmplist); + (*bytes) += PAGE_CACHE_SIZE; + expected_index++; + (*nr_pages)++; + } + return rc; }
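readpages_get_pages(), added above, takes the first page off the list and then keeps appending pages only while they are contiguous and the running total stays within rsize. Below is a rough user-space sketch of the same batching rule; PAGE_SZ and the index-array interface are assumptions made for the example, not the kernel helper itself.

        /* Sketch: group a sorted list of page indices into one contiguous
         * request bounded by a byte budget. Not kernel code. */
        #include <stddef.h>

        #define PAGE_SZ 4096u

        static size_t batch_contiguous(const unsigned long *indices, size_t start,
                                       size_t count, unsigned int budget,
                                       unsigned int *bytes)
        {
                size_t nr = 1;
                unsigned long expected;

                if (start >= count) {
                        *bytes = 0;
                        return 0;
                }

                expected = indices[start] + 1;
                *bytes = PAGE_SZ;
                while (start + nr < count &&
                       indices[start + nr] == expected &&   /* discontinuity? */
                       *bytes + PAGE_SZ <= budget) {        /* over the budget? */
                        *bytes += PAGE_SZ;
                        expected++;
                        nr++;
                }
                return nr;      /* number of pages folded into this request */
        }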
static int cifs_readpages(struct file *file, struct address_space *mapping, @@@ -3454,10 -3241,19 +3454,10 @@@ struct list_head tmplist; struct cifsFileInfo *open_file = file->private_data; struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - unsigned int rsize = cifs_sb->rsize; + struct TCP_Server_Info *server; pid_t pid;
/* - * Give up immediately if rsize is too small to read an entire page. - * The VFS will fall back to readpage. We should never reach this - * point however since we set ra_pages to 0 when the rsize is smaller - * than a cache page. - */ - if (unlikely(rsize < PAGE_CACHE_SIZE)) - return 0; - - /* * Reads as many pages as possible from fscache. Returns -ENOBUFS * immediately if the cookie is negative * @@@ -3475,7 -3271,7 +3475,7 @@@ pid = current->tgid;
rc = 0; - INIT_LIST_HEAD(&tmplist); + server = tlink_tcon(open_file->tlink)->ses->server;
cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n", __func__, file, mapping, num_pages); @@@ -3492,35 -3288,58 +3492,35 @@@ * the rdata->pages, then we want them in increasing order. */ while (!list_empty(page_list)) { - unsigned int i; - unsigned int bytes = PAGE_CACHE_SIZE; - unsigned int expected_index; - unsigned int nr_pages = 1; + unsigned int i, nr_pages, bytes, rsize; loff_t offset; struct page *page, *tpage; struct cifs_readdata *rdata; + unsigned credits;
- page = list_entry(page_list->prev, struct page, lru); + rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize, + &rsize, &credits); + if (rc) + break;
/* - * Lock the page and put it in the cache. Since no one else - * should have access to this page, we're safe to simply set - * PG_locked without checking it first. + * Give up immediately if rsize is too small to read an entire + * page. The VFS will fall back to readpage. We should never + * reach this point however since we set ra_pages to 0 when the + * rsize is smaller than a cache page. */ - __set_page_locked(page); - rc = add_to_page_cache_locked(page, mapping, - page->index, GFP_KERNEL); + if (unlikely(rsize < PAGE_CACHE_SIZE)) { + add_credits_and_wake_if(server, credits, 0); + return 0; + }
- /* give up if we can't stick it in the cache */ + rc = readpages_get_pages(mapping, page_list, rsize, &tmplist, + &nr_pages, &offset, &bytes); if (rc) { - __clear_page_locked(page); + add_credits_and_wake_if(server, credits, 0); break; }
- /* move first page to the tmplist */ - offset = (loff_t)page->index << PAGE_CACHE_SHIFT; - list_move_tail(&page->lru, &tmplist); - - /* now try and add more pages onto the request */ - expected_index = page->index + 1; - list_for_each_entry_safe_reverse(page, tpage, page_list, lru) { - /* discontinuity ? */ - if (page->index != expected_index) - break; - - /* would this page push the read over the rsize? */ - if (bytes + PAGE_CACHE_SIZE > rsize) - break; - - __set_page_locked(page); - if (add_to_page_cache_locked(page, mapping, - page->index, GFP_KERNEL)) { - __clear_page_locked(page); - break; - } - list_move_tail(&page->lru, &tmplist); - bytes += PAGE_CACHE_SIZE; - expected_index++; - nr_pages++; - } - rdata = cifs_readdata_alloc(nr_pages, cifs_readv_complete); if (!rdata) { /* best to give up if we're out of mem */ @@@ -3531,7 -3350,6 +3531,7 @@@ page_cache_release(page); } rc = -ENOMEM; + add_credits_and_wake_if(server, credits, 0); break; }
@@@ -3542,32 -3360,21 +3542,32 @@@ rdata->pid = pid; rdata->pagesz = PAGE_CACHE_SIZE; rdata->read_into_pages = cifs_readpages_read_into_pages; + rdata->credits = credits;
list_for_each_entry_safe(page, tpage, &tmplist, lru) { list_del(&page->lru); rdata->pages[rdata->nr_pages++] = page; }
- rc = cifs_retry_async_readv(rdata); - if (rc != 0) { + if (!rdata->cfile->invalidHandle || + !cifs_reopen_file(rdata->cfile, true)) + rc = server->ops->async_readv(rdata); + if (rc) { + add_credits_and_wake_if(server, rdata->credits, 0); for (i = 0; i < rdata->nr_pages; i++) { page = rdata->pages[i]; lru_cache_add_file(page); unlock_page(page); page_cache_release(page); + if (rc == -EAGAIN) + list_add_tail(&page->lru, &tmplist); } kref_put(&rdata->refcount, cifs_readdata_release); + if (rc == -EAGAIN) { + /* Re-add pages to the page_list and retry */ + list_splice(&tmplist, page_list); + continue; + } break; }
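When async_readv() fails with -EAGAIN in the hunk above, the pages are re-added to page_list and the outer loop simply retries instead of giving up. A toy sketch of that requeue-and-retry shape is below; submit_batch() is a stand-in for the real asynchronous dispatch and fakes a transient failure every third call.

        /* Sketch of requeue-on-EAGAIN: leave the item queued on a transient
         * error, stop on a hard error. Not kernel code. */
        #include <errno.h>
        #include <stddef.h>
        #include <stdio.h>

        static int submit_batch(int id)
        {
                static int calls;

                (void)id;
                return (++calls % 3 == 0) ? -EAGAIN : 0;
        }

        int main(void)
        {
                int pending[] = { 1, 2, 3, 4 };
                size_t n = sizeof(pending) / sizeof(pending[0]);
                size_t i = 0;
                int rc = 0;

                while (i < n) {
                        rc = submit_batch(pending[i]);
                        if (rc == -EAGAIN)
                                continue;       /* item stays queued; retry */
                        if (rc)
                                break;          /* hard error: stop submitting */
                        i++;                    /* submitted: next item */
                }
                printf("submitted %zu of %zu, rc=%d\n", i, n, rc);
                return 0;
        }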
@@@ -3811,6 -3618,13 +3811,6 @@@ static int cifs_launder_page(struct pag return rc; }
-static int -cifs_pending_writers_wait(void *unused) -{ - schedule(); - return 0; -} - void cifs_oplock_break(struct work_struct *work) { struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, @@@ -3822,7 -3636,7 +3822,7 @@@ int rc = 0;
wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS, - cifs_pending_writers_wait, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE);
server->ops->downgrade_oplock(server, cinode, test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags)); diff --combined fs/cifs/sess.c index 39ee326,27e6175..39b8507 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@@ -46,7 -46,7 +46,7 @@@ static __u32 cifs_ssetup_hdr(struct cif CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4, USHRT_MAX)); pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); - pSMB->req.VcNumber = __constant_cpu_to_le16(1); + pSMB->req.VcNumber = cpu_to_le16(1);
/* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
@@@ -520,559 -520,382 +520,559 @@@ select_sectype(struct TCP_Server_Info * } }
-int -CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, - const struct nls_table *nls_cp) +struct sess_data { + unsigned int xid; + struct cifs_ses *ses; + struct nls_table *nls_cp; + void (*func)(struct sess_data *); + int result; + + /* we will send the SMB in three pieces: + * a fixed length beginning part, an optional + * SPNEGO blob (which can be zero length), and a + * last part which will include the strings + * and rest of bcc area. This allows us to avoid + * a large buffer 17K allocation + */ + int buf0_type; + struct kvec iov[3]; +}; + +static int +sess_alloc_buffer(struct sess_data *sess_data, int wct) { - int rc = 0; - int wct; + int rc; + struct cifs_ses *ses = sess_data->ses; struct smb_hdr *smb_buf; - char *bcc_ptr; - char *str_area; - SESSION_SETUP_ANDX *pSMB; - __u32 capabilities; - __u16 count; - int resp_buf_type; - struct kvec iov[3]; - enum securityEnum type; - __u16 action, bytes_remaining; - struct key *spnego_key = NULL; - __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ - u16 blob_len; - char *ntlmsspblob = NULL;
- if (ses == NULL) { - WARN(1, "%s: ses == NULL!", __func__); - return -EINVAL; - } + rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses, + (void **)&smb_buf);
- type = select_sectype(ses->server, ses->sectype); - cifs_dbg(FYI, "sess setup type %d\n", type); - if (type == Unspecified) { - cifs_dbg(VFS, - "Unable to select appropriate authentication method!"); - return -EINVAL; + if (rc) + return rc; + + sess_data->iov[0].iov_base = (char *)smb_buf; + sess_data->iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4; + /* + * This variable will be used to clear the buffer + * allocated above in case of any error in the calling function. + */ + sess_data->buf0_type = CIFS_SMALL_BUFFER; + + /* 2000 big enough to fit max user, domain, NOS name etc. */ + sess_data->iov[2].iov_base = kmalloc(2000, GFP_KERNEL); + if (!sess_data->iov[2].iov_base) { + rc = -ENOMEM; + goto out_free_smb_buf; }
- if (type == RawNTLMSSP) { - /* if memory allocation is successful, caller of this function - * frees it. - */ - ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); - if (!ses->ntlmssp) - return -ENOMEM; - ses->ntlmssp->sesskey_per_smbsess = false; + return 0; + +out_free_smb_buf: + kfree(smb_buf); + sess_data->iov[0].iov_base = NULL; + sess_data->iov[0].iov_len = 0; + sess_data->buf0_type = CIFS_NO_BUFFER; + return rc; +} + +static void +sess_free_buffer(struct sess_data *sess_data) +{
+ free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base); + sess_data->buf0_type = CIFS_NO_BUFFER; + kfree(sess_data->iov[2].iov_base); +} + +static int +sess_establish_session(struct sess_data *sess_data) +{ + struct cifs_ses *ses = sess_data->ses; + + mutex_lock(&ses->server->srv_mutex); + if (!ses->server->session_estab) { + if (ses->server->sign) { + ses->server->session_key.response = + kmemdup(ses->auth_key.response, + ses->auth_key.len, GFP_KERNEL); + if (!ses->server->session_key.response) { + mutex_unlock(&ses->server->srv_mutex); + return -ENOMEM; + } + ses->server->session_key.len = + ses->auth_key.len; + } + ses->server->sequence_number = 0x2; + ses->server->session_estab = true; } + mutex_unlock(&ses->server->srv_mutex);
-ssetup_ntlmssp_authenticate: - if (phase == NtLmChallenge) - phase = NtLmAuthenticate; /* if ntlmssp, now final phase */ + cifs_dbg(FYI, "CIFS session established successfully\n"); + spin_lock(&GlobalMid_Lock); + ses->status = CifsGood; + ses->need_reconnect = false; + spin_unlock(&GlobalMid_Lock);
- if (type == LANMAN) { -#ifndef CONFIG_CIFS_WEAK_PW_HASH - /* LANMAN and plaintext are less secure and off by default. - So we make this explicitly be turned on in kconfig (in the - build) and turned on at runtime (changed from the default) - in proc/fs/cifs or via mount parm. Unfortunately this is - needed for old Win (e.g. Win95), some obscure NAS and OS/2 */ - return -EOPNOTSUPP; -#endif - wct = 10; /* lanman 2 style sessionsetup */ - } else if ((type == NTLM) || (type == NTLMv2)) { - /* For NTLMv2 failures eventually may need to retry NTLM */ - wct = 13; /* old style NTLM sessionsetup */ - } else /* same size: negotiate or auth, NTLMSSP or extended security */ - wct = 12; + return 0; +}
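sess_establish_session(), factored out above, copies the session key and marks the session established exactly once, under the server mutex. The pthreads sketch below shows the same establish-once-under-lock shape; the session struct, field names and error value are assumptions for the example, not the CIFS types.

        /* Sketch: one-shot session establishment under a mutex; the caller
         * is assumed to have initialised 'lock'. Not kernel code. */
        #include <pthread.h>
        #include <stdbool.h>
        #include <stdlib.h>
        #include <string.h>

        struct session {
                pthread_mutex_t lock;
                bool established;
                unsigned char *key;
                size_t key_len;
        };

        static int establish_session(struct session *s,
                                     const unsigned char *key, size_t len)
        {
                pthread_mutex_lock(&s->lock);
                if (!s->established) {
                        s->key = malloc(len);
                        if (!s->key) {
                                pthread_mutex_unlock(&s->lock);
                                return -1;      /* -ENOMEM in the kernel version */
                        }
                        memcpy(s->key, key, len);
                        s->key_len = len;
                        s->established = true;
                }
                pthread_mutex_unlock(&s->lock);
                return 0;
        }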
- rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses, - (void **)&smb_buf); - if (rc) - return rc; +static int +sess_sendreceive(struct sess_data *sess_data) +{ + int rc; + struct smb_hdr *smb_buf = (struct smb_hdr *) sess_data->iov[0].iov_base; + __u16 count;
- pSMB = (SESSION_SETUP_ANDX *)smb_buf; + count = sess_data->iov[1].iov_len + sess_data->iov[2].iov_len; + smb_buf->smb_buf_length = + cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count); + put_bcc(count, smb_buf); + + rc = SendReceive2(sess_data->xid, sess_data->ses, + sess_data->iov, 3 /* num_iovecs */, + &sess_data->buf0_type, + CIFS_LOG_ERROR); + + return rc; +}
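sess_sendreceive() keeps the original three-piece request layout -- fixed header, optional SPNEGO blob, trailing strings -- and only fixes up the length fields before sending. A user-space sketch of gathering three such pieces with writev() follows; the header layout, field names and fd are assumptions for the example only.

        /* Sketch: send a request as three gathered pieces, with the length
         * field patched once all pieces are counted. Not CIFS code. */
        #include <stddef.h>
        #include <sys/uio.h>
        #include <unistd.h>

        struct fixed_hdr {
                unsigned int total_len;         /* filled in before sending */
                unsigned short flags;
        };

        static ssize_t send_three_piece(int fd, struct fixed_hdr *hdr,
                                        const void *blob, size_t blob_len,
                                        const void *strings, size_t str_len)
        {
                struct iovec iov[3];

                iov[0].iov_base = hdr;
                iov[0].iov_len = sizeof(*hdr);
                /* the middle piece may be zero length; writev copes with that */
                iov[1].iov_base = (void *)blob;
                iov[1].iov_len = blob_len;
                iov[2].iov_base = (void *)strings;
                iov[2].iov_len = str_len;

                /* account for the variable parts, as put_bcc() does above */
                hdr->total_len = (unsigned int)(sizeof(*hdr) + blob_len + str_len);

                return writev(fd, iov, 3);
        }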
+/* + * LANMAN and plaintext are less secure and off by default. + * So we make this explicitly be turned on in kconfig (in the + * build) and turned on at runtime (changed from the default) + * in proc/fs/cifs or via mount parm. Unfortunately this is + * needed for old Win (e.g. Win95), some obscure NAS and OS/2 + */ +#ifdef CONFIG_CIFS_WEAK_PW_HASH +static void +sess_auth_lanman(struct sess_data *sess_data) +{ + int rc = 0; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + char *bcc_ptr; + struct cifs_ses *ses = sess_data->ses; + char lnm_session_key[CIFS_AUTH_RESP_SIZE]; + __u32 capabilities; + __u16 bytes_remaining; + + /* lanman 2 style sessionsetup */ + /* wct = 10 */ + rc = sess_alloc_buffer(sess_data, 10); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + bcc_ptr = sess_data->iov[2].iov_base; capabilities = cifs_ssetup_hdr(ses, pSMB);
- /* we will send the SMB in three pieces: - a fixed length beginning part, an optional - SPNEGO blob (which can be zero length), and a - last part which will include the strings - and rest of bcc area. This allows us to avoid - a large buffer 17K allocation */ - iov[0].iov_base = (char *)pSMB; - iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4; - - /* setting this here allows the code at the end of the function - to free the request buffer if there's an error */ - resp_buf_type = CIFS_SMALL_BUFFER; + pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
- /* 2000 big enough to fit max user, domain, NOS name etc. */ - str_area = kmalloc(2000, GFP_KERNEL); - if (str_area == NULL) { - rc = -ENOMEM; - goto ssetup_exit; - } - bcc_ptr = str_area; + /* no capabilities flags in old lanman negotiation */ + pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
- iov[1].iov_base = NULL; - iov[1].iov_len = 0; + /* Calculate hash with password and copy into bcc_ptr. + * Encryption Key (stored as in cryptkey) gets used if the + * security mode bit in Negotiate Protocol response states + * to use challenge/response method (i.e. Password bit is 1). + */ + rc = calc_lanman_hash(ses->password, ses->server->cryptkey, + ses->server->sec_mode & SECMODE_PW_ENCRYPT ? + true : false, lnm_session_key);

- if (type == LANMAN) { -#ifdef CONFIG_CIFS_WEAK_PW_HASH - char lnm_session_key[CIFS_AUTH_RESP_SIZE]; + memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + + /* + * can not sign if LANMAN negotiated so no need + * to calculate signing key? but what if server + * changed to do higher than lanman dialect and + * we reconnected would we ever calc signing_key? + */
- pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; + cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n"); + /* Unicode not allowed for LANMAN dialects */ + ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
- /* no capabilities flags in old lanman negotiation */ + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base;
- pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); + rc = sess_sendreceive(sess_data); + if (rc) + goto out;
- /* Calculate hash with password and copy into bcc_ptr. - * Encryption Key (stored as in cryptkey) gets used if the - * security mode bit in Negottiate Protocol response states - * to use challenge/response method (i.e. Password bit is 1). - */ + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
- rc = calc_lanman_hash(ses->password, ses->server->cryptkey, - ses->server->sec_mode & SECMODE_PW_ENCRYPT ? - true : false, lnm_session_key); + /* lanman response has a word count of 3 */ + if (smb_buf->WordCount != 3) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out; + }
- memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE); - bcc_ptr += CIFS_AUTH_RESP_SIZE; + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) + cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
- /* can not sign if LANMAN negotiated so no need - to calculate signing key? but what if server - changed to do higher than lanman dialect and - we reconnected would we ever calc signing_key? */ + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf);
- cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n"); - /* Unicode not allowed for LANMAN dialects */ - ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); + /* BB check if Unicode and decode strings */ + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; + } + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } else { + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } + + rc = sess_establish_session(sess_data); +out: + sess_data->result = rc; + sess_data->func = NULL; + sess_free_buffer(sess_data); +} + +#else + +static void +sess_auth_lanman(struct sess_data *sess_data) +{ + sess_data->result = -EOPNOTSUPP; + sess_data->func = NULL; +} #endif - } else if (type == NTLM) { - pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); - pSMB->req_no_secext.CaseInsensitivePasswordLength = + +static void +sess_auth_ntlm(struct sess_data *sess_data) +{ + int rc = 0; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + char *bcc_ptr; + struct cifs_ses *ses = sess_data->ses; + __u32 capabilities; + __u16 bytes_remaining; + + /* old style NTLM sessionsetup */ + /* wct = 13 */ + rc = sess_alloc_buffer(sess_data, 13); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + bcc_ptr = sess_data->iov[2].iov_base; + capabilities = cifs_ssetup_hdr(ses, pSMB); + + pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); + pSMB->req_no_secext.CaseInsensitivePasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); - pSMB->req_no_secext.CaseSensitivePasswordLength = + pSMB->req_no_secext.CaseSensitivePasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
- /* calculate ntlm response and session key */ - rc = setup_ntlm_response(ses, nls_cp); - if (rc) { - cifs_dbg(VFS, "Error %d during NTLM authentication\n", + /* calculate ntlm response and session key */ + rc = setup_ntlm_response(ses, sess_data->nls_cp); + if (rc) { + cifs_dbg(VFS, "Error %d during NTLM authentication\n", rc); - goto ssetup_exit; - } + goto out; + }
- /* copy ntlm response */ - memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, - CIFS_AUTH_RESP_SIZE); - bcc_ptr += CIFS_AUTH_RESP_SIZE; - memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, - CIFS_AUTH_RESP_SIZE); - bcc_ptr += CIFS_AUTH_RESP_SIZE; - - if (ses->capabilities & CAP_UNICODE) { - /* unicode strings must be word aligned */ - if (iov[0].iov_len % 2) { - *bcc_ptr = 0; - bcc_ptr++; - } - unicode_ssetup_strings(&bcc_ptr, ses, nls_cp); - } else - ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); - } else if (type == NTLMv2) { - pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); - - /* LM2 password would be here if we supported it */ - pSMB->req_no_secext.CaseInsensitivePasswordLength = 0; - - /* calculate nlmv2 response and session key */ - rc = setup_ntlmv2_rsp(ses, nls_cp); - if (rc) { - cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n", - rc); - goto ssetup_exit; + /* copy ntlm response */ + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + + if (ses->capabilities & CAP_UNICODE) { + /* unicode strings must be word aligned */ + if (sess_data->iov[0].iov_len % 2) { + *bcc_ptr = 0; + bcc_ptr++; } - memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, - ses->auth_key.len - CIFS_SESS_KEY_SIZE); - bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE; - - /* set case sensitive password length after tilen may get - * assigned, tilen is 0 otherwise. - */ - pSMB->req_no_secext.CaseSensitivePasswordLength = - cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); + unicode_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); + } else { + ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); + }
- if (ses->capabilities & CAP_UNICODE) { - if (iov[0].iov_len % 2) { - *bcc_ptr = 0; - bcc_ptr++; - } - unicode_ssetup_strings(&bcc_ptr, ses, nls_cp); - } else - ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); - } else if (type == Kerberos) { -#ifdef CONFIG_CIFS_UPCALL - struct cifs_spnego_msg *msg;
- spnego_key = cifs_get_spnego_key(ses); - if (IS_ERR(spnego_key)) { - rc = PTR_ERR(spnego_key); - spnego_key = NULL; - goto ssetup_exit; - } + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base;
- msg = spnego_key->payload.data; - /* check version field to make sure that cifs.upcall is - sending us a response in an expected form */ - if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { - cifs_dbg(VFS, "incorrect version of cifs.upcall " - "expected %d but got %d)", - CIFS_SPNEGO_UPCALL_VERSION, msg->version); - rc = -EKEYREJECTED; - goto ssetup_exit; - } + rc = sess_sendreceive(sess_data); + if (rc) + goto out;
- ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, - GFP_KERNEL); - if (!ses->auth_key.response) { - cifs_dbg(VFS, - "Kerberos can't allocate (%u bytes) memory", - msg->sesskey_len); - rc = -ENOMEM; - goto ssetup_exit; - } - ses->auth_key.len = msg->sesskey_len; - - pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; - capabilities |= CAP_EXTENDED_SECURITY; - pSMB->req.Capabilities = cpu_to_le32(capabilities); - iov[1].iov_base = msg->data + msg->sesskey_len; - iov[1].iov_len = msg->secblob_len; - pSMB->req.SecurityBlobLength = cpu_to_le16(iov[1].iov_len); - - if (ses->capabilities & CAP_UNICODE) { - /* unicode strings must be word aligned */ - if ((iov[0].iov_len + iov[1].iov_len) % 2) { - *bcc_ptr = 0; - bcc_ptr++; - } - unicode_oslm_strings(&bcc_ptr, nls_cp); - unicode_domain_string(&bcc_ptr, ses, nls_cp); - } else - /* BB: is this right? */ - ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); -#else /* ! CONFIG_CIFS_UPCALL */ - cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n"); - rc = -ENOSYS; - goto ssetup_exit; -#endif /* CONFIG_CIFS_UPCALL */ - } else if (type == RawNTLMSSP) { - if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { - cifs_dbg(VFS, "NTLMSSP requires Unicode support\n"); - rc = -ENOSYS; - goto ssetup_exit; - } + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
- cifs_dbg(FYI, "ntlmssp session setup phase %d\n", phase); - pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; - capabilities |= CAP_EXTENDED_SECURITY; - pSMB->req.Capabilities |= cpu_to_le32(capabilities); - switch(phase) { - case NtLmNegotiate: - build_ntlmssp_negotiate_blob( - pSMB->req.SecurityBlob, ses); - iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE); - iov[1].iov_base = pSMB->req.SecurityBlob; - pSMB->req.SecurityBlobLength = - cpu_to_le16(sizeof(NEGOTIATE_MESSAGE)); - break; - case NtLmAuthenticate: - /* - * 5 is an empirical value, large enough to hold - * authenticate message plus max 10 of av paris, - * domain, user, workstation names, flags, etc. - */ - ntlmsspblob = kzalloc( - 5*sizeof(struct _AUTHENTICATE_MESSAGE), - GFP_KERNEL); - if (!ntlmsspblob) { - rc = -ENOMEM; - goto ssetup_exit; - } + if (smb_buf->WordCount != 3) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out; + }
- rc = build_ntlmssp_auth_blob(ntlmsspblob, - &blob_len, ses, nls_cp); - if (rc) - goto ssetup_exit; - iov[1].iov_len = blob_len; - iov[1].iov_base = ntlmsspblob; - pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len); - /* - * Make sure that we tell the server that we are using - * the uid that it just gave us back on the response - * (challenge) - */ - smb_buf->Uid = ses->Suid; - break; - default: - cifs_dbg(VFS, "invalid phase %d\n", phase); - rc = -ENOSYS; - goto ssetup_exit; + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) + cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cifs_dbg(FYI, "UID = %llu\n", ses->Suid); + + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf); + + /* BB check if Unicode and decode strings */ + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; } - /* unicode strings must be word aligned */ - if ((iov[0].iov_len + iov[1].iov_len) % 2) { + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } else { + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } + + rc = sess_establish_session(sess_data); +out: + sess_data->result = rc; + sess_data->func = NULL; + sess_free_buffer(sess_data); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; +} + +static void +sess_auth_ntlmv2(struct sess_data *sess_data) +{ + int rc = 0; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + char *bcc_ptr; + struct cifs_ses *ses = sess_data->ses; + __u32 capabilities; + __u16 bytes_remaining; + + /* old style NTLM sessionsetup */ + /* wct = 13 */ + rc = sess_alloc_buffer(sess_data, 13); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + bcc_ptr = sess_data->iov[2].iov_base; + capabilities = cifs_ssetup_hdr(ses, pSMB); + + pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); + + /* LM2 password would be here if we supported it */ + pSMB->req_no_secext.CaseInsensitivePasswordLength = 0; + + /* calculate nlmv2 response and session key */ + rc = setup_ntlmv2_rsp(ses, sess_data->nls_cp); + if (rc) { + cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n", rc); + goto out; + } + + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + ses->auth_key.len - CIFS_SESS_KEY_SIZE); + bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE; + + /* set case sensitive password length after tilen may get + * assigned, tilen is 0 otherwise. + */ + pSMB->req_no_secext.CaseSensitivePasswordLength = + cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); + + if (ses->capabilities & CAP_UNICODE) { + if (sess_data->iov[0].iov_len % 2) { *bcc_ptr = 0; bcc_ptr++; } - unicode_oslm_strings(&bcc_ptr, nls_cp); + unicode_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); } else { - cifs_dbg(VFS, "secType %d not supported!\n", type); - rc = -ENOSYS; - goto ssetup_exit; + ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); }
- iov[2].iov_base = str_area; - iov[2].iov_len = (long) bcc_ptr - (long) str_area;
- count = iov[1].iov_len + iov[2].iov_len; - smb_buf->smb_buf_length = - cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count); + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base;
- put_bcc(count, smb_buf); + rc = sess_sendreceive(sess_data); + if (rc) + goto out;
- rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type, - CIFS_LOG_ERROR); - /* SMB request buf freed in SendReceive2 */ + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; + + if (smb_buf->WordCount != 3) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out; + } + + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) + cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
- pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; - smb_buf = (struct smb_hdr *)iov[0].iov_base; + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf);
- if ((type == RawNTLMSSP) && (resp_buf_type != CIFS_NO_BUFFER) && - (smb_buf->Status.CifsError == - cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) { - if (phase != NtLmNegotiate) { - cifs_dbg(VFS, "Unexpected more processing error\n"); - goto ssetup_exit; + /* BB check if Unicode and decode strings */ + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; } - /* NTLMSSP Negotiate sent now processing challenge (response) */ - phase = NtLmChallenge; /* process ntlmssp challenge */ - rc = 0; /* MORE_PROC rc is not an error here, but expected */ + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } else { + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); } + + rc = sess_establish_session(sess_data); +out: + sess_data->result = rc; + sess_data->func = NULL; + sess_free_buffer(sess_data); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; +} + +#ifdef CONFIG_CIFS_UPCALL +static void +sess_auth_kerberos(struct sess_data *sess_data) +{ + int rc = 0; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + char *bcc_ptr; + struct cifs_ses *ses = sess_data->ses; + __u32 capabilities; + __u16 bytes_remaining; + struct key *spnego_key = NULL; + struct cifs_spnego_msg *msg; + u16 blob_len; + + /* extended security */ + /* wct = 12 */ + rc = sess_alloc_buffer(sess_data, 12); if (rc) - goto ssetup_exit; + goto out;
- if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) { + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + bcc_ptr = sess_data->iov[2].iov_base; + capabilities = cifs_ssetup_hdr(ses, pSMB); + + spnego_key = cifs_get_spnego_key(ses); + if (IS_ERR(spnego_key)) { + rc = PTR_ERR(spnego_key); + spnego_key = NULL; + goto out; + } + + msg = spnego_key->payload.data; + /* + * check version field to make sure that cifs.upcall is + * sending us a response in an expected form + */ + if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { + cifs_dbg(VFS, + "incorrect version of cifs.upcall (expected %d but got %d)", + CIFS_SPNEGO_UPCALL_VERSION, msg->version); + rc = -EKEYREJECTED; + goto out_put_spnego_key; + } + + ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, + GFP_KERNEL); + if (!ses->auth_key.response) { + cifs_dbg(VFS, "Kerberos can't allocate (%u bytes) memory", + msg->sesskey_len); + rc = -ENOMEM; + goto out_put_spnego_key; + } + ses->auth_key.len = msg->sesskey_len; + + pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; + capabilities |= CAP_EXTENDED_SECURITY; + pSMB->req.Capabilities = cpu_to_le32(capabilities); + sess_data->iov[1].iov_base = msg->data + msg->sesskey_len; + sess_data->iov[1].iov_len = msg->secblob_len; + pSMB->req.SecurityBlobLength = cpu_to_le16(sess_data->iov[1].iov_len); + + if (ses->capabilities & CAP_UNICODE) { + /* unicode strings must be word aligned */ + if ((sess_data->iov[0].iov_len + + sess_data->iov[1].iov_len) % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } + unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp); + unicode_domain_string(&bcc_ptr, ses, sess_data->nls_cp); + } else { + /* BB: is this right? */ + ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); + } + + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base; + + rc = sess_sendreceive(sess_data); + if (rc) + goto out_put_spnego_key; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; + + if (smb_buf->WordCount != 4) { rc = -EIO; cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); - goto ssetup_exit; + goto out_put_spnego_key; } - action = le16_to_cpu(pSMB->resp.Action); - if (action & GUEST_LOGIN) + + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ cifs_dbg(FYI, "UID = %llu\n", ses->Suid); - /* response can have either 3 or 4 word count - Samba sends 3 */ - /* and lanman response is 3 */ + bytes_remaining = get_bcc(smb_buf); bcc_ptr = pByteArea(smb_buf);
- if (smb_buf->WordCount == 4) { - blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); - if (blob_len > bytes_remaining) { - cifs_dbg(VFS, "bad security blob length %d\n", - blob_len); - rc = -EINVAL; - goto ssetup_exit; - } - if (phase == NtLmChallenge) { - rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses); - /* now goto beginning for ntlmssp authenticate phase */ - if (rc) - goto ssetup_exit; - } - bcc_ptr += blob_len; - bytes_remaining -= blob_len; + blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); + if (blob_len > bytes_remaining) { + cifs_dbg(VFS, "bad security blob length %d\n", + blob_len); + rc = -EINVAL; + goto out_put_spnego_key; } + bcc_ptr += blob_len; + bytes_remaining -= blob_len;
/* BB check if Unicode and decode strings */ if (bytes_remaining == 0) { @@@ -1083,371 -906,60 +1083,371 @@@ ++bcc_ptr; --bytes_remaining; } - decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp); + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); } else { - decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp); + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); }
-ssetup_exit: - if (spnego_key) { - key_invalidate(spnego_key); - key_put(spnego_key); + rc = sess_establish_session(sess_data); +out_put_spnego_key: + key_invalidate(spnego_key); + key_put(spnego_key); +out: + sess_data->result = rc; + sess_data->func = NULL; + sess_free_buffer(sess_data); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; +} + +#else + +static void +sess_auth_kerberos(struct sess_data *sess_data) +{ + cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n"); + sess_data->result = -ENOSYS; + sess_data->func = NULL; +} +#endif /* ! CONFIG_CIFS_UPCALL */ + +/* + * The required kvec buffers have to be allocated before calling this + * function. + */ +static int +_sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data) +{ + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + struct cifs_ses *ses = sess_data->ses; + __u32 capabilities; + char *bcc_ptr; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)pSMB; + + capabilities = cifs_ssetup_hdr(ses, pSMB); + if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { + cifs_dbg(VFS, "NTLMSSP requires Unicode support\n"); + return -ENOSYS; } - kfree(str_area); - kfree(ntlmsspblob); - ntlmsspblob = NULL; - if (resp_buf_type == CIFS_SMALL_BUFFER) { - cifs_dbg(FYI, "ssetup freeing small buf %p\n", iov[0].iov_base); - cifs_small_buf_release(iov[0].iov_base); - } else if (resp_buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base);
- /* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */ - if ((phase == NtLmChallenge) && (rc == 0)) - goto ssetup_ntlmssp_authenticate; + pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; + capabilities |= CAP_EXTENDED_SECURITY; + pSMB->req.Capabilities |= cpu_to_le32(capabilities); + + bcc_ptr = sess_data->iov[2].iov_base; + /* unicode strings must be word aligned */ + if ((sess_data->iov[0].iov_len + sess_data->iov[1].iov_len) % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } + unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp); + + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base; + + return 0; +} + +static void +sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data); + +static void +sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data) +{ + int rc; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + struct cifs_ses *ses = sess_data->ses; + __u16 bytes_remaining; + char *bcc_ptr; + u16 blob_len; + + cifs_dbg(FYI, "rawntlmssp session setup negotiate phase\n"); + + /* + * if memory allocation is successful, caller of this function + * frees it. + */ + ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); + if (!ses->ntlmssp) { + rc = -ENOMEM; + goto out; + } + ses->ntlmssp->sesskey_per_smbsess = false; + + /* wct = 12 */ + rc = sess_alloc_buffer(sess_data, 12); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + + /* Build security blob before we assemble the request */ + build_ntlmssp_negotiate_blob(pSMB->req.SecurityBlob, ses); + sess_data->iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE); + sess_data->iov[1].iov_base = pSMB->req.SecurityBlob; + pSMB->req.SecurityBlobLength = cpu_to_le16(sizeof(NEGOTIATE_MESSAGE)); + + rc = _sess_auth_rawntlmssp_assemble_req(sess_data); + if (rc) + goto out; + + rc = sess_sendreceive(sess_data); + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; + + /* If true, rc here is expected and not an error */ + if (sess_data->buf0_type != CIFS_NO_BUFFER && + smb_buf->Status.CifsError == + cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED)) + rc = 0; + + if (rc) + goto out; + + cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n"); + + if (smb_buf->WordCount != 4) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out; + } + + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cifs_dbg(FYI, "UID = %llu\n", ses->Suid); + + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf); + + blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); + if (blob_len > bytes_remaining) { + cifs_dbg(VFS, "bad security blob length %d\n", + blob_len); + rc = -EINVAL; + goto out; + } + + rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses); +out: + sess_free_buffer(sess_data);
if (!rc) { - mutex_lock(&ses->server->srv_mutex); - if (!ses->server->session_estab) { - if (ses->server->sign) { - ses->server->session_key.response = - kmemdup(ses->auth_key.response, - ses->auth_key.len, GFP_KERNEL); - if (!ses->server->session_key.response) { - rc = -ENOMEM; - mutex_unlock(&ses->server->srv_mutex); - goto keycp_exit; - } - ses->server->session_key.len = - ses->auth_key.len; - } - ses->server->sequence_number = 0x2; - ses->server->session_estab = true; - } - mutex_unlock(&ses->server->srv_mutex); + sess_data->func = sess_auth_rawntlmssp_authenticate; + return; + } + + /* Else error. Cleanup */ + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + kfree(ses->ntlmssp); + ses->ntlmssp = NULL; + + sess_data->func = NULL; + sess_data->result = rc; +}
- cifs_dbg(FYI, "CIFS session established successfully\n"); - spin_lock(&GlobalMid_Lock); - ses->status = CifsGood; - ses->need_reconnect = false; - spin_unlock(&GlobalMid_Lock); +static void +sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data) +{ + int rc; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + struct cifs_ses *ses = sess_data->ses; + __u16 bytes_remaining; + char *bcc_ptr; + char *ntlmsspblob = NULL; + u16 blob_len; + + cifs_dbg(FYI, "rawntlmssp session setup authenticate phase\n"); + + /* wct = 12 */ + rc = sess_alloc_buffer(sess_data, 12); + if (rc) + goto out; + + /* Build security blob before we assemble the request */ + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)pSMB; + /* + * 5 is an empirical value, large enough to hold + * authenticate message plus max 10 of av paris, + * domain, user, workstation names, flags, etc. + */ + ntlmsspblob = kzalloc(5*sizeof(struct _AUTHENTICATE_MESSAGE), + GFP_KERNEL); + if (!ntlmsspblob) { + rc = -ENOMEM; + goto out; }
-keycp_exit: + rc = build_ntlmssp_auth_blob(ntlmsspblob, + &blob_len, ses, sess_data->nls_cp); + if (rc) + goto out_free_ntlmsspblob; + sess_data->iov[1].iov_len = blob_len; + sess_data->iov[1].iov_base = ntlmsspblob; + pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len); + /* + * Make sure that we tell the server that we are using + * the uid that it just gave us back on the response + * (challenge) + */ + smb_buf->Uid = ses->Suid; + + rc = _sess_auth_rawntlmssp_assemble_req(sess_data); + if (rc) + goto out_free_ntlmsspblob; + + rc = sess_sendreceive(sess_data); + if (rc) + goto out_free_ntlmsspblob; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; + if (smb_buf->WordCount != 4) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out_free_ntlmsspblob; + } + + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) + cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf); + blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); + if (blob_len > bytes_remaining) { + cifs_dbg(VFS, "bad security blob length %d\n", + blob_len); + rc = -EINVAL; + goto out_free_ntlmsspblob; + } + bcc_ptr += blob_len; + bytes_remaining -= blob_len; + + + /* BB check if Unicode and decode strings */ + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; + } + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } else { + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } + +out_free_ntlmsspblob: + kfree(ntlmsspblob); +out: + sess_free_buffer(sess_data); + + if (!rc) + rc = sess_establish_session(sess_data); + + /* Cleanup */ kfree(ses->auth_key.response); ses->auth_key.response = NULL; kfree(ses->ntlmssp); + ses->ntlmssp = NULL; + + sess_data->func = NULL; + sess_data->result = rc; +} + +static int select_sec(struct cifs_ses *ses, struct sess_data *sess_data) +{ + int type; + + type = select_sectype(ses->server, ses->sectype); + cifs_dbg(FYI, "sess setup type %d\n", type); + if (type == Unspecified) { + cifs_dbg(VFS, + "Unable to select appropriate authentication method!"); + return -EINVAL; + } + + switch (type) { + case LANMAN: + /* LANMAN and plaintext are less secure and off by default. + * So we make this explicitly be turned on in kconfig (in the + * build) and turned on at runtime (changed from the default) + * in proc/fs/cifs or via mount parm. Unfortunately this is + * needed for old Win (e.g. 
Win95), some obscure NAS and OS/2 */ +#ifdef CONFIG_CIFS_WEAK_PW_HASH + sess_data->func = sess_auth_lanman; + break; +#else + return -EOPNOTSUPP; +#endif + case NTLM: + sess_data->func = sess_auth_ntlm; + break; + case NTLMv2: + sess_data->func = sess_auth_ntlmv2; + break; + case Kerberos: +#ifdef CONFIG_CIFS_UPCALL + sess_data->func = sess_auth_kerberos; + break; +#else + cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n"); + return -ENOSYS; + break; +#endif /* CONFIG_CIFS_UPCALL */ + case RawNTLMSSP: + sess_data->func = sess_auth_rawntlmssp_negotiate; + break; + default: + cifs_dbg(VFS, "secType %d not supported!\n", type); + return -ENOSYS; + } + + return 0; +} + +int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + int rc = 0; + struct sess_data *sess_data; + + if (ses == NULL) { + WARN(1, "%s: ses == NULL!", __func__); + return -EINVAL; + } + + sess_data = kzalloc(sizeof(struct sess_data), GFP_KERNEL); + if (!sess_data) + return -ENOMEM; + + rc = select_sec(ses, sess_data); + if (rc) + goto out; + + sess_data->xid = xid; + sess_data->ses = ses; + sess_data->buf0_type = CIFS_NO_BUFFER; + sess_data->nls_cp = (struct nls_table *) nls_cp; + + while (sess_data->func) + sess_data->func(sess_data); + + /* Store result before we free sess_data */ + rc = sess_data->result;
+out: + kfree(sess_data); return rc; } diff --combined fs/cifs/smb2ops.c index 081529f,7f99a0f..59437c5 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@@ -112,53 -112,6 +112,53 @@@ smb2_get_credits(struct mid_q_entry *mi return le16_to_cpu(((struct smb2_hdr *)mid->resp_buf)->CreditRequest); }
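The rewritten CIFS_SessSetup() above drives session setup as a small state machine: select_sec() picks the first handler, each handler either queues the next phase in sess_data->func or clears it and stores a result, and the caller just loops until no step remains. A self-contained sketch of that driver pattern follows; the step names and state fields are invented for the example.

        /* Sketch of the function-pointer state machine used above.
         * Not CIFS code; step names are hypothetical. */
        #include <stdio.h>

        struct state {
                void (*func)(struct state *);
                int result;
                int round;
        };

        static void step_finish(struct state *st)
        {
                st->result = 0;
                st->func = NULL;        /* no next step: the driver loop stops */
        }

        static void step_negotiate(struct state *st)
        {
                printf("negotiate round %d\n", ++st->round);
                st->func = step_finish; /* chain to the next phase */
        }

        int main(void)
        {
                struct state st = { .func = step_negotiate };

                while (st.func)
                        st.func(&st);   /* mirrors: while (sess_data->func) ... */

                return st.result;
        }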
+static int +smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, + unsigned int *num, unsigned int *credits) +{ + int rc = 0; + unsigned int scredits; + + spin_lock(&server->req_lock); + while (1) { + if (server->credits <= 0) { + spin_unlock(&server->req_lock); + cifs_num_waiters_inc(server); + rc = wait_event_killable(server->request_q, + has_credits(server, &server->credits)); + cifs_num_waiters_dec(server); + if (rc) + return rc; + spin_lock(&server->req_lock); + } else { + if (server->tcpStatus == CifsExiting) { + spin_unlock(&server->req_lock); + return -ENOENT; + } + + scredits = server->credits; + /* can deadlock with reopen */ + if (scredits == 1) { + *num = SMB2_MAX_BUFFER_SIZE; + *credits = 0; + break; + } + + /* leave one credit for a possible reopen */ + scredits--; + *num = min_t(unsigned int, size, + scredits * SMB2_MAX_BUFFER_SIZE); + + *credits = DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE); + server->credits -= *credits; + server->in_flight++; + break; + } + } + spin_unlock(&server->req_lock); + return rc; +} + static __u64 smb2_get_next_mid(struct TCP_Server_Info *server) { @@@ -229,6 -182,8 +229,6 @@@ smb2_negotiate_wsize(struct cifs_tcon * /* start with specified wsize, or default */ wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE; wsize = min_t(unsigned int, wsize, server->max_write); - /* set it to the maximum buffer size value we can send with 1 credit */ - wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
return wsize; } @@@ -242,6 -197,8 +242,6 @@@ smb2_negotiate_rsize(struct cifs_tcon * /* start with specified rsize, or default */ rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE; rsize = min_t(unsigned int, rsize, server->max_read); - /* set it to the maximum buffer size value we can send with 1 credit */ - rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
return rsize; } @@@ -590,7 -547,7 +590,7 @@@ smb2_clone_range(const unsigned int xid goto cchunk_out;
/* For now array only one chunk long, will make more flexible later */ - pcchunk->ChunkCount = __constant_cpu_to_le32(1); + pcchunk->ChunkCount = cpu_to_le32(1); pcchunk->Reserved = 0; pcchunk->Reserved2 = 0;
@@@ -1147,13 -1104,6 +1147,13 @@@ smb3_parse_lease_buf(void *buf, unsigne return le32_to_cpu(lc->lcontext.LeaseState); }
+static unsigned int +smb2_wp_retry_size(struct inode *inode) +{ + return min_t(unsigned int, CIFS_SB(inode->i_sb)->wsize, + SMB2_MAX_BUFFER_SIZE); +} + struct smb_version_operations smb20_operations = { .compare_fids = smb2_compare_fids, .setup_request = smb2_setup_request, @@@ -1163,7 -1113,6 +1163,7 @@@ .set_credits = smb2_set_credits, .get_credits_field = smb2_get_credits_field, .get_credits = smb2_get_credits, + .wait_mtu_credits = cifs_wait_mtu_credits, .get_next_mid = smb2_get_next_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, @@@ -1228,7 -1177,6 +1228,7 @@@ .create_lease_buf = smb2_create_lease_buf, .parse_lease_buf = smb2_parse_lease_buf, .clone_range = smb2_clone_range, + .wp_retry_size = smb2_wp_retry_size, };
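The new wait_mtu_credits hook and the credit fields threaded through the read/write data above implement credit-sized I/O: a request is cut down to what the available credits allow, one credit is kept spare for a possible reopen, and credits are handed back if the request is never sent. The sketch below shows that accounting in isolation; MAX_BUF, the helper names and the sample numbers are assumptions, not the SMB2 implementation.

        /* Sketch of credit-sized request accounting. Not kernel code. */
        #include <stdio.h>

        #define MAX_BUF 65536u          /* stand-in for SMB2_MAX_BUFFER_SIZE */

        static unsigned int avail_credits = 4;

        /* Reserve credits for an I/O of 'size' bytes; returns bytes to send. */
        static unsigned int reserve_credits(unsigned int size, unsigned int *credits)
        {
                unsigned int scredits = avail_credits;
                unsigned int num;

                if (scredits <= 1) {    /* keep one credit for a reopen */
                        *credits = 0;
                        return MAX_BUF;
                }
                scredits--;
                num = size < scredits * MAX_BUF ? size : scredits * MAX_BUF;
                *credits = (num + MAX_BUF - 1) / MAX_BUF;   /* DIV_ROUND_UP */
                avail_credits -= *credits;
                return num;
        }

        static void release_credits(unsigned int credits)
        {
                avail_credits += credits;   /* e.g. when the request is dropped */
        }

        int main(void)
        {
                unsigned int credits;
                unsigned int len = reserve_credits(1u << 20, &credits);

                printf("may send %u bytes using %u credits\n", len, credits);
                release_credits(credits);   /* pretend the send failed */
                return 0;
        }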
struct smb_version_operations smb21_operations = { @@@ -1240,7 -1188,6 +1240,7 @@@ .set_credits = smb2_set_credits, .get_credits_field = smb2_get_credits_field, .get_credits = smb2_get_credits, + .wait_mtu_credits = smb2_wait_mtu_credits, .get_next_mid = smb2_get_next_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, @@@ -1305,7 -1252,6 +1305,7 @@@ .create_lease_buf = smb2_create_lease_buf, .parse_lease_buf = smb2_parse_lease_buf, .clone_range = smb2_clone_range, + .wp_retry_size = smb2_wp_retry_size, };
struct smb_version_operations smb30_operations = { @@@ -1317,7 -1263,6 +1317,7 @@@ .set_credits = smb2_set_credits, .get_credits_field = smb2_get_credits_field, .get_credits = smb2_get_credits, + .wait_mtu_credits = smb2_wait_mtu_credits, .get_next_mid = smb2_get_next_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, @@@ -1385,7 -1330,6 +1385,7 @@@ .parse_lease_buf = smb3_parse_lease_buf, .clone_range = smb2_clone_range, .validate_negotiate = smb3_validate_negotiate, + .wp_retry_size = smb2_wp_retry_size, };
struct smb_version_values smb20_values = { diff --combined fs/cifs/smb2pdu.c index 768cddb,a9b03c2..2057250 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@@ -245,6 -245,10 +245,6 @@@ smb2_reconnect(__le16 smb2_command, str if (rc) goto out; atomic_inc(&tconInfoReconnectCount); - /* - * BB FIXME add code to check if wsize needs update due to negotiated - * smb buffer size shrinking. - */ out: /* * Check if handle based operation so we know whether we can continue @@@ -305,6 -309,16 +305,6 @@@ small_smb2_init(__le16 smb2_command, st return rc; }
-static void -free_rsp_buf(int resp_buftype, void *rsp) -{ - if (resp_buftype == CIFS_SMALL_BUFFER) - cifs_small_buf_release(rsp); - else if (resp_buftype == CIFS_LARGE_BUFFER) - cifs_buf_release(rsp); -} - - /* * * SMB2 Worker functions follow: @@@ -1355,7 -1369,7 +1355,7 @@@ SMB2_set_compression(const unsigned in char *ret_data = NULL;
fsctl_input.CompressionState = - __constant_cpu_to_le16(COMPRESSION_FORMAT_DEFAULT); + cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, FSCTL_SET_COMPRESSION, true /* is_fsctl */, @@@ -1724,18 -1738,12 +1724,18 @@@ smb2_readv_callback(struct mid_q_entry rc); } /* FIXME: should this be counted toward the initiating task? */ - task_io_account_read(rdata->bytes); - cifs_stats_bytes_read(tcon, rdata->bytes); + task_io_account_read(rdata->got_bytes); + cifs_stats_bytes_read(tcon, rdata->got_bytes); break; case MID_REQUEST_SUBMITTED: case MID_RETRY_NEEDED: rdata->result = -EAGAIN; + if (server->sign && rdata->got_bytes) + /* reset bytes number since we can not check a sign */ + rdata->got_bytes = 0; + /* FIXME: should this be counted toward the initiating task? */ + task_io_account_read(rdata->got_bytes); + cifs_stats_bytes_read(tcon, rdata->got_bytes); break; default: if (rdata->result != -ENODATA) @@@ -1754,12 -1762,11 +1754,12 @@@ int smb2_async_readv(struct cifs_readdata *rdata) { - int rc; + int rc, flags = 0; struct smb2_hdr *buf; struct cifs_io_parms io_parms; struct smb_rqst rqst = { .rq_iov = &rdata->iov, .rq_nvec = 1 }; + struct TCP_Server_Info *server;
cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n", __func__, rdata->offset, rdata->bytes); @@@ -1770,41 -1777,18 +1770,41 @@@ io_parms.persistent_fid = rdata->cfile->fid.persistent_fid; io_parms.volatile_fid = rdata->cfile->fid.volatile_fid; io_parms.pid = rdata->pid; + + server = io_parms.tcon->ses->server; + rc = smb2_new_read_req(&rdata->iov, &io_parms, 0, 0); - if (rc) + if (rc) { + if (rc == -EAGAIN && rdata->credits) { + /* credits was reseted by reconnect */ + rdata->credits = 0; + /* reduce in_flight value since we won't send the req */ + spin_lock(&server->req_lock); + server->in_flight--; + spin_unlock(&server->req_lock); + } return rc; + }
buf = (struct smb2_hdr *)rdata->iov.iov_base; /* 4 for rfc1002 length field */ rdata->iov.iov_len = get_rfc1002_length(rdata->iov.iov_base) + 4;
+ if (rdata->credits) { + buf->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes, + SMB2_MAX_BUFFER_SIZE)); + spin_lock(&server->req_lock); + server->credits += rdata->credits - + le16_to_cpu(buf->CreditCharge); + spin_unlock(&server->req_lock); + wake_up(&server->request_q); + flags = CIFS_HAS_CREDITS; + } + kref_get(&rdata->refcount); rc = cifs_call_async(io_parms.tcon->ses->server, &rqst, cifs_readv_receive, smb2_readv_callback, - rdata, 0); + rdata, flags); if (rc) { kref_put(&rdata->refcount, cifs_readdata_release); cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE); @@@ -1922,25 -1906,15 +1922,25 @@@ in smb2_async_writev(struct cifs_writedata *wdata, void (*release)(struct kref *kref)) { - int rc = -EACCES; + int rc = -EACCES, flags = 0; struct smb2_write_req *req = NULL; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; struct kvec iov; struct smb_rqst rqst;
rc = small_smb2_init(SMB2_WRITE, tcon, (void **) &req); - if (rc) + if (rc) { + if (rc == -EAGAIN && wdata->credits) { + /* credits were reset by reconnect */ + wdata->credits = 0; + /* reduce in_flight value since we won't send the req */ + spin_lock(&server->req_lock); + server->in_flight--; + spin_unlock(&server->req_lock); + } goto async_writev_out; + }
req->hdr.ProcessId = cpu_to_le32(wdata->cfile->pid);
@@@ -1973,20 -1947,9 +1973,20 @@@
inc_rfc1001_len(&req->hdr, wdata->bytes - 1 /* Buffer */);
+ if (wdata->credits) { + req->hdr.CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes, + SMB2_MAX_BUFFER_SIZE)); + spin_lock(&server->req_lock); + server->credits += wdata->credits - + le16_to_cpu(req->hdr.CreditCharge); + spin_unlock(&server->req_lock); + wake_up(&server->request_q); + flags = CIFS_HAS_CREDITS; + } + kref_get(&wdata->refcount); - rc = cifs_call_async(tcon->ses->server, &rqst, NULL, - smb2_writev_callback, wdata, 0); + rc = cifs_call_async(server, &rqst, NULL, smb2_writev_callback, wdata, + flags);
if (rc) { kref_put(&wdata->refcount, release); diff --combined fs/exec.c index ab1f120,2ef2751..a2b42a9 --- a/fs/exec.c +++ b/fs/exec.c @@@ -368,10 -368,6 +368,6 @@@ static int bprm_mm_init(struct linux_bi if (!mm) goto err;
- err = init_new_context(current, mm); - if (err) - goto err; - err = __bprm_mm_init(bprm); if (err) goto err; @@@ -1216,7 -1212,7 +1212,7 @@@ EXPORT_SYMBOL(install_exec_creds) /* * determine how safe it is to execute the proposed program * - the caller must hold ->cred_guard_mutex to protect against - * PTRACE_ATTACH + * PTRACE_ATTACH or seccomp thread-sync */ static void check_unsafe_exec(struct linux_binprm *bprm) { @@@ -1234,7 -1230,7 +1230,7 @@@ * This isn't strictly necessary, but it makes it harder for LSMs to * mess up. */ - if (current->no_new_privs) + if (task_no_new_privs(current)) bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
t = p; @@@ -1272,7 -1268,7 +1268,7 @@@ int prepare_binprm(struct linux_binprm bprm->cred->egid = current_egid();
if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) && - !current->no_new_privs && + !task_no_new_privs(current) && kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) && kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) { /* Set-uid? */ diff --combined fs/fscache/main.c index a31b83c,3248c15..b39d487 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@@ -67,7 -67,7 +67,7 @@@ static int fscache_max_active_sysctl(st return ret; }
- struct ctl_table fscache_sysctls[] = { + static struct ctl_table fscache_sysctls[] = { { .procname = "object_max_active", .data = &fscache_object_max_active, @@@ -87,7 -87,7 +87,7 @@@ {} };
- struct ctl_table fscache_sysctls_root[] = { + static struct ctl_table fscache_sysctls_root[] = { { .procname = "fscache", .mode = 0555, @@@ -197,6 -197,24 +197,6 @@@ static void __exit fscache_exit(void module_exit(fscache_exit);
/* - * wait_on_bit() sleep function for uninterruptible waiting - */ -int fscache_wait_bit(void *flags) -{ - schedule(); - return 0; -} - -/* - * wait_on_bit() sleep function for interruptible waiting - */ -int fscache_wait_bit_interruptible(void *flags) -{ - schedule(); - return signal_pending(current); -} - -/* * wait_on_atomic_t() sleep function for uninterruptible waiting */ int fscache_wait_atomic_t(atomic_t *p) diff --combined fs/namespace.c index b10db3d,2a1447c..019ff81 --- a/fs/namespace.c +++ b/fs/namespace.c @@@ -225,7 -225,6 +225,7 @@@ static struct mount *alloc_vfsmnt(cons INIT_LIST_HEAD(&mnt->mnt_share); INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); + INIT_LIST_HEAD(&mnt->mnt_mp_list); #ifdef CONFIG_FSNOTIFY INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); #endif @@@ -668,45 -667,11 +668,45 @@@ struct vfsmount *lookup_mnt(struct pat return m; }
-static struct mountpoint *new_mountpoint(struct dentry *dentry) +/* + * __is_local_mountpoint - Test to see if dentry is a mountpoint in the + * current mount namespace. + * + * The common case is dentries are not mountpoints at all and that + * test is handled inline. For the slow case when we are actually + * dealing with a mountpoint of some kind, walk through all of the + * mounts in the current mount namespace and test to see if the dentry + * is a mountpoint. + * + * The mount_hashtable is not usable in the context because we + * need to identify all mounts that may be in the current mount + * namespace not just a mount that happens to have some specified + * parent mount. + */ +bool __is_local_mountpoint(struct dentry *dentry) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + struct mount *mnt; + bool is_covered = false; + + if (!d_mountpoint(dentry)) + goto out; + + down_read(&namespace_sem); + list_for_each_entry(mnt, &ns->list, mnt_list) { + is_covered = (mnt->mnt_mountpoint == dentry); + if (is_covered) + break; + } + up_read(&namespace_sem); +out: + return is_covered; +} + +static struct mountpoint *lookup_mountpoint(struct dentry *dentry) { struct hlist_head *chain = mp_hash(dentry); struct mountpoint *mp; - int ret;
hlist_for_each_entry(mp, chain, m_hash) { if (mp->m_dentry == dentry) { @@@ -717,14 -682,6 +717,14 @@@ return mp; } } + return NULL; +} + +static struct mountpoint *new_mountpoint(struct dentry *dentry) +{ + struct hlist_head *chain = mp_hash(dentry); + struct mountpoint *mp; + int ret;
mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); if (!mp) @@@ -739,7 -696,6 +739,7 @@@ mp->m_dentry = dentry; mp->m_count = 1; hlist_add_head(&mp->m_hash, chain); + INIT_LIST_HEAD(&mp->m_list); return mp; }
@@@ -747,7 -703,6 +747,7 @@@ static void put_mountpoint(struct mount { if (!--mp->m_count) { struct dentry *dentry = mp->m_dentry; + BUG_ON(!list_empty(&mp->m_list)); spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_MOUNTED; spin_unlock(&dentry->d_lock); @@@ -794,7 -749,6 +794,7 @@@ static void detach_mnt(struct mount *mn mnt->mnt_mountpoint = mnt->mnt.mnt_root; list_del_init(&mnt->mnt_child); hlist_del_init_rcu(&mnt->mnt_hash); + list_del_init(&mnt->mnt_mp_list); put_mountpoint(mnt->mnt_mp); mnt->mnt_mp = NULL; } @@@ -811,7 -765,6 +811,7 @@@ void mnt_set_mountpoint(struct mount *m child_mnt->mnt_mountpoint = dget(mp->m_dentry); child_mnt->mnt_parent = mnt; child_mnt->mnt_mp = mp; + list_add_tail(&child_mnt->mnt_mp_list, &mp->m_list); }
/* @@@ -845,7 -798,7 +845,7 @@@ static void commit_tree(struct mount *m list_splice(&head, n->list.prev);
if (shadows) - hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash); + hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash); else hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mnt->mnt_mountpoint)); @@@ -983,25 -936,9 +983,25 @@@ static struct mount *clone_mnt(struct m return ERR_PTR(err); }
+static void cleanup_mnt(struct mount *mnt) +{ + fsnotify_vfsmount_delete(&mnt->mnt); + dput(mnt->mnt.mnt_root); + deactivate_super(mnt->mnt.mnt_sb); + mnt_free_id(mnt); + complete(mnt->mnt_undone); + call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); +} + +static void cleanup_mnt_work(struct work_struct *work) +{ + cleanup_mnt(container_of(work, struct mount, mnt_cleanup_work)); +} + static void mntput_no_expire(struct mount *mnt) { -put_again: + struct completion undone; + rcu_read_lock(); mnt_add_count(mnt, -1); if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */ @@@ -1015,15 -952,12 +1015,15 @@@ return; } if (unlikely(mnt->mnt_pinned)) { - mnt_add_count(mnt, mnt->mnt_pinned + 1); + init_completion(&undone); + mnt->mnt_undone = &undone; + mnt_add_count(mnt, mnt->mnt_pinned); mnt->mnt_pinned = 0; rcu_read_unlock(); unlock_mount_hash(); acct_auto_close_mnt(&mnt->mnt); - goto put_again; + wait_for_completion(&undone); + return; } if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { rcu_read_unlock(); @@@ -1047,19 -981,11 +1047,19 @@@ * so mnt_get_writers() below is safe. */ WARN_ON(mnt_get_writers(mnt)); - fsnotify_vfsmount_delete(&mnt->mnt); - dput(mnt->mnt.mnt_root); - deactivate_super(mnt->mnt.mnt_sb); - mnt_free_id(mnt); - call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); + /* The stack may be deep here, cleanup the mount on a work + * queue where the stack is guaranteed to be shallow. + */ + init_completion(&undone); + if (!mnt->mnt_undone) + mnt->mnt_undone = &undone; + else + complete(&undone); + + INIT_WORK(&mnt->mnt_cleanup_work, cleanup_mnt_work); + schedule_work(&mnt->mnt_cleanup_work); + + wait_for_completion(&undone); }
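The reworked mntput_no_expire() above defers the heavyweight teardown to a workqueue, where the stack is guaranteed to be shallow, and blocks on a completion until the worker finishes. A rough userspace analogue of that shape, using pthreads in place of the kernel's workqueue and completion primitives; all names here are invented for the sketch:

#include <pthread.h>
#include <stdio.h>

struct completion {
    pthread_mutex_t lock;
    pthread_cond_t  cond;
    int             done;
};

static void complete(struct completion *c)
{
    pthread_mutex_lock(&c->lock);
    c->done = 1;
    pthread_cond_signal(&c->cond);
    pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
    pthread_mutex_lock(&c->lock);
    while (!c->done)
        pthread_cond_wait(&c->cond, &c->lock);
    pthread_mutex_unlock(&c->lock);
}

static struct completion undone = {
    PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0
};

static void *cleanup_worker(void *arg)
{
    puts("deep cleanup runs on a shallow stack here");
    complete(&undone);
    return NULL;
}

int main(void)
{
    pthread_t worker;

    pthread_create(&worker, NULL, cleanup_worker, NULL);  /* ~schedule_work() */
    wait_for_completion(&undone);                         /* caller blocks     */
    pthread_join(worker, NULL);
    return 0;
}

Build with -pthread; complete()/wait_for_completion() here merely mimic the kernel calls of the same name.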
void mntput(struct vfsmount *mnt) @@@ -1335,7 -1261,6 +1335,7 @@@ void umount_tree(struct mount *mnt, in p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; list_del_init(&p->mnt_child); if (mnt_has_parent(p)) { + list_del_init(&p->mnt_mp_list); put_mountpoint(p->mnt_mp); /* move the reference to mountpoint into ->mnt_ex_mountpoint */ p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint; @@@ -1448,37 -1373,6 +1448,37 @@@ static int do_umount(struct mount *mnt return retval; }
+/* + * __detach_mounts - lazily unmount all mounts on the specified dentry + * + * During unlink, rmdir, and d_drop it is possible to lose the path + * to an existing mountpoint, and wind up leaking the mount. + * detach_mounts allows lazily unmounting those mounts instead of + * leaking them. + * + * The caller may hold dentry->d_inode->i_mutex. + */ +void __detach_mounts(struct dentry *dentry) +{ + struct mountpoint *mp; + struct mount *mnt; + + namespace_lock(); + mp = lookup_mountpoint(dentry); + if (!mp) + goto out_unlock; + + lock_mount_hash(); + while (!list_empty(&mp->m_list)) { + mnt = list_first_entry(&mp->m_list, struct mount, mnt_mp_list); + umount_tree(mnt, 2); + } + unlock_mount_hash(); + put_mountpoint(mp); +out_unlock: + namespace_unlock(); +} + /* * Is the caller allowed to modify his namespace? */ @@@ -1828,9 -1722,7 +1828,9 @@@ retry namespace_lock(); mnt = lookup_mnt(path); if (likely(!mnt)) { - struct mountpoint *mp = new_mountpoint(dentry); + struct mountpoint *mp = lookup_mountpoint(dentry); + if (!mp) + mp = new_mountpoint(dentry); if (IS_ERR(mp)) { namespace_unlock(); mutex_unlock(&dentry->d_inode->i_mutex); diff --combined fs/proc/base.c index e442784,043c83c..2105331 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@@ -105,7 -105,7 +105,7 @@@ */
struct pid_entry { - char *name; + const char *name; int len; umode_t mode; const struct inode_operations *iop; @@@ -130,10 -130,6 +130,6 @@@ { .proc_get_link = get_link } ) #define REG(NAME, MODE, fops) \ NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {}) - #define INF(NAME, MODE, read) \ - NOD(NAME, (S_IFREG|(MODE)), \ - NULL, &proc_info_file_operations, \ - { .proc_read = read } ) #define ONE(NAME, MODE, show) \ NOD(NAME, (S_IFREG|(MODE)), \ NULL, &proc_single_file_operations, \ @@@ -200,27 -196,32 +196,32 @@@ static int proc_root_link(struct dentr return result; }
- static int proc_pid_cmdline(struct task_struct *task, char *buffer) + static int proc_pid_cmdline(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { - return get_cmdline(task, buffer, PAGE_SIZE); + /* + * Rely on struct seq_operations::show() being called once + * per internal buffer allocation. See single_open(), traverse(). + */ + BUG_ON(m->size < PAGE_SIZE); + m->count += get_cmdline(task, m->buf, PAGE_SIZE); + return 0; }
- static int proc_pid_auxv(struct task_struct *task, char *buffer) + static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ); - int res = PTR_ERR(mm); if (mm && !IS_ERR(mm)) { unsigned int nwords = 0; do { nwords += 2; } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ - res = nwords * sizeof(mm->saved_auxv[0]); - if (res > PAGE_SIZE) - res = PAGE_SIZE; - memcpy(buffer, mm->saved_auxv, res); + seq_write(m, mm->saved_auxv, nwords * sizeof(mm->saved_auxv[0])); mmput(mm); - } - return res; + return 0; + } else + return PTR_ERR(mm); }
@@@ -229,7 -230,8 +230,8 @@@ * Provides a wchan file via kallsyms in a proper one-value-per-file format. * Returns the resolved symbol. If that fails, simply return the address. */ - static int proc_pid_wchan(struct task_struct *task, char *buffer) + static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { unsigned long wchan; char symname[KSYM_NAME_LEN]; @@@ -240,9 -242,9 +242,9 @@@ if (!ptrace_may_access(task, PTRACE_MODE_READ)) return 0; else - return sprintf(buffer, "%lu", wchan); + return seq_printf(m, "%lu", wchan); else - return sprintf(buffer, "%s", symname); + return seq_printf(m, "%s", symname); } #endif /* CONFIG_KALLSYMS */
@@@ -304,9 -306,10 +306,10 @@@ static int proc_pid_stack(struct seq_fi /* * Provides /proc/PID/schedstat */ - static int proc_pid_schedstat(struct task_struct *task, char *buffer) + static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { - return sprintf(buffer, "%llu %llu %lu\n", + return seq_printf(m, "%llu %llu %lu\n", (unsigned long long)task->se.sum_exec_runtime, (unsigned long long)task->sched_info.run_delay, task->sched_info.pcount); @@@ -404,7 -407,8 +407,8 @@@ static const struct file_operations pro }; #endif
- static int proc_oom_score(struct task_struct *task, char *buffer) + static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { unsigned long totalpages = totalram_pages + total_swap_pages; unsigned long points = 0; @@@ -414,12 -418,12 +418,12 @@@ points = oom_badness(task, NULL, NULL, totalpages) * 1000 / totalpages; read_unlock(&tasklist_lock); - return sprintf(buffer, "%lu\n", points); + return seq_printf(m, "%lu\n", points); }
struct limit_names { - char *name; - char *unit; + const char *name; + const char *unit; };
static const struct limit_names lnames[RLIM_NLIMITS] = { @@@ -442,12 -446,11 +446,11 @@@ };
/* Display limits for a process */ - static int proc_pid_limits(struct task_struct *task, char *buffer) + static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { unsigned int i; - int count = 0; unsigned long flags; - char *bufptr = buffer;
struct rlimit rlim[RLIM_NLIMITS];
@@@ -459,35 -462,34 +462,34 @@@ /* * print the file header */ - count += sprintf(&bufptr[count], "%-25s %-20s %-20s %-10s\n", + seq_printf(m, "%-25s %-20s %-20s %-10s\n", "Limit", "Soft Limit", "Hard Limit", "Units");
for (i = 0; i < RLIM_NLIMITS; i++) { if (rlim[i].rlim_cur == RLIM_INFINITY) - count += sprintf(&bufptr[count], "%-25s %-20s ", + seq_printf(m, "%-25s %-20s ", lnames[i].name, "unlimited"); else - count += sprintf(&bufptr[count], "%-25s %-20lu ", + seq_printf(m, "%-25s %-20lu ", lnames[i].name, rlim[i].rlim_cur);
if (rlim[i].rlim_max == RLIM_INFINITY) - count += sprintf(&bufptr[count], "%-20s ", "unlimited"); + seq_printf(m, "%-20s ", "unlimited"); else - count += sprintf(&bufptr[count], "%-20lu ", - rlim[i].rlim_max); + seq_printf(m, "%-20lu ", rlim[i].rlim_max);
if (lnames[i].unit) - count += sprintf(&bufptr[count], "%-10s\n", - lnames[i].unit); + seq_printf(m, "%-10s\n", lnames[i].unit); else - count += sprintf(&bufptr[count], "\n"); + seq_putc(m, '\n'); }
- return count; + return 0; }
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK - static int proc_pid_syscall(struct task_struct *task, char *buffer) + static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { long nr; unsigned long args[6], sp, pc; @@@ -496,11 -498,11 +498,11 @@@ return res;
if (task_current_syscall(task, &nr, args, 6, &sp, &pc)) - res = sprintf(buffer, "running\n"); + seq_puts(m, "running\n"); else if (nr < 0) - res = sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc); + seq_printf(m, "%ld 0x%lx 0x%lx\n", nr, sp, pc); else - res = sprintf(buffer, + seq_printf(m, "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", nr, args[0], args[1], args[2], args[3], args[4], args[5], @@@ -598,43 -600,6 +600,6 @@@ static const struct inode_operations pr .setattr = proc_setattr, };
- #define PROC_BLOCK_SIZE (3*1024) /* 4K page size but our output routines use some slack for overruns */ - - static ssize_t proc_info_read(struct file * file, char __user * buf, - size_t count, loff_t *ppos) - { - struct inode * inode = file_inode(file); - unsigned long page; - ssize_t length; - struct task_struct *task = get_proc_task(inode); - - length = -ESRCH; - if (!task) - goto out_no_task; - - if (count > PROC_BLOCK_SIZE) - count = PROC_BLOCK_SIZE; - - length = -ENOMEM; - if (!(page = __get_free_page(GFP_TEMPORARY))) - goto out; - - length = PROC_I(inode)->op.proc_read(task, (char*)page); - - if (length >= 0) - length = simple_read_from_buffer(buf, count, ppos, (char *)page, length); - free_page(page); - out: - put_task_struct(task); - out_no_task: - return length; - } - - static const struct file_operations proc_info_file_operations = { - .read = proc_info_read, - .llseek = generic_file_llseek, - }; - static int proc_single_show(struct seq_file *m, void *v) { struct inode *inode = m->private; @@@ -1625,6 -1590,7 +1590,6 @@@ int pid_revalidate(struct dentry *dentr put_task_struct(task); return 1; } - d_drop(dentry); return 0; }
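With proc_info_read() and the INF() entries gone, every one-value file under /proc/PID goes through the seq_file single_open() path instead of a hand-rolled sprintf() into a free page. A generic, out-of-tree module sketch of that pattern for a 3.x-era kernel; the entry name "seqfile_demo" is invented and this is not code from the commit:

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *m, void *v)
{
	/* one value, one line - the format these ONE() handlers follow */
	seq_printf(m, "%d\n", 42);
	return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
	return single_open(file, demo_show, NULL);
}

static const struct file_operations demo_fops = {
	.owner   = THIS_MODULE,
	.open    = demo_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,
};

static int __init demo_init(void)
{
	proc_create("seqfile_demo", 0444, NULL, &demo_fops);
	return 0;
}

static void __exit demo_exit(void)
{
	remove_proc_entry("seqfile_demo", NULL);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");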
@@@ -1761,6 -1727,9 +1726,6 @@@ out put_task_struct(task);
out_notask: - if (status <= 0) - d_drop(dentry); - return status; }
@@@ -2052,7 -2021,7 +2017,7 @@@ static int show_timer(struct seq_file * struct k_itimer *timer; struct timers_private *tp = m->private; int notify; - static char *nstr[] = { + static const char * const nstr[] = { [SIGEV_SIGNAL] = "signal", [SIGEV_NONE] = "none", [SIGEV_THREAD] = "thread", @@@ -2388,7 -2357,7 +2353,7 @@@ static const struct file_operations pro #endif
#ifdef CONFIG_TASK_IO_ACCOUNTING - static int do_io_accounting(struct task_struct *task, char *buffer, int whole) + static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole) { struct task_io_accounting acct = task->ioac; unsigned long flags; @@@ -2412,7 -2381,7 +2377,7 @@@
unlock_task_sighand(task, &flags); } - result = sprintf(buffer, + result = seq_printf(m, "rchar: %llu\n" "wchar: %llu\n" "syscr: %llu\n" @@@ -2432,20 -2401,22 +2397,22 @@@ out_unlock return result; }
- static int proc_tid_io_accounting(struct task_struct *task, char *buffer) + static int proc_tid_io_accounting(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { - return do_io_accounting(task, buffer, 0); + return do_io_accounting(task, m, 0); }
- static int proc_tgid_io_accounting(struct task_struct *task, char *buffer) + static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { - return do_io_accounting(task, buffer, 1); + return do_io_accounting(task, m, 1); } #endif /* CONFIG_TASK_IO_ACCOUNTING */
#ifdef CONFIG_USER_NS static int proc_id_map_open(struct inode *inode, struct file *file, - struct seq_operations *seq_ops) + const struct seq_operations *seq_ops) { struct user_namespace *ns = NULL; struct task_struct *task; @@@ -2553,10 -2524,10 +2520,10 @@@ static const struct pid_entry tgid_base DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), #endif REG("environ", S_IRUSR, proc_environ_operations), - INF("auxv", S_IRUSR, proc_pid_auxv), + ONE("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), - INF("limits", S_IRUGO, proc_pid_limits), + ONE("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif @@@ -2565,9 -2536,9 +2532,9 @@@ #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK - INF("syscall", S_IRUSR, proc_pid_syscall), + ONE("syscall", S_IRUSR, proc_pid_syscall), #endif - INF("cmdline", S_IRUGO, proc_pid_cmdline), + ONE("cmdline", S_IRUGO, proc_pid_cmdline), ONE("stat", S_IRUGO, proc_tgid_stat), ONE("statm", S_IRUGO, proc_pid_statm), REG("maps", S_IRUGO, proc_pid_maps_operations), @@@ -2590,13 -2561,13 +2557,13 @@@ DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), #endif #ifdef CONFIG_KALLSYMS - INF("wchan", S_IRUGO, proc_pid_wchan), + ONE("wchan", S_IRUGO, proc_pid_wchan), #endif #ifdef CONFIG_STACKTRACE ONE("stack", S_IRUSR, proc_pid_stack), #endif #ifdef CONFIG_SCHEDSTATS - INF("schedstat", S_IRUGO, proc_pid_schedstat), + ONE("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP REG("latency", S_IRUGO, proc_lstats_operations), @@@ -2607,7 -2578,7 +2574,7 @@@ #ifdef CONFIG_CGROUPS REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif - INF("oom_score", S_IRUGO, proc_oom_score), + ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL @@@ -2621,10 -2592,10 +2588,10 @@@ REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUSR, proc_tgid_io_accounting), + ONE("io", S_IRUSR, proc_tgid_io_accounting), #endif #ifdef CONFIG_HARDWALL - INF("hardwall", S_IRUGO, proc_pid_hardwall), + ONE("hardwall", S_IRUGO, proc_pid_hardwall), #endif #ifdef CONFIG_USER_NS REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), @@@ -2672,7 -2643,8 +2639,7 @@@ static void proc_flush_task_mnt(struct /* no ->d_hash() rejects on procfs */ dentry = d_hash_and_lookup(mnt->mnt_root, &name); if (dentry) { - shrink_dcache_parent(dentry); - d_drop(dentry); + d_invalidate(dentry); dput(dentry); }
@@@ -2692,7 -2664,8 +2659,7 @@@ name.len = snprintf(buf, sizeof(buf), "%d", pid); dentry = d_hash_and_lookup(dir, &name); if (dentry) { - shrink_dcache_parent(dentry); - d_drop(dentry); + d_invalidate(dentry); dput(dentry); }
@@@ -2774,12 -2747,12 +2741,12 @@@ out
struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { - int result = 0; + int result = -ENOENT; struct task_struct *task; unsigned tgid; struct pid_namespace *ns;
- tgid = name_to_int(dentry); + tgid = name_to_int(&dentry->d_name); if (tgid == ~0U) goto out;
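name_to_int() now takes the dentry's qstr rather than the dentry itself; either way it only accepts a plain decimal name. A hedged userspace sketch of the expected behaviour, assuming leading zeros and non-digits are rejected with ~0U (overflow handling is omitted here):

#include <stdio.h>

static unsigned int name_to_uint(const char *name, unsigned int len)
{
    unsigned int n = 0;

    if (len > 1 && name[0] == '0')
        return ~0U;                    /* "007" is not a valid PID directory */
    for (unsigned int i = 0; i < len; i++) {
        if (name[i] < '0' || name[i] > '9')
            return ~0U;
        n = n * 10 + (name[i] - '0');
    }
    return len ? n : ~0U;
}

int main(void)
{
    printf("%u\n", name_to_uint("1234", 4));   /* 1234      */
    printf("%u\n", name_to_uint("12ab", 4));   /* 4294967295 */
    return 0;
}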
@@@ -2890,18 -2863,18 +2857,18 @@@ static const struct pid_entry tid_base_ DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), REG("environ", S_IRUSR, proc_environ_operations), - INF("auxv", S_IRUSR, proc_pid_auxv), + ONE("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), - INF("limits", S_IRUGO, proc_pid_limits), + ONE("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK - INF("syscall", S_IRUSR, proc_pid_syscall), + ONE("syscall", S_IRUSR, proc_pid_syscall), #endif - INF("cmdline", S_IRUGO, proc_pid_cmdline), + ONE("cmdline", S_IRUGO, proc_pid_cmdline), ONE("stat", S_IRUGO, proc_tid_stat), ONE("statm", S_IRUGO, proc_pid_statm), REG("maps", S_IRUGO, proc_tid_maps_operations), @@@ -2926,13 -2899,13 +2893,13 @@@ DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), #endif #ifdef CONFIG_KALLSYMS - INF("wchan", S_IRUGO, proc_pid_wchan), + ONE("wchan", S_IRUGO, proc_pid_wchan), #endif #ifdef CONFIG_STACKTRACE ONE("stack", S_IRUSR, proc_pid_stack), #endif #ifdef CONFIG_SCHEDSTATS - INF("schedstat", S_IRUGO, proc_pid_schedstat), + ONE("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP REG("latency", S_IRUGO, proc_lstats_operations), @@@ -2943,7 -2916,7 +2910,7 @@@ #ifdef CONFIG_CGROUPS REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif - INF("oom_score", S_IRUGO, proc_oom_score), + ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL @@@ -2954,10 -2927,10 +2921,10 @@@ REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUSR, proc_tid_io_accounting), + ONE("io", S_IRUSR, proc_tid_io_accounting), #endif #ifdef CONFIG_HARDWALL - INF("hardwall", S_IRUGO, proc_pid_hardwall), + ONE("hardwall", S_IRUGO, proc_pid_hardwall), #endif #ifdef CONFIG_USER_NS REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), @@@ -3027,7 -3000,7 +2994,7 @@@ static struct dentry *proc_task_lookup( if (!leader) goto out_no_task;
- tid = name_to_int(dentry); + tid = name_to_int(&dentry->d_name); if (tid == ~0U) goto out;
diff --combined fs/proc/fd.c index eb82e9f,955bb55..e11d7c5 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@@ -129,6 -129,8 +129,6 @@@ static int tid_fd_revalidate(struct den } put_task_struct(task); } - - d_drop(dentry); return 0; }
@@@ -204,7 -206,7 +204,7 @@@ static struct dentry *proc_lookupfd_com { struct task_struct *task = get_proc_task(dir); int result = -ENOENT; - unsigned fd = name_to_int(dentry); + unsigned fd = name_to_int(&dentry->d_name);
if (!task) goto out_no_task; diff --combined include/linux/fs.h index 2daccaf,8b4a021..1ab6c69 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@@ -833,7 -833,7 +833,7 @@@ static inline struct file *get_file(str * * Lockd stuffs a "host" pointer into this. */ -typedef struct files_struct *fl_owner_t; +typedef void *fl_owner_t;
struct file_lock_operations { void (*fl_copy_lock)(struct file_lock *, struct file_lock *); @@@ -2688,7 -2688,7 +2688,7 @@@ static const struct file_operations __f .read = simple_attr_read, \ .write = simple_attr_write, \ .llseek = generic_file_llseek, \ - }; + }
static inline __printf(1, 2) void __simple_attr_check_format(const char *fmt, ...) diff --combined include/linux/kernel.h index a9e2268,44a498d..e989204 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@@ -470,6 -470,7 +470,7 @@@ extern enum system_states #define TAINT_FIRMWARE_WORKAROUND 11 #define TAINT_OOT_MODULE 12 #define TAINT_UNSIGNED_MODULE 13 + #define TAINT_SOFTLOCKUP 14
extern const char hex_asc[]; #define hex_asc_lo(x) hex_asc[((x) & 0x0f)] @@@ -493,15 -494,10 +494,10 @@@ static inline char *hex_byte_pack_upper return buf; }
extern int hex_to_bin(char ch); extern int __must_check hex2bin(u8 *dst, const char *src, size_t count);
-int mac_pton(const char *s, u8 *mac); +bool mac_pton(const char *s, u8 *mac);
/* * General tracing related utility functions - trace_printk(), @@@ -719,23 -715,8 +715,8 @@@ static inline void ftrace_dump(enum ftr (void) (&_max1 == &_max2); \ _max1 > _max2 ? _max1 : _max2; })
- #define min3(x, y, z) ({ \ - typeof(x) _min1 = (x); \ - typeof(y) _min2 = (y); \ - typeof(z) _min3 = (z); \ - (void) (&_min1 == &_min2); \ - (void) (&_min1 == &_min3); \ - _min1 < _min2 ? (_min1 < _min3 ? _min1 : _min3) : \ - (_min2 < _min3 ? _min2 : _min3); }) - - #define max3(x, y, z) ({ \ - typeof(x) _max1 = (x); \ - typeof(y) _max2 = (y); \ - typeof(z) _max3 = (z); \ - (void) (&_max1 == &_max2); \ - (void) (&_max1 == &_max3); \ - _max1 > _max2 ? (_max1 > _max3 ? _max1 : _max3) : \ - (_max2 > _max3 ? _max2 : _max3); }) + #define min3(x, y, z) min((typeof(x))min(x, y), z) + #define max3(x, y, z) max((typeof(x))max(x, y), z)
/** * min_not_zero - return the minimum that is _not_ zero, unless both are zero @@@ -750,20 -731,13 +731,13 @@@ /** * clamp - return a value clamped to a given range with strict typechecking * @val: current value - * @min: minimum allowable value - * @max: maximum allowable value + * @lo: lowest allowable value + * @hi: highest allowable value * * This macro does strict typechecking of min/max to make sure they are of the * same type as val. See the unnecessary pointer comparisons. */ - #define clamp(val, min, max) ({ \ - typeof(val) __val = (val); \ - typeof(min) __min = (min); \ - typeof(max) __max = (max); \ - (void) (&__val == &__min); \ - (void) (&__val == &__max); \ - __val = __val < __min ? __min: __val; \ - __val > __max ? __max: __val; }) + #define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)
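The simplified min3()/max3() and clamp() above are just nested two-argument min()/max(). A quick userspace check of the same identities, using plain int helpers instead of the kernel's typeof()-based macros:

#include <stdio.h>

static int min_i(int a, int b) { return a < b ? a : b; }
static int max_i(int a, int b) { return a > b ? a : b; }

static int min3_i(int x, int y, int z)      { return min_i(min_i(x, y), z); }
static int clamp_i(int val, int lo, int hi) { return min_i(max_i(val, lo), hi); }

int main(void)
{
    printf("min3(7, 3, 5)   = %d\n", min3_i(7, 3, 5));     /* 3  */
    printf("clamp(12, 0, 10) = %d\n", clamp_i(12, 0, 10)); /* 10 */
    printf("clamp(-4, 0, 10) = %d\n", clamp_i(-4, 0, 10)); /* 0  */
    return 0;
}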
/* * ..and if you can't take the strict diff --combined include/linux/scatterlist.h index f4ec8bb,4b152c8..ed8f9e7 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@@ -136,7 -136,7 +136,7 @@@ static inline void sg_set_buf(struct sc static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents, struct scatterlist *sgl) { - #ifndef ARCH_HAS_SG_CHAIN + #ifndef CONFIG_ARCH_HAS_SG_CHAIN BUG(); #endif
@@@ -229,10 -229,10 +229,10 @@@ void sg_init_one(struct scatterlist *, typedef struct scatterlist *(sg_alloc_fn)(unsigned int, gfp_t); typedef void (sg_free_fn)(struct scatterlist *, unsigned int);
-void __sg_free_table(struct sg_table *, unsigned int, sg_free_fn *); +void __sg_free_table(struct sg_table *, unsigned int, bool, sg_free_fn *); void sg_free_table(struct sg_table *); -int __sg_alloc_table(struct sg_table *, unsigned int, unsigned int, gfp_t, - sg_alloc_fn *); +int __sg_alloc_table(struct sg_table *, unsigned int, unsigned int, + struct scatterlist *, gfp_t, sg_alloc_fn *); int sg_alloc_table(struct sg_table *, unsigned int, gfp_t); int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages, unsigned int n_pages, diff --combined include/linux/sched.h index fa964cf,b9d5364..89f531e --- a/include/linux/sched.h +++ b/include/linux/sched.h @@@ -33,6 -33,7 +33,7 @@@ struct sched_param
#include <linux/smp.h> #include <linux/sem.h> + #include <linux/shm.h> #include <linux/signal.h> #include <linux/compiler.h> #include <linux/completion.h> @@@ -1270,6 -1271,9 +1271,6 @@@ struct task_struct #ifdef CONFIG_TREE_PREEMPT_RCU struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -#ifdef CONFIG_RCU_BOOST - struct rt_mutex *rcu_boost_mutex; -#endif /* #ifdef CONFIG_RCU_BOOST */
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; @@@ -1304,12 -1308,13 +1305,12 @@@ * execve */ unsigned in_iowait:1;
- /* task may not gain privileges */ - unsigned no_new_privs:1; - /* Revert to default priority/policy when forking */ unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1;
+ unsigned long atomic_flags; /* Flags needing atomic access. */ + pid_t pid; pid_t tgid;
@@@ -1385,6 -1390,7 +1386,7 @@@ #ifdef CONFIG_SYSVIPC /* ipc stuff */ struct sysv_sem sysvsem; + struct sysv_shm sysvshm; #endif #ifdef CONFIG_DETECT_HUNG_TASK /* hung task detection */ @@@ -1436,6 -1442,8 +1438,6 @@@ struct rb_node *pi_waiters_leftmost; /* Deadlock detection and priority inheritance handling */ struct rt_mutex_waiter *pi_blocked_on; - /* Top pi_waiters task */ - struct task_struct *pi_top_task; #endif
#ifdef CONFIG_DEBUG_MUTEXES @@@ -1628,12 -1636,6 +1630,6 @@@ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */ - struct memcg_batch_info { - int do_batch; /* incremented when batch uncharge started */ - struct mem_cgroup *memcg; /* target memcg of uncharge */ - unsigned long nr_pages; /* uncharged usage */ - unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ - } memcg_batch; unsigned int memcg_kmem_skip_account; struct memcg_oom_info { struct mem_cgroup *memcg; @@@ -1961,19 -1963,6 +1957,19 @@@ static inline void memalloc_noio_restor current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags; }
+/* Per-process atomic flags. */ +#define PFA_NO_NEW_PRIVS 0x00000001 /* May not gain new privileges. */ + +static inline bool task_no_new_privs(struct task_struct *p) +{ + return test_bit(PFA_NO_NEW_PRIVS, &p->atomic_flags); +} + +static inline void task_set_no_new_privs(struct task_struct *p) +{ + set_bit(PFA_NO_NEW_PRIVS, &p->atomic_flags); +} + /* * task->jobctl flags */ @@@ -2016,6 -2005,9 +2012,6 @@@ static inline void rcu_copy_process(str #ifdef CONFIG_TREE_PREEMPT_RCU p->rcu_blocked_node = NULL; #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -#ifdef CONFIG_RCU_BOOST - p->rcu_boost_mutex = NULL; -#endif /* #ifdef CONFIG_RCU_BOOST */ INIT_LIST_HEAD(&p->rcu_node_entry); }
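no_new_privs moves from a task_struct bitfield into the new atomic_flags word so it can be set and tested without a lock. A userspace analogue of that pattern using GCC/Clang __atomic builtins in place of set_bit()/test_bit(); the flag value matches the hunk above, everything else is invented for the sketch:

#include <stdio.h>

#define PFA_NO_NEW_PRIVS 0x00000001UL

static unsigned long atomic_flags;

static void task_set_no_new_privs(void)
{
    __atomic_fetch_or(&atomic_flags, PFA_NO_NEW_PRIVS, __ATOMIC_SEQ_CST);
}

static int task_no_new_privs(void)
{
    return (__atomic_load_n(&atomic_flags, __ATOMIC_SEQ_CST) &
            PFA_NO_NEW_PRIVS) != 0;
}

int main(void)
{
    printf("before: %d\n", task_no_new_privs());
    task_set_no_new_privs();                 /* one-way: never cleared */
    printf("after:  %d\n", task_no_new_privs());
    return 0;
}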
@@@ -2364,10 -2356,8 +2360,10 @@@ static inline int on_sig_stack(unsigne
static inline int sas_ss_flags(unsigned long sp) { - return (current->sas_ss_size == 0 ? SS_DISABLE - : on_sig_stack(sp) ? SS_ONSTACK : 0); + if (!current->sas_ss_size) + return SS_DISABLE; + + return on_sig_stack(sp) ? SS_ONSTACK : 0; }
static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig) @@@ -2794,7 -2784,7 +2790,7 @@@ static inline bool __must_check current
/* * Polling state must be visible before we test NEED_RESCHED, - * paired by resched_task() + * paired by resched_curr() */ smp_mb__after_atomic();
@@@ -2812,7 -2802,7 +2808,7 @@@ static inline bool __must_check current
/* * Polling state must be visible before we test NEED_RESCHED, - * paired by resched_task() + * paired by resched_curr() */ smp_mb__after_atomic();
@@@ -2844,7 -2834,7 +2840,7 @@@ static inline void current_clr_polling( * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also * fold. */ - smp_mb(); /* paired with resched_task() */ + smp_mb(); /* paired with resched_curr() */
preempt_fold_need_resched(); } @@@ -2969,15 -2959,10 +2965,10 @@@ static inline void inc_syscw(struct tas
#ifdef CONFIG_MEMCG extern void mm_update_next_owner(struct mm_struct *mm); - extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); #else static inline void mm_update_next_owner(struct mm_struct *mm) { } - - static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p) - { - } #endif /* CONFIG_MEMCG */
static inline unsigned long task_rlimit(const struct task_struct *tsk, diff --combined include/scsi/scsi.h index e6df23c,d34cf2d..261e708 --- a/include/scsi/scsi.h +++ b/include/scsi/scsi.h @@@ -31,7 -31,7 +31,7 @@@ enum scsi_timeouts * Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit * is totally arbitrary, a setting of 2048 will get you at least 8mb ios. */ - #ifdef ARCH_HAS_SG_CHAIN + #ifdef CONFIG_ARCH_HAS_SG_CHAIN #define SCSI_MAX_SG_CHAIN_SEGMENTS 2048 #else #define SCSI_MAX_SG_CHAIN_SEGMENTS SCSI_MAX_SG_SEGMENTS @@@ -332,7 -332,6 +332,7 @@@ static inline int scsi_status_is_good(i #define TYPE_ENCLOSURE 0x0d /* Enclosure Services Device */ #define TYPE_RBC 0x0e #define TYPE_OSD 0x11 +#define TYPE_ZBC 0x14 #define TYPE_NO_LUN 0x7f
/* SCSI protocols; these are taken from SPC-3 section 7.5 */ @@@ -386,7 -385,7 +386,7 @@@ struct scsi_lun #define SCSI_W_LUN_ACCESS_CONTROL (SCSI_W_LUN_BASE + 2) #define SCSI_W_LUN_TARGET_LOG_PAGE (SCSI_W_LUN_BASE + 3)
-static inline int scsi_is_wlun(unsigned int lun) +static inline int scsi_is_wlun(u64 lun) { return (lun & 0xff00) == SCSI_W_LUN_BASE; } diff --combined init/Kconfig index 85fb985,77dc4cb..d3ef635 --- a/init/Kconfig +++ b/init/Kconfig @@@ -505,7 -505,7 +505,7 @@@ config PREEMPT_RC def_bool TREE_PREEMPT_RCU help This option enables preemptible-RCU code that is common between - the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations. + TREE_PREEMPT_RCU and, in the old days, TINY_PREEMPT_RCU.
config RCU_STALL_COMMON def_bool ( TREE_RCU || TREE_PREEMPT_RCU || RCU_TRACE ) @@@ -737,7 -737,7 +737,7 @@@ choic
config RCU_NOCB_CPU_NONE bool "No build_forced no-CBs CPUs" - depends on RCU_NOCB_CPU && !NO_HZ_FULL + depends on RCU_NOCB_CPU && !NO_HZ_FULL_ALL help This option does not force any of the CPUs to be no-CBs CPUs. Only CPUs designated by the rcu_nocbs= boot parameter will be @@@ -751,7 -751,7 +751,7 @@@
config RCU_NOCB_CPU_ZERO bool "CPU 0 is a build_forced no-CBs CPU" - depends on RCU_NOCB_CPU && !NO_HZ_FULL + depends on RCU_NOCB_CPU && !NO_HZ_FULL_ALL help This option forces CPU 0 to be a no-CBs CPU, so that its RCU callbacks are invoked by a per-CPU kthread whose name begins @@@ -807,15 -807,53 +807,53 @@@ config LOG_BUF_SHIF range 12 21 default 17 help - Select kernel log buffer size as a power of 2. + Select the minimal kernel log buffer size as a power of 2. + The final size is affected by LOG_CPU_MAX_BUF_SHIFT config + parameter, see below. Any higher size also might be forced + by "log_buf_len" boot parameter. + Examples: - 17 => 128 KB + 17 => 128 KB 16 => 64 KB - 15 => 32 KB - 14 => 16 KB + 15 => 32 KB + 14 => 16 KB 13 => 8 KB 12 => 4 KB
+ config LOG_CPU_MAX_BUF_SHIFT + int "CPU kernel log buffer size contribution (13 => 8 KB, 17 => 128KB)" + range 0 21 + default 12 if !BASE_SMALL + default 0 if BASE_SMALL + help + This option allows increasing the default ring buffer size + according to the number of CPUs. The value defines the contribution + of each CPU as a power of 2. The used space is typically only a few + lines; however, it might be much more when problems are reported, + e.g. backtraces. + + The increased size means that a new buffer has to be allocated and + the original static one is unused. It makes sense only on systems + with more CPUs. Therefore this value is used only when the sum of + contributions is greater than half of the default kernel ring + buffer as defined by LOG_BUF_SHIFT. The default values are set + so that more than 64 CPUs are needed to trigger the allocation. + + Also, this option is ignored when the "log_buf_len" kernel parameter is + used, as it forces an exact (power of two) size of the ring buffer. + + The number of possible CPUs is used for this computation, ignoring + hotplugging, making the computation optimal for the worst-case + scenario while allowing a simple algorithm to be used from bootup. + + Example shift values and their meaning: + 17 => 128 KB for each CPU + 16 => 64 KB for each CPU + 15 => 32 KB for each CPU + 14 => 16 KB for each CPU + 13 => 8 KB for each CPU + 12 => 4 KB for each CPU + # # Architectures with an unreliable sched_clock() should select this: # @@@ -1264,77 -1302,6 +1302,77 @@@ config CC_OPTIMIZE_FOR_SIZ
If unsure, say N.
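A rough model of the sizing rule the LOG_CPU_MAX_BUF_SHIFT help text above describes: grow the ring buffer only when the per-CPU contributions exceed half of the static default. The exact logic lives in kernel/printk/printk.c and may differ in details (such as rounding the final size), so treat this purely as an illustration:

#include <stdio.h>

static unsigned int log_buf_size(unsigned int log_buf_shift,
                                 unsigned int cpu_shift,
                                 unsigned int nr_cpus)
{
    unsigned int base  = 1U << log_buf_shift;               /* LOG_BUF_SHIFT default */
    unsigned int extra = (nr_cpus - 1) * (1U << cpu_shift); /* per-CPU contributions */

    /* only allocate a bigger buffer when the contributions exceed
     * half of the static default, as the help text states */
    if (extra <= base / 2)
        return base;
    return base + extra;   /* the kernel may additionally round this up */
}

int main(void)
{
    printf("16 CPUs:  %u bytes\n", log_buf_size(17, 12, 16));
    printf("256 CPUs: %u bytes\n", log_buf_size(17, 12, 256));
    return 0;
}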
+config LTO_MENU + bool "Enable gcc link time optimization (LTO)" + # Only tested on X86 for now. For other architectures you likely + # have to fix some things first, like adding asmlinkages etc. + depends on X86 + # lto does not support excluding flags for specific files + # right now. Can be removed if that is fixed. + depends on !FUNCTION_TRACER + help + With this option gcc will do whole program optimizations for + the whole kernel and module. This increases compile time, but can + lead to better code. It allows gcc to inline functions between + different files and do other optimization. It might also trigger + bugs due to more aggressive optimization. It allows gcc to drop unused + code. On smaller monolithic kernel configurations + it usually leads to smaller kernels, especially when modules + are disabled. + + With this option gcc will also do some global checking over + different source files. It also disables a number of kernel + features. + + This option is recommended for release builds. With LTO + the kernel always has to be re-optimized (but not re-parsed) + on each build. + + This requires a gcc 4.8 or later compiler and + Linux binutils 2.21.51.0.3 or later. gcc 4.9 builds significantly + faster than 4.8 It does not currently work with a FSF release of + binutils or with the gold linker. + + On larger configurations this may need more than 4GB of RAM. + It will likely not work on those with a 32bit compiler. + + When the toolchain support is not available this will (hopefully) + be automatically disabled. + + For more information see Documentation/lto-build + +config LTO_DISABLE + bool "Disable LTO again" + depends on LTO_MENU + default n + help + This option is merely here so that allyesconfig or allmodconfig do + not enable LTO. If you want to actually use LTO do not enable. + +config LTO + bool + default y + depends on LTO_MENU && !LTO_DISABLE + +config LTO_DEBUG + bool "Enable LTO compile time debugging" + depends on LTO + help + Enable LTO debugging in the compiler. The compiler dumps + some log files that make it easier to figure out LTO + behavior. The log files also allow to reconstruct + the global inlining and a global callgraph. + They however add some (single threaded) cost to the + compilation. When in doubt do not enable. + +config LTO_CP_CLONE + bool "Allow aggressive cloning for function specialization" + depends on LTO + help + Allow the compiler to clone and specialize functions for specific + arguments when it determines these arguments are very commonly + called. Experimential. Will increase text size. + config SYSCTL bool
@@@ -1834,8 -1801,6 +1872,8 @@@ config MODULE_FORCE_UNLOA
config MODVERSIONS bool "Module versioning support" + # LTO should work with gcc 4.9 + depends on !LTO help Usually, you have to use modules compiled with your kernel. Saying Y here makes it sometimes possible to use modules diff --combined kernel/acct.c index 3cec8c4,1bfdda0..98c4a20 --- a/kernel/acct.c +++ b/kernel/acct.c @@@ -93,7 -93,6 +93,7 @@@ struct bsd_acct_struct
static DEFINE_SPINLOCK(acct_lock); static LIST_HEAD(acct_list); +static LIST_HEAD(acct_close_list);
/* * Check the amount of free space and suspend/resume accordingly. @@@ -142,12 -141,12 +142,12 @@@ static int check_free_space(struct bsd_ if (acct->active) { if (act < 0) { acct->active = 0; - printk(KERN_INFO "Process accounting paused\n"); + pr_info("Process accounting paused\n"); } } else { if (act > 0) { acct->active = 1; - printk(KERN_INFO "Process accounting resumed\n"); + pr_info("Process accounting resumed\n"); } }
@@@ -262,6 -261,7 +262,7 @@@ SYSCALL_DEFINE1(acct, const char __use
if (name) { struct filename *tmp = getname(name); + if (IS_ERR(tmp)) return PTR_ERR(tmp); error = acct_on(tmp); @@@ -281,20 -281,6 +282,20 @@@ return error; }
+static void acct_close_mnts(struct work_struct *unused) +{ + struct bsd_acct_struct *acct; + + spin_lock(&acct_lock); +restart: + list_for_each_entry(acct, &acct_close_list, list) { + acct_file_reopen(acct, NULL, NULL); + goto restart; + } + spin_unlock(&acct_lock); +} +static DECLARE_WORK(acct_close_work, acct_close_mnts); + /** * acct_auto_close - turn off a filesystem's accounting if it is on * @m: vfsmount being shut down @@@ -304,15 -290,15 +305,15 @@@ */ void acct_auto_close_mnt(struct vfsmount *m) { - struct bsd_acct_struct *acct; + struct bsd_acct_struct *acct, *tmp;
spin_lock(&acct_lock); -restart: - list_for_each_entry(acct, &acct_list, list) + list_for_each_entry_safe(acct, tmp, &acct_list, list) { if (acct->file && acct->file->f_path.mnt == m) { - acct_file_reopen(acct, NULL, NULL); - goto restart; + list_move_tail(&acct->list, &acct_close_list); + schedule_work(&acct_close_work); } + } spin_unlock(&acct_lock); }
@@@ -391,7 -377,7 +392,7 @@@ static comp_t encode_comp_t(unsigned lo return exp; }
- #if ACCT_VERSION==1 || ACCT_VERSION==2 + #if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* * encode an u64 into a comp2_t (24 bits) * @@@ -404,7 -390,7 +405,7 @@@ #define MANTSIZE2 20 /* 20 bit mantissa. */ #define EXPSIZE2 5 /* 5 bit base 2 exponent. */ #define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */ - #define MAXEXP2 ((1 <<EXPSIZE2) - 1) /* Maximum exponent. */ + #define MAXEXP2 ((1 << EXPSIZE2) - 1) /* Maximum exponent. */
static comp2_t encode_comp2_t(u64 value) { @@@ -435,7 -421,7 +436,7 @@@ } #endif
- #if ACCT_VERSION==3 + #if ACCT_VERSION == 3 /* * encode an u64 into a 32 bit IEEE float */ @@@ -444,8 -430,9 +445,9 @@@ static u32 encode_float(u64 value unsigned exp = 190; unsigned u;
- if (value==0) return 0; - while ((s64)value > 0){ + if (value == 0) + return 0; + while ((s64)value > 0) { value <<= 1; exp--; } @@@ -499,22 -486,23 +501,23 @@@ static void do_acct_process(struct bsd_ strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
/* calculate run_time in nsec*/ - do_posix_clock_monotonic_gettime(&uptime); + ktime_get_ts(&uptime); run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec; run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC + current->group_leader->start_time.tv_nsec; /* convert nsec -> AHZ */ elapsed = nsec_to_AHZ(run_time); - #if ACCT_VERSION==3 + #if ACCT_VERSION == 3 ac.ac_etime = encode_float(elapsed); #else ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? - (unsigned long) elapsed : (unsigned long) -1l); + (unsigned long) elapsed : (unsigned long) -1l); #endif - #if ACCT_VERSION==1 || ACCT_VERSION==2 + #if ACCT_VERSION == 1 || ACCT_VERSION == 2 { /* new enlarged etime field */ comp2_t etime = encode_comp2_t(elapsed); + ac.ac_etime_hi = etime >> 16; ac.ac_etime_lo = (u16) etime; } @@@ -524,15 -512,15 +527,15 @@@ /* we really need to bite the bullet and change layout */ ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); - #if ACCT_VERSION==2 + #if ACCT_VERSION == 2 ac.ac_ahz = AHZ; #endif - #if ACCT_VERSION==1 || ACCT_VERSION==2 + #if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* backward-compatible 16 bit fields */ ac.ac_uid16 = ac.ac_uid; ac.ac_gid16 = ac.ac_gid; #endif - #if ACCT_VERSION==3 + #if ACCT_VERSION == 3 ac.ac_pid = task_tgid_nr_ns(current, ns); rcu_read_lock(); ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); @@@ -593,6 -581,7 +596,7 @@@ void acct_collect(long exitcode, int gr
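The accounting record packs times into comp_t, conventionally a 13-bit mantissa with a 3-bit base-8 exponent (with the comp2_t and IEEE-float variants for newer ACCT_VERSIONs, as seen above). A hedged userspace sketch of that basic encoding, without the kernel encoder's rounding details:

#include <stdio.h>
#include <stdint.h>

#define MANTSIZE 13
#define EXPSIZE  3
#define MAXFRACT ((1u << MANTSIZE) - 1)

static uint16_t encode_comp(unsigned long value)
{
    unsigned int exp = 0;

    while (value > MAXFRACT) {
        value >>= EXPSIZE;            /* base-8 exponent == 3-bit shift */
        exp++;
    }
    return (uint16_t)((exp << MANTSIZE) | value);
}

static unsigned long decode_comp(uint16_t c)
{
    return (unsigned long)(c & MAXFRACT) << (EXPSIZE * (c >> MANTSIZE));
}

int main(void)
{
    unsigned long v = 123456;
    uint16_t c = encode_comp(v);

    printf("%lu -> 0x%04x -> ~%lu\n", v, c, decode_comp(c));
    return 0;
}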
if (group_dead && current->mm) { struct vm_area_struct *vma; + down_read(¤t->mm->mmap_sem); vma = current->mm->mmap; while (vma) { diff --combined kernel/fork.c index 7657301,735ea98..38dcf83 --- a/kernel/fork.c +++ b/kernel/fork.c @@@ -315,15 -315,6 +315,15 @@@ static struct task_struct *dup_task_str goto free_ti;
tsk->stack = ti; +#ifdef CONFIG_SECCOMP + /* + * We must handle setting up seccomp filters once we're under + * the sighand lock in case orig has changed between now and + * then. Until then, filter must be NULL to avoid messing up + * the usage counts on the error path calling free_task. + */ + tsk->seccomp.filter = NULL; +#endif
setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); @@@ -374,12 -365,11 +374,11 @@@ static int dup_mmap(struct mm_struct *m */ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
- mm->locked_vm = 0; - mm->mmap = NULL; - mm->vmacache_seqnum = 0; - mm->map_count = 0; - cpumask_clear(mm_cpumask(mm)); - mm->mm_rb = RB_ROOT; + mm->total_vm = oldmm->total_vm; + mm->shared_vm = oldmm->shared_vm; + mm->exec_vm = oldmm->exec_vm; + mm->stack_vm = oldmm->stack_vm; + rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; @@@ -536,19 -526,37 +535,37 @@@ static void mm_init_aio(struct mm_struc #endif }
+ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) + { + #ifdef CONFIG_MEMCG + mm->owner = p; + #endif + } + static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) { + mm->mmap = NULL; + mm->mm_rb = RB_ROOT; + mm->vmacache_seqnum = 0; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); + mm->map_count = 0; + mm->locked_vm = 0; + mm->pinned_vm = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); + mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); + mmu_notifier_mm_init(mm); clear_tlb_flush_pending(mm); + #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS + mm->pmd_huge_pte = NULL; + #endif
if (current->mm) { mm->flags = current->mm->flags & MMF_INIT_MASK; @@@ -558,11 -566,17 +575,17 @@@ mm->def_flags = 0; }
- if (likely(!mm_alloc_pgd(mm))) { - mmu_notifier_mm_init(mm); - return mm; - } + if (mm_alloc_pgd(mm)) + goto fail_nopgd; + + if (init_new_context(p, mm)) + goto fail_nocontext; + + return mm;
+ fail_nocontext: + mm_free_pgd(mm); + fail_nopgd: free_mm(mm); return NULL; } @@@ -596,7 -610,6 +619,6 @@@ struct mm_struct *mm_alloc(void return NULL;
memset(mm, 0, sizeof(*mm)); - mm_init_cpumask(mm); return mm_init(mm, current); }
@@@ -828,17 -841,10 +850,10 @@@ static struct mm_struct *dup_mm(struct goto fail_nomem;
memcpy(mm, oldmm, sizeof(*mm)); - mm_init_cpumask(mm);
if (!mm_init(mm, tsk)) goto fail_nomem;
- if (init_new_context(tsk, mm)) - goto fail_nocontext; - dup_mm_exe_file(oldmm, mm);
err = dup_mmap(mm, oldmm); @@@ -860,15 -866,6 +875,6 @@@ free_pt
fail_nomem: return NULL; - - fail_nocontext: - /* - * If init_new_context() failed, we cannot use mmput() to free the mm - * because it calls destroy_context() - */ - mm_free_pgd(mm); - free_mm(mm); - return NULL; }
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) @@@ -1090,39 -1087,6 +1096,39 @@@ static int copy_signal(unsigned long cl return 0; }
+static void copy_seccomp(struct task_struct *p) +{ +#ifdef CONFIG_SECCOMP + /* + * Must be called with sighand->lock held, which is common to + * all threads in the group. Holding cred_guard_mutex is not + * needed because this new task is not yet running and cannot + * be racing exec. + */ + BUG_ON(!spin_is_locked(¤t->sighand->siglock)); + + /* Ref-count the new filter user, and assign it. */ + get_seccomp_filter(current); + p->seccomp = current->seccomp; + + /* + * Explicitly enable no_new_privs here in case it got set + * between the task_struct being duplicated and holding the + * sighand lock. The seccomp state and nnp must be in sync. + */ + if (task_no_new_privs(current)) + task_set_no_new_privs(p); + + /* + * If the parent gained a seccomp mode after copying thread + * flags and between before we held the sighand lock, we have + * to manually enable the seccomp thread flag here. + */ + if (p->seccomp.mode != SECCOMP_MODE_DISABLED) + set_tsk_thread_flag(p, TIF_SECCOMP); +#endif +} + SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) { current->clear_child_tid = tidptr; @@@ -1137,16 -1101,10 +1143,9 @@@ static void rt_mutex_init_task(struct t p->pi_waiters = RB_ROOT; p->pi_waiters_leftmost = NULL; p->pi_blocked_on = NULL; #endif }
- #ifdef CONFIG_MEMCG - void mm_init_owner(struct mm_struct *mm, struct task_struct *p) - { - mm->owner = p; - } - #endif /* CONFIG_MEMCG */ - /* * Initialize POSIX timer handling for a single task. */ @@@ -1237,6 -1195,7 +1236,6 @@@ static struct task_struct *copy_process goto fork_out;
ftrace_graph_init_task(p); - get_seccomp_filter(p);
rt_mutex_init_task(p);
@@@ -1302,7 -1261,7 +1301,7 @@@
posix_cpu_timers_init(p);
- do_posix_clock_monotonic_gettime(&p->start_time); + ktime_get_ts(&p->start_time); p->real_start_time = p->start_time; monotonic_to_bootbased(&p->real_start_time); p->io_context = NULL; @@@ -1347,10 -1306,6 +1346,6 @@@ #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif - #ifdef CONFIG_MEMCG - p->memcg_batch.do_batch = 0; - p->memcg_batch.memcg = NULL; - #endif #ifdef CONFIG_BCACHE p->sequential_io = 0; p->sequential_io_avg = 0; @@@ -1368,6 -1323,7 +1363,7 @@@ if (retval) goto bad_fork_cleanup_policy; /* copy all the process information */ + shm_init_task(p); retval = copy_semundo(clone_flags, p); if (retval) goto bad_fork_cleanup_audit; @@@ -1477,12 -1433,6 +1473,12 @@@ spin_lock(¤t->sighand->siglock);
/* + * Copy seccomp details explicitly here, in case they were changed + * before holding sighand lock. + */ + copy_seccomp(p); + + /* * Process group and session signals need to be delivered to just the * parent before the fork or both the parent and the child after the * fork. Restart if a signal comes in before we add the new process to @@@ -1919,6 -1869,11 +1915,11 @@@ SYSCALL_DEFINE1(unshare, unsigned long */ exit_sem(current); } + if (unshare_flags & CLONE_NEWIPC) { + /* Orphan segments in old ns (see sem above). */ + exit_shm(current); + shm_init_task(current); + }
if (new_nsproxy) switch_task_namespaces(current, new_nsproxy); diff --combined lib/Kconfig index a8a775730,fdf90f3..2accc79 --- a/lib/Kconfig +++ b/lib/Kconfig @@@ -177,6 -177,13 +177,13 @@@ config CRC when they need to do cyclic redundancy check according CRC8 algorithm. Module will be called crc8.
+ config CRC64_ECMA + tristate "CRC64 ECMA function" + help + This option provides CRC64 ECMA function. Drivers may select this + when they need to do cyclic redundancy check according to the CRC64 + ECMA algorithm. + config AUDIT_GENERIC bool depends on AUDIT && !AUDIT_ARCH @@@ -396,6 -403,39 +403,39 @@@ config CPU_RMA config DQL bool
+ config GLOB + bool + # This actually supports modular compilation, but the module overhead + # is ridiculous for the amount of code involved. Until an out-of-tree + # driver asks for it, we'll just link it directly it into the kernel + # when required. Since we're ignoring out-of-tree users, there's also + # no need bother prompting for a manual decision: + # prompt "glob_match() function" + help + This option provides a glob_match function for performing + simple text pattern matching. It originated in the ATA code + to blacklist particular drive models, but other device drivers + may need similar functionality. + + All drivers in the Linux kernel tree that require this function + should automatically select this option. Say N unless you + are compiling an out-of tree driver which tells you that it + depends on this. + + config GLOB_SELFTEST + bool "glob self-test on init" + default n + depends on GLOB + help + This option enables a simple self-test of the glob_match + function on startup. It is primarily useful for people + working on the code to ensure they haven't introduced any + regressions. + + It only adds a little bit of code and slows kernel boot (or + module load) by a small amount, so you're welcome to play with + it, but you probably don't need it. + # # Netlink attribute parsing support is select'ed if needed # @@@ -451,8 -491,7 +491,8 @@@ config MPILI
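The new GLOB option provides glob_match() for '*'/'?'-style patterns, originally motivated by the ATA drive-model blacklist. The kernel function has its own signature; as a userspace feel for the same kind of matching, POSIX fnmatch(3) does the job (the patterns below are made up):

#include <fnmatch.h>
#include <stdio.h>

int main(void)
{
    const char *blacklist[] = { "ST3*", "WDC WD??00*", NULL };
    const char *model = "ST3500418AS";

    for (int i = 0; blacklist[i]; i++)
        if (fnmatch(blacklist[i], model, 0) == 0)
            printf("matched pattern: %s\n", blacklist[i]);
    return 0;
}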
config SIGNATURE tristate - depends on KEYS && CRYPTO + depends on KEYS + select CRYPTO select CRYPTO_SHA1 select MPILIB help @@@ -475,4 -514,11 +515,11 @@@ config UCS2_STRIN
source "lib/fonts/Kconfig"
+ # + # sg chaining option + # + + config ARCH_HAS_SG_CHAIN + def_bool n + endmenu diff --combined lib/Kconfig.debug index 066936a,fd939e1..ff15fb6 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@@ -15,7 -15,7 +15,7 @@@ config PRINTK_TIM The behavior is also controlled by the kernel command line parameter printk.time=1. See Documentation/kernel-parameters.txt
- config DEFAULT_MESSAGE_LOGLEVEL + config MESSAGE_LOGLEVEL_DEFAULT int "Default message log level (1-7)" range 1 7 default "4" @@@ -180,7 -180,7 +180,7 @@@ config STRIP_ASM_SYM
config READABLE_ASM bool "Generate readable assembler code" - depends on DEBUG_KERNEL + depends on DEBUG_KERNEL && !LTO help Disable some compiler optimizations that tend to generate human unreadable assembler output. This may make the kernel slightly slower, but it helps @@@ -835,7 -835,7 +835,7 @@@ config DEBUG_RT_MUTEXE
config RT_MUTEX_TESTER bool "Built-in scriptable tester for rt-mutexes" - depends on DEBUG_KERNEL && RT_MUTEXES + depends on DEBUG_KERNEL && RT_MUTEXES && BROKEN help This option enables a rt-mutex tester.
@@@ -1131,6 -1131,20 +1131,6 @@@ config PROVE_RCU_REPEATEDL
Say N if you are unsure.
-config PROVE_RCU_DELAY - bool "RCU debugging: preemptible RCU race provocation" - depends on DEBUG_KERNEL && PREEMPT_RCU - default n - help - There is a class of races that involve an unlikely preemption - of __rcu_read_unlock() just after ->rcu_read_lock_nesting has - been set to INT_MIN. This feature inserts a delay at that - point to increase the probability of these races. - - Say Y to increase probability of preemption of __rcu_read_unlock(). - - Say N if you are unsure. - config SPARSE_RCU_POINTER bool "RCU debugging: sparse-based checks for pointer usage" default n @@@ -1635,19 -1649,6 +1635,19 @@@ config TEST_BP
If unsure, say N.
+config TEST_FIRMWARE + tristate "Test firmware loading via userspace interface" + default n + depends on FW_LOADER + help + This builds the "test_firmware" module that creates a userspace + interface for testing firmware loading. This can be used to + control the triggering of firmware loading without needing an + actual firmware-using device. The contents can be rechecked by + userspace. + + If unsure, say N. + source "samples/Kconfig"
source "lib/Kconfig.kgdb" diff --combined lib/Makefile index 230b4b1,e48067c..44dbcee --- a/lib/Makefile +++ b/lib/Makefile @@@ -34,7 -34,6 +34,7 @@@ obj-$(CONFIG_TEST_KSTRTOX) += test-kstr obj-$(CONFIG_TEST_MODULE) += test_module.o obj-$(CONFIG_TEST_USER_COPY) += test_user_copy.o obj-$(CONFIG_TEST_BPF) += test_bpf.o +obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o
ifeq ($(CONFIG_DEBUG_KOBJECT),y) CFLAGS_kobject.o += -DDEBUG @@@ -72,6 -71,7 +72,7 @@@ obj-$(CONFIG_CRC32) += crc32. obj-$(CONFIG_CRC7) += crc7.o obj-$(CONFIG_LIBCRC32C) += libcrc32c.o obj-$(CONFIG_CRC8) += crc8.o + obj-$(CONFIG_CRC64_ECMA) += crc64_ecma.o obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/ @@@ -137,6 -137,8 +138,8 @@@ obj-$(CONFIG_CORDIC) += cordic.
obj-$(CONFIG_DQL) += dynamic_queue_limits.o
+ obj-$(CONFIG_GLOB) += glob.o + obj-$(CONFIG_MPILIB) += mpi/ obj-$(CONFIG_SIGNATURE) += digsig.o
diff --combined lib/scatterlist.c index b4415fc,4251cbd..9cdf62f --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@@ -73,7 -73,7 +73,7 @@@ EXPORT_SYMBOL(sg_nents) **/ struct scatterlist *sg_last(struct scatterlist *sgl, unsigned int nents) { - #ifndef ARCH_HAS_SG_CHAIN + #ifndef CONFIG_ARCH_HAS_SG_CHAIN struct scatterlist *ret = &sgl[nents - 1]; #else struct scatterlist *sg, *ret = NULL; @@@ -165,7 -165,6 +165,7 @@@ static void sg_kfree(struct scatterlis * __sg_free_table - Free a previously mapped sg table * @table: The sg table header to use * @max_ents: The maximum number of entries per single scatterlist + * @skip_first_chunk: don't free the (preallocated) first scatterlist chunk * @free_fn: Free function * * Description: @@@ -175,7 -174,7 +175,7 @@@ * **/ void __sg_free_table(struct sg_table *table, unsigned int max_ents, - sg_free_fn *free_fn) + bool skip_first_chunk, sg_free_fn *free_fn) { struct scatterlist *sgl, *next;
@@@ -203,10 -202,7 +203,10 @@@ }
table->orig_nents -= sg_size; - free_fn(sgl, alloc_size); + if (!skip_first_chunk) { + free_fn(sgl, alloc_size); + skip_first_chunk = false; + } sgl = next; }
@@@ -221,7 -217,7 +221,7 @@@ EXPORT_SYMBOL(__sg_free_table) **/ void sg_free_table(struct sg_table *table) { - __sg_free_table(table, SG_MAX_SINGLE_ALLOC, sg_kfree); + __sg_free_table(table, SG_MAX_SINGLE_ALLOC, false, sg_kfree); } EXPORT_SYMBOL(sg_free_table);
@@@ -245,8 -241,8 +245,8 @@@ * **/ int __sg_alloc_table(struct sg_table *table, unsigned int nents, - unsigned int max_ents, gfp_t gfp_mask, - sg_alloc_fn *alloc_fn) + unsigned int max_ents, struct scatterlist *first_chunk, + gfp_t gfp_mask, sg_alloc_fn *alloc_fn) { struct scatterlist *sg, *prv; unsigned int left; @@@ -255,7 -251,7 +255,7 @@@
if (nents == 0) return -EINVAL; - #ifndef ARCH_HAS_SG_CHAIN + #ifndef CONFIG_ARCH_HAS_SG_CHAIN if (WARN_ON_ONCE(nents > max_ents)) return -EINVAL; #endif @@@ -273,12 -269,7 +273,12 @@@
left -= sg_size;
- sg = alloc_fn(alloc_size, gfp_mask); + if (first_chunk) { + sg = first_chunk; + first_chunk = NULL; + } else { + sg = alloc_fn(alloc_size, gfp_mask); + } if (unlikely(!sg)) { /* * Adjust entry count to reflect that the last @@@ -333,9 -324,9 +333,9 @@@ int sg_alloc_table(struct sg_table *tab int ret;
ret = __sg_alloc_table(table, nents, SG_MAX_SINGLE_ALLOC, - gfp_mask, sg_kmalloc); + NULL, gfp_mask, sg_kmalloc); if (unlikely(ret)) - __sg_free_table(table, SG_MAX_SINGLE_ALLOC, sg_kfree); + __sg_free_table(table, SG_MAX_SINGLE_ALLOC, false, sg_kfree);
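The scatterlist changes above thread a caller-preallocated first chunk through setup and teardown: __sg_alloc_table() consumes first_chunk before falling back to alloc_fn, and __sg_free_table() is told via skip_first_chunk not to free memory it does not own (sg_alloc_table()/sg_free_table() keep the old behaviour by passing NULL and false). A very reduced userspace sketch of that ownership rule, with invented names and none of the chaining details, is:

/* Sketch of the "optional preallocated first chunk" ownership pattern.
 * chunk_table, table_init and table_release are invented for this
 * illustration and do not exist in the kernel. */
#include <stdio.h>
#include <stdlib.h>

struct chunk_table {
	void *first;		/* may be caller-owned */
	void *rest;		/* always allocated here */
	int first_is_ours;
};

static int table_init(struct chunk_table *t, void *first_chunk, size_t sz)
{
	if (first_chunk) {
		t->first = first_chunk;		/* borrow, don't allocate */
		t->first_is_ours = 0;
	} else {
		t->first = malloc(sz);
		t->first_is_ours = 1;
	}
	t->rest = malloc(sz);
	return (t->first && t->rest) ? 0 : -1;
}

static void table_release(struct chunk_table *t)
{
	/* mirror of skip_first_chunk: never free what the caller handed in */
	if (t->first_is_ours)
		free(t->first);
	free(t->rest);
}

int main(void)
{
	struct chunk_table a, b;
	char embedded[64];	/* e.g. lives inside a larger driver structure */

	table_init(&a, NULL, 64);	/* everything allocated internally */
	table_init(&b, embedded, 64);	/* first chunk supplied by the caller */
	table_release(&a);
	table_release(&b);
	return 0;
}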
return ret; } diff --combined mm/filemap.c index d175917,fb74fb8..367ea2c --- a/mm/filemap.c +++ b/mm/filemap.c @@@ -31,6 -31,7 +31,7 @@@ #include <linux/security.h> #include <linux/cpuset.h> #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ + #include <linux/hugetlb.h> #include <linux/memcontrol.h> #include <linux/cleancache.h> #include <linux/rmap.h> @@@ -233,7 -234,6 +234,6 @@@ void delete_from_page_cache(struct pag spin_lock_irq(&mapping->tree_lock); __delete_from_page_cache(page, NULL); spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_cache_page(page);
if (freepage) freepage(page); @@@ -241,6 -241,18 +241,6 @@@ } EXPORT_SYMBOL(delete_from_page_cache);
-static int sleep_on_page(void *word) -{ - io_schedule(); - return 0; -} - -static int sleep_on_page_killable(void *word) -{ - sleep_on_page(word); - return fatal_signal_pending(current) ? -EINTR : 0; -} - static int filemap_check_errors(struct address_space *mapping) { int ret = 0; @@@ -489,8 -501,7 +489,7 @@@ int replace_page_cache_page(struct pag if (PageSwapBacked(new)) __inc_zone_page_state(new, NR_SHMEM); spin_unlock_irq(&mapping->tree_lock); - /* mem_cgroup codes must not be called under tree_lock */ - mem_cgroup_replace_page_cache(old, new); + mem_cgroup_migrate(old, new, true); radix_tree_preload_end(); if (freepage) freepage(old); @@@ -548,19 -559,24 +547,24 @@@ static int __add_to_page_cache_locked(s pgoff_t offset, gfp_t gfp_mask, void **shadowp) { + int huge = PageHuge(page); + struct mem_cgroup *memcg; int error;
VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapBacked(page), page);
- error = mem_cgroup_charge_file(page, current->mm, - gfp_mask & GFP_RECLAIM_MASK); - if (error) - return error; + if (!huge) { + error = mem_cgroup_try_charge(page, current->mm, + gfp_mask, &memcg); + if (error) + return error; + }
error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); if (error) { - mem_cgroup_uncharge_cache_page(page); + if (!huge) + mem_cgroup_cancel_charge(page, memcg); return error; }
@@@ -575,13 -591,16 +579,16 @@@ goto err_insert; __inc_zone_page_state(page, NR_FILE_PAGES); spin_unlock_irq(&mapping->tree_lock); + if (!huge) + mem_cgroup_commit_charge(page, memcg, false); trace_mm_filemap_add_to_page_cache(page); return 0; err_insert: page->mapping = NULL; /* Leave page->index set: truncation relies upon it */ spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_cache_page(page); + if (!huge) + mem_cgroup_cancel_charge(page, memcg); page_cache_release(page); return error; } @@@ -680,7 -699,7 +687,7 @@@ void wait_on_page_bit(struct page *page DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
if (test_bit(bit_nr, &page->flags)) - __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page, + __wait_on_bit(page_waitqueue(page), &wait, bit_wait_io, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(wait_on_page_bit); @@@ -693,7 -712,7 +700,7 @@@ int wait_on_page_bit_killable(struct pa return 0;
return __wait_on_bit(page_waitqueue(page), &wait, - sleep_on_page_killable, TASK_KILLABLE); + bit_wait_io, TASK_KILLABLE); }
/** @@@ -794,7 -813,7 +801,7 @@@ void __lock_page(struct page *page { DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
- __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page, + __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__lock_page); @@@ -804,10 -823,21 +811,21 @@@ int __lock_page_killable(struct page *p DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
return __wait_on_bit_lock(page_waitqueue(page), &wait, - sleep_on_page_killable, TASK_KILLABLE); + bit_wait_io, TASK_KILLABLE); } EXPORT_SYMBOL_GPL(__lock_page_killable);
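The filemap hunks above drop the file-local sleep_on_page()/sleep_on_page_killable() callbacks and pass the common bit_wait_io helper to __wait_on_bit() and __wait_on_bit_lock() instead. The generic shape being consolidated, many waiters sharing one sleep routine keyed on a flag bit, looks roughly like the pthread sketch below; all names are invented and neither the waitqueue hashing nor io_schedule() behaviour is modelled.

/* Userspace sketch of "wait until a flag bit clears" through one shared
 * wait helper rather than a per-call-site sleep callback. */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

#define PG_LOCKED 0x1

static unsigned int page_flags = PG_LOCKED;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

/* the one shared "bit_wait"-style helper */
static void wait_on_flag_bit(unsigned int bit)
{
	pthread_mutex_lock(&lock);
	while (page_flags & bit)
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
}

static void clear_flag_bit_and_wake(unsigned int bit)
{
	pthread_mutex_lock(&lock);
	page_flags &= ~bit;
	pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);
}

static void *unlocker(void *arg)
{
	(void)arg;
	sleep(1);			/* pretend the I/O completes here */
	clear_flag_bit_and_wake(PG_LOCKED);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, unlocker, NULL);
	wait_on_flag_bit(PG_LOCKED);	/* every waiter uses the same helper */
	puts("bit cleared");
	pthread_join(t, NULL);
	return 0;
}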
+ /* + * Return values: + * 1 - page is locked; mmap_sem is still held. + * 0 - page is not locked. + * mmap_sem has been released (up_read()), unless flags had both + * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in + * which case mmap_sem is still held. + * + * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1 + * with the page locked and the mmap_sem unperturbed. + */ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags) { @@@ -1088,9 -1118,9 +1106,9 @@@ no_page if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) fgp_flags |= FGP_LOCK;
- /* Init accessed so avoit atomic mark_page_accessed later */ + /* Init accessed so avoid atomic mark_page_accessed later */ if (fgp_flags & FGP_ACCESSED) - init_page_accessed(page); + __SetPageReferenced(page);
err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask); if (unlikely(err)) { @@@ -1824,6 -1854,18 +1842,18 @@@ static void do_async_mmap_readahead(str * The goto's are kind of ugly, but this streamlines the normal case of having * it in the page cache, and handles the special cases reasonably without * having a lot of duplicated code. + * + * vma->vm_mm->mmap_sem must be held on entry. + * + * If our return value has VM_FAULT_RETRY set, it's because + * lock_page_or_retry() returned 0. + * The mmap_sem has usually been released in this case. + * See __lock_page_or_retry() for the exception. + * + * If our return value does not have VM_FAULT_RETRY set, the mmap_sem + * has not been released. + * + * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. */ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { diff --combined mm/memcontrol.c index 45c10c6,d44bf3e..6f81411 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@@ -648,10 -648,8 +648,8 @@@ EXPORT_SYMBOL(memcg_kmem_enabled_key)
static void disarm_kmem_keys(struct mem_cgroup *memcg) { - if (memcg_kmem_is_active(memcg)) { + if (memcg_kmem_is_active(memcg)) static_key_slow_dec(&memcg_kmem_enabled_key); - ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id); - } /* * This check can't live in kmem destruction function, * since the charges will outlive the cgroup @@@ -754,9 -752,11 +752,11 @@@ static void __mem_cgroup_remove_exceede static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, struct mem_cgroup_tree_per_zone *mctz) { - spin_lock(&mctz->lock); + unsigned long flags; + + spin_lock_irqsave(&mctz->lock, flags); __mem_cgroup_remove_exceeded(mz, mctz); - spin_unlock(&mctz->lock); + spin_unlock_irqrestore(&mctz->lock, flags); }
@@@ -779,7 -779,9 +779,9 @@@ static void mem_cgroup_update_tree(stru * mem is over its softlimit. */ if (excess || mz->on_tree) { - spin_lock(&mctz->lock); + unsigned long flags; + + spin_lock_irqsave(&mctz->lock, flags); /* if on-tree, remove it */ if (mz->on_tree) __mem_cgroup_remove_exceeded(mz, mctz); @@@ -788,7 -790,7 +790,7 @@@ * If excess is 0, no tree ops. */ __mem_cgroup_insert_exceeded(mz, mctz, excess); - spin_unlock(&mctz->lock); + spin_unlock_irqrestore(&mctz->lock, flags); } } } @@@ -839,9 -841,9 +841,9 @@@ mem_cgroup_largest_soft_limit_node(stru { struct mem_cgroup_per_zone *mz;
- spin_lock(&mctz->lock); + spin_lock_irq(&mctz->lock); mz = __mem_cgroup_largest_soft_limit_node(mctz); - spin_unlock(&mctz->lock); + spin_unlock_irq(&mctz->lock); return mz; }
@@@ -882,13 -884,6 +884,6 @@@ static long mem_cgroup_read_stat(struc return val; }
- static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, - bool charge) - { - int val = (charge) ? 1 : -1; - this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); - } - static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, enum mem_cgroup_events_index idx) { @@@ -909,13 -904,13 +904,13 @@@
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, struct page *page, - bool anon, int nr_pages) + int nr_pages) { /* * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is * counted as CACHE even if it's on ANON LRU. */ - if (anon) + if (PageAnon(page)) __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); else @@@ -1013,7 -1008,6 +1008,6 @@@ static bool mem_cgroup_event_ratelimit( */ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) { - preempt_disable(); /* threshold event is triggered in finer grain than soft limit */ if (unlikely(mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_THRESH))) { @@@ -1026,8 -1020,6 +1020,6 @@@ do_numainfo = mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_NUMAINFO); #endif - preempt_enable(); - mem_cgroup_threshold(memcg); if (unlikely(do_softlimit)) mem_cgroup_update_tree(memcg, page); @@@ -1035,8 -1027,7 +1027,7 @@@ if (unlikely(do_numainfo)) atomic_inc(&memcg->numainfo_events); #endif - } else - preempt_enable(); + } }
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) @@@ -1347,20 -1338,6 +1338,6 @@@ out return lruvec; }
- /* - * Following LRU functions are allowed to be used without PCG_LOCK. - * Operations are called by routine of global LRU independently from memcg. - * What we have to take care of here is validness of pc->mem_cgroup. - * - * Changes to pc->mem_cgroup happens when - * 1. charge - * 2. moving account - * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. - * It is added to LRU before charge. - * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. - * When moving account, the page is not on LRU. It's isolated. - */ - /** * mem_cgroup_page_lruvec - return lruvec for adding an lru page * @page: the page @@@ -2261,22 -2238,14 +2238,14 @@@ cleanup * * Notes: Race condition * - * We usually use lock_page_cgroup() for accessing page_cgroup member but - * it tends to be costly. But considering some conditions, we doesn't need - * to do so _always_. + * Charging occurs during page instantiation, while the page is + * unmapped and locked in page migration, or while the page table is + * locked in THP migration. No race is possible. * - * Considering "charge", lock_page_cgroup() is not required because all - * file-stat operations happen after a page is attached to radix-tree. There - * are no race with "charge". + * Uncharge happens to pages with zero references, no race possible. * - * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup - * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even - * if there are race with "uncharge". Statistics itself is properly handled - * by flags. - * - * Considering "move", this is an only case we see a race. To make the race - * small, we check memcg->moving_account and detect there are possibility - * of race or not. If there is, we take a lock. + * Charge moving between groups is protected by checking mm->moving + * account and taking the move_lock in the slowpath. */
void __mem_cgroup_begin_update_page_stat(struct page *page, @@@ -2551,55 -2520,63 +2520,63 @@@ static int memcg_cpu_hotplug_callback(s return NOTIFY_OK; }
- - /* See mem_cgroup_try_charge() for details */ - enum { - CHARGE_OK, /* success */ - CHARGE_RETRY, /* need to retry but retry is not bad */ - CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ - CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ - }; - - static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, - unsigned int nr_pages, unsigned int min_pages, - bool invoke_oom) + static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, + unsigned int nr_pages) { - unsigned long csize = nr_pages * PAGE_SIZE; + unsigned int batch = max(CHARGE_BATCH, nr_pages); + int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem_over_limit; struct res_counter *fail_res; + unsigned long nr_reclaimed; unsigned long flags = 0; - int ret; + unsigned long long size; + int ret = 0;
- ret = res_counter_charge(&memcg->res, csize, &fail_res); + retry: + if (consume_stock(memcg, nr_pages)) + goto done;
- if (likely(!ret)) { + size = batch * PAGE_SIZE; + if (!res_counter_charge(&memcg->res, size, &fail_res)) { if (!do_swap_account) - return CHARGE_OK; - ret = res_counter_charge(&memcg->memsw, csize, &fail_res); - if (likely(!ret)) - return CHARGE_OK; - - res_counter_uncharge(&memcg->res, csize); + goto done_restock; + if (!res_counter_charge(&memcg->memsw, size, &fail_res)) + goto done_restock; + res_counter_uncharge(&memcg->res, size); mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); flags |= MEM_CGROUP_RECLAIM_NOSWAP; } else mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); + + if (batch > nr_pages) { + batch = nr_pages; + goto retry; + } + /* - * Never reclaim on behalf of optional batching, retry with a - * single page instead. + * Unlike in global OOM situations, memcg is not in a physical + * memory shortage. Allow dying and OOM-killed tasks to + * bypass the last charges so that they can exit quickly and + * free their memory. */ - if (nr_pages > min_pages) - return CHARGE_RETRY; + if (unlikely(test_thread_flag(TIF_MEMDIE) || + fatal_signal_pending(current) || + current->flags & PF_EXITING)) + goto bypass; + + if (unlikely(task_in_memcg_oom(current))) + goto nomem;
if (!(gfp_mask & __GFP_WAIT)) - return CHARGE_WOULDBLOCK; + goto nomem;
- if (gfp_mask & __GFP_NORETRY) - return CHARGE_NOMEM; + nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); + + if (mem_cgroup_margin(mem_over_limit) >= batch) + goto retry;
- ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); - if (mem_cgroup_margin(mem_over_limit) >= nr_pages) - return CHARGE_RETRY; + if (gfp_mask & __GFP_NORETRY) + goto nomem; /* * Even though the limit is exceeded at this point, reclaim * may have been able to free some pages. Retry the charge @@@ -2609,142 -2586,47 +2586,47 @@@ * unlikely to succeed so close to the limit, and we fall back * to regular pages anyway in case of failure. */ - if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) - return CHARGE_RETRY; - + if (nr_reclaimed && batch <= (1 << PAGE_ALLOC_COSTLY_ORDER)) + goto retry; /* * At task move, charge accounts can be doubly counted. So, it's * better to wait until the end of task_move if something is going on. */ if (mem_cgroup_wait_acct_move(mem_over_limit)) - return CHARGE_RETRY; - - if (invoke_oom) - mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize)); - - return CHARGE_NOMEM; - } - - /** - * mem_cgroup_try_charge - try charging a memcg - * @memcg: memcg to charge - * @nr_pages: number of pages to charge - * @oom: trigger OOM if reclaim fails - * - * Returns 0 if @memcg was charged successfully, -EINTR if the charge - * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed. - */ - static int mem_cgroup_try_charge(struct mem_cgroup *memcg, - gfp_t gfp_mask, - unsigned int nr_pages, - bool oom) - { - unsigned int batch = max(CHARGE_BATCH, nr_pages); - int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; - int ret; - - if (mem_cgroup_is_root(memcg)) - goto done; - /* - * Unlike in global OOM situations, memcg is not in a physical - * memory shortage. Allow dying and OOM-killed tasks to - * bypass the last charges so that they can exit quickly and - * free their memory. - */ - if (unlikely(test_thread_flag(TIF_MEMDIE) || - fatal_signal_pending(current) || - current->flags & PF_EXITING)) - goto bypass; + goto retry;
- if (unlikely(task_in_memcg_oom(current))) - goto nomem; + if (nr_retries--) + goto retry;
if (gfp_mask & __GFP_NOFAIL) - oom = false; - again: - if (consume_stock(memcg, nr_pages)) - goto done; - - do { - bool invoke_oom = oom && !nr_oom_retries; - - /* If killed, bypass charge */ - if (fatal_signal_pending(current)) - goto bypass; + goto bypass;
- ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, - nr_pages, invoke_oom); - switch (ret) { - case CHARGE_OK: - break; - case CHARGE_RETRY: /* not in OOM situation but retry */ - batch = nr_pages; - goto again; - case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ - goto nomem; - case CHARGE_NOMEM: /* OOM routine works */ - if (!oom || invoke_oom) - goto nomem; - nr_oom_retries--; - break; - } - } while (ret != CHARGE_OK); + if (fatal_signal_pending(current)) + goto bypass;
- if (batch > nr_pages) - refill_stock(memcg, batch - nr_pages); - done: - return 0; + mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(batch)); nomem: if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; bypass: - return -EINTR; - } - - /** - * mem_cgroup_try_charge_mm - try charging a mm - * @mm: mm_struct to charge - * @nr_pages: number of pages to charge - * @oom: trigger OOM if reclaim fails - * - * Returns the charged mem_cgroup associated with the given mm_struct or - * NULL the charge failed. - */ - static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, - gfp_t gfp_mask, - unsigned int nr_pages, - bool oom) - - { - struct mem_cgroup *memcg; - int ret; - - memcg = get_mem_cgroup_from_mm(mm); - ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom); - css_put(&memcg->css); - if (ret == -EINTR) - memcg = root_mem_cgroup; - else if (ret) - memcg = NULL; + memcg = root_mem_cgroup; + ret = -EINTR; + goto retry;
- return memcg; + done_restock: + if (batch > nr_pages) + refill_stock(memcg, batch - nr_pages); + done: + return ret; }
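The rewritten try_charge() above folds the old CHARGE_* state machine into one retry loop: it first consumes the per-cpu stock, otherwise charges a whole batch against the res_counter, drops back to the bare request when the batch bounces, reclaims, and only then considers OOM or bypass, refilling the stock with any surplus on success. A much simplified, single-threaded sketch of the batch-then-retry accounting (invented names, no reclaim, OOM or swap handling):

/* Toy model of "charge a batch, keep the surplus as stock".  Not the
 * kernel algorithm, only the accounting shape of try_charge(). */
#include <stdio.h>

#define BATCH 32UL

static unsigned long limit = 100, usage;	/* res_counter stand-in */
static unsigned long stock;			/* per-cpu stock stand-in */

static int counter_charge(unsigned long pages)
{
	if (usage + pages > limit)
		return -1;
	usage += pages;
	return 0;
}

static int try_charge(unsigned long nr_pages)
{
	unsigned long batch = nr_pages > BATCH ? nr_pages : BATCH;

	if (stock >= nr_pages) {		/* consume_stock() analogue */
		stock -= nr_pages;
		return 0;
	}
	for (;;) {
		if (!counter_charge(batch)) {
			stock += batch - nr_pages;	/* refill_stock() analogue */
			return 0;
		}
		if (batch > nr_pages) {		/* retry with the bare request */
			batch = nr_pages;
			continue;
		}
		return -1;			/* reclaim/OOM paths omitted */
	}
}

int main(void)
{
	int i, ok = 0;

	for (i = 0; i < 200; i++)
		ok += !try_charge(1);
	printf("charged %d pages, usage %lu/%lu, stock %lu\n",
	       ok, usage, limit, stock);
	return 0;
}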
- /* - * Somemtimes we have to undo a charge we got by try_charge(). - * This function is for that and do uncharge, put css's refcnt. - * gotten by try_charge(). - */ - static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, - unsigned int nr_pages) + static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { - if (!mem_cgroup_is_root(memcg)) { - unsigned long bytes = nr_pages * PAGE_SIZE; + unsigned long bytes = nr_pages * PAGE_SIZE;
- res_counter_uncharge(&memcg->res, bytes); - if (do_swap_account) - res_counter_uncharge(&memcg->memsw, bytes); - } + res_counter_uncharge(&memcg->res, bytes); + if (do_swap_account) + res_counter_uncharge(&memcg->memsw, bytes); }
/* @@@ -2756,9 -2638,6 +2638,6 @@@ static void __mem_cgroup_cancel_local_c { unsigned long bytes = nr_pages * PAGE_SIZE;
- if (mem_cgroup_is_root(memcg)) - return; - res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); if (do_swap_account) res_counter_uncharge_until(&memcg->memsw, @@@ -2779,6 -2658,16 +2658,16 @@@ static struct mem_cgroup *mem_cgroup_lo return mem_cgroup_from_id(id); }
+ /* + * try_get_mem_cgroup_from_page - look up page's memcg association + * @page: the page + * + * Look up, get a css reference, and return the memcg that owns @page. + * + * The page must be locked to prevent racing with swap-in and page + * cache charges. If coming from an unlocked page table, the caller + * must ensure the page is on the LRU or this can race with charging. + */ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) { struct mem_cgroup *memcg = NULL; @@@ -2789,7 -2678,6 +2678,6 @@@ VM_BUG_ON_PAGE(!PageLocked(page), page);
pc = lookup_page_cgroup(page); - lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { memcg = pc->mem_cgroup; if (memcg && !css_tryget_online(&memcg->css)) @@@ -2803,23 -2691,46 +2691,46 @@@ memcg = NULL; rcu_read_unlock(); } return memcg; }
- static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, - struct page *page, - unsigned int nr_pages, - enum charge_type ctype, - bool lrucare) + static void lock_page_lru(struct page *page, int *isolated) + { + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page)) { + struct lruvec *lruvec; + + lruvec = mem_cgroup_page_lruvec(page, zone); + ClearPageLRU(page); + del_page_from_lru_list(page, lruvec, page_lru(page)); + *isolated = 1; + } else + *isolated = 0; + } + + static void unlock_page_lru(struct page *page, int isolated) + { + struct zone *zone = page_zone(page); + + if (isolated) { + struct lruvec *lruvec; + + lruvec = mem_cgroup_page_lruvec(page, zone); + VM_BUG_ON_PAGE(PageLRU(page), page); + SetPageLRU(page); + add_page_to_lru_list(page, lruvec, page_lru(page)); + } + spin_unlock_irq(&zone->lru_lock); + } + + static void commit_charge(struct page *page, struct mem_cgroup *memcg, + unsigned int nr_pages, bool lrucare) { struct page_cgroup *pc = lookup_page_cgroup(page); - struct zone *uninitialized_var(zone); - struct lruvec *lruvec; - bool was_on_lru = false; - bool anon; + int isolated;
- lock_page_cgroup(pc); VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); /* * we don't need page_cgroup_lock about tail pages, becase they are not @@@ -2830,52 -2741,38 +2741,38 @@@ * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page * may already be on some other mem_cgroup's LRU. Take care of it. */ - if (lrucare) { - zone = page_zone(page); - spin_lock_irq(&zone->lru_lock); - if (PageLRU(page)) { - lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); - ClearPageLRU(page); - del_page_from_lru_list(page, lruvec, page_lru(page)); - was_on_lru = true; - } - } + if (lrucare) + lock_page_lru(page, &isolated);
- pc->mem_cgroup = memcg; /* - * We access a page_cgroup asynchronously without lock_page_cgroup(). - * Especially when a page_cgroup is taken from a page, pc->mem_cgroup - * is accessed after testing USED bit. To make pc->mem_cgroup visible - * before USED bit, we need memory barrier here. - * See mem_cgroup_add_lru_list(), etc. + * Nobody should be changing or seriously looking at + * pc->mem_cgroup and pc->flags at this point: + * + * - the page is uncharged + * + * - the page is off-LRU + * + * - an anonymous fault has exclusive page access, except for + * a locked page table + * + * - a page cache insertion, a swapin fault, or a migration + * have the page locked */ - smp_wmb(); - SetPageCgroupUsed(pc); - - if (lrucare) { - if (was_on_lru) { - lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); - VM_BUG_ON_PAGE(PageLRU(page), page); - SetPageLRU(page); - add_page_to_lru_list(page, lruvec, page_lru(page)); - } - spin_unlock_irq(&zone->lru_lock); - } - - if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON) - anon = true; - else - anon = false; + pc->mem_cgroup = memcg; + pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0);
- mem_cgroup_charge_statistics(memcg, page, anon, nr_pages); - unlock_page_cgroup(pc); + if (lrucare) + unlock_page_lru(page, isolated);
+ local_irq_disable(); + mem_cgroup_charge_statistics(memcg, page, nr_pages); /* * "charge_statistics" updated event counter. Then, check it. * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. * if they exceeds softlimit. */ memcg_check_events(memcg, page); + local_irq_enable(); }
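commit_charge() above splits the old LRU juggling into lock_page_lru()/unlock_page_lru(): when the page may already be on the LRU (lrucare), it is pulled off under zone->lru_lock, pc->mem_cgroup and pc->flags are set while no list walker can see it, and it is put back afterwards. The generic "isolate, retag, reinsert" pattern, reduced to a mutex-protected list with invented names, is sketched below.

/* Isolate-retag-reinsert sketch: update an item's owner while it is
 * temporarily off the shared list, so concurrent walkers never observe
 * a half-updated item.  Illustrative only. */
#include <pthread.h>
#include <stdio.h>

struct item {
	struct item *next;
	const char *owner;
	int on_list;
};

static struct item *list_head;
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void list_del_item(struct item *it)
{
	struct item **p = &list_head;

	while (*p && *p != it)
		p = &(*p)->next;
	if (*p)
		*p = it->next;
	it->on_list = 0;
}

static void list_add_item(struct item *it)
{
	it->next = list_head;
	list_head = it;
	it->on_list = 1;
}

static void commit_owner(struct item *it, const char *owner, int listcare)
{
	int isolated = 0;

	if (listcare) {				/* lock_page_lru() analogue */
		pthread_mutex_lock(&list_lock);
		if (it->on_list) {
			list_del_item(it);
			isolated = 1;
		}
	}

	it->owner = owner;			/* nobody can see the item here */

	if (listcare) {				/* unlock_page_lru() analogue */
		if (isolated)
			list_add_item(it);
		pthread_mutex_unlock(&list_lock);
	}
}

int main(void)
{
	struct item a = { 0 };

	pthread_mutex_lock(&list_lock);
	list_add_item(&a);
	pthread_mutex_unlock(&list_lock);

	commit_owner(&a, "group-A", 1);
	printf("owner=%s on_list=%d\n", a.owner, a.on_list);
	return 0;
}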
static DEFINE_MUTEX(set_limit_mutex); @@@ -2896,16 -2793,13 +2793,13 @@@ static inline bool memcg_can_account_km }
/* - * This is a bit cumbersome, but it is rarely used and avoids a backpointer - * in the memcg_cache_params struct. + * helper for acessing a memcg's index. It will be used as an index in the + * child cache array in kmem_cache, and also to derive its name. This function + * will return -1 when this is not a kmem-limited memcg. */ - static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) + static inline int memcg_cache_id(struct mem_cgroup *memcg) { - struct kmem_cache *cachep; - - VM_BUG_ON(p->is_root_cache); - cachep = p->root_cache; - return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); + return memcg ? memcg->kmemcg_id : -1; }
#ifdef CONFIG_SLABINFO @@@ -2921,7 -2815,7 +2815,7 @@@ static int mem_cgroup_slabinfo_read(str
mutex_lock(&memcg_slab_mutex); list_for_each_entry(params, &memcg->memcg_slab_caches, list) - cache_show(memcg_params_to_cache(params), m); + cache_show(params->cachep, m); mutex_unlock(&memcg_slab_mutex);
return 0; @@@ -2937,22 -2831,21 +2831,21 @@@ static int memcg_charge_kmem(struct mem if (ret) return ret;
- ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT, - oom_gfp_allowed(gfp)); + ret = try_charge(memcg, gfp, size >> PAGE_SHIFT); if (ret == -EINTR) { /* - * mem_cgroup_try_charge() chosed to bypass to root due to - * OOM kill or fatal signal. Since our only options are to - * either fail the allocation or charge it to this cgroup, do - * it as a temporary condition. But we can't fail. From a - * kmem/slab perspective, the cache has already been selected, - * by mem_cgroup_kmem_get_cache(), so it is too late to change + * try_charge() chose to bypass to root due to OOM kill or + * fatal signal. Since our only options are to either fail + * the allocation or charge it to this cgroup, do it as a + * temporary condition. But we can't fail. From a kmem/slab + * perspective, the cache has already been selected, by + * mem_cgroup_kmem_get_cache(), so it is too late to change * our minds. * * This condition will only trigger if the task entered - * memcg_charge_kmem in a sane state, but was OOM-killed during - * mem_cgroup_try_charge() above. Tasks that were already - * dying when the allocation triggers should have been already + * memcg_charge_kmem in a sane state, but was OOM-killed + * during try_charge() above. Tasks that were already dying + * when the allocation triggers should have been already * directed to the root cgroup in memcontrol.h */ res_counter_charge_nofail(&memcg->res, size, &fail_res); @@@ -2988,16 -2881,6 +2881,6 @@@ static void memcg_uncharge_kmem(struct css_put(&memcg->css); }
- /* - * helper for acessing a memcg's index. It will be used as an index in the - * child cache array in kmem_cache, and also to derive its name. This function - * will return -1 when this is not a kmem-limited memcg. - */ - int memcg_cache_id(struct mem_cgroup *memcg) - { - return memcg ? memcg->kmemcg_id : -1; - } - static size_t memcg_caches_array_size(int num_groups) { ssize_t size; @@@ -3043,6 -2926,10 +2926,10 @@@ int memcg_update_cache_size(struct kmem return -ENOMEM;
new_params->is_root_cache = true; + INIT_LIST_HEAD(&new_params->children); + if (cur_params) + list_replace(&cur_params->children, + &new_params->children);
/* * There is the chance it will be bigger than @@@ -3095,11 -2982,14 +2982,14 @@@ int memcg_alloc_cache_params(struct mem return -ENOMEM;
if (memcg) { + s->memcg_params->cachep = s; s->memcg_params->memcg = memcg; s->memcg_params->root_cache = root_cache; css_get(&memcg->css); - } else + } else { s->memcg_params->is_root_cache = true; + INIT_LIST_HEAD(&s->memcg_params->children); + }
return 0; } @@@ -3119,11 -3009,18 +3009,18 @@@ static void memcg_register_cache(struc static char memcg_name_buf[NAME_MAX + 1]; /* protected by memcg_slab_mutex */ struct kmem_cache *cachep; + char *cache_name; int id;
lockdep_assert_held(&memcg_slab_mutex);
id = memcg_cache_id(memcg); + /* + * The cgroup was taken offline while the create work was pending, + * nothing to do then. + */ + if (id < 0) + return;
/* * Since per-memcg caches are created asynchronously on first @@@ -3134,14 -3031,22 +3031,22 @@@ return;
cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); - cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); + + cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, + mem_cgroup_id(memcg), memcg_name_buf); + if (!cache_name) + return; + + cachep = memcg_create_kmem_cache(memcg, root_cache, cache_name); /* * If we could not create a memcg cache, do not complain, because * that's not critical at all as we can always proceed with the root * cache. */ - if (!cachep) + if (!cachep) { + kfree(cache_name); return; + }
list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
@@@ -3170,8 -3075,17 +3075,17 @@@ static void memcg_unregister_cache(stru memcg = cachep->memcg_params->memcg; id = memcg_cache_id(memcg);
- BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); - root_cache->memcg_params->memcg_caches[id] = NULL; + /* + * This function can be called both after and before css offline. If + * it's called before css offline, which happens on the root cache + * destruction, we should clear the slot corresponding to the cache in + * memcg_caches array. Otherwise the slot must have already been + * cleared in memcg_unregister_all_caches. + */ + if (id >= 0) { + BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); + root_cache->memcg_params->memcg_caches[id] = NULL; + }
list_del(&cachep->memcg_params->list);
@@@ -3209,42 -3123,41 +3123,41 @@@ static inline void memcg_resume_kmem_ac current->memcg_kmem_skip_account--; }
- int __memcg_cleanup_cache_params(struct kmem_cache *s) + void __memcg_cleanup_cache_params(struct kmem_cache *s) { - struct kmem_cache *c; - int i, failed = 0; + struct memcg_cache_params *params, *tmp;
mutex_lock(&memcg_slab_mutex); - for_each_memcg_cache_index(i) { - c = cache_from_memcg_idx(s, i); - if (!c) - continue; - - memcg_unregister_cache(c); - - if (cache_from_memcg_idx(s, i)) - failed++; - } + list_for_each_entry_safe(params, tmp, + &s->memcg_params->children, siblings) + memcg_unregister_cache(params->cachep); mutex_unlock(&memcg_slab_mutex); - return failed; }
static void memcg_unregister_all_caches(struct mem_cgroup *memcg) { - struct kmem_cache *cachep; struct memcg_cache_params *params, *tmp; + int id = memcg_cache_id(memcg);
if (!memcg_kmem_is_active(memcg)) return;
mutex_lock(&memcg_slab_mutex); + memcg->kmemcg_id = -1; list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { - cachep = memcg_params_to_cache(params); + struct kmem_cache *cachep = params->cachep; + struct kmem_cache *root_cache = params->root_cache; + + BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); + root_cache->memcg_params->memcg_caches[id] = NULL; + kmem_cache_shrink(cachep); if (atomic_read(&cachep->memcg_params->nr_pages) == 0) memcg_unregister_cache(cachep); } mutex_unlock(&memcg_slab_mutex); + + ida_simple_remove(&kmem_limited_groups, id); }
struct memcg_register_cache_work { @@@ -3343,6 -3256,7 +3256,7 @@@ struct kmem_cache *__memcg_kmem_get_cac { struct mem_cgroup *memcg; struct kmem_cache *memcg_cachep; + int id;
VM_BUG_ON(!cachep->memcg_params); VM_BUG_ON(!cachep->memcg_params->is_root_cache); @@@ -3356,7 -3270,15 +3270,15 @@@ if (!memcg_can_account_kmem(memcg)) goto out;
- memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); + id = memcg_cache_id(memcg); + /* + * This can happen if current was migrated to another cgroup and this + * cgroup was taken offline after we issued mem_cgroup_from_task above. + */ + if (unlikely(id < 0)) + goto out; + + memcg_cachep = cache_from_memcg_idx(cachep, id); if (likely(memcg_cachep)) { cachep = memcg_cachep; goto out; @@@ -3463,12 -3385,13 +3385,13 @@@ void __memcg_kmem_commit_charge(struct memcg_uncharge_kmem(memcg, PAGE_SIZE << order); return; } - + /* + * The page is freshly allocated and not visible to any + * outside callers yet. Set up pc non-atomically. + */ pc = lookup_page_cgroup(page); - lock_page_cgroup(pc); pc->mem_cgroup = memcg; - SetPageCgroupUsed(pc); - unlock_page_cgroup(pc); + pc->flags = PCG_USED; }
void __memcg_kmem_uncharge_pages(struct page *page, int order) @@@ -3478,19 -3401,11 +3401,11 @@@
pc = lookup_page_cgroup(page); if (!PageCgroupUsed(pc)) return;
- lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - ClearPageCgroupUsed(pc); - } - unlock_page_cgroup(pc); + memcg = pc->mem_cgroup; + pc->flags = 0;
/* * We trust that only if there is a memcg associated with the page, it @@@ -3510,7 -3425,6 +3425,6 @@@ static inline void memcg_unregister_all
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) /* * Because tail pages are not marked as "used", set it. We're under * zone->lru_lock, 'splitting on pmd' and compound_lock. @@@ -3531,8 -3445,7 +3445,7 @@@ void mem_cgroup_split_huge_fixup(struc for (i = 1; i < HPAGE_PMD_NR; i++) { pc = head_pc + i; pc->mem_cgroup = memcg; - smp_wmb();/* see __commit_charge() */ - pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; + pc->flags = head_pc->flags; } __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], HPAGE_PMD_NR); @@@ -3562,7 -3475,6 +3475,6 @@@ static int mem_cgroup_move_account(stru { unsigned long flags; int ret; - bool anon = PageAnon(page);
VM_BUG_ON(from == to); VM_BUG_ON_PAGE(PageLRU(page), page); @@@ -3576,15 -3488,21 +3488,21 @@@ if (nr_pages > 1 && !PageTransHuge(page)) goto out;
- lock_page_cgroup(pc); + /* + * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup + * of its source page while we change it: page migration takes + * both pages off the LRU, but page cache replacement doesn't. + */ + if (!trylock_page(page)) + goto out;
ret = -EINVAL; if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) - goto unlock; + goto out_unlock;
move_lock_mem_cgroup(from, &flags);
- if (!anon && page_mapped(page)) { + if (!PageAnon(page) && page_mapped(page)) { __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], nr_pages); __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], @@@ -3598,20 -3516,25 +3516,25 @@@ nr_pages); }
- mem_cgroup_charge_statistics(from, page, anon, -nr_pages); + /* + * It is safe to change pc->mem_cgroup here because the page + * is referenced, charged, and isolated - we can't race with + * uncharging, charging, migration, or LRU putback. + */
/* caller should have done css_get */ pc->mem_cgroup = to; - mem_cgroup_charge_statistics(to, page, anon, nr_pages); move_unlock_mem_cgroup(from, &flags); ret = 0; - unlock: - unlock_page_cgroup(pc); - /* - * check events - */ + + local_irq_disable(); + mem_cgroup_charge_statistics(to, page, nr_pages); memcg_check_events(to, page); + mem_cgroup_charge_statistics(from, page, -nr_pages); memcg_check_events(from, page); + local_irq_enable(); + out_unlock: + unlock_page(page); out: return ret; } @@@ -3682,483 -3605,39 +3605,39 @@@ out return ret; }
- int mem_cgroup_charge_anon(struct page *page, - struct mm_struct *mm, gfp_t gfp_mask) + #ifdef CONFIG_MEMCG_SWAP + static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, + bool charge) { - unsigned int nr_pages = 1; - struct mem_cgroup *memcg; - bool oom = true; + int val = (charge) ? 1 : -1; + this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); + }
- if (mem_cgroup_disabled()) - return 0; + /** + * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. + * @entry: swap entry to be moved + * @from: mem_cgroup which the entry is moved from + * @to: mem_cgroup which the entry is moved to + * + * It succeeds only when the swap_cgroup's record for this entry is the same + * as the mem_cgroup's id of @from. + * + * Returns 0 on success, -EINVAL on failure. + * + * The caller must have charged to @to, IOW, called res_counter_charge() about + * both res and memsw, and called css_get(). + */ + static int mem_cgroup_move_swap_account(swp_entry_t entry, + struct mem_cgroup *from, struct mem_cgroup *to) + { + unsigned short old_id, new_id;
- VM_BUG_ON_PAGE(page_mapped(page), page); - VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); - VM_BUG_ON(!mm); + old_id = mem_cgroup_id(from); + new_id = mem_cgroup_id(to);
- if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - /* - * Never OOM-kill a process for a huge page. The - * fault handler will fall back to regular pages. - */ - oom = false; - } - - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom); - if (!memcg) - return -ENOMEM; - __mem_cgroup_commit_charge(memcg, page, nr_pages, - MEM_CGROUP_CHARGE_TYPE_ANON, false); - return 0; - } - - /* - * While swap-in, try_charge -> commit or cancel, the page is locked. - * And when try_charge() successfully returns, one refcnt to memcg without - * struct page_cgroup is acquired. This refcnt will be consumed by - * "commit()" or removed by "cancel()" - */ - static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, - struct page *page, - gfp_t mask, - struct mem_cgroup **memcgp) - { - struct mem_cgroup *memcg = NULL; - struct page_cgroup *pc; - int ret; - - pc = lookup_page_cgroup(page); - /* - * Every swap fault against a single page tries to charge the - * page, bail as early as possible. shmem_unuse() encounters - * already charged pages, too. The USED bit is protected by - * the page lock, which serializes swap cache removal, which - * in turn serializes uncharging. - */ - if (PageCgroupUsed(pc)) - goto out; - if (do_swap_account) - memcg = try_get_mem_cgroup_from_page(page); - if (!memcg) - memcg = get_mem_cgroup_from_mm(mm); - ret = mem_cgroup_try_charge(memcg, mask, 1, true); - css_put(&memcg->css); - if (ret == -EINTR) - memcg = root_mem_cgroup; - else if (ret) - return ret; - out: - *memcgp = memcg; - return 0; - } - - int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, - gfp_t gfp_mask, struct mem_cgroup **memcgp) - { - if (mem_cgroup_disabled()) { - *memcgp = NULL; - return 0; - } - /* - * A racing thread's fault, or swapoff, may have already - * updated the pte, and even removed page from swap cache: in - * those cases unuse_pte()'s pte_same() test will fail; but - * there's also a KSM case which does need to charge the page. - */ - if (!PageSwapCache(page)) { - struct mem_cgroup *memcg; - - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); - if (!memcg) - return -ENOMEM; - *memcgp = memcg; - return 0; - } - return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); - } - - void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) - { - if (mem_cgroup_disabled()) - return; - if (!memcg) - return; - __mem_cgroup_cancel_charge(memcg, 1); - } - - static void - __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, - enum charge_type ctype) - { - if (mem_cgroup_disabled()) - return; - if (!memcg) - return; - - __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); - /* - * Now swap is on-memory. This means this page may be - * counted both as mem and swap....double count. - * Fix it by uncharging from memsw. Basically, this SwapCache is stable - * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() - * may call delete_from_swap_cache() before reach here. 
- */ - if (do_swap_account && PageSwapCache(page)) { - swp_entry_t ent = {.val = page_private(page)}; - mem_cgroup_uncharge_swap(ent); - } - } - - void mem_cgroup_commit_charge_swapin(struct page *page, - struct mem_cgroup *memcg) - { - __mem_cgroup_commit_charge_swapin(page, memcg, - MEM_CGROUP_CHARGE_TYPE_ANON); - } - - int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask) - { - enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; - struct mem_cgroup *memcg; - int ret; - - if (mem_cgroup_disabled()) - return 0; - if (PageCompound(page)) - return 0; - - if (PageSwapCache(page)) { /* shmem */ - ret = __mem_cgroup_try_charge_swapin(mm, page, - gfp_mask, &memcg); - if (ret) - return ret; - __mem_cgroup_commit_charge_swapin(page, memcg, type); - return 0; - } - - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); - if (!memcg) - return -ENOMEM; - __mem_cgroup_commit_charge(memcg, page, 1, type, false); - return 0; - } - - static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, - unsigned int nr_pages, - const enum charge_type ctype) - { - struct memcg_batch_info *batch = NULL; - bool uncharge_memsw = true; - - /* If swapout, usage of swap doesn't decrease */ - if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) - uncharge_memsw = false; - - batch = ¤t->memcg_batch; - /* - * In usual, we do css_get() when we remember memcg pointer. - * But in this case, we keep res->usage until end of a series of - * uncharges. Then, it's ok to ignore memcg's refcnt. - */ - if (!batch->memcg) - batch->memcg = memcg; - /* - * do_batch > 0 when unmapping pages or inode invalidate/truncate. - * In those cases, all pages freed continuously can be expected to be in - * the same cgroup and we have chance to coalesce uncharges. - * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) - * because we want to do uncharge as soon as possible. - */ - - if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) - goto direct_uncharge; - - if (nr_pages > 1) - goto direct_uncharge; - - /* - * In typical case, batch->memcg == mem. This means we can - * merge a series of uncharges to an uncharge of res_counter. - * If not, we uncharge res_counter ony by one. - */ - if (batch->memcg != memcg) - goto direct_uncharge; - /* remember freed charge and uncharge it later */ - batch->nr_pages++; - if (uncharge_memsw) - batch->memsw_nr_pages++; - return; - direct_uncharge: - res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE); - if (uncharge_memsw) - res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); - if (unlikely(batch->memcg != memcg)) - memcg_oom_recover(memcg); - } - - /* - * uncharge if !page_mapped(page) - */ - static struct mem_cgroup * - __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, - bool end_migration) - { - struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = 1; - struct page_cgroup *pc; - bool anon; - - if (mem_cgroup_disabled()) - return NULL; - - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - } - /* - * Check if our page_cgroup is valid - */ - pc = lookup_page_cgroup(page); - if (unlikely(!PageCgroupUsed(pc))) - return NULL; - - lock_page_cgroup(pc); - - memcg = pc->mem_cgroup; - - if (!PageCgroupUsed(pc)) - goto unlock_out; - - anon = PageAnon(page); - - switch (ctype) { - case MEM_CGROUP_CHARGE_TYPE_ANON: - /* - * Generally PageAnon tells if it's the anon statistics to be - * updated; but sometimes e.g. 
mem_cgroup_uncharge_page() is - * used before page reached the stage of being marked PageAnon. - */ - anon = true; - /* fallthrough */ - case MEM_CGROUP_CHARGE_TYPE_DROP: - /* See mem_cgroup_prepare_migration() */ - if (page_mapped(page)) - goto unlock_out; - /* - * Pages under migration may not be uncharged. But - * end_migration() /must/ be the one uncharging the - * unused post-migration page and so it has to call - * here with the migration bit still set. See the - * res_counter handling below. - */ - if (!end_migration && PageCgroupMigration(pc)) - goto unlock_out; - break; - case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: - if (!PageAnon(page)) { /* Shared memory */ - if (page->mapping && !page_is_file_cache(page)) - goto unlock_out; - } else if (page_mapped(page)) /* Anon */ - goto unlock_out; - break; - default: - break; - } - - mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages); - - ClearPageCgroupUsed(pc); - /* - * pc->mem_cgroup is not cleared here. It will be accessed when it's - * freed from LRU. This is safe because uncharged page is expected not - * to be reused (freed soon). Exception is SwapCache, it's handled by - * special functions. - */ - - unlock_page_cgroup(pc); - /* - * even after unlock, we have memcg->res.usage here and this memcg - * will never be freed, so it's safe to call css_get(). - */ - memcg_check_events(memcg, page); - if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { - mem_cgroup_swap_statistics(memcg, true); - css_get(&memcg->css); - } - /* - * Migration does not charge the res_counter for the - * replacement page, so leave it alone when phasing out the - * page that is unused after the migration. - */ - if (!end_migration && !mem_cgroup_is_root(memcg)) - mem_cgroup_do_uncharge(memcg, nr_pages, ctype); - - return memcg; - - unlock_out: - unlock_page_cgroup(pc); - return NULL; - } - - void mem_cgroup_uncharge_page(struct page *page) - { - /* early check. */ - if (page_mapped(page)) - return; - VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); - /* - * If the page is in swap cache, uncharge should be deferred - * to the swap path, which also properly accounts swap usage - * and handles memcg lifetime. - * - * Note that this check is not stable and reclaim may add the - * page to swap cache at any time after this. However, if the - * page is not in swap cache by the time page->mapcount hits - * 0, there won't be any page table references to the swap - * slot, and reclaim will free it and not actually write the - * page to disk. - */ - if (PageSwapCache(page)) - return; - __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false); - } - - void mem_cgroup_uncharge_cache_page(struct page *page) - { - VM_BUG_ON_PAGE(page_mapped(page), page); - VM_BUG_ON_PAGE(page->mapping, page); - __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); - } - - /* - * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate. - * In that cases, pages are freed continuously and we can expect pages - * are in the same memcg. All these calls itself limits the number of - * pages freed at once, then uncharge_start/end() is called properly. - * This may be called prural(2) times in a context, - */ - - void mem_cgroup_uncharge_start(void) - { - current->memcg_batch.do_batch++; - /* We can do nest. 
*/ - if (current->memcg_batch.do_batch == 1) { - current->memcg_batch.memcg = NULL; - current->memcg_batch.nr_pages = 0; - current->memcg_batch.memsw_nr_pages = 0; - } - } - - void mem_cgroup_uncharge_end(void) - { - struct memcg_batch_info *batch = ¤t->memcg_batch; - - if (!batch->do_batch) - return; - - batch->do_batch--; - if (batch->do_batch) /* If stacked, do nothing. */ - return; - - if (!batch->memcg) - return; - /* - * This "batch->memcg" is valid without any css_get/put etc... - * bacause we hide charges behind us. - */ - if (batch->nr_pages) - res_counter_uncharge(&batch->memcg->res, - batch->nr_pages * PAGE_SIZE); - if (batch->memsw_nr_pages) - res_counter_uncharge(&batch->memcg->memsw, - batch->memsw_nr_pages * PAGE_SIZE); - memcg_oom_recover(batch->memcg); - /* forget this pointer (for sanity check) */ - batch->memcg = NULL; - } - - #ifdef CONFIG_SWAP - /* - * called after __delete_from_swap_cache() and drop "page" account. - * memcg information is recorded to swap_cgroup of "ent" - */ - void - mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) - { - struct mem_cgroup *memcg; - int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; - - if (!swapout) /* this was a swap cache but the swap is unused ! */ - ctype = MEM_CGROUP_CHARGE_TYPE_DROP; - - memcg = __mem_cgroup_uncharge_common(page, ctype, false); - - /* - * record memcg information, if swapout && memcg != NULL, - * css_get() was called in uncharge(). - */ - if (do_swap_account && swapout && memcg) - swap_cgroup_record(ent, mem_cgroup_id(memcg)); - } - #endif - - #ifdef CONFIG_MEMCG_SWAP - /* - * called from swap_entry_free(). remove record in swap_cgroup and - * uncharge "memsw" account. - */ - void mem_cgroup_uncharge_swap(swp_entry_t ent) - { - struct mem_cgroup *memcg; - unsigned short id; - - if (!do_swap_account) - return; - - id = swap_cgroup_record(ent, 0); - rcu_read_lock(); - memcg = mem_cgroup_lookup(id); - if (memcg) { - /* - * We uncharge this because swap is freed. This memcg can - * be obsolete one. We avoid calling css_tryget_online(). - */ - if (!mem_cgroup_is_root(memcg)) - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); - mem_cgroup_swap_statistics(memcg, false); - css_put(&memcg->css); - } - rcu_read_unlock(); - } - - /** - * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. - * @entry: swap entry to be moved - * @from: mem_cgroup which the entry is moved from - * @to: mem_cgroup which the entry is moved to - * - * It succeeds only when the swap_cgroup's record for this entry is the same - * as the mem_cgroup's id of @from. - * - * Returns 0 on success, -EINVAL on failure. - * - * The caller must have charged to @to, IOW, called res_counter_charge() about - * both res and memsw, and called css_get(). - */ - static int mem_cgroup_move_swap_account(swp_entry_t entry, - struct mem_cgroup *from, struct mem_cgroup *to) - { - unsigned short old_id, new_id; - - old_id = mem_cgroup_id(from); - new_id = mem_cgroup_id(to); - - if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { - mem_cgroup_swap_statistics(from, false); - mem_cgroup_swap_statistics(to, true); + if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { + mem_cgroup_swap_statistics(from, false); + mem_cgroup_swap_statistics(to, true); /* * This function is only called from task migration context now. * It postpones res_counter and refcount handling till the end @@@ -4183,175 -3662,6 +3662,6 @@@ static inline int mem_cgroup_move_swap_ } #endif
- /* - * Before starting migration, account PAGE_SIZE to mem_cgroup that the old - * page belongs to. - */ - void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, - struct mem_cgroup **memcgp) - { - struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = 1; - struct page_cgroup *pc; - enum charge_type ctype; - - *memcgp = NULL; - - if (mem_cgroup_disabled()) - return; - - if (PageTransHuge(page)) - nr_pages <<= compound_order(page); - - pc = lookup_page_cgroup(page); - lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - css_get(&memcg->css); - /* - * At migrating an anonymous page, its mapcount goes down - * to 0 and uncharge() will be called. But, even if it's fully - * unmapped, migration may fail and this page has to be - * charged again. We set MIGRATION flag here and delay uncharge - * until end_migration() is called - * - * Corner Case Thinking - * A) - * When the old page was mapped as Anon and it's unmap-and-freed - * while migration was ongoing. - * If unmap finds the old page, uncharge() of it will be delayed - * until end_migration(). If unmap finds a new page, it's - * uncharged when it make mapcount to be 1->0. If unmap code - * finds swap_migration_entry, the new page will not be mapped - * and end_migration() will find it(mapcount==0). - * - * B) - * When the old page was mapped but migraion fails, the kernel - * remaps it. A charge for it is kept by MIGRATION flag even - * if mapcount goes down to 0. We can do remap successfully - * without charging it again. - * - * C) - * The "old" page is under lock_page() until the end of - * migration, so, the old page itself will not be swapped-out. - * If the new page is swapped out before end_migraton, our - * hook to usual swap-out path will catch the event. - */ - if (PageAnon(page)) - SetPageCgroupMigration(pc); - } - unlock_page_cgroup(pc); - /* - * If the page is not charged at this point, - * we return here. - */ - if (!memcg) - return; - - *memcgp = memcg; - /* - * We charge new page before it's used/mapped. So, even if unlock_page() - * is called before end_migration, we can catch all events on this new - * page. In the case new page is migrated but not remapped, new page's - * mapcount will be finally 0 and we call uncharge in end_migration(). - */ - if (PageAnon(page)) - ctype = MEM_CGROUP_CHARGE_TYPE_ANON; - else - ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; - /* - * The page is committed to the memcg, but it's not actually - * charged to the res_counter since we plan on replacing the - * old one and only one page is going to be left afterwards. - */ - __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false); - } - - /* remove redundant charge if migration failed*/ - void mem_cgroup_end_migration(struct mem_cgroup *memcg, - struct page *oldpage, struct page *newpage, bool migration_ok) - { - struct page *used, *unused; - struct page_cgroup *pc; - bool anon; - - if (!memcg) - return; - - if (!migration_ok) { - used = oldpage; - unused = newpage; - } else { - used = newpage; - unused = oldpage; - } - anon = PageAnon(used); - __mem_cgroup_uncharge_common(unused, - anon ? MEM_CGROUP_CHARGE_TYPE_ANON - : MEM_CGROUP_CHARGE_TYPE_CACHE, - true); - css_put(&memcg->css); - /* - * We disallowed uncharge of pages under migration because mapcount - * of the page goes down to zero, temporarly. - * Clear the flag and check the page should be charged. 
- */ - pc = lookup_page_cgroup(oldpage); - lock_page_cgroup(pc); - ClearPageCgroupMigration(pc); - unlock_page_cgroup(pc); - - /* - * If a page is a file cache, radix-tree replacement is very atomic - * and we can skip this check. When it was an Anon page, its mapcount - * goes down to 0. But because we added MIGRATION flage, it's not - * uncharged yet. There are several case but page->mapcount check - * and USED bit check in mem_cgroup_uncharge_page() will do enough - * check. (see prepare_charge() also) - */ - if (anon) - mem_cgroup_uncharge_page(used); - } - - /* - * At replace page cache, newpage is not under any memcg but it's on - * LRU. So, this function doesn't touch res_counter but handles LRU - * in correct way. Both pages are locked so we cannot race with uncharge. - */ - void mem_cgroup_replace_page_cache(struct page *oldpage, - struct page *newpage) - { - struct mem_cgroup *memcg = NULL; - struct page_cgroup *pc; - enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; - - if (mem_cgroup_disabled()) - return; - - pc = lookup_page_cgroup(oldpage); - /* fix accounting on old pages */ - lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - mem_cgroup_charge_statistics(memcg, oldpage, false, -1); - ClearPageCgroupUsed(pc); - } - unlock_page_cgroup(pc); - - /* - * When called from shmem_replace_page(), in some cases the - * oldpage has already been charged, and in some cases not. - */ - if (!memcg) - return; - /* - * Even if newpage->mapping was NULL before starting replacement, - * the newpage may be on LRU(or pagevec for LRU) already. We lock - * LRU while we overwrite pc->mem_cgroup. - */ - __mem_cgroup_commit_charge(memcg, newpage, 1, type, true); - } - #ifdef CONFIG_DEBUG_VM static struct page_cgroup *lookup_page_cgroup_used(struct page *page) { @@@ -4550,7 -3860,7 +3860,7 @@@ unsigned long mem_cgroup_soft_limit_rec gfp_mask, &nr_scanned); nr_reclaimed += reclaimed; *total_scanned += nr_scanned; - spin_lock(&mctz->lock); + spin_lock_irq(&mctz->lock);
/* * If we failed to reclaim anything from this memory cgroup @@@ -4590,7 -3900,7 +3900,7 @@@ */ /* If excess == 0, no tree ops */ __mem_cgroup_insert_exceeded(mz, mctz, excess); - spin_unlock(&mctz->lock); + spin_unlock_irq(&mctz->lock); css_put(&mz->memcg->css); loop++; /* @@@ -4809,86 -4119,32 +4119,32 @@@ static int mem_cgroup_hierarchy_write(s else retval = -EBUSY; } else - retval = -EINVAL; - - out: - mutex_unlock(&memcg_create_mutex); - - return retval; - } - - - static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx) - { - struct mem_cgroup *iter; - long val = 0; - - /* Per-cpu values can be negative, use a signed accumulator */ - for_each_mem_cgroup_tree(iter, memcg) - val += mem_cgroup_read_stat(iter, idx); - - if (val < 0) /* race ? */ - val = 0; - return val; - } - - static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) - { - u64 val; - - if (!mem_cgroup_is_root(memcg)) { - if (!swap) - return res_counter_read_u64(&memcg->res, RES_USAGE); - else - return res_counter_read_u64(&memcg->memsw, RES_USAGE); - } - - /* - * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS - * as well as in MEM_CGROUP_STAT_RSS_HUGE. - */ - val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); + retval = -EINVAL;
- if (swap) - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); + out: + mutex_unlock(&memcg_create_mutex);
- return val << PAGE_SHIFT; + return retval; }
static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) + struct cftype *cft) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - u64 val; - int name; - enum res_type type; - - type = MEMFILE_TYPE(cft->private); - name = MEMFILE_ATTR(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); + int name = MEMFILE_ATTR(cft->private);
switch (type) { case _MEM: - if (name == RES_USAGE) - val = mem_cgroup_usage(memcg, false); - else - val = res_counter_read_u64(&memcg->res, name); - break; + return res_counter_read_u64(&memcg->res, name); case _MEMSWAP: - if (name == RES_USAGE) - val = mem_cgroup_usage(memcg, true); - else - val = res_counter_read_u64(&memcg->memsw, name); - break; + return res_counter_read_u64(&memcg->memsw, name); case _KMEM: - val = res_counter_read_u64(&memcg->kmem, name); + return res_counter_read_u64(&memcg->kmem, name); break; default: BUG(); } - - return val; }
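The read side above can return res_counter values directly instead of re-summing recursive statistics because, later in this diff, every group's res_counters are initialized with a parent all the way up to root_mem_cgroup, so usage propagates at charge time. The difference between the two schemes, in miniature (invented names, no locking or per-cpu counters):

/* Two ways to answer "how much is this group using?": re-sum leaf
 * statistics on every read, or charge every ancestor counter at charge
 * time and just read it back.  Toy model only. */
#include <stdio.h>

struct grp {
	struct grp *parent;
	long local;		/* per-group statistic */
	long counter;		/* hierarchical res_counter-style usage */
};

static void charge(struct grp *g, long pages)
{
	struct grp *p;

	g->local += pages;
	for (p = g; p; p = p->parent)	/* charge the whole ancestor chain */
		p->counter += pages;
}

static long recursive_stat(struct grp *g, struct grp *all[], int n)
{
	long sum = 0;
	int i;

	/* old style: walk every group and add the ones at or below g */
	for (i = 0; i < n; i++) {
		struct grp *p;

		for (p = all[i]; p; p = p->parent)
			if (p == g) {
				sum += all[i]->local;
				break;
			}
	}
	return sum;
}

int main(void)
{
	struct grp root = { 0 }, a = { .parent = &root }, b = { .parent = &a };
	struct grp *all[] = { &root, &a, &b };

	charge(&a, 3);
	charge(&b, 5);
	printf("recursive=%ld direct=%ld\n",
	       recursive_stat(&root, all, 3), root.counter);
	return 0;
}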
#ifdef CONFIG_MEMCG_KMEM @@@ -5350,7 -4606,10 +4606,10 @@@ static void __mem_cgroup_threshold(stru if (!t) goto unlock;
- usage = mem_cgroup_usage(memcg, swap); + if (!swap) + usage = res_counter_read_u64(&memcg->res, RES_USAGE); + else + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
/* * current_threshold points to threshold just below or equal to usage. @@@ -5442,15 -4701,15 +4701,15 @@@ static int __mem_cgroup_usage_register_
mutex_lock(&memcg->thresholds_lock);
- if (type == _MEM) + if (type == _MEM) { thresholds = &memcg->thresholds; - else if (type == _MEMSWAP) + usage = res_counter_read_u64(&memcg->res, RES_USAGE); + } else if (type == _MEMSWAP) { thresholds = &memcg->memsw_thresholds; - else + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + } else BUG();
- usage = mem_cgroup_usage(memcg, type == _MEMSWAP); - /* Check if a threshold crossed before adding a new one */ if (thresholds->primary) __mem_cgroup_threshold(memcg, type == _MEMSWAP); @@@ -5530,18 -4789,19 +4789,19 @@@ static void __mem_cgroup_usage_unregist int i, j, size;
mutex_lock(&memcg->thresholds_lock); - if (type == _MEM) + + if (type == _MEM) { thresholds = &memcg->thresholds; - else if (type == _MEMSWAP) + usage = res_counter_read_u64(&memcg->res, RES_USAGE); + } else if (type == _MEMSWAP) { thresholds = &memcg->memsw_thresholds; - else + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + } else BUG();
if (!thresholds->primary) goto unlock;
- usage = mem_cgroup_usage(memcg, type == _MEMSWAP); - /* Check if a threshold crossed before removing */ __mem_cgroup_threshold(memcg, type == _MEMSWAP);
@@@ -6003,6 -5263,7 +5263,6 @@@ static struct cftype mem_cgroup_files[ }, { .name = "use_hierarchy", - .flags = CFTYPE_INSANE, .write_u64 = mem_cgroup_hierarchy_write, .read_u64 = mem_cgroup_hierarchy_read, }, @@@ -6295,9 -5556,9 +5555,9 @@@ mem_cgroup_css_online(struct cgroup_sub * core guarantees its existence. */ } else { - res_counter_init(&memcg->res, NULL); - res_counter_init(&memcg->memsw, NULL); - res_counter_init(&memcg->kmem, NULL); + res_counter_init(&memcg->res, &root_mem_cgroup->res); + res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw); + res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem); /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this @@@ -6406,80 -5667,40 +5666,63 @@@ static void mem_cgroup_css_free(struct __mem_cgroup_free(memcg); }
+/** + * mem_cgroup_css_reset - reset the states of a mem_cgroup + * @css: the target css + * + * Reset the states of the mem_cgroup associated with @css. This is + * invoked when the userland requests disabling on the default hierarchy + * but the memcg is pinned through dependency. The memcg should stop + * applying policies and should revert to the vanilla state as it may be + * made visible again. + * + * The current implementation only resets the essential configurations. + * This needs to be expanded to cover all the visible parts. + */ +static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + mem_cgroup_resize_limit(memcg, ULLONG_MAX); + mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX); + memcg_update_kmem_limit(memcg, ULLONG_MAX); + res_counter_set_soft_limit(&memcg->res, ULLONG_MAX); +} + #ifdef CONFIG_MMU /* Handlers for move charge at task migration. */ - #define PRECHARGE_COUNT_AT_ONCE 256 static int mem_cgroup_do_precharge(unsigned long count) { - int ret = 0; - int batch_count = PRECHARGE_COUNT_AT_ONCE; - struct mem_cgroup *memcg = mc.to; + int ret;
- if (mem_cgroup_is_root(memcg)) { + /* Try a single bulk charge without reclaim first */ + ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); + if (!ret) { mc.precharge += count; - /* we don't need css_get for root */ return ret; } - /* try to charge at once */ - if (count > 1) { - struct res_counter *dummy; - /* - * "memcg" cannot be under rmdir() because we've already checked - * by cgroup_lock_live_cgroup() that it is not removed and we - * are still under the same cgroup_mutex. So we can postpone - * css_get(). - */ - if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy)) - goto one_by_one; - if (do_swap_account && res_counter_charge(&memcg->memsw, - PAGE_SIZE * count, &dummy)) { - res_counter_uncharge(&memcg->res, PAGE_SIZE * count); - goto one_by_one; - } - mc.precharge += count; + if (ret == -EINTR) { + cancel_charge(root_mem_cgroup, count); return ret; } - one_by_one: - /* fall back to one by one charge */ + + /* Try charges one by one with reclaim */ while (count--) { - if (signal_pending(current)) { - ret = -EINTR; - break; - } - if (!batch_count--) { - batch_count = PRECHARGE_COUNT_AT_ONCE; - cond_resched(); - } - ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false); + ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); + /* + * In case of failure, any residual charges against + * mc.to will be dropped by mem_cgroup_clear_mc() + * later on. However, cancel any charges that are + * bypassed to root right away or they'll be lost. + */ + if (ret == -EINTR) + cancel_charge(root_mem_cgroup, 1); if (ret) - /* mem_cgroup_clear_mc() will do uncharge later */ return ret; mc.precharge++; + cond_resched(); } - return ret; + return 0; }
/** @@@ -6615,9 -5836,9 +5858,9 @@@ static enum mc_target_type get_mctgt_ty if (page) { pc = lookup_page_cgroup(page); /* - * Do only loose check w/o page_cgroup lock. - * mem_cgroup_move_account() checks the pc is valid or not under - * the lock. + * Do only loose check w/o serialization. + * mem_cgroup_move_account() checks the pc is valid or + * not under LRU exclusion. */ if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { ret = MC_TARGET_PAGE; @@@ -6742,7 -5963,7 +5985,7 @@@ static void __mem_cgroup_clear_mc(void
/* we must uncharge all the leftover precharges from mc.to */ if (mc.precharge) { - __mem_cgroup_cancel_charge(mc.to, mc.precharge); + cancel_charge(mc.to, mc.precharge); mc.precharge = 0; } /* @@@ -6750,27 -5971,24 +5993,24 @@@ * we must uncharge here. */ if (mc.moved_charge) { - __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); + cancel_charge(mc.from, mc.moved_charge); mc.moved_charge = 0; } /* we must fixup refcnts and charges */ if (mc.moved_swap) { /* uncharge swap account from the old cgroup */ - if (!mem_cgroup_is_root(mc.from)) - res_counter_uncharge(&mc.from->memsw, - PAGE_SIZE * mc.moved_swap); + res_counter_uncharge(&mc.from->memsw, + PAGE_SIZE * mc.moved_swap);
for (i = 0; i < mc.moved_swap; i++) css_put(&mc.from->css);
- if (!mem_cgroup_is_root(mc.to)) { - /* - * we charged both to->res and to->memsw, so we should - * uncharge to->res. - */ - res_counter_uncharge(&mc.to->res, - PAGE_SIZE * mc.moved_swap); - } + /* + * we charged both to->res and to->memsw, so we should + * uncharge to->res. + */ + res_counter_uncharge(&mc.to->res, + PAGE_SIZE * mc.moved_swap); /* we've already done css_get(mc.to) */ mc.moved_swap = 0; } @@@ -7023,17 -6241,16 +6263,17 @@@ static void mem_cgroup_move_task(struc
/* * Cgroup retains root cgroups across [un]mount cycles making it necessary - * to verify sane_behavior flag on each mount attempt. + * to verify whether we're attached to the default hierarchy on each mount + * attempt. */ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) { /* - * use_hierarchy is forced with sane_behavior. cgroup core + * use_hierarchy is forced on the default hierarchy. cgroup core * guarantees that @root doesn't have any children, so turning it * on for the root memcg is enough. */ - if (cgroup_sane_behavior(root_css->cgroup)) + if (cgroup_on_dfl(root_css->cgroup)) mem_cgroup_from_css(root_css)->use_hierarchy = true; }
@@@ -7042,12 -6259,11 +6282,12 @@@ struct cgroup_subsys memory_cgrp_subsy .css_online = mem_cgroup_css_online, .css_offline = mem_cgroup_css_offline, .css_free = mem_cgroup_css_free, + .css_reset = mem_cgroup_css_reset, .can_attach = mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, .attach = mem_cgroup_move_task, .bind = mem_cgroup_bind, - .base_cftypes = mem_cgroup_files, + .legacy_cftypes = mem_cgroup_files, .early_init = 0, };
@@@ -7064,8 -6280,7 +6304,8 @@@ __setup("swapaccount=", enable_swap_acc
static void __init memsw_file_init(void) { - WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files)); + WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, + memsw_cgroup_files)); }
static void __init enable_swap_cgroup(void) @@@ -7082,6 -6297,397 +6322,397 @@@ static void __init enable_swap_cgroup(v } #endif
+ #ifdef CONFIG_MEMCG_SWAP + /** + * mem_cgroup_swapout - transfer a memsw charge to swap + * @page: page whose memsw charge to transfer + * @entry: swap entry to move the charge to + * + * Transfer the memsw charge of @page to @entry. + */ + void mem_cgroup_swapout(struct page *page, swp_entry_t entry) + { + struct page_cgroup *pc; + unsigned short oldid; + + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(page_count(page), page); + + if (!do_swap_account) + return; + + pc = lookup_page_cgroup(page); + + /* Readahead page, never charged */ + if (!PageCgroupUsed(pc)) + return; + + VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page); + + oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup)); + VM_BUG_ON_PAGE(oldid, page); + + pc->flags &= ~PCG_MEMSW; + css_get(&pc->mem_cgroup->css); + mem_cgroup_swap_statistics(pc->mem_cgroup, true); + } + + /** + * mem_cgroup_uncharge_swap - uncharge a swap entry + * @entry: swap entry to uncharge + * + * Drop the memsw charge associated with @entry. + */ + void mem_cgroup_uncharge_swap(swp_entry_t entry) + { + struct mem_cgroup *memcg; + unsigned short id; + + if (!do_swap_account) + return; + + id = swap_cgroup_record(entry, 0); + rcu_read_lock(); + memcg = mem_cgroup_lookup(id); + if (memcg) { + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + mem_cgroup_swap_statistics(memcg, false); + css_put(&memcg->css); + } + rcu_read_unlock(); + } + #endif + + /** + * mem_cgroup_try_charge - try charging a page + * @page: page to charge + * @mm: mm context of the victim + * @gfp_mask: reclaim mode + * @memcgp: charged memcg return + * + * Try to charge @page to the memcg that @mm belongs to, reclaiming + * pages according to @gfp_mask if necessary. + * + * Returns 0 on success, with *@memcgp pointing to the charged memcg. + * Otherwise, an error code is returned. + * + * After page->mapping has been set up, the caller must finalize the + * charge with mem_cgroup_commit_charge(). Or abort the transaction + * with mem_cgroup_cancel_charge() in case page instantiation fails. + */ + int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask, struct mem_cgroup **memcgp) + { + struct mem_cgroup *memcg = NULL; + unsigned int nr_pages = 1; + int ret = 0; + + if (mem_cgroup_disabled()) + goto out; + + if (PageSwapCache(page)) { + struct page_cgroup *pc = lookup_page_cgroup(page); + /* + * Every swap fault against a single page tries to charge the + * page, bail as early as possible. shmem_unuse() encounters + * already charged pages, too. The USED bit is protected by + * the page lock, which serializes swap cache removal, which + * in turn serializes uncharging. + */ + if (PageCgroupUsed(pc)) + goto out; + } + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + } + + if (do_swap_account && PageSwapCache(page)) + memcg = try_get_mem_cgroup_from_page(page); + if (!memcg) + memcg = get_mem_cgroup_from_mm(mm); + + ret = try_charge(memcg, gfp_mask, nr_pages); + + css_put(&memcg->css); + + if (ret == -EINTR) { + memcg = root_mem_cgroup; + ret = 0; + } + out: + *memcgp = memcg; + return ret; + } + + /** + * mem_cgroup_commit_charge - commit a page charge + * @page: page to charge + * @memcg: memcg to charge the page to + * @lrucare: page might be on LRU already + * + * Finalize a charge transaction started by mem_cgroup_try_charge(), + * after page->mapping has been set up. This must happen atomically + * as part of the page instantiation, i.e. 
under the page table lock + * for anonymous pages, under the page lock for page and swap cache. + * + * In addition, the page must not be on the LRU during the commit, to + * prevent racing with task migration. If it might be, use @lrucare. + * + * Use mem_cgroup_cancel_charge() to cancel the transaction instead. + */ + void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, + bool lrucare) + { + unsigned int nr_pages = 1; + + VM_BUG_ON_PAGE(!page->mapping, page); + VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); + + if (mem_cgroup_disabled()) + return; + /* + * Swap faults will attempt to charge the same page multiple + * times. But reuse_swap_page() might have removed the page + * from swapcache already, so we can't check PageSwapCache(). + */ + if (!memcg) + return; + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + } + + commit_charge(page, memcg, nr_pages, lrucare); + + if (do_swap_account && PageSwapCache(page)) { + swp_entry_t entry = { .val = page_private(page) }; + /* + * The swap entry might not get freed for a long time, + * let's not wait for it. The page already received a + * memory+swap charge, drop the swap entry duplicate. + */ + mem_cgroup_uncharge_swap(entry); + } + } + + /** + * mem_cgroup_cancel_charge - cancel a page charge + * @page: page to charge + * @memcg: memcg to charge the page to + * + * Cancel a charge transaction started by mem_cgroup_try_charge(). + */ + void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) + { + unsigned int nr_pages = 1; + + if (mem_cgroup_disabled()) + return; + /* + * Swap faults will attempt to charge the same page multiple + * times. But reuse_swap_page() might have removed the page + * from swapcache already, so we can't check PageSwapCache(). 
+ */ + if (!memcg) + return; + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + } + + cancel_charge(memcg, nr_pages); + } + + static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, + unsigned long nr_mem, unsigned long nr_memsw, + unsigned long nr_anon, unsigned long nr_file, + unsigned long nr_huge, struct page *dummy_page) + { + unsigned long flags; + + if (nr_mem) + res_counter_uncharge(&memcg->res, nr_mem * PAGE_SIZE); + if (nr_memsw) + res_counter_uncharge(&memcg->memsw, nr_memsw * PAGE_SIZE); + + memcg_oom_recover(memcg); + + local_irq_save(flags); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); + __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); + __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file); + memcg_check_events(memcg, dummy_page); + local_irq_restore(flags); + } + + static void uncharge_list(struct list_head *page_list) + { + struct mem_cgroup *memcg = NULL; + unsigned long nr_memsw = 0; + unsigned long nr_anon = 0; + unsigned long nr_file = 0; + unsigned long nr_huge = 0; + unsigned long pgpgout = 0; + unsigned long nr_mem = 0; + struct list_head *next; + struct page *page; + + next = page_list->next; + do { + unsigned int nr_pages = 1; + struct page_cgroup *pc; + + page = list_entry(next, struct page, lru); + next = page->lru.next; + + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(page_count(page), page); + + pc = lookup_page_cgroup(page); + if (!PageCgroupUsed(pc)) + continue; + + /* + * Nobody should be changing or seriously looking at + * pc->mem_cgroup and pc->flags at this point, we have + * fully exclusive access to the page. + */ + + if (memcg != pc->mem_cgroup) { + if (memcg) { + uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, + nr_anon, nr_file, nr_huge, page); + pgpgout = nr_mem = nr_memsw = 0; + nr_anon = nr_file = nr_huge = 0; + } + memcg = pc->mem_cgroup; + } + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + nr_huge += nr_pages; + } + + if (PageAnon(page)) + nr_anon += nr_pages; + else + nr_file += nr_pages; + + if (pc->flags & PCG_MEM) + nr_mem += nr_pages; + if (pc->flags & PCG_MEMSW) + nr_memsw += nr_pages; + pc->flags = 0; + + pgpgout++; + } while (next != page_list); + + if (memcg) + uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, + nr_anon, nr_file, nr_huge, page); + } + + /** + * mem_cgroup_uncharge - uncharge a page + * @page: page to uncharge + * + * Uncharge a page previously charged with mem_cgroup_try_charge() and + * mem_cgroup_commit_charge(). + */ + void mem_cgroup_uncharge(struct page *page) + { + struct page_cgroup *pc; + + if (mem_cgroup_disabled()) + return; + + /* Don't touch page->lru of any random page, pre-check: */ + pc = lookup_page_cgroup(page); + if (!PageCgroupUsed(pc)) + return; + + INIT_LIST_HEAD(&page->lru); + uncharge_list(&page->lru); + } + + /** + * mem_cgroup_uncharge_list - uncharge a list of page + * @page_list: list of pages to uncharge + * + * Uncharge a list of pages previously charged with + * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). 
+ */ + void mem_cgroup_uncharge_list(struct list_head *page_list) + { + if (mem_cgroup_disabled()) + return; + + if (!list_empty(page_list)) + uncharge_list(page_list); + } + + /** + * mem_cgroup_migrate - migrate a charge to another page + * @oldpage: currently charged page + * @newpage: page to transfer the charge to + * @lrucare: both pages might be on the LRU already + * + * Migrate the charge from @oldpage to @newpage. + * + * Both pages must be locked, @newpage->mapping must be set up. + */ + void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, + bool lrucare) + { + unsigned int nr_pages = 1; + struct page_cgroup *pc; + int isolated; + + VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); + VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage); + VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage); + VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); + + if (mem_cgroup_disabled()) + return; + + pc = lookup_page_cgroup(oldpage); + if (!PageCgroupUsed(pc)) + return; + + VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage); + VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage); + + if (PageTransHuge(oldpage)) { + nr_pages <<= compound_order(oldpage); + VM_BUG_ON_PAGE(!PageTransHuge(oldpage), oldpage); + VM_BUG_ON_PAGE(!PageTransHuge(newpage), newpage); + } + + if (lrucare) + lock_page_lru(oldpage, &isolated); + + pc->flags = 0; + + if (lrucare) + unlock_page_lru(oldpage, isolated); + + local_irq_disable(); + mem_cgroup_charge_statistics(pc->mem_cgroup, oldpage, -nr_pages); + memcg_check_events(pc->mem_cgroup, oldpage); + local_irq_enable(); + + commit_charge(newpage, pc->mem_cgroup, nr_pages, lrucare); + } + /* * subsys_initcall() for memory controller. * diff --combined mm/migrate.c index be6dbf9,7f5a424..327b5c6 --- a/mm/migrate.c +++ b/mm/migrate.c @@@ -778,11 -778,14 +778,14 @@@ static int move_to_new_page(struct pag rc = fallback_migrate_page(mapping, newpage, page, mode);
if (rc != MIGRATEPAGE_SUCCESS) { - newpage->mapping = NULL; + if (!PageAnon(newpage)) + newpage->mapping = NULL; } else { + mem_cgroup_migrate(page, newpage, false); if (remap_swapcache) remove_migration_ptes(page, newpage); - page->mapping = NULL; + if (!PageAnon(page)) + page->mapping = NULL; }
unlock_page(newpage); @@@ -795,7 -798,6 +798,6 @@@ static int __unmap_and_move(struct pag { int rc = -EAGAIN; int remap_swapcache = 1; - struct mem_cgroup *mem; struct anon_vma *anon_vma = NULL;
if (!trylock_page(page)) { @@@ -821,9 -823,6 +823,6 @@@ lock_page(page); }
- /* charge against new page */ - mem_cgroup_prepare_migration(page, newpage, &mem); - if (PageWriteback(page)) { /* * Only in the case of a full synchronous migration is it @@@ -833,10 -832,10 +832,10 @@@ */ if (mode != MIGRATE_SYNC) { rc = -EBUSY; - goto uncharge; + goto out_unlock; } if (!force) - goto uncharge; + goto out_unlock; wait_on_page_writeback(page); } /* @@@ -872,7 -871,7 +871,7 @@@ */ remap_swapcache = 0; } else { - goto uncharge; + goto out_unlock; } }
@@@ -885,7 -884,7 +884,7 @@@ * the page migration right away (proteced by page lock). */ rc = balloon_page_migrate(newpage, page, mode); - goto uncharge; + goto out_unlock; }
/* @@@ -904,7 -903,7 +903,7 @@@ VM_BUG_ON_PAGE(PageAnon(page), page); if (page_has_private(page)) { try_to_free_buffers(page); - goto uncharge; + goto out_unlock; } goto skip_unmap; } @@@ -923,10 -922,7 +922,7 @@@ skip_unmap if (anon_vma) put_anon_vma(anon_vma);
- uncharge: - mem_cgroup_end_migration(mem, page, newpage, - (rc == MIGRATEPAGE_SUCCESS || - rc == MIGRATEPAGE_BALLOON_SUCCESS)); + out_unlock: unlock_page(page); out: return rc; @@@ -988,10 -984,9 +984,10 @@@ out * it. Otherwise, putback_lru_page() will drop the reference grabbed * during isolation. */ - if (rc != MIGRATEPAGE_SUCCESS && put_new_page) + if (rc != MIGRATEPAGE_SUCCESS && put_new_page) { + ClearPageSwapBacked(newpage); put_new_page(newpage, private); - else + } else putback_lru_page(newpage);
if (result) { @@@ -1786,7 -1781,6 +1782,6 @@@ int migrate_misplaced_transhuge_page(st pg_data_t *pgdat = NODE_DATA(node); int isolated = 0; struct page *new_page = NULL; - struct mem_cgroup *memcg = NULL; int page_lru = page_is_file_cache(page); unsigned long mmun_start = address & HPAGE_PMD_MASK; unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; @@@ -1852,15 -1846,6 +1847,6 @@@ fail_putback goto out_unlock; }
- /* - * Traditional migration needs to prepare the memcg charge - * transaction early to prevent the old page from being - * uncharged when installing migration entries. Here we can - * save the potential rollback and start the charge transfer - * only when migration is already known to end successfully. - */ - mem_cgroup_prepare_migration(page, new_page, &memcg); - orig_entry = *pmd; entry = mk_pmd(new_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); @@@ -1888,14 -1873,10 +1874,10 @@@ goto fail_putback; }
+ mem_cgroup_migrate(page, new_page, false); + page_remove_rmap(page);
- /* - * Finish the charge transaction under the page table lock to - * prevent split_huge_page() from dividing up the charge - * before it's fully transferred to the new page. - */ - mem_cgroup_end_migration(memcg, page, new_page, true); spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
diff --combined mm/shmem.c index 0f01800,b16d3e7..5909f29 --- a/mm/shmem.c +++ b/mm/shmem.c @@@ -149,6 -149,19 +149,19 @@@ static inline void shmem_unacct_size(un vm_unacct_memory(VM_ACCT(size)); }
+ static inline int shmem_reacct_size(unsigned long flags, + loff_t oldsize, loff_t newsize) + { + if (!(flags & VM_NORESERVE)) { + if (VM_ACCT(newsize) > VM_ACCT(oldsize)) + return security_vm_enough_memory_mm(current->mm, + VM_ACCT(newsize) - VM_ACCT(oldsize)); + else if (VM_ACCT(newsize) < VM_ACCT(oldsize)) + vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize)); + } + return 0; + } + /* * ... whereas tmpfs objects are accounted incrementally as * pages are allocated, in order to allow huge sparse files. @@@ -280,7 -293,7 +293,7 @@@ static bool shmem_confirm_swap(struct a */ static int shmem_add_to_page_cache(struct page *page, struct address_space *mapping, - pgoff_t index, gfp_t gfp, void *expected) + pgoff_t index, void *expected) { int error;
@@@ -406,7 -419,6 +419,6 @@@ static void shmem_undo_range(struct ino pvec.pages, indices); if (!pvec.nr) break; - mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i];
@@@ -434,7 -446,6 +446,6 @@@ } pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); - mem_cgroup_uncharge_end(); cond_resched(); index++; } @@@ -482,7 -493,6 +493,6 @@@ index = start; continue; } - mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i];
@@@ -518,7 -528,6 +528,6 @@@ } pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); - mem_cgroup_uncharge_end(); index++; }
@@@ -549,6 -558,10 +558,10 @@@ static int shmem_setattr(struct dentry loff_t newsize = attr->ia_size;
if (newsize != oldsize) { + error = shmem_reacct_size(SHMEM_I(inode)->flags, + oldsize, newsize); + if (error) + return error; i_size_write(inode, newsize); inode->i_ctime = inode->i_mtime = CURRENT_TIME; } @@@ -604,7 -617,7 +617,7 @@@ static int shmem_unuse_inode(struct shm radswap = swp_to_radix_entry(swap); index = radix_tree_locate_item(&mapping->page_tree, radswap); if (index == -1) - return 0; + return -EAGAIN; /* tell shmem_unuse we found nothing */
/* * Move _head_ to start search for next from here. @@@ -649,7 -662,7 +662,7 @@@ */ if (!error) error = shmem_add_to_page_cache(*pagep, mapping, index, - GFP_NOWAIT, radswap); + radswap); if (error != -ENOMEM) { /* * Truncation and eviction use free_swap_and_cache(), which @@@ -663,7 -676,6 +676,6 @@@ spin_unlock(&info->lock); swap_free(swap); } - error = 1; /* not an error, but entry was found */ } return error; } @@@ -675,7 -687,7 +687,7 @@@ int shmem_unuse(swp_entry_t swap, struc { struct list_head *this, *next; struct shmem_inode_info *info; - int found = 0; + struct mem_cgroup *memcg; int error = 0;
/* @@@ -690,26 -702,32 +702,32 @@@ * the shmem_swaplist_mutex which might hold up shmem_writepage(). * Charged back to the user (not to caller) when swap account is used. */ - error = mem_cgroup_charge_file(page, current->mm, GFP_KERNEL); + error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg); if (error) goto out; /* No radix_tree_preload: swap entry keeps a place for page in tree */ + error = -EAGAIN;
mutex_lock(&shmem_swaplist_mutex); list_for_each_safe(this, next, &shmem_swaplist) { info = list_entry(this, struct shmem_inode_info, swaplist); if (info->swapped) - found = shmem_unuse_inode(info, swap, &page); + error = shmem_unuse_inode(info, swap, &page); else list_del_init(&info->swaplist); cond_resched(); - if (found) + if (error != -EAGAIN) break; + /* found nothing in this: move on to search the next */ } mutex_unlock(&shmem_swaplist_mutex);
- if (found < 0) - error = found; + if (error) { + if (error != -ENOMEM) + error = 0; + mem_cgroup_cancel_charge(page, memcg); + } else + mem_cgroup_commit_charge(page, memcg, true); out: unlock_page(page); page_cache_release(page); @@@ -813,7 -831,7 +831,7 @@@ static int shmem_writepage(struct page }
mutex_unlock(&shmem_swaplist_mutex); - swapcache_free(swap, NULL); + swapcache_free(swap); redirty: set_page_dirty(page); if (wbc->for_reclaim) @@@ -986,7 -1004,7 +1004,7 @@@ static int shmem_replace_page(struct pa */ oldpage = newpage; } else { - mem_cgroup_replace_page_cache(oldpage, newpage); + mem_cgroup_migrate(oldpage, newpage, false); lru_cache_add_anon(newpage); *pagep = newpage; } @@@ -1013,6 -1031,7 +1031,7 @@@ static int shmem_getpage_gfp(struct ino struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info; struct shmem_sb_info *sbinfo; + struct mem_cgroup *memcg; struct page *page; swp_entry_t swap; int error; @@@ -1091,11 -1110,10 +1110,10 @@@ repeat goto failed; }
- error = mem_cgroup_charge_file(page, current->mm, - gfp & GFP_RECLAIM_MASK); + error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, - gfp, swp_to_radix_entry(swap)); + swp_to_radix_entry(swap)); /* * We already confirmed swap under page lock, and make * no memory allocation here, so usually no possibility @@@ -1108,12 -1126,16 +1126,16 @@@ * Reset swap.val? No, leave it so "failed" goes back to * "repeat": reading a hole and writing should succeed. */ - if (error) + if (error) { + mem_cgroup_cancel_charge(page, memcg); delete_from_swap_cache(page); + } } if (error) goto failed;
+ mem_cgroup_commit_charge(page, memcg, true); + spin_lock(&info->lock); info->swapped--; shmem_recalc_inode(inode); @@@ -1149,22 -1171,22 +1171,22 @@@ __SetPageSwapBacked(page); __set_page_locked(page); if (sgp == SGP_WRITE) - init_page_accessed(page); + __SetPageReferenced(page);
- error = mem_cgroup_charge_file(page, current->mm, - gfp & GFP_RECLAIM_MASK); + error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg); if (error) goto decused; error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, - gfp, NULL); + NULL); radix_tree_preload_end(); } if (error) { - mem_cgroup_uncharge_cache_page(page); + mem_cgroup_cancel_charge(page, memcg); goto decused; } + mem_cgroup_commit_charge(page, memcg, false); lru_cache_add_anon(page);
spin_lock(&info->lock); @@@ -1289,7 -1311,7 +1311,7 @@@ static int shmem_fault(struct vm_area_s
shmem_falloc_waitq = shmem_falloc->waitq; prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, - TASK_KILLABLE); + TASK_UNINTERRUPTIBLE); spin_unlock(&inode->i_lock); schedule();
@@@ -2048,45 -2070,17 +2070,45 @@@ static int shmem_rmdir(struct inode *di return shmem_unlink(dir, dentry); }
+static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) +{ + bool old_is_dir = S_ISDIR(old_dentry->d_inode->i_mode); + bool new_is_dir = S_ISDIR(new_dentry->d_inode->i_mode); + + if (old_dir != new_dir && old_is_dir != new_is_dir) { + if (old_is_dir) { + drop_nlink(old_dir); + inc_nlink(new_dir); + } else { + drop_nlink(new_dir); + inc_nlink(old_dir); + } + } + old_dir->i_ctime = old_dir->i_mtime = + new_dir->i_ctime = new_dir->i_mtime = + old_dentry->d_inode->i_ctime = + new_dentry->d_inode->i_ctime = CURRENT_TIME; + + return 0; +} + /* * The VFS layer already does all the dentry stuff for rename, * we just have to decrement the usage count for the target if * it exists so that the VFS layer correctly free's it when it * gets overwritten. */ -static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) +static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode *inode = old_dentry->d_inode; int they_are_dirs = S_ISDIR(inode->i_mode);
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + if (flags & RENAME_EXCHANGE) + return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry); + if (!simple_empty(new_dentry)) return -ENOTEMPTY;
@@@ -2769,7 -2763,7 +2791,7 @@@ static const struct inode_operations sh .mkdir = shmem_mkdir, .rmdir = shmem_rmdir, .mknod = shmem_mknod, - .rename = shmem_rename, + .rename2 = shmem_rename2, .tmpfile = shmem_tmpfile, #endif #ifdef CONFIG_TMPFS_XATTR @@@ -2960,16 -2954,16 +2982,16 @@@ static struct file *__shmem_file_setup( this.len = strlen(name); this.hash = 0; /* will go */ sb = shm_mnt->mnt_sb; + path.mnt = mntget(shm_mnt); path.dentry = d_alloc_pseudo(sb, &this); if (!path.dentry) goto put_memory; d_set_d_op(path.dentry, &anon_ops); - path.mnt = mntget(shm_mnt);
res = ERR_PTR(-ENOSPC); inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); if (!inode) - goto put_dentry; + goto put_memory;
inode->i_flags |= i_flags; d_instantiate(path.dentry, inode); @@@ -2977,19 -2971,19 +2999,19 @@@ clear_nlink(inode); /* It is unlinked */ res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); if (IS_ERR(res)) - goto put_dentry; + goto put_path;
res = alloc_file(&path, FMODE_WRITE | FMODE_READ, &shmem_file_operations); if (IS_ERR(res)) - goto put_dentry; + goto put_path;
return res;
- put_dentry: - path_put(&path); put_memory: shmem_unacct_size(flags, size); + put_path: + path_put(&path); return res; }
diff --combined mm/slab_common.c index d31c4ba,8b711f5..d80ec43 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@@ -19,6 -19,8 +19,8 @@@ #include <asm/tlbflush.h> #include <asm/page.h> #include <linux/memcontrol.h> + + #define CREATE_TRACE_POINTS #include <trace/events/kmem.h>
#include "slab.h" @@@ -55,7 -57,7 +57,7 @@@ static int kmem_cache_sanity_check(cons continue; }
-#if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON) +#if !defined(CONFIG_SLUB) if (!strcmp(s->name, name)) { pr_err("%s (%s): Cache name already exists.\n", __func__, name); @@@ -264,7 -266,7 +266,7 @@@ EXPORT_SYMBOL(kmem_cache_create) * memcg_create_kmem_cache - Create a cache for a memory cgroup. * @memcg: The memory cgroup the new cache is for. * @root_cache: The parent of the new cache. - * @memcg_name: The name of the memory cgroup (used for naming the new cache). + * @cache_name: The string to be used as the new cache name. * * This function attempts to create a kmem cache that will serve allocation * requests going from @memcg to @root_cache. The new cache inherits properties @@@ -272,31 -274,25 +274,25 @@@ */ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, struct kmem_cache *root_cache, - const char *memcg_name) + char *cache_name) { struct kmem_cache *s = NULL; - char *cache_name;
get_online_cpus(); get_online_mems();
mutex_lock(&slab_mutex);
- cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, - memcg_cache_id(memcg), memcg_name); - if (!cache_name) - goto out_unlock; - s = do_kmem_cache_create(cache_name, root_cache->object_size, root_cache->size, root_cache->align, root_cache->flags, root_cache->ctor, memcg, root_cache); - if (IS_ERR(s)) { - kfree(cache_name); + if (!IS_ERR(s)) + list_add(&s->memcg_params->siblings, + &root_cache->memcg_params->children); + else s = NULL; - }
- out_unlock: mutex_unlock(&slab_mutex);
put_online_mems(); @@@ -307,17 -303,15 +303,15 @@@
static int memcg_cleanup_cache_params(struct kmem_cache *s) { - int rc; - if (!s->memcg_params || !s->memcg_params->is_root_cache) return 0;
mutex_unlock(&slab_mutex); - rc = __memcg_cleanup_cache_params(s); + __memcg_cleanup_cache_params(s); mutex_lock(&slab_mutex);
- return rc; + return !list_empty(&s->memcg_params->children); } #else static int memcg_cleanup_cache_params(struct kmem_cache *s) @@@ -354,6 -348,10 +348,10 @@@ void kmem_cache_destroy(struct kmem_cac }
list_del(&s->list); + #ifdef CONFIG_MEMCG_KMEM + if (!is_root_cache(s)) + list_del(&s->memcg_params->siblings); + #endif
mutex_unlock(&slab_mutex); if (s->flags & SLAB_DESTROY_BY_RCU) @@@ -692,20 -690,17 +690,17 @@@ void slab_stop(struct seq_file *m, voi static void memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) { - struct kmem_cache *c; + #ifdef CONFIG_MEMCG_KMEM + struct memcg_cache_params *params; struct slabinfo sinfo; - int i;
- if (!is_root_cache(s)) + if (!s->memcg_params || + !s->memcg_params->is_root_cache) return;
- for_each_memcg_cache_index(i) { - c = cache_from_memcg_idx(s, i); - if (!c) - continue; - + list_for_each_entry(params, &s->memcg_params->children, siblings) { memset(&sinfo, 0, sizeof(sinfo)); - get_slabinfo(c, &sinfo); + get_slabinfo(params->cachep, &sinfo);
info->active_slabs += sinfo.active_slabs; info->num_slabs += sinfo.num_slabs; @@@ -713,6 -708,7 +708,7 @@@ info->active_objs += sinfo.active_objs; info->num_objs += sinfo.num_objs; } + #endif }
int cache_show(struct kmem_cache *s, struct seq_file *m) @@@ -787,3 -783,102 +783,102 @@@ static int __init slab_proc_init(void } module_init(slab_proc_init); #endif /* CONFIG_SLABINFO */ + + static __always_inline void *__do_krealloc(const void *p, size_t new_size, + gfp_t flags) + { + void *ret; + size_t ks = 0; + + if (p) + ks = ksize(p); + + if (ks >= new_size) + return (void *)p; + + ret = kmalloc_track_caller(new_size, flags); + if (ret && p) + memcpy(ret, p, ks); + + return ret; + } + + /** + * __krealloc - like krealloc() but don't free @p. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * This function is like krealloc() except it never frees the originally + * allocated buffer. Use this if you don't want to free the buffer immediately + * like, for example, with RCU. + */ + void *__krealloc(const void *p, size_t new_size, gfp_t flags) + { + if (unlikely(!new_size)) + return ZERO_SIZE_PTR; + + return __do_krealloc(p, new_size, flags); + + } + EXPORT_SYMBOL(__krealloc); + + /** + * krealloc - reallocate memory. The contents will remain unchanged. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * The contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. If @p is %NULL, krealloc() + * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a + * %NULL pointer, the object pointed to is freed. + */ + void *krealloc(const void *p, size_t new_size, gfp_t flags) + { + void *ret; + + if (unlikely(!new_size)) { + kfree(p); + return ZERO_SIZE_PTR; + } + + ret = __do_krealloc(p, new_size, flags); + if (ret && p != ret) + kfree(p); + + return ret; + } + EXPORT_SYMBOL(krealloc); + + /** + * kzfree - like kfree but zero memory + * @p: object to free memory of + * + * The memory of the object @p points to is zeroed before freed. + * If @p is %NULL, kzfree() does nothing. + * + * Note: this function zeroes the whole allocated buffer which can be a good + * deal bigger than the requested buffer size passed to kmalloc(). So be + * careful when using this function in performance sensitive code. + */ + void kzfree(const void *p) + { + size_t ks; + void *mem = (void *)p; + + if (unlikely(ZERO_OR_NULL_PTR(mem))) + return; + ks = ksize(mem); + memset(mem, 0, ks); + kfree(mem); + } + EXPORT_SYMBOL(kzfree); + + /* Tracepoints definitions. */ + EXPORT_TRACEPOINT_SYMBOL(kmalloc); + EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); + EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); + EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); + EXPORT_TRACEPOINT_SYMBOL(kfree); + EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); diff --combined net/bridge/br_multicast.c index b4845f4,d9c4f57..7751c92 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@@ -1174,7 -1174,7 +1174,7 @@@ static void br_multicast_add_router(str }
if (slot) - hlist_add_after_rcu(slot, &port->rlist); + hlist_add_behind_rcu(&port->rlist, slot); else hlist_add_head_rcu(&port->rlist, &br->router_list); } @@@ -2216,43 -2216,6 +2216,43 @@@ unlock EXPORT_SYMBOL_GPL(br_multicast_list_adjacent);
/** + * br_multicast_has_querier_anywhere - Checks for a querier on a bridge + * @dev: The bridge port providing the bridge on which to check for a querier + * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6 + * + * Checks whether the given interface has a bridge on top and if so returns + * true if a valid querier exists anywhere on the bridged link layer. + * Otherwise returns false. + */ +bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto) +{ + struct net_bridge *br; + struct net_bridge_port *port; + struct ethhdr eth; + bool ret = false; + + rcu_read_lock(); + if (!br_port_exists(dev)) + goto unlock; + + port = br_port_get_rcu(dev); + if (!port || !port->br) + goto unlock; + + br = port->br; + + memset(&eth, 0, sizeof(eth)); + eth.h_proto = htons(proto); + + ret = br_multicast_querier_exists(br, &eth); + +unlock: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(br_multicast_has_querier_anywhere); + +/** * br_multicast_has_querier_adjacent - Checks for a querier behind a bridge port * @dev: The bridge port adjacent to which to check for a querier * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6 diff --combined net/xfrm/xfrm_policy.c index 0525d78,92cb08d..beeed60 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@@ -389,7 -389,7 +389,7 @@@ redo if (h != h0) continue; hlist_del(&pol->bydst); - hlist_add_after(entry0, &pol->bydst); + hlist_add_behind(&pol->bydst, entry0); } entry0 = &pol->bydst; } @@@ -654,7 -654,7 +654,7 @@@ int xfrm_policy_insert(int dir, struct break; } if (newpos) - hlist_add_after(newpos, &policy->bydst); + hlist_add_behind(&policy->bydst, newpos); else hlist_add_head(&policy->bydst, chain); xfrm_pol_hold(policy); @@@ -2097,8 -2097,6 +2097,8 @@@ struct dst_entry *xfrm_lookup(struct ne goto no_transform; }
+ dst_hold(&xdst->u.dst); + xdst->u.dst.flags |= DST_NOCACHE; route = xdst->route; } }