The following commit has been merged in the master branch:

commit a7fd20d1c476af4563e66865213474a2f9f473a4
Merge: b80fed9595513384424cd141923c9161c4b5021b 917fa5353da05e8a0045b8acacba8d50400d5b12
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Tue May 17 16:26:30 2016 -0700
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller: "Highlights:
1) Support SPI based w5100 devices, from Akinobu Mita.
2) Partial Segmentation Offload, from Alexander Duyck.
3) Add GMAC4 support to stmmac driver, from Alexandre TORGUE.
4) Allow cls_flower stats offload, from Amir Vadai.
5) Implement bpf blinding, from Daniel Borkmann.
6) Optimize _ASYNC_ bit twiddling on sockets; unless the socket is actually using FASYNC, these atomics are superfluous. From Eric Dumazet.
7) Run TCP more preemptibly, also from Eric Dumazet.
8) Support LED blinking, EEPROM dumps, and rxvlan offloading in mlx5e driver, from Gal Pressman.
9) Allow creating ppp devices via rtnetlink, from Guillaume Nault.
10) Improve BPF usage documentation, from Jesper Dangaard Brouer.
11) Support tunneling offloads in qed, from Manish Chopra.
12) aRFS offloading in mlx5e, from Maor Gottlieb.
13) Add RFS and RPS support to SCTP protocol, from Marcelo Ricardo Leitner.
14) Add MSG_EOR support to TCP; this allows controlling packet coalescing on application record boundaries for more accurate socket timestamp sampling (a usage sketch follows the commit log below). From Martin KaFai Lau.
15) Fix alignment of 64-bit netlink attributes across the board, from Nicolas Dichtel.
16) Per-vlan stats in bridging, from Nikolay Aleksandrov.
17) Several conversions of drivers to ethtool ksettings, from Philippe Reynes.
18) Checksum neutral ILA in ipv6, from Tom Herbert.
19) Factorize all of the various marvell dsa drivers into one, from Vivien Didelot.
20) Add VF support to qed driver, from Yuval Mintz"
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1649 commits)
  Revert "phy dp83867: Fix compilation with CONFIG_OF_MDIO=m"
  Revert "phy dp83867: Make rgmii parameters optional"
  r8169: default to 64-bit DMA on recent PCIe chips
  phy dp83867: Make rgmii parameters optional
  phy dp83867: Fix compilation with CONFIG_OF_MDIO=m
  bpf: arm64: remove callee-save registers use for tmp registers
  asix: Fix offset calculation in asix_rx_fixup() causing slow transmissions
  switchdev: pass pointer to fib_info instead of copy
  net_sched: close another race condition in tcf_mirred_release()
  tipc: fix nametable publication field in nl compat
  drivers: net: Don't print unpopulated net_device name
  qed: add support for dcbx.
  ravb: Add missing free_irq() calls to ravb_close()
  qed: Remove a stray tab
  net: ethernet: fec-mpc52xx: use phy_ethtool_{get|set}_link_ksettings
  net: ethernet: fec-mpc52xx: use phydev from struct net_device
  bpf, doc: fix typo on bpf_asm descriptions
  stmmac: hardware TX COE doesn't work when force_thresh_dma_mode is set
  net: ethernet: fs-enet: use phy_ethtool_{get|set}_link_ksettings
  net: ethernet: fs-enet: use phydev from struct net_device
  ...
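[Editor's sketch for highlight 14: the minimal userspace C fragment below illustrates how a sender might mark record boundaries with MSG_EOR on a TCP socket once this series is in place. It is an assumed example, not code from the merge; the connected socket descriptor fd and the record buffer are hypothetical.]

    /*
     * Minimal userspace sketch (assumes a kernel containing Martin KaFai
     * Lau's MSG_EOR patches): setting MSG_EOR on send() marks the end of
     * an application record, so TCP will not coalesce later writes into
     * the skb carrying this record's tail, keeping socket timestamp
     * samples aligned to record boundaries.
     */
    #include <sys/socket.h>
    #include <sys/types.h>

    static ssize_t send_record(int fd, const void *rec, size_t len)
    {
            /* fd is an already-connected TCP socket supplied by the caller */
            return send(fd, rec, len, MSG_EOR);
    }

[On kernels without these patches, TCP simply ignores the flag, so the call degrades to a plain send().]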
diff --combined MAINTAINERS index 20e6346,452aa2b..6210ae2 --- a/MAINTAINERS +++ b/MAINTAINERS @@@ -627,7 -627,6 +627,7 @@@ F: include/linux/altera_jtaguart.
AMD CRYPTOGRAPHIC COPROCESSOR (CCP) DRIVER M: Tom Lendacky thomas.lendacky@amd.com +M: Gary Hook gary.hook@amd.com L: linux-crypto@vger.kernel.org S: Supported F: drivers/crypto/ccp/ @@@ -1323,7 -1322,6 +1323,7 @@@ F: drivers/rtc/rtc-armada38x. F: arch/arm/boot/dts/armada* F: arch/arm/boot/dts/kirkwood* F: arch/arm64/boot/dts/marvell/armada* +F: drivers/cpufreq/mvebu-cpufreq.c
ARM/Marvell Berlin SoC support @@@ -1472,7 -1470,10 +1472,10 @@@ F: arch/arm/boot/dts/qcom-*.dt F: arch/arm/boot/dts/qcom-*.dtsi F: arch/arm/mach-qcom/ F: arch/arm64/boot/dts/qcom/* + F: drivers/i2c/busses/i2c-qup.c + F: drivers/clk/qcom/ F: drivers/soc/qcom/ + F: drivers/spi/spi-qup.c F: drivers/tty/serial/msm_serial.h F: drivers/tty/serial/msm_serial.c F: drivers/*/pm8???-* @@@ -2205,10 -2206,13 +2208,13 @@@ BATMAN ADVANCE M: Marek Lindner mareklindner@neomailbox.ch M: Simon Wunderlich sw@simonwunderlich.de M: Antonio Quartulli a@unstable.cc - L: b.a.t.m.a.n@lists.open-mesh.org + L: b.a.t.m.a.n@lists.open-mesh.org (moderated for non-subscribers) W: https://www.open-mesh.org/ Q: https://patchwork.open-mesh.org/project/batman/list/ S: Maintained + F: Documentation/ABI/testing/sysfs-class-net-batman-adv + F: Documentation/ABI/testing/sysfs-class-net-mesh + F: Documentation/networking/batman-adv.txt F: net/batman-adv/
BAYCOM/HDLCDRV DRIVERS FOR AX.25 @@@ -3350,6 -3354,7 +3356,7 @@@ F: Documentation/powerpc/cxlflash.tx
STMMAC ETHERNET DRIVER M: Giuseppe Cavallaro peppe.cavallaro@st.com + M: Alexandre Torgue alexandre.torgue@st.com L: netdev@vger.kernel.org W: http://www.stlinux.com S: Supported @@@ -3541,15 -3546,6 +3548,15 @@@ F: drivers/devfreq/devfreq-event. F: include/linux/devfreq-event.h F: Documentation/devicetree/bindings/devfreq/event/
+BUS FREQUENCY DRIVER FOR SAMSUNG EXYNOS +M: Chanwoo Choi cw00.choi@samsung.com +L: linux-pm@vger.kernel.org +L: linux-samsung-soc@vger.kernel.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/mzx/devfreq.git +S: Maintained +F: drivers/devfreq/exynos-bus.c +F: Documentation/devicetree/bindings/devfreq/exynos-bus.txt + DEVICE NUMBER REGISTRY M: Torben Mathiasen device@lanana.org W: http://lanana.org/docs/device-list/index.html @@@ -5755,13 -5751,6 +5762,6 @@@ F: drivers/char/hw_random/ixp4xx-rng.
INTEL ETHERNET DRIVERS M: Jeff Kirsher jeffrey.t.kirsher@intel.com - R: Jesse Brandeburg jesse.brandeburg@intel.com - R: Shannon Nelson shannon.nelson@intel.com - R: Carolyn Wyborny carolyn.wyborny@intel.com - R: Don Skidmore donald.c.skidmore@intel.com - R: Bruce Allan bruce.w.allan@intel.com - R: John Ronciak john.ronciak@intel.com - R: Mitch Williams mitch.a.williams@intel.com L: intel-wired-lan@lists.osuosl.org (moderated for non-subscribers) W: http://www.intel.com/support/feedback.htm W: http://e1000.sourceforge.net/ @@@ -7031,9 -7020,9 +7031,9 @@@ M: Chanwoo Choi <cw00.choi@samsung.com M: Krzysztof Kozlowski k.kozlowski@samsung.com L: linux-kernel@vger.kernel.org S: Supported -F: drivers/*/max14577.c +F: drivers/*/max14577*.c F: drivers/*/max77686*.c -F: drivers/*/max77693.c +F: drivers/*/max77693*.c F: drivers/extcon/extcon-max14577.c F: drivers/extcon/extcon-max77693.c F: drivers/rtc/rtc-max77686.c @@@ -9500,7 -9489,7 +9500,7 @@@ F: drivers/net/wireless/realtek/rtlwifi RTL8XXXU WIRELESS DRIVER (rtl8xxxu) M: Jes Sorensen Jes.Sorensen@redhat.com L: linux-wireless@vger.kernel.org - T: git git://git.kernel.org/pub/scm/linux/kernel/git/jes/linux.git rtl8723au-mac80211 + T: git git://git.kernel.org/pub/scm/linux/kernel/git/jes/linux.git rtl8xxxu-devel S: Maintained F: drivers/net/wireless/realtek/rtl8xxxu/
@@@ -11257,13 -11246,14 +11257,13 @@@ S: Maintaine F: drivers/media/i2c/tc358743* F: include/media/i2c/tc358743.h
-TMIO MMC DRIVER -M: Ian Molton ian@mnementh.co.uk +TMIO/SDHI MMC DRIVER +M: Wolfram Sang wsa+renesas@sang-engineering.com L: linux-mmc@vger.kernel.org -S: Maintained +S: Supported F: drivers/mmc/host/tmio_mmc* F: drivers/mmc/host/sh_mobile_sdhi.c -F: include/linux/mmc/tmio.h -F: include/linux/mmc/sh_mobile_sdhi.h +F: include/linux/mfd/tmio.h
TMP401 HARDWARE MONITOR DRIVER M: Guenter Roeck linux@roeck-us.net @@@ -12021,9 -12011,7 +12021,9 @@@ L: linux-kernel@vger.kernel.or W: http://www.slimlogic.co.uk/?p=48 T: git git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regulator.git S: Supported +F: Documentation/devicetree/bindings/regulator/ F: drivers/regulator/ +F: include/dt-bindings/regulator/ F: include/linux/regulator/
VRF diff --combined arch/arm/Kconfig index 6684af9,2315b0d..970f1cf --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@@ -41,7 -41,7 +41,7 @@@ config AR select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT) select HAVE_ARCH_TRACEHOOK select HAVE_ARM_SMCCC if CPU_V7 - select HAVE_BPF_JIT + select HAVE_CBPF_JIT select HAVE_CC_STACKPROTECTOR select HAVE_CONTEXT_TRACKING select HAVE_C_RECORDMCOUNT @@@ -531,8 -531,6 +531,8 @@@ config ARCH_LPC32X select COMMON_CLK select CPU_ARM926T select GENERIC_CLOCKEVENTS + select MULTI_IRQ_HANDLER + select SPARSE_IRQ select USE_OF help Support for the NXP LPC32XX family of processors diff --combined arch/arm64/Kconfig index 8845c0d,e6761ea..76747d9 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@@ -11,7 -11,6 +11,7 @@@ config ARM6 select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_USE_CMPXCHG_LOCKREF select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_WANT_OPTIONAL_GPIOLIB select ARCH_WANT_COMPAT_IPC_PARSE_VERSION select ARCH_WANT_FRAME_POINTERS @@@ -59,14 -58,11 +59,14 @@@ select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK + select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_ARM_SMCCC - select HAVE_BPF_JIT + select HAVE_EBPF_JIT select HAVE_C_RECORDMCOUNT select HAVE_CC_STACKPROTECTOR select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL + select HAVE_CONTEXT_TRACKING select HAVE_DEBUG_BUGVERBOSE select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_API_DEBUG @@@ -80,7 -76,6 +80,7 @@@ select HAVE_HW_BREAKPOINT if PERF_EVENTS select HAVE_IRQ_TIME_ACCOUNTING select HAVE_MEMBLOCK + select HAVE_MEMBLOCK_NODE_MAP if NUMA select HAVE_PATA_PLATFORM select HAVE_PERF_EVENTS select HAVE_PERF_REGS @@@ -94,13 -89,15 +94,13 @@@ select NO_BOOTMEM select OF select OF_EARLY_FLATTREE + select OF_NUMA if NUMA && OF select OF_RESERVED_MEM select PERF_USE_VMALLOC select POWER_RESET select POWER_SUPPLY - select RTC_LIB select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE - select HAVE_CONTEXT_TRACKING - select HAVE_ARM_SMCCC help ARM 64-bit (AArch64) Linux support.
@@@ -549,35 -546,10 +549,35 @@@ config HOTPLUG_CP Say Y here to experiment with turning CPUs off and on. CPUs can be controlled through /sys/devices/system/cpu.
+# Common NUMA Features +config NUMA + bool "Numa Memory Allocation and Scheduler Support" + depends on SMP + help + Enable NUMA (Non Uniform Memory Access) support. + + The kernel will try to allocate memory used by a CPU on the + local memory of the CPU and add some more + NUMA awareness to the kernel. + +config NODES_SHIFT + int "Maximum NUMA Nodes (as a power of 2)" + range 1 10 + default "2" + depends on NEED_MULTIPLE_NODES + help + Specify the maximum number of NUMA Nodes available on the target + system. Increases memory reserved to accommodate various tables. + +config USE_PERCPU_NUMA_NODE_ID + def_bool y + depends on NUMA + source kernel/Kconfig.preempt source kernel/Kconfig.hz
config ARCH_SUPPORTS_DEBUG_PAGEALLOC + depends on !HIBERNATION def_bool y
config ARCH_HAS_HOLES_MEMORYMODEL @@@ -606,6 -578,9 +606,6 @@@ config SYS_SUPPORTS_HUGETLBF config ARCH_WANT_HUGE_PMD_SHARE def_bool y if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
-config HAVE_ARCH_TRANSPARENT_HUGEPAGE - def_bool y - config ARCH_HAS_CACHE_LINE_SIZE def_bool y
@@@ -978,14 -953,6 +978,14 @@@ menu "Power management options
source "kernel/power/Kconfig"
+config ARCH_HIBERNATION_POSSIBLE + def_bool y + depends on CPU_PM + +config ARCH_HIBERNATION_HEADER + def_bool y + depends on HIBERNATION + config ARCH_SUSPEND_POSSIBLE def_bool y
diff --combined arch/x86/Kconfig index 7bb1574,ae83046..ace79d2 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@@ -91,7 -91,7 +91,7 @@@ config X8 select HAVE_ARCH_SOFT_DIRTY if X86_64 select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE - select HAVE_BPF_JIT if X86_64 + select HAVE_EBPF_JIT if X86_64 select HAVE_CC_STACKPROTECTOR select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL @@@ -164,6 -164,10 +164,6 @@@ config INSTRUCTION_DECODE def_bool y depends on KPROBES || PERF_EVENTS || UPROBES
-config PERF_EVENTS_INTEL_UNCORE - def_bool y - depends on PERF_EVENTS && CPU_SUP_INTEL && PCI - config OUTPUT_FORMAT string default "elf32-i386" if X86_32 @@@ -1042,8 -1046,6 +1042,8 @@@ config X86_THERMAL_VECTO def_bool y depends on X86_MCE_INTEL
+source "arch/x86/events/Kconfig" + config X86_LEGACY_VM86 bool "Legacy VM86 support" default n @@@ -1208,6 -1210,15 +1208,6 @@@ config MICROCODE_OLD_INTERFAC def_bool y depends on MICROCODE
-config PERF_EVENTS_AMD_POWER - depends on PERF_EVENTS && CPU_SUP_AMD - tristate "AMD Processor Power Reporting Mechanism" - ---help--- - Provide power reporting mechanism support for AMD processors. - Currently, it leverages X86_FEATURE_ACC_POWER - (CPUID Fn8000_0007_EDX[12]) interface to calculate the - average power consumption on Family 15h processors. - config X86_MSR tristate "/dev/cpu/*/msr - Model-specific register support" ---help--- @@@ -1921,38 -1932,54 +1921,38 @@@ config RELOCATABL (CONFIG_PHYSICAL_START) is used as the minimum location.
config RANDOMIZE_BASE - bool "Randomize the address of the kernel image" + bool "Randomize the address of the kernel image (KASLR)" depends on RELOCATABLE default n ---help--- - Randomizes the physical and virtual address at which the - kernel image is decompressed, as a security feature that - deters exploit attempts relying on knowledge of the location - of kernel internals. + In support of Kernel Address Space Layout Randomization (KASLR), + this randomizes the physical address at which the kernel image + is decompressed and the virtual address where the kernel + image is mapped, as a security feature that deters exploit + attempts relying on knowledge of the location of kernel + code internals. + + The kernel physical and virtual address can be randomized + from 16MB up to 1GB on 64-bit and 512MB on 32-bit. (Note that + using RANDOMIZE_BASE reduces the memory space available to + kernel modules from 1.5GB to 1GB.) + + Entropy is generated using the RDRAND instruction if it is + supported. If RDTSC is supported, its value is mixed into + the entropy pool as well. If neither RDRAND nor RDTSC are + supported, then entropy is read from the i8254 timer. + + Since the kernel is built using 2GB addressing, and + PHYSICAL_ALIGN must be at a minimum of 2MB, only 10 bits of + entropy is theoretically possible. Currently, with the + default value for PHYSICAL_ALIGN and due to page table + layouts, 64-bit uses 9 bits of entropy and 32-bit uses 8 bits. + + If CONFIG_HIBERNATE is also enabled, KASLR is disabled at boot + time. To enable it, boot with "kaslr" on the kernel command + line (which will also disable hibernation).
- Entropy is generated using the RDRAND instruction if it is - supported. If RDTSC is supported, it is used as well. If - neither RDRAND nor RDTSC are supported, then randomness is - read from the i8254 timer. - - The kernel will be offset by up to RANDOMIZE_BASE_MAX_OFFSET, - and aligned according to PHYSICAL_ALIGN. Since the kernel is - built using 2GiB addressing, and PHYSICAL_ALGIN must be at a - minimum of 2MiB, only 10 bits of entropy is theoretically - possible. At best, due to page table layouts, 64-bit can use - 9 bits of entropy and 32-bit uses 8 bits. - - If unsure, say N. - -config RANDOMIZE_BASE_MAX_OFFSET - hex "Maximum kASLR offset allowed" if EXPERT - depends on RANDOMIZE_BASE - range 0x0 0x20000000 if X86_32 - default "0x20000000" if X86_32 - range 0x0 0x40000000 if X86_64 - default "0x40000000" if X86_64 - ---help--- - The lesser of RANDOMIZE_BASE_MAX_OFFSET and available physical - memory is used to determine the maximal offset in bytes that will - be applied to the kernel when kernel Address Space Layout - Randomization (kASLR) is active. This must be a multiple of - PHYSICAL_ALIGN. - - On 32-bit this is limited to 512MiB by page table layouts. The - default is 512MiB. - - On 64-bit this is limited by how the kernel fixmap page table is - positioned, so this cannot be larger than 1GiB currently. Without - RANDOMIZE_BASE, there is a 512MiB to 1.5GiB split between kernel - and modules. When RANDOMIZE_BASE_MAX_OFFSET is above 512MiB, the - modules area will shrink to compensate, up to the current maximum - 1GiB to 1GiB split. The default is 1GiB. - - If unsure, leave at the default value. + If unsure, say N.
# Relocation on x86 needs some additional build support config X86_NEED_RELOCS diff --combined drivers/net/hamradio/baycom_epp.c index 7c78307,eb66638..78dbc44 --- a/drivers/net/hamradio/baycom_epp.c +++ b/drivers/net/hamradio/baycom_epp.c @@@ -635,10 -635,10 +635,10 @@@ static int receive(struct net_device *d
#ifdef __i386__ #include <asm/msr.h> -#define GETTICK(x) \ -({ \ - if (cpu_has_tsc) \ - x = (unsigned int)rdtsc(); \ +#define GETTICK(x) \ +({ \ + if (boot_cpu_has(X86_FEATURE_TSC)) \ + x = (unsigned int)rdtsc(); \ }) #else /* __i386__ */ #define GETTICK(x) @@@ -780,8 -780,10 +780,10 @@@ static int baycom_send_packet(struct sk dev_kfree_skb(skb); return NETDEV_TX_OK; } - if (bc->skb) - return NETDEV_TX_LOCKED; + if (bc->skb) { + dev_kfree_skb(skb); + return NETDEV_TX_OK; + } /* strip KISS byte */ if (skb->len >= HDLCDRV_MAXFLEN+1 || skb->len < 3) { dev_kfree_skb(skb); diff --combined drivers/soc/qcom/spm.c index 1fcbb22,f324451..f9d7a85 --- a/drivers/soc/qcom/spm.c +++ b/drivers/soc/qcom/spm.c @@@ -2,6 -2,8 +2,8 @@@ * Copyright (c) 2011-2014, The Linux Foundation. All rights reserved. * Copyright (c) 2014,2015, Linaro Ltd. * + * SAW power controller driver + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and * only version 2 as published by the Free Software Foundation. @@@ -12,7 -14,6 +14,6 @@@ * GNU General Public License for more details. */
- #include <linux/module.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/io.h> @@@ -274,7 -275,7 +275,7 @@@ check_spm return per_cpu(cpu_spm_drv, cpu) ? 0 : -ENXIO; }
-static struct cpuidle_ops qcom_cpuidle_ops __initdata = { +static const struct cpuidle_ops qcom_cpuidle_ops __initconst = { .suspend = qcom_idle_enter, .init = qcom_cpuidle_init, }; @@@ -378,8 -379,5 +379,5 @@@ static struct platform_driver spm_drive .of_match_table = spm_match_table, }, }; - module_platform_driver(spm_driver);
- MODULE_LICENSE("GPL v2"); - MODULE_DESCRIPTION("SAW power controller driver"); - MODULE_ALIAS("platform:saw"); + builtin_platform_driver(spm_driver); diff --combined include/linux/perf_event.h index 9e1c3ad,eb41b53..44f3383 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@@ -58,7 -58,7 +58,7 @@@ struct perf_guest_info_callbacks
struct perf_callchain_entry { __u64 nr; - __u64 ip[PERF_MAX_STACK_DEPTH]; + __u64 ip[0]; /* /proc/sys/kernel/perf_event_max_stack */ };
struct perf_raw_record { @@@ -151,15 -151,6 +151,15 @@@ struct hw_perf_event */ struct task_struct *target;
+ /* + * PMU would store hardware filter configuration + * here. + */ + void *addr_filters; + + /* Last sync'ed generation of filters */ + unsigned long addr_filters_gen; + /* * hw_perf_event::state flags; used to track the PERF_EF_* state. */ @@@ -225,7 -216,6 +225,7 @@@ struct perf_event #define PERF_PMU_CAP_AUX_SW_DOUBLEBUF 0x08 #define PERF_PMU_CAP_EXCLUSIVE 0x10 #define PERF_PMU_CAP_ITRACE 0x20 +#define PERF_PMU_CAP_HETEROGENEOUS_CPUS 0x40
/** * struct pmu - generic performance monitoring unit @@@ -250,9 -240,6 +250,9 @@@ struct pmu int task_ctx_nr; int hrtimer_interval_ms;
+ /* number of address filters this PMU can do */ + unsigned int nr_addr_filters; + /* * Fully disable/enable this PMU, can be used to protect from the PMI * as well as for lazy/batch writing of the MSRs. @@@ -406,71 -393,12 +406,71 @@@ void (*free_aux) (void *aux); /* optional */
/* + * Validate address range filters: make sure the HW supports the + * requested configuration and number of filters; return 0 if the + * supplied filters are valid, -errno otherwise. + * + * Runs in the context of the ioctl()ing process and is not serialized + * with the rest of the PMU callbacks. + */ + int (*addr_filters_validate) (struct list_head *filters); + /* optional */ + + /* + * Synchronize address range filter configuration: + * translate hw-agnostic filters into hardware configuration in + * event::hw::addr_filters. + * + * Runs as a part of filter sync sequence that is done in ->start() + * callback by calling perf_event_addr_filters_sync(). + * + * May (and should) traverse event::addr_filters::list, for which its + * caller provides necessary serialization. + */ + void (*addr_filters_sync) (struct perf_event *event); + /* optional */ + + /* * Filter events for PMU-specific reasons. */ int (*filter_match) (struct perf_event *event); /* optional */ };
/** + * struct perf_addr_filter - address range filter definition + * @entry: event's filter list linkage + * @inode: object file's inode for file-based filters + * @offset: filter range offset + * @size: filter range size + * @range: 1: range, 0: address + * @filter: 1: filter/start, 0: stop + * + * This is a hardware-agnostic filter configuration as specified by the user. + */ +struct perf_addr_filter { + struct list_head entry; + struct inode *inode; + unsigned long offset; + unsigned long size; + unsigned int range : 1, + filter : 1; +}; + +/** + * struct perf_addr_filters_head - container for address range filters + * @list: list of filters for this event + * @lock: spinlock that serializes accesses to the @list and event's + * (and its children's) filter generations. + * + * A child event will use parent's @list (and therefore @lock), so they are + * bundled together; see perf_event_addr_filters(). + */ +struct perf_addr_filters_head { + struct list_head list; + raw_spinlock_t lock; +}; + +/** * enum perf_event_active_state - the states of a event */ enum perf_event_active_state { @@@ -638,12 -566,6 +638,12 @@@ struct perf_event
atomic_t event_limit;
+ /* address range filters */ + struct perf_addr_filters_head addr_filters; + /* vma address array for file-based filders */ + unsigned long *addr_filters_offs; + unsigned long addr_filters_gen; + void (*destroy)(struct perf_event *); struct rcu_head rcu_head;
@@@ -912,25 -834,9 +912,25 @@@ extern int perf_event_overflow(struct p struct perf_sample_data *data, struct pt_regs *regs);
+extern void perf_event_output_forward(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs); +extern void perf_event_output_backward(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs); extern void perf_event_output(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs); + struct perf_sample_data *data, + struct pt_regs *regs); + +static inline bool +is_default_overflow_handler(struct perf_event *event) +{ + if (likely(event->overflow_handler == perf_event_output_forward)) + return true; + if (unlikely(event->overflow_handler == perf_event_output_backward)) + return true; + return false; +}
extern void perf_event_header__init_id(struct perf_event_header *header, @@@ -976,8 -882,6 +976,6 @@@ static inline void perf_arch_fetch_call */ static inline void perf_fetch_caller_regs(struct pt_regs *regs) { - memset(regs, 0, sizeof(*regs)); - perf_arch_fetch_caller_regs(regs, CALLER_ADDR0); }
@@@ -1071,11 -975,9 +1069,11 @@@ get_perf_callchain(struct pt_regs *regs extern int get_callchain_buffers(void); extern void put_callchain_buffers(void);
+extern int sysctl_perf_event_max_stack; + static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip) { - if (entry->nr < PERF_MAX_STACK_DEPTH) { + if (entry->nr < sysctl_perf_event_max_stack) { entry->ip[entry->nr++] = ip; return 0; } else { @@@ -1097,8 -999,6 +1095,8 @@@ extern int perf_cpu_time_max_percent_ha void __user *buffer, size_t *lenp, loff_t *ppos);
+int perf_event_max_stack_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos);
static inline bool perf_paranoid_tracepoint_raw(void) { @@@ -1116,7 -1016,7 +1114,7 @@@ static inline bool perf_paranoid_kernel }
extern void perf_event_init(void); - extern void perf_tp_event(u64 addr, u64 count, void *record, + extern void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, struct task_struct *task); @@@ -1143,41 -1043,8 +1141,41 @@@ static inline bool has_aux(struct perf_ return event->pmu->setup_aux; }
+static inline bool is_write_backward(struct perf_event *event) +{ + return !!event->attr.write_backward; +} + +static inline bool has_addr_filter(struct perf_event *event) +{ + return event->pmu->nr_addr_filters; +} + +/* + * An inherited event uses parent's filters + */ +static inline struct perf_addr_filters_head * +perf_event_addr_filters(struct perf_event *event) +{ + struct perf_addr_filters_head *ifh = &event->addr_filters; + + if (event->parent) + ifh = &event->parent->addr_filters; + + return ifh; +} + +extern void perf_event_addr_filters_sync(struct perf_event *event); + extern int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size); +extern int perf_output_begin_forward(struct perf_output_handle *handle, + struct perf_event *event, + unsigned int size); +extern int perf_output_begin_backward(struct perf_output_handle *handle, + struct perf_event *event, + unsigned int size); + extern void perf_output_end(struct perf_output_handle *handle); extern unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len); diff --combined kernel/bpf/stackmap.c index f5a1954,3511472..c8ee352 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@@ -66,7 -66,7 +66,7 @@@ static struct bpf_map *stack_map_alloc( /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || value_size < 8 || value_size % 8 || - value_size / 8 > PERF_MAX_STACK_DEPTH) + value_size / 8 > sysctl_perf_event_max_stack) return ERR_PTR(-EINVAL);
/* hash table size must be power of 2 */ @@@ -116,7 -116,7 +116,7 @@@ free_smap return ERR_PTR(err); }
- static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) + u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) { struct pt_regs *regs = (struct pt_regs *) (long) r1; struct bpf_map *map = (struct bpf_map *) (long) r2; @@@ -124,8 -124,8 +124,8 @@@ struct perf_callchain_entry *trace; struct stack_map_bucket *bucket, *new_bucket, *old_bucket; u32 max_depth = map->value_size / 8; - /* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */ - u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth; + /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ + u32 init_nr = sysctl_perf_event_max_stack - max_depth; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; u32 hash, id, trace_nr, trace_len; bool user = flags & BPF_F_USER_STACK; @@@ -143,7 -143,7 +143,7 @@@ return -EFAULT;
/* get_perf_callchain() guarantees that trace->nr >= init_nr - * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth + * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth */ trace_nr = trace->nr - init_nr;
diff --combined kernel/events/core.c index 050a290,5b16725..274450e --- a/kernel/events/core.c +++ b/kernel/events/core.c @@@ -44,8 -44,6 +44,8 @@@ #include <linux/compat.h> #include <linux/bpf.h> #include <linux/filter.h> +#include <linux/namei.h> +#include <linux/parser.h>
#include "internal.h"
@@@ -1929,13 -1927,8 +1929,13 @@@ event_sched_in(struct perf_event *event if (event->state <= PERF_EVENT_STATE_OFF) return 0;
- event->state = PERF_EVENT_STATE_ACTIVE; - event->oncpu = smp_processor_id(); + WRITE_ONCE(event->oncpu, smp_processor_id()); + /* + * Order event::oncpu write to happen before the ACTIVE state + * is visible. + */ + smp_wmb(); + WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
/* * Unthrottle events, since we scheduled we might have missed several @@@ -2367,112 -2360,6 +2367,112 @@@ void perf_event_enable(struct perf_even } EXPORT_SYMBOL_GPL(perf_event_enable);
+struct stop_event_data { + struct perf_event *event; + unsigned int restart; +}; + +static int __perf_event_stop(void *info) +{ + struct stop_event_data *sd = info; + struct perf_event *event = sd->event; + + /* if it's already INACTIVE, do nothing */ + if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) + return 0; + + /* matches smp_wmb() in event_sched_in() */ + smp_rmb(); + + /* + * There is a window with interrupts enabled before we get here, + * so we need to check again lest we try to stop another CPU's event. + */ + if (READ_ONCE(event->oncpu) != smp_processor_id()) + return -EAGAIN; + + event->pmu->stop(event, PERF_EF_UPDATE); + + /* + * May race with the actual stop (through perf_pmu_output_stop()), + * but it is only used for events with AUX ring buffer, and such + * events will refuse to restart because of rb::aux_mmap_count==0, + * see comments in perf_aux_output_begin(). + * + * Since this is happening on a event-local CPU, no trace is lost + * while restarting. + */ + if (sd->restart) + event->pmu->start(event, PERF_EF_START); + + return 0; +} + +static int perf_event_restart(struct perf_event *event) +{ + struct stop_event_data sd = { + .event = event, + .restart = 1, + }; + int ret = 0; + + do { + if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) + return 0; + + /* matches smp_wmb() in event_sched_in() */ + smp_rmb(); + + /* + * We only want to restart ACTIVE events, so if the event goes + * inactive here (event->oncpu==-1), there's nothing more to do; + * fall through with ret==-ENXIO. + */ + ret = cpu_function_call(READ_ONCE(event->oncpu), + __perf_event_stop, &sd); + } while (ret == -EAGAIN); + + return ret; +} + +/* + * In order to contain the amount of racy and tricky in the address filter + * configuration management, it is a two part process: + * + * (p1) when userspace mappings change as a result of (1) or (2) or (3) below, + * we update the addresses of corresponding vmas in + * event::addr_filters_offs array and bump the event::addr_filters_gen; + * (p2) when an event is scheduled in (pmu::add), it calls + * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync() + * if the generation has changed since the previous call. + * + * If (p1) happens while the event is active, we restart it to force (p2). + * + * (1) perf_addr_filters_apply(): adjusting filters' offsets based on + * pre-existing mappings, called once when new filters arrive via SET_FILTER + * ioctl; + * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly + * registered mapping, called for every new mmap(), with mm::mmap_sem down + * for reading; + * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process + * of exec. + */ +void perf_event_addr_filters_sync(struct perf_event *event) +{ + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + + if (!has_addr_filter(event)) + return; + + raw_spin_lock(&ifh->lock); + if (event->addr_filters_gen != event->hw.addr_filters_gen) { + event->pmu->addr_filters_sync(event); + event->hw.addr_filters_gen = event->addr_filters_gen; + } + raw_spin_unlock(&ifh->lock); +} +EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync); + static int _perf_event_refresh(struct perf_event *event, int refresh) { /* @@@ -3322,6 -3209,16 +3322,6 @@@ out put_ctx(clone_ctx); }
-void perf_event_exec(void) -{ - int ctxn; - - rcu_read_lock(); - for_each_task_context_nr(ctxn) - perf_event_enable_on_exec(ctxn); - rcu_read_unlock(); -} - struct perf_read_data { struct perf_event *event; bool group; @@@ -3823,9 -3720,6 +3823,9 @@@ static bool exclusive_event_installable return true; }
+static void perf_addr_filters_splice(struct perf_event *event, + struct list_head *head); + static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending); @@@ -3853,8 -3747,6 +3853,8 @@@ }
perf_event_free_bpf_prog(event); + perf_addr_filters_splice(event, NULL); + kfree(event->addr_filters_offs);
if (event->destroy) event->destroy(event); @@@ -4451,19 -4343,6 +4451,19 @@@ static long _perf_ioctl(struct perf_eve case PERF_EVENT_IOC_SET_BPF: return perf_event_set_bpf_prog(event, arg);
+ case PERF_EVENT_IOC_PAUSE_OUTPUT: { + struct ring_buffer *rb; + + rcu_read_lock(); + rb = rcu_dereference(event->rb); + if (!rb || !rb->nr_pages) { + rcu_read_unlock(); + return -EINVAL; + } + rb_toggle_paused(rb, !!arg); + rcu_read_unlock(); + return 0; + } default: return -ENOTTY; } @@@ -4780,8 -4659,6 +4780,8 @@@ static void perf_mmap_open(struct vm_ar event->pmu->event_mapped(event); }
+static void perf_pmu_output_stop(struct perf_event *event); + /* * A buffer can be mmap()ed multiple times; either directly through the same * event, or through other events by use of perf_event_set_output(). @@@ -4809,22 -4686,10 +4809,22 @@@ static void perf_mmap_close(struct vm_a */ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) { + /* + * Stop all AUX events that are writing to this buffer, + * so that we can free its AUX pages and corresponding PMU + * data. Note that after rb::aux_mmap_count dropped to zero, + * they won't start any more (see perf_aux_output_begin()). + */ + perf_pmu_output_stop(event); + + /* now it's safe to free the pages */ atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
+ /* this has to be the last one */ rb_free_aux(rb); + WARN_ON_ONCE(atomic_read(&rb->aux_refcount)); + mutex_unlock(&event->mmap_mutex); }
@@@ -5765,13 -5630,9 +5765,13 @@@ void perf_prepare_sample(struct perf_ev } }
-void perf_event_output(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) +static void __always_inline +__perf_event_output(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs, + int (*output_begin)(struct perf_output_handle *, + struct perf_event *, + unsigned int)) { struct perf_output_handle handle; struct perf_event_header header; @@@ -5781,7 -5642,7 +5781,7 @@@
perf_prepare_sample(&header, data, event, regs);
- if (perf_output_begin(&handle, event, header.size)) + if (output_begin(&handle, event, header.size)) goto exit;
perf_output_sample(&handle, &header, data, event); @@@ -5792,30 -5653,6 +5792,30 @@@ exit rcu_read_unlock(); }
+void +perf_event_output_forward(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + __perf_event_output(event, data, regs, perf_output_begin_forward); +} + +void +perf_event_output_backward(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + __perf_event_output(event, data, regs, perf_output_begin_backward); +} + +void +perf_event_output(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + __perf_event_output(event, data, regs, perf_output_begin); +} + /* * read event_id */ @@@ -5861,18 -5698,15 +5861,18 @@@ typedef void (perf_event_aux_output_cb) static void perf_event_aux_ctx(struct perf_event_context *ctx, perf_event_aux_output_cb output, - void *data) + void *data, bool all) { struct perf_event *event;
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (event->state < PERF_EVENT_STATE_INACTIVE) - continue; - if (!event_filter_match(event)) - continue; + if (!all) { + if (event->state < PERF_EVENT_STATE_INACTIVE) + continue; + if (!event_filter_match(event)) + continue; + } + output(event, data); } } @@@ -5883,7 -5717,7 +5883,7 @@@ perf_event_aux_task_ctx(perf_event_aux_ { rcu_read_lock(); preempt_disable(); - perf_event_aux_ctx(task_ctx, output, data); + perf_event_aux_ctx(task_ctx, output, data, false); preempt_enable(); rcu_read_unlock(); } @@@ -5913,13 -5747,13 +5913,13 @@@ perf_event_aux(perf_event_aux_output_c cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); if (cpuctx->unique_pmu != pmu) goto next; - perf_event_aux_ctx(&cpuctx->ctx, output, data); + perf_event_aux_ctx(&cpuctx->ctx, output, data, false); ctxn = pmu->task_ctx_nr; if (ctxn < 0) goto next; ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); if (ctx) - perf_event_aux_ctx(ctx, output, data); + perf_event_aux_ctx(ctx, output, data, false); next: put_cpu_ptr(pmu->pmu_cpu_context); } @@@ -5927,134 -5761,6 +5927,134 @@@ }
/* + * Clear all file-based filters at exec, they'll have to be + * re-instated when/if these objects are mmapped again. + */ +static void perf_event_addr_filters_exec(struct perf_event *event, void *data) +{ + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + struct perf_addr_filter *filter; + unsigned int restart = 0, count = 0; + unsigned long flags; + + if (!has_addr_filter(event)) + return; + + raw_spin_lock_irqsave(&ifh->lock, flags); + list_for_each_entry(filter, &ifh->list, entry) { + if (filter->inode) { + event->addr_filters_offs[count] = 0; + restart++; + } + + count++; + } + + if (restart) + event->addr_filters_gen++; + raw_spin_unlock_irqrestore(&ifh->lock, flags); + + if (restart) + perf_event_restart(event); +} + +void perf_event_exec(void) +{ + struct perf_event_context *ctx; + int ctxn; + + rcu_read_lock(); + for_each_task_context_nr(ctxn) { + ctx = current->perf_event_ctxp[ctxn]; + if (!ctx) + continue; + + perf_event_enable_on_exec(ctxn); + + perf_event_aux_ctx(ctx, perf_event_addr_filters_exec, NULL, + true); + } + rcu_read_unlock(); +} + +struct remote_output { + struct ring_buffer *rb; + int err; +}; + +static void __perf_event_output_stop(struct perf_event *event, void *data) +{ + struct perf_event *parent = event->parent; + struct remote_output *ro = data; + struct ring_buffer *rb = ro->rb; + struct stop_event_data sd = { + .event = event, + }; + + if (!has_aux(event)) + return; + + if (!parent) + parent = event; + + /* + * In case of inheritance, it will be the parent that links to the + * ring-buffer, but it will be the child that's actually using it: + */ + if (rcu_dereference(parent->rb) == rb) + ro->err = __perf_event_stop(&sd); +} + +static int __perf_pmu_output_stop(void *info) +{ + struct perf_event *event = info; + struct pmu *pmu = event->pmu; + struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + struct remote_output ro = { + .rb = event->rb, + }; + + rcu_read_lock(); + perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); + if (cpuctx->task_ctx) + perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop, + &ro, false); + rcu_read_unlock(); + + return ro.err; +} + +static void perf_pmu_output_stop(struct perf_event *event) +{ + struct perf_event *iter; + int err, cpu; + +restart: + rcu_read_lock(); + list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) { + /* + * For per-CPU events, we need to make sure that neither they + * nor their children are running; for cpu==-1 events it's + * sufficient to stop the event itself if it's active, since + * it can't have children. + */ + cpu = iter->cpu; + if (cpu == -1) + cpu = READ_ONCE(iter->oncpu); + + if (cpu == -1) + continue; + + err = cpu_function_call(cpu, __perf_pmu_output_stop, event); + if (err == -EAGAIN) { + rcu_read_unlock(); + goto restart; + } + } + rcu_read_unlock(); +} + +/* * task tracking -- fork/exit * * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task @@@ -6463,87 -6169,6 +6463,87 @@@ got_name kfree(buf); }
+/* + * Whether this @filter depends on a dynamic object which is not loaded + * yet or its load addresses are not known. + */ +static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter) +{ + return filter->filter && filter->inode; +} + +/* + * Check whether inode and address range match filter criteria. + */ +static bool perf_addr_filter_match(struct perf_addr_filter *filter, + struct file *file, unsigned long offset, + unsigned long size) +{ + if (filter->inode != file->f_inode) + return false; + + if (filter->offset > offset + size) + return false; + + if (filter->offset + filter->size < offset) + return false; + + return true; +} + +static void __perf_addr_filters_adjust(struct perf_event *event, void *data) +{ + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + struct vm_area_struct *vma = data; + unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags; + struct file *file = vma->vm_file; + struct perf_addr_filter *filter; + unsigned int restart = 0, count = 0; + + if (!has_addr_filter(event)) + return; + + if (!file) + return; + + raw_spin_lock_irqsave(&ifh->lock, flags); + list_for_each_entry(filter, &ifh->list, entry) { + if (perf_addr_filter_match(filter, file, off, + vma->vm_end - vma->vm_start)) { + event->addr_filters_offs[count] = vma->vm_start; + restart++; + } + + count++; + } + + if (restart) + event->addr_filters_gen++; + raw_spin_unlock_irqrestore(&ifh->lock, flags); + + if (restart) + perf_event_restart(event); +} + +/* + * Adjust all task's events' filters to the new vma + */ +static void perf_addr_filters_adjust(struct vm_area_struct *vma) +{ + struct perf_event_context *ctx; + int ctxn; + + rcu_read_lock(); + for_each_task_context_nr(ctxn) { + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (!ctx) + continue; + + perf_event_aux_ctx(ctx, __perf_addr_filters_adjust, vma, true); + } + rcu_read_unlock(); +} + void perf_event_mmap(struct vm_area_struct *vma) { struct perf_mmap_event mmap_event; @@@ -6575,7 -6200,6 +6575,7 @@@ /* .flags (attr_mmap2 only) */ };
+ perf_addr_filters_adjust(vma); perf_event_mmap_event(&mmap_event); }
@@@ -6867,7 -6491,10 +6867,7 @@@ static int __perf_event_overflow(struc irq_work_queue(&event->pending); }
- if (event->overflow_handler) - event->overflow_handler(event, data, regs); - else - perf_event_output(event, data, regs); + event->overflow_handler(event, data, regs);
if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; @@@ -7100,7 -6727,7 +7100,7 @@@ int perf_swevent_get_recursion_context( } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
- inline void perf_swevent_put_recursion_context(int rctx) + void perf_swevent_put_recursion_context(int rctx) { struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
@@@ -7362,7 -6989,26 +7362,26 @@@ static int perf_tp_event_match(struct p return 1; }
- void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, + void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, + struct trace_event_call *call, u64 count, + struct pt_regs *regs, struct hlist_head *head, + struct task_struct *task) + { + struct bpf_prog *prog = call->prog; + + if (prog) { + *(struct pt_regs **)raw_data = regs; + if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) { + perf_swevent_put_recursion_context(rctx); + return; + } + } + perf_tp_event(call->event.type, count, raw_data, size, regs, head, + rctx, task); + } + EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); + + void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, struct task_struct *task) { @@@ -7374,9 -7020,11 +7393,11 @@@ .data = record, };
- perf_sample_data_init(&data, addr, 0); + perf_sample_data_init(&data, 0, 0); data.raw = &raw;
+ perf_trace_buf_update(record, event_type); + hlist_for_each_entry_rcu(event, head, hlist_entry) { if (perf_tp_event_match(event, &data, regs)) perf_swevent_event(event, count, &data, regs); @@@ -7454,6 -7102,24 +7475,6 @@@ static inline void perf_tp_register(voi perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); }
-static int perf_event_set_filter(struct perf_event *event, void __user *arg) -{ - char *filter_str; - int ret; - - if (event->attr.type != PERF_TYPE_TRACEPOINT) - return -EINVAL; - - filter_str = strndup_user(arg, PAGE_SIZE); - if (IS_ERR(filter_str)) - return PTR_ERR(filter_str); - - ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); - - kfree(filter_str); - return ret; -} - static void perf_event_free_filter(struct perf_event *event) { ftrace_profile_free_filter(event); @@@ -7461,6 -7127,7 +7482,7 @@@
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { + bool is_kprobe, is_tracepoint; struct bpf_prog *prog;
if (event->attr.type != PERF_TYPE_TRACEPOINT) @@@ -7469,20 -7136,31 +7491,31 @@@ if (event->tp_event->prog) return -EEXIST;
- if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE)) - /* bpf programs can only be attached to u/kprobes */ + is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; + is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; + if (!is_kprobe && !is_tracepoint) + /* bpf programs can only be attached to u/kprobe or tracepoint */ return -EINVAL;
prog = bpf_prog_get(prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog);
- if (prog->type != BPF_PROG_TYPE_KPROBE) { + if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) || + (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) { /* valid fd, but invalid bpf program type */ bpf_prog_put(prog); return -EINVAL; }
+ if (is_tracepoint) { + int off = trace_event_get_offsets(event->tp_event); + + if (prog->aux->max_ctx_offset > off) { + bpf_prog_put(prog); + return -EACCES; + } + } event->tp_event->prog = prog;
return 0; @@@ -7508,6 -7186,11 +7541,6 @@@ static inline void perf_tp_register(voi { }
-static int perf_event_set_filter(struct perf_event *event, void __user *arg) -{ - return -ENOENT; -} - static void perf_event_free_filter(struct perf_event *event) { } @@@ -7536,387 -7219,6 +7569,387 @@@ void perf_bp_event(struct perf_event *b #endif
/* + * Allocate a new address filter + */ +static struct perf_addr_filter * +perf_addr_filter_new(struct perf_event *event, struct list_head *filters) +{ + int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu); + struct perf_addr_filter *filter; + + filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node); + if (!filter) + return NULL; + + INIT_LIST_HEAD(&filter->entry); + list_add_tail(&filter->entry, filters); + + return filter; +} + +static void free_filters_list(struct list_head *filters) +{ + struct perf_addr_filter *filter, *iter; + + list_for_each_entry_safe(filter, iter, filters, entry) { + if (filter->inode) + iput(filter->inode); + list_del(&filter->entry); + kfree(filter); + } +} + +/* + * Free existing address filters and optionally install new ones + */ +static void perf_addr_filters_splice(struct perf_event *event, + struct list_head *head) +{ + unsigned long flags; + LIST_HEAD(list); + + if (!has_addr_filter(event)) + return; + + /* don't bother with children, they don't have their own filters */ + if (event->parent) + return; + + raw_spin_lock_irqsave(&event->addr_filters.lock, flags); + + list_splice_init(&event->addr_filters.list, &list); + if (head) + list_splice(head, &event->addr_filters.list); + + raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags); + + free_filters_list(&list); +} + +/* + * Scan through mm's vmas and see if one of them matches the + * @filter; if so, adjust filter's address range. + * Called with mm::mmap_sem down for reading. + */ +static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter, + struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + struct file *file = vma->vm_file; + unsigned long off = vma->vm_pgoff << PAGE_SHIFT; + unsigned long vma_size = vma->vm_end - vma->vm_start; + + if (!file) + continue; + + if (!perf_addr_filter_match(filter, file, off, vma_size)) + continue; + + return vma->vm_start; + } + + return 0; +} + +/* + * Update event's address range filters based on the + * task's existing mappings, if any. + */ +static void perf_event_addr_filters_apply(struct perf_event *event) +{ + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + struct task_struct *task = READ_ONCE(event->ctx->task); + struct perf_addr_filter *filter; + struct mm_struct *mm = NULL; + unsigned int count = 0; + unsigned long flags; + + /* + * We may observe TASK_TOMBSTONE, which means that the event tear-down + * will stop on the parent's child_mutex that our caller is also holding + */ + if (task == TASK_TOMBSTONE) + return; + + mm = get_task_mm(event->ctx->task); + if (!mm) + goto restart; + + down_read(&mm->mmap_sem); + + raw_spin_lock_irqsave(&ifh->lock, flags); + list_for_each_entry(filter, &ifh->list, entry) { + event->addr_filters_offs[count] = 0; + + if (perf_addr_filter_needs_mmap(filter)) + event->addr_filters_offs[count] = + perf_addr_filter_apply(filter, mm); + + count++; + } + + event->addr_filters_gen++; + raw_spin_unlock_irqrestore(&ifh->lock, flags); + + up_read(&mm->mmap_sem); + + mmput(mm); + +restart: + perf_event_restart(event); +} + +/* + * Address range filtering: limiting the data to certain + * instruction address ranges. Filters are ioctl()ed to us from + * userspace as ascii strings. 
+ * + * Filter string format: + * + * ACTION RANGE_SPEC + * where ACTION is one of the + * * "filter": limit the trace to this region + * * "start": start tracing from this address + * * "stop": stop tracing at this address/region; + * RANGE_SPEC is + * * for kernel addresses: <start address>[/<size>] + * * for object files: <start address>[/<size>]@</path/to/object/file> + * + * if <size> is not specified, the range is treated as a single address. + */ +enum { + IF_ACT_FILTER, + IF_ACT_START, + IF_ACT_STOP, + IF_SRC_FILE, + IF_SRC_KERNEL, + IF_SRC_FILEADDR, + IF_SRC_KERNELADDR, +}; + +enum { + IF_STATE_ACTION = 0, + IF_STATE_SOURCE, + IF_STATE_END, +}; + +static const match_table_t if_tokens = { + { IF_ACT_FILTER, "filter" }, + { IF_ACT_START, "start" }, + { IF_ACT_STOP, "stop" }, + { IF_SRC_FILE, "%u/%u@%s" }, + { IF_SRC_KERNEL, "%u/%u" }, + { IF_SRC_FILEADDR, "%u@%s" }, + { IF_SRC_KERNELADDR, "%u" }, +}; + +/* + * Address filter string parser + */ +static int +perf_event_parse_addr_filter(struct perf_event *event, char *fstr, + struct list_head *filters) +{ + struct perf_addr_filter *filter = NULL; + char *start, *orig, *filename = NULL; + struct path path; + substring_t args[MAX_OPT_ARGS]; + int state = IF_STATE_ACTION, token; + unsigned int kernel = 0; + int ret = -EINVAL; + + orig = fstr = kstrdup(fstr, GFP_KERNEL); + if (!fstr) + return -ENOMEM; + + while ((start = strsep(&fstr, " ,\n")) != NULL) { + ret = -EINVAL; + + if (!*start) + continue; + + /* filter definition begins */ + if (state == IF_STATE_ACTION) { + filter = perf_addr_filter_new(event, filters); + if (!filter) + goto fail; + } + + token = match_token(start, if_tokens, args); + switch (token) { + case IF_ACT_FILTER: + case IF_ACT_START: + filter->filter = 1; + + case IF_ACT_STOP: + if (state != IF_STATE_ACTION) + goto fail; + + state = IF_STATE_SOURCE; + break; + + case IF_SRC_KERNELADDR: + case IF_SRC_KERNEL: + kernel = 1; + + case IF_SRC_FILEADDR: + case IF_SRC_FILE: + if (state != IF_STATE_SOURCE) + goto fail; + + if (token == IF_SRC_FILE || token == IF_SRC_KERNEL) + filter->range = 1; + + *args[0].to = 0; + ret = kstrtoul(args[0].from, 0, &filter->offset); + if (ret) + goto fail; + + if (filter->range) { + *args[1].to = 0; + ret = kstrtoul(args[1].from, 0, &filter->size); + if (ret) + goto fail; + } + + if (token == IF_SRC_FILE) { + filename = match_strdup(&args[2]); + if (!filename) { + ret = -ENOMEM; + goto fail; + } + } + + state = IF_STATE_END; + break; + + default: + goto fail; + } + + /* + * Filter definition is fully parsed, validate and install it. + * Make sure that it doesn't contradict itself or the event's + * attribute. 
+ */ + if (state == IF_STATE_END) { + if (kernel && event->attr.exclude_kernel) + goto fail; + + if (!kernel) { + if (!filename) + goto fail; + + /* look up the path and grab its inode */ + ret = kern_path(filename, LOOKUP_FOLLOW, &path); + if (ret) + goto fail_free_name; + + filter->inode = igrab(d_inode(path.dentry)); + path_put(&path); + kfree(filename); + filename = NULL; + + ret = -EINVAL; + if (!filter->inode || + !S_ISREG(filter->inode->i_mode)) + /* free_filters_list() will iput() */ + goto fail; + } + + /* ready to consume more filters */ + state = IF_STATE_ACTION; + filter = NULL; + } + } + + if (state != IF_STATE_ACTION) + goto fail; + + kfree(orig); + + return 0; + +fail_free_name: + kfree(filename); +fail: + free_filters_list(filters); + kfree(orig); + + return ret; +} + +static int +perf_event_set_addr_filter(struct perf_event *event, char *filter_str) +{ + LIST_HEAD(filters); + int ret; + + /* + * Since this is called in perf_ioctl() path, we're already holding + * ctx::mutex. + */ + lockdep_assert_held(&event->ctx->mutex); + + if (WARN_ON_ONCE(event->parent)) + return -EINVAL; + + /* + * For now, we only support filtering in per-task events; doing so + * for CPU-wide events requires additional context switching trickery, + * since same object code will be mapped at different virtual + * addresses in different processes. + */ + if (!event->ctx->task) + return -EOPNOTSUPP; + + ret = perf_event_parse_addr_filter(event, filter_str, &filters); + if (ret) + return ret; + + ret = event->pmu->addr_filters_validate(&filters); + if (ret) { + free_filters_list(&filters); + return ret; + } + + /* remove existing filters, if any */ + perf_addr_filters_splice(event, &filters); + + /* install new filters */ + perf_event_for_each_child(event, perf_event_addr_filters_apply); + + return ret; +} + +static int perf_event_set_filter(struct perf_event *event, void __user *arg) +{ + char *filter_str; + int ret = -EINVAL; + + if ((event->attr.type != PERF_TYPE_TRACEPOINT || + !IS_ENABLED(CONFIG_EVENT_TRACING)) && + !has_addr_filter(event)) + return -EINVAL; + + filter_str = strndup_user(arg, PAGE_SIZE); + if (IS_ERR(filter_str)) + return PTR_ERR(filter_str); + + if (IS_ENABLED(CONFIG_EVENT_TRACING) && + event->attr.type == PERF_TYPE_TRACEPOINT) + ret = ftrace_profile_set_filter(event, event->attr.config, + filter_str); + else if (has_addr_filter(event)) + ret = perf_event_set_addr_filter(event, filter_str); + + kfree(filter_str); + return ret; +} + +/* * hrtimer based swevent callback */
@@@ -8273,20 -7575,6 +8306,20 @@@ static void free_pmu_context(struct pm out: mutex_unlock(&pmus_lock); } + +/* + * Let userspace know that this PMU supports address range filtering: + */ +static ssize_t nr_addr_filters_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct pmu *pmu = dev_get_drvdata(dev); + + return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); +} +DEVICE_ATTR_RO(nr_addr_filters); + static struct idr pmu_idr;
static ssize_t @@@ -8388,19 -7676,9 +8421,19 @@@ static int pmu_dev_alloc(struct pmu *pm if (ret) goto free_dev;
+ /* For PMUs with address filters, throw in an extra attribute: */ + if (pmu->nr_addr_filters) + ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters); + + if (ret) + goto del_dev; + out: return ret;
+del_dev: + device_del(pmu->dev); + free_dev: put_device(pmu->dev); goto out; @@@ -8440,21 -7718,6 +8473,21 @@@ int perf_pmu_register(struct pmu *pmu, }
skip_type: + if (pmu->task_ctx_nr == perf_hw_context) { + static int hw_context_taken = 0; + + /* + * Other than systems with heterogeneous CPUs, it never makes + * sense for two PMUs to share perf_hw_context. PMUs which are + * uncore must use perf_invalid_context. + */ + if (WARN_ON_ONCE(hw_context_taken && + !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS))) + pmu->task_ctx_nr = perf_invalid_context; + + hw_context_taken = 1; + } + pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); if (pmu->pmu_cpu_context) goto got_cpu_context; @@@ -8542,8 -7805,6 +8575,8 @@@ void perf_pmu_unregister(struct pmu *pm free_percpu(pmu->pmu_disable_count); if (pmu->type >= PERF_TYPE_MAX) idr_remove(&pmu_idr, pmu->type); + if (pmu->nr_addr_filters) + device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); device_del(pmu->dev); put_device(pmu->dev); free_pmu_context(pmu); @@@ -8737,7 -7998,6 +8770,7 @@@ perf_event_alloc(struct perf_event_att INIT_LIST_HEAD(&event->sibling_list); INIT_LIST_HEAD(&event->rb_entry); INIT_LIST_HEAD(&event->active_entry); + INIT_LIST_HEAD(&event->addr_filters.list); INIT_HLIST_NODE(&event->hlist_entry);
@@@ -8745,7 -8005,6 +8778,7 @@@ init_irq_work(&event->pending, perf_pending_event);
mutex_init(&event->mmap_mutex); + raw_spin_lock_init(&event->addr_filters.lock);
atomic_long_set(&event->refcount, 1); event->cpu = cpu; @@@ -8780,16 -8039,8 +8813,16 @@@ context = parent_event->overflow_handler_context; }
- event->overflow_handler = overflow_handler; - event->overflow_handler_context = context; + if (overflow_handler) { + event->overflow_handler = overflow_handler; + event->overflow_handler_context = context; + } else if (is_write_backward(event)){ + event->overflow_handler = perf_event_output_backward; + event->overflow_handler_context = NULL; + } else { + event->overflow_handler = perf_event_output_forward; + event->overflow_handler_context = NULL; + }
perf_event__state_init(event);
@@@ -8830,22 -8081,11 +8863,22 @@@ if (err) goto err_pmu;
+ if (has_addr_filter(event)) { + event->addr_filters_offs = kcalloc(pmu->nr_addr_filters, + sizeof(unsigned long), + GFP_KERNEL); + if (!event->addr_filters_offs) + goto err_per_task; + + /* force hw sync on the address filters */ + event->addr_filters_gen = 1; + } + if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { err = get_callchain_buffers(); if (err) - goto err_per_task; + goto err_addr_filters; } }
@@@ -8854,9 -8094,6 +8887,9 @@@
return event;
+err_addr_filters: + kfree(event->addr_filters_offs); + err_per_task: exclusive_event_destroy(event);
@@@ -9036,13 -8273,6 +9069,13 @@@ perf_event_set_output(struct perf_even goto out;
/* + * Either writing ring buffer from beginning or from end. + * Mixing is not allowed. + */ + if (is_write_backward(output_event) != is_write_backward(event)) + goto out; + + /* * If both events generate aux data, they must be on the same PMU */ if (has_aux(event) && has_aux(output_event) && diff --combined kernel/trace/trace_event_perf.c index e11108f,5a92707..562fa69 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@@ -47,9 -47,6 +47,9 @@@ static int perf_trace_event_perm(struc if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) return -EPERM;
+ if (!is_sampling_event(p_event)) + return 0; + /* * We don't allow user space callchains for function trace * event, due to issues with page faults while tracing page @@@ -263,42 -260,43 +263,43 @@@ void perf_trace_del(struct perf_event * tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); }
- void *perf_trace_buf_prepare(int size, unsigned short type, - struct pt_regs **regs, int *rctxp) + void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp) { - struct trace_entry *entry; - unsigned long flags; char *raw_data; - int pc; + int rctx;
BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, - "perf buffer not large enough")) + "perf buffer not large enough")) return NULL;
- pc = preempt_count(); - - *rctxp = perf_swevent_get_recursion_context(); - if (*rctxp < 0) + *rctxp = rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) return NULL;
if (regs) - *regs = this_cpu_ptr(&__perf_regs[*rctxp]); - raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); + *regs = this_cpu_ptr(&__perf_regs[rctx]); + raw_data = this_cpu_ptr(perf_trace_buf[rctx]);
/* zero the dead bytes from align to not leak stack to user */ memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); + return raw_data; + } + EXPORT_SYMBOL_GPL(perf_trace_buf_alloc); + NOKPROBE_SYMBOL(perf_trace_buf_alloc); + + void perf_trace_buf_update(void *record, u16 type) + { + struct trace_entry *entry = record; + int pc = preempt_count(); + unsigned long flags;
- entry = (struct trace_entry *)raw_data; local_save_flags(flags); tracing_generic_entry_update(entry, flags, pc); entry->type = type; - - return raw_data; } - EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); - NOKPROBE_SYMBOL(perf_trace_buf_prepare); + NOKPROBE_SYMBOL(perf_trace_buf_update);
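The old perf_trace_buf_prepare() allocated buffer space and stamped the trace_entry header in one step; it is now split so that allocation and header update can happen separately, and the trace type travels with the submit call instead. A schematic kernel-context fragment (not standalone code) mirroring the ftrace caller below; ip, parent_ip, regs and head come from that caller:

        /* kernel-context sketch only; mirrors perf_ftrace_function_call() below */
        struct ftrace_entry *entry;
        int rctx;

        entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx);
        if (!entry)
                return;

        entry->ip = ip;
        entry->parent_ip = parent_ip;

        /*
         * The trace type (TRACE_FN here) is no longer passed at allocation time;
         * perf_trace_buf_submit() applies it to the header via perf_trace_buf_update().
         */
        perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN, 1, &regs, head, NULL);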
#ifdef CONFIG_FUNCTION_TRACER static void @@@ -319,15 -317,16 +320,16 @@@ perf_ftrace_function_call(unsigned lon
BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);
+ memset(&regs, 0, sizeof(regs)); perf_fetch_caller_regs(&regs);
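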
- entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx); + entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx); if (!entry) return;
entry->ip = ip; entry->parent_ip = parent_ip; - perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, + perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN, 1, &regs, head, NULL);
#undef ENTRY_SIZE diff --combined net/socket.c index 35e4523,7789d79..e7793f5 --- a/net/socket.c +++ b/net/socket.c @@@ -466,7 -466,7 +466,7 @@@ static struct socket *sockfd_lookup_lig #define XATTR_SOCKPROTONAME_SUFFIX "sockprotoname" #define XATTR_NAME_SOCKPROTONAME (XATTR_SYSTEM_PREFIX XATTR_SOCKPROTONAME_SUFFIX) #define XATTR_NAME_SOCKPROTONAME_LEN (sizeof(XATTR_NAME_SOCKPROTONAME)-1) -static ssize_t sockfs_getxattr(struct dentry *dentry, +static ssize_t sockfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) { const char *proto_name; @@@ -587,22 -587,19 +587,19 @@@ void sock_release(struct socket *sock } EXPORT_SYMBOL(sock_release);
- void __sock_tx_timestamp(const struct sock *sk, __u8 *tx_flags) + void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags) { u8 flags = *tx_flags;
- if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_HARDWARE) + if (tsflags & SOF_TIMESTAMPING_TX_HARDWARE) flags |= SKBTX_HW_TSTAMP;
- if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_SOFTWARE) + if (tsflags & SOF_TIMESTAMPING_TX_SOFTWARE) flags |= SKBTX_SW_TSTAMP;
- if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED) + if (tsflags & SOF_TIMESTAMPING_TX_SCHED) flags |= SKBTX_SCHED_TSTAMP;
- if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK) - flags |= SKBTX_ACK_TSTAMP; - *tx_flags = flags; } EXPORT_SYMBOL(__sock_tx_timestamp); @@@ -709,17 -706,16 +706,16 @@@ void __sock_recv_ts_and_drops(struct ms EXPORT_SYMBOL_GPL(__sock_recv_ts_and_drops);
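__sock_tx_timestamp() now receives the timestamping flags as an argument instead of reading sk->sk_tsflags itself, which lets the transmit path honour flags supplied per call (for example via a SO_TIMESTAMPING control message) as well as the socket-wide setting. A hedged userspace sketch of the per-sendmsg variant; it assumes a kernel from this merge window, and reporting flags such as SOF_TIMESTAMPING_SOFTWARE are normally still enabled once via setsockopt():

#include <string.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <sys/socket.h>
#include <linux/net_tstamp.h>

/* request a software TX timestamp for this one sendmsg() call only */
static ssize_t send_with_tx_tstamp(int fd, const void *buf, size_t len)
{
        char control[CMSG_SPACE(sizeof(__u32))];
        struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
        struct msghdr msg = {
                .msg_iov        = &iov,
                .msg_iovlen     = 1,
                .msg_control    = control,
                .msg_controllen = sizeof(control),
        };
        struct cmsghdr *cmsg;

        memset(control, 0, sizeof(control));
        cmsg = CMSG_FIRSTHDR(&msg);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type  = SO_TIMESTAMPING;
        cmsg->cmsg_len   = CMSG_LEN(sizeof(__u32));
        *(__u32 *)CMSG_DATA(cmsg) = SOF_TIMESTAMPING_TX_SOFTWARE;

        return sendmsg(fd, &msg, 0);
}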
static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, - size_t size, int flags) + int flags) { - return sock->ops->recvmsg(sock, msg, size, flags); + return sock->ops->recvmsg(sock, msg, msg_data_left(msg), flags); }
- int sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, - int flags) + int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags) { - int err = security_socket_recvmsg(sock, msg, size, flags); + int err = security_socket_recvmsg(sock, msg, msg_data_left(msg), flags);
- return err ?: sock_recvmsg_nosec(sock, msg, size, flags); + return err ?: sock_recvmsg_nosec(sock, msg, flags); } EXPORT_SYMBOL(sock_recvmsg);
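sock_recvmsg() no longer takes an explicit size; the remaining length is recovered from the iov_iter with msg_data_left(). A minimal kernel-context sketch of a hypothetical in-kernel caller (kernel_recvmsg() in the next hunk is the real one, which additionally wraps the call in set_fs(KERNEL_DS)):

#include <linux/fs.h>
#include <linux/net.h>
#include <linux/uio.h>

/* hypothetical in-kernel caller; sock, buf and buflen are assumed to exist */
static int example_kernel_read(struct socket *sock, void *buf, size_t buflen)
{
        struct kvec vec = { .iov_base = buf, .iov_len = buflen };
        struct msghdr msg = { };

        iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1, buflen);

        /* the length now travels inside msg; sock_recvmsg() uses msg_data_left() */
        return sock_recvmsg(sock, &msg, MSG_DONTWAIT);
}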
@@@ -746,7 -742,7 +742,7 @@@ int kernel_recvmsg(struct socket *sock
iov_iter_kvec(&msg->msg_iter, READ | ITER_KVEC, vec, num, size); set_fs(KERNEL_DS); - result = sock_recvmsg(sock, msg, size, flags); + result = sock_recvmsg(sock, msg, flags); set_fs(oldfs); return result; } @@@ -796,7 -792,7 +792,7 @@@ static ssize_t sock_read_iter(struct ki if (!iov_iter_count(to)) /* Match SYS5 behaviour */ return 0;
- res = sock_recvmsg(sock, &msg, iov_iter_count(to), msg.msg_flags); + res = sock_recvmsg(sock, &msg, msg.msg_flags); *to = msg.msg_iter; return res; } @@@ -1046,7 -1042,7 +1042,7 @@@ static int sock_fasync(int fd, struct f return -EINVAL;
lock_sock(sk); - wq = rcu_dereference_protected(sock->wq, sock_owned_by_user(sk)); + wq = rcu_dereference_protected(sock->wq, lockdep_sock_is_held(sk)); fasync_helper(fd, filp, on, &wq->fasync_list);
if (!wq->fasync_list) @@@ -1696,7 -1692,7 +1692,7 @@@ SYSCALL_DEFINE6(recvfrom, int, fd, voi msg.msg_iocb = NULL; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; - err = sock_recvmsg(sock, &msg, iov_iter_count(&msg.msg_iter), flags); + err = sock_recvmsg(sock, &msg, flags);
if (err >= 0 && addr != NULL) { err2 = move_addr_to_user(&address, @@@ -2073,7 -2069,7 +2069,7 @@@ static int ___sys_recvmsg(struct socke struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; unsigned long cmsg_ptr; - int total_len, len; + int len; ssize_t err;
/* kernel mode address */ @@@ -2091,7 -2087,6 +2087,6 @@@ err = copy_msghdr_from_user(msg_sys, msg, &uaddr, &iov); if (err < 0) return err; - total_len = iov_iter_count(&msg_sys->msg_iter);
cmsg_ptr = (unsigned long)msg_sys->msg_control; msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT); @@@ -2101,8 -2096,7 +2096,7 @@@
if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; - err = (nosec ? sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sys, - total_len, flags); + err = (nosec ? sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sys, flags); if (err < 0) goto out_freeiov; len = err;