The following commit has been merged in the master branch: commit 8d65b08debc7e62b2c6032d7fe7389d895b92cbc Merge: 5a0387a8a8efb90ae7fea1e2e5c62de3efa74691 5d15af6778b8e4ed1fd41b040283af278e7a9a72 Author: Linus Torvalds torvalds@linux-foundation.org Date: Tue May 2 16:40:27 2017 -0700
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Millar: "Here are some highlights from the 2065 networking commits that happened this development cycle:
1) XDP support for IXGBE (John Fastabend) and thunderx (Sunil Kowuri)
2) Add a generic XDP driver, so that anyone can test XDP even if they lack a networking device whose driver has explicit XDP support (me).
3) Sparc64 now has an eBPF JIT too (me)
4) Add a BPF program testing framework via BPF_PROG_TEST_RUN (Alexei Starovoitov)
5) Make netfitler network namespace teardown less expensive (Florian Westphal)
6) Add symmetric hashing support to nft_hash (Laura Garcia Liebana)
7) Implement NAPI and GRO in netvsc driver (Stephen Hemminger)
8) Support TC flower offload statistics in mlxsw (Arkadi Sharshevsky)
9) Multiqueue support in stmmac driver (Joao Pinto)
10) Remove TCP timewait recycling, it never really could possibly work well in the real world and timestamp randomization really zaps any hint of usability this feature had (Soheil Hassas Yeganeh)
11) Support level3 vs level4 ECMP route hashing in ipv4 (Nikolay Aleksandrov)
12) Add socket busy poll support to epoll (Sridhar Samudrala)
13) Netlink extended ACK support (Johannes Berg, Pablo Neira Ayuso, and several others)
14) IPSEC hw offload infrastructure (Steffen Klassert)"
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (2065 commits) tipc: refactor function tipc_sk_recv_stream() tipc: refactor function tipc_sk_recvmsg() net: thunderx: Optimize page recycling for XDP net: thunderx: Support for XDP header adjustment net: thunderx: Add support for XDP_TX net: thunderx: Add support for XDP_DROP net: thunderx: Add basic XDP support net: thunderx: Cleanup receive buffer allocation net: thunderx: Optimize CQE_TX handling net: thunderx: Optimize RBDR descriptor handling net: thunderx: Support for page recycling ipx: call ipxitf_put() in ioctl error path net: sched: add helpers to handle extended actions qed*: Fix issues in the ptp filter config implementation. qede: Fix concurrency issue in PTP Tx path processing. stmmac: Add support for SIMATIC IOT2000 platform net: hns: fix ethtool_get_strings overflow in hns driver tcp: fix wraparound issue in tcp_lp bpf, arm64: fix jit branch offset related to ldimm64 bpf, arm64: implement jiting of BPF_XADD ...
diff --combined Documentation/devicetree/bindings/vendor-prefixes.txt index 830c998,36b4d8a..b685d13 --- a/Documentation/devicetree/bindings/vendor-prefixes.txt +++ b/Documentation/devicetree/bindings/vendor-prefixes.txt @@@ -51,6 -51,7 +51,7 @@@ brcm Broadcom Corporatio buffalo Buffalo, Inc. calxeda Calxeda capella Capella Microsystems, Inc + cascoda Cascoda, Ltd. cavium Cavium, Inc. cdns Cadence Design Systems Inc. ceva Ceva, Inc. @@@ -265,7 -266,6 +266,7 @@@ sbs Smart Battery Syste schindler Schindler seagate Seagate Technology PLC semtech Semtech Corporation +sensirion Sensirion AG sgx SGX Sensortech sharp Sharp Corporation si-en Si-En Technology Ltd. diff --combined MAINTAINERS index 756da3f,28ea78b..b1036a7 --- a/MAINTAINERS +++ b/MAINTAINERS @@@ -896,12 -896,19 +896,19 @@@ F: arch/arm64/boot/dts/apm APPLIED MICRO (APM) X-GENE SOC ETHERNET DRIVER M: Iyappan Subramanian isubramanian@apm.com M: Keyur Chudgar kchudgar@apm.com + M: Quan Nguyen qnguyen@apm.com S: Supported F: drivers/net/ethernet/apm/xgene/ F: drivers/net/phy/mdio-xgene.c F: Documentation/devicetree/bindings/net/apm-xgene-enet.txt F: Documentation/devicetree/bindings/net/apm-xgene-mdio.txt
+ APPLIED MICRO (APM) X-GENE SOC ETHERNET (V2) DRIVER + M: Iyappan Subramanian isubramanian@apm.com + M: Keyur Chudgar kchudgar@apm.com + S: Supported + F: drivers/net/ethernet/apm/xgene-v2/ + APPLIED MICRO (APM) X-GENE SOC PMU M: Tai Nguyen ttnguyen@apm.com S: Supported @@@ -2327,6 -2334,21 +2334,6 @@@ S: Maintaine F: drivers/auxdisplay/ F: include/linux/cfag12864b.h
-AVR32 ARCHITECTURE -M: Haavard Skinnemoen hskinnemoen@gmail.com -M: Hans-Christian Egtvedt egtvedt@samfundet.no -W: http://www.atmel.com/products/AVR32/ -W: http://mirror.egtvedt.no/avr32linux.org/ -W: http://avrfreaks.net/ -S: Maintained -F: arch/avr32/ - -AVR32/AT32AP MACHINE SUPPORT -M: Haavard Skinnemoen hskinnemoen@gmail.com -M: Hans-Christian Egtvedt egtvedt@samfundet.no -S: Maintained -F: arch/avr32/mach-at32ap/ - AX.25 NETWORK LAYER M: Ralf Baechle ralf@linux-mips.org L: linux-hams@vger.kernel.org @@@ -2529,14 -2551,6 +2536,14 @@@ F: block F: kernel/trace/blktrace.c F: lib/sbitmap.c
+BFQ I/O SCHEDULER +M: Paolo Valente paolo.valente@linaro.org +M: Jens Axboe axboe@kernel.dk +L: linux-block@vger.kernel.org +S: Maintained +F: block/bfq-* +F: Documentation/block/bfq-iosched.txt + BLOCK2MTD DRIVER M: Joern Engel joern@lazybastard.org L: linux-mtd@lists.infradead.org @@@ -2944,6 -2958,15 +2951,15 @@@ W: http://www.linux-c6x.org/wiki/index. S: Maintained F: arch/c6x/
+ CA8210 IEEE-802.15.4 RADIO DRIVER + M: Harry Morris h.morris@cascoda.com + M: linuxdev@cascoda.com + L: linux-wpan@vger.kernel.org + W: https://github.com/Cascoda/ca8210-linux.git + S: Maintained + F: drivers/net/ieee802154/ca8210.c + F: Documentation/devicetree/bindings/net/ieee802154/ca8210.txt + CACHEFILES: FS-CACHE BACKEND FOR CACHING ON MOUNTED FILESYSTEMS M: David Howells dhowells@redhat.com L: linux-cachefs@redhat.com (moderated for non-subscribers) @@@ -3456,7 -3479,6 +3472,7 @@@ T: git git://git.kernel.org/pub/scm/lin T: git git://git.linaro.org/people/vireshk/linux.git (For ARM Updates) B: https://bugzilla.kernel.org F: Documentation/cpu-freq/ +F: Documentation/devicetree/bindings/cpufreq/ F: drivers/cpufreq/ F: include/linux/cpufreq.h F: tools/testing/selftests/cpufreq/ @@@ -4701,7 -4723,6 +4717,7 @@@ L: linux-edac@vger.kernel.or L: linux-mips@linux-mips.org S: Supported F: drivers/edac/octeon_edac* +F: drivers/edac/thunderx_edac*
EDAC-E752X M: Mark Gross mark.gross@intel.com @@@ -5415,23 -5436,6 +5431,23 @@@ F: fs/fuse F: include/uapi/linux/fuse.h F: Documentation/filesystems/fuse.txt
+FUTEX SUBSYSTEM +M: Thomas Gleixner tglx@linutronix.de +M: Ingo Molnar mingo@redhat.com +R: Peter Zijlstra peterz@infradead.org +R: Darren Hart dvhart@infradead.org +L: linux-kernel@vger.kernel.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking/core +S: Maintained +F: kernel/futex.c +F: kernel/futex_compat.c +F: include/asm-generic/futex.h +F: include/linux/futex.h +F: include/uapi/linux/futex.h +F: tools/testing/selftests/futex/ +F: tools/perf/bench/futex* +F: Documentation/*futex* + FUTURE DOMAIN TMC-16x0 SCSI DRIVER (16-bit) M: Rik Faith faith@cs.unc.edu L: linux-scsi@vger.kernel.org @@@ -6036,7 -6040,7 +6052,7 @@@ M: Sebastian Reichel <sre@kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-hsi.git S: Maintained F: Documentation/ABI/testing/sysfs-bus-hsi -F: Documentation/device-drivers/serial-interfaces.rst +F: Documentation/driver-api/hsi.rst F: drivers/hsi/ F: include/linux/hsi/ F: include/uapi/linux/hsi/ @@@ -6242,7 -6246,7 +6258,7 @@@ F: drivers/crypto/nx/nx_csbcpb. F: drivers/crypto/nx/nx_debugfs.h
IBM Power 842 compression accelerator -M: Dan Streetman ddstreet@ieee.org +M: Haren Myneni haren@us.ibm.com S: Supported F: drivers/crypto/nx/Makefile F: drivers/crypto/nx/Kconfig @@@ -7201,7 -7205,6 +7217,7 @@@ S: Supporte F: Documentation/s390/kvm.txt F: arch/s390/include/asm/kvm* F: arch/s390/kvm/ +F: arch/s390/mm/gmap.c
KERNEL VIRTUAL MACHINE (KVM) FOR ARM M: Christoffer Dall christoffer.dall@linaro.org @@@ -7884,7 -7887,7 +7900,7 @@@ S: Maintaine F: drivers/net/ethernet/marvell/mvneta.*
MARVELL MWIFIEX WIRELESS DRIVER - M: Amitkumar Karwar akarwar@marvell.com + M: Amitkumar Karwar amitkarwar@gmail.com M: Nishant Sarmukadam nishants@marvell.com M: Ganapathi Bhat gbhat@marvell.com M: Xinming Hu huxm@marvell.com @@@ -8829,12 -8832,12 +8845,12 @@@ F: net/core/flow. F: net/xfrm/ F: net/key/ F: net/ipv4/xfrm* - F: net/ipv4/esp4.c + F: net/ipv4/esp4* F: net/ipv4/ah4.c F: net/ipv4/ipcomp.c F: net/ipv4/ip_vti.c F: net/ipv6/xfrm* - F: net/ipv6/esp6.c + F: net/ipv6/esp6* F: net/ipv6/ah6.c F: net/ipv6/ipcomp6.c F: net/ipv6/ip6_vti.c @@@ -8888,8 -8891,6 +8904,6 @@@ S: Supporte F: drivers/net/ethernet/qlogic/netxen/
NFC SUBSYSTEM - M: Lauro Ramos Venancio lauro.venancio@openbossa.org - M: Aloisio Almeida Jr aloisio.almeida@openbossa.org M: Samuel Ortiz sameo@linux.intel.com L: linux-wireless@vger.kernel.org L: linux-nfc@lists.01.org (subscribers-only) @@@ -10897,16 -10898,6 +10911,16 @@@ W: http://www.ibm.com/developerworks/li S: Supported F: drivers/iommu/s390-iommu.c
+S390 VFIO-CCW DRIVER +M: Cornelia Huck cornelia.huck@de.ibm.com +M: Dong Jia Shi bjsdjshi@linux.vnet.ibm.com +L: linux-s390@vger.kernel.org +L: kvm@vger.kernel.org +S: Supported +F: drivers/s390/cio/vfio_ccw* +F: Documentation/s390/vfio-ccw.txt +F: include/uapi/linux/vfio_ccw.h + S3C24XX SD/MMC Driver M: Ben Dooks ben-linux@fluff.org L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) @@@ -10954,14 -10945,6 +10968,14 @@@ L: alsa-devel@alsa-project.org (moderat S: Supported F: sound/soc/samsung/
+SAMSUNG EXYNOS PSEUDO RANDOM NUMBER GENERATOR (RNG) DRIVER +M: Krzysztof Kozlowski krzk@kernel.org +L: linux-crypto@vger.kernel.org +L: linux-samsung-soc@vger.kernel.org +S: Maintained +F: drivers/crypto/exynos-rng.c +F: Documentation/devicetree/bindings/rng/samsung,exynos-rng4.txt + SAMSUNG FRAMEBUFFER DRIVER M: Jingoo Han jingoohan1@gmail.com L: linux-fbdev@vger.kernel.org @@@ -10986,14 -10969,6 +11000,14 @@@ F: Documentation/devicetree/bindings/re F: Documentation/devicetree/bindings/regulator/samsung,s5m*.txt F: Documentation/devicetree/bindings/clock/samsung,s2mps11.txt
+SAMSUNG S5P Security SubSystem (SSS) DRIVER +M: Krzysztof Kozlowski krzk@kernel.org +M: Vladimir Zapolskiy vz@mleia.com +L: linux-crypto@vger.kernel.org +L: linux-samsung-soc@vger.kernel.org +S: Maintained +F: drivers/crypto/s5p-sss.c + SAMSUNG S5P/EXYNOS4 SOC SERIES CAMERA SUBSYSTEM DRIVERS M: Kyungmin Park kyungmin.park@samsung.com M: Sylwester Nawrocki s.nawrocki@samsung.com @@@ -11125,6 -11100,12 +11139,12 @@@ F: include/linux/dma/dw. F: include/linux/platform_data/dma-dw.h F: drivers/dma/dw/
+ SYNOPSYS DESIGNWARE ENTERPRISE ETHERNET DRIVER + M: Jie Deng jiedeng@synopsys.com + L: netdev@vger.kernel.org + S: Supported + F: drivers/net/ethernet/synopsys/ + SYNOPSYS DESIGNWARE I2C DRIVER M: Jarkko Nikula jarkko.nikula@linux.intel.com R: Andy Shevchenko andriy.shevchenko@linux.intel.com @@@ -11163,7 -11144,6 +11183,7 @@@ F: drivers/power/supply/bq27xxx_battery TIMEKEEPING, CLOCKSOURCE CORE, NTP, ALARMTIMER M: John Stultz john.stultz@linaro.org M: Thomas Gleixner tglx@linutronix.de +R: Stephen Boyd sboyd@codeaurora.org L: linux-kernel@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core S: Supported @@@ -12229,12 -12209,19 +12249,19 @@@ F: Documentation/accounting/taskstats F: include/linux/taskstats* F: kernel/taskstats.c
- TC CLASSIFIER + TC subsystem M: Jamal Hadi Salim jhs@mojatatu.com + M: Cong Wang xiyou.wangcong@gmail.com + M: Jiri Pirko jiri@resnulli.us L: netdev@vger.kernel.org S: Maintained F: include/net/pkt_cls.h + F: include/net/pkt_sched.h + F: include/net/tc_act/ F: include/uapi/linux/pkt_cls.h + F: include/uapi/linux/pkt_sched.h + F: include/uapi/linux/tc_act/ + F: include/uapi/linux/tc_ematch/ F: net/sched/
TCP LOW PRIORITY MODULE @@@ -13330,8 -13317,11 +13357,11 @@@ L: netdev@vger.kernel.or S: Maintained F: include/linux/virtio_vsock.h F: include/uapi/linux/virtio_vsock.h + F: include/uapi/linux/vsockmon.h + F: net/vmw_vsock/af_vsock_tap.c F: net/vmw_vsock/virtio_transport_common.c F: net/vmw_vsock/virtio_transport.c + F: drivers/net/vsockmon.c F: drivers/vhost/vsock.c F: drivers/vhost/vsock.h
diff --combined arch/sparc/Kconfig index ed96869,d8c7736..58243b0 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@@ -31,7 -31,8 +31,8 @@@ config SPAR select ARCH_WANT_IPC_PARSE_VERSION select GENERIC_PCI_IOMAP select HAVE_NMI_WATCHDOG if SPARC64 - select HAVE_CBPF_JIT + select HAVE_CBPF_JIT if SPARC32 + select HAVE_EBPF_JIT if SPARC64 select HAVE_DEBUG_BUGVERBOSE select GENERIC_SMP_IDLE_THREAD select GENERIC_CLOCKEVENTS @@@ -42,6 -43,7 +43,6 @@@ select OLD_SIGSUSPEND select ARCH_HAS_SG_CHAIN select CPU_NO_EFFICIENT_FFS - select HAVE_ARCH_HARDENED_USERCOPY select LOCKDEP_SMALL if LOCKDEP select ARCH_WANT_RELAX_ORDER
diff --combined crypto/crypto_user.c index 89acaab,b575876..0dbe2be7 --- a/crypto/crypto_user.c +++ b/crypto/crypto_user.c @@@ -83,7 -83,7 +83,7 @@@ static int crypto_report_cipher(struct { struct crypto_report_cipher rcipher;
- strncpy(rcipher.type, "cipher", sizeof(rcipher.type)); + strlcpy(rcipher.type, "cipher", sizeof(rcipher.type));
rcipher.blocksize = alg->cra_blocksize; rcipher.min_keysize = alg->cra_cipher.cia_min_keysize; @@@ -102,7 -102,7 +102,7 @@@ static int crypto_report_comp(struct sk { struct crypto_report_comp rcomp;
- strncpy(rcomp.type, "compression", sizeof(rcomp.type)); + strlcpy(rcomp.type, "compression", sizeof(rcomp.type)); if (nla_put(skb, CRYPTOCFGA_REPORT_COMPRESS, sizeof(struct crypto_report_comp), &rcomp)) goto nla_put_failure; @@@ -116,7 -116,7 +116,7 @@@ static int crypto_report_acomp(struct s { struct crypto_report_acomp racomp;
- strncpy(racomp.type, "acomp", sizeof(racomp.type)); + strlcpy(racomp.type, "acomp", sizeof(racomp.type));
if (nla_put(skb, CRYPTOCFGA_REPORT_ACOMP, sizeof(struct crypto_report_acomp), &racomp)) @@@ -131,7 -131,7 +131,7 @@@ static int crypto_report_akcipher(struc { struct crypto_report_akcipher rakcipher;
- strncpy(rakcipher.type, "akcipher", sizeof(rakcipher.type)); + strlcpy(rakcipher.type, "akcipher", sizeof(rakcipher.type));
if (nla_put(skb, CRYPTOCFGA_REPORT_AKCIPHER, sizeof(struct crypto_report_akcipher), &rakcipher)) @@@ -146,7 -146,7 +146,7 @@@ static int crypto_report_kpp(struct sk_ { struct crypto_report_kpp rkpp;
- strncpy(rkpp.type, "kpp", sizeof(rkpp.type)); + strlcpy(rkpp.type, "kpp", sizeof(rkpp.type));
if (nla_put(skb, CRYPTOCFGA_REPORT_KPP, sizeof(struct crypto_report_kpp), &rkpp)) @@@ -160,10 -160,10 +160,10 @@@ nla_put_failure static int crypto_report_one(struct crypto_alg *alg, struct crypto_user_alg *ualg, struct sk_buff *skb) { - strncpy(ualg->cru_name, alg->cra_name, sizeof(ualg->cru_name)); - strncpy(ualg->cru_driver_name, alg->cra_driver_name, + strlcpy(ualg->cru_name, alg->cra_name, sizeof(ualg->cru_name)); + strlcpy(ualg->cru_driver_name, alg->cra_driver_name, sizeof(ualg->cru_driver_name)); - strncpy(ualg->cru_module_name, module_name(alg->cra_module), + strlcpy(ualg->cru_module_name, module_name(alg->cra_module), sizeof(ualg->cru_module_name));
ualg->cru_type = 0; @@@ -176,7 -176,7 +176,7 @@@ if (alg->cra_flags & CRYPTO_ALG_LARVAL) { struct crypto_report_larval rl;
- strncpy(rl.type, "larval", sizeof(rl.type)); + strlcpy(rl.type, "larval", sizeof(rl.type)); if (nla_put(skb, CRYPTOCFGA_REPORT_LARVAL, sizeof(struct crypto_report_larval), &rl)) goto nla_put_failure; @@@ -483,7 -483,8 +483,8 @@@ static const struct crypto_link [CRYPTO_MSG_DELRNG - CRYPTO_MSG_BASE] = { .doit = crypto_del_rng }, };
- static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) + static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct nlattr *attrs[CRYPTOCFGA_MAX+1]; const struct crypto_link *link; @@@ -522,7 -523,7 +523,7 @@@ }
err = nlmsg_parse(nlh, crypto_msg_min[type], attrs, CRYPTOCFGA_MAX, - crypto_policy); + crypto_policy, extack); if (err < 0) return err;
diff --combined drivers/block/nbd.c index 56efb04,d8a2356..9b482ba --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@@ -40,82 -40,49 +40,82 @@@ #include <asm/types.h>
#include <linux/nbd.h> +#include <linux/nbd-netlink.h> +#include <net/genetlink.h>
static DEFINE_IDR(nbd_index_idr); static DEFINE_MUTEX(nbd_index_mutex); +static int nbd_total_devices = 0;
struct nbd_sock { struct socket *sock; struct mutex tx_lock; struct request *pending; int sent; + bool dead; + int fallback_index; + int cookie; +}; + +struct recv_thread_args { + struct work_struct work; + struct nbd_device *nbd; + int index; +}; + +struct link_dead_args { + struct work_struct work; + int index; };
#define NBD_TIMEDOUT 0 #define NBD_DISCONNECT_REQUESTED 1 #define NBD_DISCONNECTED 2 -#define NBD_RUNNING 3 +#define NBD_HAS_PID_FILE 3 +#define NBD_HAS_CONFIG_REF 4 +#define NBD_BOUND 5 +#define NBD_DESTROY_ON_DISCONNECT 6
-struct nbd_device { +struct nbd_config { u32 flags; unsigned long runtime_flags; - struct nbd_sock **socks; - int magic; + u64 dead_conn_timeout;
- struct blk_mq_tag_set tag_set; - - struct mutex config_lock; - struct gendisk *disk; + struct nbd_sock **socks; int num_connections; + atomic_t live_connections; + wait_queue_head_t conn_wait; + atomic_t recv_threads; wait_queue_head_t recv_wq; loff_t blksize; loff_t bytesize; - - struct task_struct *task_recv; - struct task_struct *task_setup; - #if IS_ENABLED(CONFIG_DEBUG_FS) struct dentry *dbg_dir; #endif };
+struct nbd_device { + struct blk_mq_tag_set tag_set; + + int index; + refcount_t config_refs; + refcount_t refs; + struct nbd_config *config; + struct mutex config_lock; + struct gendisk *disk; + + struct list_head list; + struct task_struct *task_recv; + struct task_struct *task_setup; +}; + struct nbd_cmd { struct nbd_device *nbd; + int index; + int cookie; struct completion send_complete; + int status; };
#if IS_ENABLED(CONFIG_DEBUG_FS) @@@ -133,16 -100,18 +133,16 @@@ static int part_shift
static int nbd_dev_dbg_init(struct nbd_device *nbd); static void nbd_dev_dbg_close(struct nbd_device *nbd); - +static void nbd_config_put(struct nbd_device *nbd); +static void nbd_connect_reply(struct genl_info *info, int index); +static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info); +static void nbd_dead_link_work(struct work_struct *work);
static inline struct device *nbd_to_dev(struct nbd_device *nbd) { return disk_to_dev(nbd->disk); }
-static bool nbd_is_connected(struct nbd_device *nbd) -{ - return !!nbd->task_recv; -} - static const char *nbdcmd_to_ascii(int cmd) { switch (cmd) { @@@ -155,104 -124,44 +155,104 @@@ return "invalid"; }
-static int nbd_size_clear(struct nbd_device *nbd, struct block_device *bdev) +static ssize_t pid_show(struct device *dev, + struct device_attribute *attr, char *buf) { - if (bdev->bd_openers <= 1) - bd_set_size(bdev, 0); - set_capacity(nbd->disk, 0); - kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE); + struct gendisk *disk = dev_to_disk(dev); + struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
- return 0; + return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv)); +} + +static struct device_attribute pid_attr = { + .attr = { .name = "pid", .mode = S_IRUGO}, + .show = pid_show, +}; + +static void nbd_dev_remove(struct nbd_device *nbd) +{ + struct gendisk *disk = nbd->disk; + if (disk) { + del_gendisk(disk); + blk_cleanup_queue(disk->queue); + blk_mq_free_tag_set(&nbd->tag_set); + disk->private_data = NULL; + put_disk(disk); + } + kfree(nbd); +} + +static void nbd_put(struct nbd_device *nbd) +{ + if (refcount_dec_and_mutex_lock(&nbd->refs, + &nbd_index_mutex)) { + idr_remove(&nbd_index_idr, nbd->index); + mutex_unlock(&nbd_index_mutex); + nbd_dev_remove(nbd); + } +} + +static int nbd_disconnected(struct nbd_config *config) +{ + return test_bit(NBD_DISCONNECTED, &config->runtime_flags) || + test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags); +} + +static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock, + int notify) +{ + if (!nsock->dead && notify && !nbd_disconnected(nbd->config)) { + struct link_dead_args *args; + args = kmalloc(sizeof(struct link_dead_args), GFP_NOIO); + if (args) { + INIT_WORK(&args->work, nbd_dead_link_work); + args->index = nbd->index; + queue_work(system_wq, &args->work); + } + } + if (!nsock->dead) { + kernel_sock_shutdown(nsock->sock, SHUT_RDWR); + atomic_dec(&nbd->config->live_connections); + } + nsock->dead = true; + nsock->pending = NULL; + nsock->sent = 0; +} + +static void nbd_size_clear(struct nbd_device *nbd) +{ + if (nbd->config->bytesize) { + set_capacity(nbd->disk, 0); + kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE); + } }
-static void nbd_size_update(struct nbd_device *nbd, struct block_device *bdev) +static void nbd_size_update(struct nbd_device *nbd) { - blk_queue_logical_block_size(nbd->disk->queue, nbd->blksize); - blk_queue_physical_block_size(nbd->disk->queue, nbd->blksize); - bd_set_size(bdev, nbd->bytesize); - set_capacity(nbd->disk, nbd->bytesize >> 9); + struct nbd_config *config = nbd->config; + blk_queue_logical_block_size(nbd->disk->queue, config->blksize); + blk_queue_physical_block_size(nbd->disk->queue, config->blksize); + set_capacity(nbd->disk, config->bytesize >> 9); kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE); }
-static void nbd_size_set(struct nbd_device *nbd, struct block_device *bdev, - loff_t blocksize, loff_t nr_blocks) +static void nbd_size_set(struct nbd_device *nbd, loff_t blocksize, + loff_t nr_blocks) { - nbd->blksize = blocksize; - nbd->bytesize = blocksize * nr_blocks; - if (nbd_is_connected(nbd)) - nbd_size_update(nbd, bdev); + struct nbd_config *config = nbd->config; + config->blksize = blocksize; + config->bytesize = blocksize * nr_blocks; + nbd_size_update(nbd); }
-static void nbd_end_request(struct nbd_cmd *cmd) +static void nbd_complete_rq(struct request *req) { - struct nbd_device *nbd = cmd->nbd; - struct request *req = blk_mq_rq_from_pdu(cmd); - int error = req->errors ? -EIO : 0; + struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
- dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", cmd, - error ? "failed" : "done"); + dev_dbg(nbd_to_dev(cmd->nbd), "request %p: %s\n", cmd, + cmd->status ? "failed" : "done");
- blk_mq_complete_request(req, error); + blk_mq_end_request(req, cmd->status); }
/* @@@ -260,18 -169,17 +260,18 @@@ */ static void sock_shutdown(struct nbd_device *nbd) { + struct nbd_config *config = nbd->config; int i;
- if (nbd->num_connections == 0) + if (config->num_connections == 0) return; - if (test_and_set_bit(NBD_DISCONNECTED, &nbd->runtime_flags)) + if (test_and_set_bit(NBD_DISCONNECTED, &config->runtime_flags)) return;
- for (i = 0; i < nbd->num_connections; i++) { - struct nbd_sock *nsock = nbd->socks[i]; + for (i = 0; i < config->num_connections; i++) { + struct nbd_sock *nsock = config->socks[i]; mutex_lock(&nsock->tx_lock); - kernel_sock_shutdown(nsock->sock, SHUT_RDWR); + nbd_mark_nsock_dead(nbd, nsock, 0); mutex_unlock(&nsock->tx_lock); } dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n"); @@@ -282,58 -190,14 +282,58 @@@ static enum blk_eh_timer_return nbd_xmi { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); struct nbd_device *nbd = cmd->nbd; + struct nbd_config *config;
- dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n"); - set_bit(NBD_TIMEDOUT, &nbd->runtime_flags); - req->errors = -EIO; + if (!refcount_inc_not_zero(&nbd->config_refs)) { + cmd->status = -EIO; + return BLK_EH_HANDLED; + }
- mutex_lock(&nbd->config_lock); + /* If we are waiting on our dead timer then we could get timeout + * callbacks for our request. For this we just want to reset the timer + * and let the queue side take care of everything. + */ + if (!completion_done(&cmd->send_complete)) { + nbd_config_put(nbd); + return BLK_EH_RESET_TIMER; + } + config = nbd->config; + + if (config->num_connections > 1) { + dev_err_ratelimited(nbd_to_dev(nbd), + "Connection timed out, retrying\n"); + /* + * Hooray we have more connections, requeue this IO, the submit + * path will put it on a real connection. + */ + if (config->socks && config->num_connections > 1) { + if (cmd->index < config->num_connections) { + struct nbd_sock *nsock = + config->socks[cmd->index]; + mutex_lock(&nsock->tx_lock); + /* We can have multiple outstanding requests, so + * we don't want to mark the nsock dead if we've + * already reconnected with a new socket, so + * only mark it dead if its the same socket we + * were sent out on. + */ + if (cmd->cookie == nsock->cookie) + nbd_mark_nsock_dead(nbd, nsock, 1); + mutex_unlock(&nsock->tx_lock); + } + blk_mq_requeue_request(req, true); + nbd_config_put(nbd); + return BLK_EH_NOT_HANDLED; + } + } else { + dev_err_ratelimited(nbd_to_dev(nbd), + "Connection timed out\n"); + } + set_bit(NBD_TIMEDOUT, &config->runtime_flags); + cmd->status = -EIO; sock_shutdown(nbd); - mutex_unlock(&nbd->config_lock); + nbd_config_put(nbd); + return BLK_EH_HANDLED; }
@@@ -343,8 -207,7 +343,8 @@@ static int sock_xmit(struct nbd_device *nbd, int index, int send, struct iov_iter *iter, int msg_flags, int *sent) { - struct socket *sock = nbd->socks[index]->sock; + struct nbd_config *config = nbd->config; + struct socket *sock = config->socks[index]->sock; int result; struct msghdr msg; unsigned long pflags = current->flags; @@@ -381,7 -244,7 +381,7 @@@ *sent += result; } while (msg_data_left(&msg));
- tsk_restore_flags(current, pflags, PF_MEMALLOC); + current_restore_flags(pflags, PF_MEMALLOC);
return result; } @@@ -390,8 -253,7 +390,8 @@@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) { struct request *req = blk_mq_rq_from_pdu(cmd); - struct nbd_sock *nsock = nbd->socks[index]; + struct nbd_config *config = nbd->config; + struct nbd_sock *nsock = config->socks[index]; int result; struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)}; struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)}; @@@ -422,7 -284,7 +422,7 @@@ }
if (rq_data_dir(req) == WRITE && - (nbd->flags & NBD_FLAG_READ_ONLY)) { + (config->flags & NBD_FLAG_READ_ONLY)) { dev_err_ratelimited(disk_to_dev(nbd->disk), "Write on read-only\n"); return -EIO; @@@ -439,8 -301,6 +439,8 @@@ } iov_iter_advance(&from, sent); } + cmd->index = index; + cmd->cookie = nsock->cookie; request.type = htonl(type); if (type != NBD_CMD_FLUSH) { request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9); @@@ -468,7 -328,7 +468,7 @@@ } dev_err_ratelimited(disk_to_dev(nbd->disk), "Send control failed (result %d)\n", result); - return -EIO; + return -EAGAIN; } send_pages: if (type != NBD_CMD_WRITE) @@@ -510,7 -370,7 +510,7 @@@ dev_err(disk_to_dev(nbd->disk), "Send data failed (result %d)\n", result); - return -EIO; + return -EAGAIN; } /* * The completion might already have come in, @@@ -532,7 -392,6 +532,7 @@@ out /* NULL returned = something went wrong, inform userspace */ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) { + struct nbd_config *config = nbd->config; int result; struct nbd_reply reply; struct nbd_cmd *cmd; @@@ -546,7 -405,8 +546,7 @@@ iov_iter_kvec(&to, READ | ITER_KVEC, &iov, 1, sizeof(reply)); result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL); if (result <= 0) { - if (!test_bit(NBD_DISCONNECTED, &nbd->runtime_flags) && - !test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags)) + if (!nbd_disconnected(config)) dev_err(disk_to_dev(nbd->disk), "Receive control failed (result %d)\n", result); return ERR_PTR(result); @@@ -573,7 -433,7 +573,7 @@@ if (ntohl(reply.error)) { dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n", ntohl(reply.error)); - req->errors = -EIO; + cmd->status = -EIO; return cmd; }
@@@ -589,19 -449,8 +589,19 @@@ if (result <= 0) { dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n", result); - req->errors = -EIO; - return cmd; + /* + * If we've disconnected or we only have 1 + * connection then we need to make sure we + * complete this request, otherwise error out + * and let the timeout stuff handle resubmitting + * this request onto another connection. + */ + if (nbd_disconnected(config) || + config->num_connections <= 1) { + cmd->status = -EIO; + return cmd; + } + return ERR_PTR(-EIO); } dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n", cmd, bvec.bv_len); @@@ -613,34 -462,54 +613,34 @@@ return cmd; }
-static ssize_t pid_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct gendisk *disk = dev_to_disk(dev); - struct nbd_device *nbd = (struct nbd_device *)disk->private_data; - - return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv)); -} - -static struct device_attribute pid_attr = { - .attr = { .name = "pid", .mode = S_IRUGO}, - .show = pid_show, -}; - -struct recv_thread_args { - struct work_struct work; - struct nbd_device *nbd; - int index; -}; - static void recv_work(struct work_struct *work) { struct recv_thread_args *args = container_of(work, struct recv_thread_args, work); struct nbd_device *nbd = args->nbd; + struct nbd_config *config = nbd->config; struct nbd_cmd *cmd; int ret = 0;
- BUG_ON(nbd->magic != NBD_MAGIC); while (1) { cmd = nbd_read_stat(nbd, args->index); if (IS_ERR(cmd)) { + struct nbd_sock *nsock = config->socks[args->index]; + + mutex_lock(&nsock->tx_lock); + nbd_mark_nsock_dead(nbd, nsock, 1); + mutex_unlock(&nsock->tx_lock); ret = PTR_ERR(cmd); break; }
- nbd_end_request(cmd); + blk_mq_complete_request(blk_mq_rq_from_pdu(cmd)); } - - /* - * We got an error, shut everybody down if this wasn't the result of a - * disconnect request. - */ - if (ret && !test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags)) - sock_shutdown(nbd); - atomic_dec(&nbd->recv_threads); - wake_up(&nbd->recv_wq); + atomic_dec(&config->recv_threads); + wake_up(&config->recv_wq); + nbd_config_put(nbd); + kfree(args); }
static void nbd_clear_req(struct request *req, void *data, bool reserved) @@@ -650,119 -519,47 +650,119 @@@ if (!blk_mq_request_started(req)) return; cmd = blk_mq_rq_to_pdu(req); - req->errors = -EIO; - nbd_end_request(cmd); + cmd->status = -EIO; + blk_mq_complete_request(req); }
static void nbd_clear_que(struct nbd_device *nbd) { - BUG_ON(nbd->magic != NBD_MAGIC); - + blk_mq_stop_hw_queues(nbd->disk->queue); blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL); + blk_mq_start_hw_queues(nbd->disk->queue); dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n"); }
+static int find_fallback(struct nbd_device *nbd, int index) +{ + struct nbd_config *config = nbd->config; + int new_index = -1; + struct nbd_sock *nsock = config->socks[index]; + int fallback = nsock->fallback_index; + + if (test_bit(NBD_DISCONNECTED, &config->runtime_flags)) + return new_index; + + if (config->num_connections <= 1) { + dev_err_ratelimited(disk_to_dev(nbd->disk), + "Attempted send on invalid socket\n"); + return new_index; + } + + if (fallback >= 0 && fallback < config->num_connections && + !config->socks[fallback]->dead) + return fallback; + + if (nsock->fallback_index < 0 || + nsock->fallback_index >= config->num_connections || + config->socks[nsock->fallback_index]->dead) { + int i; + for (i = 0; i < config->num_connections; i++) { + if (i == index) + continue; + if (!config->socks[i]->dead) { + new_index = i; + break; + } + } + nsock->fallback_index = new_index; + if (new_index < 0) { + dev_err_ratelimited(disk_to_dev(nbd->disk), + "Dead connection, failed to find a fallback\n"); + return new_index; + } + } + new_index = nsock->fallback_index; + return new_index; +} + +static int wait_for_reconnect(struct nbd_device *nbd) +{ + struct nbd_config *config = nbd->config; + if (!config->dead_conn_timeout) + return 0; + if (test_bit(NBD_DISCONNECTED, &config->runtime_flags)) + return 0; + wait_event_interruptible_timeout(config->conn_wait, + atomic_read(&config->live_connections), + config->dead_conn_timeout); + return atomic_read(&config->live_connections); +}
static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) { struct request *req = blk_mq_rq_from_pdu(cmd); struct nbd_device *nbd = cmd->nbd; + struct nbd_config *config; struct nbd_sock *nsock; int ret;
- if (index >= nbd->num_connections) { + if (!refcount_inc_not_zero(&nbd->config_refs)) { dev_err_ratelimited(disk_to_dev(nbd->disk), - "Attempted send on invalid socket\n"); + "Socks array is empty\n"); return -EINVAL; } + config = nbd->config;
- if (test_bit(NBD_DISCONNECTED, &nbd->runtime_flags)) { + if (index >= config->num_connections) { dev_err_ratelimited(disk_to_dev(nbd->disk), - "Attempted send on closed socket\n"); + "Attempted send on invalid socket\n"); + nbd_config_put(nbd); return -EINVAL; } - - req->errors = 0; - - nsock = nbd->socks[index]; + cmd->status = 0; +again: + nsock = config->socks[index]; mutex_lock(&nsock->tx_lock); - if (unlikely(!nsock->sock)) { + if (nsock->dead) { + int old_index = index; + index = find_fallback(nbd, index); mutex_unlock(&nsock->tx_lock); - dev_err_ratelimited(disk_to_dev(nbd->disk), - "Attempted send on closed socket\n"); - return -EINVAL; + if (index < 0) { + if (wait_for_reconnect(nbd)) { + index = old_index; + goto again; + } + /* All the sockets should already be down at this point, + * we just want to make sure that DISCONNECTED is set so + * any requests that come in that were queue'ed waiting + * for the reconnect timer don't trigger the timer again + * and instead just error out. + */ + sock_shutdown(nbd); + nbd_config_put(nbd); + return -EIO; + } + goto again; }
/* Handle the case that we have a pending request that was partially @@@ -775,21 -572,9 +775,21 @@@ ret = 0; goto out; } + /* + * Some failures are related to the link going down, so anything that + * returns EAGAIN can be retried on a different socket. + */ ret = nbd_send_cmd(nbd, cmd, index); + if (ret == -EAGAIN) { + dev_err_ratelimited(disk_to_dev(nbd->disk), + "Request send failed trying another connection\n"); + nbd_mark_nsock_dead(nbd, nsock, 1); + mutex_unlock(&nsock->tx_lock); + goto again; + } out: mutex_unlock(&nsock->tx_lock); + nbd_config_put(nbd); return ret; }
@@@ -826,10 -611,9 +826,10 @@@ static int nbd_queue_rq(struct blk_mq_h return ret; }
-static int nbd_add_socket(struct nbd_device *nbd, struct block_device *bdev, - unsigned long arg) +static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg, + bool netlink) { + struct nbd_config *config = nbd->config; struct socket *sock; struct nbd_sock **socks; struct nbd_sock *nsock; @@@ -839,107 -623,43 +839,107 @@@ if (!sock) return err;
- if (!nbd->task_setup) + if (!netlink && !nbd->task_setup && + !test_bit(NBD_BOUND, &config->runtime_flags)) nbd->task_setup = current; - if (nbd->task_setup != current) { + + if (!netlink && + (nbd->task_setup != current || + test_bit(NBD_BOUND, &config->runtime_flags))) { dev_err(disk_to_dev(nbd->disk), "Device being setup by another task"); - return -EINVAL; + sockfd_put(sock); + return -EBUSY; }
- socks = krealloc(nbd->socks, (nbd->num_connections + 1) * + socks = krealloc(config->socks, (config->num_connections + 1) * sizeof(struct nbd_sock *), GFP_KERNEL); - if (!socks) + if (!socks) { + sockfd_put(sock); return -ENOMEM; + } nsock = kzalloc(sizeof(struct nbd_sock), GFP_KERNEL); - if (!nsock) + if (!nsock) { + sockfd_put(sock); return -ENOMEM; + }
- nbd->socks = socks; + config->socks = socks;
+ nsock->fallback_index = -1; + nsock->dead = false; mutex_init(&nsock->tx_lock); nsock->sock = sock; nsock->pending = NULL; nsock->sent = 0; - socks[nbd->num_connections++] = nsock; + nsock->cookie = 0; + socks[config->num_connections++] = nsock; + atomic_inc(&config->live_connections);
- if (max_part) - bdev->bd_invalidated = 1; return 0; }
+static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg) +{ + struct nbd_config *config = nbd->config; + struct socket *sock, *old; + struct recv_thread_args *args; + int i; + int err; + + sock = sockfd_lookup(arg, &err); + if (!sock) + return err; + + args = kzalloc(sizeof(*args), GFP_KERNEL); + if (!args) { + sockfd_put(sock); + return -ENOMEM; + } + + for (i = 0; i < config->num_connections; i++) { + struct nbd_sock *nsock = config->socks[i]; + + if (!nsock->dead) + continue; + + mutex_lock(&nsock->tx_lock); + if (!nsock->dead) { + mutex_unlock(&nsock->tx_lock); + continue; + } + sk_set_memalloc(sock->sk); + atomic_inc(&config->recv_threads); + refcount_inc(&nbd->config_refs); + old = nsock->sock; + nsock->fallback_index = -1; + nsock->sock = sock; + nsock->dead = false; + INIT_WORK(&args->work, recv_work); + args->index = i; + args->nbd = nbd; + nsock->cookie++; + mutex_unlock(&nsock->tx_lock); + sockfd_put(old); + + /* We take the tx_mutex in an error path in the recv_work, so we + * need to queue_work outside of the tx_mutex. + */ + queue_work(recv_workqueue, &args->work); + + atomic_inc(&config->live_connections); + wake_up(&config->conn_wait); + return 0; + } + sockfd_put(sock); + kfree(args); + return -ENOSPC; +} + /* Reset all properties of an NBD device */ static void nbd_reset(struct nbd_device *nbd) { - nbd->runtime_flags = 0; - nbd->blksize = 1024; - nbd->bytesize = 0; - set_capacity(nbd->disk, 0); - nbd->flags = 0; + nbd->config = NULL; nbd->tag_set.timeout = 0; queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); } @@@ -948,23 -668,21 +948,23 @@@ static void nbd_bdev_reset(struct block { if (bdev->bd_openers > 1) return; - set_device_ro(bdev, false); - bdev->bd_inode->i_size = 0; + bd_set_size(bdev, 0); if (max_part > 0) { blkdev_reread_part(bdev); bdev->bd_invalidated = 1; } }
-static void nbd_parse_flags(struct nbd_device *nbd, struct block_device *bdev) +static void nbd_parse_flags(struct nbd_device *nbd) { - if (nbd->flags & NBD_FLAG_READ_ONLY) - set_device_ro(bdev, true); - if (nbd->flags & NBD_FLAG_SEND_TRIM) + struct nbd_config *config = nbd->config; + if (config->flags & NBD_FLAG_READ_ONLY) + set_disk_ro(nbd->disk, true); + else + set_disk_ro(nbd->disk, false); + if (config->flags & NBD_FLAG_SEND_TRIM) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); - if (nbd->flags & NBD_FLAG_SEND_FLUSH) + if (config->flags & NBD_FLAG_SEND_FLUSH) blk_queue_write_cache(nbd->disk->queue, true, false); else blk_queue_write_cache(nbd->disk->queue, false, false); @@@ -972,7 -690,6 +972,7 @@@
static void send_disconnects(struct nbd_device *nbd) { + struct nbd_config *config = nbd->config; struct nbd_request request = { .magic = htonl(NBD_REQUEST_MAGIC), .type = htonl(NBD_CMD_DISC), @@@ -981,7 -698,7 +981,7 @@@ struct iov_iter from; int i, ret;
- for (i = 0; i < nbd->num_connections; i++) { + for (i = 0; i < config->num_connections; i++) { iov_iter_kvec(&from, WRITE | ITER_KVEC, &iov, 1, sizeof(request)); ret = sock_xmit(nbd, i, 1, &from, 0, NULL); if (ret <= 0) @@@ -990,162 -707,145 +990,162 @@@ } }
-static int nbd_disconnect(struct nbd_device *nbd, struct block_device *bdev) +static int nbd_disconnect(struct nbd_device *nbd) { - dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n"); - if (!nbd->socks) - return -EINVAL; - - mutex_unlock(&nbd->config_lock); - fsync_bdev(bdev); - mutex_lock(&nbd->config_lock); - - /* Check again after getting mutex back. */ - if (!nbd->socks) - return -EINVAL; + struct nbd_config *config = nbd->config;
+ dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n"); if (!test_and_set_bit(NBD_DISCONNECT_REQUESTED, - &nbd->runtime_flags)) + &config->runtime_flags)) send_disconnects(nbd); return 0; }
-static int nbd_clear_sock(struct nbd_device *nbd, struct block_device *bdev) +static void nbd_clear_sock(struct nbd_device *nbd) { sock_shutdown(nbd); nbd_clear_que(nbd); + nbd->task_setup = NULL; +}
- __invalidate_device(bdev, true); - nbd_bdev_reset(bdev); - /* - * We want to give the run thread a chance to wait for everybody - * to clean up and then do it's own cleanup. - */ - if (!test_bit(NBD_RUNNING, &nbd->runtime_flags) && - nbd->num_connections) { - int i; - - for (i = 0; i < nbd->num_connections; i++) { - sockfd_put(nbd->socks[i]->sock); - kfree(nbd->socks[i]); +static void nbd_config_put(struct nbd_device *nbd) +{ + if (refcount_dec_and_mutex_lock(&nbd->config_refs, + &nbd->config_lock)) { + struct nbd_config *config = nbd->config; + nbd_dev_dbg_close(nbd); + nbd_size_clear(nbd); + if (test_and_clear_bit(NBD_HAS_PID_FILE, + &config->runtime_flags)) + device_remove_file(disk_to_dev(nbd->disk), &pid_attr); + nbd->task_recv = NULL; + nbd_clear_sock(nbd); + if (config->num_connections) { + int i; + for (i = 0; i < config->num_connections; i++) { + sockfd_put(config->socks[i]->sock); + kfree(config->socks[i]); + } + kfree(config->socks); } - kfree(nbd->socks); - nbd->socks = NULL; - nbd->num_connections = 0; - } - nbd->task_setup = NULL; + nbd_reset(nbd);
- return 0; + mutex_unlock(&nbd->config_lock); + nbd_put(nbd); + module_put(THIS_MODULE); + } }
-static int nbd_start_device(struct nbd_device *nbd, struct block_device *bdev) +static int nbd_start_device(struct nbd_device *nbd) { - struct recv_thread_args *args; - int num_connections = nbd->num_connections; + struct nbd_config *config = nbd->config; + int num_connections = config->num_connections; int error = 0, i;
if (nbd->task_recv) return -EBUSY; - if (!nbd->socks) + if (!config->socks) return -EINVAL; if (num_connections > 1 && - !(nbd->flags & NBD_FLAG_CAN_MULTI_CONN)) { + !(config->flags & NBD_FLAG_CAN_MULTI_CONN)) { dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n"); - error = -EINVAL; - goto out_err; + return -EINVAL; }
- set_bit(NBD_RUNNING, &nbd->runtime_flags); - blk_mq_update_nr_hw_queues(&nbd->tag_set, nbd->num_connections); - args = kcalloc(num_connections, sizeof(*args), GFP_KERNEL); - if (!args) { - error = -ENOMEM; - goto out_err; - } + blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections); nbd->task_recv = current; - mutex_unlock(&nbd->config_lock);
- nbd_parse_flags(nbd, bdev); + nbd_parse_flags(nbd);
error = device_create_file(disk_to_dev(nbd->disk), &pid_attr); if (error) { dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n"); - goto out_recv; + return error; } - - nbd_size_update(nbd, bdev); + set_bit(NBD_HAS_PID_FILE, &config->runtime_flags);
nbd_dev_dbg_init(nbd); for (i = 0; i < num_connections; i++) { - sk_set_memalloc(nbd->socks[i]->sock->sk); - atomic_inc(&nbd->recv_threads); - INIT_WORK(&args[i].work, recv_work); - args[i].nbd = nbd; - args[i].index = i; - queue_work(recv_workqueue, &args[i].work); - } - wait_event_interruptible(nbd->recv_wq, - atomic_read(&nbd->recv_threads) == 0); - for (i = 0; i < num_connections; i++) - flush_work(&args[i].work); - nbd_dev_dbg_close(nbd); - nbd_size_clear(nbd, bdev); - device_remove_file(disk_to_dev(nbd->disk), &pid_attr); -out_recv: - mutex_lock(&nbd->config_lock); - nbd->task_recv = NULL; -out_err: - clear_bit(NBD_RUNNING, &nbd->runtime_flags); - nbd_clear_sock(nbd, bdev); + struct recv_thread_args *args;
+ args = kzalloc(sizeof(*args), GFP_KERNEL); + if (!args) { + sock_shutdown(nbd); + return -ENOMEM; + } + sk_set_memalloc(config->socks[i]->sock->sk); + atomic_inc(&config->recv_threads); + refcount_inc(&nbd->config_refs); + INIT_WORK(&args->work, recv_work); + args->nbd = nbd; + args->index = i; + queue_work(recv_workqueue, &args->work); + } + return error; +} + +static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *bdev) +{ + struct nbd_config *config = nbd->config; + int ret; + + ret = nbd_start_device(nbd); + if (ret) + return ret; + + bd_set_size(bdev, config->bytesize); + if (max_part) + bdev->bd_invalidated = 1; + mutex_unlock(&nbd->config_lock); + ret = wait_event_interruptible(config->recv_wq, + atomic_read(&config->recv_threads) == 0); + if (ret) + sock_shutdown(nbd); + mutex_lock(&nbd->config_lock); + bd_set_size(bdev, 0); /* user requested, ignore socket errors */ - if (test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags)) - error = 0; - if (test_bit(NBD_TIMEDOUT, &nbd->runtime_flags)) - error = -ETIMEDOUT; + if (test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags)) + ret = 0; + if (test_bit(NBD_TIMEDOUT, &config->runtime_flags)) + ret = -ETIMEDOUT; + return ret; +}
- nbd_reset(nbd); - return error; +static void nbd_clear_sock_ioctl(struct nbd_device *nbd, + struct block_device *bdev) +{ + sock_shutdown(nbd); + kill_bdev(bdev); + nbd_bdev_reset(bdev); + if (test_and_clear_bit(NBD_HAS_CONFIG_REF, + &nbd->config->runtime_flags)) + nbd_config_put(nbd); }
/* Must be called with config_lock held */ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, unsigned int cmd, unsigned long arg) { + struct nbd_config *config = nbd->config; + switch (cmd) { case NBD_DISCONNECT: - return nbd_disconnect(nbd, bdev); + return nbd_disconnect(nbd); case NBD_CLEAR_SOCK: - return nbd_clear_sock(nbd, bdev); + nbd_clear_sock_ioctl(nbd, bdev); + return 0; case NBD_SET_SOCK: - return nbd_add_socket(nbd, bdev, arg); + return nbd_add_socket(nbd, arg, false); case NBD_SET_BLKSIZE: - nbd_size_set(nbd, bdev, arg, - div_s64(nbd->bytesize, arg)); + nbd_size_set(nbd, arg, + div_s64(config->bytesize, arg)); return 0; case NBD_SET_SIZE: - nbd_size_set(nbd, bdev, nbd->blksize, - div_s64(arg, nbd->blksize)); + nbd_size_set(nbd, config->blksize, + div_s64(arg, config->blksize)); return 0; case NBD_SET_SIZE_BLOCKS: - nbd_size_set(nbd, bdev, nbd->blksize, arg); + nbd_size_set(nbd, config->blksize, arg); return 0; case NBD_SET_TIMEOUT: if (arg) { @@@ -1155,10 -855,10 +1155,10 @@@ return 0;
case NBD_SET_FLAGS: - nbd->flags = arg; + config->flags = arg; return 0; case NBD_DO_IT: - return nbd_start_device(nbd, bdev); + return nbd_start_device_ioctl(nbd, bdev); case NBD_CLEAR_QUE: /* * This is for compatibility only. The queue is always cleared @@@ -1179,92 -879,23 +1179,92 @@@ static int nbd_ioctl(struct block_devic unsigned int cmd, unsigned long arg) { struct nbd_device *nbd = bdev->bd_disk->private_data; - int error; + struct nbd_config *config = nbd->config; + int error = -EINVAL;
if (!capable(CAP_SYS_ADMIN)) return -EPERM;
- BUG_ON(nbd->magic != NBD_MAGIC); - mutex_lock(&nbd->config_lock); - error = __nbd_ioctl(bdev, nbd, cmd, arg); - mutex_unlock(&nbd->config_lock);
+ /* Don't allow ioctl operations on a nbd device that was created with + * netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine. + */ + if (!test_bit(NBD_BOUND, &config->runtime_flags) || + (cmd == NBD_DISCONNECT || cmd == NBD_CLEAR_SOCK)) + error = __nbd_ioctl(bdev, nbd, cmd, arg); + else + dev_err(nbd_to_dev(nbd), "Cannot use ioctl interface on a netlink controlled device.\n"); + mutex_unlock(&nbd->config_lock); return error; }
+static struct nbd_config *nbd_alloc_config(void) +{ + struct nbd_config *config; + + config = kzalloc(sizeof(struct nbd_config), GFP_NOFS); + if (!config) + return NULL; + atomic_set(&config->recv_threads, 0); + init_waitqueue_head(&config->recv_wq); + init_waitqueue_head(&config->conn_wait); + config->blksize = 1024; + atomic_set(&config->live_connections, 0); + try_module_get(THIS_MODULE); + return config; +} + +static int nbd_open(struct block_device *bdev, fmode_t mode) +{ + struct nbd_device *nbd; + int ret = 0; + + mutex_lock(&nbd_index_mutex); + nbd = bdev->bd_disk->private_data; + if (!nbd) { + ret = -ENXIO; + goto out; + } + if (!refcount_inc_not_zero(&nbd->refs)) { + ret = -ENXIO; + goto out; + } + if (!refcount_inc_not_zero(&nbd->config_refs)) { + struct nbd_config *config; + + mutex_lock(&nbd->config_lock); + if (refcount_inc_not_zero(&nbd->config_refs)) { + mutex_unlock(&nbd->config_lock); + goto out; + } + config = nbd->config = nbd_alloc_config(); + if (!config) { + ret = -ENOMEM; + mutex_unlock(&nbd->config_lock); + goto out; + } + refcount_set(&nbd->config_refs, 1); + refcount_inc(&nbd->refs); + mutex_unlock(&nbd->config_lock); + } +out: + mutex_unlock(&nbd_index_mutex); + return ret; +} + +static void nbd_release(struct gendisk *disk, fmode_t mode) +{ + struct nbd_device *nbd = disk->private_data; + nbd_config_put(nbd); + nbd_put(nbd); +} + static const struct block_device_operations nbd_fops = { .owner = THIS_MODULE, + .open = nbd_open, + .release = nbd_release, .ioctl = nbd_ioctl, .compat_ioctl = nbd_ioctl, }; @@@ -1296,7 -927,7 +1296,7 @@@ static const struct file_operations nbd static int nbd_dbg_flags_show(struct seq_file *s, void *unused) { struct nbd_device *nbd = s->private; - u32 flags = nbd->flags; + u32 flags = nbd->config->flags;
seq_printf(s, "Hex: 0x%08x\n\n", flags);
@@@ -1329,7 -960,6 +1329,7 @@@ static const struct file_operations nbd static int nbd_dev_dbg_init(struct nbd_device *nbd) { struct dentry *dir; + struct nbd_config *config = nbd->config;
if (!nbd_dbg_dir) return -EIO; @@@ -1340,12 -970,12 +1340,12 @@@ nbd_name(nbd)); return -EIO; } - nbd->dbg_dir = dir; + config->dbg_dir = dir;
debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops); - debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize); + debugfs_create_u64("size_bytes", 0444, dir, &config->bytesize); debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout); - debugfs_create_u64("blocksize", 0444, dir, &nbd->blksize); + debugfs_create_u64("blocksize", 0444, dir, &config->blksize); debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);
return 0; @@@ -1353,7 -983,7 +1353,7 @@@
static void nbd_dev_dbg_close(struct nbd_device *nbd) { - debugfs_remove_recursive(nbd->dbg_dir); + debugfs_remove_recursive(nbd->config->dbg_dir); }
static int nbd_dbg_init(void) @@@ -1405,13 -1035,25 +1405,13 @@@ static int nbd_init_request(void *data return 0; }
-static struct blk_mq_ops nbd_mq_ops = { +static const struct blk_mq_ops nbd_mq_ops = { .queue_rq = nbd_queue_rq, + .complete = nbd_complete_rq, .init_request = nbd_init_request, .timeout = nbd_xmit_timeout, };
-static void nbd_dev_remove(struct nbd_device *nbd) -{ - struct gendisk *disk = nbd->disk; - nbd->magic = 0; - if (disk) { - del_gendisk(disk); - blk_cleanup_queue(disk->queue); - blk_mq_free_tag_set(&nbd->tag_set); - put_disk(disk); - } - kfree(nbd); -} - static int nbd_dev_add(int index) { struct nbd_device *nbd; @@@ -1440,7 -1082,6 +1440,7 @@@ if (err < 0) goto out_free_disk;
+ nbd->index = index; nbd->disk = disk; nbd->tag_set.ops = &nbd_mq_ops; nbd->tag_set.nr_hw_queues = 1; @@@ -1469,23 -1110,20 +1469,23 @@@ queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, disk->queue); disk->queue->limits.discard_granularity = 512; blk_queue_max_discard_sectors(disk->queue, UINT_MAX); - disk->queue->limits.discard_zeroes_data = 0; + blk_queue_max_segment_size(disk->queue, UINT_MAX); + blk_queue_max_segments(disk->queue, USHRT_MAX); blk_queue_max_hw_sectors(disk->queue, 65536); disk->queue->limits.max_sectors = 256;
- nbd->magic = NBD_MAGIC; mutex_init(&nbd->config_lock); + refcount_set(&nbd->config_refs, 0); + refcount_set(&nbd->refs, 1); + INIT_LIST_HEAD(&nbd->list); disk->major = NBD_MAJOR; disk->first_minor = index << part_shift; disk->fops = &nbd_fops; disk->private_data = nbd; sprintf(disk->disk_name, "nbd%d", index); - init_waitqueue_head(&nbd->recv_wq); nbd_reset(nbd); add_disk(disk); + nbd_total_devices++; return index;
out_free_tags: @@@ -1500,535 -1138,10 +1500,535 @@@ out return err; }
-/* - * And here should be modules and kernel interface - * (Just smiley confuses emacs :-) +static int find_free_cb(int id, void *ptr, void *data) +{ + struct nbd_device *nbd = ptr; + struct nbd_device **found = data; + + if (!refcount_read(&nbd->config_refs)) { + *found = nbd; + return 1; + } + return 0; +} + +/* Netlink interface. */ +static struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = { + [NBD_ATTR_INDEX] = { .type = NLA_U32 }, + [NBD_ATTR_SIZE_BYTES] = { .type = NLA_U64 }, + [NBD_ATTR_BLOCK_SIZE_BYTES] = { .type = NLA_U64 }, + [NBD_ATTR_TIMEOUT] = { .type = NLA_U64 }, + [NBD_ATTR_SERVER_FLAGS] = { .type = NLA_U64 }, + [NBD_ATTR_CLIENT_FLAGS] = { .type = NLA_U64 }, + [NBD_ATTR_SOCKETS] = { .type = NLA_NESTED}, + [NBD_ATTR_DEAD_CONN_TIMEOUT] = { .type = NLA_U64 }, + [NBD_ATTR_DEVICE_LIST] = { .type = NLA_NESTED}, +}; + +static struct nla_policy nbd_sock_policy[NBD_SOCK_MAX + 1] = { + [NBD_SOCK_FD] = { .type = NLA_U32 }, +}; + +/* We don't use this right now since we don't parse the incoming list, but we + * still want it here so userspace knows what to expect. */ +static struct nla_policy __attribute__((unused)) +nbd_device_policy[NBD_DEVICE_ATTR_MAX + 1] = { + [NBD_DEVICE_INDEX] = { .type = NLA_U32 }, + [NBD_DEVICE_CONNECTED] = { .type = NLA_U8 }, +}; + +static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) +{ + struct nbd_device *nbd = NULL; + struct nbd_config *config; + int index = -1; + int ret; + bool put_dev = false; + + if (!netlink_capable(skb, CAP_SYS_ADMIN)) + return -EPERM; + + if (info->attrs[NBD_ATTR_INDEX]) + index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]); + if (!info->attrs[NBD_ATTR_SOCKETS]) { + printk(KERN_ERR "nbd: must specify at least one socket\n"); + return -EINVAL; + } + if (!info->attrs[NBD_ATTR_SIZE_BYTES]) { + printk(KERN_ERR "nbd: must specify a size in bytes for the device\n"); + return -EINVAL; + } +again: + mutex_lock(&nbd_index_mutex); + if (index == -1) { + ret = idr_for_each(&nbd_index_idr, &find_free_cb, &nbd); + if (ret == 0) { + int new_index; + new_index = nbd_dev_add(-1); + if (new_index < 0) { + mutex_unlock(&nbd_index_mutex); + printk(KERN_ERR "nbd: failed to add new device\n"); + return ret; + } + nbd = idr_find(&nbd_index_idr, new_index); + } + } else { + nbd = idr_find(&nbd_index_idr, index); + } + if (!nbd) { + printk(KERN_ERR "nbd: couldn't find device at index %d\n", + index); + mutex_unlock(&nbd_index_mutex); + return -EINVAL; + } + if (!refcount_inc_not_zero(&nbd->refs)) { + mutex_unlock(&nbd_index_mutex); + if (index == -1) + goto again; + printk(KERN_ERR "nbd: device at index %d is going down\n", + index); + return -EINVAL; + } + mutex_unlock(&nbd_index_mutex); + + mutex_lock(&nbd->config_lock); + if (refcount_read(&nbd->config_refs)) { + mutex_unlock(&nbd->config_lock); + nbd_put(nbd); + if (index == -1) + goto again; + printk(KERN_ERR "nbd: nbd%d already in use\n", index); + return -EBUSY; + } + if (WARN_ON(nbd->config)) { + mutex_unlock(&nbd->config_lock); + nbd_put(nbd); + return -EINVAL; + } + config = nbd->config = nbd_alloc_config(); + if (!nbd->config) { + mutex_unlock(&nbd->config_lock); + nbd_put(nbd); + printk(KERN_ERR "nbd: couldn't allocate config\n"); + return -ENOMEM; + } + refcount_set(&nbd->config_refs, 1); + set_bit(NBD_BOUND, &config->runtime_flags); + + if (info->attrs[NBD_ATTR_SIZE_BYTES]) { + u64 bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]); + nbd_size_set(nbd, config->blksize, + div64_u64(bytes, config->blksize)); + } + if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) { + u64 bsize = + nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]); + nbd_size_set(nbd, bsize, div64_u64(config->bytesize, bsize)); + } + if (info->attrs[NBD_ATTR_TIMEOUT]) { + u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]); + nbd->tag_set.timeout = timeout * HZ; + blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ); + } + if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) { + config->dead_conn_timeout = + nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]); + config->dead_conn_timeout *= HZ; + } + if (info->attrs[NBD_ATTR_SERVER_FLAGS]) + config->flags = + nla_get_u64(info->attrs[NBD_ATTR_SERVER_FLAGS]); + if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) { + u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]); + if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) { + set_bit(NBD_DESTROY_ON_DISCONNECT, + &config->runtime_flags); + put_dev = true; + } + } + + if (info->attrs[NBD_ATTR_SOCKETS]) { + struct nlattr *attr; + int rem, fd; + + nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS], + rem) { + struct nlattr *socks[NBD_SOCK_MAX+1]; + + if (nla_type(attr) != NBD_SOCK_ITEM) { + printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n"); + ret = -EINVAL; + goto out; + } + ret = nla_parse_nested(socks, NBD_SOCK_MAX, attr, - nbd_sock_policy); ++ nbd_sock_policy, info->extack); + if (ret != 0) { + printk(KERN_ERR "nbd: error processing sock list\n"); + ret = -EINVAL; + goto out; + } + if (!socks[NBD_SOCK_FD]) + continue; + fd = (int)nla_get_u32(socks[NBD_SOCK_FD]); + ret = nbd_add_socket(nbd, fd, true); + if (ret) + goto out; + } + } + ret = nbd_start_device(nbd); +out: + mutex_unlock(&nbd->config_lock); + if (!ret) { + set_bit(NBD_HAS_CONFIG_REF, &config->runtime_flags); + refcount_inc(&nbd->config_refs); + nbd_connect_reply(info, nbd->index); + } + nbd_config_put(nbd); + if (put_dev) + nbd_put(nbd); + return ret; +} + +static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info) +{ + struct nbd_device *nbd; + int index; + + if (!netlink_capable(skb, CAP_SYS_ADMIN)) + return -EPERM; + + if (!info->attrs[NBD_ATTR_INDEX]) { + printk(KERN_ERR "nbd: must specify an index to disconnect\n"); + return -EINVAL; + } + index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]); + mutex_lock(&nbd_index_mutex); + nbd = idr_find(&nbd_index_idr, index); + if (!nbd) { + mutex_unlock(&nbd_index_mutex); + printk(KERN_ERR "nbd: couldn't find device at index %d\n", + index); + return -EINVAL; + } + if (!refcount_inc_not_zero(&nbd->refs)) { + mutex_unlock(&nbd_index_mutex); + printk(KERN_ERR "nbd: device at index %d is going down\n", + index); + return -EINVAL; + } + mutex_unlock(&nbd_index_mutex); + if (!refcount_inc_not_zero(&nbd->config_refs)) { + nbd_put(nbd); + return 0; + } + mutex_lock(&nbd->config_lock); + nbd_disconnect(nbd); + mutex_unlock(&nbd->config_lock); + if (test_and_clear_bit(NBD_HAS_CONFIG_REF, + &nbd->config->runtime_flags)) + nbd_config_put(nbd); + nbd_config_put(nbd); + nbd_put(nbd); + return 0; +} + +static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) +{ + struct nbd_device *nbd = NULL; + struct nbd_config *config; + int index; + int ret = -EINVAL; + bool put_dev = false; + + if (!netlink_capable(skb, CAP_SYS_ADMIN)) + return -EPERM; + + if (!info->attrs[NBD_ATTR_INDEX]) { + printk(KERN_ERR "nbd: must specify a device to reconfigure\n"); + return -EINVAL; + } + index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]); + mutex_lock(&nbd_index_mutex); + nbd = idr_find(&nbd_index_idr, index); + if (!nbd) { + mutex_unlock(&nbd_index_mutex); + printk(KERN_ERR "nbd: couldn't find a device at index %d\n", + index); + return -EINVAL; + } + if (!refcount_inc_not_zero(&nbd->refs)) { + mutex_unlock(&nbd_index_mutex); + printk(KERN_ERR "nbd: device at index %d is going down\n", + index); + return -EINVAL; + } + mutex_unlock(&nbd_index_mutex); + + if (!refcount_inc_not_zero(&nbd->config_refs)) { + dev_err(nbd_to_dev(nbd), + "not configured, cannot reconfigure\n"); + nbd_put(nbd); + return -EINVAL; + } + + mutex_lock(&nbd->config_lock); + config = nbd->config; + if (!test_bit(NBD_BOUND, &config->runtime_flags) || + !nbd->task_recv) { + dev_err(nbd_to_dev(nbd), + "not configured, cannot reconfigure\n"); + goto out; + } + + if (info->attrs[NBD_ATTR_TIMEOUT]) { + u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]); + nbd->tag_set.timeout = timeout * HZ; + blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ); + } + if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) { + config->dead_conn_timeout = + nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]); + config->dead_conn_timeout *= HZ; + } + if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) { + u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]); + if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) { + if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT, + &config->runtime_flags)) + put_dev = true; + } else { + if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT, + &config->runtime_flags)) + refcount_inc(&nbd->refs); + } + } + + if (info->attrs[NBD_ATTR_SOCKETS]) { + struct nlattr *attr; + int rem, fd; + + nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS], + rem) { + struct nlattr *socks[NBD_SOCK_MAX+1]; + + if (nla_type(attr) != NBD_SOCK_ITEM) { + printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n"); + ret = -EINVAL; + goto out; + } + ret = nla_parse_nested(socks, NBD_SOCK_MAX, attr, - nbd_sock_policy); ++ nbd_sock_policy, info->extack); + if (ret != 0) { + printk(KERN_ERR "nbd: error processing sock list\n"); + ret = -EINVAL; + goto out; + } + if (!socks[NBD_SOCK_FD]) + continue; + fd = (int)nla_get_u32(socks[NBD_SOCK_FD]); + ret = nbd_reconnect_socket(nbd, fd); + if (ret) { + if (ret == -ENOSPC) + ret = 0; + goto out; + } + dev_info(nbd_to_dev(nbd), "reconnected socket\n"); + } + } +out: + mutex_unlock(&nbd->config_lock); + nbd_config_put(nbd); + nbd_put(nbd); + if (put_dev) + nbd_put(nbd); + return ret; +} + +static const struct genl_ops nbd_connect_genl_ops[] = { + { + .cmd = NBD_CMD_CONNECT, + .policy = nbd_attr_policy, + .doit = nbd_genl_connect, + }, + { + .cmd = NBD_CMD_DISCONNECT, + .policy = nbd_attr_policy, + .doit = nbd_genl_disconnect, + }, + { + .cmd = NBD_CMD_RECONFIGURE, + .policy = nbd_attr_policy, + .doit = nbd_genl_reconfigure, + }, + { + .cmd = NBD_CMD_STATUS, + .policy = nbd_attr_policy, + .doit = nbd_genl_status, + }, +}; + +static const struct genl_multicast_group nbd_mcast_grps[] = { + { .name = NBD_GENL_MCAST_GROUP_NAME, }, +}; + +static struct genl_family nbd_genl_family __ro_after_init = { + .hdrsize = 0, + .name = NBD_GENL_FAMILY_NAME, + .version = NBD_GENL_VERSION, + .module = THIS_MODULE, + .ops = nbd_connect_genl_ops, + .n_ops = ARRAY_SIZE(nbd_connect_genl_ops), + .maxattr = NBD_ATTR_MAX, + .mcgrps = nbd_mcast_grps, + .n_mcgrps = ARRAY_SIZE(nbd_mcast_grps), +}; + +static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply) +{ + struct nlattr *dev_opt; + u8 connected = 0; + int ret; + + /* This is a little racey, but for status it's ok. The + * reason we don't take a ref here is because we can't + * take a ref in the index == -1 case as we would need + * to put under the nbd_index_mutex, which could + * deadlock if we are configured to remove ourselves + * once we're disconnected. + */ + if (refcount_read(&nbd->config_refs)) + connected = 1; + dev_opt = nla_nest_start(reply, NBD_DEVICE_ITEM); + if (!dev_opt) + return -EMSGSIZE; + ret = nla_put_u32(reply, NBD_DEVICE_INDEX, nbd->index); + if (ret) + return -EMSGSIZE; + ret = nla_put_u8(reply, NBD_DEVICE_CONNECTED, + connected); + if (ret) + return -EMSGSIZE; + nla_nest_end(reply, dev_opt); + return 0; +} + +static int status_cb(int id, void *ptr, void *data) +{ + struct nbd_device *nbd = ptr; + return populate_nbd_status(nbd, (struct sk_buff *)data); +} + +static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *dev_list; + struct sk_buff *reply; + void *reply_head; + size_t msg_size; + int index = -1; + int ret = -ENOMEM; + + if (info->attrs[NBD_ATTR_INDEX]) + index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]); + + mutex_lock(&nbd_index_mutex); + + msg_size = nla_total_size(nla_attr_size(sizeof(u32)) + + nla_attr_size(sizeof(u8))); + msg_size *= (index == -1) ? nbd_total_devices : 1; + + reply = genlmsg_new(msg_size, GFP_KERNEL); + if (!reply) + goto out; + reply_head = genlmsg_put_reply(reply, info, &nbd_genl_family, 0, + NBD_CMD_STATUS); + if (!reply_head) { + nlmsg_free(reply); + goto out; + } + + dev_list = nla_nest_start(reply, NBD_ATTR_DEVICE_LIST); + if (index == -1) { + ret = idr_for_each(&nbd_index_idr, &status_cb, reply); + if (ret) { + nlmsg_free(reply); + goto out; + } + } else { + struct nbd_device *nbd; + nbd = idr_find(&nbd_index_idr, index); + if (nbd) { + ret = populate_nbd_status(nbd, reply); + if (ret) { + nlmsg_free(reply); + goto out; + } + } + } + nla_nest_end(reply, dev_list); + genlmsg_end(reply, reply_head); + genlmsg_reply(reply, info); + ret = 0; +out: + mutex_unlock(&nbd_index_mutex); + return ret; +} + +static void nbd_connect_reply(struct genl_info *info, int index) +{ + struct sk_buff *skb; + void *msg_head; + int ret; + + skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL); + if (!skb) + return; + msg_head = genlmsg_put_reply(skb, info, &nbd_genl_family, 0, + NBD_CMD_CONNECT); + if (!msg_head) { + nlmsg_free(skb); + return; + } + ret = nla_put_u32(skb, NBD_ATTR_INDEX, index); + if (ret) { + nlmsg_free(skb); + return; + } + genlmsg_end(skb, msg_head); + genlmsg_reply(skb, info); +} + +static void nbd_mcast_index(int index) +{ + struct sk_buff *skb; + void *msg_head; + int ret; + + skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL); + if (!skb) + return; + msg_head = genlmsg_put(skb, 0, 0, &nbd_genl_family, 0, + NBD_CMD_LINK_DEAD); + if (!msg_head) { + nlmsg_free(skb); + return; + } + ret = nla_put_u32(skb, NBD_ATTR_INDEX, index); + if (ret) { + nlmsg_free(skb); + return; + } + genlmsg_end(skb, msg_head); + genlmsg_multicast(&nbd_genl_family, skb, 0, 0, GFP_KERNEL); +} + +static void nbd_dead_link_work(struct work_struct *work) +{ + struct link_dead_args *args = container_of(work, struct link_dead_args, + work); + nbd_mcast_index(args->index); + kfree(args); +}
static int __init nbd_init(void) { @@@ -2071,11 -1184,6 +2071,11 @@@ return -EIO; }
+ if (genl_register_family(&nbd_genl_family)) { + unregister_blkdev(NBD_MAJOR, "nbd"); + destroy_workqueue(recv_workqueue); + return -EINVAL; + } nbd_dbg_init();
mutex_lock(&nbd_index_mutex); @@@ -2087,34 -1195,17 +2087,34 @@@
static int nbd_exit_cb(int id, void *ptr, void *data) { + struct list_head *list = (struct list_head *)data; struct nbd_device *nbd = ptr; - nbd_dev_remove(nbd); + + list_add_tail(&nbd->list, list); return 0; }
static void __exit nbd_cleanup(void) { + struct nbd_device *nbd; + LIST_HEAD(del_list); + nbd_dbg_close();
- idr_for_each(&nbd_index_idr, &nbd_exit_cb, NULL); + mutex_lock(&nbd_index_mutex); + idr_for_each(&nbd_index_idr, &nbd_exit_cb, &del_list); + mutex_unlock(&nbd_index_mutex); + + while (!list_empty(&del_list)) { + nbd = list_first_entry(&del_list, struct nbd_device, list); + list_del_init(&nbd->list); + if (refcount_read(&nbd->refs) != 1) + printk(KERN_ERR "nbd: possibly leaking a device\n"); + nbd_put(nbd); + } + idr_destroy(&nbd_index_idr); + genl_unregister_family(&nbd_genl_family); destroy_workqueue(recv_workqueue); unregister_blkdev(NBD_MAJOR, "nbd"); } diff --combined drivers/net/bonding/bond_main.c index 34481c9,e549bf6..2be7880 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@@ -201,12 -201,6 +201,6 @@@ atomic_t netpoll_block_tx = ATOMIC_INIT
unsigned int bond_net_id __read_mostly;
- static __be32 arp_target[BOND_MAX_ARP_TARGETS]; - static int arp_ip_count; - static int bond_mode = BOND_MODE_ROUNDROBIN; - static int xmit_hashtype = BOND_XMIT_POLICY_LAYER2; - static int lacp_fast; - /*-------------------------- Forward declarations ---------------------------*/
static int bond_init(struct net_device *bond_dev); @@@ -371,9 -365,10 +365,10 @@@ down /* Get link speed and duplex from the slave's base driver * using ethtool. If for some reason the call fails or the * values are invalid, set speed and duplex to -1, - * and return. + * and return. Return 1 if speed or duplex settings are + * UNKNOWN; 0 otherwise. */ - static void bond_update_speed_duplex(struct slave *slave) + static int bond_update_speed_duplex(struct slave *slave) { struct net_device *slave_dev = slave->dev; struct ethtool_link_ksettings ecmd; @@@ -384,23 -379,21 +379,21 @@@
res = __ethtool_get_link_ksettings(slave_dev, &ecmd); if (res < 0) - return; - + return 1; if (ecmd.base.speed == 0 || ecmd.base.speed == ((__u32)-1)) - return; - + return 1; switch (ecmd.base.duplex) { case DUPLEX_FULL: case DUPLEX_HALF: break; default: - return; + return 1; }
slave->speed = ecmd.base.speed; slave->duplex = ecmd.base.duplex;
- return; + return 0; }
const char *bond_slave_link_status(s8 link) @@@ -652,8 -645,8 +645,8 @@@ static void bond_do_fail_over_mac(struc struct slave *new_active, struct slave *old_active) { - u8 tmp_mac[ETH_ALEN]; - struct sockaddr saddr; + u8 tmp_mac[MAX_ADDR_LEN]; + struct sockaddr_storage ss; int rv;
switch (bond->params.fail_over_mac) { @@@ -673,16 -666,20 +666,20 @@@ old_active = bond_get_old_active(bond, new_active);
if (old_active) { - ether_addr_copy(tmp_mac, new_active->dev->dev_addr); - ether_addr_copy(saddr.sa_data, - old_active->dev->dev_addr); - saddr.sa_family = new_active->dev->type; + bond_hw_addr_copy(tmp_mac, new_active->dev->dev_addr, + new_active->dev->addr_len); + bond_hw_addr_copy(ss.__data, + old_active->dev->dev_addr, + old_active->dev->addr_len); + ss.ss_family = new_active->dev->type; } else { - ether_addr_copy(saddr.sa_data, bond->dev->dev_addr); - saddr.sa_family = bond->dev->type; + bond_hw_addr_copy(ss.__data, bond->dev->dev_addr, + bond->dev->addr_len); + ss.ss_family = bond->dev->type; }
- rv = dev_set_mac_address(new_active->dev, &saddr); + rv = dev_set_mac_address(new_active->dev, + (struct sockaddr *)&ss); if (rv) { netdev_err(bond->dev, "Error %d setting MAC of slave %s\n", -rv, new_active->dev->name); @@@ -692,10 -689,12 +689,12 @@@ if (!old_active) goto out;
- ether_addr_copy(saddr.sa_data, tmp_mac); - saddr.sa_family = old_active->dev->type; + bond_hw_addr_copy(ss.__data, tmp_mac, + new_active->dev->addr_len); + ss.ss_family = old_active->dev->type;
- rv = dev_set_mac_address(old_active->dev, &saddr); + rv = dev_set_mac_address(old_active->dev, + (struct sockaddr *)&ss); if (rv) netdev_err(bond->dev, "Error %d setting MAC of slave %s\n", -rv, new_active->dev->name); @@@ -1104,11 -1103,11 +1103,11 @@@ static void bond_compute_features(struc gso_max_size = min(gso_max_size, slave->dev->gso_max_size); gso_max_segs = min(gso_max_segs, slave->dev->gso_max_segs); } + bond_dev->hard_header_len = max_hard_header_len;
done: bond_dev->vlan_features = vlan_features; bond_dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL; - bond_dev->hard_header_len = max_hard_header_len; bond_dev->gso_max_segs = gso_max_segs; netif_set_gso_max_size(bond_dev, gso_max_size);
@@@ -1177,6 -1176,9 +1176,9 @@@ static rx_handler_result_t bond_handle_ } }
+ /* don't change skb->dev for link-local packets */ + if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) + return RX_HANDLER_PASS; if (bond_should_deliver_exact_match(skb, slave, bond)) return RX_HANDLER_EXACT;
@@@ -1191,7 -1193,8 +1193,8 @@@ kfree_skb(skb); return RX_HANDLER_CONSUMED; } - ether_addr_copy(eth_hdr(skb)->h_dest, bond->dev->dev_addr); + bond_hw_addr_copy(eth_hdr(skb)->h_dest, bond->dev->dev_addr, + bond->dev->addr_len); }
return ret; @@@ -1330,7 -1333,7 +1333,7 @@@ int bond_enslave(struct net_device *bon struct bonding *bond = netdev_priv(bond_dev); const struct net_device_ops *slave_ops = slave_dev->netdev_ops; struct slave *new_slave = NULL, *prev_slave; - struct sockaddr addr; + struct sockaddr_storage ss; int link_reporting; int res = 0, i;
@@@ -1481,16 -1484,17 +1484,17 @@@ * that need it, and for restoring it upon release, and then * set it to the master's address */ - ether_addr_copy(new_slave->perm_hwaddr, slave_dev->dev_addr); + bond_hw_addr_copy(new_slave->perm_hwaddr, slave_dev->dev_addr, + slave_dev->addr_len);
if (!bond->params.fail_over_mac || BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) { /* Set slave to master's mac address. The application already * set the master's mac address to that of the first slave */ - memcpy(addr.sa_data, bond_dev->dev_addr, bond_dev->addr_len); - addr.sa_family = slave_dev->type; - res = dev_set_mac_address(slave_dev, &addr); + memcpy(ss.__data, bond_dev->dev_addr, bond_dev->addr_len); + ss.ss_family = slave_dev->type; + res = dev_set_mac_address(slave_dev, (struct sockaddr *)&ss); if (res) { netdev_dbg(bond_dev, "Error %d calling set_mac_address\n", res); goto err_restore_mtu; @@@ -1565,7 -1569,8 +1569,8 @@@ new_slave->delay = 0; new_slave->link_failure_count = 0;
- bond_update_speed_duplex(new_slave); + if (bond_update_speed_duplex(new_slave)) + new_slave->link = BOND_LINK_DOWN;
new_slave->last_rx = jiffies - (msecs_to_jiffies(bond->params.arp_interval) + 1); @@@ -1773,9 -1778,10 +1778,10 @@@ err_restore_mac * MAC if this slave's MAC is in use by the bond, or at * least print a warning. */ - ether_addr_copy(addr.sa_data, new_slave->perm_hwaddr); - addr.sa_family = slave_dev->type; - dev_set_mac_address(slave_dev, &addr); + bond_hw_addr_copy(ss.__data, new_slave->perm_hwaddr, + new_slave->dev->addr_len); + ss.ss_family = slave_dev->type; + dev_set_mac_address(slave_dev, (struct sockaddr *)&ss); }
err_restore_mtu: @@@ -1818,7 -1824,7 +1824,7 @@@ static int __bond_release_one(struct ne { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave, *oldcurrent; - struct sockaddr addr; + struct sockaddr_storage ss; int old_flags = bond_dev->flags; netdev_features_t old_features = bond_dev->features;
@@@ -1953,9 -1959,10 +1959,10 @@@ if (bond->params.fail_over_mac != BOND_FOM_ACTIVE || BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) { /* restore original ("permanent") mac address */ - ether_addr_copy(addr.sa_data, slave->perm_hwaddr); - addr.sa_family = slave_dev->type; - dev_set_mac_address(slave_dev, &addr); + bond_hw_addr_copy(ss.__data, slave->perm_hwaddr, + slave->dev->addr_len); + ss.ss_family = slave_dev->type; + dev_set_mac_address(slave_dev, (struct sockaddr *)&ss); }
dev_set_mtu(slave_dev, slave->original_mtu); @@@ -2039,8 -2046,7 +2046,7 @@@ static int bond_miimon_inspect(struct b if (link_state) continue;
- bond_set_slave_link_state(slave, BOND_LINK_FAIL, - BOND_SLAVE_NOTIFY_LATER); + bond_propose_link_state(slave, BOND_LINK_FAIL); slave->delay = bond->params.downdelay; if (slave->delay) { netdev_info(bond->dev, "link status down for %sinterface %s, disabling it in %d ms\n", @@@ -2055,13 -2061,13 +2061,13 @@@ case BOND_LINK_FAIL: if (link_state) { /* recovered before downdelay expired */ - bond_set_slave_link_state(slave, BOND_LINK_UP, - BOND_SLAVE_NOTIFY_LATER); + bond_propose_link_state(slave, BOND_LINK_UP); slave->last_link_up = jiffies; netdev_info(bond->dev, "link status up again after %d ms for interface %s\n", (bond->params.downdelay - slave->delay) * bond->params.miimon, slave->dev->name); + commit++; continue; }
@@@ -2078,8 -2084,7 +2084,7 @@@ if (!link_state) continue;
- bond_set_slave_link_state(slave, BOND_LINK_BACK, - BOND_SLAVE_NOTIFY_LATER); + bond_propose_link_state(slave, BOND_LINK_BACK); slave->delay = bond->params.updelay;
if (slave->delay) { @@@ -2092,14 -2097,12 +2097,12 @@@ /*FALLTHRU*/ case BOND_LINK_BACK: if (!link_state) { - bond_set_slave_link_state(slave, - BOND_LINK_DOWN, - BOND_SLAVE_NOTIFY_LATER); + bond_propose_link_state(slave, BOND_LINK_DOWN); netdev_info(bond->dev, "link status down again after %d ms for interface %s\n", (bond->params.updelay - slave->delay) * bond->params.miimon, slave->dev->name); - + commit++; continue; }
@@@ -2132,7 -2135,13 +2135,13 @@@ static void bond_miimon_commit(struct b continue;
case BOND_LINK_UP: - bond_update_speed_duplex(slave); + if (bond_update_speed_duplex(slave)) { + slave->link = BOND_LINK_DOWN; + netdev_warn(bond->dev, + "failed to get link speed/duplex for %s\n", + slave->dev->name); + continue; + } bond_set_slave_link_state(slave, BOND_LINK_UP, BOND_SLAVE_NOTIFY_NOW); slave->last_link_up = jiffies; @@@ -2231,6 -2240,8 +2240,8 @@@ static void bond_mii_monitor(struct wor mii_work.work); bool should_notify_peers = false; unsigned long delay; + struct slave *slave; + struct list_head *iter;
delay = msecs_to_jiffies(bond->params.miimon);
@@@ -2251,6 -2262,9 +2262,9 @@@ goto re_arm; }
+ bond_for_each_slave(bond, slave, iter) { + bond_commit_link_state(slave, BOND_SLAVE_NOTIFY_LATER); + } bond_miimon_commit(bond);
rtnl_unlock(); /* might sleep, hold no other locks */ @@@ -2575,10 -2589,8 +2589,8 @@@ static bool bond_time_in_interval(struc * arp is transmitted to generate traffic. see activebackup_arp_monitor for * arp monitoring in active backup mode. */ - static void bond_loadbalance_arp_mon(struct work_struct *work) + static void bond_loadbalance_arp_mon(struct bonding *bond) { - struct bonding *bond = container_of(work, struct bonding, - arp_work.work); struct slave *slave, *oldcurrent; struct list_head *iter; int do_failover = 0, slave_state_changed = 0; @@@ -2916,10 -2928,8 +2928,8 @@@ check_state return should_notify_rtnl; }
- static void bond_activebackup_arp_mon(struct work_struct *work) + static void bond_activebackup_arp_mon(struct bonding *bond) { - struct bonding *bond = container_of(work, struct bonding, - arp_work.work); bool should_notify_peers = false; bool should_notify_rtnl = false; int delta_in_ticks; @@@ -2972,6 -2982,17 +2982,17 @@@ re_arm } }
+ static void bond_arp_monitor(struct work_struct *work) + { + struct bonding *bond = container_of(work, struct bonding, + arp_work.work); + + if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) + bond_activebackup_arp_mon(bond); + else + bond_loadbalance_arp_mon(bond); + } + /*-------------------------- netdev event handling --------------------------*/
/* Change device name */ @@@ -3222,16 -3243,13 +3243,13 @@@ u32 bond_xmit_hash(struct bonding *bond
/*-------------------------- Device entry points ----------------------------*/
- static void bond_work_init_all(struct bonding *bond) + void bond_work_init_all(struct bonding *bond) { INIT_DELAYED_WORK(&bond->mcast_work, bond_resend_igmp_join_requests_delayed); INIT_DELAYED_WORK(&bond->alb_work, bond_alb_monitor); INIT_DELAYED_WORK(&bond->mii_work, bond_mii_monitor); - if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) - INIT_DELAYED_WORK(&bond->arp_work, bond_activebackup_arp_mon); - else - INIT_DELAYED_WORK(&bond->arp_work, bond_loadbalance_arp_mon); + INIT_DELAYED_WORK(&bond->arp_work, bond_arp_monitor); INIT_DELAYED_WORK(&bond->ad_work, bond_3ad_state_machine_handler); INIT_DELAYED_WORK(&bond->slave_arr_work, bond_slave_arr_handler); } @@@ -3266,8 -3284,6 +3284,6 @@@ static int bond_open(struct net_device } }
- bond_work_init_all(bond); - if (bond_is_lb(bond)) { /* bond_alb_initialize must be called before the timer * is started. @@@ -3327,12 -3343,17 +3343,17 @@@ static void bond_fold_stats(struct rtnl for (i = 0; i < sizeof(*_res) / sizeof(u64); i++) { u64 nv = new[i]; u64 ov = old[i]; + s64 delta = nv - ov;
/* detects if this particular field is 32bit only */ if (((nv | ov) >> 32) == 0) - res[i] += (u32)nv - (u32)ov; - else - res[i] += nv - ov; + delta = (s64)(s32)((u32)nv - (u32)ov); + + /* filter anomalies, some drivers reset their stats + * at down/up events. + */ + if (delta > 0) + res[i] += delta; } }
@@@ -3619,7 -3640,7 +3640,7 @@@ static int bond_set_mac_address(struct { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave, *rollback_slave; - struct sockaddr *sa = addr, tmp_sa; + struct sockaddr_storage *ss = addr, tmp_ss; struct list_head *iter; int res = 0;
@@@ -3636,7 -3657,7 +3657,7 @@@ BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) return 0;
- if (!is_valid_ether_addr(sa->sa_data)) + if (!is_valid_ether_addr(ss->__data)) return -EADDRNOTAVAIL;
bond_for_each_slave(bond, slave, iter) { @@@ -3655,12 -3676,12 +3676,12 @@@ }
/* success */ - memcpy(bond_dev->dev_addr, sa->sa_data, bond_dev->addr_len); + memcpy(bond_dev->dev_addr, ss->__data, bond_dev->addr_len); return 0;
unwind: - memcpy(tmp_sa.sa_data, bond_dev->dev_addr, bond_dev->addr_len); - tmp_sa.sa_family = bond_dev->type; + memcpy(tmp_ss.__data, bond_dev->dev_addr, bond_dev->addr_len); + tmp_ss.ss_family = bond_dev->type;
/* unwind from head to the slave that failed */ bond_for_each_slave(bond, rollback_slave, iter) { @@@ -3669,7 -3690,8 +3690,8 @@@ if (rollback_slave == slave) break;
- tmp_res = dev_set_mac_address(rollback_slave->dev, &tmp_sa); + tmp_res = dev_set_mac_address(rollback_slave->dev, + (struct sockaddr *)&tmp_ss); if (tmp_res) { netdev_dbg(bond_dev, "unwind err %d dev %s\n", tmp_res, rollback_slave->dev->name); @@@ -4252,6 -4274,12 +4274,12 @@@ static int bond_check_params(struct bon int arp_all_targets_value; u16 ad_actor_sys_prio = 0; u16 ad_user_port_key = 0; + __be32 arp_target[BOND_MAX_ARP_TARGETS]; + int arp_ip_count; + int bond_mode = BOND_MODE_ROUNDROBIN; + int xmit_hashtype = BOND_XMIT_POLICY_LAYER2; + int lacp_fast = 0; + int tlb_dynamic_lb = 0;
/* Convert string parameters. */ if (mode) { @@@ -4564,6 -4592,17 +4592,17 @@@ } ad_user_port_key = valptr->value;
+ if (bond_mode == BOND_MODE_TLB) { + bond_opt_initstr(&newval, "default"); + valptr = bond_opt_parse(bond_opt_get(BOND_OPT_TLB_DYNAMIC_LB), + &newval); + if (!valptr) { + pr_err("Error: No tlb_dynamic_lb default value"); + return -EINVAL; + } + tlb_dynamic_lb = valptr->value; + } + if (lp_interval == 0) { pr_warn("Warning: ip_interval must be between 1 and %d, so it was reset to %d\n", INT_MAX, BOND_ALB_DEFAULT_LP_INTERVAL); @@@ -4591,7 -4630,7 +4630,7 @@@ params->min_links = min_links; params->lp_interval = lp_interval; params->packets_per_slave = packets_per_slave; - params->tlb_dynamic_lb = 1; /* Default value */ + params->tlb_dynamic_lb = tlb_dynamic_lb; params->ad_actor_sys_prio = ad_actor_sys_prio; eth_zero_addr(params->ad_actor_system); params->ad_user_port_key = ad_user_port_key; @@@ -4687,6 -4726,8 +4726,8 @@@ int bond_create(struct net *net, const
netif_carrier_off(bond_dev);
+ bond_work_init_all(bond); + rtnl_unlock(); if (res < 0) bond_destructor(bond_dev); diff --combined drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 6faaca1,aa71019..c12c4a3 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@@ -2338,6 -2338,10 +2338,10 @@@ int cxgb4_create_server_filter(const st f->locked = 1; f->fs.rpttid = 1;
+ /* Save the actual tid. We need this to get the corresponding + * filter entry structure in filter_rpl. + */ + f->tid = stid + adap->tids.ftid_base; ret = set_filter_wr(adap, stid); if (ret) { clear_filter(adap, f); @@@ -3809,15 -3813,6 +3813,15 @@@ static int adap_init0(struct adapter *a } if (caps_cmd.cryptocaps) { /* Should query params here...TODO */ + params[0] = FW_PARAM_PFVF(NCRYPTO_LOOKASIDE); + ret = t4_query_params(adap, adap->mbox, adap->pf, 0, 2, + params, val); + if (ret < 0) { + if (ret != -EINVAL) + goto bye; + } else { + adap->vres.ncrypto_fc = val[0]; + } adap->params.crypto |= ULP_CRYPTO_LOOKASIDE; adap->num_uld += 1; } diff --combined fs/cifs/cifsfs.c index 34fee9f,5e9b306..d0d11b7 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@@ -37,6 -37,7 +37,7 @@@ #include <linux/freezer.h> #include <linux/namei.h> #include <linux/random.h> + #include <linux/uuid.h> #include <linux/xattr.h> #include <net/ipv6.h> #include "cifsfs.h" @@@ -138,12 -139,7 +139,12 @@@ cifs_read_super(struct super_block *sb sb->s_magic = CIFS_MAGIC_NUMBER; sb->s_op = &cifs_super_ops; sb->s_xattr = cifs_xattr_handlers; - sb->s_bdi = &cifs_sb->bdi; + rc = super_setup_bdi(sb); + if (rc) + goto out_no_root; + /* tune readahead according to rsize */ + sb->s_bdi->ra_pages = cifs_sb->rsize / PAGE_SIZE; + sb->s_blocksize = CIFS_MAX_MSGSIZE; sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ inode = cifs_root_iget(sb); diff --combined fs/cifs/connect.c index 0f27816,190a855..9bc0b4d --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@@ -35,6 -35,7 +35,7 @@@ #include <linux/pagevec.h> #include <linux/freezer.h> #include <linux/namei.h> + #include <linux/uuid.h> #include <linux/uaccess.h> #include <asm/processor.h> #include <linux/inet.h> @@@ -2912,14 -2913,16 +2913,14 @@@ match_prepath(struct super_block *sb, s { struct cifs_sb_info *old = CIFS_SB(sb); struct cifs_sb_info *new = mnt_data->cifs_sb; + bool old_set = old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH; + bool new_set = new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH;
- if (old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) { - if (!(new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH)) - return 0; - /* The prepath should be null terminated strings */ - if (strcmp(new->prepath, old->prepath)) - return 0; - + if (old_set && new_set && !strcmp(new->prepath, old->prepath)) return 1; - } + else if (!old_set && !new_set) + return 1; + return 0; }
@@@ -3690,6 -3693,10 +3691,6 @@@ cifs_mount(struct cifs_sb_info *cifs_sb int referral_walks_count = 0; #endif
- rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs"); - if (rc) - return rc; - #ifdef CONFIG_CIFS_DFS_UPCALL try_mount_again: /* cleanup activities if we're chasing a referral */ @@@ -3717,6 -3724,7 +3718,6 @@@ server = cifs_get_tcp_session(volume_info); if (IS_ERR(server)) { rc = PTR_ERR(server); - bdi_destroy(&cifs_sb->bdi); goto out; } if ((volume_info->max_credits < 20) || @@@ -3773,6 -3781,9 +3774,6 @@@ cifs_sb->wsize = server->ops->negotiate_wsize(tcon, volume_info); cifs_sb->rsize = server->ops->negotiate_rsize(tcon, volume_info);
- /* tune readahead according to rsize */ - cifs_sb->bdi.ra_pages = cifs_sb->rsize / PAGE_SIZE; - remote_path_check: #ifdef CONFIG_CIFS_DFS_UPCALL /* @@@ -3889,6 -3900,7 +3890,6 @@@ mount_fail_check cifs_put_smb_ses(ses); else cifs_put_tcp_session(server, 0); - bdi_destroy(&cifs_sb->bdi); }
out: @@@ -4091,6 -4103,7 +4092,6 @@@ cifs_umount(struct cifs_sb_info *cifs_s } spin_unlock(&cifs_sb->tlink_tree_lock);
- bdi_destroy(&cifs_sb->bdi); kfree(cifs_sb->mountdata); kfree(cifs_sb->prepath); call_rcu(&cifs_sb->rcu, delayed_free); diff --combined fs/select.c index dd70937,9287d3a..bd4b2cc --- a/fs/select.c +++ b/fs/select.c @@@ -338,53 -338,6 +338,53 @@@ sticky return ret; }
+/* + * Scalable version of the fd_set. + */ + +typedef struct { + unsigned long *in, *out, *ex; + unsigned long *res_in, *res_out, *res_ex; +} fd_set_bits; + +/* + * How many longwords for "nr" bits? + */ +#define FDS_BITPERLONG (8*sizeof(long)) +#define FDS_LONGS(nr) (((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG) +#define FDS_BYTES(nr) (FDS_LONGS(nr)*sizeof(long)) + +/* + * We do a VERIFY_WRITE here even though we are only reading this time: + * we'll write to it eventually.. + * + * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned. + */ +static inline +int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) +{ + nr = FDS_BYTES(nr); + if (ufdset) + return copy_from_user(fdset, ufdset, nr) ? -EFAULT : 0; + + memset(fdset, 0, nr); + return 0; +} + +static inline unsigned long __must_check +set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) +{ + if (ufdset) + return __copy_to_user(ufdset, fdset, FDS_BYTES(nr)); + return 0; +} + +static inline +void zero_fd_set(unsigned long nr, unsigned long *fdset) +{ + memset(fdset, 0, FDS_BYTES(nr)); +} + #define FDS_IN(fds, n) (fds->in + n) #define FDS_OUT(fds, n) (fds->out + n) #define FDS_EX(fds, n) (fds->ex + n) @@@ -448,7 -401,7 +448,7 @@@ static inline void wait_key_set(poll_ta wait->_key |= POLLOUT_SET; }
-int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) +static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) { ktime_t expire, *to = NULL; struct poll_wqueues table; @@@ -456,7 -409,7 +456,7 @@@ int retval, i, timed_out = 0; u64 slack = 0; unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; - unsigned long busy_end = 0; + unsigned long busy_start = 0;
rcu_read_lock(); retval = max_select_fd(n, fds); @@@ -559,11 -512,11 +559,11 @@@
/* only if found POLL_BUSY_LOOP sockets && not out of time */ if (can_busy_loop && !need_resched()) { - if (!busy_end) { - busy_end = busy_loop_end_time(); + if (!busy_start) { + busy_start = busy_loop_current_time(); continue; } - if (!busy_loop_timeout(busy_end)) + if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; @@@ -847,7 -800,7 +847,7 @@@ static int do_poll(struct poll_list *li int timed_out = 0, count = 0; u64 slack = 0; unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; - unsigned long busy_end = 0; + unsigned long busy_start = 0;
/* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { @@@ -900,11 -853,11 +900,11 @@@
/* only if found POLL_BUSY_LOOP sockets && not out of time */ if (can_busy_loop && !need_resched()) { - if (!busy_end) { - busy_end = busy_loop_end_time(); + if (!busy_start) { + busy_start = busy_loop_current_time(); continue; } - if (!busy_loop_timeout(busy_end)) + if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; @@@ -928,7 -881,7 +928,7 @@@ #define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list)) / \ sizeof(struct pollfd))
-int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, +static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, struct timespec64 *end_time) { struct poll_wqueues table; @@@ -1100,373 -1053,3 +1100,373 @@@ SYSCALL_DEFINE5(ppoll, struct pollfd __
return ret; } + +#ifdef CONFIG_COMPAT +#define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) + +static +int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p, + int timeval, int ret) +{ + struct timespec ts; + + if (!p) + return ret; + + if (current->personality & STICKY_TIMEOUTS) + goto sticky; + + /* No update for zero timeout */ + if (!end_time->tv_sec && !end_time->tv_nsec) + return ret; + + ktime_get_ts(&ts); + ts = timespec_sub(*end_time, ts); + if (ts.tv_sec < 0) + ts.tv_sec = ts.tv_nsec = 0; + + if (timeval) { + struct compat_timeval rtv; + + rtv.tv_sec = ts.tv_sec; + rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC; + + if (!copy_to_user(p, &rtv, sizeof(rtv))) + return ret; + } else { + struct compat_timespec rts; + + rts.tv_sec = ts.tv_sec; + rts.tv_nsec = ts.tv_nsec; + + if (!copy_to_user(p, &rts, sizeof(rts))) + return ret; + } + /* + * If an application puts its timeval in read-only memory, we + * don't want the Linux-specific update to the timeval to + * cause a fault after the select has completed + * successfully. However, because we're not updating the + * timeval, we can't restart the system call. + */ + +sticky: + if (ret == -ERESTARTNOHAND) + ret = -EINTR; + return ret; +} + +/* + * Ooo, nasty. We need here to frob 32-bit unsigned longs to + * 64-bit unsigned longs. + */ +static +int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, + unsigned long *fdset) +{ + nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS); + if (ufdset) { + unsigned long odd; + + if (!access_ok(VERIFY_WRITE, ufdset, nr*sizeof(compat_ulong_t))) + return -EFAULT; + + odd = nr & 1UL; + nr &= ~1UL; + while (nr) { + unsigned long h, l; + if (__get_user(l, ufdset) || __get_user(h, ufdset+1)) + return -EFAULT; + ufdset += 2; + *fdset++ = h << 32 | l; + nr -= 2; + } + if (odd && __get_user(*fdset, ufdset)) + return -EFAULT; + } else { + /* Tricky, must clear full unsigned long in the + * kernel fdset at the end, this makes sure that + * actually happens. + */ + memset(fdset, 0, ((nr + 1) & ~1)*sizeof(compat_ulong_t)); + } + return 0; +} + +static +int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, + unsigned long *fdset) +{ + unsigned long odd; + nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS); + + if (!ufdset) + return 0; + + odd = nr & 1UL; + nr &= ~1UL; + while (nr) { + unsigned long h, l; + l = *fdset++; + h = l >> 32; + if (__put_user(l, ufdset) || __put_user(h, ufdset+1)) + return -EFAULT; + ufdset += 2; + nr -= 2; + } + if (odd && __put_user(*fdset, ufdset)) + return -EFAULT; + return 0; +} + + +/* + * This is a virtual copy of sys_select from fs/select.c and probably + * should be compared to it from time to time + */ + +/* + * We can actually return ERESTARTSYS instead of EINTR, but I'd + * like to be certain this leads to no problems. So I return + * EINTR just for safety. + * + * Update: ERESTARTSYS breaks at least the xview clock binary, so + * I'm trying ERESTARTNOHAND which restart only when you want to. + */ +static int compat_core_sys_select(int n, compat_ulong_t __user *inp, + compat_ulong_t __user *outp, compat_ulong_t __user *exp, + struct timespec *end_time) +{ + fd_set_bits fds; + void *bits; + int size, max_fds, ret = -EINVAL; + struct fdtable *fdt; + long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; + + if (n < 0) + goto out_nofds; + + /* max_fds can increase, so grab it once to avoid race */ + rcu_read_lock(); + fdt = files_fdtable(current->files); + max_fds = fdt->max_fds; + rcu_read_unlock(); + if (n > max_fds) + n = max_fds; + + /* + * We need 6 bitmaps (in/out/ex for both incoming and outgoing), + * since we used fdset we need to allocate memory in units of + * long-words. + */ + size = FDS_BYTES(n); + bits = stack_fds; + if (size > sizeof(stack_fds) / 6) { + bits = kmalloc(6 * size, GFP_KERNEL); + ret = -ENOMEM; + if (!bits) + goto out_nofds; + } + fds.in = (unsigned long *) bits; + fds.out = (unsigned long *) (bits + size); + fds.ex = (unsigned long *) (bits + 2*size); + fds.res_in = (unsigned long *) (bits + 3*size); + fds.res_out = (unsigned long *) (bits + 4*size); + fds.res_ex = (unsigned long *) (bits + 5*size); + + if ((ret = compat_get_fd_set(n, inp, fds.in)) || + (ret = compat_get_fd_set(n, outp, fds.out)) || + (ret = compat_get_fd_set(n, exp, fds.ex))) + goto out; + zero_fd_set(n, fds.res_in); + zero_fd_set(n, fds.res_out); + zero_fd_set(n, fds.res_ex); + + ret = do_select(n, &fds, end_time); + + if (ret < 0) + goto out; + if (!ret) { + ret = -ERESTARTNOHAND; + if (signal_pending(current)) + goto out; + ret = 0; + } + + if (compat_set_fd_set(n, inp, fds.res_in) || + compat_set_fd_set(n, outp, fds.res_out) || + compat_set_fd_set(n, exp, fds.res_ex)) + ret = -EFAULT; +out: + if (bits != stack_fds) + kfree(bits); +out_nofds: + return ret; +} + +COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, + compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, + struct compat_timeval __user *, tvp) +{ + struct timespec end_time, *to = NULL; + struct compat_timeval tv; + int ret; + + if (tvp) { + if (copy_from_user(&tv, tvp, sizeof(tv))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, + tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), + (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) + return -EINVAL; + } + + ret = compat_core_sys_select(n, inp, outp, exp, to); + ret = compat_poll_select_copy_remaining(&end_time, tvp, 1, ret); + + return ret; +} + +struct compat_sel_arg_struct { + compat_ulong_t n; + compat_uptr_t inp; + compat_uptr_t outp; + compat_uptr_t exp; + compat_uptr_t tvp; +}; + +COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg) +{ + struct compat_sel_arg_struct a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), + compat_ptr(a.exp), compat_ptr(a.tvp)); +} + +static long do_compat_pselect(int n, compat_ulong_t __user *inp, + compat_ulong_t __user *outp, compat_ulong_t __user *exp, + struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask, + compat_size_t sigsetsize) +{ + compat_sigset_t ss32; + sigset_t ksigmask, sigsaved; + struct compat_timespec ts; + struct timespec end_time, *to = NULL; + int ret; + + if (tsp) { + if (copy_from_user(&ts, tsp, sizeof(ts))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + return -EINVAL; + } + + if (sigmask) { + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + return -EFAULT; + sigset_from_compat(&ksigmask, &ss32); + + sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + ret = compat_core_sys_select(n, inp, outp, exp, to); + ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret); + + if (ret == -ERESTARTNOHAND) { + /* + * Don't restore the signal mask yet. Let do_signal() deliver + * the signal on the way back to userspace, before the signal + * mask is restored. + */ + if (sigmask) { + memcpy(¤t->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } + } else if (sigmask) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + return ret; +} + +COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp, + compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, + struct compat_timespec __user *, tsp, void __user *, sig) +{ + compat_size_t sigsetsize = 0; + compat_uptr_t up = 0; + + if (sig) { + if (!access_ok(VERIFY_READ, sig, + sizeof(compat_uptr_t)+sizeof(compat_size_t)) || + __get_user(up, (compat_uptr_t __user *)sig) || + __get_user(sigsetsize, + (compat_size_t __user *)(sig+sizeof(up)))) + return -EFAULT; + } + return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up), + sigsetsize); +} + +COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, + unsigned int, nfds, struct compat_timespec __user *, tsp, + const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) +{ + compat_sigset_t ss32; + sigset_t ksigmask, sigsaved; + struct compat_timespec ts; + struct timespec end_time, *to = NULL; + int ret; + + if (tsp) { + if (copy_from_user(&ts, tsp, sizeof(ts))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + return -EINVAL; + } + + if (sigmask) { + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + return -EFAULT; + sigset_from_compat(&ksigmask, &ss32); + + sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + ret = do_sys_poll(ufds, nfds, to); + + /* We can restart this syscall, usually */ + if (ret == -EINTR) { + /* + * Don't restore the signal mask yet. Let do_signal() deliver + * the signal on the way back to userspace, before the signal + * mask is restored. + */ + if (sigmask) { + memcpy(¤t->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } + ret = -ERESTARTNOHAND; + } else if (sigmask) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret); + + return ret; +} +#endif diff --combined include/linux/skbuff.h index 53383bc,81ef53f..a098d95 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@@ -413,14 -413,15 +413,15 @@@ struct ubuf_info * the end of the header data, ie. at skb->end. */ struct skb_shared_info { + unsigned short _unused; unsigned char nr_frags; __u8 tx_flags; unsigned short gso_size; /* Warning: this field is not always filled in (UFO)! */ unsigned short gso_segs; - unsigned short gso_type; struct sk_buff *frag_list; struct skb_shared_hwtstamps hwtstamps; + unsigned int gso_type; u32 tskey; __be32 ip6_frag_id;
@@@ -491,6 -492,8 +492,8 @@@ enum SKB_GSO_TUNNEL_REMCSUM = 1 << 14,
SKB_GSO_SCTP = 1 << 15, + + SKB_GSO_ESP = 1 << 16, };
#if BITS_PER_LONG > 32 @@@ -3113,7 -3116,7 +3116,7 @@@ struct sk_buff *pskb_extract(struct sk_
static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len) { - return copy_from_iter(data, len, &msg->msg_iter) == len ? 0 : -EFAULT; + return copy_from_iter_full(data, len, &msg->msg_iter) ? 0 : -EFAULT; }
static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len) diff --combined kernel/trace/bpf_trace.c index f806dbd,8a4efac..460a031 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@@ -96,7 -96,7 +96,7 @@@ BPF_CALL_3(bpf_probe_write_user, void * if (unlikely(in_interrupt() || current->flags & (PF_KTHREAD | PF_EXITING))) return -EPERM; - if (unlikely(segment_eq(get_fs(), KERNEL_DS))) + if (unlikely(uaccess_kernel())) return -EPERM; if (!access_ok(VERIFY_WRITE, unsafe_ptr, size)) return -EPERM; @@@ -501,16 -501,11 +501,11 @@@ static bool kprobe_prog_is_valid_access return true; }
- static const struct bpf_verifier_ops kprobe_prog_ops = { + const struct bpf_verifier_ops kprobe_prog_ops = { .get_func_proto = kprobe_prog_func_proto, .is_valid_access = kprobe_prog_is_valid_access, };
- static struct bpf_prog_type_list kprobe_tl __ro_after_init = { - .ops = &kprobe_prog_ops, - .type = BPF_PROG_TYPE_KPROBE, - }; - BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, u64, flags, void *, data, u64, size) { @@@ -584,16 -579,11 +579,11 @@@ static bool tp_prog_is_valid_access(in return true; }
- static const struct bpf_verifier_ops tracepoint_prog_ops = { + const struct bpf_verifier_ops tracepoint_prog_ops = { .get_func_proto = tp_prog_func_proto, .is_valid_access = tp_prog_is_valid_access, };
- static struct bpf_prog_type_list tracepoint_tl __ro_after_init = { - .ops = &tracepoint_prog_ops, - .type = BPF_PROG_TYPE_TRACEPOINT, - }; - static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, enum bpf_reg_type *reg_type) { @@@ -642,22 -632,8 +632,8 @@@ static u32 pe_prog_convert_ctx_access(e return insn - insn_buf; }
- static const struct bpf_verifier_ops perf_event_prog_ops = { + const struct bpf_verifier_ops perf_event_prog_ops = { .get_func_proto = tp_prog_func_proto, .is_valid_access = pe_prog_is_valid_access, .convert_ctx_access = pe_prog_convert_ctx_access, }; - - static struct bpf_prog_type_list perf_event_tl __ro_after_init = { - .ops = &perf_event_prog_ops, - .type = BPF_PROG_TYPE_PERF_EVENT, - }; - - static int __init register_kprobe_prog_ops(void) - { - bpf_register_prog_type(&kprobe_tl); - bpf_register_prog_type(&tracepoint_tl); - bpf_register_prog_type(&perf_event_tl); - return 0; - } - late_initcall(register_kprobe_prog_ops); diff --combined net/core/datagram.c index d797baa,15ef9946..db1866f2 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@@ -256,8 -256,12 +256,12 @@@ struct sk_buff *__skb_try_recv_datagram }
spin_unlock_irqrestore(&queue->lock, cpu_flags); - } while (sk_can_busy_loop(sk) && - sk_busy_loop(sk, flags & MSG_DONTWAIT)); + + if (!sk_can_busy_loop(sk)) + break; + + sk_busy_loop(sk, flags & MSG_DONTWAIT); + } while (!skb_queue_empty(&sk->sk_receive_queue));
error = -EAGAIN;
@@@ -760,7 -764,7 +764,7 @@@ int skb_copy_and_csum_datagram_msg(stru
if (msg_data_left(msg) < chunk) { if (__skb_checksum_complete(skb)) - goto csum_error; + return -EINVAL; if (skb_copy_datagram_msg(skb, hlen, msg, chunk)) goto fault; } else { @@@ -768,16 -772,15 +772,16 @@@ if (skb_copy_and_csum_datagram(skb, hlen, &msg->msg_iter, chunk, &csum)) goto fault; - if (csum_fold(csum)) - goto csum_error; + + if (csum_fold(csum)) { + iov_iter_revert(&msg->msg_iter, chunk); + return -EINVAL; + } + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) netdev_rx_csum_fault(skb->dev); } return 0; -csum_error: - iov_iter_revert(&msg->msg_iter, chunk); - return -EINVAL; fault: return -EFAULT; } diff --combined net/core/dev.c index c57878b,0b2876e..d07aa5f --- a/net/core/dev.c +++ b/net/core/dev.c @@@ -95,6 -95,7 +95,7 @@@ #include <linux/notifier.h> #include <linux/skbuff.h> #include <linux/bpf.h> + #include <linux/bpf_trace.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/busy_poll.h> @@@ -2975,6 -2976,9 +2976,9 @@@ static struct sk_buff *validate_xmit_sk __skb_linearize(skb)) goto out_kfree_skb;
+ if (validate_xmit_xfrm(skb, features)) + goto out_kfree_skb; + /* If packet is not checksummed and device does not * support checksumming for this protocol, complete * checksumming here. @@@ -3444,6 -3448,7 +3448,7 @@@ EXPORT_SYMBOL(netdev_max_backlog)
int netdev_tstamp_prequeue __read_mostly = 1; int netdev_budget __read_mostly = 300; + unsigned int __read_mostly netdev_budget_usecs = 2000; int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ @@@ -4243,13 -4248,132 +4248,132 @@@ static int __netif_receive_skb(struct s */ current->flags |= PF_MEMALLOC; ret = __netif_receive_skb_core(skb, true); - tsk_restore_flags(current, pflags, PF_MEMALLOC); + current_restore_flags(pflags, PF_MEMALLOC); } else ret = __netif_receive_skb_core(skb, false);
return ret; }
+ static struct static_key generic_xdp_needed __read_mostly; + + static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp) + { + struct bpf_prog *new = xdp->prog; + int ret = 0; + + switch (xdp->command) { + case XDP_SETUP_PROG: { + struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); + + rcu_assign_pointer(dev->xdp_prog, new); + if (old) + bpf_prog_put(old); + + if (old && !new) { + static_key_slow_dec(&generic_xdp_needed); + } else if (new && !old) { + static_key_slow_inc(&generic_xdp_needed); + dev_disable_lro(dev); + } + break; + } + + case XDP_QUERY_PROG: + xdp->prog_attached = !!rcu_access_pointer(dev->xdp_prog); + break; + + default: + ret = -EINVAL; + break; + } + + return ret; + } + + static u32 netif_receive_generic_xdp(struct sk_buff *skb, + struct bpf_prog *xdp_prog) + { + struct xdp_buff xdp; + u32 act = XDP_DROP; + void *orig_data; + int hlen, off; + u32 mac_len; + + /* Reinjected packets coming from act_mirred or similar should + * not get XDP generic processing. + */ + if (skb_cloned(skb)) + return XDP_PASS; + + if (skb_linearize(skb)) + goto do_drop; + + /* The XDP program wants to see the packet starting at the MAC + * header. + */ + mac_len = skb->data - skb_mac_header(skb); + hlen = skb_headlen(skb) + mac_len; + xdp.data = skb->data - mac_len; + xdp.data_end = xdp.data + hlen; + xdp.data_hard_start = skb->data - skb_headroom(skb); + orig_data = xdp.data; + + act = bpf_prog_run_xdp(xdp_prog, &xdp); + + off = xdp.data - orig_data; + if (off > 0) + __skb_pull(skb, off); + else if (off < 0) + __skb_push(skb, -off); + + switch (act) { + case XDP_TX: + __skb_push(skb, mac_len); + /* fall through */ + case XDP_PASS: + break; + + default: + bpf_warn_invalid_xdp_action(act); + /* fall through */ + case XDP_ABORTED: + trace_xdp_exception(skb->dev, xdp_prog, act); + /* fall through */ + case XDP_DROP: + do_drop: + kfree_skb(skb); + break; + } + + return act; + } + + /* When doing generic XDP we have to bypass the qdisc layer and the + * network taps in order to match in-driver-XDP behavior. + */ + static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) + { + struct net_device *dev = skb->dev; + struct netdev_queue *txq; + bool free_skb = true; + int cpu, rc; + + txq = netdev_pick_tx(dev, skb, NULL); + cpu = smp_processor_id(); + HARD_TX_LOCK(dev, txq, cpu); + if (!netif_xmit_stopped(txq)) { + rc = netdev_start_xmit(skb, dev, txq, 0); + if (dev_xmit_complete(rc)) + free_skb = false; + } + HARD_TX_UNLOCK(dev, txq); + if (free_skb) { + trace_xdp_exception(dev, xdp_prog, XDP_TX); + kfree_skb(skb); + } + } + static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; @@@ -4261,6 -4385,21 +4385,21 @@@
rcu_read_lock();
+ if (static_key_false(&generic_xdp_needed)) { + struct bpf_prog *xdp_prog = rcu_dereference(skb->dev->xdp_prog); + + if (xdp_prog) { + u32 act = netif_receive_generic_xdp(skb, xdp_prog); + + if (act != XDP_PASS) { + rcu_read_unlock(); + if (act == XDP_TX) + generic_xdp_tx(skb, xdp_prog); + return NET_RX_DROP; + } + } + } + #ifdef CONFIG_RPS if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; @@@ -4493,7 -4632,7 +4632,7 @@@ static enum gro_result dev_gro_receive( enum gro_result ret; int grow;
- if (!(skb->dev->features & NETIF_F_GRO)) + if (netif_elide_gro(skb->dev)) goto normal;
if (skb->csum_bad) @@@ -5063,27 -5202,28 +5202,28 @@@ static void busy_poll_stop(struct napi_ do_softirq(); }
- bool sk_busy_loop(struct sock *sk, int nonblock) + void napi_busy_loop(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg) { - unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0; + unsigned long start_time = loop_end ? busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); void *have_poll_lock = NULL; struct napi_struct *napi; - int rc;
restart: - rc = false; napi_poll = NULL;
rcu_read_lock();
- napi = napi_by_id(sk->sk_napi_id); + napi = napi_by_id(napi_id); if (!napi) goto out;
preempt_disable(); for (;;) { - rc = 0; + int work = 0; + local_bh_disable(); if (!napi_poll) { unsigned long val = READ_ONCE(napi->state); @@@ -5101,16 -5241,15 +5241,15 @@@ have_poll_lock = netpoll_poll_lock(napi); napi_poll = napi->poll; } - rc = napi_poll(napi, BUSY_POLL_BUDGET); - trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); + work = napi_poll(napi, BUSY_POLL_BUDGET); + trace_napi_poll(napi, work, BUSY_POLL_BUDGET); count: - if (rc > 0) - __NET_ADD_STATS(sock_net(sk), - LINUX_MIB_BUSYPOLLRXPACKETS, rc); + if (work > 0) + __NET_ADD_STATS(dev_net(napi->dev), + LINUX_MIB_BUSYPOLLRXPACKETS, work); local_bh_enable();
- if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) || - busy_loop_timeout(end_time)) + if (!loop_end || loop_end(loop_end_arg, start_time)) break;
if (unlikely(need_resched())) { @@@ -5119,9 -5258,8 +5258,8 @@@ preempt_enable(); rcu_read_unlock(); cond_resched(); - rc = !skb_queue_empty(&sk->sk_receive_queue); - if (rc || busy_loop_timeout(end_time)) - return rc; + if (loop_end(loop_end_arg, start_time)) + return; goto restart; } cpu_relax(); @@@ -5129,12 -5267,10 +5267,10 @@@ if (napi_poll) busy_poll_stop(napi, have_poll_lock); preempt_enable(); - rc = !skb_queue_empty(&sk->sk_receive_queue); out: rcu_read_unlock(); - return rc; } - EXPORT_SYMBOL(sk_busy_loop); + EXPORT_SYMBOL(napi_busy_loop);
#endif /* CONFIG_NET_RX_BUSY_POLL */
@@@ -5146,10 -5282,10 +5282,10 @@@ static void napi_hash_add(struct napi_s
spin_lock(&napi_hash_lock);
- /* 0..NR_CPUS+1 range is reserved for sender_cpu use */ + /* 0..NR_CPUS range is reserved for sender_cpu use */ do { - if (unlikely(++napi_gen_id < NR_CPUS + 1)) - napi_gen_id = NR_CPUS + 1; + if (unlikely(++napi_gen_id < MIN_NAPI_ID)) + napi_gen_id = MIN_NAPI_ID; } while (napi_by_id(napi_gen_id)); napi->napi_id = napi_gen_id;
@@@ -5313,7 -5449,8 +5449,8 @@@ out_unlock static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); - unsigned long time_limit = jiffies + 2; + unsigned long time_limit = jiffies + + usecs_to_jiffies(netdev_budget_usecs); int budget = netdev_budget; LIST_HEAD(list); LIST_HEAD(repoll); @@@ -6717,13 -6854,16 +6854,16 @@@ EXPORT_SYMBOL(dev_change_proto_down) /** * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device + * @extack: netlink extended ack * @fd: new program fd or negative value to clear * @flags: xdp-related flags * * Set or clear a bpf program for a device */ - int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags) + int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, + int fd, u32 flags) { + int (*xdp_op)(struct net_device *dev, struct netdev_xdp *xdp); const struct net_device_ops *ops = dev->netdev_ops; struct bpf_prog *prog = NULL; struct netdev_xdp xdp; @@@ -6731,14 -6871,16 +6871,16 @@@
ASSERT_RTNL();
- if (!ops->ndo_xdp) - return -EOPNOTSUPP; + xdp_op = ops->ndo_xdp; + if (!xdp_op || (flags & XDP_FLAGS_SKB_MODE)) + xdp_op = generic_xdp_install; + if (fd >= 0) { if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) { memset(&xdp, 0, sizeof(xdp)); xdp.command = XDP_QUERY_PROG;
- err = ops->ndo_xdp(dev, &xdp); + err = xdp_op(dev, &xdp); if (err < 0) return err; if (xdp.prog_attached) @@@ -6752,9 -6894,10 +6894,10 @@@
memset(&xdp, 0, sizeof(xdp)); xdp.command = XDP_SETUP_PROG; + xdp.extack = extack; xdp.prog = prog;
- err = ops->ndo_xdp(dev, &xdp); + err = xdp_op(dev, &xdp); if (err < 0 && prog) bpf_prog_put(prog);
@@@ -7105,13 -7248,10 +7248,10 @@@ void netif_stacked_transfer_operstate(c else netif_dormant_off(dev);
- if (netif_carrier_ok(rootdev)) { - if (!netif_carrier_ok(dev)) - netif_carrier_on(dev); - } else { - if (netif_carrier_ok(dev)) - netif_carrier_off(dev); - } + if (netif_carrier_ok(rootdev)) + netif_carrier_on(dev); + else + netif_carrier_off(dev); } EXPORT_SYMBOL(netif_stacked_transfer_operstate);
@@@ -7794,6 -7934,7 +7934,7 @@@ EXPORT_SYMBOL(alloc_netdev_mqs) void free_netdev(struct net_device *dev) { struct napi_struct *p, *n; + struct bpf_prog *prog;
might_sleep(); netif_free_tx_queues(dev); @@@ -7812,6 -7953,12 +7953,12 @@@ free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL;
+ prog = rcu_dereference_protected(dev->xdp_prog, 1); + if (prog) { + bpf_prog_put(prog); + static_key_slow_dec(&generic_xdp_needed); + } + /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { netdev_freemem(dev); diff --combined net/core/skbuff.c index b782b45,58604c1..346d3e8 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@@ -1576,8 -1576,6 +1576,8 @@@ done skb_set_tail_pointer(skb, len); }
+ if (!skb->sk || skb->destructor == sock_edemux) + skb_condense(skb); return 0; } EXPORT_SYMBOL(___pskb_trim); @@@ -1982,6 -1980,7 +1982,6 @@@ int skb_splice_bits(struct sk_buff *skb .pages = pages, .partial = partial, .nr_pages_max = MAX_SKB_FRAGS, - .flags = flags, .ops = &nosteal_pipe_buf_ops, .spd_release = sock_spd_release, }; @@@ -3101,7 -3100,7 +3101,7 @@@ struct sk_buff *skb_segment(struct sk_b skb_walk_frags(head_skb, iter) { if (frag_len != iter->len && iter->next) goto normal; - if (skb_headlen(iter)) + if (skb_headlen(iter) && !iter->head_frag) goto normal;
len -= iter->len; diff --combined net/core/sock.c index b416a53,a06bb7a..b5baeb9 --- a/net/core/sock.c +++ b/net/core/sock.c @@@ -247,12 -247,66 +247,66 @@@ static const char *const af_family_kern static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = { _sock_locks("k-clock-") }; + static const char *const af_family_rlock_key_strings[AF_MAX+1] = { + "rlock-AF_UNSPEC", "rlock-AF_UNIX" , "rlock-AF_INET" , + "rlock-AF_AX25" , "rlock-AF_IPX" , "rlock-AF_APPLETALK", + "rlock-AF_NETROM", "rlock-AF_BRIDGE" , "rlock-AF_ATMPVC" , + "rlock-AF_X25" , "rlock-AF_INET6" , "rlock-AF_ROSE" , + "rlock-AF_DECnet", "rlock-AF_NETBEUI" , "rlock-AF_SECURITY" , + "rlock-AF_KEY" , "rlock-AF_NETLINK" , "rlock-AF_PACKET" , + "rlock-AF_ASH" , "rlock-AF_ECONET" , "rlock-AF_ATMSVC" , + "rlock-AF_RDS" , "rlock-AF_SNA" , "rlock-AF_IRDA" , + "rlock-AF_PPPOX" , "rlock-AF_WANPIPE" , "rlock-AF_LLC" , + "rlock-27" , "rlock-28" , "rlock-AF_CAN" , + "rlock-AF_TIPC" , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV" , + "rlock-AF_RXRPC" , "rlock-AF_ISDN" , "rlock-AF_PHONET" , + "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG" , + "rlock-AF_NFC" , "rlock-AF_VSOCK" , "rlock-AF_KCM" , + "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_MAX" + }; + static const char *const af_family_wlock_key_strings[AF_MAX+1] = { + "wlock-AF_UNSPEC", "wlock-AF_UNIX" , "wlock-AF_INET" , + "wlock-AF_AX25" , "wlock-AF_IPX" , "wlock-AF_APPLETALK", + "wlock-AF_NETROM", "wlock-AF_BRIDGE" , "wlock-AF_ATMPVC" , + "wlock-AF_X25" , "wlock-AF_INET6" , "wlock-AF_ROSE" , + "wlock-AF_DECnet", "wlock-AF_NETBEUI" , "wlock-AF_SECURITY" , + "wlock-AF_KEY" , "wlock-AF_NETLINK" , "wlock-AF_PACKET" , + "wlock-AF_ASH" , "wlock-AF_ECONET" , "wlock-AF_ATMSVC" , + "wlock-AF_RDS" , "wlock-AF_SNA" , "wlock-AF_IRDA" , + "wlock-AF_PPPOX" , "wlock-AF_WANPIPE" , "wlock-AF_LLC" , + "wlock-27" , "wlock-28" , "wlock-AF_CAN" , + "wlock-AF_TIPC" , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV" , + "wlock-AF_RXRPC" , "wlock-AF_ISDN" , "wlock-AF_PHONET" , + "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG" , + "wlock-AF_NFC" , "wlock-AF_VSOCK" , "wlock-AF_KCM" , + "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_MAX" + }; + static const char *const af_family_elock_key_strings[AF_MAX+1] = { + "elock-AF_UNSPEC", "elock-AF_UNIX" , "elock-AF_INET" , + "elock-AF_AX25" , "elock-AF_IPX" , "elock-AF_APPLETALK", + "elock-AF_NETROM", "elock-AF_BRIDGE" , "elock-AF_ATMPVC" , + "elock-AF_X25" , "elock-AF_INET6" , "elock-AF_ROSE" , + "elock-AF_DECnet", "elock-AF_NETBEUI" , "elock-AF_SECURITY" , + "elock-AF_KEY" , "elock-AF_NETLINK" , "elock-AF_PACKET" , + "elock-AF_ASH" , "elock-AF_ECONET" , "elock-AF_ATMSVC" , + "elock-AF_RDS" , "elock-AF_SNA" , "elock-AF_IRDA" , + "elock-AF_PPPOX" , "elock-AF_WANPIPE" , "elock-AF_LLC" , + "elock-27" , "elock-28" , "elock-AF_CAN" , + "elock-AF_TIPC" , "elock-AF_BLUETOOTH", "elock-AF_IUCV" , + "elock-AF_RXRPC" , "elock-AF_ISDN" , "elock-AF_PHONET" , + "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG" , + "elock-AF_NFC" , "elock-AF_VSOCK" , "elock-AF_KCM" , + "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_MAX" + };
/* - * sk_callback_lock locking rules are per-address-family, + * sk_callback_lock and sk queues locking rules are per-address-family, * so split the lock classes by using a per-AF key: */ static struct lock_class_key af_callback_keys[AF_MAX]; + static struct lock_class_key af_rlock_keys[AF_MAX]; + static struct lock_class_key af_wlock_keys[AF_MAX]; + static struct lock_class_key af_elock_keys[AF_MAX]; static struct lock_class_key af_kern_callback_keys[AF_MAX];
/* Take into consideration the size of the struct sk_buff overhead in the @@@ -325,7 -379,7 +379,7 @@@ int __sk_backlog_rcv(struct sock *sk, s
current->flags |= PF_MEMALLOC; ret = sk->sk_backlog_rcv(sk, skb); - tsk_restore_flags(current, pflags, PF_MEMALLOC); + current_restore_flags(pflags, PF_MEMALLOC);
return ret; } @@@ -1029,6 -1083,7 +1083,7 @@@ int sock_getsockopt(struct socket *sock
union { int val; + u64 val64; struct linger ling; struct timeval tm; } v; @@@ -1259,6 -1314,40 +1314,40 @@@ v.val = sk->sk_incoming_cpu; break;
+ case SO_MEMINFO: + { + u32 meminfo[SK_MEMINFO_VARS]; + + if (get_user(len, optlen)) + return -EFAULT; + + sk_get_meminfo(sk, meminfo); + + len = min_t(unsigned int, len, sizeof(meminfo)); + if (copy_to_user(optval, &meminfo, len)) + return -EFAULT; + + goto lenout; + } + + #ifdef CONFIG_NET_RX_BUSY_POLL + case SO_INCOMING_NAPI_ID: + v.val = READ_ONCE(sk->sk_napi_id); + + /* aggregate non-NAPI IDs down to 0 */ + if (v.val < MIN_NAPI_ID) + v.val = 0; + + break; + #endif + + case SO_COOKIE: + lv = sizeof(u64); + if (len < lv) + return -EINVAL; + v.val64 = sock_gen_cookie(sk); + break; + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). @@@ -1483,6 -1572,27 +1572,27 @@@ void sk_free(struct sock *sk } EXPORT_SYMBOL(sk_free);
+ static void sk_init_common(struct sock *sk) + { + skb_queue_head_init(&sk->sk_receive_queue); + skb_queue_head_init(&sk->sk_write_queue); + skb_queue_head_init(&sk->sk_error_queue); + + rwlock_init(&sk->sk_callback_lock); + lockdep_set_class_and_name(&sk->sk_receive_queue.lock, + af_rlock_keys + sk->sk_family, + af_family_rlock_key_strings[sk->sk_family]); + lockdep_set_class_and_name(&sk->sk_write_queue.lock, + af_wlock_keys + sk->sk_family, + af_family_wlock_key_strings[sk->sk_family]); + lockdep_set_class_and_name(&sk->sk_error_queue.lock, + af_elock_keys + sk->sk_family, + af_family_elock_key_strings[sk->sk_family]); + lockdep_set_class_and_name(&sk->sk_callback_lock, + af_callback_keys + sk->sk_family, + af_family_clock_key_strings[sk->sk_family]); + } + /** * sk_clone_lock - clone a socket, and lock its clone * @sk: the socket to clone @@@ -1516,13 -1626,7 +1626,7 @@@ struct sock *sk_clone_lock(const struc */ atomic_set(&newsk->sk_wmem_alloc, 1); atomic_set(&newsk->sk_omem_alloc, 0); - skb_queue_head_init(&newsk->sk_receive_queue); - skb_queue_head_init(&newsk->sk_write_queue); - - rwlock_init(&newsk->sk_callback_lock); - lockdep_set_class_and_name(&newsk->sk_callback_lock, - af_callback_keys + newsk->sk_family, - af_family_clock_key_strings[newsk->sk_family]); + sk_init_common(newsk);
newsk->sk_dst_cache = NULL; newsk->sk_dst_pending_confirm = 0; @@@ -1533,7 -1637,6 +1637,6 @@@ newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
sock_reset_flag(newsk, SOCK_DONE); - skb_queue_head_init(&newsk->sk_error_queue);
filter = rcu_dereference_protected(newsk->sk_filter, 1); if (filter != NULL) @@@ -2466,10 -2569,7 +2569,7 @@@ EXPORT_SYMBOL(sk_stop_timer)
void sock_init_data(struct socket *sock, struct sock *sk) { - skb_queue_head_init(&sk->sk_receive_queue); - skb_queue_head_init(&sk->sk_write_queue); - skb_queue_head_init(&sk->sk_error_queue); - + sk_init_common(sk); sk->sk_send_head = NULL;
init_timer(&sk->sk_timer); @@@ -2521,7 -2621,7 +2621,7 @@@ sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
- sk->sk_stamp = ktime_set(-1L, 0); + sk->sk_stamp = SK_DEFAULT_STAMP;
#ifdef CONFIG_NET_RX_BUSY_POLL sk->sk_napi_id = 0; @@@ -2802,6 -2902,21 +2902,21 @@@ void sk_common_release(struct sock *sk } EXPORT_SYMBOL(sk_common_release);
+ void sk_get_meminfo(const struct sock *sk, u32 *mem) + { + memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); + + mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); + mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf; + mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); + mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf; + mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; + mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; + mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); + mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; + mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); + } + #ifdef CONFIG_PROC_FS #define PROTO_INUSE_NR 64 /* should be enough for the first time */ struct prot_inuse { @@@ -3142,3 -3257,14 +3257,14 @@@ static int __init proto_init(void subsys_initcall(proto_init);
#endif /* PROC_FS */ + + #ifdef CONFIG_NET_RX_BUSY_POLL + bool sk_busy_loop_end(void *p, unsigned long start_time) + { + struct sock *sk = p; + + return !skb_queue_empty(&sk->sk_receive_queue) || + sk_busy_loop_timeout(sk, start_time); + } + EXPORT_SYMBOL(sk_busy_loop_end); + #endif /* CONFIG_NET_RX_BUSY_POLL */ diff --combined net/core/utils.c index 32c467c,d758880..93066bd --- a/net/core/utils.c +++ b/net/core/utils.c @@@ -26,11 -26,9 +26,11 @@@ #include <linux/percpu.h> #include <linux/init.h> #include <linux/ratelimit.h> +#include <linux/socket.h>
#include <net/sock.h> #include <net/net_ratelimit.h> +#include <net/ipv6.h>
#include <asm/byteorder.h> #include <linux/uaccess.h> @@@ -53,7 -51,7 +53,7 @@@ EXPORT_SYMBOL(net_ratelimit)
__be32 in_aton(const char *str) { - unsigned long l; + unsigned int l; unsigned int val; int i;
@@@ -302,107 -300,6 +302,107 @@@ out } EXPORT_SYMBOL(in6_pton);
+static int inet4_pton(const char *src, u16 port_num, + struct sockaddr_storage *addr) +{ + struct sockaddr_in *addr4 = (struct sockaddr_in *)addr; + int srclen = strlen(src); + + if (srclen > INET_ADDRSTRLEN) + return -EINVAL; + + if (in4_pton(src, srclen, (u8 *)&addr4->sin_addr.s_addr, + '\n', NULL) == 0) + return -EINVAL; + + addr4->sin_family = AF_INET; + addr4->sin_port = htons(port_num); + + return 0; +} + +static int inet6_pton(struct net *net, const char *src, u16 port_num, + struct sockaddr_storage *addr) +{ + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr; + const char *scope_delim; + int srclen = strlen(src); + + if (srclen > INET6_ADDRSTRLEN) + return -EINVAL; + + if (in6_pton(src, srclen, (u8 *)&addr6->sin6_addr.s6_addr, + '%', &scope_delim) == 0) + return -EINVAL; + + if (ipv6_addr_type(&addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL && + src + srclen != scope_delim && *scope_delim == '%') { + struct net_device *dev; + char scope_id[16]; + size_t scope_len = min_t(size_t, sizeof(scope_id) - 1, + src + srclen - scope_delim - 1); + + memcpy(scope_id, scope_delim + 1, scope_len); + scope_id[scope_len] = '\0'; + + dev = dev_get_by_name(net, scope_id); + if (dev) { + addr6->sin6_scope_id = dev->ifindex; + dev_put(dev); + } else if (kstrtouint(scope_id, 0, &addr6->sin6_scope_id)) { + return -EINVAL; + } + } + + addr6->sin6_family = AF_INET6; + addr6->sin6_port = htons(port_num); + + return 0; +} + +/** + * inet_pton_with_scope - convert an IPv4/IPv6 and port to socket address + * @net: net namespace (used for scope handling) + * @af: address family, AF_INET, AF_INET6 or AF_UNSPEC for either + * @src: the start of the address string + * @port: the start of the port string (or NULL for none) + * @addr: output socket address + * + * Return zero on success, return errno when any error occurs. + */ +int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af, + const char *src, const char *port, struct sockaddr_storage *addr) +{ + u16 port_num; + int ret = -EINVAL; + + if (port) { + if (kstrtou16(port, 0, &port_num)) + return -EINVAL; + } else { + port_num = 0; + } + + switch (af) { + case AF_INET: + ret = inet4_pton(src, port_num, addr); + break; + case AF_INET6: + ret = inet6_pton(net, src, port_num, addr); + break; + case AF_UNSPEC: + ret = inet4_pton(src, port_num, addr); + if (ret) + ret = inet6_pton(net, src, port_num, addr); + break; + default: + pr_err("unexpected address family %d\n", af); + }; + + return ret; +} +EXPORT_SYMBOL(inet_pton_with_scope); + void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, __be32 from, __be32 to, bool pseudohdr) { diff --combined net/ipv4/af_inet.c index 13a9a32,d1a1170..f3dad16 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@@ -1342,9 -1342,6 +1342,9 @@@ struct sk_buff **inet_gro_receive(struc
if (*(u8 *)iph != 0x45) goto out_unlock; + + if (ip_is_fragment(iph)) + goto out_unlock;
if (unlikely(ip_fast_csum((u8 *)iph, 5))) goto out_unlock; @@@ -1602,8 -1599,9 +1602,9 @@@ static const struct net_protocol igmp_p }; #endif
- static const struct net_protocol tcp_protocol = { + static struct net_protocol tcp_protocol = { .early_demux = tcp_v4_early_demux, + .early_demux_handler = tcp_v4_early_demux, .handler = tcp_v4_rcv, .err_handler = tcp_v4_err, .no_policy = 1, @@@ -1611,8 -1609,9 +1612,9 @@@ .icmp_strict_tag_validation = 1, };
- static const struct net_protocol udp_protocol = { + static struct net_protocol udp_protocol = { .early_demux = udp_v4_early_demux, + .early_demux_handler = udp_v4_early_demux, .handler = udp_rcv, .err_handler = udp_err, .no_policy = 1, @@@ -1723,6 -1722,8 +1725,8 @@@ static __net_init int inet_init_net(str net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; net->ipv4.sysctl_ip_dynaddr = 0; net->ipv4.sysctl_ip_early_demux = 1; + net->ipv4.sysctl_udp_early_demux = 1; + net->ipv4.sysctl_tcp_early_demux = 1; #ifdef CONFIG_SYSCTL net->ipv4.sysctl_ip_prot_sock = PROT_SOCK; #endif diff --combined net/ipv4/tcp_output.c index a85d863c,ffc9274..60111a0 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@@ -212,12 -212,12 +212,12 @@@ void tcp_select_initial_window(int __sp
/* If no clamp set the clamp to the max possible scaled window */ if (*window_clamp == 0) - (*window_clamp) = (65535 << 14); + (*window_clamp) = (U16_MAX << TCP_MAX_WSCALE); space = min(*window_clamp, space);
/* Quantize space offering to a multiple of mss if possible. */ if (space > mss) - space = (space / mss) * mss; + space = rounddown(space, mss);
/* NOTE: offering an initial window larger than 32767 * will break some buggy TCP stacks. If the admin tells us @@@ -234,13 -234,11 +234,11 @@@
(*rcv_wscale) = 0; if (wscale_ok) { - /* Set window scaling on max possible window - * See RFC1323 for an explanation of the limit to 14 - */ + /* Set window scaling on max possible window */ space = max_t(u32, space, sysctl_tcp_rmem[2]); space = max_t(u32, space, sysctl_rmem_max); space = min_t(u32, space, *window_clamp); - while (space > 65535 && (*rcv_wscale) < 14) { + while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) { space >>= 1; (*rcv_wscale)++; } @@@ -253,7 -251,7 +251,7 @@@ }
/* Set the clamp no higher than max representable value */ - (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp); + (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp); } EXPORT_SYMBOL(tcp_select_initial_window);
@@@ -1267,7 -1265,7 +1265,7 @@@ int tcp_fragment(struct sock *sk, struc * eventually). The difference is that pulled data not copied, but * immediately discarded. */ -static void __pskb_trim_head(struct sk_buff *skb, int len) +static int __pskb_trim_head(struct sk_buff *skb, int len) { struct skb_shared_info *shinfo; int i, k, eat; @@@ -1277,7 -1275,7 +1275,7 @@@ __skb_pull(skb, eat); len -= eat; if (!len) - return; + return 0; } eat = len; k = 0; @@@ -1303,28 -1301,23 +1301,28 @@@ skb_reset_tail_pointer(skb); skb->data_len -= len; skb->len = skb->data_len; + return len; }
/* Remove acked data from a packet in the transmit queue. */ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { + u32 delta_truesize; + if (skb_unclone(skb, GFP_ATOMIC)) return -ENOMEM;
- __pskb_trim_head(skb, len); + delta_truesize = __pskb_trim_head(skb, len);
TCP_SKB_CB(skb)->seq += len; skb->ip_summed = CHECKSUM_PARTIAL;
- skb->truesize -= len; - sk->sk_wmem_queued -= len; - sk_mem_uncharge(sk, len); - sock_set_flag(sk, SOCK_QUEUE_SHRUNK); + if (delta_truesize) { + skb->truesize -= delta_truesize; + sk->sk_wmem_queued -= delta_truesize; + sk_mem_uncharge(sk, delta_truesize); + sock_set_flag(sk, SOCK_QUEUE_SHRUNK); + }
/* Any change of skb->len requires recalculation of tso factor. */ if (tcp_skb_pcount(skb) > 1) @@@ -2566,7 -2559,6 +2564,6 @@@ u32 __tcp_select_window(struct sock *sk /* Don't do rounding if we are using window scaling, since the * scaled window will not line up with the MSS boundary anyway. */ - window = tp->rcv_wnd; if (tp->rx_opt.rcv_wscale) { window = free_space;
@@@ -2574,10 -2566,9 +2571,9 @@@ * Import case: prevent zero window announcement if * 1<<rcv_wscale > mss. */ - if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window) - window = (((window >> tp->rx_opt.rcv_wscale) + 1) - << tp->rx_opt.rcv_wscale); + window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale)); } else { + window = tp->rcv_wnd; /* Get the largest window that is a nice multiple of mss. * Window clamp already applied above. * If our current window offering is within 1 mss of the @@@ -2587,7 -2578,7 +2583,7 @@@ * is too small. */ if (window <= free_space - mss || window > free_space) - window = (free_space / mss) * mss; + window = rounddown(free_space, mss); else if (mss == full_space && free_space > window + (full_space >> 1)) window = free_space; diff --combined net/tipc/socket.c index bdce99f,7e45ef9..0d4f2f4 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@@ -51,6 -51,7 +51,7 @@@ #define TIPC_FWD_MSG 1 #define TIPC_MAX_PORT 0xffffffff #define TIPC_MIN_PORT 1 + #define TIPC_ACK_RATE 4 /* ACK at 1/4 of of rcv window size */
enum { TIPC_LISTEN = TCP_LISTEN, @@@ -866,14 -867,6 +867,14 @@@ static void tipc_sk_proto_rcv(struct ti if (!tsk_peer_msg(tsk, hdr)) goto exit;
+ if (unlikely(msg_errcode(hdr))) { + tipc_set_sk_state(sk, TIPC_DISCONNECTING); + tipc_node_remove_conn(sock_net(sk), tsk_peer_node(tsk), + tsk_peer_port(tsk)); + sk->sk_state_change(sk); + goto exit; + } + tsk->probe_unacked = false;
if (mtyp == CONN_PROBE) { @@@ -1267,10 -1260,7 +1268,10 @@@ static int tipc_wait_for_rcvmsg(struct struct sock *sk = sock->sk; DEFINE_WAIT(wait); long timeo = *timeop; - int err; + int err = sock_error(sk); + + if (err) + return err;
for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); @@@ -1292,10 -1282,6 +1293,10 @@@ err = sock_intr_errno(timeo); if (signal_pending(current)) break; + + err = sock_error(sk); + if (err) + break; } finish_wait(sk_sleep(sk), &wait); *timeop = timeo; @@@ -1305,7 -1291,7 +1306,7 @@@ /** * tipc_recvmsg - receive packet-oriented message * @m: descriptor for message info - * @buf_len: total size of user buffer area + * @buflen: length of user buffer area * @flags: receive flags * * Used for SOCK_DGRAM, SOCK_RDM, and SOCK_SEQPACKET messages. @@@ -1313,95 -1299,85 +1314,85 @@@ * * Returns size of returned message data, errno otherwise */ - static int tipc_recvmsg(struct socket *sock, struct msghdr *m, size_t buf_len, - int flags) + static int tipc_recvmsg(struct socket *sock, struct msghdr *m, + size_t buflen, int flags) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - struct sk_buff *buf; - struct tipc_msg *msg; - bool is_connectionless = tipc_sk_type_connectionless(sk); - long timeo; - unsigned int sz; - u32 err; - int res, hlen; + struct sk_buff *skb; + struct tipc_msg *hdr; + bool connected = !tipc_sk_type_connectionless(sk); + int rc, err, hlen, dlen, copy; + long timeout;
/* Catch invalid receive requests */ - if (unlikely(!buf_len)) + if (unlikely(!buflen)) return -EINVAL;
lock_sock(sk); - - if (!is_connectionless && unlikely(sk->sk_state == TIPC_OPEN)) { - res = -ENOTCONN; + if (unlikely(connected && sk->sk_state == TIPC_OPEN)) { + rc = -ENOTCONN; goto exit; } + timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
- timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - restart: - - /* Look for a message in receive queue; wait if necessary */ - res = tipc_wait_for_rcvmsg(sock, &timeo); - if (res) - goto exit; - - /* Look at first message in receive queue */ - buf = skb_peek(&sk->sk_receive_queue); - msg = buf_msg(buf); - sz = msg_data_sz(msg); - hlen = msg_hdr_sz(msg); - err = msg_errcode(msg); - - /* Discard an empty non-errored message & try again */ - if ((!sz) && (!err)) { + do { + /* Look at first msg in receive queue; wait if necessary */ + rc = tipc_wait_for_rcvmsg(sock, &timeout); + if (unlikely(rc)) + goto exit; + skb = skb_peek(&sk->sk_receive_queue); + hdr = buf_msg(skb); + dlen = msg_data_sz(hdr); + hlen = msg_hdr_sz(hdr); + err = msg_errcode(hdr); + if (likely(dlen || err)) + break; tsk_advance_rx_queue(sk); - goto restart; - } - - /* Capture sender's address (optional) */ - set_orig_addr(m, msg); + } while (1);
- /* Capture ancillary data (optional) */ - res = tipc_sk_anc_data_recv(m, msg, tsk); - if (res) + /* Collect msg meta data, including error code and rejected data */ + set_orig_addr(m, hdr); + rc = tipc_sk_anc_data_recv(m, hdr, tsk); + if (unlikely(rc)) goto exit;
- /* Capture message data (if valid) & compute return value (always) */ - if (!err) { - if (unlikely(buf_len < sz)) { - sz = buf_len; + /* Capture data if non-error msg, otherwise just set return value */ + if (likely(!err)) { + copy = min_t(int, dlen, buflen); + if (unlikely(copy != dlen)) m->msg_flags |= MSG_TRUNC; - } - res = skb_copy_datagram_msg(buf, hlen, m, sz); - if (res) - goto exit; - res = sz; + rc = skb_copy_datagram_msg(skb, hlen, m, copy); } else { - if (is_connectionless || err == TIPC_CONN_SHUTDOWN || - m->msg_control) - res = 0; - else - res = -ECONNRESET; + copy = 0; + rc = 0; + if (err != TIPC_CONN_SHUTDOWN && connected && !m->msg_control) + rc = -ECONNRESET; } + if (unlikely(rc)) + goto exit;
+ /* Caption of data or error code/rejected data was successful */ if (unlikely(flags & MSG_PEEK)) goto exit;
- if (likely(!is_connectionless)) { - tsk->rcv_unacked += tsk_inc(tsk, hlen + sz); - if (unlikely(tsk->rcv_unacked >= (tsk->rcv_win / 4))) - tipc_sk_send_ack(tsk); - } tsk_advance_rx_queue(sk); + if (likely(!connected)) + goto exit; + + /* Send connection flow control ack when applicable */ + tsk->rcv_unacked += tsk_inc(tsk, hlen + dlen); + if (tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE) + tipc_sk_send_ack(tsk); exit: release_sock(sk); - return res; + return rc ? rc : copy; }
/** - * tipc_recv_stream - receive stream-oriented data + * tipc_recvstream - receive stream-oriented data * @m: descriptor for message info - * @buf_len: total size of user buffer area + * @buflen: total size of user buffer area * @flags: receive flags * * Used for SOCK_STREAM messages only. If not enough data is available @@@ -1409,111 -1385,98 +1400,98 @@@ * * Returns size of returned message data, errno otherwise */ - static int tipc_recv_stream(struct socket *sock, struct msghdr *m, - size_t buf_len, int flags) + static int tipc_recvstream(struct socket *sock, struct msghdr *m, + size_t buflen, int flags) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - struct sk_buff *buf; - struct tipc_msg *msg; - long timeo; - unsigned int sz; - int target; - int sz_copied = 0; - u32 err; - int res = 0, hlen; + struct sk_buff *skb; + struct tipc_msg *hdr; + struct tipc_skb_cb *skb_cb; + bool peek = flags & MSG_PEEK; + int offset, required, copy, copied = 0; + int hlen, dlen, err, rc; + long timeout;
/* Catch invalid receive attempts */ - if (unlikely(!buf_len)) + if (unlikely(!buflen)) return -EINVAL;
lock_sock(sk);
if (unlikely(sk->sk_state == TIPC_OPEN)) { - res = -ENOTCONN; - goto exit; - } - - target = sock_rcvlowat(sk, flags & MSG_WAITALL, buf_len); - timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - - restart: - /* Look for a message in receive queue; wait if necessary */ - res = tipc_wait_for_rcvmsg(sock, &timeo); - if (res) + rc = -ENOTCONN; goto exit; - - /* Look at first message in receive queue */ - buf = skb_peek(&sk->sk_receive_queue); - msg = buf_msg(buf); - sz = msg_data_sz(msg); - hlen = msg_hdr_sz(msg); - err = msg_errcode(msg); - - /* Discard an empty non-errored message & try again */ - if ((!sz) && (!err)) { - tsk_advance_rx_queue(sk); - goto restart; } + required = sock_rcvlowat(sk, flags & MSG_WAITALL, buflen); + timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
- /* Optionally capture sender's address & ancillary data of first msg */ - if (sz_copied == 0) { - set_orig_addr(m, msg); - res = tipc_sk_anc_data_recv(m, msg, tsk); - if (res) - goto exit; - } - - /* Capture message data (if valid) & compute return value (always) */ - if (!err) { - u32 offset = TIPC_SKB_CB(buf)->bytes_read; - u32 needed; - int sz_to_copy; - - sz -= offset; - needed = (buf_len - sz_copied); - sz_to_copy = min(sz, needed); - - res = skb_copy_datagram_msg(buf, hlen + offset, m, sz_to_copy); - if (res) - goto exit; + do { + /* Look at first msg in receive queue; wait if necessary */ + rc = tipc_wait_for_rcvmsg(sock, &timeout); + if (unlikely(rc)) + break; + skb = skb_peek(&sk->sk_receive_queue); + skb_cb = TIPC_SKB_CB(skb); + hdr = buf_msg(skb); + dlen = msg_data_sz(hdr); + hlen = msg_hdr_sz(hdr); + err = msg_errcode(hdr); + + /* Discard any empty non-errored (SYN-) message */ + if (unlikely(!dlen && !err)) { + tsk_advance_rx_queue(sk); + continue; + }
- sz_copied += sz_to_copy; + /* Collect msg meta data, incl. error code and rejected data */ + if (!copied) { + set_orig_addr(m, hdr); + rc = tipc_sk_anc_data_recv(m, hdr, tsk); + if (rc) + break; + }
- if (sz_to_copy < sz) { - if (!(flags & MSG_PEEK)) - TIPC_SKB_CB(buf)->bytes_read = - offset + sz_to_copy; - goto exit; + /* Copy data if msg ok, otherwise return error/partial data */ + if (likely(!err)) { + offset = skb_cb->bytes_read; + copy = min_t(int, dlen - offset, buflen - copied); + rc = skb_copy_datagram_msg(skb, hlen + offset, m, copy); + if (unlikely(rc)) + break; + copied += copy; + offset += copy; + if (unlikely(offset < dlen)) { + if (!peek) + skb_cb->bytes_read = offset; + break; + } + } else { + rc = 0; + if ((err != TIPC_CONN_SHUTDOWN) && !m->msg_control) + rc = -ECONNRESET; + if (copied || rc) + break; } - } else { - if (sz_copied != 0) - goto exit; /* can't add error msg to valid data */
- if ((err == TIPC_CONN_SHUTDOWN) || m->msg_control) - res = 0; - else - res = -ECONNRESET; - } + if (unlikely(peek)) + break;
- if (unlikely(flags & MSG_PEEK)) - goto exit; + tsk_advance_rx_queue(sk);
- tsk->rcv_unacked += tsk_inc(tsk, hlen + msg_data_sz(msg)); - if (unlikely(tsk->rcv_unacked >= (tsk->rcv_win / 4))) - tipc_sk_send_ack(tsk); - tsk_advance_rx_queue(sk); + /* Send connection flow control advertisement when applicable */ + tsk->rcv_unacked += tsk_inc(tsk, hlen + dlen); + if (unlikely(tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE)) + tipc_sk_send_ack(tsk);
- /* Loop around if more data is required */ - if ((sz_copied < buf_len) && /* didn't get all requested data */ - (!skb_queue_empty(&sk->sk_receive_queue) || - (sz_copied < target)) && /* and more is ready or required */ - (!err)) /* and haven't reached a FIN */ - goto restart; + /* Exit if all requested data or FIN/error received */ + if (copied == buflen || err) + break;
+ } while (!skb_queue_empty(&sk->sk_receive_queue) || copied < required); exit: release_sock(sk); - return sz_copied ? sz_copied : res; + return copied ? copied : rc; }
/** @@@ -1566,8 -1529,6 +1544,8 @@@ static bool filter_connect(struct tipc_ struct sock *sk = &tsk->sk; struct net *net = sock_net(sk); struct tipc_msg *hdr = buf_msg(skb); + u32 pport = msg_origport(hdr); + u32 pnode = msg_orignode(hdr);
if (unlikely(msg_mcast(hdr))) return false; @@@ -1575,28 -1536,18 +1553,28 @@@ switch (sk->sk_state) { case TIPC_CONNECTING: /* Accept only ACK or NACK message */ - if (unlikely(!msg_connected(hdr))) - return false; + if (unlikely(!msg_connected(hdr))) { + if (pport != tsk_peer_port(tsk) || + pnode != tsk_peer_node(tsk)) + return false; + + tipc_set_sk_state(sk, TIPC_DISCONNECTING); + sk->sk_err = ECONNREFUSED; + sk->sk_state_change(sk); + return true; + }
if (unlikely(msg_errcode(hdr))) { tipc_set_sk_state(sk, TIPC_DISCONNECTING); sk->sk_err = ECONNREFUSED; + sk->sk_state_change(sk); return true; }
if (unlikely(!msg_isdata(hdr))) { tipc_set_sk_state(sk, TIPC_DISCONNECTING); sk->sk_err = EINVAL; + sk->sk_state_change(sk); return true; }
@@@ -1608,7 -1559,8 +1586,7 @@@ return true;
/* If empty 'ACK-' message, wake up sleeping connect() */ - if (waitqueue_active(sk_sleep(sk))) - wake_up_interruptible(sk_sleep(sk)); + sk->sk_data_ready(sk);
/* 'ACK-' message is neither accepted nor rejected: */ msg_set_dest_droppable(hdr, 1); @@@ -2537,6 -2489,28 +2515,28 @@@ static int tipc_ioctl(struct socket *so } }
+ static int tipc_socketpair(struct socket *sock1, struct socket *sock2) + { + struct tipc_sock *tsk2 = tipc_sk(sock2->sk); + struct tipc_sock *tsk1 = tipc_sk(sock1->sk); + u32 onode = tipc_own_addr(sock_net(sock1->sk)); + + tsk1->peer.family = AF_TIPC; + tsk1->peer.addrtype = TIPC_ADDR_ID; + tsk1->peer.scope = TIPC_NODE_SCOPE; + tsk1->peer.addr.id.ref = tsk2->portid; + tsk1->peer.addr.id.node = onode; + tsk2->peer.family = AF_TIPC; + tsk2->peer.addrtype = TIPC_ADDR_ID; + tsk2->peer.scope = TIPC_NODE_SCOPE; + tsk2->peer.addr.id.ref = tsk1->portid; + tsk2->peer.addr.id.node = onode; + + tipc_sk_finish_conn(tsk1, tsk2->portid, onode); + tipc_sk_finish_conn(tsk2, tsk1->portid, onode); + return 0; + } + /* Protocol switches for the various types of TIPC sockets */
static const struct proto_ops msg_ops = { @@@ -2545,7 -2519,7 +2545,7 @@@ .release = tipc_release, .bind = tipc_bind, .connect = tipc_connect, - .socketpair = sock_no_socketpair, + .socketpair = tipc_socketpair, .accept = sock_no_accept, .getname = tipc_getname, .poll = tipc_poll, @@@ -2566,7 -2540,7 +2566,7 @@@ static const struct proto_ops packet_op .release = tipc_release, .bind = tipc_bind, .connect = tipc_connect, - .socketpair = sock_no_socketpair, + .socketpair = tipc_socketpair, .accept = tipc_accept, .getname = tipc_getname, .poll = tipc_poll, @@@ -2587,7 -2561,7 +2587,7 @@@ static const struct proto_ops stream_op .release = tipc_release, .bind = tipc_bind, .connect = tipc_connect, - .socketpair = sock_no_socketpair, + .socketpair = tipc_socketpair, .accept = tipc_accept, .getname = tipc_getname, .poll = tipc_poll, @@@ -2597,7 -2571,7 +2597,7 @@@ .setsockopt = tipc_setsockopt, .getsockopt = tipc_getsockopt, .sendmsg = tipc_sendstream, - .recvmsg = tipc_recv_stream, + .recvmsg = tipc_recvstream, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage }; @@@ -2870,7 -2844,7 +2870,7 @@@ int tipc_nl_publ_dump(struct sk_buff *s
err = nla_parse_nested(sock, TIPC_NLA_SOCK_MAX, attrs[TIPC_NLA_SOCK], - tipc_nl_sock_policy); + tipc_nl_sock_policy, NULL); if (err) return err;
diff --combined net/xfrm/xfrm_input.c index e23570b,21c6cc9..9de4b1d --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@@ -107,6 -107,8 +107,8 @@@ struct sec_path *secpath_dup(struct sec sp->len = 0; sp->olen = 0;
+ memset(sp->ovec, 0, sizeof(sp->ovec[XFRM_MAX_OFFLOAD_DEPTH])); + if (src) { int i;
@@@ -207,8 -209,9 +209,9 @@@ int xfrm_input(struct sk_buff *skb, in unsigned int family; int decaps = 0; int async = 0; - struct xfrm_offload *xo; bool xfrm_gro = false; + bool crypto_done = false; + struct xfrm_offload *xo = xfrm_offload(skb);
if (encap_type < 0) { x = xfrm_input_state(skb); @@@ -220,9 -223,40 +223,40 @@@ seq = XFRM_SKB_CB(skb)->seq.input.low; goto resume; } + /* encap_type < -1 indicates a GRO call. */ encap_type = 0; seq = XFRM_SPI_SKB_CB(skb)->seq; + + if (xo && (xo->flags & CRYPTO_DONE)) { + crypto_done = true; + x = xfrm_input_state(skb); + family = XFRM_SPI_SKB_CB(skb)->family; + + if (!(xo->status & CRYPTO_SUCCESS)) { + if (xo->status & + (CRYPTO_TRANSPORT_AH_AUTH_FAILED | + CRYPTO_TRANSPORT_ESP_AUTH_FAILED | + CRYPTO_TUNNEL_AH_AUTH_FAILED | + CRYPTO_TUNNEL_ESP_AUTH_FAILED)) { + + xfrm_audit_state_icvfail(x, skb, + x->type->proto); + x->stats.integrity_failed++; + XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEPROTOERROR); + goto drop; + } + + XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); + goto drop; + } + + if ((err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); + goto drop; + } + } + goto lock; }
@@@ -311,7 -345,10 +345,10 @@@ lock skb_dst_force(skb); dev_hold(skb->dev);
- nexthdr = x->type->input(x, skb); + if (crypto_done) + nexthdr = x->type_offload->input_tail(x, skb); + else + nexthdr = x->type->input(x, skb);
if (nexthdr == -EINPROGRESS) return 0; @@@ -395,7 -432,7 +432,7 @@@ resume if (xo) xfrm_gro = xo->flags & XFRM_GRO;
- err = x->inner_mode->afinfo->transport_finish(skb, async); + err = x->inner_mode->afinfo->transport_finish(skb, xfrm_gro || async); if (xfrm_gro) { skb_dst_drop(skb); gro_cells_receive(&gro_cells, skb); diff --combined net/xfrm/xfrm_policy.c index dfc77b9,dd44ddc..b00a1d5 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@@ -116,11 -116,10 +116,10 @@@ static const struct xfrm_policy_afinfo return afinfo; }
- static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, - int tos, int oif, - const xfrm_address_t *saddr, - const xfrm_address_t *daddr, - int family) + struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, + const xfrm_address_t *saddr, + const xfrm_address_t *daddr, + int family) { const struct xfrm_policy_afinfo *afinfo; struct dst_entry *dst; @@@ -135,6 -134,7 +134,7 @@@
return dst; } + EXPORT_SYMBOL(__xfrm_dst_lookup);
static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos, int oif, @@@ -1006,10 -1006,6 +1006,10 @@@ int xfrm_policy_flush(struct net *net, err = -ESRCH; out: spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + + if (cnt) + xfrm_garbage_collect(net); + return err; } EXPORT_SYMBOL(xfrm_policy_flush); @@@ -2933,21 -2929,6 +2933,6 @@@ void xfrm_policy_unregister_afinfo(cons } EXPORT_SYMBOL(xfrm_policy_unregister_afinfo);
- static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr) - { - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - - switch (event) { - case NETDEV_DOWN: - xfrm_garbage_collect(dev_net(dev)); - } - return NOTIFY_DONE; - } - - static struct notifier_block xfrm_dev_notifier = { - .notifier_call = xfrm_dev_event, - }; - #ifdef CONFIG_XFRM_STATISTICS static int __net_init xfrm_statistics_init(struct net *net) { @@@ -3024,7 -3005,7 +3009,7 @@@ static int __net_init xfrm_policy_init( INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize); INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild); if (net_eq(net, &init_net)) - register_netdevice_notifier(&xfrm_dev_notifier); + xfrm_dev_init(); return 0;
out_bydst: diff --combined net/xfrm/xfrm_user.c index 7916af0,c4ccedd..38614df --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@@ -55,7 -55,7 +55,7 @@@ static int verify_one_alg(struct nlatt return -EINVAL; }
- algp->alg_name[CRYPTO_MAX_ALG_NAME - 1] = '\0'; + algp->alg_name[sizeof(algp->alg_name) - 1] = '\0'; return 0; }
@@@ -71,7 -71,7 +71,7 @@@ static int verify_auth_trunc(struct nla if (nla_len(rt) < xfrm_alg_auth_len(algp)) return -EINVAL;
- algp->alg_name[CRYPTO_MAX_ALG_NAME - 1] = '\0'; + algp->alg_name[sizeof(algp->alg_name) - 1] = '\0'; return 0; }
@@@ -87,7 -87,7 +87,7 @@@ static int verify_aead(struct nlattr ** if (nla_len(rt) < aead_len(algp)) return -EINVAL;
- algp->alg_name[CRYPTO_MAX_ALG_NAME - 1] = '\0'; + algp->alg_name[sizeof(algp->alg_name) - 1] = '\0'; return 0; }
@@@ -595,6 -595,13 +595,13 @@@ static struct xfrm_state *xfrm_state_co goto error; }
+ if (attrs[XFRMA_OFFLOAD_DEV]) { + err = xfrm_dev_state_add(net, x, + nla_data(attrs[XFRMA_OFFLOAD_DEV])); + if (err) + goto error; + } + if ((err = xfrm_alloc_replay_state_esn(&x->replay_esn, &x->preplay_esn, attrs[XFRMA_REPLAY_ESN_VAL]))) goto error; @@@ -779,6 -786,23 +786,23 @@@ static int copy_sec_ctx(struct xfrm_sec return 0; }
+ static int copy_user_offload(struct xfrm_state_offload *xso, struct sk_buff *skb) + { + struct xfrm_user_offload *xuo; + struct nlattr *attr; + + attr = nla_reserve(skb, XFRMA_OFFLOAD_DEV, sizeof(*xuo)); + if (attr == NULL) + return -EMSGSIZE; + + xuo = nla_data(attr); + + xuo->ifindex = xso->dev->ifindex; + xuo->flags = xso->flags; + + return 0; + } + static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb) { struct xfrm_algo *algo; @@@ -869,6 -893,10 +893,10 @@@ static int copy_to_user_state_extra(str &x->replay); if (ret) goto out; + if(x->xso.dev) + ret = copy_user_offload(&x->xso, skb); + if (ret) + goto out; if (x->security) ret = copy_sec_ctx(x->security, skb); out: @@@ -932,8 -960,8 +960,8 @@@ static int xfrm_dump_sa(struct sk_buff u8 proto = 0; int err;
- err = nlmsg_parse(cb->nlh, 0, attrs, XFRMA_MAX, - xfrma_policy); + err = nlmsg_parse(cb->nlh, 0, attrs, XFRMA_MAX, xfrma_policy, + NULL); if (err < 0) return err;
@@@ -2406,6 -2434,7 +2434,7 @@@ static const struct nla_policy xfrma_po [XFRMA_SA_EXTRA_FLAGS] = { .type = NLA_U32 }, [XFRMA_PROTO] = { .type = NLA_U8 }, [XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) }, + [XFRMA_OFFLOAD_DEV] = { .len = sizeof(struct xfrm_user_offload) }, };
static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = { @@@ -2448,7 -2477,8 +2477,8 @@@ static const struct xfrm_link [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_spdinfo }, };
- static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) + static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *attrs[XFRMA_MAX+1]; @@@ -2488,7 -2518,7 +2518,7 @@@
err = nlmsg_parse(nlh, xfrm_msg_min[type], attrs, link->nla_max ? : XFRMA_MAX, - link->nla_pol ? : xfrma_policy); + link->nla_pol ? : xfrma_policy, extack); if (err < 0) return err;
@@@ -2622,6 -2652,8 +2652,8 @@@ static inline size_t xfrm_sa_len(struc l += nla_total_size(sizeof(*x->coaddr)); if (x->props.extra_flags) l += nla_total_size(sizeof(x->props.extra_flags)); + if (x->xso.dev) + l += nla_total_size(sizeof(x->xso));
/* Must count x->lastused as it may become non-zero behind our back. */ l += nla_total_size_64bit(sizeof(u64)); @@@ -3108,7 -3140,6 +3140,6 @@@ static bool xfrm_is_alive(const struct }
static struct xfrm_mgr netlink_mgr = { - .id = "netlink", .notify = xfrm_send_state_notify, .acquire = xfrm_send_acquire, .compile_policy = xfrm_compile_policy,
linux-merge@lists.open-mesh.org