The following commit has been merged in the master branch:

commit bf063cd286fd84ec423410dc60629480e4e6d0b9
Merge: 043140984aa381157f8d47d2554fbe79213ac6f0 a54df682e559da9cf09b41779ee62bc9f11d3804
Author: Stephen Rothwell <sfr@canb.auug.org.au>
Date:   Mon Aug 7 12:00:26 2017 +1000
Merge remote-tracking branch 'net-next/master'
diff --combined MAINTAINERS
index 6aab3b7172c2,a515da73c7e4..433f3cf91390
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@@ -301,7 -301,6 +301,7 @@@ S: Supporte
F: drivers/acpi/
F: drivers/pnp/pnpacpi/
F: include/linux/acpi.h
+F: include/linux/fwnode.h
F: include/acpi/
F: Documentation/acpi/
F: Documentation/ABI/testing/sysfs-bus-acpi
@@@ -311,14 -310,6 +311,14 @@@ F: drivers/pci/*/*acpi
F: drivers/pci/*/*/*acpi*
F: tools/power/acpi/
+ACPI APEI +M: "Rafael J. Wysocki" rjw@rjwysocki.net +M: Len Brown lenb@kernel.org +L: linux-acpi@vger.kernel.org +R: Tony Luck tony.luck@intel.com +R: Borislav Petkov bp@alien8.de +F: drivers/acpi/apei/ + ACPI COMPONENT ARCHITECTURE (ACPICA) M: Robert Moore robert.moore@intel.com M: Lv Zheng lv.zheng@intel.com @@@ -778,12 -769,6 +778,12 @@@ W: http://ez.analog.com/community/linux S: Supported F: drivers/media/i2c/adv7180.c
+ANALOG DEVICES INC ADV748X DRIVER
+M: Kieran Bingham <kieran.bingham@ideasonboard.com>
+L: linux-media@vger.kernel.org
+S: Maintained
+F: drivers/media/i2c/adv748x/*
+
ANALOG DEVICES INC ADV7511 DRIVER
M: Hans Verkuil <hans.verkuil@cisco.com>
L: linux-media@vger.kernel.org
@@@ -2492,7 -2477,7 +2492,7 @@@ Q: https://patchwork.open-mesh.org/proj
S: Maintained
F: Documentation/ABI/testing/sysfs-class-net-batman-adv
F: Documentation/ABI/testing/sysfs-class-net-mesh
- F: Documentation/networking/batman-adv.txt
+ F: Documentation/networking/batman-adv.rst
F: include/uapi/linux/batman_adv.h
F: net/batman-adv/
@@@ -5116,6 -5101,7 +5116,7 @@@ F: include/linux/of_net.
F: include/linux/phy.h
F: include/linux/phy_fixed.h
F: include/linux/platform_data/mdio-gpio.h
+ F: include/linux/platform_data/mdio-bcm-unimac.h
F: include/trace/events/mdio.h
F: include/uapi/linux/mdio.h
F: include/uapi/linux/mii.h
@@@ -6162,6 -6148,14 +6163,14 @@@ S: Maintaine
F: drivers/net/ethernet/hisilicon/
F: Documentation/devicetree/bindings/net/hisilicon*.txt
+ HISILICON NETWORK SUBSYSTEM 3 DRIVER (HNS3)
+ M: Yisen Zhuang <yisen.zhuang@huawei.com>
+ M: Salil Mehta <salil.mehta@huawei.com>
+ L: netdev@vger.kernel.org
+ W: http://www.hisilicon.com
+ S: Maintained
+ F: drivers/net/ethernet/hisilicon/hns3/
+
HISILICON ROCE DRIVER
M: Lijun Ou <oulijun@huawei.com>
M: Wei Hu(Xavier) <xavier.huwei@huawei.com>
@@@ -6272,6 -6266,7 +6281,7 @@@ M: Haiyang Zhang <haiyangz@microsoft.co
M: Stephen Hemminger <sthemmin@microsoft.com>
L: devel@linuxdriverproject.org
S: Maintained
+ F: Documentation/networking/netvsc.txt
F: arch/x86/include/asm/mshyperv.h
F: arch/x86/include/uapi/asm/hyperv.h
F: arch/x86/kernel/cpu/mshyperv.c
@@@ -8414,22 -8409,6 +8424,22 @@@ S: Supporte
F: Documentation/devicetree/bindings/media/renesas,vsp1.txt
F: drivers/media/platform/vsp1/
+MEDIA DRIVERS FOR ST STV0910 DEMODULATOR ICs
+M: Daniel Scheller <d.scheller.oss@gmail.com>
+L: linux-media@vger.kernel.org
+W: https://linuxtv.org
+T: git git://linuxtv.org/media_tree.git
+S: Maintained
+F: drivers/media/dvb-frontends/stv0910*
+
+MEDIA DRIVERS FOR ST STV6111 TUNER ICs
+M: Daniel Scheller <d.scheller.oss@gmail.com>
+L: linux-media@vger.kernel.org
+W: https://linuxtv.org
+T: git git://linuxtv.org/media_tree.git
+S: Maintained
+F: drivers/media/dvb-frontends/stv6111*
+
MEDIA INPUT INFRASTRUCTURE (V4L/DVB)
M: Mauro Carvalho Chehab <mchehab@s-opensource.com>
M: Mauro Carvalho Chehab <mchehab@kernel.org>
@@@ -8455,7 -8434,9 +8465,9 @@@ F: include/uapi/linux/uvcvideo.
MEDIATEK ETHERNET DRIVER
M: Felix Fietkau <nbd@openwrt.org>
- M: John Crispin <blogic@openwrt.org>
+ M: John Crispin <john@phrozen.org>
+ M: Sean Wang <sean.wang@mediatek.com>
+ M: Nelson Chang <nelson.chang@mediatek.com>
L: netdev@vger.kernel.org
S: Maintained
F: drivers/net/ethernet/mediatek/
@@@ -9740,7 -9721,7 +9752,7 @@@ S: Maintaine
F: drivers/media/i2c/ov5640.c
OMNIVISION OV5647 SENSOR DRIVER
-M: Ramiro Oliveira <roliveir@synopsys.com>
+M: Luis Oliveira <lolivei@synopsys.com>
L: linux-media@vger.kernel.org
T: git git://linuxtv.org/media_tree.git
S: Maintained
@@@ -10738,7 -10719,6 +10750,7 @@@ L: linux-media@vger.kernel.or
T: git git://linuxtv.org/media_tree.git
S: Maintained
F: drivers/media/usb/pulse8-cec/*
+F: Documentation/media/cec-drivers/pulse8-cec.rst
PVRUSB2 VIDEO4LINUX DRIVER
M: Mike Isely <isely@pobox.com>
@@@ -12639,12 -12619,6 +12651,12 @@@ F: drivers/clocksource/arc_timer.
F: drivers/tty/serial/arc_uart.c
T: git git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc.git
+SYNOPSYS ARC SDP clock driver
+M: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
+S: Supported
+F: drivers/clk/axs10x/*
+F: Documentation/devicetree/bindings/clock/snps,pll-clock.txt
+
SYNOPSYS ARC SDP platform support
M: Alexey Brodkin <abrodkin@synopsys.com>
S: Supported
@@@ -12681,13 -12655,6 +12693,13 @@@ L: linux-mmc@vger.kernel.or
S: Maintained
F: drivers/mmc/host/dw_mmc*
+SYNOPSYS HSDK RESET CONTROLLER DRIVER
+M: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
+S: Supported
+F: drivers/reset/reset-hsdk-v1.c
+F: include/dt-bindings/reset/snps,hsdk-v1-reset.h
+F: Documentation/devicetree/bindings/reset/snps,hsdk-v1-reset.txt
+
SYSTEM CONFIGURATION (SYSCON)
M: Lee Jones <lee.jones@linaro.org>
M: Arnd Bergmann <arnd@arndb.de>
@@@ -14263,12 -14230,6 +14275,12 @@@ F: drivers/watchdog
F: include/linux/watchdog.h
F: include/uapi/linux/watchdog.h
+WHISKEYCOVE PMIC GPIO DRIVER
+M: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
+L: linux-gpio@vger.kernel.org
+S: Maintained
+F: drivers/gpio/gpio-wcove.c
+
WIIMOTE HID DRIVER
M: David Herrmann <dh.herrmann@googlemail.com>
L: linux-input@vger.kernel.org

diff --combined drivers/net/ethernet/ibm/ibmvnic.c
index c45e8e3b82d3,5932160eb815..74f44133b5f8
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@@ -111,7 -111,6 +111,7 @@@ static void send_request_map(struct ibm static void send_request_unmap(struct ibmvnic_adapter *, u8); static void send_login(struct ibmvnic_adapter *adapter); static void send_cap_queries(struct ibmvnic_adapter *adapter); +static int init_sub_crqs(struct ibmvnic_adapter *); static int init_sub_crq_irqs(struct ibmvnic_adapter *adapter); static int ibmvnic_init(struct ibmvnic_adapter *); static void release_crq_queue(struct ibmvnic_adapter *); @@@ -347,6 -346,31 +347,31 @@@ static void replenish_pools(struct ibmv } }
+ static void release_stats_buffers(struct ibmvnic_adapter *adapter) + { + kfree(adapter->tx_stats_buffers); + kfree(adapter->rx_stats_buffers); + } + + static int init_stats_buffers(struct ibmvnic_adapter *adapter) + { + adapter->tx_stats_buffers = + kcalloc(adapter->req_tx_queues, + sizeof(struct ibmvnic_tx_queue_stats), + GFP_KERNEL); + if (!adapter->tx_stats_buffers) + return -ENOMEM; + + adapter->rx_stats_buffers = + kcalloc(adapter->req_rx_queues, + sizeof(struct ibmvnic_rx_queue_stats), + GFP_KERNEL); + if (!adapter->rx_stats_buffers) + return -ENOMEM; + + return 0; + } + static void release_stats_token(struct ibmvnic_adapter *adapter) { struct device *dev = &adapter->vdev->dev; @@@ -652,7 -676,6 +677,7 @@@ static int ibmvnic_login(struct net_dev struct ibmvnic_adapter *adapter = netdev_priv(netdev); unsigned long timeout = msecs_to_jiffies(30000); struct device *dev = &adapter->vdev->dev; + int rc;
do { if (adapter->renegotiate) { @@@ -666,18 -689,6 +691,18 @@@ dev_err(dev, "Capabilities query timeout\n"); return -1; } + rc = init_sub_crqs(adapter); + if (rc) { + dev_err(dev, + "Initialization of SCRQ's failed\n"); + return -1; + } + rc = init_sub_crq_irqs(adapter); + if (rc) { + dev_err(dev, + "Initialization of SCRQ's irqs failed\n"); + return -1; + } }
reinit_completion(&adapter->init_done); @@@ -700,6 -711,7 +725,7 @@@ static void release_resources(struct ib release_rx_pools(adapter);
release_stats_token(adapter); + release_stats_buffers(adapter); release_error_buffers(adapter);
if (adapter->napi) { @@@ -777,6 -789,10 +803,10 @@@ static int init_resources(struct ibmvni if (rc) return rc;
+ rc = init_stats_buffers(adapter); + if (rc) + return rc; + rc = init_stats_token(adapter); if (rc) return rc; @@@ -1259,6 -1275,9 +1289,9 @@@ out netdev->stats.tx_packets += tx_packets; adapter->tx_send_failed += tx_send_failed; adapter->tx_map_failed += tx_map_failed; + adapter->tx_stats_buffers[queue_num].packets += tx_packets; + adapter->tx_stats_buffers[queue_num].bytes += tx_bytes; + adapter->tx_stats_buffers[queue_num].dropped_packets += tx_dropped;
return ret; } @@@ -1599,6 -1618,8 +1632,8 @@@ restart_poll napi_gro_receive(napi, skb); /* send it up */ netdev->stats.rx_packets++; netdev->stats.rx_bytes += length; + adapter->rx_stats_buffers[scrq_num].packets++; + adapter->rx_stats_buffers[scrq_num].bytes += length; frames_processed++; }
@@@ -1708,18 -1729,36 +1743,36 @@@ static u32 ibmvnic_get_link(struct net_ static void ibmvnic_get_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) { - ring->rx_max_pending = 0; - ring->tx_max_pending = 0; + struct ibmvnic_adapter *adapter = netdev_priv(netdev); + + ring->rx_max_pending = adapter->max_rx_add_entries_per_subcrq; + ring->tx_max_pending = adapter->max_tx_entries_per_subcrq; ring->rx_mini_max_pending = 0; ring->rx_jumbo_max_pending = 0; - ring->rx_pending = 0; - ring->tx_pending = 0; + ring->rx_pending = adapter->req_rx_add_entries_per_subcrq; + ring->tx_pending = adapter->req_tx_entries_per_subcrq; ring->rx_mini_pending = 0; ring->rx_jumbo_pending = 0; }
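The rewritten ibmvnic_get_ringparam() above reports the adapter's real descriptor limits instead of zeros. A minimal userspace probe of those values through the classic SIOCETHTOOL ioctl could look like this sketch (the interface name "eth0" is an assumption):

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

int main(void)
{
	struct ethtool_ringparam erp = { .cmd = ETHTOOL_GRINGPARAM };
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);	/* assumed name */
	ifr.ifr_data = (void *)&erp;

	if (fd >= 0 && ioctl(fd, SIOCETHTOOL, &ifr) == 0)
		printf("rx %u/%u tx %u/%u (pending/max)\n",
		       erp.rx_pending, erp.rx_max_pending,
		       erp.tx_pending, erp.tx_max_pending);
	return 0;
}

`ethtool -g` performs the same query.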
+ static void ibmvnic_get_channels(struct net_device *netdev, + struct ethtool_channels *channels) + { + struct ibmvnic_adapter *adapter = netdev_priv(netdev); + + channels->max_rx = adapter->max_rx_queues; + channels->max_tx = adapter->max_tx_queues; + channels->max_other = 0; + channels->max_combined = 0; + channels->rx_count = adapter->req_rx_queues; + channels->tx_count = adapter->req_tx_queues; + channels->other_count = 0; + channels->combined_count = 0; + } + static void ibmvnic_get_strings(struct net_device *dev, u32 stringset, u8 *data) { + struct ibmvnic_adapter *adapter = netdev_priv(dev); int i;
if (stringset != ETH_SS_STATS) @@@ -1727,13 -1766,39 +1780,39 @@@
for (i = 0; i < ARRAY_SIZE(ibmvnic_stats); i++, data += ETH_GSTRING_LEN) memcpy(data, ibmvnic_stats[i].name, ETH_GSTRING_LEN); + + for (i = 0; i < adapter->req_tx_queues; i++) { + snprintf(data, ETH_GSTRING_LEN, "tx%d_packets", i); + data += ETH_GSTRING_LEN; + + snprintf(data, ETH_GSTRING_LEN, "tx%d_bytes", i); + data += ETH_GSTRING_LEN; + + snprintf(data, ETH_GSTRING_LEN, "tx%d_dropped_packets", i); + data += ETH_GSTRING_LEN; + } + + for (i = 0; i < adapter->req_rx_queues; i++) { + snprintf(data, ETH_GSTRING_LEN, "rx%d_packets", i); + data += ETH_GSTRING_LEN; + + snprintf(data, ETH_GSTRING_LEN, "rx%d_bytes", i); + data += ETH_GSTRING_LEN; + + snprintf(data, ETH_GSTRING_LEN, "rx%d_interrupts", i); + data += ETH_GSTRING_LEN; + } }
static int ibmvnic_get_sset_count(struct net_device *dev, int sset) { + struct ibmvnic_adapter *adapter = netdev_priv(dev); + switch (sset) { case ETH_SS_STATS: - return ARRAY_SIZE(ibmvnic_stats); + return ARRAY_SIZE(ibmvnic_stats) + + adapter->req_tx_queues * NUM_TX_STATS + + adapter->req_rx_queues * NUM_RX_STATS; default: return -EOPNOTSUPP; } @@@ -1744,7 -1809,7 +1823,7 @@@ static void ibmvnic_get_ethtool_stats(s { struct ibmvnic_adapter *adapter = netdev_priv(dev); union ibmvnic_crq crq; - int i; + int i, j;
memset(&crq, 0, sizeof(crq)); crq.request_statistics.first = IBMVNIC_CRQ_CMD; @@@ -1759,7 -1824,26 +1838,26 @@@ wait_for_completion(&adapter->stats_done);
for (i = 0; i < ARRAY_SIZE(ibmvnic_stats); i++) - data[i] = IBMVNIC_GET_STAT(adapter, ibmvnic_stats[i].offset); + data[i] = be64_to_cpu(IBMVNIC_GET_STAT(adapter, + ibmvnic_stats[i].offset)); + + for (j = 0; j < adapter->req_tx_queues; j++) { + data[i] = adapter->tx_stats_buffers[j].packets; + i++; + data[i] = adapter->tx_stats_buffers[j].bytes; + i++; + data[i] = adapter->tx_stats_buffers[j].dropped_packets; + i++; + } + + for (j = 0; j < adapter->req_rx_queues; j++) { + data[i] = adapter->rx_stats_buffers[j].packets; + i++; + data[i] = adapter->rx_stats_buffers[j].bytes; + i++; + data[i] = adapter->rx_stats_buffers[j].interrupts; + i++; + } }
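Note the contract spanning the three ethtool callbacks changed above: the core copies get_sset_count() names out of get_strings() and then expects exactly one u64 per name, in the same order, from get_ethtool_stats(). A hedged sketch of that invariant (NUM_TX_STATS/NUM_RX_STATS live in the driver header; the value 3 matches the per-queue strings emitted above):

/* illustrative check, not part of the patch */
#define NUM_TX_STATS 3	/* tx%d_packets, tx%d_bytes, tx%d_dropped_packets */
#define NUM_RX_STATS 3	/* rx%d_packets, rx%d_bytes, rx%d_interrupts */

static bool ibmvnic_stats_layout_ok(const struct ibmvnic_adapter *adapter,
				    int sset_count)
{
	/* one u64 is written per string; a count mismatch shifts later values */
	return sset_count == ARRAY_SIZE(ibmvnic_stats) +
			     adapter->req_tx_queues * NUM_TX_STATS +
			     adapter->req_rx_queues * NUM_RX_STATS;
}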
static const struct ethtool_ops ibmvnic_ethtool_ops = { @@@ -1768,6 -1852,7 +1866,7 @@@ .set_msglevel = ibmvnic_set_msglevel, .get_link = ibmvnic_get_link, .get_ringparam = ibmvnic_get_ringparam, + .get_channels = ibmvnic_get_channels, .get_strings = ibmvnic_get_strings, .get_sset_count = ibmvnic_get_sset_count, .get_ethtool_stats = ibmvnic_get_ethtool_stats, @@@ -2064,6 -2149,8 +2163,8 @@@ static irqreturn_t ibmvnic_interrupt_rx struct ibmvnic_sub_crq_queue *scrq = instance; struct ibmvnic_adapter *adapter = scrq->adapter;
+ adapter->rx_stats_buffers[scrq->scrq_num].interrupts++; + if (napi_schedule_prep(&adapter->napi[scrq->scrq_num])) { disable_scrq_irq(adapter, scrq); __napi_schedule(&adapter->napi[scrq->scrq_num]); @@@ -3018,6 -3105,7 +3119,6 @@@ static void handle_request_cap_rsp(unio *req_value, (long int)be64_to_cpu(crq->request_capability_rsp. number), name); - release_sub_crqs(adapter); *req_value = be64_to_cpu(crq->request_capability_rsp.number); ibmvnic_send_req_caps(adapter, 1); return; @@@ -3864,10 -3952,7 +3965,7 @@@ static int ibmvnic_resume(struct devic if (adapter->state != VNIC_OPEN) return 0;
- /* kick the interrupt handlers just in case we lost an interrupt */ - for (i = 0; i < adapter->req_rx_queues; i++) - ibmvnic_interrupt_rx(adapter->rx_scrq[i]->irq, - adapter->rx_scrq[i]); + tasklet_schedule(&adapter->tasklet);
return 0; } diff --combined drivers/net/ethernet/intel/i40e/i40e_txrx.c index 2194960d5855,d464fceb300f..8a969d8f0790 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@@ -860,7 -860,7 +860,7 @@@ static bool i40e_clean_tx_irq(struct i4 netdev_tx_completed_queue(txring_txq(tx_ring), total_packets, total_bytes);
- #define TX_WAKE_THRESHOLD (DESC_NEEDED * 2) + #define TX_WAKE_THRESHOLD ((s16)(DESC_NEEDED * 2)) if (unlikely(total_packets && netif_carrier_ok(tx_ring->netdev) && (I40E_DESC_UNUSED(tx_ring) >= TX_WAKE_THRESHOLD))) { /* Make sure that anybody stopping the queue after this @@@ -1113,8 -1113,6 +1113,8 @@@ int i40e_setup_tx_descriptors(struct i4 if (!tx_ring->tx_bi) goto err;
+ u64_stats_init(&tx_ring->syncp); + /* round up to nearest 4K */ tx_ring->size = tx_ring->count * sizeof(struct i40e_tx_desc); /* add u32 for head writeback, align after this takes care of @@@ -2065,7 -2063,7 +2065,7 @@@ static int i40e_clean_rx_irq(struct i40 u16 cleaned_count = I40E_DESC_UNUSED(rx_ring); bool failure = false, xdp_xmit = false;
- while (likely(total_rx_packets < budget)) { + while (likely(total_rx_packets < (unsigned int)budget)) { struct i40e_rx_buffer *rx_buffer; union i40e_rx_desc *rx_desc; struct xdp_buff xdp; @@@ -2198,7 -2196,7 +2198,7 @@@ rx_ring->q_vector->rx.total_bytes += total_rx_bytes;
/* guarantee a trip back through this routine if there was a failure */ - return failure ? budget : total_rx_packets; + return failure ? budget : (int)total_rx_packets; }
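The casts added in these i40e hunks, (s16) on TX_WAKE_THRESHOLD and the explicit (unsigned int)/(int) conversions around budget and total_rx_packets, all target the same C pitfall: in a mixed signed/unsigned comparison the signed operand is converted to unsigned. A standalone illustration (not driver code):

#include <stdio.h>

int main(void)
{
	unsigned int unused = 10;	/* e.g. free descriptors */
	int threshold = -1;		/* imagine an underflowed expression */

	/* -1 converts to UINT_MAX, so the test is false despite 10 >= -1 */
	if (unused >= threshold)
		puts("queue would wake");
	else
		puts("queue stays stopped");
	return 0;
}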
static u32 i40e_buildreg_itr(const int type, const u16 itr) @@@ -2453,9 -2451,15 +2453,15 @@@ static void i40e_atr(struct i40e_ring * hlen = (hdr.network[0] & 0x0F) << 2; l4_proto = hdr.ipv4->protocol; } else { - hlen = hdr.network - skb->data; - l4_proto = ipv6_find_hdr(skb, &hlen, IPPROTO_TCP, NULL, NULL); - hlen -= hdr.network - skb->data; + /* find the start of the innermost ipv6 header */ + unsigned int inner_hlen = hdr.network - skb->data; + unsigned int h_offset = inner_hlen; + + /* this function updates h_offset to the end of the header */ + l4_proto = + ipv6_find_hdr(skb, &h_offset, IPPROTO_TCP, NULL, NULL); + /* hlen will contain our best estimate of the tcp header */ + hlen = h_offset - inner_hlen; }
if (l4_proto != IPPROTO_TCP) diff --combined drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 4631ca8b8eb2,ea471604450e..4a990033c4d5 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@@ -513,7 -513,6 +513,7 @@@ nfp_net_tx_ring_init(struct nfp_net_tx_ tx_ring->idx = idx; tx_ring->r_vec = r_vec; tx_ring->is_xdp = is_xdp; + u64_stats_init(&tx_ring->r_vec->tx_sync);
tx_ring->qcidx = tx_ring->idx * nn->stride_tx; tx_ring->qcp_q = nn->tx_bar + NFP_QCP_QUEUE_OFF(tx_ring->qcidx); @@@ -533,7 -532,6 +533,7 @@@ nfp_net_rx_ring_init(struct nfp_net_rx_
rx_ring->idx = idx; rx_ring->r_vec = r_vec; + u64_stats_init(&rx_ring->r_vec->rx_sync);
rx_ring->fl_qcidx = rx_ring->idx * nn->stride_rx; rx_ring->qcp_fl = nn->rx_bar + NFP_QCP_QUEUE_OFF(rx_ring->fl_qcidx); @@@ -2660,6 -2658,7 +2660,7 @@@ static int nfp_net_netdev_close(struct /* Step 2: Tell NFP */ nfp_net_clear_config_and_disable(nn); + nfp_port_configure(netdev, false);
/* Step 3: Free resources */ @@@ -2777,16 -2776,21 +2778,21 @@@ static int nfp_net_netdev_open(struct n goto err_free_all;
/* Step 2: Configure the NFP + * - Ifup the physical interface if it exists * - Enable rings from 0 to tx_rings/rx_rings - 1. * - Write MAC address (in case it changed) * - Set the MTU * - Set the Freelist buffer size * - Enable the FW */ - err = nfp_net_set_config_and_enable(nn); + err = nfp_port_configure(netdev, true); if (err) goto err_free_all;
+ err = nfp_net_set_config_and_enable(nn); + if (err) + goto err_port_disable; + /* Step 3: Enable for kernel * - put some freelist descriptors on each RX ring * - enable NAPI on each ring @@@ -2797,6 -2801,8 +2803,8 @@@
return 0;
+ err_port_disable: + nfp_port_configure(netdev, false); err_free_all: nfp_net_close_free_all(nn); return err; diff --combined drivers/net/hyperv/netvsc.c index 96f90c75d1b7,9598220b3bcc..9b45e8c1a69d --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@@ -29,6 -29,9 +29,9 @@@ #include <linux/netdevice.h> #include <linux/if_ether.h> #include <linux/vmalloc.h> + #include <linux/rtnetlink.h> + #include <linux/prefetch.h> + #include <asm/sync_bitops.h>
#include "hyperv_net.h" @@@ -41,7 -44,7 +44,7 @@@ void netvsc_switch_datapath(struct net_ { struct net_device_context *net_device_ctx = netdev_priv(ndev); struct hv_device *dev = net_device_ctx->device_ctx; - struct netvsc_device *nv_dev = net_device_ctx->nvdev; + struct netvsc_device *nv_dev = rtnl_dereference(net_device_ctx->nvdev); struct nvsp_message *init_pkt = &nv_dev->channel_init_pkt;
memset(init_pkt, 0, sizeof(struct nvsp_message)); @@@ -69,9 -72,6 +72,6 @@@ static struct netvsc_device *alloc_net_ if (!net_device) return NULL;
- net_device->chan_table[0].mrc.buf - = vzalloc(NETVSC_RECVSLOT_MAX * sizeof(struct recv_comp_data)); - init_waitqueue_head(&net_device->wait_drain); net_device->destroy = false; atomic_set(&net_device->open_cnt, 0); @@@ -89,7 -89,7 +89,7 @@@ static void free_netvsc_device(struct r int i;
for (i = 0; i < VRSS_CHANNEL_MAX; i++) - vfree(nvdev->chan_table[i].mrc.buf); + vfree(nvdev->chan_table[i].mrc.slots);
kfree(nvdev); } @@@ -103,7 -103,8 +103,8 @@@ static void netvsc_destroy_buf(struct h { struct nvsp_message *revoke_packet; struct net_device *ndev = hv_get_drvdata(device); - struct netvsc_device *net_device = net_device_to_netvsc_device(ndev); + struct net_device_context *ndc = netdev_priv(ndev); + struct netvsc_device *net_device = rtnl_dereference(ndc->nvdev); int ret;
/* @@@ -167,12 -168,6 +168,6 @@@ net_device->recv_buf = NULL; }
- if (net_device->recv_section) { - net_device->recv_section_cnt = 0; - kfree(net_device->recv_section); - net_device->recv_section = NULL; - } - /* Deal with the send buffer we may have setup. * If we got a send section size, it means we received a * NVSP_MSG1_TYPE_SEND_SEND_BUF_COMPLETE msg (ie sent @@@ -235,11 -230,26 +230,26 @@@ kfree(net_device->send_section_map); }
+ int netvsc_alloc_recv_comp_ring(struct netvsc_device *net_device, u32 q_idx) + { + struct netvsc_channel *nvchan = &net_device->chan_table[q_idx]; + int node = cpu_to_node(nvchan->channel->target_cpu); + size_t size; + + size = net_device->recv_completion_cnt * sizeof(struct recv_comp_data); + nvchan->mrc.slots = vzalloc_node(size, node); + if (!nvchan->mrc.slots) + nvchan->mrc.slots = vzalloc(size); + + return nvchan->mrc.slots ? 0 : -ENOMEM; + } + static int netvsc_init_buf(struct hv_device *device, struct netvsc_device *net_device) { int ret = 0; struct nvsp_message *init_packet; + struct nvsp_1_message_send_receive_buffer_complete *resp; struct net_device *ndev; size_t map_words; int node; @@@ -296,43 -306,41 +306,41 @@@ wait_for_completion(&net_device->channel_init_wait);
/* Check the response */ - if (init_packet->msg.v1_msg. - send_recv_buf_complete.status != NVSP_STAT_SUCCESS) { - netdev_err(ndev, "Unable to complete receive buffer " - "initialization with NetVsp - status %d\n", - init_packet->msg.v1_msg. - send_recv_buf_complete.status); + resp = &init_packet->msg.v1_msg.send_recv_buf_complete; + if (resp->status != NVSP_STAT_SUCCESS) { + netdev_err(ndev, + "Unable to complete receive buffer initialization with NetVsp - status %d\n", + resp->status); ret = -EINVAL; goto cleanup; }
/* Parse the response */ + netdev_dbg(ndev, "Receive sections: %u sub_allocs: size %u count: %u\n", + resp->num_sections, resp->sections[0].sub_alloc_size, + resp->sections[0].num_sub_allocs);
- net_device->recv_section_cnt = init_packet->msg. - v1_msg.send_recv_buf_complete.num_sections; - - net_device->recv_section = kmemdup( - init_packet->msg.v1_msg.send_recv_buf_complete.sections, - net_device->recv_section_cnt * - sizeof(struct nvsp_1_receive_buffer_section), - GFP_KERNEL); - if (net_device->recv_section == NULL) { - ret = -EINVAL; - goto cleanup; - } + net_device->recv_section_cnt = resp->num_sections;
/* * For 1st release, there should only be 1 section that represents the * entire receive buffer */ if (net_device->recv_section_cnt != 1 || - net_device->recv_section->offset != 0) { + resp->sections[0].offset != 0) { ret = -EINVAL; goto cleanup; }
- /* Now setup the send buffer. - */ + /* Setup receive completion ring */ + net_device->recv_completion_cnt + = round_up(resp->sections[0].num_sub_allocs + 1, + PAGE_SIZE / sizeof(u64)); + ret = netvsc_alloc_recv_comp_ring(net_device, 0); + if (ret) + goto cleanup; + + /* Now setup the send buffer. */ net_device->send_buf = vzalloc_node(net_device->send_buf_size, node); if (!net_device->send_buf) net_device->send_buf = vzalloc(net_device->send_buf_size); @@@ -549,7 -557,8 +557,8 @@@ void netvsc_device_remove(struct hv_dev { struct net_device *ndev = hv_get_drvdata(device); struct net_device_context *net_device_ctx = netdev_priv(ndev); - struct netvsc_device *net_device = net_device_ctx->nvdev; + struct netvsc_device *net_device + = rtnl_dereference(net_device_ctx->nvdev); int i;
netvsc_disconnect_vsp(device); @@@ -692,7 -701,7 +701,7 @@@ static u32 netvsc_copy_to_send_buf(stru u32 pend_size, struct hv_netvsc_packet *packet, struct rndis_message *rndis_msg, - struct hv_page_buffer **pb, + struct hv_page_buffer *pb, struct sk_buff *skb) { char *start = net_device->send_buf; @@@ -713,9 -722,9 +722,9 @@@ }
for (i = 0; i < page_count; i++) { - char *src = phys_to_virt((*pb)[i].pfn << PAGE_SHIFT); - u32 offset = (*pb)[i].offset; - u32 len = (*pb)[i].len; + char *src = phys_to_virt(pb[i].pfn << PAGE_SHIFT); + u32 offset = pb[i].offset; + u32 len = pb[i].len;
memcpy(dest, (src + offset), len); msg_size += len; @@@ -734,36 -743,32 +743,32 @@@ static inline int netvsc_send_pkt struct hv_device *device, struct hv_netvsc_packet *packet, struct netvsc_device *net_device, - struct hv_page_buffer **pb, + struct hv_page_buffer *pb, struct sk_buff *skb) { struct nvsp_message nvmsg; - struct netvsc_channel *nvchan - = &net_device->chan_table[packet->q_idx]; + struct nvsp_1_message_send_rndis_packet * const rpkt = + &nvmsg.msg.v1_msg.send_rndis_pkt; + struct netvsc_channel * const nvchan = + &net_device->chan_table[packet->q_idx]; struct vmbus_channel *out_channel = nvchan->channel; struct net_device *ndev = hv_get_drvdata(device); struct netdev_queue *txq = netdev_get_tx_queue(ndev, packet->q_idx); u64 req_id; int ret; - struct hv_page_buffer *pgbuf; u32 ring_avail = hv_ringbuf_avail_percent(&out_channel->outbound);
nvmsg.hdr.msg_type = NVSP_MSG1_TYPE_SEND_RNDIS_PKT; - if (skb != NULL) { - /* 0 is RMC_DATA; */ - nvmsg.msg.v1_msg.send_rndis_pkt.channel_type = 0; - } else { - /* 1 is RMC_CONTROL; */ - nvmsg.msg.v1_msg.send_rndis_pkt.channel_type = 1; - } + if (skb) + rpkt->channel_type = 0; /* 0 is RMC_DATA */ + else + rpkt->channel_type = 1; /* 1 is RMC_CONTROL */
- nvmsg.msg.v1_msg.send_rndis_pkt.send_buf_section_index = - packet->send_buf_index; + rpkt->send_buf_section_index = packet->send_buf_index; if (packet->send_buf_index == NETVSC_INVALID_INDEX) - nvmsg.msg.v1_msg.send_rndis_pkt.send_buf_section_size = 0; + rpkt->send_buf_section_size = 0; else - nvmsg.msg.v1_msg.send_rndis_pkt.send_buf_section_size = - packet->total_data_buflen; + rpkt->send_buf_section_size = packet->total_data_buflen;
req_id = (ulong)skb;
@@@ -771,11 -776,11 +776,11 @@@ return -ENODEV;
if (packet->page_buf_cnt) { - pgbuf = packet->cp_partial ? (*pb) + - packet->rmsg_pgcnt : (*pb); + if (packet->cp_partial) + pb += packet->rmsg_pgcnt; + ret = vmbus_sendpacket_pagebuffer_ctl(out_channel, - pgbuf, - packet->page_buf_cnt, + pb, packet->page_buf_cnt, &nvmsg, sizeof(struct nvsp_message), req_id, @@@ -800,8 -805,10 +805,10 @@@ ret = -ENOSPC; } } else { - netdev_err(ndev, "Unable to send packet %p ret %d\n", - packet, ret); + netdev_err(ndev, + "Unable to send packet pages %u len %u, ret %d\n", + packet->page_buf_cnt, packet->total_data_buflen, + ret); }
return ret; @@@ -819,13 -826,16 +826,16 @@@ static inline void move_pkt_msd(struct msdp->count = 0; }
- int netvsc_send(struct hv_device *device, + /* RCU already held by caller */ + int netvsc_send(struct net_device_context *ndev_ctx, struct hv_netvsc_packet *packet, struct rndis_message *rndis_msg, - struct hv_page_buffer **pb, + struct hv_page_buffer *pb, struct sk_buff *skb) { - struct netvsc_device *net_device = hv_device_to_netvsc_device(device); + struct netvsc_device *net_device + = rcu_dereference_bh(ndev_ctx->nvdev); + struct hv_device *device = ndev_ctx->device_ctx; int ret = 0; struct netvsc_channel *nvchan; u32 pktlen = packet->total_data_buflen, msd_len = 0; @@@ -837,7 -847,7 +847,7 @@@ bool xmit_more = (skb != NULL) ? skb->xmit_more : false;
/* If device is rescinded, return error and packet will get dropped. */ - if (unlikely(net_device->destroy)) + if (unlikely(!net_device || net_device->destroy)) return -ENODEV;
/* We may race with netvsc_connect_vsp()/netvsc_init_buf() and get @@@ -942,130 -952,94 +952,94 @@@ send_now return ret; }
- static int netvsc_send_recv_completion(struct vmbus_channel *channel, - u64 transaction_id, u32 status) + /* Send pending recv completions */ + static int send_recv_completions(struct netvsc_channel *nvchan) { - struct nvsp_message recvcompMessage; + struct netvsc_device *nvdev = nvchan->net_device; + struct multi_recv_comp *mrc = &nvchan->mrc; + struct recv_comp_msg { + struct nvsp_message_header hdr; + u32 status; + } __packed; + struct recv_comp_msg msg = { + .hdr.msg_type = NVSP_MSG1_TYPE_SEND_RNDIS_PKT_COMPLETE, + }; int ret;
- recvcompMessage.hdr.msg_type = - NVSP_MSG1_TYPE_SEND_RNDIS_PKT_COMPLETE; + while (mrc->first != mrc->next) { + const struct recv_comp_data *rcd + = mrc->slots + mrc->first;
- recvcompMessage.msg.v1_msg.send_rndis_pkt_complete.status = status; + msg.status = rcd->status; + ret = vmbus_sendpacket(nvchan->channel, &msg, sizeof(msg), + rcd->tid, VM_PKT_COMP, 0); + if (unlikely(ret)) + return ret;
- /* Send the completion */ - ret = vmbus_sendpacket(channel, &recvcompMessage, - sizeof(struct nvsp_message_header) + sizeof(u32), - transaction_id, VM_PKT_COMP, 0); - - return ret; - } - - static inline void count_recv_comp_slot(struct netvsc_device *nvdev, u16 q_idx, - u32 *filled, u32 *avail) - { - struct multi_recv_comp *mrc = &nvdev->chan_table[q_idx].mrc; - u32 first = mrc->first; - u32 next = mrc->next; - - *filled = (first > next) ? NETVSC_RECVSLOT_MAX - first + next : - next - first; - - *avail = NETVSC_RECVSLOT_MAX - *filled - 1; - } - - /* Read the first filled slot, no change to index */ - static inline struct recv_comp_data *read_recv_comp_slot(struct netvsc_device - *nvdev, u16 q_idx) - { - struct multi_recv_comp *mrc = &nvdev->chan_table[q_idx].mrc; - u32 filled, avail; - - if (unlikely(!mrc->buf)) - return NULL; + if (++mrc->first == nvdev->recv_completion_cnt) + mrc->first = 0; + }
- count_recv_comp_slot(nvdev, q_idx, &filled, &avail); - if (!filled) - return NULL; + /* receive completion ring has been emptied */ + if (unlikely(nvdev->destroy)) + wake_up(&nvdev->wait_drain);
- return mrc->buf + mrc->first * sizeof(struct recv_comp_data); + return 0; }
- /* Put the first filled slot back to available pool */ - static inline void put_recv_comp_slot(struct netvsc_device *nvdev, u16 q_idx) + /* Count how many receive completions are outstanding */ + static void recv_comp_slot_avail(const struct netvsc_device *nvdev, + const struct multi_recv_comp *mrc, + u32 *filled, u32 *avail) { - struct multi_recv_comp *mrc = &nvdev->chan_table[q_idx].mrc; - int num_recv; - - mrc->first = (mrc->first + 1) % NETVSC_RECVSLOT_MAX; + u32 count = nvdev->recv_completion_cnt;
- num_recv = atomic_dec_return(&nvdev->num_outstanding_recvs); + if (mrc->next >= mrc->first) + *filled = mrc->next - mrc->first; + else + *filled = (count - mrc->first) + mrc->next;
- if (nvdev->destroy && num_recv == 0) - wake_up(&nvdev->wait_drain); + *avail = count - *filled - 1; }
- /* Check and send pending recv completions */ - static void netvsc_chk_recv_comp(struct netvsc_device *nvdev, - struct vmbus_channel *channel, u16 q_idx) + /* Add receive complete to ring to send to host. */ + static void enq_receive_complete(struct net_device *ndev, + struct netvsc_device *nvdev, u16 q_idx, + u64 tid, u32 status) { + struct netvsc_channel *nvchan = &nvdev->chan_table[q_idx]; + struct multi_recv_comp *mrc = &nvchan->mrc; struct recv_comp_data *rcd; - int ret; + u32 filled, avail;
- while (true) { - rcd = read_recv_comp_slot(nvdev, q_idx); - if (!rcd) - break; + recv_comp_slot_avail(nvdev, mrc, &filled, &avail);
- ret = netvsc_send_recv_completion(channel, rcd->tid, - rcd->status); - if (ret) - break; - - put_recv_comp_slot(nvdev, q_idx); + if (unlikely(filled > NAPI_POLL_WEIGHT)) { + send_recv_completions(nvchan); + recv_comp_slot_avail(nvdev, mrc, &filled, &avail); } - }
- #define NETVSC_RCD_WATERMARK 80 - - /* Get next available slot */ - static inline struct recv_comp_data *get_recv_comp_slot( - struct netvsc_device *nvdev, struct vmbus_channel *channel, u16 q_idx) - { - struct multi_recv_comp *mrc = &nvdev->chan_table[q_idx].mrc; - u32 filled, avail, next; - struct recv_comp_data *rcd; - - if (unlikely(!nvdev->recv_section)) - return NULL; - - if (unlikely(!mrc->buf)) - return NULL; - - if (atomic_read(&nvdev->num_outstanding_recvs) > - nvdev->recv_section->num_sub_allocs * NETVSC_RCD_WATERMARK / 100) - netvsc_chk_recv_comp(nvdev, channel, q_idx); - - count_recv_comp_slot(nvdev, q_idx, &filled, &avail); - if (!avail) - return NULL; - - next = mrc->next; - rcd = mrc->buf + next * sizeof(struct recv_comp_data); - mrc->next = (next + 1) % NETVSC_RECVSLOT_MAX; + if (unlikely(!avail)) { + netdev_err(ndev, "Recv_comp full buf q:%hd, tid:%llx\n", + q_idx, tid); + return; + }
- atomic_inc(&nvdev->num_outstanding_recvs); + rcd = mrc->slots + mrc->next; + rcd->tid = tid; + rcd->status = status;
- return rcd; + if (++mrc->next == nvdev->recv_completion_cnt) + mrc->next = 0; }
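Worked numbers for the receive-completion ring introduced above (illustrative values, not from the patch):

/*
 * sizing: slots are rounded up to a multiple of PAGE_SIZE / sizeof(u64),
 * i.e. 512 on 4 KiB pages, so 6144 sub-allocations give
 *	round_up(6144 + 1, 512) = 6656 slots
 *
 * accounting: with count = 8, first = 6, next = 2 (ring has wrapped):
 *	filled = (count - first) + next = (8 - 6) + 2 = 4
 *	avail  = count - filled - 1     = 8 - 4 - 1   = 3
 *
 * one slot is deliberately kept empty so that first == next can only
 * mean "ring empty", never "ring full"
 */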
static int netvsc_receive(struct net_device *ndev, - struct netvsc_device *net_device, - struct net_device_context *net_device_ctx, - struct hv_device *device, - struct vmbus_channel *channel, - const struct vmpacket_descriptor *desc, - struct nvsp_message *nvsp) + struct netvsc_device *net_device, + struct net_device_context *net_device_ctx, + struct hv_device *device, + struct vmbus_channel *channel, + const struct vmpacket_descriptor *desc, + struct nvsp_message *nvsp) { const struct vmtransfer_page_packet_header *vmxferpage_packet = container_of(desc, const struct vmtransfer_page_packet_header, d); @@@ -1074,7 -1048,6 +1048,6 @@@ u32 status = NVSP_STAT_SUCCESS; int i; int count = 0; - int ret;
/* Make sure this is a valid nvsp packet */ if (unlikely(nvsp->hdr.msg_type != NVSP_MSG1_TYPE_SEND_RNDIS_PKT)) { @@@ -1105,25 -1078,9 +1078,9 @@@ channel, data, buflen); }
- if (net_device->chan_table[q_idx].mrc.buf) { - struct recv_comp_data *rcd; + enq_receive_complete(ndev, net_device, q_idx, + vmxferpage_packet->d.trans_id, status);
- rcd = get_recv_comp_slot(net_device, channel, q_idx); - if (rcd) { - rcd->tid = vmxferpage_packet->d.trans_id; - rcd->status = status; - } else { - netdev_err(ndev, "Recv_comp full buf q:%hd, tid:%llx\n", - q_idx, vmxferpage_packet->d.trans_id); - } - } else { - ret = netvsc_send_recv_completion(channel, - vmxferpage_packet->d.trans_id, - status); - if (ret) - netdev_err(ndev, "Recv_comp q:%hd, tid:%llx, err:%d\n", - q_idx, vmxferpage_packet->d.trans_id, ret); - } return count; }
@@@ -1219,11 -1176,10 +1176,10 @@@ int netvsc_poll(struct napi_struct *nap { struct netvsc_channel *nvchan = container_of(napi, struct netvsc_channel, napi); + struct netvsc_device *net_device = nvchan->net_device; struct vmbus_channel *channel = nvchan->channel; struct hv_device *device = netvsc_channel_to_device(channel); - u16 q_idx = channel->offermsg.offer.sub_channel_index; struct net_device *ndev = hv_get_drvdata(device); - struct netvsc_device *net_device = net_device_to_netvsc_device(ndev); int work_done = 0;
/* If starting a new interval */ @@@ -1236,17 -1192,23 +1192,23 @@@ nvchan->desc = hv_pkt_iter_next(channel, nvchan->desc); }
- /* If receive ring was exhausted * and not doing busy poll + /* if ring is empty, signal host */ + if (!nvchan->desc) + hv_pkt_iter_close(channel); + + /* If send of pending receive completions succeeded + * and did not exhaust NAPI budget this time + * and not doing busy poll * then re-enable host interrupts - * and reschedule if ring is not empty. + * and reschedule if ring is not empty. */ - if (work_done < budget && + if (send_recv_completions(nvchan) == 0 && + work_done < budget && napi_complete_done(napi, work_done) && - hv_end_read(&channel->inbound) != 0) + hv_end_read(&channel->inbound)) { + hv_begin_read(&channel->inbound); napi_reschedule(napi); - - netvsc_chk_recv_comp(net_device, channel, q_idx); + }
/* Driver may overshoot since multiple packets per descriptor */ return min(work_done, budget); @@@ -1258,10 -1220,15 +1220,15 @@@ void netvsc_channel_cb(void *context) { struct netvsc_channel *nvchan = context; + struct vmbus_channel *channel = nvchan->channel; + struct hv_ring_buffer_info *rbi = &channel->inbound; + + /* preload first vmpacket descriptor */ + prefetch(hv_get_ring_buffer(rbi) + rbi->priv_read_index);
if (napi_schedule_prep(&nvchan->napi)) { /* disable interrupts from host */ - hv_begin_read(&nvchan->channel->inbound); + hv_begin_read(rbi);
__napi_schedule(&nvchan->napi); } @@@ -1271,8 -1238,8 +1238,8 @@@ * netvsc_device_add - Callback when the device belonging to this * driver is added */ - int netvsc_device_add(struct hv_device *device, - const struct netvsc_device_info *device_info) + struct netvsc_device *netvsc_device_add(struct hv_device *device, + const struct netvsc_device_info *device_info) { int i, ret = 0; int ring_size = device_info->ring_size; @@@ -1282,7 -1249,7 +1249,7 @@@
net_device = alloc_net_device(); if (!net_device) - return -ENOMEM; + return ERR_PTR(-ENOMEM);
net_device->ring_size = ring_size;
@@@ -1302,8 -1269,7 +1269,9 @@@ struct netvsc_channel *nvchan = &net_device->chan_table[i];
nvchan->channel = device->channel; + nvchan->net_device = net_device; + u64_stats_init(&nvchan->tx_stats.syncp); + u64_stats_init(&nvchan->rx_stats.syncp); }
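Several hunks in this merge (i40e, nfp, and netvsc above) add u64_stats_init() calls for the same reason: on 32-bit SMP kernels the seqcount inside struct u64_stats_sync is what lets readers detect torn 64-bit counter updates, and it must be initialized before the first writer runs. A minimal sketch of the writer side of that pattern (illustrative names, not from this patch):

#include <linux/types.h>
#include <linux/u64_stats_sync.h>

struct chan_stats {
	u64 packets;
	u64 bytes;
	struct u64_stats_sync syncp;	/* u64_stats_init() before first use */
};

static void chan_stats_add(struct chan_stats *cs, u64 packets, u64 bytes)
{
	u64_stats_update_begin(&cs->syncp);
	cs->packets += packets;
	cs->bytes += bytes;
	u64_stats_update_end(&cs->syncp);
}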
/* Enable NAPI handler before init callbacks */ @@@ -1340,10 -1306,11 +1308,11 @@@ goto close; }
- return ret; + return net_device;
close: - netif_napi_del(&net_device->chan_table[0].napi); + RCU_INIT_POINTER(net_device_ctx->nvdev, NULL); + napi_disable(&net_device->chan_table[0].napi);
/* Now, we can close the channel safely */ vmbus_close(device->channel); @@@ -1351,6 -1318,5 +1320,5 @@@ cleanup: free_netvsc_device(&net_device->rcu);
- return ret; - + return ERR_PTR(ret); } diff --combined drivers/net/ipvlan/ipvlan_main.c index 8dab74a81303,fdde20735416..58a9f990b553 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@@ -169,7 -169,7 +169,7 @@@ static void ipvlan_port_destroy(struct
#define IPVLAN_FEATURES \ (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \ - NETIF_F_GSO | NETIF_F_TSO | NETIF_F_UFO | NETIF_F_GSO_ROBUST | \ + NETIF_F_GSO | NETIF_F_TSO | NETIF_F_GSO_ROBUST | \ NETIF_F_TSO_ECN | NETIF_F_TSO6 | NETIF_F_GRO | NETIF_F_RXCSUM | \ NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER)
@@@ -192,7 -192,7 +192,7 @@@ static int ipvlan_init(struct net_devic
netdev_lockdep_set_classes(dev);
- ipvlan->pcpu_stats = alloc_percpu(struct ipvl_pcpu_stats); + ipvlan->pcpu_stats = netdev_alloc_pcpu_stats(struct ipvl_pcpu_stats); if (!ipvlan->pcpu_stats) return -ENOMEM;
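The ipvlan hunk above swaps alloc_percpu() for netdev_alloc_pcpu_stats(). Roughly sketched below (simplified; the real macro lives in linux/netdevice.h, and the wrapper function name here is hypothetical), the helper's value is that besides the zeroed per-CPU allocation it also runs u64_stats_init() on every CPU's syncp, which the bare alloc_percpu() call never did:

/* approximation of netdev_alloc_pcpu_stats(struct ipvl_pcpu_stats) */
static struct ipvl_pcpu_stats __percpu *ipvlan_alloc_stats(void)
{
	struct ipvl_pcpu_stats __percpu *stats;
	int cpu;

	stats = alloc_percpu(struct ipvl_pcpu_stats);	/* zeroed */
	if (stats)
		for_each_possible_cpu(cpu)
			u64_stats_init(&per_cpu_ptr(stats, cpu)->syncp);
	return stats;
}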
diff --combined drivers/net/vxlan.c index e17baac70f43,dbca067540d0..35e84a9e1cfb --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@@ -623,7 -623,6 +623,7 @@@ static struct sk_buff **vxlan_gro_recei
out: skb_gro_remcsum_cleanup(skb, &grc); + skb->remcsum_offload = 0; NAPI_GRO_CB(skb)->flush |= flush;
return pp; @@@ -2609,7 -2608,7 +2609,7 @@@ static struct device_type vxlan_type = * supply the listening VXLAN udp ports. Callers are expected * to implement the ndo_udp_tunnel_add. */ - static void vxlan_push_rx_ports(struct net_device *dev) + static void vxlan_offload_rx_ports(struct net_device *dev, bool push) { struct vxlan_sock *vs; struct net *net = dev_net(dev); @@@ -2618,11 -2617,19 +2618,19 @@@
spin_lock(&vn->sock_lock); for (i = 0; i < PORT_HASH_SIZE; ++i) { - hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) - udp_tunnel_push_rx_port(dev, vs->sock, - (vs->flags & VXLAN_F_GPE) ? - UDP_TUNNEL_TYPE_VXLAN_GPE : - UDP_TUNNEL_TYPE_VXLAN); + hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) { + unsigned short type; + + if (vs->flags & VXLAN_F_GPE) + type = UDP_TUNNEL_TYPE_VXLAN_GPE; + else + type = UDP_TUNNEL_TYPE_VXLAN; + + if (push) + udp_tunnel_push_rx_port(dev, vs->sock, type); + else + udp_tunnel_drop_rx_port(dev, vs->sock, type); + } } spin_unlock(&vn->sock_lock); } @@@ -3631,10 -3638,15 +3639,15 @@@ static int vxlan_netdevice_event(struc struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
- if (event == NETDEV_UNREGISTER) + if (event == NETDEV_UNREGISTER) { + vxlan_offload_rx_ports(dev, false); vxlan_handle_lowerdev_unregister(vn, dev); - else if (event == NETDEV_UDP_TUNNEL_PUSH_INFO) - vxlan_push_rx_ports(dev); + } else if (event == NETDEV_REGISTER) { + vxlan_offload_rx_ports(dev, true); + } else if (event == NETDEV_UDP_TUNNEL_PUSH_INFO || + event == NETDEV_UDP_TUNNEL_DROP_INFO) { + vxlan_offload_rx_ports(dev, event == NETDEV_UDP_TUNNEL_PUSH_INFO); + }
return NOTIFY_DONE; } diff --combined include/net/tcp.h index ada65e767b28,bb1881b4ce48..44dbf5d43a93 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@@ -139,6 -139,7 +139,7 @@@ void tcp_time_wait(struct sock *sk, in #endif #define TCP_RTO_MAX ((unsigned)(120*HZ)) #define TCP_RTO_MIN ((unsigned)(HZ/5)) + #define TCP_TIMEOUT_MIN (2U) /* Min timeout for TCP timers in jiffies */ #define TCP_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */ #define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value, now * used as a fallback RTO for the @@@ -150,8 -151,6 +151,6 @@@ #define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes * for local resources. */ - #define TCP_REO_TIMEOUT_MIN (2000) /* Min RACK reordering timeout in usec */ - #define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */ #define TCP_KEEPALIVE_PROBES 9 /* Max of 9 keepalive probes */ #define TCP_KEEPALIVE_INTVL (75*HZ) @@@ -257,7 -256,6 +256,6 @@@ extern int sysctl_tcp_rmem[3] extern int sysctl_tcp_app_win; extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_frto; - extern int sysctl_tcp_low_latency; extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; @@@ -352,8 -350,11 +350,11 @@@ int tcp_v4_rcv(struct sk_buff *skb)
int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw); int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); + int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); + int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, + size_t size, int flags); ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, size_t size, int flags); void tcp_release_cb(struct sock *sk); @@@ -363,7 -364,7 +364,7 @@@ void tcp_delack_timer_handler(struct so int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg); int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len); + const struct tcphdr *th); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); @@@ -633,29 -634,6 +634,6 @@@ static inline u32 __tcp_set_rto(const s return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us); }
- static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) - { - tp->pred_flags = htonl((tp->tcp_header_len << 26) | - ntohl(TCP_FLAG_ACK) | - snd_wnd); - } - - static inline void tcp_fast_path_on(struct tcp_sock *tp) - { - __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale); - } - - static inline void tcp_fast_path_check(struct sock *sk) - { - struct tcp_sock *tp = tcp_sk(sk); - - if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && - tp->rcv_wnd && - atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && - !tp->urg_data) - tcp_fast_path_on(tp); - } - /* Compute the actual rto_min value */ static inline u32 tcp_rto_min(struct sock *sk) { @@@ -905,9 -883,8 +883,8 @@@ enum tcp_ca_event
/* Information about inbound ACK, passed to cong_ops->in_ack_event() */ enum tcp_ca_ack_event_flags { - CA_ACK_SLOWPATH = (1 << 0), /* In slow path processing */ - CA_ACK_WIN_UPDATE = (1 << 1), /* ACK updated window */ - CA_ACK_ECE = (1 << 2), /* ECE bit is set on ack */ + CA_ACK_WIN_UPDATE = (1 << 0), /* ACK updated window */ + CA_ACK_ECE = (1 << 1), /* ECE bit is set on ack */ };
/* @@@ -1245,17 -1222,6 +1222,6 @@@ static inline bool tcp_checksum_complet __tcp_checksum_complete(skb); }
- /* Prequeue for VJ style copy to user, combined with checksumming. */ - - static inline void tcp_prequeue_init(struct tcp_sock *tp) - { - tp->ucopy.task = NULL; - tp->ucopy.len = 0; - tp->ucopy.memory = 0; - skb_queue_head_init(&tp->ucopy.prequeue); - } - - bool tcp_prequeue(struct sock *sk, struct sk_buff *skb); bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb); int tcp_filter(struct sock *sk, struct sk_buff *skb);
@@@ -1916,16 -1882,6 +1882,16 @@@ extern void tcp_rack_advance(struct tcp u64 xmit_time); extern void tcp_rack_reo_timeout(struct sock *sk);
+/* At how many usecs into the future should the RTO fire? */ +static inline s64 tcp_rto_delta_us(const struct sock *sk) +{ + const struct sk_buff *skb = tcp_write_queue_head(sk); + u32 rto = inet_csk(sk)->icsk_rto; + u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto); + + return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp; +} + /* * Save and compile IPv4 options, return a pointer to it */ diff --combined net/ipv4/tcp_input.c index 53de1424c13c,99cdf4ccabb8..a805229e05a5 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@@ -103,11 -103,9 +103,10 @@@ int sysctl_tcp_invalid_ratelimit __read #define FLAG_DATA_SACKED 0x20 /* New SACK. */ #define FLAG_ECE 0x40 /* ECE in this ACK */ #define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */ - #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ #define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ +#define FLAG_SET_XMIT_TIMER 0x1000 /* Set TLP or RTO timer */ #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ #define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ #define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ @@@ -2521,8 -2519,8 +2520,8 @@@ static inline void tcp_end_cwnd_reducti return;
/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ - if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || - (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) { + if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH && + (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || tp->undo_marker)) { tp->snd_cwnd = tp->snd_ssthresh; tp->snd_cwnd_stamp = tcp_jiffies32; } @@@ -3005,7 -3003,10 +3004,7 @@@ void tcp_rearm_rto(struct sock *sk /* Offset the time elapsed after installing regular RTO */ if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { - struct sk_buff *skb = tcp_write_queue_head(sk); - u64 rto_time_stamp = skb->skb_mstamp + - jiffies_to_usecs(rto); - s64 delta_us = rto_time_stamp - tp->tcp_mstamp; + s64 delta_us = tcp_rto_delta_us(sk); /* delta_us may not be positive if the socket is locked * when the retrans timer fires and is rescheduled. */ @@@ -3017,13 -3018,6 +3016,13 @@@ } }
+/* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */ +static void tcp_set_xmit_timer(struct sock *sk) +{ + if (!tcp_schedule_loss_probe(sk)) + tcp_rearm_rto(sk); +} + /* If we get here, the whole TSO packet has not been acked. */ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) { @@@ -3185,7 -3179,7 +3184,7 @@@ static int tcp_clean_rtx_queue(struct s ca_rtt_us, sack->rate);
if (flag & FLAG_ACKED) { - tcp_rearm_rto(sk); + flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */ if (unlikely(icsk->icsk_mtup.probe_size && !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) { tcp_mtup_probe_success(sk); @@@ -3213,7 -3207,7 +3212,7 @@@ * after when the head was last (re)transmitted. Otherwise the * timeout may continue to extend in loss recovery. */ - tcp_rearm_rto(sk); + flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */ }
if (icsk->icsk_ca_ops->pkts_acked) { @@@ -3372,12 -3366,6 +3371,6 @@@ static int tcp_ack_update_window(struc if (tp->snd_wnd != nwin) { tp->snd_wnd = nwin;
- /* Note, it is the only place, where - * fast path is recovered for sending TCP. - */ - tp->pred_flags = 0; - tcp_fast_path_check(sk); - if (tcp_send_head(sk)) tcp_slow_start_after_idle_check(sk);
@@@ -3559,6 -3547,7 +3552,7 @@@ static int tcp_ack(struct sock *sk, con u32 lost = tp->lost; int acked = 0; /* Number of packets newly acked */ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ + u32 ack_ev_flags = 0;
sack_state.first_sackt = 0; sack_state.rate = &rs; @@@ -3585,6 -3574,9 +3579,6 @@@ if (after(ack, tp->snd_nxt)) goto invalid_ack;
- if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) - tcp_rearm_rto(sk); - if (after(ack, prior_snd_una)) { flag |= FLAG_SND_UNA_ADVANCED; icsk->icsk_retransmits = 0; @@@ -3599,42 -3591,26 +3593,26 @@@ if (flag & FLAG_UPDATE_TS_RECENT) tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
- if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) { - /* Window is constant, pure forward advance. - * No more checks are required. - * Note, we use the fact that SND.UNA>=SND.WL2. - */ - tcp_update_wl(tp, ack_seq); - tcp_snd_una_update(tp, ack); - flag |= FLAG_WIN_UPDATE; - - tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); - - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS); - } else { - u32 ack_ev_flags = CA_ACK_SLOWPATH; - - if (ack_seq != TCP_SKB_CB(skb)->end_seq) - flag |= FLAG_DATA; - else - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS); + if (ack_seq != TCP_SKB_CB(skb)->end_seq) + flag |= FLAG_DATA; + else + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);
- flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); + flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
- if (TCP_SKB_CB(skb)->sacked) - flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, - &sack_state); + if (TCP_SKB_CB(skb)->sacked) + flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, + &sack_state);
- if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { - flag |= FLAG_ECE; - ack_ev_flags |= CA_ACK_ECE; - } + if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { + flag |= FLAG_ECE; + ack_ev_flags = CA_ACK_ECE; + }
- if (flag & FLAG_WIN_UPDATE) - ack_ev_flags |= CA_ACK_WIN_UPDATE; + if (flag & FLAG_WIN_UPDATE) + ack_ev_flags |= CA_ACK_WIN_UPDATE;
- tcp_in_ack_event(sk, ack_ev_flags); - } + tcp_in_ack_event(sk, ack_ev_flags);
/* We passed data and got it acked, remove any soft error * log. Something worked... @@@ -3649,20 -3625,18 +3627,20 @@@ flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, &sack_state);
+ if (tp->tlp_high_seq) + tcp_process_tlp_ack(sk, ack, flag); + /* If needed, reset TLP/RTO timer; RACK may later override this. */ + if (flag & FLAG_SET_XMIT_TIMER) + tcp_set_xmit_timer(sk); + if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); } - if (tp->tlp_high_seq) - tcp_process_tlp_ack(sk, ack, flag);
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) sk_dst_confirm(sk);
- if (icsk->icsk_pending == ICSK_TIME_RETRANS) - tcp_schedule_loss_probe(sk); delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ lost = tp->lost - lost; /* freshly marked lost */ tcp_rate_gen(sk, delivered, lost, sack_state.rate); @@@ -4402,8 -4376,6 +4380,6 @@@ static void tcp_data_queue_ofo(struct s return; }
- /* Disable header prediction. */ - tp->pred_flags = 0; inet_csk_schedule_ack(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); @@@ -4592,8 -4564,8 +4568,8 @@@ err static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - bool fragstolen = false; - int eaten = -1; + bool fragstolen; + int eaten;
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { __kfree_skb(skb); @@@ -4615,32 -4587,13 +4591,13 @@@ goto out_of_window;
/* Ok. In sequence. In window. */ - if (tp->ucopy.task == current && - tp->copied_seq == tp->rcv_nxt && tp->ucopy.len && - sock_owned_by_user(sk) && !tp->urg_data) { - int chunk = min_t(unsigned int, skb->len, - tp->ucopy.len); - - __set_current_state(TASK_RUNNING); - - if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) { - tp->ucopy.len -= chunk; - tp->copied_seq += chunk; - eaten = (chunk == skb->len); - tcp_rcv_space_adjust(sk); - } - } - - if (eaten <= 0) { queue_and_out: - if (eaten < 0) { - if (skb_queue_len(&sk->sk_receive_queue) == 0) - sk_forced_mem_schedule(sk, skb->truesize); - else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) - goto drop; - } - eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); - } + if (skb_queue_len(&sk->sk_receive_queue) == 0) + sk_forced_mem_schedule(sk, skb->truesize); + else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) + goto drop; + + eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); if (skb->len) tcp_event_data_recv(sk, skb); @@@ -4660,8 -4613,6 +4617,6 @@@ if (tp->rx_opt.num_sacks) tcp_sack_remove(tp);
- tcp_fast_path_check(sk); - if (eaten > 0) kfree_skb_partial(skb, fragstolen); if (!sock_flag(sk, SOCK_DEAD)) @@@ -4987,7 -4938,6 +4942,6 @@@ static int tcp_prune_queue(struct sock NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
/* Massive buffer overcommit. */ - tp->pred_flags = 0; return -1; }
@@@ -5159,9 -5109,6 +5113,6 @@@ static void tcp_check_urg(struct sock *
tp->urg_data = TCP_URG_NOTYET; tp->urg_seq = ptr; - - /* Disable header prediction. */ - tp->pred_flags = 0; }
/* This is the 'fast' part of urgent handling. */ @@@ -5190,26 -5137,6 +5141,6 @@@ static void tcp_urg(struct sock *sk, st } }
- static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) - { - struct tcp_sock *tp = tcp_sk(sk); - int chunk = skb->len - hlen; - int err; - - if (skb_csum_unnecessary(skb)) - err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk); - else - err = skb_copy_and_csum_datagram_msg(skb, hlen, tp->ucopy.msg); - - if (!err) { - tp->ucopy.len -= chunk; - tp->copied_seq += chunk; - tcp_rcv_space_adjust(sk); - } - - return err; - } - /* Accept RST for rcv_nxt - 1 after a FIN. * When tcp connections are abruptly terminated from Mac OSX (via ^C), a * FIN is sent followed by a RST packet. The RST is sent with the same @@@ -5340,201 -5267,29 +5271,29 @@@ discard
/* * TCP receive function for the ESTABLISHED state. - * - * It is split into a fast path and a slow path. The fast path is - * disabled when: - * - A zero window was announced from us - zero window probing - * is only handled properly in the slow path. - * - Out of order segments arrived. - * - Urgent data is expected. - * - There is no buffer space left - * - Unexpected TCP flags/window values/header lengths are received - * (detected by checking the TCP header against pred_flags) - * - Data is sent in both directions. Fast path only supports pure senders - * or pure receivers (this means either the sequence number or the ack - * value must stay constant) - * - Unexpected TCP option. - * - * When these conditions are not satisfied it drops into a standard - * receive procedure patterned after RFC793 to handle all cases. - * The first three cases are guaranteed by proper pred_flags setting, - * the rest is checked inline. Fast processing is turned on in - * tcp_data_queue when everything is OK. */ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len) + const struct tcphdr *th) { + unsigned int len = skb->len; struct tcp_sock *tp = tcp_sk(sk);
tcp_mstamp_refresh(tp); if (unlikely(!sk->sk_rx_dst)) inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); - /* - * Header prediction. - * The code loosely follows the one in the famous - * "30 instruction TCP receive" Van Jacobson mail. - * - * Van's trick is to deposit buffers into socket queue - * on a device interrupt, to call tcp_recv function - * on the receive process context and checksum and copy - * the buffer to user space. smart... - * - * Our current scheme is not silly either but we take the - * extra cost of the net_bh soft interrupt processing... - * We do checksum and copy also but from device to kernel. - */
tp->rx_opt.saw_tstamp = 0;
- /* pred_flags is 0xS?10 << 16 + snd_wnd - * if header_prediction is to be made - * 'S' will always be tp->tcp_header_len >> 2 - * '?' will be 0 for the fast path, otherwise pred_flags is 0 to - * turn it off (when there are holes in the receive - * space for instance) - * PSH flag is ignored. - */ - - if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags && - TCP_SKB_CB(skb)->seq == tp->rcv_nxt && - !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) { - int tcp_header_len = tp->tcp_header_len; - - /* Timestamp header prediction: tcp_header_len - * is automatically equal to th->doff*4 due to pred_flags - * match. - */ - - /* Check timestamp */ - if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) { - /* No? Slow path! */ - if (!tcp_parse_aligned_timestamp(tp, th)) - goto slow_path; - - /* If PAWS failed, check it more carefully in slow path */ - if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0) - goto slow_path; - - /* DO NOT update ts_recent here, if checksum fails - * and timestamp was corrupted part, it will result - * in a hung connection since we will drop all - * future packets due to the PAWS test. - */ - } - - if (len <= tcp_header_len) { - /* Bulk data transfer: sender */ - if (len == tcp_header_len) { - /* Predicted packet is in window by definition. - * seq == rcv_nxt and rcv_wup <= rcv_nxt. - * Hence, check seq<=rcv_wup reduces to: - */ - if (tcp_header_len == - (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && - tp->rcv_nxt == tp->rcv_wup) - tcp_store_ts_recent(tp); - - /* We know that such packets are checksummed - * on entry. - */ - tcp_ack(sk, skb, 0); - __kfree_skb(skb); - tcp_data_snd_check(sk); - return; - } else { /* Header too small */ - TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); - goto discard; - } - } else { - int eaten = 0; - bool fragstolen = false; - - if (tp->ucopy.task == current && - tp->copied_seq == tp->rcv_nxt && - len - tcp_header_len <= tp->ucopy.len && - sock_owned_by_user(sk)) { - __set_current_state(TASK_RUNNING); - - if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) { - /* Predicted packet is in window by definition. - * seq == rcv_nxt and rcv_wup <= rcv_nxt. - * Hence, check seq<=rcv_wup reduces to: - */ - if (tcp_header_len == - (sizeof(struct tcphdr) + - TCPOLEN_TSTAMP_ALIGNED) && - tp->rcv_nxt == tp->rcv_wup) - tcp_store_ts_recent(tp); - - tcp_rcv_rtt_measure_ts(sk, skb); - - __skb_pull(skb, tcp_header_len); - tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPHPHITSTOUSER); - eaten = 1; - } - } - if (!eaten) { - if (tcp_checksum_complete(skb)) - goto csum_error; - - if ((int)skb->truesize > sk->sk_forward_alloc) - goto step5; - - /* Predicted packet is in window by definition. - * seq == rcv_nxt and rcv_wup <= rcv_nxt. - * Hence, check seq<=rcv_wup reduces to: - */ - if (tcp_header_len == - (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && - tp->rcv_nxt == tp->rcv_wup) - tcp_store_ts_recent(tp); - - tcp_rcv_rtt_measure_ts(sk, skb); - - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); - - /* Bulk data transfer: receiver */ - eaten = tcp_queue_rcv(sk, skb, tcp_header_len, - &fragstolen); - } - - tcp_event_data_recv(sk, skb); - - if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { - /* Well, only one small jumplet in fast path...
*/ - tcp_ack(sk, skb, FLAG_DATA); - tcp_data_snd_check(sk); - if (!inet_csk_ack_scheduled(sk)) - goto no_ack; - } - - __tcp_ack_snd_check(sk, 0); - no_ack: - if (eaten) - kfree_skb_partial(skb, fragstolen); - sk->sk_data_ready(sk); - return; - } - } - - slow_path: if (len < (th->doff << 2) || tcp_checksum_complete(skb)) goto csum_error;
if (!th->ack && !th->rst && !th->syn) goto discard;
- /* - * Standard slow path. - */ - if (!tcp_validate_incoming(sk, skb, th, 1)) return;
- step5: - if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0) + if (tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT) < 0) goto discard;
tcp_rcv_rtt_measure_ts(sk, skb); @@@ -5587,12 -5342,6 +5346,6 @@@ void tcp_finish_connect(struct sock *sk
if (sock_flag(sk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); - - if (!tp->rx_opt.snd_wscale) - __tcp_fast_path_on(tp, tp->snd_wnd); - else - tp->pred_flags = 0; - }
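[Sketch, not part of the diff: what the deleted fast path keyed on. Before this merge, __tcp_fast_path_on() packed the expected header length, the ACK flag, and the advertised window into the single 32-bit pred_flags word, so one compare against tcp_flag_word(th) classified an incoming segment. The helper below reconstructs that encoding from the "0xS?10 << 16 + snd_wnd" comment removed above; it no longer exists after this merge.]

	static inline void sketch_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
	{
		/* doff occupies the top 4 bits of the flag word: (len / 4) << 28 */
		tp->pred_flags = htonl((tp->tcp_header_len << 26) |
				       ntohl(TCP_FLAG_ACK) |
				       snd_wnd);
	}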
static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
@@@ -5721,7 -5470,7 +5474,7 @@@ static int tcp_rcv_synsent_state_proces
		tcp_ecn_rcv_synack(tp, th);

		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
-		tcp_ack(sk, skb, FLAG_SLOWPATH);
+		tcp_ack(sk, skb, 0);

		/* Ok.. it's good. Set up sequence numbers and
		 * move to established.
@@@ -5957,8 -5706,8 +5710,8 @@@ int tcp_rcv_state_process(struct sock *
		return 0;

	/* step 5: check the ACK field */
-	acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
-				      FLAG_UPDATE_TS_RECENT |
+
+	acceptable = tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT |
				      FLAG_NO_CHALLENGE_ACK) > 0;

	if (!acceptable) {
@@@ -6026,7 -5775,6 +5779,6 @@@
		tp->lsndtime = tcp_jiffies32;

		tcp_initialize_rcv_mss(sk);
-		tcp_fast_path_on(tp);
		break;

	case TCP_FIN_WAIT1: {
diff --combined net/ipv4/tcp_output.c
index 276406a83a37,d49bff51bdb7..7ae5de0018b5
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@@ -295,9 -295,7 +295,7 @@@ static u16 tcp_select_window(struct soc
	/* RFC1323 scaling applied */
	new_win >>= tp->rx_opt.rcv_wscale;

-	/* If we advertise zero window, disable fast path. */
	if (new_win == 0) {
-		tp->pred_flags = 0;
		if (old_win)
			NET_INC_STATS(sock_net(sk),
				      LINUX_MIB_TCPTOZEROWINDOWADV);
@@@ -2377,15 -2375,23 +2375,14 @@@ bool tcp_schedule_loss_probe(struct soc
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
-	u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
-	u32 timeout, tlp_time_stamp, rto_time_stamp;
+	u32 timeout, rto_delta_us;

-	/* No consecutive loss probes. */
-	if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
-		tcp_rearm_rto(sk);
-		return false;
-	}
	/* Don't do any loss probe on a Fast Open connection before 3WHS
	 * finishes.
	 */
	if (tp->fastopen_rsk)
		return false;

-	/* TLP is only scheduled when next timer event is RTO. */
-	if (icsk->icsk_pending != ICSK_TIME_RETRANS)
-		return false;
-
	/* Schedule a loss probe in 2*RTT for SACK capable connections
	 * in Open state, that are either limited by cwnd or application.
	 */
@@@ -2398,20 -2404,28 +2395,24 @@@
	    tcp_send_head(sk))
		return false;

-	/* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account
+	/* Probe timeout is 2*rtt. Add minimum RTO to account
	 * for delayed ack when there's one outstanding packet. If no RTT
	 * sample is available then probe after TCP_TIMEOUT_INIT.
	 */
-	timeout = rtt << 1 ? : TCP_TIMEOUT_INIT;
-	if (tp->packets_out == 1)
-		timeout = max_t(u32, timeout,
-				(rtt + (rtt >> 1) + TCP_DELACK_MAX));
-	timeout = max_t(u32, timeout, msecs_to_jiffies(10));
+	if (tp->srtt_us) {
+		timeout = usecs_to_jiffies(tp->srtt_us >> 2);
+		if (tp->packets_out == 1)
+			timeout += TCP_RTO_MIN;
+		else
+			timeout += TCP_TIMEOUT_MIN;
+	} else {
+		timeout = TCP_TIMEOUT_INIT;
+	}

-	/* If RTO is shorter, just schedule TLP in its place. */
-	tlp_time_stamp = tcp_jiffies32 + timeout;
-	rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout;
-	if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) {
-		s32 delta = rto_time_stamp - tcp_jiffies32;
-		if (delta > 0)
-			timeout = delta;
-	}
+	/* If the RTO formula yields an earlier time, then use that time. */
+	rto_delta_us = tcp_rto_delta_us(sk);	/* How far in future is RTO? */
+	if (rto_delta_us > 0)
+		timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
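[Sketch, not part of the diff: the new timeout arithmetic in isolation. tp->srtt_us stores 8 * smoothed RTT in microseconds, so srtt_us >> 2 is 2 * srtt — the "2*rtt" probe timeout named in the new comment. Function and parameter names here are hypothetical.]

	static u32 sketch_tlp_timeout(u32 srtt_us, u32 packets_out)
	{
		u32 timeout;

		if (!srtt_us)
			return TCP_TIMEOUT_INIT;	/* no RTT sample yet */

		timeout = usecs_to_jiffies(srtt_us >> 2);	/* 2 * srtt */
		if (packets_out == 1)
			timeout += TCP_RTO_MIN;		/* cover a delayed ACK */
		else
			timeout += TCP_TIMEOUT_MIN;
		return timeout;
	}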
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
				  TCP_RTO_MAX);
diff --combined net/ipv4/tcp_timer.c
index e906014890b6,f753f9d2fee3..655dd8d7f064
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@@ -239,7 -239,6 +239,6 @@@ static int tcp_write_timeout(struct soc
/* Called with BH disabled */
void tcp_delack_timer_handler(struct sock *sk)
{
-	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	sk_mem_reclaim_partial(sk);
@@@ -254,17 -253,6 +253,6 @@@
	}
	icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;

-	if (!skb_queue_empty(&tp->ucopy.prequeue)) {
-		struct sk_buff *skb;
-
-		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED);
-
-		while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
-			sk_backlog_rcv(sk, skb);
-
-		tp->ucopy.memory = 0;
-	}
-
	if (inet_csk_ack_scheduled(sk)) {
		if (!icsk->icsk_ack.pingpong) {
			/* Delayed ACK missed: inflate ATO. */
@@@ -652,8 -640,7 +640,8 @@@ static void tcp_keepalive_timer (unsign
		goto death;
	}

-	if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
+	if (!sock_flag(sk, SOCK_KEEPOPEN) ||
+	    ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)))
		goto out;
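[Sketch, not part of the diff: the state-mask idiom used in the new keepalive check. TCPF_* flags are defined as 1 << TCP_*, so a single AND tests membership in a set of states instead of chaining == comparisons; one test now covers both TCP_CLOSE and TCP_SYN_SENT.]

	/* true if sk_state is any of the states in mask */
	static inline bool sketch_state_in(int sk_state, int mask)
	{
		return (1 << sk_state) & mask;	/* e.g. TCPF_CLOSE | TCPF_SYN_SENT */
	}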
	elapsed = keepalive_time_when(tp);
diff --combined net/ipv6/route.c
index a640fbcba15d,aba07fce67fb..521f58183240
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@@ -1820,6 -1820,11 +1820,11 @@@ static struct rt6_info *ip6_route_info_
		goto out;
	}

+	if (cfg->fc_flags & RTF_OFFLOAD) {
+		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_OFFLOAD");
+		goto out;
+	}
+
	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
@@@ -2351,7 -2356,6 +2356,7 @@@ static void rt6_do_redirect(struct dst_
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

+	nrt->rt6i_protocol = RTPROT_REDIRECT;
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	if (ip6_ins_rt(nrt))
@@@ -2462,7 -2466,6 +2467,7 @@@ static struct rt6_info *rt6_add_route_i
		.fc_dst_len = prefixlen,
		.fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
			    RTF_UP | RTF_PREF(pref),
+		.fc_protocol = RTPROT_RA,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
@@@ -2515,7 -2518,6 +2520,7 @@@ struct rt6_info *rt6_add_dflt_router(co
		.fc_ifindex = dev->ifindex,
		.fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
			    RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
+		.fc_protocol = RTPROT_RA,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = dev_net(dev),
@@@ -3330,6 -3332,9 +3335,9 @@@ static int rt6_nexthop_info(struct sk_b
			goto nla_put_failure;
	}

+	if (rt->rt6i_flags & RTF_OFFLOAD)
+		*flags |= RTNH_F_OFFLOAD;
+
	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
@@@ -3427,6 -3432,14 +3435,6 @@@ static int rt6_fill_node(struct net *ne
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->rt6i_protocol;
-	if (rt->rt6i_flags & RTF_DYNAMIC)
-		rtm->rtm_protocol = RTPROT_REDIRECT;
-	else if (rt->rt6i_flags & RTF_ADDRCONF) {
-		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
-			rtm->rtm_protocol = RTPROT_RA;
-		else
-			rtm->rtm_protocol = RTPROT_KERNEL;
-	}

	if (rt->rt6i_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;
diff --combined net/xfrm/xfrm_policy.c
index 6f5a0dad502f,06c3bf7ab86b..2ecab54b74b1
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@@ -24,6 -24,7 +24,7 @@@
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/cache.h>
+ #include <linux/cpu.h>
#include <linux/audit.h>
#include <net/dst.h>
#include <net/flow.h>
@@@ -44,6 -45,8 +45,8 @@@ struct xfrm_flo
	u8 flags;
};
+ static DEFINE_PER_CPU(struct xfrm_dst *, xfrm_last_dst);
+ static struct work_struct *xfrm_pcpu_work __read_mostly;
static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
						__read_mostly;
@@@ -246,36 -249,6 +249,6 @@@ expired
	xfrm_pol_put(xp);
}

- static struct flow_cache_object *xfrm_policy_flo_get(struct flow_cache_object *flo)
- {
-	struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo);
-
-	if (unlikely(pol->walk.dead))
-		flo = NULL;
-	else
-		xfrm_pol_hold(pol);
-
-	return flo;
- }
-
- static int xfrm_policy_flo_check(struct flow_cache_object *flo)
- {
-	struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo);
-
-	return !pol->walk.dead;
- }
-
- static void xfrm_policy_flo_delete(struct flow_cache_object *flo)
- {
-	xfrm_pol_put(container_of(flo, struct xfrm_policy, flo));
- }
-
- static const struct flow_cache_ops xfrm_policy_fc_ops = {
-	.get = xfrm_policy_flo_get,
-	.check = xfrm_policy_flo_check,
-	.delete = xfrm_policy_flo_delete,
- };
-
/* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2
 * SPD calls.
 */
@@@ -298,7 -271,6 +271,6 @@@ struct xfrm_policy *xfrm_policy_alloc(s
			    (unsigned long)policy);
		setup_timer(&policy->polq.hold_timer,
			    xfrm_policy_queue_process, (unsigned long)policy);
-		policy->flo.ops = &xfrm_policy_fc_ops;
	}
	return policy;
}
@@@ -798,7 -770,6 +770,6 @@@ int xfrm_policy_insert(int dir, struct
	else
		hlist_add_head(&policy->bydst, chain);
	__xfrm_policy_link(policy, dir);
-	atomic_inc(&net->xfrm.flow_cache_genid);

	/* After previous checking, family can either be AF_INET or AF_INET6 */
	if (policy->family == AF_INET)
@@@ -1004,6 -975,8 +975,8 @@@ int xfrm_policy_flush(struct net *net,
	}
	if (!cnt)
		err = -ESRCH;
+	else
+		xfrm_policy_cache_flush();
out:
	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
	return err;
@@@ -1175,7 -1148,7 +1148,7 @@@ fail
}

static struct xfrm_policy *
- __xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir)
+ xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir)
{
#ifdef CONFIG_XFRM_SUB_POLICY
	struct xfrm_policy *pol;
@@@ -1187,61 -1160,6 +1160,6 @@@
	return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
}

- static int flow_to_policy_dir(int dir)
- {
-	if (XFRM_POLICY_IN == FLOW_DIR_IN &&
-	    XFRM_POLICY_OUT == FLOW_DIR_OUT &&
-	    XFRM_POLICY_FWD == FLOW_DIR_FWD)
-		return dir;
-
-	switch (dir) {
-	default:
-	case FLOW_DIR_IN:
-		return XFRM_POLICY_IN;
-	case FLOW_DIR_OUT:
-		return XFRM_POLICY_OUT;
-	case FLOW_DIR_FWD:
-		return XFRM_POLICY_FWD;
-	}
- }
-
- static struct flow_cache_object *
- xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family,
-		    u8 dir, struct flow_cache_object *old_obj, void *ctx)
- {
-	struct xfrm_policy *pol;
-
-	if (old_obj)
-		xfrm_pol_put(container_of(old_obj, struct xfrm_policy, flo));
-
-	pol = __xfrm_policy_lookup(net, fl, family, flow_to_policy_dir(dir));
-	if (IS_ERR_OR_NULL(pol))
-		return ERR_CAST(pol);
-
-	/* Resolver returns two references:
-	 * one for cache and one for caller of flow_cache_lookup() */
-	xfrm_pol_hold(pol);
-
-	return &pol->flo;
- }
-
- static inline int policy_to_flow_dir(int dir)
- {
-	if (XFRM_POLICY_IN == FLOW_DIR_IN &&
-	    XFRM_POLICY_OUT == FLOW_DIR_OUT &&
-	    XFRM_POLICY_FWD == FLOW_DIR_FWD)
-		return dir;
-	switch (dir) {
-	default:
-	case XFRM_POLICY_IN:
-		return FLOW_DIR_IN;
-	case XFRM_POLICY_OUT:
-		return FLOW_DIR_OUT;
-	case XFRM_POLICY_FWD:
-		return FLOW_DIR_FWD;
-	}
- }
-
static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
						 const struct flowi *fl,
						 u16 family)
{
@@@ -1261,7 -1179,7 +1179,7 @@@
	}
	err = security_xfrm_policy_lookup(pol->security,
					  fl->flowi_secid,
-					  policy_to_flow_dir(dir));
+					  dir);
	if (!err) {
		if (!xfrm_pol_hold_rcu(pol))
			goto again;
@@@ -1545,58 -1463,6 +1463,6 @@@ static int xfrm_get_tos(const struct fl
	return tos;
}

- static struct flow_cache_object *xfrm_bundle_flo_get(struct flow_cache_object *flo)
- {
-	struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
-	struct dst_entry *dst = &xdst->u.dst;
-
-	if (xdst->route == NULL) {
-		/* Dummy bundle - if it has xfrms we were not
-		 * able to build bundle as template resolution failed.
-		 * It means we need to try again resolving. */
-		if (xdst->num_xfrms > 0)
-			return NULL;
-	} else if (dst->flags & DST_XFRM_QUEUE) {
-		return NULL;
-	} else {
-		/* Real bundle */
-		if (stale_bundle(dst))
-			return NULL;
-	}
-
-	dst_hold(dst);
-	return flo;
- }
-
- static int xfrm_bundle_flo_check(struct flow_cache_object *flo)
- {
-	struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
-	struct dst_entry *dst = &xdst->u.dst;
-
-	if (!xdst->route)
-		return 0;
-	if (stale_bundle(dst))
-		return 0;
-
-	return 1;
- }
-
- static void xfrm_bundle_flo_delete(struct flow_cache_object *flo)
- {
-	struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
-	struct dst_entry *dst = &xdst->u.dst;
-
-	/* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */
-	dst->obsolete = DST_OBSOLETE_DEAD;
-	dst_release_immediate(dst);
- }
-
- static const struct flow_cache_ops xfrm_bundle_fc_ops = {
-	.get = xfrm_bundle_flo_get,
-	.check = xfrm_bundle_flo_check,
-	.delete = xfrm_bundle_flo_delete,
- };
-
static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
{
	const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
@@@ -1624,7 -1490,6 +1490,6 @@@
		struct dst_entry *dst = &xdst->u.dst;

		memset(dst + 1, 0, sizeof(*xdst) - sizeof(*dst));
-		xdst->flo.ops = &xfrm_bundle_fc_ops;
	} else
		xdst = ERR_PTR(-ENOBUFS);
@@@ -1840,6 -1705,102 +1705,102 @@@ static int xfrm_expand_policies(const s
}
+ static void xfrm_last_dst_update(struct xfrm_dst *xdst, struct xfrm_dst *old)
+ {
+	this_cpu_write(xfrm_last_dst, xdst);
+	if (old)
+		dst_release(&old->u.dst);
+ }
+
+ static void __xfrm_pcpu_work_fn(void)
+ {
+	struct xfrm_dst *old;
+
+	old = this_cpu_read(xfrm_last_dst);
+	if (old && !xfrm_bundle_ok(old))
+		xfrm_last_dst_update(NULL, old);
+ }
+
+ static void xfrm_pcpu_work_fn(struct work_struct *work)
+ {
+	local_bh_disable();
+	rcu_read_lock();
+	__xfrm_pcpu_work_fn();
+	rcu_read_unlock();
+	local_bh_enable();
+ }
+
+ void xfrm_policy_cache_flush(void)
+ {
+	struct xfrm_dst *old;
+	bool found = 0;
+	int cpu;
+
+	local_bh_disable();
+	rcu_read_lock();
+	for_each_possible_cpu(cpu) {
+		old = per_cpu(xfrm_last_dst, cpu);
+		if (old && !xfrm_bundle_ok(old)) {
+			if (smp_processor_id() == cpu) {
+				__xfrm_pcpu_work_fn();
+				continue;
+			}
+			found = true;
+			break;
+		}
+	}
+
+	rcu_read_unlock();
+	local_bh_enable();
+
+	if (!found)
+		return;
+
+	get_online_cpus();
+
+	for_each_possible_cpu(cpu) {
+		bool bundle_release;
+
+		rcu_read_lock();
+		old = per_cpu(xfrm_last_dst, cpu);
+		bundle_release = old && !xfrm_bundle_ok(old);
+		rcu_read_unlock();
+
+		if (!bundle_release)
+			continue;
+
+		if (cpu_online(cpu)) {
+			schedule_work_on(cpu, &xfrm_pcpu_work[cpu]);
+			continue;
+		}
+
+		rcu_read_lock();
+		old = per_cpu(xfrm_last_dst, cpu);
+		if (old && !xfrm_bundle_ok(old)) {
+			per_cpu(xfrm_last_dst, cpu) = NULL;
+			dst_release(&old->u.dst);
+		}
+		rcu_read_unlock();
+	}
+
+	put_online_cpus();
+ }
+
+ static bool xfrm_pol_dead(struct xfrm_dst *xdst)
+ {
+	unsigned int num_pols = xdst->num_pols;
+	unsigned int pol_dead = 0, i;
+
+	for (i = 0; i < num_pols; i++)
+		pol_dead |= xdst->pols[i]->walk.dead;
+
+	/* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */
+	if (pol_dead)
+		xdst->u.dst.obsolete = DST_OBSOLETE_DEAD;
+
+	return pol_dead;
+ }
+
static struct xfrm_dst *
xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
			       const struct flowi *fl, u16 family,
@@@ -1847,10 -1808,22 +1808,22 @@@
{
	struct net *net = xp_net(pols[0]);
	struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
+	struct xfrm_dst *xdst, *old;
	struct dst_entry *dst;
-	struct xfrm_dst *xdst;
	int err;

+	xdst = this_cpu_read(xfrm_last_dst);
+	if (xdst &&
+	    xdst->u.dst.dev == dst_orig->dev &&
+	    xdst->num_pols == num_pols &&
+	    !xfrm_pol_dead(xdst) &&
+	    memcmp(xdst->pols, pols,
+		   sizeof(struct xfrm_policy *) * num_pols) == 0) {
+		dst_hold(&xdst->u.dst);
+		return xdst;
+	}
+
+	old = xdst;
	/* Try to instantiate a bundle */
	err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family);
	if (err <= 0) {
@@@ -1871,6 -1844,9 +1844,9 @@@
	memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);
	xdst->policy_genid = atomic_read(&pols[0]->genid);

+	atomic_set(&xdst->u.dst.__refcnt, 2);
+	xfrm_last_dst_update(xdst, old);
+
	return xdst;
}
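[Sketch, not part of the diff: a self-contained user-space analogue of the per-CPU bundle cache introduced above, with hypothetical names. The kernel version uses this_cpu_read()/this_cpu_write() and dst refcounting, but the shape is the same: one slot per CPU, a hit requires the same output device and an identical policy array, and replacing the slot drops the reference the cache held on the old bundle — which is why the new bundle starts with a refcount of 2, one for the caller and one for the cache.]

	#include <stdlib.h>
	#include <string.h>

	struct bundle {
		int refcnt;
		const void *dev;		/* output device identity */
		int num_pols;
		const void *pols[4];		/* policies the bundle was built from */
	};

	static struct bundle *last_bundle[64];	/* one slot per "CPU" */

	static struct bundle *cache_lookup(int cpu, const void *dev,
					   const void *const *pols, int num_pols)
	{
		struct bundle *b = last_bundle[cpu];

		if (b && b->dev == dev && b->num_pols == num_pols &&
		    !memcmp(b->pols, pols, sizeof(*pols) * num_pols)) {
			b->refcnt++;		/* like dst_hold() on a cache hit */
			return b;
		}
		return NULL;			/* miss: caller builds a new bundle */
	}

	static void cache_update(int cpu, struct bundle *new_b)
	{
		struct bundle *old = last_bundle[cpu];

		new_b->refcnt++;		/* extra reference kept by the cache */
		last_bundle[cpu] = new_b;
		if (old && --old->refcnt == 0)	/* like dst_release() of the old one */
			free(old);
	}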
@@@ -2051,86 -2027,39 +2027,39 @@@ free_dst
	goto out;
}

- static struct flow_cache_object *
- xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir,
-		    struct flow_cache_object *oldflo, void *ctx)
+ static struct xfrm_dst *
+ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, struct xfrm_flo *xflo)
{
-	struct xfrm_flo *xflo = (struct xfrm_flo *)ctx;
	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
-	struct xfrm_dst *xdst, *new_xdst;
-	int num_pols = 0, num_xfrms = 0, i, err, pol_dead;
-
-	/* Check if the policies from old bundle are usable */
-	xdst = NULL;
-	if (oldflo) {
-		xdst = container_of(oldflo, struct xfrm_dst, flo);
-		num_pols = xdst->num_pols;
-		num_xfrms = xdst->num_xfrms;
-		pol_dead = 0;
-		for (i = 0; i < num_pols; i++) {
-			pols[i] = xdst->pols[i];
-			pol_dead |= pols[i]->walk.dead;
-		}
-		if (pol_dead) {
-			/* Mark DST_OBSOLETE_DEAD to fail the next
-			 * xfrm_dst_check()
-			 */
-			xdst->u.dst.obsolete = DST_OBSOLETE_DEAD;
-			dst_release_immediate(&xdst->u.dst);
-			xdst = NULL;
-			num_pols = 0;
-			num_xfrms = 0;
-			oldflo = NULL;
-		}
-	}
+	int num_pols = 0, num_xfrms = 0, err;
+	struct xfrm_dst *xdst;

	/* Resolve policies to use if we couldn't get them from
	 * previous cache entry */
-	if (xdst == NULL) {
-		num_pols = 1;
-		pols[0] = __xfrm_policy_lookup(net, fl, family,
-					       flow_to_policy_dir(dir));
-		err = xfrm_expand_policies(fl, family, pols,
+	num_pols = 1;
+	pols[0] = xfrm_policy_lookup(net, fl, family, dir);
+	err = xfrm_expand_policies(fl, family, pols,
					   &num_pols, &num_xfrms);
-		if (err < 0)
-			goto inc_error;
-		if (num_pols == 0)
-			return NULL;
-		if (num_xfrms <= 0)
-			goto make_dummy_bundle;
-	}
+	if (err < 0)
+		goto inc_error;
+	if (num_pols == 0)
+		return NULL;
+	if (num_xfrms <= 0)
+		goto make_dummy_bundle;

-	new_xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family,
+	xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family,
						  xflo->dst_orig);
-	if (IS_ERR(new_xdst)) {
-		err = PTR_ERR(new_xdst);
+	if (IS_ERR(xdst)) {
+		err = PTR_ERR(xdst);
		if (err != -EAGAIN)
			goto error;
-		if (oldflo == NULL)
-			goto make_dummy_bundle;
-		dst_hold(&xdst->u.dst);
-		return oldflo;
-	} else if (new_xdst == NULL) {
+		goto make_dummy_bundle;
+	} else if (xdst == NULL) {
		num_xfrms = 0;
-		if (oldflo == NULL)
-			goto make_dummy_bundle;
-		xdst->num_xfrms = 0;
-		dst_hold(&xdst->u.dst);
-		return oldflo;
-	}
-
-	/* Kill the previous bundle */
-	if (xdst) {
-		/* The policies were stolen for newly generated bundle */
-		xdst->num_pols = 0;
-		/* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */
-		xdst->u.dst.obsolete = DST_OBSOLETE_DEAD;
-		dst_release_immediate(&xdst->u.dst);
+		goto make_dummy_bundle;
	}

-	/* We do need to return one reference for original caller */
-	dst_hold(&new_xdst->u.dst);
-	return &new_xdst->flo;
+	return xdst;

make_dummy_bundle:
	/* We found policies, but there's no bundles to instantiate:
@@@ -2146,17 -2075,12 +2075,12 @@@
	memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);

	dst_hold(&xdst->u.dst);
-	return &xdst->flo;
+	return xdst;

inc_error:
	XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
error:
-	if (xdst != NULL) {
-		/* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */
-		xdst->u.dst.obsolete = DST_OBSOLETE_DEAD;
-		dst_release_immediate(&xdst->u.dst);
-	} else
-		xfrm_pols_put(pols, num_pols);
+	xfrm_pols_put(pols, num_pols);
	return ERR_PTR(err);
}
@@@ -2187,11 -2111,10 +2111,10 @@@ struct dst_entry *xfrm_lookup(struct ne
			      const struct sock *sk, int flags)
{
	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
-	struct flow_cache_object *flo;
	struct xfrm_dst *xdst;
	struct dst_entry *dst, *route;
	u16 family = dst_orig->ops->family;
-	u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
+	u8 dir = XFRM_POLICY_OUT;
	int i, err, num_pols, num_xfrms = 0, drop_pols = 0;

	dst = NULL;
@@@ -2242,15 -2165,13 +2165,13 @@@
		    !net->xfrm.policy_count[XFRM_POLICY_OUT])
			goto nopol;

-		flo = flow_cache_lookup(net, fl, family, dir,
-					xfrm_bundle_lookup, &xflo);
-		if (flo == NULL)
+		xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo);
+		if (xdst == NULL)
			goto nopol;
-		if (IS_ERR(flo)) {
-			err = PTR_ERR(flo);
+		if (IS_ERR(xdst)) {
+			err = PTR_ERR(xdst);
			goto dropdst;
		}
-		xdst = container_of(flo, struct xfrm_dst, flo);

		num_pols = xdst->num_pols;
		num_xfrms = xdst->num_xfrms;
@@@ -2449,12 -2370,10 +2370,10 @@@ int __xfrm_policy_check(struct sock *sk
	int pi;
	int reverse;
	struct flowi fl;
-	u8 fl_dir;
	int xerr_idx = -1;

	reverse = dir & ~XFRM_POLICY_MASK;
	dir &= XFRM_POLICY_MASK;
-	fl_dir = policy_to_flow_dir(dir);

	if (__xfrm_decode_session(skb, &fl, family, reverse) < 0) {
		XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
@@@ -2486,16 -2405,8 +2405,8 @@@
		}
	}

-	if (!pol) {
-		struct flow_cache_object *flo;
-
-		flo = flow_cache_lookup(net, &fl, family, fl_dir,
-					xfrm_policy_lookup, NULL);
-		if (IS_ERR_OR_NULL(flo))
-			pol = ERR_CAST(flo);
-		else
-			pol = container_of(flo, struct xfrm_policy, flo);
-	}
+	if (!pol)
+		pol = xfrm_policy_lookup(net, &fl, family, dir);

	if (IS_ERR(pol)) {
		XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
@@@ -2641,11 -2552,9 +2552,9 @@@ static struct dst_entry *xfrm_dst_check
	 * notice.  That's what we are validating here via the
	 * stale_bundle() check.
	 *
-	 * When an xdst is removed from flow cache, DST_OBSOLETE_DEAD will
-	 * be marked on it.
	 * When a dst is removed from the fib tree, DST_OBSOLETE_DEAD will
	 * be marked on it.
-	 * Both will force stable_bundle() to fail on any xdst bundle with
+	 * This will force stale_bundle() to fail on any xdst bundle with
	 * this dst linked in it.
	 */
	if (dst->obsolete < 0 && !stale_bundle(dst))
@@@ -2685,18 -2594,6 +2594,6 @@@ static struct dst_entry *xfrm_negative_
	return dst;
}

- void xfrm_garbage_collect(struct net *net)
- {
-	flow_cache_flush(net);
- }
- EXPORT_SYMBOL(xfrm_garbage_collect);
-
- void xfrm_garbage_collect_deferred(struct net *net)
- {
-	flow_cache_flush_deferred(net);
- }
- EXPORT_SYMBOL(xfrm_garbage_collect_deferred);
-
static void xfrm_init_pmtu(struct dst_entry *dst)
{
	do {
@@@ -3034,14 -2931,9 +2931,9 @@@ static int __net_init xfrm_net_init(str
	rv = xfrm_sysctl_init(net);
	if (rv < 0)
		goto out_sysctl;
-	rv = flow_cache_init(net);
-	if (rv < 0)
-		goto out;
return 0;
- out:
-	xfrm_sysctl_fini(net);
out_sysctl:
	xfrm_policy_fini(net);
out_policy:
@@@ -3054,7 -2946,6 +2946,6 @@@ out_statistics

static void __net_exit xfrm_net_exit(struct net *net)
{
-	flow_cache_fini(net);
	xfrm_sysctl_fini(net);
	xfrm_policy_fini(net);
	xfrm_state_fini(net);
@@@ -3068,7 -2959,15 +2959,15 @@@ static struct pernet_operations __net_i

void __init xfrm_init(void)
{
-	flow_cache_hp_init();
+	int i;
+
+	xfrm_pcpu_work = kmalloc_array(NR_CPUS, sizeof(*xfrm_pcpu_work),
+				       GFP_KERNEL);
+	BUG_ON(!xfrm_pcpu_work);
+
+	for (i = 0; i < NR_CPUS; i++)
+		INIT_WORK(&xfrm_pcpu_work[i], xfrm_pcpu_work_fn);
+
	register_pernet_subsys(&xfrm_net_ops);
	seqcount_init(&xfrm_policy_hash_generation);
	xfrm_input_init();
@@@ -3308,15 -3207,9 +3207,15 @@@ int xfrm_migrate(const struct xfrm_sele
	struct xfrm_state *x_new[XFRM_MAX_DEPTH];
	struct xfrm_migrate *mp;

+	/* Stage 0 - sanity checks */
	if ((err = xfrm_migrate_check(m, num_migrate)) < 0)
		goto out;

+	if (dir >= XFRM_POLICY_MAX) {
+		err = -EINVAL;
+		goto out;
+	}
+
	/* Stage 1 - find policy */
	if ((pol = xfrm_migrate_policy_find(sel, dir, type, net)) == NULL) {
		err = -ENOENT;
		goto out;
	}
diff --combined net/xfrm/xfrm_state.c
index a792effdb0b5,82cbbce69b79..fd63c2fdbc39
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@@ -724,9 -724,10 +724,10 @@@ restart
			}
		}
	}
-	if (cnt)
+	if (cnt) {
		err = 0;
-
+		xfrm_policy_cache_flush();
+	}
out:
	spin_unlock_bh(&net->xfrm.xfrm_state_lock);
	return err;
@@@ -1620,7 -1621,6 +1621,7 @@@ in
xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n,
	       unsigned short family, struct net *net)
{
+	int i;
	int err = 0;
	struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
	if (!afinfo)
@@@ -1629,9 -1629,6 +1630,9 @@@
	spin_lock_bh(&net->xfrm.xfrm_state_lock); /*FIXME*/
	if (afinfo->tmpl_sort)
		err = afinfo->tmpl_sort(dst, src, n);
+	else
+		for (i = 0; i < n; i++)
+			dst[i] = src[i];
	spin_unlock_bh(&net->xfrm.xfrm_state_lock);
	rcu_read_unlock();
	return err;
@@@ -1642,7 -1639,6 +1643,7 @@@ in
xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n,
		unsigned short family)
{
+	int i;
	int err = 0;
	struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
	struct net *net = xs_net(*src);
@@@ -1653,9 -1649,6 +1654,9 @@@
	spin_lock_bh(&net->xfrm.xfrm_state_lock);
	if (afinfo->state_sort)
		err = afinfo->state_sort(dst, src, n);
+	else
+		for (i = 0; i < n; i++)
+			dst[i] = src[i];
	spin_unlock_bh(&net->xfrm.xfrm_state_lock);
	rcu_read_unlock();
	return err;
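[Sketch, not part of the diff: the fallback added to xfrm_tmpl_sort()/xfrm_state_sort() above, in isolation. When an address family registers no sort callback, dst[] would otherwise be left unpopulated even though callers consume it, which appears to be the motivation here; the fix copies the entries through in their original order. Names below are hypothetical.]

	static void sketch_sort_or_copy(void **dst, void **src, int n,
					int (*sort)(void **, void **, int))
	{
		int i;

		if (sort)
			sort(dst, src, n);	/* afinfo->tmpl_sort / state_sort */
		else
			for (i = 0; i < n; i++)
				dst[i] = src[i];	/* identity copy so dst[] is valid */
	}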