The following commit has been merged in the master branch: commit 3118e6e19da7b8d76b2456b880c74a9aa3a2268b Merge: feca7d8c135bc1527b244fe817b8b6498066ccec 48fb6f4db940e92cfb16cd878cddd59ea6120d06 Author: David S. Miller davem@davemloft.net Date: Wed Aug 9 16:28:45 2017 -0700
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
The UDP offload conflict is dealt with by simply taking what is in net-next where we have removed all of the UFO handling code entirely.
The TCP conflict was a case of local variables in a function being removed from both net and net-next.
In netvsc we had an assignment right next to where a missing set of u64 stats sync object inits were added.
Signed-off-by: David S. Miller davem@davemloft.net
diff --combined MAINTAINERS index b3a8ca6aa3ed,3c419022ed93..7cb7f4c3ad3f --- a/MAINTAINERS +++ b/MAINTAINERS @@@ -1161,7 -1161,7 +1161,7 @@@ M: Brendan Higgins <brendanhiggins@goog R: Benjamin Herrenschmidt benh@kernel.crashing.org R: Joel Stanley joel@jms.id.au L: linux-i2c@vger.kernel.org - L: openbmc@lists.ozlabs.org + L: openbmc@lists.ozlabs.org (moderated for non-subscribers) S: Maintained F: drivers/irqchip/irq-aspeed-i2c-ic.c F: drivers/i2c/busses/i2c-aspeed.c @@@ -2477,7 -2477,7 +2477,7 @@@ Q: https://patchwork.open-mesh.org/proj S: Maintained F: Documentation/ABI/testing/sysfs-class-net-batman-adv F: Documentation/ABI/testing/sysfs-class-net-mesh -F: Documentation/networking/batman-adv.txt +F: Documentation/networking/batman-adv.rst F: include/uapi/linux/batman_adv.h F: net/batman-adv/
@@@ -5101,7 -5101,6 +5101,7 @@@ F: include/linux/of_net. F: include/linux/phy.h F: include/linux/phy_fixed.h F: include/linux/platform_data/mdio-gpio.h +F: include/linux/platform_data/mdio-bcm-unimac.h F: include/trace/events/mdio.h F: include/uapi/linux/mdio.h F: include/uapi/linux/mii.h @@@ -5835,7 -5834,7 +5835,7 @@@ F: drivers/staging/greybus/spi. F: drivers/staging/greybus/spilib.c F: drivers/staging/greybus/spilib.h
- GREYBUS LOOBACK/TIME PROTOCOLS DRIVERS + GREYBUS LOOPBACK/TIME PROTOCOLS DRIVERS M: Bryan O'Donoghue pure.logic@nexus-software.ie S: Maintained F: drivers/staging/greybus/loopback.c @@@ -6148,14 -6147,6 +6148,14 @@@ S: Maintaine F: drivers/net/ethernet/hisilicon/ F: Documentation/devicetree/bindings/net/hisilicon*.txt
+HISILICON NETWORK SUBSYSTEM 3 DRIVER (HNS3) +M: Yisen Zhuang yisen.zhuang@huawei.com +M: Salil Mehta salil.mehta@huawei.com +L: netdev@vger.kernel.org +W: http://www.hisilicon.com +S: Maintained +F: drivers/net/ethernet/hisilicon/hns3/ + HISILICON ROCE DRIVER M: Lijun Ou oulijun@huawei.com M: Wei Hu(Xavier) xavier.huwei@huawei.com @@@ -6266,7 -6257,6 +6266,7 @@@ M: Haiyang Zhang <haiyangz@microsoft.co M: Stephen Hemminger sthemmin@microsoft.com L: devel@linuxdriverproject.org S: Maintained +F: Documentation/networking/netvsc.txt F: arch/x86/include/asm/mshyperv.h F: arch/x86/include/uapi/asm/hyperv.h F: arch/x86/kernel/cpu/mshyperv.c @@@ -8434,9 -8424,7 +8434,9 @@@ F: include/uapi/linux/uvcvideo.
MEDIATEK ETHERNET DRIVER M: Felix Fietkau nbd@openwrt.org -M: John Crispin blogic@openwrt.org +M: John Crispin john@phrozen.org +M: Sean Wang sean.wang@mediatek.com +M: Nelson Chang nelson.chang@mediatek.com L: netdev@vger.kernel.org S: Maintained F: drivers/net/ethernet/mediatek/ diff --combined drivers/net/dsa/mt7530.c index 12700710f26d,264b281eb86b..8faa796a115f --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@@ -625,6 -625,44 +625,44 @@@ static void mt7530_adjust_link(struct d * all finished. */ mt7623_pad_clk_setup(ds); + } else { + u16 lcl_adv = 0, rmt_adv = 0; + u8 flowctrl; + u32 mcr = PMCR_USERP_LINK | PMCR_FORCE_MODE; + + switch (phydev->speed) { + case SPEED_1000: + mcr |= PMCR_FORCE_SPEED_1000; + break; + case SPEED_100: + mcr |= PMCR_FORCE_SPEED_100; + break; + }; + + if (phydev->link) + mcr |= PMCR_FORCE_LNK; + + if (phydev->duplex) { + mcr |= PMCR_FORCE_FDX; + + if (phydev->pause) + rmt_adv = LPA_PAUSE_CAP; + if (phydev->asym_pause) + rmt_adv |= LPA_PAUSE_ASYM; + + if (phydev->advertising & ADVERTISED_Pause) + lcl_adv |= ADVERTISE_PAUSE_CAP; + if (phydev->advertising & ADVERTISED_Asym_Pause) + lcl_adv |= ADVERTISE_PAUSE_ASYM; + + flowctrl = mii_resolve_flowctrl_fdx(lcl_adv, rmt_adv); + + if (flowctrl & FLOW_CTRL_TX) + mcr |= PMCR_TX_FC_EN; + if (flowctrl & FLOW_CTRL_RX) + mcr |= PMCR_RX_FC_EN; + } + mt7530_write(priv, MT7530_PMCR_P(port), mcr); } }
@@@ -801,31 -839,49 +839,31 @@@ mt7530_port_bridge_leave(struct dsa_swi }
static int -mt7530_port_fdb_prepare(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_fdb *fdb, - struct switchdev_trans *trans) +mt7530_port_fdb_add(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid) { struct mt7530_priv *priv = ds->priv; int ret; + u8 port_mask = BIT(port);
- /* Because auto-learned entrie shares the same FDB table. - * an entry is reserved with no port_mask to make sure fdb_add - * is called while the entry is still available. - */ mutex_lock(&priv->reg_mutex); - mt7530_fdb_write(priv, fdb->vid, 0, fdb->addr, -1, STATIC_ENT); + mt7530_fdb_write(priv, vid, port_mask, addr, -1, STATIC_ENT); ret = mt7530_fdb_cmd(priv, MT7530_FDB_WRITE, 0); mutex_unlock(&priv->reg_mutex);
return ret; }
-static void -mt7530_port_fdb_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_fdb *fdb, - struct switchdev_trans *trans) -{ - struct mt7530_priv *priv = ds->priv; - u8 port_mask = BIT(port); - - mutex_lock(&priv->reg_mutex); - mt7530_fdb_write(priv, fdb->vid, port_mask, fdb->addr, -1, STATIC_ENT); - mt7530_fdb_cmd(priv, MT7530_FDB_WRITE, 0); - mutex_unlock(&priv->reg_mutex); -} - static int mt7530_port_fdb_del(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_fdb *fdb) + const unsigned char *addr, u16 vid) { struct mt7530_priv *priv = ds->priv; int ret; u8 port_mask = BIT(port);
mutex_lock(&priv->reg_mutex); - mt7530_fdb_write(priv, fdb->vid, port_mask, fdb->addr, -1, STATIC_EMP); + mt7530_fdb_write(priv, vid, port_mask, addr, -1, STATIC_EMP); ret = mt7530_fdb_cmd(priv, MT7530_FDB_WRITE, 0); mutex_unlock(&priv->reg_mutex);
@@@ -834,7 -890,8 +872,7 @@@
static int mt7530_port_fdb_dump(struct dsa_switch *ds, int port, - struct switchdev_obj_port_fdb *fdb, - switchdev_obj_dump_cb_t *cb) + dsa_fdb_dump_cb_t *cb, void *data) { struct mt7530_priv *priv = ds->priv; struct mt7530_fdb _fdb = { 0 }; @@@ -852,8 -909,11 +890,8 @@@ if (rsp & ATC_SRCH_HIT) { mt7530_fdb_read(priv, &_fdb); if (_fdb.port_mask & BIT(port)) { - ether_addr_copy(fdb->addr, _fdb.mac); - fdb->vid = _fdb.vid; - fdb->ndm_state = _fdb.noarp ? - NUD_NOARP : NUD_REACHABLE; - ret = cb(&fdb->obj); + ret = cb(_fdb.mac, _fdb.vid, _fdb.noarp, + data); if (ret < 0) break; } @@@ -993,6 -1053,7 +1031,6 @@@ static struct dsa_switch_ops mt7530_swi .port_stp_state_set = mt7530_stp_state_set, .port_bridge_join = mt7530_port_bridge_join, .port_bridge_leave = mt7530_port_bridge_leave, - .port_fdb_prepare = mt7530_port_fdb_prepare, .port_fdb_add = mt7530_port_fdb_add, .port_fdb_del = mt7530_port_fdb_del, .port_fdb_dump = mt7530_port_fdb_dump, diff --combined drivers/net/ethernet/ibm/ibmvnic.c index 99576ba4187f,c45e8e3b82d3..32c116652755 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@@ -111,6 -111,7 +111,7 @@@ static void send_request_map(struct ibm static void send_request_unmap(struct ibmvnic_adapter *, u8); static void send_login(struct ibmvnic_adapter *adapter); static void send_cap_queries(struct ibmvnic_adapter *adapter); + static int init_sub_crqs(struct ibmvnic_adapter *); static int init_sub_crq_irqs(struct ibmvnic_adapter *adapter); static int ibmvnic_init(struct ibmvnic_adapter *); static void release_crq_queue(struct ibmvnic_adapter *); @@@ -346,31 -347,6 +347,31 @@@ static void replenish_pools(struct ibmv } }
+static void release_stats_buffers(struct ibmvnic_adapter *adapter) +{ + kfree(adapter->tx_stats_buffers); + kfree(adapter->rx_stats_buffers); +} + +static int init_stats_buffers(struct ibmvnic_adapter *adapter) +{ + adapter->tx_stats_buffers = + kcalloc(adapter->req_tx_queues, + sizeof(struct ibmvnic_tx_queue_stats), + GFP_KERNEL); + if (!adapter->tx_stats_buffers) + return -ENOMEM; + + adapter->rx_stats_buffers = + kcalloc(adapter->req_rx_queues, + sizeof(struct ibmvnic_rx_queue_stats), + GFP_KERNEL); + if (!adapter->rx_stats_buffers) + return -ENOMEM; + + return 0; +} + static void release_stats_token(struct ibmvnic_adapter *adapter) { struct device *dev = &adapter->vdev->dev; @@@ -676,6 -652,7 +677,7 @@@ static int ibmvnic_login(struct net_dev struct ibmvnic_adapter *adapter = netdev_priv(netdev); unsigned long timeout = msecs_to_jiffies(30000); struct device *dev = &adapter->vdev->dev; + int rc;
do { if (adapter->renegotiate) { @@@ -689,6 -666,18 +691,18 @@@ dev_err(dev, "Capabilities query timeout\n"); return -1; } + rc = init_sub_crqs(adapter); + if (rc) { + dev_err(dev, + "Initialization of SCRQ's failed\n"); + return -1; + } + rc = init_sub_crq_irqs(adapter); + if (rc) { + dev_err(dev, + "Initialization of SCRQ's irqs failed\n"); + return -1; + } }
reinit_completion(&adapter->init_done); @@@ -711,7 -700,6 +725,7 @@@ static void release_resources(struct ib release_rx_pools(adapter);
release_stats_token(adapter); + release_stats_buffers(adapter); release_error_buffers(adapter);
if (adapter->napi) { @@@ -789,10 -777,6 +803,10 @@@ static int init_resources(struct ibmvni if (rc) return rc;
+ rc = init_stats_buffers(adapter); + if (rc) + return rc; + rc = init_stats_token(adapter); if (rc) return rc; @@@ -1275,9 -1259,6 +1289,9 @@@ out netdev->stats.tx_packets += tx_packets; adapter->tx_send_failed += tx_send_failed; adapter->tx_map_failed += tx_map_failed; + adapter->tx_stats_buffers[queue_num].packets += tx_packets; + adapter->tx_stats_buffers[queue_num].bytes += tx_bytes; + adapter->tx_stats_buffers[queue_num].dropped_packets += tx_dropped;
return ret; } @@@ -1579,8 -1560,7 +1593,8 @@@ restart_poll rx_comp.correlator); /* do error checking */ if (next->rx_comp.rc) { - netdev_err(netdev, "rx error %x\n", next->rx_comp.rc); + netdev_dbg(netdev, "rx buffer returned with rc %x\n", + be16_to_cpu(next->rx_comp.rc)); /* free the entry */ next->rx_comp.first = 0; remove_buff_from_pool(adapter, rx_buff); @@@ -1619,8 -1599,6 +1633,8 @@@ napi_gro_receive(napi, skb); /* send it up */ netdev->stats.rx_packets++; netdev->stats.rx_bytes += length; + adapter->rx_stats_buffers[scrq_num].packets++; + adapter->rx_stats_buffers[scrq_num].bytes += length; frames_processed++; }
@@@ -1730,36 -1708,18 +1744,36 @@@ static u32 ibmvnic_get_link(struct net_ static void ibmvnic_get_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) { - ring->rx_max_pending = 0; - ring->tx_max_pending = 0; + struct ibmvnic_adapter *adapter = netdev_priv(netdev); + + ring->rx_max_pending = adapter->max_rx_add_entries_per_subcrq; + ring->tx_max_pending = adapter->max_tx_entries_per_subcrq; ring->rx_mini_max_pending = 0; ring->rx_jumbo_max_pending = 0; - ring->rx_pending = 0; - ring->tx_pending = 0; + ring->rx_pending = adapter->req_rx_add_entries_per_subcrq; + ring->tx_pending = adapter->req_tx_entries_per_subcrq; ring->rx_mini_pending = 0; ring->rx_jumbo_pending = 0; }
+static void ibmvnic_get_channels(struct net_device *netdev, + struct ethtool_channels *channels) +{ + struct ibmvnic_adapter *adapter = netdev_priv(netdev); + + channels->max_rx = adapter->max_rx_queues; + channels->max_tx = adapter->max_tx_queues; + channels->max_other = 0; + channels->max_combined = 0; + channels->rx_count = adapter->req_rx_queues; + channels->tx_count = adapter->req_tx_queues; + channels->other_count = 0; + channels->combined_count = 0; +} + static void ibmvnic_get_strings(struct net_device *dev, u32 stringset, u8 *data) { + struct ibmvnic_adapter *adapter = netdev_priv(dev); int i;
if (stringset != ETH_SS_STATS) @@@ -1767,39 -1727,13 +1781,39 @@@
for (i = 0; i < ARRAY_SIZE(ibmvnic_stats); i++, data += ETH_GSTRING_LEN) memcpy(data, ibmvnic_stats[i].name, ETH_GSTRING_LEN); + + for (i = 0; i < adapter->req_tx_queues; i++) { + snprintf(data, ETH_GSTRING_LEN, "tx%d_packets", i); + data += ETH_GSTRING_LEN; + + snprintf(data, ETH_GSTRING_LEN, "tx%d_bytes", i); + data += ETH_GSTRING_LEN; + + snprintf(data, ETH_GSTRING_LEN, "tx%d_dropped_packets", i); + data += ETH_GSTRING_LEN; + } + + for (i = 0; i < adapter->req_rx_queues; i++) { + snprintf(data, ETH_GSTRING_LEN, "rx%d_packets", i); + data += ETH_GSTRING_LEN; + + snprintf(data, ETH_GSTRING_LEN, "rx%d_bytes", i); + data += ETH_GSTRING_LEN; + + snprintf(data, ETH_GSTRING_LEN, "rx%d_interrupts", i); + data += ETH_GSTRING_LEN; + } }
static int ibmvnic_get_sset_count(struct net_device *dev, int sset) { + struct ibmvnic_adapter *adapter = netdev_priv(dev); + switch (sset) { case ETH_SS_STATS: - return ARRAY_SIZE(ibmvnic_stats); + return ARRAY_SIZE(ibmvnic_stats) + + adapter->req_tx_queues * NUM_TX_STATS + + adapter->req_rx_queues * NUM_RX_STATS; default: return -EOPNOTSUPP; } @@@ -1810,7 -1744,7 +1824,7 @@@ static void ibmvnic_get_ethtool_stats(s { struct ibmvnic_adapter *adapter = netdev_priv(dev); union ibmvnic_crq crq; - int i; + int i, j;
memset(&crq, 0, sizeof(crq)); crq.request_statistics.first = IBMVNIC_CRQ_CMD; @@@ -1825,26 -1759,7 +1839,26 @@@ wait_for_completion(&adapter->stats_done);
for (i = 0; i < ARRAY_SIZE(ibmvnic_stats); i++) - data[i] = IBMVNIC_GET_STAT(adapter, ibmvnic_stats[i].offset); + data[i] = be64_to_cpu(IBMVNIC_GET_STAT(adapter, + ibmvnic_stats[i].offset)); + + for (j = 0; j < adapter->req_tx_queues; j++) { + data[i] = adapter->tx_stats_buffers[j].packets; + i++; + data[i] = adapter->tx_stats_buffers[j].bytes; + i++; + data[i] = adapter->tx_stats_buffers[j].dropped_packets; + i++; + } + + for (j = 0; j < adapter->req_rx_queues; j++) { + data[i] = adapter->rx_stats_buffers[j].packets; + i++; + data[i] = adapter->rx_stats_buffers[j].bytes; + i++; + data[i] = adapter->rx_stats_buffers[j].interrupts; + i++; + } }
static const struct ethtool_ops ibmvnic_ethtool_ops = { @@@ -1853,7 -1768,6 +1867,7 @@@ .set_msglevel = ibmvnic_set_msglevel, .get_link = ibmvnic_get_link, .get_ringparam = ibmvnic_get_ringparam, + .get_channels = ibmvnic_get_channels, .get_strings = ibmvnic_get_strings, .get_sset_count = ibmvnic_get_sset_count, .get_ethtool_stats = ibmvnic_get_ethtool_stats, @@@ -2150,8 -2064,6 +2164,8 @@@ static irqreturn_t ibmvnic_interrupt_rx struct ibmvnic_sub_crq_queue *scrq = instance; struct ibmvnic_adapter *adapter = scrq->adapter;
+ adapter->rx_stats_buffers[scrq->scrq_num].interrupts++; + if (napi_schedule_prep(&adapter->napi[scrq->scrq_num])) { disable_scrq_irq(adapter, scrq); __napi_schedule(&adapter->napi[scrq->scrq_num]); @@@ -3106,7 -3018,6 +3120,6 @@@ static void handle_request_cap_rsp(unio *req_value, (long int)be64_to_cpu(crq->request_capability_rsp. number), name); - release_sub_crqs(adapter); *req_value = be64_to_cpu(crq->request_capability_rsp.number); ibmvnic_send_req_caps(adapter, 1); return; @@@ -3953,7 -3864,10 +3966,7 @@@ static int ibmvnic_resume(struct devic if (adapter->state != VNIC_OPEN) return 0;
- /* kick the interrupt handlers just in case we lost an interrupt */ - for (i = 0; i < adapter->req_rx_queues; i++) - ibmvnic_interrupt_rx(adapter->rx_scrq[i]->irq, - adapter->rx_scrq[i]); + tasklet_schedule(&adapter->tasklet);
return 0; } diff --combined drivers/net/ethernet/intel/i40e/i40e_txrx.c index d464fceb300f,2194960d5855..8a969d8f0790 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@@ -860,7 -860,7 +860,7 @@@ static bool i40e_clean_tx_irq(struct i4 netdev_tx_completed_queue(txring_txq(tx_ring), total_packets, total_bytes);
-#define TX_WAKE_THRESHOLD (DESC_NEEDED * 2) +#define TX_WAKE_THRESHOLD ((s16)(DESC_NEEDED * 2)) if (unlikely(total_packets && netif_carrier_ok(tx_ring->netdev) && (I40E_DESC_UNUSED(tx_ring) >= TX_WAKE_THRESHOLD))) { /* Make sure that anybody stopping the queue after this @@@ -1113,6 -1113,8 +1113,8 @@@ int i40e_setup_tx_descriptors(struct i4 if (!tx_ring->tx_bi) goto err;
+ u64_stats_init(&tx_ring->syncp); + /* round up to nearest 4K */ tx_ring->size = tx_ring->count * sizeof(struct i40e_tx_desc); /* add u32 for head writeback, align after this takes care of @@@ -2063,7 -2065,7 +2065,7 @@@ static int i40e_clean_rx_irq(struct i40 u16 cleaned_count = I40E_DESC_UNUSED(rx_ring); bool failure = false, xdp_xmit = false;
- while (likely(total_rx_packets < budget)) { + while (likely(total_rx_packets < (unsigned int)budget)) { struct i40e_rx_buffer *rx_buffer; union i40e_rx_desc *rx_desc; struct xdp_buff xdp; @@@ -2196,7 -2198,7 +2198,7 @@@ rx_ring->q_vector->rx.total_bytes += total_rx_bytes;
/* guarantee a trip back through this routine if there was a failure */ - return failure ? budget : total_rx_packets; + return failure ? budget : (int)total_rx_packets; }
static u32 i40e_buildreg_itr(const int type, const u16 itr) @@@ -2451,15 -2453,9 +2453,15 @@@ static void i40e_atr(struct i40e_ring * hlen = (hdr.network[0] & 0x0F) << 2; l4_proto = hdr.ipv4->protocol; } else { - hlen = hdr.network - skb->data; - l4_proto = ipv6_find_hdr(skb, &hlen, IPPROTO_TCP, NULL, NULL); - hlen -= hdr.network - skb->data; + /* find the start of the innermost ipv6 header */ + unsigned int inner_hlen = hdr.network - skb->data; + unsigned int h_offset = inner_hlen; + + /* this function updates h_offset to the end of the header */ + l4_proto = + ipv6_find_hdr(skb, &h_offset, IPPROTO_TCP, NULL, NULL); + /* hlen will contain our best estimate of the tcp header */ + hlen = h_offset - inner_hlen; }
if (l4_proto != IPPROTO_TCP) diff --combined drivers/net/ethernet/netronome/nfp/nfp_net_common.c index ea471604450e,4631ca8b8eb2..4a990033c4d5 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@@ -513,6 -513,7 +513,7 @@@ nfp_net_tx_ring_init(struct nfp_net_tx_ tx_ring->idx = idx; tx_ring->r_vec = r_vec; tx_ring->is_xdp = is_xdp; + u64_stats_init(&tx_ring->r_vec->tx_sync);
tx_ring->qcidx = tx_ring->idx * nn->stride_tx; tx_ring->qcp_q = nn->tx_bar + NFP_QCP_QUEUE_OFF(tx_ring->qcidx); @@@ -532,6 -533,7 +533,7 @@@ nfp_net_rx_ring_init(struct nfp_net_rx_
rx_ring->idx = idx; rx_ring->r_vec = r_vec; + u64_stats_init(&rx_ring->r_vec->rx_sync);
rx_ring->fl_qcidx = rx_ring->idx * nn->stride_rx; rx_ring->qcp_fl = nn->rx_bar + NFP_QCP_QUEUE_OFF(rx_ring->fl_qcidx); @@@ -2658,7 -2660,6 +2660,7 @@@ static int nfp_net_netdev_close(struct /* Step 2: Tell NFP */ nfp_net_clear_config_and_disable(nn); + nfp_port_configure(netdev, false);
/* Step 3: Free resources */ @@@ -2776,21 -2777,16 +2778,21 @@@ static int nfp_net_netdev_open(struct n goto err_free_all;
/* Step 2: Configure the NFP + * - Ifup the physical interface if it exists * - Enable rings from 0 to tx_rings/rx_rings - 1. * - Write MAC address (in case it changed) * - Set the MTU * - Set the Freelist buffer size * - Enable the FW */ - err = nfp_net_set_config_and_enable(nn); + err = nfp_port_configure(netdev, true); if (err) goto err_free_all;
+ err = nfp_net_set_config_and_enable(nn); + if (err) + goto err_port_disable; + /* Step 3: Enable for kernel * - put some freelist descriptors on each RX ring * - enable NAPI on each ring @@@ -2801,8 -2797,6 +2803,8 @@@
return 0;
+err_port_disable: + nfp_port_configure(netdev, false); err_free_all: nfp_net_close_free_all(nn); return err; diff --combined drivers/net/ethernet/qlogic/qed/qed_mcp.c index c1ecce6b9141,3eb241657368..376485d99357 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c @@@ -253,7 -253,7 +253,7 @@@ int qed_mcp_cmd_init(struct qed_hwfn *p size = MFW_DRV_MSG_MAX_DWORDS(p_info->mfw_mb_length) * sizeof(u32); p_info->mfw_mb_cur = kzalloc(size, GFP_KERNEL); p_info->mfw_mb_shadow = kzalloc(size, GFP_KERNEL); - if (!p_info->mfw_mb_shadow || !p_info->mfw_mb_addr) + if (!p_info->mfw_mb_cur || !p_info->mfw_mb_shadow) goto err;
return 0; @@@ -1097,31 -1097,6 +1097,31 @@@ static void qed_mcp_handle_transceiver_ DP_NOTICE(p_hwfn, "Transceiver is unplugged.\n"); }
+static void qed_mcp_read_eee_config(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_mcp_link_state *p_link) +{ + u32 eee_status, val; + + p_link->eee_adv_caps = 0; + p_link->eee_lp_adv_caps = 0; + eee_status = qed_rd(p_hwfn, + p_ptt, + p_hwfn->mcp_info->port_addr + + offsetof(struct public_port, eee_status)); + p_link->eee_active = !!(eee_status & EEE_ACTIVE_BIT); + val = (eee_status & EEE_LD_ADV_STATUS_MASK) >> EEE_LD_ADV_STATUS_OFFSET; + if (val & EEE_1G_ADV) + p_link->eee_adv_caps |= QED_EEE_1G_ADV; + if (val & EEE_10G_ADV) + p_link->eee_adv_caps |= QED_EEE_10G_ADV; + val = (eee_status & EEE_LP_ADV_STATUS_MASK) >> EEE_LP_ADV_STATUS_OFFSET; + if (val & EEE_1G_ADV) + p_link->eee_lp_adv_caps |= QED_EEE_1G_ADV; + if (val & EEE_10G_ADV) + p_link->eee_lp_adv_caps |= QED_EEE_10G_ADV; +} + static void qed_mcp_handle_link_change(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, bool b_reset) { @@@ -1253,9 -1228,6 +1253,9 @@@
p_link->sfp_tx_fault = !!(status & LINK_STATUS_SFP_TX_FAULT);
+ if (p_hwfn->mcp_info->capabilities & FW_MB_PARAM_FEATURE_SUPPORT_EEE) + qed_mcp_read_eee_config(p_hwfn, p_ptt, p_link); + qed_link_update(p_hwfn); out: spin_unlock_bh(&p_hwfn->mcp_info->link_lock); @@@ -1279,19 -1251,6 +1279,19 @@@ int qed_mcp_set_link(struct qed_hwfn *p phy_cfg.pause |= (params->pause.forced_tx) ? ETH_PAUSE_TX : 0; phy_cfg.adv_speed = params->speed.advertised_speeds; phy_cfg.loopback_mode = params->loopback_mode; + if (p_hwfn->mcp_info->capabilities & FW_MB_PARAM_FEATURE_SUPPORT_EEE) { + if (params->eee.enable) + phy_cfg.eee_cfg |= EEE_CFG_EEE_ENABLED; + if (params->eee.tx_lpi_enable) + phy_cfg.eee_cfg |= EEE_CFG_TX_LPI; + if (params->eee.adv_caps & QED_EEE_1G_ADV) + phy_cfg.eee_cfg |= EEE_CFG_ADV_SPEED_1G; + if (params->eee.adv_caps & QED_EEE_10G_ADV) + phy_cfg.eee_cfg |= EEE_CFG_ADV_SPEED_10G; + phy_cfg.eee_cfg |= (params->eee.tx_lpi_timer << + EEE_TX_TIMER_USEC_OFFSET) & + EEE_TX_TIMER_USEC_MASK; + }
p_hwfn->b_drv_link_init = b_up;
@@@ -2863,28 -2822,3 +2863,28 @@@ void qed_mcp_resc_lock_default_init(str p_unlock->resource = resource; } } + +int qed_mcp_get_capabilities(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + u32 mcp_resp; + int rc; + + rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_GET_MFW_FEATURE_SUPPORT, + 0, &mcp_resp, &p_hwfn->mcp_info->capabilities); + if (!rc) + DP_VERBOSE(p_hwfn, (QED_MSG_SP | NETIF_MSG_PROBE), + "MFW supported features: %08x\n", + p_hwfn->mcp_info->capabilities); + + return rc; +} + +int qed_mcp_set_capabilities(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + u32 mcp_resp, mcp_param, features; + + features = DRV_MB_PARAM_FEATURE_SUPPORT_PORT_EEE; + + return qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_FEATURE_SUPPORT, + features, &mcp_resp, &mcp_param); +} diff --combined drivers/net/hyperv/hyperv_net.h index d1ea99a12cf2,12cc64bfcff8..98b25f6900c8 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@@ -147,6 -147,7 +147,6 @@@ struct hv_netvsc_packet struct netvsc_device_info { unsigned char mac_adr[ETH_ALEN]; int ring_size; - u32 max_num_vrss_chns; u32 num_chn; };
@@@ -182,16 -183,13 +182,16 @@@ struct rndis_device /* Interface */ struct rndis_message; struct netvsc_device; -int netvsc_device_add(struct hv_device *device, - const struct netvsc_device_info *info); +struct net_device_context; + +struct netvsc_device *netvsc_device_add(struct hv_device *device, + const struct netvsc_device_info *info); +int netvsc_alloc_recv_comp_ring(struct netvsc_device *net_device, u32 q_idx); void netvsc_device_remove(struct hv_device *device); -int netvsc_send(struct hv_device *device, +int netvsc_send(struct net_device_context *ndc, struct hv_netvsc_packet *packet, struct rndis_message *rndis_msg, - struct hv_page_buffer **page_buffer, + struct hv_page_buffer *page_buffer, struct sk_buff *skb); void netvsc_linkstatus_callback(struct hv_device *device_obj, struct rndis_message *resp); @@@ -202,11 -200,10 +202,11 @@@ int netvsc_recv_callback(struct net_dev const struct ndis_pkt_8021q_info *vlan); void netvsc_channel_cb(void *context); int netvsc_poll(struct napi_struct *napi, int budget); +bool rndis_filter_opened(const struct netvsc_device *nvdev); int rndis_filter_open(struct netvsc_device *nvdev); int rndis_filter_close(struct netvsc_device *nvdev); -int rndis_filter_device_add(struct hv_device *dev, - struct netvsc_device_info *info); +struct netvsc_device *rndis_filter_device_add(struct hv_device *dev, + struct netvsc_device_info *info); void rndis_filter_update(struct netvsc_device *nvdev); void rndis_filter_device_remove(struct hv_device *dev, struct netvsc_device *nvdev); @@@ -218,8 -215,7 +218,8 @@@ int rndis_filter_receive(struct net_dev struct vmbus_channel *channel, void *data, u32 buflen);
-int rndis_filter_set_device_mac(struct net_device *ndev, char *mac); +int rndis_filter_set_device_mac(struct netvsc_device *ndev, + const char *mac);
void netvsc_switch_datapath(struct net_device *nv_dev, bool vf);
@@@ -658,10 -654,13 +658,10 @@@ struct recv_comp_data u32 status; };
-/* Netvsc Receive Slots Max */ -#define NETVSC_RECVSLOT_MAX (NETVSC_RECEIVE_BUFFER_SIZE / ETH_DATA_LEN + 1) - struct multi_recv_comp { - void *buf; /* queued receive completions */ - u32 first; /* first data entry */ - u32 next; /* next entry for writing */ + struct recv_comp_data *slots; + u32 first; /* first data entry */ + u32 next; /* next entry for writing */ };
struct netvsc_stats { @@@ -680,15 -679,6 +680,15 @@@ struct netvsc_ethtool_stats unsigned long tx_busy; };
+struct netvsc_vf_pcpu_stats { + u64 rx_packets; + u64 rx_bytes; + u64 tx_packets; + u64 tx_bytes; + struct u64_stats_sync syncp; + u32 tx_dropped; +}; + struct netvsc_reconfig { struct list_head list; u32 event; @@@ -722,19 -712,18 +722,19 @@@ struct net_device_context
/* State to manage the associated VF interface. */ struct net_device __rcu *vf_netdev; + struct netvsc_vf_pcpu_stats __percpu *vf_stats; + struct work_struct vf_takeover;
/* 1: allocated, serial number is valid. 0: not allocated */ u32 vf_alloc; /* Serial number of the VF to team with */ u32 vf_serial; - - bool datapath; /* 0 - synthetic, 1 - VF nic */ };
/* Per channel data */ struct netvsc_channel { struct vmbus_channel *channel; + struct netvsc_device *net_device; const struct vmpacket_descriptor *desc; struct napi_struct napi; struct multi_send_data msd; @@@ -757,7 -746,7 +757,7 @@@ struct netvsc_device u32 recv_buf_size; u32 recv_buf_gpadl_handle; u32 recv_section_cnt; - struct nvsp_1_receive_buffer_section *recv_section; + u32 recv_completion_cnt;
/* Send buffer allocated by us */ void *send_buf; @@@ -776,7 -765,8 +776,8 @@@ u32 max_chn; u32 num_chn;
- refcount_t sc_offered; + atomic_t open_chn; + wait_queue_head_t subchan_open;
struct rndis_device *extension;
@@@ -785,6 -775,8 +786,6 @@@ u32 max_pkt; /* max number of pkt in one send, e.g. 8 */ u32 pkt_align; /* alignment bytes, e.g. 8 */
- atomic_t num_outstanding_recvs; - atomic_t open_cnt;
struct netvsc_channel chan_table[VRSS_CHANNEL_MAX]; @@@ -792,6 -784,18 +793,6 @@@ struct rcu_head rcu; };
-static inline struct netvsc_device * -net_device_to_netvsc_device(struct net_device *ndev) -{ - return ((struct net_device_context *)netdev_priv(ndev))->nvdev; -} - -static inline struct netvsc_device * -hv_device_to_netvsc_device(struct hv_device *device) -{ - return net_device_to_netvsc_device(hv_get_drvdata(device)); -} - /* NdisInitialize message */ struct rndis_initialize_request { u32 req_id; diff --combined drivers/net/hyperv/netvsc.c index 208f03aa83de,d18c3326a1f7..bffaf93d3cb0 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@@ -29,9 -29,6 +29,9 @@@ #include <linux/netdevice.h> #include <linux/if_ether.h> #include <linux/vmalloc.h> +#include <linux/rtnetlink.h> +#include <linux/prefetch.h> + #include <asm/sync_bitops.h>
#include "hyperv_net.h" @@@ -44,7 -41,7 +44,7 @@@ void netvsc_switch_datapath(struct net_ { struct net_device_context *net_device_ctx = netdev_priv(ndev); struct hv_device *dev = net_device_ctx->device_ctx; - struct netvsc_device *nv_dev = net_device_ctx->nvdev; + struct netvsc_device *nv_dev = rtnl_dereference(net_device_ctx->nvdev); struct nvsp_message *init_pkt = &nv_dev->channel_init_pkt;
memset(init_pkt, 0, sizeof(struct nvsp_message)); @@@ -60,6 -57,8 +60,6 @@@ sizeof(struct nvsp_message), (unsigned long)init_pkt, VM_PKT_DATA_INBAND, 0); - - net_device_ctx->datapath = vf; }
static struct netvsc_device *alloc_net_device(void) @@@ -70,12 -69,16 +70,13 @@@ if (!net_device) return NULL;
- net_device->chan_table[0].mrc.buf - = vzalloc(NETVSC_RECVSLOT_MAX * sizeof(struct recv_comp_data)); - init_waitqueue_head(&net_device->wait_drain); net_device->destroy = false; atomic_set(&net_device->open_cnt, 0); net_device->max_pkt = RNDIS_MAX_PKT_DEFAULT; net_device->pkt_align = RNDIS_PKT_ALIGN_DEFAULT; init_completion(&net_device->channel_init_wait); + init_waitqueue_head(&net_device->subchan_open);
return net_device; } @@@ -87,7 -90,7 +88,7 @@@ static void free_netvsc_device(struct r int i;
for (i = 0; i < VRSS_CHANNEL_MAX; i++) - vfree(nvdev->chan_table[i].mrc.buf); + vfree(nvdev->chan_table[i].mrc.slots);
kfree(nvdev); } @@@ -101,8 -104,7 +102,8 @@@ static void netvsc_destroy_buf(struct h { struct nvsp_message *revoke_packet; struct net_device *ndev = hv_get_drvdata(device); - struct netvsc_device *net_device = net_device_to_netvsc_device(ndev); + struct net_device_context *ndc = netdev_priv(ndev); + struct netvsc_device *net_device = rtnl_dereference(ndc->nvdev); int ret;
/* @@@ -166,6 -168,12 +167,6 @@@ net_device->recv_buf = NULL; }
- if (net_device->recv_section) { - net_device->recv_section_cnt = 0; - kfree(net_device->recv_section); - net_device->recv_section = NULL; - } - /* Deal with the send buffer we may have setup. * If we got a send section size, it means we received a * NVSP_MSG1_TYPE_SEND_SEND_BUF_COMPLETE msg (ie sent @@@ -228,26 -236,11 +229,26 @@@ kfree(net_device->send_section_map); }
+int netvsc_alloc_recv_comp_ring(struct netvsc_device *net_device, u32 q_idx) +{ + struct netvsc_channel *nvchan = &net_device->chan_table[q_idx]; + int node = cpu_to_node(nvchan->channel->target_cpu); + size_t size; + + size = net_device->recv_completion_cnt * sizeof(struct recv_comp_data); + nvchan->mrc.slots = vzalloc_node(size, node); + if (!nvchan->mrc.slots) + nvchan->mrc.slots = vzalloc(size); + + return nvchan->mrc.slots ? 0 : -ENOMEM; +} + static int netvsc_init_buf(struct hv_device *device, struct netvsc_device *net_device) { int ret = 0; struct nvsp_message *init_packet; + struct nvsp_1_message_send_receive_buffer_complete *resp; struct net_device *ndev; size_t map_words; int node; @@@ -304,41 -297,43 +305,41 @@@ wait_for_completion(&net_device->channel_init_wait);
/* Check the response */ - if (init_packet->msg.v1_msg. - send_recv_buf_complete.status != NVSP_STAT_SUCCESS) { - netdev_err(ndev, "Unable to complete receive buffer " - "initialization with NetVsp - status %d\n", - init_packet->msg.v1_msg. - send_recv_buf_complete.status); + resp = &init_packet->msg.v1_msg.send_recv_buf_complete; + if (resp->status != NVSP_STAT_SUCCESS) { + netdev_err(ndev, + "Unable to complete receive buffer initialization with NetVsp - status %d\n", + resp->status); ret = -EINVAL; goto cleanup; }
/* Parse the response */ + netdev_dbg(ndev, "Receive sections: %u sub_allocs: size %u count: %u\n", + resp->num_sections, resp->sections[0].sub_alloc_size, + resp->sections[0].num_sub_allocs);
- net_device->recv_section_cnt = init_packet->msg. - v1_msg.send_recv_buf_complete.num_sections; - - net_device->recv_section = kmemdup( - init_packet->msg.v1_msg.send_recv_buf_complete.sections, - net_device->recv_section_cnt * - sizeof(struct nvsp_1_receive_buffer_section), - GFP_KERNEL); - if (net_device->recv_section == NULL) { - ret = -EINVAL; - goto cleanup; - } + net_device->recv_section_cnt = resp->num_sections;
/* * For 1st release, there should only be 1 section that represents the * entire receive buffer */ if (net_device->recv_section_cnt != 1 || - net_device->recv_section->offset != 0) { + resp->sections[0].offset != 0) { ret = -EINVAL; goto cleanup; }
- /* Now setup the send buffer. - */ + /* Setup receive completion ring */ + net_device->recv_completion_cnt + = round_up(resp->sections[0].num_sub_allocs + 1, + PAGE_SIZE / sizeof(u64)); + ret = netvsc_alloc_recv_comp_ring(net_device, 0); + if (ret) + goto cleanup; + + /* Now setup the send buffer. */ net_device->send_buf = vzalloc_node(net_device->send_buf_size, node); if (!net_device->send_buf) net_device->send_buf = vzalloc(net_device->send_buf_size); @@@ -555,8 -550,7 +556,8 @@@ void netvsc_device_remove(struct hv_dev { struct net_device *ndev = hv_get_drvdata(device); struct net_device_context *net_device_ctx = netdev_priv(ndev); - struct netvsc_device *net_device = net_device_ctx->nvdev; + struct netvsc_device *net_device + = rtnl_dereference(net_device_ctx->nvdev); int i;
netvsc_disconnect_vsp(device); @@@ -699,7 -693,7 +700,7 @@@ static u32 netvsc_copy_to_send_buf(stru u32 pend_size, struct hv_netvsc_packet *packet, struct rndis_message *rndis_msg, - struct hv_page_buffer **pb, + struct hv_page_buffer *pb, struct sk_buff *skb) { char *start = net_device->send_buf; @@@ -720,9 -714,9 +721,9 @@@ }
for (i = 0; i < page_count; i++) { - char *src = phys_to_virt((*pb)[i].pfn << PAGE_SHIFT); - u32 offset = (*pb)[i].offset; - u32 len = (*pb)[i].len; + char *src = phys_to_virt(pb[i].pfn << PAGE_SHIFT); + u32 offset = pb[i].offset; + u32 len = pb[i].len;
memcpy(dest, (src + offset), len); msg_size += len; @@@ -741,32 -735,36 +742,32 @@@ static inline int netvsc_send_pkt struct hv_device *device, struct hv_netvsc_packet *packet, struct netvsc_device *net_device, - struct hv_page_buffer **pb, + struct hv_page_buffer *pb, struct sk_buff *skb) { struct nvsp_message nvmsg; - struct netvsc_channel *nvchan - = &net_device->chan_table[packet->q_idx]; + struct nvsp_1_message_send_rndis_packet * const rpkt = + &nvmsg.msg.v1_msg.send_rndis_pkt; + struct netvsc_channel * const nvchan = + &net_device->chan_table[packet->q_idx]; struct vmbus_channel *out_channel = nvchan->channel; struct net_device *ndev = hv_get_drvdata(device); struct netdev_queue *txq = netdev_get_tx_queue(ndev, packet->q_idx); u64 req_id; int ret; - struct hv_page_buffer *pgbuf; u32 ring_avail = hv_ringbuf_avail_percent(&out_channel->outbound);
nvmsg.hdr.msg_type = NVSP_MSG1_TYPE_SEND_RNDIS_PKT; - if (skb != NULL) { - /* 0 is RMC_DATA; */ - nvmsg.msg.v1_msg.send_rndis_pkt.channel_type = 0; - } else { - /* 1 is RMC_CONTROL; */ - nvmsg.msg.v1_msg.send_rndis_pkt.channel_type = 1; - } + if (skb) + rpkt->channel_type = 0; /* 0 is RMC_DATA */ + else + rpkt->channel_type = 1; /* 1 is RMC_CONTROL */
- nvmsg.msg.v1_msg.send_rndis_pkt.send_buf_section_index = - packet->send_buf_index; + rpkt->send_buf_section_index = packet->send_buf_index; if (packet->send_buf_index == NETVSC_INVALID_INDEX) - nvmsg.msg.v1_msg.send_rndis_pkt.send_buf_section_size = 0; + rpkt->send_buf_section_size = 0; else - nvmsg.msg.v1_msg.send_rndis_pkt.send_buf_section_size = - packet->total_data_buflen; + rpkt->send_buf_section_size = packet->total_data_buflen;
req_id = (ulong)skb;
@@@ -774,11 -772,11 +775,11 @@@ return -ENODEV;
if (packet->page_buf_cnt) { - pgbuf = packet->cp_partial ? (*pb) + - packet->rmsg_pgcnt : (*pb); + if (packet->cp_partial) + pb += packet->rmsg_pgcnt; + ret = vmbus_sendpacket_pagebuffer_ctl(out_channel, - pgbuf, - packet->page_buf_cnt, + pb, packet->page_buf_cnt, &nvmsg, sizeof(struct nvsp_message), req_id, @@@ -803,10 -801,8 +804,10 @@@ ret = -ENOSPC; } } else { - netdev_err(ndev, "Unable to send packet %p ret %d\n", - packet, ret); + netdev_err(ndev, + "Unable to send packet pages %u len %u, ret %d\n", + packet->page_buf_cnt, packet->total_data_buflen, + ret); }
return ret; @@@ -824,16 -820,13 +825,16 @@@ static inline void move_pkt_msd(struct msdp->count = 0; }
-int netvsc_send(struct hv_device *device, +/* RCU already held by caller */ +int netvsc_send(struct net_device_context *ndev_ctx, struct hv_netvsc_packet *packet, struct rndis_message *rndis_msg, - struct hv_page_buffer **pb, + struct hv_page_buffer *pb, struct sk_buff *skb) { - struct netvsc_device *net_device = hv_device_to_netvsc_device(device); + struct netvsc_device *net_device + = rcu_dereference_bh(ndev_ctx->nvdev); + struct hv_device *device = ndev_ctx->device_ctx; int ret = 0; struct netvsc_channel *nvchan; u32 pktlen = packet->total_data_buflen, msd_len = 0; @@@ -845,7 -838,7 +846,7 @@@ bool xmit_more = (skb != NULL) ? skb->xmit_more : false;
/* If device is rescinded, return error and packet will get dropped. */ - if (unlikely(net_device->destroy)) + if (unlikely(!net_device || net_device->destroy)) return -ENODEV;
/* We may race with netvsc_connect_vsp()/netvsc_init_buf() and get @@@ -950,94 -943,130 +951,94 @@@ send_now return ret; }
-static int netvsc_send_recv_completion(struct vmbus_channel *channel, - u64 transaction_id, u32 status) +/* Send pending recv completions */ +static int send_recv_completions(struct netvsc_channel *nvchan) { - struct nvsp_message recvcompMessage; + struct netvsc_device *nvdev = nvchan->net_device; + struct multi_recv_comp *mrc = &nvchan->mrc; + struct recv_comp_msg { + struct nvsp_message_header hdr; + u32 status; + } __packed; + struct recv_comp_msg msg = { + .hdr.msg_type = NVSP_MSG1_TYPE_SEND_RNDIS_PKT_COMPLETE, + }; int ret;
- recvcompMessage.hdr.msg_type = - NVSP_MSG1_TYPE_SEND_RNDIS_PKT_COMPLETE; - - recvcompMessage.msg.v1_msg.send_rndis_pkt_complete.status = status; - - /* Send the completion */ - ret = vmbus_sendpacket(channel, &recvcompMessage, - sizeof(struct nvsp_message_header) + sizeof(u32), - transaction_id, VM_PKT_COMP, 0); + while (mrc->first != mrc->next) { + const struct recv_comp_data *rcd + = mrc->slots + mrc->first;
- return ret; -} + msg.status = rcd->status; + ret = vmbus_sendpacket(nvchan->channel, &msg, sizeof(msg), + rcd->tid, VM_PKT_COMP, 0); + if (unlikely(ret)) + return ret;
-static inline void count_recv_comp_slot(struct netvsc_device *nvdev, u16 q_idx, - u32 *filled, u32 *avail) -{ - struct multi_recv_comp *mrc = &nvdev->chan_table[q_idx].mrc; - u32 first = mrc->first; - u32 next = mrc->next; - - *filled = (first > next) ? NETVSC_RECVSLOT_MAX - first + next : - next - first; - - *avail = NETVSC_RECVSLOT_MAX - *filled - 1; -} - -/* Read the first filled slot, no change to index */ -static inline struct recv_comp_data *read_recv_comp_slot(struct netvsc_device - *nvdev, u16 q_idx) -{ - struct multi_recv_comp *mrc = &nvdev->chan_table[q_idx].mrc; - u32 filled, avail; - - if (unlikely(!mrc->buf)) - return NULL; + if (++mrc->first == nvdev->recv_completion_cnt) + mrc->first = 0; + }
- count_recv_comp_slot(nvdev, q_idx, &filled, &avail); - if (!filled) - return NULL; + /* receive completion ring has been emptied */ + if (unlikely(nvdev->destroy)) + wake_up(&nvdev->wait_drain);
- return mrc->buf + mrc->first * sizeof(struct recv_comp_data); + return 0; }
-/* Put the first filled slot back to available pool */ -static inline void put_recv_comp_slot(struct netvsc_device *nvdev, u16 q_idx) +/* Count how many receive completions are outstanding */ +static void recv_comp_slot_avail(const struct netvsc_device *nvdev, + const struct multi_recv_comp *mrc, + u32 *filled, u32 *avail) { - struct multi_recv_comp *mrc = &nvdev->chan_table[q_idx].mrc; - int num_recv; + u32 count = nvdev->recv_completion_cnt;
- mrc->first = (mrc->first + 1) % NETVSC_RECVSLOT_MAX; - - num_recv = atomic_dec_return(&nvdev->num_outstanding_recvs); + if (mrc->next >= mrc->first) + *filled = mrc->next - mrc->first; + else + *filled = (count - mrc->first) + mrc->next;
- if (nvdev->destroy && num_recv == 0) - wake_up(&nvdev->wait_drain); + *avail = count - *filled - 1; }
-/* Check and send pending recv completions */ -static void netvsc_chk_recv_comp(struct netvsc_device *nvdev, - struct vmbus_channel *channel, u16 q_idx) +/* Add receive complete to ring to send to host. */ +static void enq_receive_complete(struct net_device *ndev, + struct netvsc_device *nvdev, u16 q_idx, + u64 tid, u32 status) { + struct netvsc_channel *nvchan = &nvdev->chan_table[q_idx]; + struct multi_recv_comp *mrc = &nvchan->mrc; struct recv_comp_data *rcd; - int ret; - - while (true) { - rcd = read_recv_comp_slot(nvdev, q_idx); - if (!rcd) - break; + u32 filled, avail;
- ret = netvsc_send_recv_completion(channel, rcd->tid, - rcd->status); - if (ret) - break; + recv_comp_slot_avail(nvdev, mrc, &filled, &avail);
- put_recv_comp_slot(nvdev, q_idx); + if (unlikely(filled > NAPI_POLL_WEIGHT)) { + send_recv_completions(nvchan); + recv_comp_slot_avail(nvdev, mrc, &filled, &avail); } -} - -#define NETVSC_RCD_WATERMARK 80 - -/* Get next available slot */ -static inline struct recv_comp_data *get_recv_comp_slot( - struct netvsc_device *nvdev, struct vmbus_channel *channel, u16 q_idx) -{ - struct multi_recv_comp *mrc = &nvdev->chan_table[q_idx].mrc; - u32 filled, avail, next; - struct recv_comp_data *rcd; - - if (unlikely(!nvdev->recv_section)) - return NULL; - - if (unlikely(!mrc->buf)) - return NULL; - - if (atomic_read(&nvdev->num_outstanding_recvs) > - nvdev->recv_section->num_sub_allocs * NETVSC_RCD_WATERMARK / 100) - netvsc_chk_recv_comp(nvdev, channel, q_idx);
- count_recv_comp_slot(nvdev, q_idx, &filled, &avail); - if (!avail) - return NULL; - - next = mrc->next; - rcd = mrc->buf + next * sizeof(struct recv_comp_data); - mrc->next = (next + 1) % NETVSC_RECVSLOT_MAX; + if (unlikely(!avail)) { + netdev_err(ndev, "Recv_comp full buf q:%hd, tid:%llx\n", + q_idx, tid); + return; + }
- atomic_inc(&nvdev->num_outstanding_recvs); + rcd = mrc->slots + mrc->next; + rcd->tid = tid; + rcd->status = status;
- return rcd; + if (++mrc->next == nvdev->recv_completion_cnt) + mrc->next = 0; }
static int netvsc_receive(struct net_device *ndev, - struct netvsc_device *net_device, - struct net_device_context *net_device_ctx, - struct hv_device *device, - struct vmbus_channel *channel, - const struct vmpacket_descriptor *desc, - struct nvsp_message *nvsp) + struct netvsc_device *net_device, + struct net_device_context *net_device_ctx, + struct hv_device *device, + struct vmbus_channel *channel, + const struct vmpacket_descriptor *desc, + struct nvsp_message *nvsp) { const struct vmtransfer_page_packet_header *vmxferpage_packet = container_of(desc, const struct vmtransfer_page_packet_header, d); @@@ -1046,6 -1075,7 +1047,6 @@@ u32 status = NVSP_STAT_SUCCESS; int i; int count = 0; - int ret;
/* Make sure this is a valid nvsp packet */ if (unlikely(nvsp->hdr.msg_type != NVSP_MSG1_TYPE_SEND_RNDIS_PKT)) { @@@ -1076,9 -1106,25 +1077,9 @@@ channel, data, buflen); }
- if (net_device->chan_table[q_idx].mrc.buf) { - struct recv_comp_data *rcd; + enq_receive_complete(ndev, net_device, q_idx, + vmxferpage_packet->d.trans_id, status);
- rcd = get_recv_comp_slot(net_device, channel, q_idx); - if (rcd) { - rcd->tid = vmxferpage_packet->d.trans_id; - rcd->status = status; - } else { - netdev_err(ndev, "Recv_comp full buf q:%hd, tid:%llx\n", - q_idx, vmxferpage_packet->d.trans_id); - } - } else { - ret = netvsc_send_recv_completion(channel, - vmxferpage_packet->d.trans_id, - status); - if (ret) - netdev_err(ndev, "Recv_comp q:%hd, tid:%llx, err:%d\n", - q_idx, vmxferpage_packet->d.trans_id, ret); - } return count; }
@@@ -1174,10 -1220,11 +1175,10 @@@ int netvsc_poll(struct napi_struct *nap { struct netvsc_channel *nvchan = container_of(napi, struct netvsc_channel, napi); + struct netvsc_device *net_device = nvchan->net_device; struct vmbus_channel *channel = nvchan->channel; struct hv_device *device = netvsc_channel_to_device(channel); - u16 q_idx = channel->offermsg.offer.sub_channel_index; struct net_device *ndev = hv_get_drvdata(device); - struct netvsc_device *net_device = net_device_to_netvsc_device(ndev); int work_done = 0;
/* If starting a new interval */ @@@ -1190,23 -1237,17 +1191,23 @@@ nvchan->desc = hv_pkt_iter_next(channel, nvchan->desc); }
- /* If receive ring was exhausted - * and not doing busy poll + /* if ring is empty, signal host */ + if (!nvchan->desc) + hv_pkt_iter_close(channel); + + /* If send of pending receive completions suceeded + * and did not exhaust NAPI budget this time + * and not doing busy poll * then re-enable host interrupts - * and reschedule if ring is not empty. + * and reschedule if ring is not empty. */ - if (work_done < budget && + if (send_recv_completions(nvchan) == 0 && + work_done < budget && napi_complete_done(napi, work_done) && - hv_end_read(&channel->inbound) != 0) + hv_end_read(&channel->inbound)) { + hv_begin_read(&channel->inbound); napi_reschedule(napi); - - netvsc_chk_recv_comp(net_device, channel, q_idx); + }
/* Driver may overshoot since multiple packets per descriptor */ return min(work_done, budget); @@@ -1218,15 -1259,10 +1219,15 @@@ void netvsc_channel_cb(void *context) { struct netvsc_channel *nvchan = context; + struct vmbus_channel *channel = nvchan->channel; + struct hv_ring_buffer_info *rbi = &channel->inbound; + + /* preload first vmpacket descriptor */ + prefetch(hv_get_ring_buffer(rbi) + rbi->priv_read_index);
if (napi_schedule_prep(&nvchan->napi)) { /* disable interupts from host */ - hv_begin_read(&nvchan->channel->inbound); + hv_begin_read(rbi);
__napi_schedule(&nvchan->napi); } @@@ -1236,8 -1272,8 +1237,8 @@@ * netvsc_device_add - Callback when the device belonging to this * driver is added */ -int netvsc_device_add(struct hv_device *device, - const struct netvsc_device_info *device_info) +struct netvsc_device *netvsc_device_add(struct hv_device *device, + const struct netvsc_device_info *device_info) { int i, ret = 0; int ring_size = device_info->ring_size; @@@ -1247,7 -1283,7 +1248,7 @@@
net_device = alloc_net_device(); if (!net_device) - return -ENOMEM; + return ERR_PTR(-ENOMEM);
net_device->ring_size = ring_size;
@@@ -1267,7 -1303,8 +1268,9 @@@ struct netvsc_channel *nvchan = &net_device->chan_table[i];
nvchan->channel = device->channel; + nvchan->net_device = net_device; + u64_stats_init(&nvchan->tx_stats.syncp); + u64_stats_init(&nvchan->rx_stats.syncp); }
/* Enable NAPI handler before init callbacks */ @@@ -1304,11 -1341,10 +1307,11 @@@ goto close; }
- return ret; + return net_device;
close: - netif_napi_del(&net_device->chan_table[0].napi); + RCU_INIT_POINTER(net_device_ctx->nvdev, NULL); + napi_disable(&net_device->chan_table[0].napi);
/* Now, we can close the channel safely */ vmbus_close(device->channel); @@@ -1316,5 -1352,6 +1319,5 @@@ cleanup: free_netvsc_device(&net_device->rcu);
- return ret; - + return ERR_PTR(ret); } diff --combined drivers/net/hyperv/rndis_filter.c index 44165fe328a4,d6308ffda53e..36e9ee82ec6f --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@@ -28,7 -28,6 +28,7 @@@ #include <linux/if_vlan.h> #include <linux/nls.h> #include <linux/vmalloc.h> +#include <linux/rtnetlink.h>
#include "hyperv_net.h"
@@@ -214,11 -213,11 +214,11 @@@ static void dump_rndis_message(struct h static int rndis_filter_send_request(struct rndis_device *dev, struct rndis_request *req) { - int ret; struct hv_netvsc_packet *packet; struct hv_page_buffer page_buf[2]; struct hv_page_buffer *pb = page_buf; struct net_device_context *net_device_ctx = netdev_priv(dev->ndev); + int ret;
/* Setup the packet to send it */ packet = &req->pkt; @@@ -244,10 -243,7 +244,10 @@@ pb[0].len; }
- ret = netvsc_send(net_device_ctx->device_ctx, packet, NULL, &pb, NULL); + rcu_read_lock_bh(); + ret = netvsc_send(net_device_ctx, packet, NULL, pb, NULL); + rcu_read_unlock_bh(); + return ret; }
@@@ -447,9 -443,8 +447,9 @@@ int rndis_filter_receive(struct net_dev return 0; }
-static int rndis_filter_query_device(struct rndis_device *dev, u32 oid, - void *result, u32 *result_size) +static int rndis_filter_query_device(struct rndis_device *dev, + struct netvsc_device *nvdev, + u32 oid, void *result, u32 *result_size) { struct rndis_request *request; u32 inresult_size = *result_size; @@@ -476,6 -471,8 +476,6 @@@ query->dev_vc_handle = 0;
if (oid == OID_TCP_OFFLOAD_HARDWARE_CAPABILITIES) { - struct net_device_context *ndevctx = netdev_priv(dev->ndev); - struct netvsc_device *nvdev = ndevctx->nvdev; struct ndis_offload *hwcaps; u32 nvsp_version = nvdev->nvsp_version; u8 ndis_rev; @@@ -544,15 -541,14 +544,15 @@@ cleanup
/* Get the hardware offload capabilities */ static int -rndis_query_hwcaps(struct rndis_device *dev, struct ndis_offload *caps) +rndis_query_hwcaps(struct rndis_device *dev, struct netvsc_device *net_device, + struct ndis_offload *caps) { u32 caps_len = sizeof(*caps); int ret;
memset(caps, 0, sizeof(*caps));
- ret = rndis_filter_query_device(dev, + ret = rndis_filter_query_device(dev, net_device, OID_TCP_OFFLOAD_HARDWARE_CAPABILITIES, caps, &caps_len); if (ret) @@@ -581,12 -577,11 +581,12 @@@ return 0; }
-static int rndis_filter_query_device_mac(struct rndis_device *dev) +static int rndis_filter_query_device_mac(struct rndis_device *dev, + struct netvsc_device *net_device) { u32 size = ETH_ALEN;
- return rndis_filter_query_device(dev, + return rndis_filter_query_device(dev, net_device, RNDIS_OID_802_3_PERMANENT_ADDRESS, dev->hw_mac_adr, &size); } @@@ -594,9 -589,9 +594,9 @@@ #define NWADR_STR "NetworkAddress" #define NWADR_STRLEN 14
-int rndis_filter_set_device_mac(struct net_device *ndev, char *mac) +int rndis_filter_set_device_mac(struct netvsc_device *nvdev, + const char *mac) { - struct netvsc_device *nvdev = net_device_to_netvsc_device(ndev); struct rndis_device *rdev = nvdev->extension; struct rndis_request *request; struct rndis_set_request *set; @@@ -650,8 -645,11 +650,8 @@@ wait_for_completion(&request->wait_event);
set_complete = &request->response_msg.msg.set_complete; - if (set_complete->status != RNDIS_STATUS_SUCCESS) { - netdev_err(ndev, "Fail to set MAC on host side:0x%x\n", - set_complete->status); - ret = -EINVAL; - } + if (set_complete->status != RNDIS_STATUS_SUCCESS) + ret = -EIO;
cleanup: put_rndis_request(rdev, request); @@@ -660,9 -658,9 +660,9 @@@
static int rndis_filter_set_offload_params(struct net_device *ndev, + struct netvsc_device *nvdev, struct ndis_offload_params *req_offloads) { - struct netvsc_device *nvdev = net_device_to_netvsc_device(ndev); struct rndis_device *rdev = nvdev->extension; struct rndis_request *request; struct rndis_set_request *set; @@@ -784,27 -782,27 +784,27 @@@ cleanup return ret; }
-static int rndis_filter_query_device_link_status(struct rndis_device *dev) +static int rndis_filter_query_device_link_status(struct rndis_device *dev, + struct netvsc_device *net_device) { u32 size = sizeof(u32); u32 link_status; - int ret; - - ret = rndis_filter_query_device(dev, - RNDIS_OID_GEN_MEDIA_CONNECT_STATUS, - &link_status, &size);
- return ret; + return rndis_filter_query_device(dev, net_device, + RNDIS_OID_GEN_MEDIA_CONNECT_STATUS, + &link_status, &size); }
-static int rndis_filter_query_link_speed(struct rndis_device *dev) +static int rndis_filter_query_link_speed(struct rndis_device *dev, + struct netvsc_device *net_device) { u32 size = sizeof(u32); u32 link_speed; struct net_device_context *ndc; int ret;
- ret = rndis_filter_query_device(dev, RNDIS_OID_GEN_LINK_SPEED, + ret = rndis_filter_query_device(dev, net_device, + RNDIS_OID_GEN_LINK_SPEED, &link_speed, &size);
if (!ret) { @@@ -873,14 -871,14 +873,14 @@@ void rndis_filter_update(struct netvsc_ schedule_work(&rdev->mcast_work); }
-static int rndis_filter_init_device(struct rndis_device *dev) +static int rndis_filter_init_device(struct rndis_device *dev, + struct netvsc_device *nvdev) { struct rndis_request *request; struct rndis_initialize_request *init; struct rndis_initialize_complete *init_complete; u32 status; int ret; - struct netvsc_device *nvdev = net_device_to_netvsc_device(dev->ndev);
request = get_rndis_request(dev, RNDIS_MSG_INIT, RNDIS_MESSAGE_SIZE(struct rndis_initialize_request)); @@@ -928,12 -926,12 +928,12 @@@ static bool netvsc_device_idle(const st { int i;
- if (atomic_read(&nvdev->num_outstanding_recvs) > 0) - return false; - for (i = 0; i < nvdev->num_chn; i++) { const struct netvsc_channel *nvchan = &nvdev->chan_table[i];
+ if (nvchan->mrc.first != nvchan->mrc.next) + return false; + if (atomic_read(&nvchan->queue_sends) > 0) return false; } @@@ -946,7 -944,7 +946,7 @@@ static void rndis_filter_halt_device(st struct rndis_request *request; struct rndis_halt_request *halt; struct net_device_context *net_device_ctx = netdev_priv(dev->ndev); - struct netvsc_device *nvdev = net_device_ctx->nvdev; + struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);
/* Attempt to do a rndis device halt */ request = get_rndis_request(dev, RNDIS_MSG_HALT, @@@ -1017,20 -1015,20 +1017,20 @@@ static void netvsc_sc_open(struct vmbus { struct net_device *ndev = hv_get_drvdata(new_sc->primary_channel->device_obj); - struct netvsc_device *nvscdev = net_device_to_netvsc_device(ndev); + struct net_device_context *ndev_ctx = netdev_priv(ndev); + struct netvsc_device *nvscdev; u16 chn_index = new_sc->offermsg.offer.sub_channel_index; struct netvsc_channel *nvchan; int ret;
- if (chn_index >= nvscdev->num_chn) + /* This is safe because this callback only happens when + * new device is being setup and waiting on the channel_init_wait. + */ + nvscdev = rcu_dereference_raw(ndev_ctx->nvdev); + if (!nvscdev || chn_index >= nvscdev->num_chn) return;
nvchan = nvscdev->chan_table + chn_index; - nvchan->mrc.buf - = vzalloc(NETVSC_RECVSLOT_MAX * sizeof(struct recv_comp_data)); - - if (!nvchan->mrc.buf) - return;
/* Because the device uses NAPI, all the interrupt batching and * control is done via Net softirq, not the channel handling @@@ -1050,12 -1048,12 +1050,12 @@@ else netif_napi_del(&nvchan->napi);
- if (refcount_dec_and_test(&nvscdev->sc_offered)) - complete(&nvscdev->channel_init_wait); + atomic_inc(&nvscdev->open_chn); + wake_up(&nvscdev->subchan_open); }
-int rndis_filter_device_add(struct hv_device *dev, - struct netvsc_device_info *device_info) +struct netvsc_device *rndis_filter_device_add(struct hv_device *dev, + struct netvsc_device_info *device_info) { struct net_device *net = hv_get_drvdata(dev); struct net_device_context *net_device_ctx = netdev_priv(net); @@@ -1074,52 -1072,57 +1074,50 @@@
rndis_device = get_rndis_device(); if (!rndis_device) - return -ENODEV; + return ERR_PTR(-ENODEV);
/* * Let the inner driver handle this first to create the netvsc channel * NOTE! Once the channel is created, we may get a receive callback * (RndisFilterOnReceive()) before this call is completed */ - ret = netvsc_device_add(dev, device_info); - if (ret != 0) { + net_device = netvsc_device_add(dev, device_info); + if (IS_ERR(net_device)) { kfree(rndis_device); - return ret; + return net_device; }
/* Initialize the rndis device */ - net_device = net_device_ctx->nvdev; net_device->max_chn = 1; net_device->num_chn = 1;
- refcount_set(&net_device->sc_offered, 0); - net_device->extension = rndis_device; rndis_device->ndev = net;
/* Send the rndis initialization message */ - ret = rndis_filter_init_device(rndis_device); - if (ret != 0) { - rndis_filter_device_remove(dev, net_device); - return ret; - } + ret = rndis_filter_init_device(rndis_device, net_device); + if (ret != 0) + goto err_dev_remv;
/* Get the MTU from the host */ size = sizeof(u32); - ret = rndis_filter_query_device(rndis_device, + ret = rndis_filter_query_device(rndis_device, net_device, RNDIS_OID_GEN_MAXIMUM_FRAME_SIZE, &mtu, &size); if (ret == 0 && size == sizeof(u32) && mtu < net->mtu) net->mtu = mtu;
/* Get the mac address */ - ret = rndis_filter_query_device_mac(rndis_device); - if (ret != 0) { - rndis_filter_device_remove(dev, net_device); - return ret; - } + ret = rndis_filter_query_device_mac(rndis_device, net_device); + if (ret != 0) + goto err_dev_remv;
memcpy(device_info->mac_adr, rndis_device->hw_mac_adr, ETH_ALEN);
/* Find HW offload capabilities */ - ret = rndis_query_hwcaps(rndis_device, &hwcaps); - if (ret != 0) { - rndis_filter_device_remove(dev, net_device); - return ret; - } + ret = rndis_query_hwcaps(rndis_device, net_device, &hwcaps); + if (ret != 0) + goto err_dev_remv;
/* A value of zero means "no change"; now turn on what we want. */ memset(&offloads, 0, sizeof(struct ndis_offload_params)); @@@ -1174,24 -1177,24 +1172,24 @@@
netif_set_gso_max_size(net, gso_max_size);
- ret = rndis_filter_set_offload_params(net, &offloads); + ret = rndis_filter_set_offload_params(net, net_device, &offloads); if (ret) goto err_dev_remv;
- rndis_filter_query_device_link_status(rndis_device); + rndis_filter_query_device_link_status(rndis_device, net_device);
netdev_dbg(net, "Device MAC %pM link state %s\n", rndis_device->hw_mac_adr, rndis_device->link_state ? "down" : "up");
if (net_device->nvsp_version < NVSP_PROTOCOL_VERSION_5) - return 0; + return net_device;
- rndis_filter_query_link_speed(rndis_device); + rndis_filter_query_link_speed(rndis_device, net_device);
/* vRSS setup */ memset(&rsscap, 0, rsscap_size); - ret = rndis_filter_query_device(rndis_device, + ret = rndis_filter_query_device(rndis_device, net_device, OID_GEN_RECEIVE_SCALE_CAPABILITIES, &rsscap, &rsscap_size); if (ret || rsscap.num_recv_que < 2) @@@ -1216,20 -1219,11 +1214,20 @@@ rndis_device->ind_table[i] = ethtool_rxfh_indir_default(i, net_device->num_chn);
+ atomic_set(&net_device->open_chn, 1); num_rss_qs = net_device->num_chn - 1; if (num_rss_qs == 0) - return 0; + return net_device; + + for (i = 1; i < net_device->num_chn; i++) { + ret = netvsc_alloc_recv_comp_ring(net_device, i); + if (ret) { + while (--i != 0) + vfree(net_device->chan_table[i].mrc.slots); + goto out; + } + }
- refcount_set(&net_device->sc_offered, num_rss_qs); vmbus_set_sc_create_callback(dev->channel, netvsc_sc_open);
init_packet = &net_device->channel_init_pkt; @@@ -1246,15 -1240,19 +1244,19 @@@ if (ret) goto out;
+ wait_for_completion(&net_device->channel_init_wait); if (init_packet->msg.v5_msg.subchn_comp.status != NVSP_STAT_SUCCESS) { ret = -ENODEV; goto out; } - wait_for_completion(&net_device->channel_init_wait);
net_device->num_chn = 1 + init_packet->msg.v5_msg.subchn_comp.num_subchannels;
+ /* wait for all sub channels to open */ + wait_event(net_device->subchan_open, + atomic_read(&net_device->open_chn) == net_device->num_chn); + /* ignore failues from setting rss parameters, still have channels */ rndis_filter_set_rss_param(rndis_device, netvsc_hash_key, net_device->num_chn); @@@ -1264,11 -1262,11 +1266,11 @@@ out net_device->num_chn = 1; }
- return 0; /* return 0 because primary channel can be used alone */ + return net_device;
err_dev_remv: rndis_filter_device_remove(dev, net_device); - return ret; + return ERR_PTR(ret); }
void rndis_filter_device_remove(struct hv_device *dev, @@@ -1306,8 -1304,3 +1308,8 @@@ int rndis_filter_close(struct netvsc_de
return rndis_filter_close_device(nvdev->extension); } + +bool rndis_filter_opened(const struct netvsc_device *nvdev) +{ + return atomic_read(&nvdev->open_cnt) > 0; +} diff --combined drivers/net/ipvlan/ipvlan_main.c index fdde20735416,8dab74a81303..58a9f990b553 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@@ -169,7 -169,7 +169,7 @@@ static void ipvlan_port_destroy(struct
#define IPVLAN_FEATURES \ (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \ - NETIF_F_GSO | NETIF_F_TSO | NETIF_F_UFO | NETIF_F_GSO_ROBUST | \ + NETIF_F_GSO | NETIF_F_TSO | NETIF_F_GSO_ROBUST | \ NETIF_F_TSO_ECN | NETIF_F_TSO6 | NETIF_F_GRO | NETIF_F_RXCSUM | \ NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER)
@@@ -192,7 -192,7 +192,7 @@@ static int ipvlan_init(struct net_devic
netdev_lockdep_set_classes(dev);
- ipvlan->pcpu_stats = alloc_percpu(struct ipvl_pcpu_stats); + ipvlan->pcpu_stats = netdev_alloc_pcpu_stats(struct ipvl_pcpu_stats); if (!ipvlan->pcpu_stats) return -ENOMEM;
diff --combined drivers/net/vxlan.c index dbca067540d0,e17baac70f43..35e84a9e1cfb --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@@ -623,6 -623,7 +623,7 @@@ static struct sk_buff **vxlan_gro_recei
out: skb_gro_remcsum_cleanup(skb, &grc); + skb->remcsum_offload = 0; NAPI_GRO_CB(skb)->flush |= flush;
return pp; @@@ -2608,7 -2609,7 +2609,7 @@@ static struct device_type vxlan_type = * supply the listening VXLAN udp ports. Callers are expected * to implement the ndo_udp_tunnel_add. */ -static void vxlan_push_rx_ports(struct net_device *dev) +static void vxlan_offload_rx_ports(struct net_device *dev, bool push) { struct vxlan_sock *vs; struct net *net = dev_net(dev); @@@ -2617,19 -2618,11 +2618,19 @@@
spin_lock(&vn->sock_lock); for (i = 0; i < PORT_HASH_SIZE; ++i) { - hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) - udp_tunnel_push_rx_port(dev, vs->sock, - (vs->flags & VXLAN_F_GPE) ? - UDP_TUNNEL_TYPE_VXLAN_GPE : - UDP_TUNNEL_TYPE_VXLAN); + hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) { + unsigned short type; + + if (vs->flags & VXLAN_F_GPE) + type = UDP_TUNNEL_TYPE_VXLAN_GPE; + else + type = UDP_TUNNEL_TYPE_VXLAN; + + if (push) + udp_tunnel_push_rx_port(dev, vs->sock, type); + else + udp_tunnel_drop_rx_port(dev, vs->sock, type); + } } spin_unlock(&vn->sock_lock); } @@@ -3638,15 -3631,10 +3639,15 @@@ static int vxlan_netdevice_event(struc struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
- if (event == NETDEV_UNREGISTER) + if (event == NETDEV_UNREGISTER) { + vxlan_offload_rx_ports(dev, false); vxlan_handle_lowerdev_unregister(vn, dev); - else if (event == NETDEV_UDP_TUNNEL_PUSH_INFO) - vxlan_push_rx_ports(dev); + } else if (event == NETDEV_REGISTER) { + vxlan_offload_rx_ports(dev, true); + } else if (event == NETDEV_UDP_TUNNEL_PUSH_INFO || + event == NETDEV_UDP_TUNNEL_DROP_INFO) { + vxlan_offload_rx_ports(dev, event == NETDEV_UDP_TUNNEL_PUSH_INFO); + }
return NOTIFY_DONE; } diff --combined include/net/tcp.h index 999f3efe572b,ada65e767b28..afdab3781425 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@@ -139,7 -139,6 +139,7 @@@ void tcp_time_wait(struct sock *sk, in #endif #define TCP_RTO_MAX ((unsigned)(120*HZ)) #define TCP_RTO_MIN ((unsigned)(HZ/5)) +#define TCP_TIMEOUT_MIN (2U) /* Min timeout for TCP timers in jiffies */ #define TCP_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */ #define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value, now * used as a fallback RTO for the @@@ -151,6 -150,8 +151,6 @@@ #define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes * for local resources. */ -#define TCP_REO_TIMEOUT_MIN (2000) /* Min RACK reordering timeout in usec */ - #define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */ #define TCP_KEEPALIVE_PROBES 9 /* Max of 9 keepalive probes */ #define TCP_KEEPALIVE_INTVL (75*HZ) @@@ -256,6 -257,7 +256,6 @@@ extern int sysctl_tcp_rmem[3] extern int sysctl_tcp_app_win; extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_frto; -extern int sysctl_tcp_low_latency; extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; @@@ -350,11 -352,8 +350,11 @@@ int tcp_v4_rcv(struct sk_buff *skb)
int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw); int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); +int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); +int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, + size_t size, int flags); ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, size_t size, int flags); void tcp_release_cb(struct sock *sk); @@@ -364,7 -363,7 +364,7 @@@ void tcp_delack_timer_handler(struct so int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg); int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len); + const struct tcphdr *th); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); @@@ -634,6 -633,29 +634,6 @@@ static inline u32 __tcp_set_rto(const s return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us); }
-static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) -{ - tp->pred_flags = htonl((tp->tcp_header_len << 26) | - ntohl(TCP_FLAG_ACK) | - snd_wnd); -} - -static inline void tcp_fast_path_on(struct tcp_sock *tp) -{ - __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale); -} - -static inline void tcp_fast_path_check(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && - tp->rcv_wnd && - atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && - !tp->urg_data) - tcp_fast_path_on(tp); -} - /* Compute the actual rto_min value */ static inline u32 tcp_rto_min(struct sock *sk) { @@@ -827,16 -849,6 +827,16 @@@ static inline int tcp_v6_iif(const stru
return l3_slave ? skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif; } + +/* TCP_SKB_CB reference means this can not be used from early demux */ +static inline int tcp_v6_sdif(const struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (skb && ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags)) + return TCP_SKB_CB(skb)->header.h6.iif; +#endif + return 0; +} #endif
/* TCP_SKB_CB reference means this can not be used from early demux */ @@@ -850,16 -862,6 +850,16 @@@ static inline bool inet_exact_dif_match return false; }
+/* TCP_SKB_CB reference means this can not be used from early demux */ +static inline int tcp_v4_sdif(struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (skb && ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags)) + return TCP_SKB_CB(skb)->header.h4.iif; +#endif + return 0; +} + /* Due to TSO, an SKB can be composed of multiple actual * packets. To keep these tracked properly, we use this. */ @@@ -903,8 -905,9 +903,8 @@@ enum tcp_ca_event
/* Information about inbound ACK, passed to cong_ops->in_ack_event() */ enum tcp_ca_ack_event_flags { - CA_ACK_SLOWPATH = (1 << 0), /* In slow path processing */ - CA_ACK_WIN_UPDATE = (1 << 1), /* ACK updated window */ - CA_ACK_ECE = (1 << 2), /* ECE bit is set on ack */ + CA_ACK_WIN_UPDATE = (1 << 0), /* ACK updated window */ + CA_ACK_ECE = (1 << 1), /* ECE bit is set on ack */ };
/* @@@ -1242,6 -1245,17 +1242,6 @@@ static inline bool tcp_checksum_complet __tcp_checksum_complete(skb); }
-/* Prequeue for VJ style copy to user, combined with checksumming. */ - -static inline void tcp_prequeue_init(struct tcp_sock *tp) -{ - tp->ucopy.task = NULL; - tp->ucopy.len = 0; - tp->ucopy.memory = 0; - skb_queue_head_init(&tp->ucopy.prequeue); -} - -bool tcp_prequeue(struct sock *sk, struct sk_buff *skb); bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb); int tcp_filter(struct sock *sk, struct sk_buff *skb);
@@@ -1902,11 -1916,20 +1902,21 @@@ extern void tcp_rack_advance(struct tcp u64 xmit_time); extern void tcp_rack_reo_timeout(struct sock *sk);
+ /* At how many usecs into the future should the RTO fire? */ + static inline s64 tcp_rto_delta_us(const struct sock *sk) + { + const struct sk_buff *skb = tcp_write_queue_head(sk); + u32 rto = inet_csk(sk)->icsk_rto; + u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto); + + return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp; + } + /* * Save and compile IPv4 options, return a pointer to it */ -static inline struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) +static inline struct ip_options_rcu *tcp_v4_save_options(struct net *net, + struct sk_buff *skb) { const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; struct ip_options_rcu *dopt = NULL; @@@ -1915,7 -1938,7 +1925,7 @@@ int opt_size = sizeof(*dopt) + opt->optlen;
dopt = kmalloc(opt_size, GFP_ATOMIC); - if (dopt && __ip_options_echo(&dopt->opt, skb, opt)) { + if (dopt && __ip_options_echo(net, &dopt->opt, skb, opt)) { kfree(dopt); dopt = NULL; } diff --combined net/core/dev.c index 1d75499add72,ce15a06d5558..3f69f6e71824 --- a/net/core/dev.c +++ b/net/core/dev.c @@@ -144,7 -144,6 +144,7 @@@ #include <linux/netfilter_ingress.h> #include <linux/crash_dump.h> #include <linux/sctp.h> +#include <net/udp_tunnel.h>
#include "net-sysfs.h"
@@@ -1414,7 -1413,7 +1414,7 @@@ int dev_open(struct net_device *dev } EXPORT_SYMBOL(dev_open);
-static int __dev_close_many(struct list_head *head) +static void __dev_close_many(struct list_head *head) { struct net_device *dev;
@@@ -1456,18 -1455,23 +1456,18 @@@ dev->flags &= ~IFF_UP; netpoll_poll_enable(dev); } - - return 0; }
-static int __dev_close(struct net_device *dev) +static void __dev_close(struct net_device *dev) { - int retval; LIST_HEAD(single);
list_add(&dev->close_list, &single); - retval = __dev_close_many(&single); + __dev_close_many(&single); list_del(&single); - - return retval; }
-int dev_close_many(struct list_head *head, bool unlink) +void dev_close_many(struct list_head *head, bool unlink) { struct net_device *dev, *tmp;
@@@ -1484,6 -1488,8 +1484,6 @@@ if (unlink) list_del_init(&dev->close_list); } - - return 0; } EXPORT_SYMBOL(dev_close_many);
@@@ -1496,7 -1502,7 +1496,7 @@@ * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier * chain. */ -int dev_close(struct net_device *dev) +void dev_close(struct net_device *dev) { if (dev->flags & IFF_UP) { LIST_HEAD(single); @@@ -1505,6 -1511,7 +1505,6 @@@ dev_close_many(&single, true); list_del(&single); } - return 0; } EXPORT_SYMBOL(dev_close);
@@@ -1853,7 -1860,7 +1853,7 @@@ static inline int deliver_skb(struct sk struct packet_type *pt_prev, struct net_device *orig_dev) { - if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) return -ENOMEM; refcount_inc(&skb->users); return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); @@@ -2732,7 -2739,7 +2732,7 @@@ static inline bool skb_needs_check(stru { if (tx_path) return skb->ip_summed != CHECKSUM_PARTIAL && - skb->ip_summed != CHECKSUM_NONE; + skb->ip_summed != CHECKSUM_UNNECESSARY;
return skb->ip_summed == CHECKSUM_NONE; } @@@ -3858,121 -3865,6 +3858,121 @@@ drop return NET_RX_DROP; }
+static u32 netif_receive_generic_xdp(struct sk_buff *skb, + struct bpf_prog *xdp_prog) +{ + struct xdp_buff xdp; + u32 act = XDP_DROP; + void *orig_data; + int hlen, off; + u32 mac_len; + + /* Reinjected packets coming from act_mirred or similar should + * not get XDP generic processing. + */ + if (skb_cloned(skb)) + return XDP_PASS; + + if (skb_linearize(skb)) + goto do_drop; + + /* The XDP program wants to see the packet starting at the MAC + * header. + */ + mac_len = skb->data - skb_mac_header(skb); + hlen = skb_headlen(skb) + mac_len; + xdp.data = skb->data - mac_len; + xdp.data_end = xdp.data + hlen; + xdp.data_hard_start = skb->data - skb_headroom(skb); + orig_data = xdp.data; + + act = bpf_prog_run_xdp(xdp_prog, &xdp); + + off = xdp.data - orig_data; + if (off > 0) + __skb_pull(skb, off); + else if (off < 0) + __skb_push(skb, -off); + + switch (act) { + case XDP_REDIRECT: + case XDP_TX: + __skb_push(skb, mac_len); + /* fall through */ + case XDP_PASS: + break; + + default: + bpf_warn_invalid_xdp_action(act); + /* fall through */ + case XDP_ABORTED: + trace_xdp_exception(skb->dev, xdp_prog, act); + /* fall through */ + case XDP_DROP: + do_drop: + kfree_skb(skb); + break; + } + + return act; +} + +/* When doing generic XDP we have to bypass the qdisc layer and the + * network taps in order to match in-driver-XDP behavior. + */ +static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) +{ + struct net_device *dev = skb->dev; + struct netdev_queue *txq; + bool free_skb = true; + int cpu, rc; + + txq = netdev_pick_tx(dev, skb, NULL); + cpu = smp_processor_id(); + HARD_TX_LOCK(dev, txq, cpu); + if (!netif_xmit_stopped(txq)) { + rc = netdev_start_xmit(skb, dev, txq, 0); + if (dev_xmit_complete(rc)) + free_skb = false; + } + HARD_TX_UNLOCK(dev, txq); + if (free_skb) { + trace_xdp_exception(dev, xdp_prog, XDP_TX); + kfree_skb(skb); + } +} + +static struct static_key generic_xdp_needed __read_mostly; + +static int do_xdp_generic(struct sk_buff *skb) +{ + struct bpf_prog *xdp_prog = rcu_dereference(skb->dev->xdp_prog); + + if (xdp_prog) { + u32 act = netif_receive_generic_xdp(skb, xdp_prog); + int err; + + if (act != XDP_PASS) { + switch (act) { + case XDP_REDIRECT: + err = xdp_do_generic_redirect(skb->dev, skb); + if (err) + goto out_redir; + /* fallthru to submit skb */ + case XDP_TX: + generic_xdp_tx(skb, xdp_prog); + break; + } + return XDP_DROP; + } + } + return XDP_PASS; +out_redir: + trace_xdp_exception(skb->dev, xdp_prog, XDP_REDIRECT); + kfree_skb(skb); + return XDP_DROP; +} + static int netif_rx_internal(struct sk_buff *skb) { int ret; @@@ -3980,18 -3872,6 +3980,18 @@@ net_timestamp_check(netdev_tstamp_prequeue, skb);
trace_netif_rx(skb); + + if (static_key_false(&generic_xdp_needed)) { + int ret = do_xdp_generic(skb); + + /* Consider XDP consuming the packet a success from + * the netdev point of view we do not want to count + * this as an error. + */ + if (ret != XDP_PASS) + return NET_RX_SUCCESS; + } + #ifdef CONFIG_RPS if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; @@@ -4412,7 -4292,7 +4412,7 @@@ skip_classify }
if (pt_prev) { - if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) goto drop; else ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); @@@ -4458,6 -4338,8 +4458,6 @@@ static int __netif_receive_skb(struct s return ret; }
-static struct static_key generic_xdp_needed __read_mostly; - static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp) { struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); @@@ -4491,6 -4373,89 +4491,6 @@@ return ret; }
-static u32 netif_receive_generic_xdp(struct sk_buff *skb, - struct bpf_prog *xdp_prog) -{ - struct xdp_buff xdp; - u32 act = XDP_DROP; - void *orig_data; - int hlen, off; - u32 mac_len; - - /* Reinjected packets coming from act_mirred or similar should - * not get XDP generic processing. - */ - if (skb_cloned(skb)) - return XDP_PASS; - - if (skb_linearize(skb)) - goto do_drop; - - /* The XDP program wants to see the packet starting at the MAC - * header. - */ - mac_len = skb->data - skb_mac_header(skb); - hlen = skb_headlen(skb) + mac_len; - xdp.data = skb->data - mac_len; - xdp.data_end = xdp.data + hlen; - xdp.data_hard_start = skb->data - skb_headroom(skb); - orig_data = xdp.data; - - act = bpf_prog_run_xdp(xdp_prog, &xdp); - - off = xdp.data - orig_data; - if (off > 0) - __skb_pull(skb, off); - else if (off < 0) - __skb_push(skb, -off); - - switch (act) { - case XDP_TX: - __skb_push(skb, mac_len); - /* fall through */ - case XDP_PASS: - break; - - default: - bpf_warn_invalid_xdp_action(act); - /* fall through */ - case XDP_ABORTED: - trace_xdp_exception(skb->dev, xdp_prog, act); - /* fall through */ - case XDP_DROP: - do_drop: - kfree_skb(skb); - break; - } - - return act; -} - -/* When doing generic XDP we have to bypass the qdisc layer and the - * network taps in order to match in-driver-XDP behavior. - */ -static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) -{ - struct net_device *dev = skb->dev; - struct netdev_queue *txq; - bool free_skb = true; - int cpu, rc; - - txq = netdev_pick_tx(dev, skb, NULL); - cpu = smp_processor_id(); - HARD_TX_LOCK(dev, txq, cpu); - if (!netif_xmit_stopped(txq)) { - rc = netdev_start_xmit(skb, dev, txq, 0); - if (dev_xmit_complete(rc)) - free_skb = false; - } - HARD_TX_UNLOCK(dev, txq); - if (free_skb) { - trace_xdp_exception(dev, xdp_prog, XDP_TX); - kfree_skb(skb); - } -} - static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; @@@ -4503,11 -4468,17 +4503,11 @@@ rcu_read_lock();
if (static_key_false(&generic_xdp_needed)) { - struct bpf_prog *xdp_prog = rcu_dereference(skb->dev->xdp_prog); + int ret = do_xdp_generic(skb);
- if (xdp_prog) { - u32 act = netif_receive_generic_xdp(skb, xdp_prog); - - if (act != XDP_PASS) { - rcu_read_unlock(); - if (act == XDP_TX) - generic_xdp_tx(skb, xdp_prog); - return NET_RX_DROP; - } + if (ret != XDP_PASS) { + rcu_read_unlock(); + return NET_RX_DROP; } }
@@@ -6718,12 -6689,8 +6718,12 @@@ int __dev_change_flags(struct net_devic */
ret = 0; - if ((old_flags ^ flags) & IFF_UP) - ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); + if ((old_flags ^ flags) & IFF_UP) { + if (old_flags & IFF_UP) + __dev_close(dev); + else + ret = __dev_open(dev); + }
if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; @@@ -7268,6 -7235,24 +7268,6 @@@ static netdev_features_t netdev_fix_fea features &= ~NETIF_F_GSO; }
- /* UFO needs SG and checksumming */ - if (features & NETIF_F_UFO) { - /* maybe split UFO into V4 and V6? */ - if (!(features & NETIF_F_HW_CSUM) && - ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) != - (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) { - netdev_dbg(dev, - "Dropping NETIF_F_UFO since no checksum offload features.\n"); - features &= ~NETIF_F_UFO; - } - - if (!(features & NETIF_F_SG)) { - netdev_dbg(dev, - "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n"); - features &= ~NETIF_F_UFO; - } - } - /* GSO partial features require GSO partial be set */ if ((features & dev->gso_partial_features) && !(features & NETIF_F_GSO_PARTIAL)) { @@@ -7328,27 -7313,8 +7328,27 @@@ sync_lower netdev_for_each_lower_dev(dev, lower, iter) netdev_sync_lower_features(dev, lower, features);
- if (!err) + if (!err) { + netdev_features_t diff = features ^ dev->features; + + if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) { + /* udp_tunnel_{get,drop}_rx_info both need + * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the + * device, or they won't do anything. + * Thus we need to update dev->features + * *before* calling udp_tunnel_get_rx_info, + * but *after* calling udp_tunnel_drop_rx_info. + */ + if (features & NETIF_F_RX_UDP_TUNNEL_PORT) { + dev->features = features; + udp_tunnel_get_rx_info(dev); + } else { + udp_tunnel_drop_rx_info(dev); + } + } + dev->features = features; + }
return err < 0 ? 0 : 1; } @@@ -7550,12 -7516,6 +7550,12 @@@ int register_netdevice(struct net_devic */ dev->hw_features |= NETIF_F_SOFT_FEATURES; dev->features |= NETIF_F_SOFT_FEATURES; + + if (dev->netdev_ops->ndo_udp_tunnel_add) { + dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT; + dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT; + } + dev->wanted_features = dev->features & dev->hw_features;
if (!(dev->flags & IFF_LOOPBACK)) diff --combined net/ipv4/tcp_input.c index 842ed75ccb25,53de1424c13c..d73903fe8c83 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@@ -103,9 -103,11 +103,10 @@@ int sysctl_tcp_invalid_ratelimit __read #define FLAG_DATA_SACKED 0x20 /* New SACK. */ #define FLAG_ECE 0x40 /* ECE in this ACK */ #define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */ -#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ #define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ + #define FLAG_SET_XMIT_TIMER 0x1000 /* Set TLP or RTO timer */ #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ #define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ #define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ @@@ -1950,7 -1952,6 +1951,7 @@@ void tcp_enter_loss(struct sock *sk !after(tp->high_seq, tp->snd_una) || (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(sk); + tp->prior_cwnd = tp->snd_cwnd; tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tcp_ca_event(sk, CA_EVENT_LOSS); tcp_init_undo(tp); @@@ -2520,8 -2521,8 +2521,8 @@@ static inline void tcp_end_cwnd_reducti return;
/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ - if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || - (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) { + if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH && + (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || tp->undo_marker)) { tp->snd_cwnd = tp->snd_ssthresh; tp->snd_cwnd_stamp = tcp_jiffies32; } @@@ -3004,10 -3005,7 +3005,7 @@@ void tcp_rearm_rto(struct sock *sk /* Offset the time elapsed after installing regular RTO */ if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { - struct sk_buff *skb = tcp_write_queue_head(sk); - u64 rto_time_stamp = skb->skb_mstamp + - jiffies_to_usecs(rto); - s64 delta_us = rto_time_stamp - tp->tcp_mstamp; + s64 delta_us = tcp_rto_delta_us(sk); /* delta_us may not be positive if the socket is locked * when the retrans timer fires and is rescheduled. */ @@@ -3019,6 -3017,13 +3017,13 @@@ } }
+ /* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */ + static void tcp_set_xmit_timer(struct sock *sk) + { + if (!tcp_schedule_loss_probe(sk)) + tcp_rearm_rto(sk); + } + /* If we get here, the whole TSO packet has not been acked. */ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) { @@@ -3180,7 -3185,7 +3185,7 @@@ static int tcp_clean_rtx_queue(struct s ca_rtt_us, sack->rate);
if (flag & FLAG_ACKED) { - tcp_rearm_rto(sk); + flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */ if (unlikely(icsk->icsk_mtup.probe_size && !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) { tcp_mtup_probe_success(sk); @@@ -3208,7 -3213,7 +3213,7 @@@ * after when the head was last (re)transmitted. Otherwise the * timeout may continue to extend in loss recovery. */ - tcp_rearm_rto(sk); + flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */ }
if (icsk->icsk_ca_ops->pkts_acked) { @@@ -3367,6 -3372,12 +3372,6 @@@ static int tcp_ack_update_window(struc if (tp->snd_wnd != nwin) { tp->snd_wnd = nwin;
- /* Note, it is the only place, where - * fast path is recovered for sending TCP. - */ - tp->pred_flags = 0; - tcp_fast_path_check(sk); - if (tcp_send_head(sk)) tcp_slow_start_after_idle_check(sk);
@@@ -3548,7 -3559,6 +3553,7 @@@ static int tcp_ack(struct sock *sk, con u32 lost = tp->lost; int acked = 0; /* Number of packets newly acked */ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ + u32 ack_ev_flags = 0;
sack_state.first_sackt = 0; sack_state.rate = &rs; @@@ -3575,9 -3585,6 +3580,6 @@@ if (after(ack, tp->snd_nxt)) goto invalid_ack;
- if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) - tcp_rearm_rto(sk); - if (after(ack, prior_snd_una)) { flag |= FLAG_SND_UNA_ADVANCED; icsk->icsk_retransmits = 0; @@@ -3592,26 -3599,42 +3594,26 @@@ if (flag & FLAG_UPDATE_TS_RECENT) tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
- if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) { - /* Window is constant, pure forward advance. - * No more checks are required. - * Note, we use the fact that SND.UNA>=SND.WL2. - */ - tcp_update_wl(tp, ack_seq); - tcp_snd_una_update(tp, ack); - flag |= FLAG_WIN_UPDATE; - - tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); - - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS); - } else { - u32 ack_ev_flags = CA_ACK_SLOWPATH; - - if (ack_seq != TCP_SKB_CB(skb)->end_seq) - flag |= FLAG_DATA; - else - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS); + if (ack_seq != TCP_SKB_CB(skb)->end_seq) + flag |= FLAG_DATA; + else + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);
- flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); + flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
- if (TCP_SKB_CB(skb)->sacked) - flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, - &sack_state); + if (TCP_SKB_CB(skb)->sacked) + flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, + &sack_state);
- if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { - flag |= FLAG_ECE; - ack_ev_flags |= CA_ACK_ECE; - } + if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { + flag |= FLAG_ECE; + ack_ev_flags = CA_ACK_ECE; + }
- if (flag & FLAG_WIN_UPDATE) - ack_ev_flags |= CA_ACK_WIN_UPDATE; + if (flag & FLAG_WIN_UPDATE) + ack_ev_flags |= CA_ACK_WIN_UPDATE;
- tcp_in_ack_event(sk, ack_ev_flags); - } + tcp_in_ack_event(sk, ack_ev_flags);
/* We passed data and got it acked, remove any soft error * log. Something worked... @@@ -3626,18 -3649,20 +3628,20 @@@ flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, &sack_state);
+ if (tp->tlp_high_seq) + tcp_process_tlp_ack(sk, ack, flag); + /* If needed, reset TLP/RTO timer; RACK may later override this. */ + if (flag & FLAG_SET_XMIT_TIMER) + tcp_set_xmit_timer(sk); + if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); } - if (tp->tlp_high_seq) - tcp_process_tlp_ack(sk, ack, flag);
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) sk_dst_confirm(sk);
- if (icsk->icsk_pending == ICSK_TIME_RETRANS) - tcp_schedule_loss_probe(sk); delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ lost = tp->lost - lost; /* freshly marked lost */ tcp_rate_gen(sk, delivered, lost, sack_state.rate); @@@ -4377,6 -4402,8 +4381,6 @@@ static void tcp_data_queue_ofo(struct s return; }
- /* Disable header prediction. */ - tp->pred_flags = 0; inet_csk_schedule_ack(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); @@@ -4565,8 -4592,8 +4569,8 @@@ err static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - bool fragstolen = false; - int eaten = -1; + bool fragstolen; + int eaten;
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { __kfree_skb(skb); @@@ -4588,13 -4615,32 +4592,13 @@@ goto out_of_window;
/* Ok. In sequence. In window. */ - if (tp->ucopy.task == current && - tp->copied_seq == tp->rcv_nxt && tp->ucopy.len && - sock_owned_by_user(sk) && !tp->urg_data) { - int chunk = min_t(unsigned int, skb->len, - tp->ucopy.len); - - __set_current_state(TASK_RUNNING); - - if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) { - tp->ucopy.len -= chunk; - tp->copied_seq += chunk; - eaten = (chunk == skb->len); - tcp_rcv_space_adjust(sk); - } - } - - if (eaten <= 0) { queue_and_out: - if (eaten < 0) { - if (skb_queue_len(&sk->sk_receive_queue) == 0) - sk_forced_mem_schedule(sk, skb->truesize); - else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) - goto drop; - } - eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); - } + if (skb_queue_len(&sk->sk_receive_queue) == 0) + sk_forced_mem_schedule(sk, skb->truesize); + else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) + goto drop; + + eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); if (skb->len) tcp_event_data_recv(sk, skb); @@@ -4614,6 -4660,8 +4618,6 @@@ if (tp->rx_opt.num_sacks) tcp_sack_remove(tp);
- tcp_fast_path_check(sk); - if (eaten > 0) kfree_skb_partial(skb, fragstolen); if (!sock_flag(sk, SOCK_DEAD)) @@@ -4939,6 -4987,7 +4943,6 @@@ static int tcp_prune_queue(struct sock NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
/* Massive buffer overcommit. */ - tp->pred_flags = 0; return -1; }
@@@ -5110,6 -5159,9 +5114,6 @@@ static void tcp_check_urg(struct sock *
tp->urg_data = TCP_URG_NOTYET; tp->urg_seq = ptr; - - /* Disable header prediction. */ - tp->pred_flags = 0; }
/* This is the 'fast' part of urgent handling. */ @@@ -5138,6 -5190,26 +5142,6 @@@ static void tcp_urg(struct sock *sk, st } }
-static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) -{ - struct tcp_sock *tp = tcp_sk(sk); - int chunk = skb->len - hlen; - int err; - - if (skb_csum_unnecessary(skb)) - err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk); - else - err = skb_copy_and_csum_datagram_msg(skb, hlen, tp->ucopy.msg); - - if (!err) { - tp->ucopy.len -= chunk; - tp->copied_seq += chunk; - tcp_rcv_space_adjust(sk); - } - - return err; -} - /* Accept RST for rcv_nxt - 1 after a FIN. * When tcp connections are abruptly terminated from Mac OSX (via ^C), a * FIN is sent followed by a RST packet. The RST is sent with the same @@@ -5268,29 -5340,201 +5272,29 @@@ discard
/* * TCP receive function for the ESTABLISHED state. - * - * It is split into a fast path and a slow path. The fast path is - * disabled when: - * - A zero window was announced from us - zero window probing - * is only handled properly in the slow path. - * - Out of order segments arrived. - * - Urgent data is expected. - * - There is no buffer space left - * - Unexpected TCP flags/window values/header lengths are received - * (detected by checking the TCP header against pred_flags) - * - Data is sent in both directions. Fast path only supports pure senders - * or pure receivers (this means either the sequence number or the ack - * value must stay constant) - * - Unexpected TCP option. - * - * When these conditions are not satisfied it drops into a standard - * receive procedure patterned after RFC793 to handle all cases. - * The first three cases are guaranteed by proper pred_flags setting, - * the rest is checked inline. Fast processing is turned on in - * tcp_data_queue when everything is OK. */ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len) + const struct tcphdr *th) { + unsigned int len = skb->len; struct tcp_sock *tp = tcp_sk(sk);
tcp_mstamp_refresh(tp); if (unlikely(!sk->sk_rx_dst)) inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); - /* - * Header prediction. - * The code loosely follows the one in the famous - * "30 instruction TCP receive" Van Jacobson mail. - * - * Van's trick is to deposit buffers into socket queue - * on a device interrupt, to call tcp_recv function - * on the receive process context and checksum and copy - * the buffer to user space. smart... - * - * Our current scheme is not silly either but we take the - * extra cost of the net_bh soft interrupt processing... - * We do checksum and copy also but from device to kernel. - */
tp->rx_opt.saw_tstamp = 0;
- /* pred_flags is 0xS?10 << 16 + snd_wnd - * if header_prediction is to be made - * 'S' will always be tp->tcp_header_len >> 2 - * '?' will be 0 for the fast path, otherwise pred_flags is 0 to - * turn it off (when there are holes in the receive - * space for instance) - * PSH flag is ignored. - */ - - if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags && - TCP_SKB_CB(skb)->seq == tp->rcv_nxt && - !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) { - int tcp_header_len = tp->tcp_header_len; - - /* Timestamp header prediction: tcp_header_len - * is automatically equal to th->doff*4 due to pred_flags - * match. - */ - - /* Check timestamp */ - if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) { - /* No? Slow path! */ - if (!tcp_parse_aligned_timestamp(tp, th)) - goto slow_path; - - /* If PAWS failed, check it more carefully in slow path */ - if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0) - goto slow_path; - - /* DO NOT update ts_recent here, if checksum fails - * and timestamp was corrupted part, it will result - * in a hung connection since we will drop all - * future packets due to the PAWS test. - */ - } - - if (len <= tcp_header_len) { - /* Bulk data transfer: sender */ - if (len == tcp_header_len) { - /* Predicted packet is in window by definition. - * seq == rcv_nxt and rcv_wup <= rcv_nxt. - * Hence, check seq<=rcv_wup reduces to: - */ - if (tcp_header_len == - (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && - tp->rcv_nxt == tp->rcv_wup) - tcp_store_ts_recent(tp); - - /* We know that such packets are checksummed - * on entry. - */ - tcp_ack(sk, skb, 0); - __kfree_skb(skb); - tcp_data_snd_check(sk); - return; - } else { /* Header too small */ - TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); - goto discard; - } - } else { - int eaten = 0; - bool fragstolen = false; - - if (tp->ucopy.task == current && - tp->copied_seq == tp->rcv_nxt && - len - tcp_header_len <= tp->ucopy.len && - sock_owned_by_user(sk)) { - __set_current_state(TASK_RUNNING); - - if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) { - /* Predicted packet is in window by definition. - * seq == rcv_nxt and rcv_wup <= rcv_nxt. - * Hence, check seq<=rcv_wup reduces to: - */ - if (tcp_header_len == - (sizeof(struct tcphdr) + - TCPOLEN_TSTAMP_ALIGNED) && - tp->rcv_nxt == tp->rcv_wup) - tcp_store_ts_recent(tp); - - tcp_rcv_rtt_measure_ts(sk, skb); - - __skb_pull(skb, tcp_header_len); - tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPHPHITSTOUSER); - eaten = 1; - } - } - if (!eaten) { - if (tcp_checksum_complete(skb)) - goto csum_error; - - if ((int)skb->truesize > sk->sk_forward_alloc) - goto step5; - - /* Predicted packet is in window by definition. - * seq == rcv_nxt and rcv_wup <= rcv_nxt. - * Hence, check seq<=rcv_wup reduces to: - */ - if (tcp_header_len == - (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && - tp->rcv_nxt == tp->rcv_wup) - tcp_store_ts_recent(tp); - - tcp_rcv_rtt_measure_ts(sk, skb); - - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); - - /* Bulk data transfer: receiver */ - eaten = tcp_queue_rcv(sk, skb, tcp_header_len, - &fragstolen); - } - - tcp_event_data_recv(sk, skb); - - if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { - /* Well, only one small jumplet in fast path... */ - tcp_ack(sk, skb, FLAG_DATA); - tcp_data_snd_check(sk); - if (!inet_csk_ack_scheduled(sk)) - goto no_ack; - } - - __tcp_ack_snd_check(sk, 0); -no_ack: - if (eaten) - kfree_skb_partial(skb, fragstolen); - sk->sk_data_ready(sk); - return; - } - } - -slow_path: if (len < (th->doff << 2) || tcp_checksum_complete(skb)) goto csum_error;
if (!th->ack && !th->rst && !th->syn) goto discard;
- /* - * Standard slow path. - */ - if (!tcp_validate_incoming(sk, skb, th, 1)) return;
-step5: - if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0) + if (tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT) < 0) goto discard;
tcp_rcv_rtt_measure_ts(sk, skb); @@@ -5343,6 -5587,12 +5347,6 @@@ void tcp_finish_connect(struct sock *sk
if (sock_flag(sk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); - - if (!tp->rx_opt.snd_wscale) - __tcp_fast_path_on(tp, tp->snd_wnd); - else - tp->pred_flags = 0; - }
static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, @@@ -5471,7 -5721,7 +5475,7 @@@ static int tcp_rcv_synsent_state_proces tcp_ecn_rcv_synack(tp, th);
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); - tcp_ack(sk, skb, FLAG_SLOWPATH); + tcp_ack(sk, skb, 0);
/* Ok.. it's good. Set up sequence numbers and * move to established. @@@ -5707,8 -5957,8 +5711,8 @@@ int tcp_rcv_state_process(struct sock * return 0;
/* step 5: check the ACK field */ - acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH | - FLAG_UPDATE_TS_RECENT | + + acceptable = tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT | FLAG_NO_CHALLENGE_ACK) > 0;
if (!acceptable) { @@@ -5776,6 -6026,7 +5780,6 @@@ tp->lsndtime = tcp_jiffies32;
tcp_initialize_rcv_mss(sk); - tcp_fast_path_on(tp); break;
case TCP_FIN_WAIT1: { diff --combined net/ipv4/tcp_output.c index d49bff51bdb7,b7661a68d498..3e0d19631534 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@@ -295,7 -295,9 +295,7 @@@ static u16 tcp_select_window(struct soc /* RFC1323 scaling applied */ new_win >>= tp->rx_opt.rcv_wscale;
- /* If we advertise zero window, disable fast path. */ if (new_win == 0) { - tp->pred_flags = 0; if (old_win) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTOZEROWINDOWADV); @@@ -2375,23 -2377,15 +2375,14 @@@ bool tcp_schedule_loss_probe(struct soc { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - u32 timeout, tlp_time_stamp, rto_time_stamp; - u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3); + u32 timeout, rto_delta_us;
- /* No consecutive loss probes. */ - if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) { - tcp_rearm_rto(sk); - return false; - } /* Don't do any loss probe on a Fast Open connection before 3WHS * finishes. */ if (tp->fastopen_rsk) return false;
- /* TLP is only scheduled when next timer event is RTO. */ - if (icsk->icsk_pending != ICSK_TIME_RETRANS) - return false; - /* Schedule a loss probe in 2*RTT for SACK capable connections * in Open state, that are either limited by cwnd or application. */ @@@ -2404,28 -2398,20 +2395,24 @@@ tcp_send_head(sk)) return false;
- /* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account + /* Probe timeout is 2*rtt. Add minimum RTO to account * for delayed ack when there's one outstanding packet. If no RTT * sample is available then probe after TCP_TIMEOUT_INIT. */ - timeout = rtt << 1 ? : TCP_TIMEOUT_INIT; - if (tp->packets_out == 1) - timeout = max_t(u32, timeout, - (rtt + (rtt >> 1) + TCP_DELACK_MAX)); - timeout = max_t(u32, timeout, msecs_to_jiffies(10)); + if (tp->srtt_us) { + timeout = usecs_to_jiffies(tp->srtt_us >> 2); + if (tp->packets_out == 1) + timeout += TCP_RTO_MIN; + else + timeout += TCP_TIMEOUT_MIN; + } else { + timeout = TCP_TIMEOUT_INIT; + }
- /* If RTO is shorter, just schedule TLP in its place. */ - tlp_time_stamp = tcp_jiffies32 + timeout; - rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout; - if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) { - s32 delta = rto_time_stamp - tcp_jiffies32; - if (delta > 0) - timeout = delta; - } + /* If the RTO formula yields an earlier time, then use that time. */ + rto_delta_us = tcp_rto_delta_us(sk); /* How far in future is RTO? */ + if (rto_delta_us > 0) + timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX); @@@ -3450,6 -3436,10 +3437,10 @@@ int tcp_connect(struct sock *sk int err;
tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB); + + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) + return -EHOSTUNREACH; /* Routing failure or similar. */ + tcp_connect_init(sk);
if (unlikely(tp->repair)) { diff --combined net/ipv4/tcp_timer.c index f753f9d2fee3,e906014890b6..655dd8d7f064 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@@ -239,6 -239,7 +239,6 @@@ static int tcp_write_timeout(struct soc /* Called with BH disabled */ void tcp_delack_timer_handler(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk);
sk_mem_reclaim_partial(sk); @@@ -253,6 -254,17 +253,6 @@@ } icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
- if (!skb_queue_empty(&tp->ucopy.prequeue)) { - struct sk_buff *skb; - - __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED); - - while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) - sk_backlog_rcv(sk, skb); - - tp->ucopy.memory = 0; - } - if (inet_csk_ack_scheduled(sk)) { if (!icsk->icsk_ack.pingpong) { /* Delayed ACK missed: inflate ATO. */ @@@ -640,7 -652,8 +640,8 @@@ static void tcp_keepalive_timer (unsign goto death; }
- if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE) + if (!sock_flag(sk, SOCK_KEEPOPEN) || + ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT))) goto out;
elapsed = keepalive_time_when(tp); diff --combined net/ipv6/route.c index 7ecbe5eb19f8,a640fbcba15d..c73e61750642 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@@ -1820,11 -1820,6 +1820,11 @@@ static struct rt6_info *ip6_route_info_ goto out; }
+ if (cfg->fc_flags & RTF_OFFLOAD) { + NL_SET_ERR_MSG(extack, "Userspace can not set RTF_OFFLOAD"); + goto out; + } + if (cfg->fc_dst_len > 128) { NL_SET_ERR_MSG(extack, "Invalid prefix length"); goto out; @@@ -2356,6 -2351,7 +2356,7 @@@ static void rt6_do_redirect(struct dst_ if (on_link) nrt->rt6i_flags &= ~RTF_GATEWAY;
+ nrt->rt6i_protocol = RTPROT_REDIRECT; nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
if (ip6_ins_rt(nrt)) @@@ -2466,6 -2462,7 +2467,7 @@@ static struct rt6_info *rt6_add_route_i .fc_dst_len = prefixlen, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref), + .fc_protocol = RTPROT_RA, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = net, @@@ -2518,6 -2515,7 +2520,7 @@@ struct rt6_info *rt6_add_dflt_router(co .fc_ifindex = dev->ifindex, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES | RTF_PREF(pref), + .fc_protocol = RTPROT_RA, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = dev_net(dev), @@@ -3332,9 -3330,6 +3335,9 @@@ static int rt6_nexthop_info(struct sk_b goto nla_put_failure; }
+ if (rt->rt6i_flags & RTF_OFFLOAD) + *flags |= RTNH_F_OFFLOAD; + /* not needed for multipath encoding b/c it has a rtnexthop struct */ if (!skip_oif && rt->dst.dev && nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) @@@ -3432,14 -3427,6 +3435,6 @@@ static int rt6_fill_node(struct net *ne rtm->rtm_flags = 0; rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = rt->rt6i_protocol; - if (rt->rt6i_flags & RTF_DYNAMIC) - rtm->rtm_protocol = RTPROT_REDIRECT; - else if (rt->rt6i_flags & RTF_ADDRCONF) { - if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO)) - rtm->rtm_protocol = RTPROT_RA; - else - rtm->rtm_protocol = RTPROT_KERNEL; - }
if (rt->rt6i_flags & RTF_CACHE) rtm->rtm_flags |= RTM_F_CLONED; @@@ -3934,7 -3921,6 +3929,7 @@@ static int __net_init ip6_route_net_ini ip6_template_metrics, true);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES + net->ipv6.fib6_has_custom_rules = false; net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, sizeof(*net->ipv6.ip6_prohibit_entry), GFP_KERNEL); diff --combined tools/testing/selftests/bpf/test_verifier.c index 65aa562cff87,d3ed7324105e..ab0cd1198326 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@@ -8,6 -8,7 +8,7 @@@ * License as published by the Free Software Foundation. */
+ #include <endian.h> #include <asm/types.h> #include <linux/types.h> #include <stdint.h> @@@ -421,7 -422,7 +422,7 @@@ static struct bpf_test tests[] = BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .errstr_unpriv = "R1 pointer arithmetic", + .errstr_unpriv = "R1 subtraction from stack pointer", .result_unpriv = REJECT, .errstr = "R1 invalid mem access", .result = REJECT, @@@ -603,9 -604,8 +604,9 @@@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, -4), BPF_EXIT_INSN(), }, - .errstr = "misaligned access", + .errstr = "misaligned stack access", .result = REJECT, + .flags = F_LOAD_WITH_STRICT_ALIGNMENT, }, { "invalid map_fd for function call", @@@ -651,9 -651,8 +652,9 @@@ BPF_EXIT_INSN(), }, .fixup_map1 = { 3 }, - .errstr = "misaligned access", + .errstr = "misaligned value access", .result = REJECT, + .flags = F_LOAD_WITH_STRICT_ALIGNMENT, }, { "sometimes access memory with incorrect alignment", @@@ -674,7 -673,6 +675,7 @@@ .errstr = "R0 invalid mem access", .errstr_unpriv = "R0 leaks addr", .result = REJECT, + .flags = F_LOAD_WITH_STRICT_ALIGNMENT, }, { "jump test 1", @@@ -1101,7 -1099,7 +1102,7 @@@ "check skb->hash byte load permitted", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), - #ifdef __LITTLE_ENDIAN + #if __BYTE_ORDER == __LITTLE_ENDIAN BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, offsetof(struct __sk_buff, hash)), #else @@@ -1138,7 -1136,7 +1139,7 @@@ "check skb->hash byte load not permitted 3", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), - #ifdef __LITTLE_ENDIAN + #if __BYTE_ORDER == __LITTLE_ENDIAN BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, offsetof(struct __sk_buff, hash) + 3), #else @@@ -1218,9 -1216,8 +1219,9 @@@ offsetof(struct __sk_buff, cb[0]) + 1), BPF_EXIT_INSN(), }, - .errstr = "misaligned access", + .errstr = "misaligned context access", .result = REJECT, + .flags = F_LOAD_WITH_STRICT_ALIGNMENT, }, { "check __sk_buff->hash, offset 0, half store not permitted", @@@ -1248,7 -1245,7 +1249,7 @@@ "check skb->hash half load permitted", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), - #ifdef __LITTLE_ENDIAN + #if __BYTE_ORDER == __LITTLE_ENDIAN BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, offsetof(struct __sk_buff, hash)), #else @@@ -1263,7 -1260,7 +1264,7 @@@ "check skb->hash half load not permitted", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), - #ifdef __LITTLE_ENDIAN + #if __BYTE_ORDER == __LITTLE_ENDIAN BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, offsetof(struct __sk_buff, hash) + 2), #else @@@ -1323,9 -1320,8 +1324,9 @@@ offsetof(struct __sk_buff, cb[0]) + 2), BPF_EXIT_INSN(), }, - .errstr = "misaligned access", + .errstr = "misaligned context access", .result = REJECT, + .flags = F_LOAD_WITH_STRICT_ALIGNMENT, }, { "check cb access: word, unaligned 2", @@@ -1335,9 -1331,8 +1336,9 @@@ offsetof(struct __sk_buff, cb[4]) + 1), BPF_EXIT_INSN(), }, - .errstr = "misaligned access", + .errstr = "misaligned context access", .result = REJECT, + .flags = F_LOAD_WITH_STRICT_ALIGNMENT, }, { "check cb access: word, unaligned 3", @@@ -1347,9 -1342,8 +1348,9 @@@ offsetof(struct __sk_buff, cb[4]) + 2), BPF_EXIT_INSN(), }, - .errstr = "misaligned access", + .errstr = "misaligned context access", .result = REJECT, + .flags = F_LOAD_WITH_STRICT_ALIGNMENT, }, { "check cb access: word, unaligned 4", @@@ -1359,9 -1353,8 +1360,9 @@@ offsetof(struct __sk_buff, cb[4]) + 3), BPF_EXIT_INSN(), }, - .errstr = "misaligned access", + .errstr = "misaligned context access", .result = REJECT, + .flags = F_LOAD_WITH_STRICT_ALIGNMENT, }, { "check cb access: double", @@@ -1387,9 -1380,8 +1388,9 @@@ offsetof(struct __sk_buff, cb[1])), BPF_EXIT_INSN(), }, - .errstr = "misaligned access", + .errstr = "misaligned context access", .result = REJECT, + .flags = F_LOAD_WITH_STRICT_ALIGNMENT, }, { "check cb access: double, unaligned 2", @@@ -1399,9 -1391,8 +1400,9 @@@ offsetof(struct __sk_buff, cb[3])), BPF_EXIT_INSN(), }, - .errstr = "misaligned access", + .errstr = "misaligned context access", .result = REJECT, + .flags = F_LOAD_WITH_STRICT_ALIGNMENT, }, { "check cb access: double, oob 1", @@@ -1533,8 -1524,7 +1534,8 @@@ BPF_EXIT_INSN(), }, .result = REJECT, - .errstr = "misaligned access off -6 size 8", + .errstr = "misaligned stack access off (0x0; 0x0)+-8+2 size 8", + .flags = F_LOAD_WITH_STRICT_ALIGNMENT, }, { "PTR_TO_STACK store/load - bad alignment on reg", @@@ -1546,8 -1536,7 +1547,8 @@@ BPF_EXIT_INSN(), }, .result = REJECT, - .errstr = "misaligned access off -2 size 8", + .errstr = "misaligned stack access off (0x0; 0x0)+-10+8 size 8", + .flags = F_LOAD_WITH_STRICT_ALIGNMENT, }, { "PTR_TO_STACK store/load - out of bounds low", @@@ -1591,6 -1580,8 +1592,6 @@@ BPF_EXIT_INSN(), }, .result = ACCEPT, - .result_unpriv = REJECT, - .errstr_unpriv = "R1 pointer arithmetic", }, { "unpriv: add pointer to pointer", @@@ -1601,7 -1592,7 +1602,7 @@@ }, .result = ACCEPT, .result_unpriv = REJECT, - .errstr_unpriv = "R1 pointer arithmetic", + .errstr_unpriv = "R1 pointer += pointer", }, { "unpriv: neg pointer", @@@ -1942,7 -1933,10 +1943,7 @@@ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, -8), BPF_EXIT_INSN(), }, - .errstr_unpriv = "pointer arithmetic prohibited", - .result_unpriv = REJECT, - .errstr = "R1 invalid mem access", - .result = REJECT, + .result = ACCEPT, }, { "unpriv: cmp of stack pointer", @@@ -2006,7 -2000,7 +2007,7 @@@ BPF_EXIT_INSN(), }, .result = REJECT, - .errstr = "invalid stack type R3", + .errstr = "R4 min value is negative", .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { @@@ -2023,7 -2017,7 +2024,7 @@@ BPF_EXIT_INSN(), }, .result = REJECT, - .errstr = "invalid stack type R3", + .errstr = "R4 min value is negative", .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { @@@ -2225,7 -2219,7 +2226,7 @@@ BPF_EXIT_INSN(), }, .result = REJECT, - .errstr = "invalid stack type R3 off=-1 access_size=-1", + .errstr = "R4 min value is negative", .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { @@@ -2242,7 -2236,7 +2243,7 @@@ BPF_EXIT_INSN(), }, .result = REJECT, - .errstr = "invalid stack type R3 off=-1 access_size=2147483647", + .errstr = "R4 unbounded memory access, use 'var &= const' or 'if (var < const)'", .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { @@@ -2259,7 -2253,7 +2260,7 @@@ BPF_EXIT_INSN(), }, .result = REJECT, - .errstr = "invalid stack type R3 off=-512 access_size=2147483647", + .errstr = "R4 unbounded memory access, use 'var &= const' or 'if (var < const)'", .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { @@@ -2330,8 -2324,8 +2331,8 @@@ offsetof(struct __sk_buff, data)), BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_4), BPF_MOV64_REG(BPF_REG_2, BPF_REG_1), - BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 48), - BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 48), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 49), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 49), BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_2), BPF_MOV64_REG(BPF_REG_2, BPF_REG_3), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 8), @@@ -2659,7 -2653,7 +2660,7 @@@ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1), BPF_JMP_A(-6), }, - .errstr = "misaligned packet access off 2+15+-4 size 4", + .errstr = "misaligned packet access off 2+(0x0; 0x0)+15+-4 size 4", .result = REJECT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .flags = F_LOAD_WITH_STRICT_ALIGNMENT, @@@ -2710,11 -2704,11 +2711,11 @@@ BPF_MOV64_IMM(BPF_REG_0, 0xffffffff), BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), - BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xffff), + BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0x7fff), BPF_MOV64_REG(BPF_REG_4, BPF_REG_0), BPF_ALU64_REG(BPF_ADD, BPF_REG_4, BPF_REG_2), BPF_MOV64_REG(BPF_REG_5, BPF_REG_4), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 0xffff - 1), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 0x7fff - 1), BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 1), BPF_STX_MEM(BPF_DW, BPF_REG_5, BPF_REG_4, 0), BPF_MOV64_IMM(BPF_REG_0, 0), @@@ -2736,10 -2730,10 +2737,10 @@@ BPF_MOV64_IMM(BPF_REG_4, 0xffffffff), BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_4, -8), BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8), - BPF_ALU64_IMM(BPF_AND, BPF_REG_4, 0xffff), + BPF_ALU64_IMM(BPF_AND, BPF_REG_4, 0x7fff), BPF_ALU64_REG(BPF_ADD, BPF_REG_4, BPF_REG_2), BPF_MOV64_REG(BPF_REG_5, BPF_REG_4), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 0xffff - 1), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 0x7fff - 1), BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 1), BPF_STX_MEM(BPF_DW, BPF_REG_5, BPF_REG_4, 0), BPF_MOV64_IMM(BPF_REG_0, 0), @@@ -2765,7 -2759,7 +2766,7 @@@ BPF_MOV64_IMM(BPF_REG_4, 0xffffffff), BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_4, -8), BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8), - BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 48), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 49), BPF_ALU64_REG(BPF_ADD, BPF_REG_4, BPF_REG_2), BPF_MOV64_REG(BPF_REG_0, BPF_REG_4), BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 2), @@@ -2802,7 -2796,7 +2803,7 @@@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "cannot add integer value with 47 upper zero bits to ptr_to_packet", + .errstr = "invalid access to packet, off=0 size=8, R5(id=1,off=0,r=0)", }, { "direct packet access: test24 (x += pkt_ptr, 5)", @@@ -2820,7 -2814,7 +2821,7 @@@ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_4), BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2), BPF_MOV64_REG(BPF_REG_5, BPF_REG_0), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 0xffff - 1), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 0x7fff - 1), BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1), BPF_STX_MEM(BPF_DW, BPF_REG_5, BPF_REG_0, 0), BPF_MOV64_IMM(BPF_REG_0, 0), @@@ -3119,7 -3113,7 +3120,7 @@@ .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { - "helper access to packet: test14, cls helper fail sub", + "helper access to packet: test14, cls helper ok sub", .insns = { BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1, offsetof(struct __sk_buff, data)), @@@ -3139,36 -3133,12 +3140,36 @@@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "helper access to packet: test15, cls helper fail sub", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7), + BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_7, 6), + BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 12), + BPF_MOV64_IMM(BPF_REG_2, 4), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_MOV64_IMM(BPF_REG_5, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_csum_diff), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, .result = REJECT, - .errstr = "type=inv expected=fp", + .errstr = "invalid access to packet", .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { - "helper access to packet: test15, cls helper fail range 1", + "helper access to packet: test16, cls helper fail range 1", .insns = { BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1, offsetof(struct __sk_buff, data)), @@@ -3193,7 -3163,7 +3194,7 @@@ .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { - "helper access to packet: test16, cls helper fail range 2", + "helper access to packet: test17, cls helper fail range 2", .insns = { BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1, offsetof(struct __sk_buff, data)), @@@ -3214,11 -3184,11 +3215,11 @@@ BPF_EXIT_INSN(), }, .result = REJECT, - .errstr = "invalid access to packet", + .errstr = "R2 min value is negative", .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { - "helper access to packet: test17, cls helper fail range 3", + "helper access to packet: test18, cls helper fail range 3", .insns = { BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1, offsetof(struct __sk_buff, data)), @@@ -3239,11 -3209,11 +3240,11 @@@ BPF_EXIT_INSN(), }, .result = REJECT, - .errstr = "invalid access to packet", + .errstr = "R2 min value is negative", .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { - "helper access to packet: test18, cls helper fail range zero", + "helper access to packet: test19, cls helper fail range zero", .insns = { BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1, offsetof(struct __sk_buff, data)), @@@ -3268,7 -3238,7 +3269,7 @@@ .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { - "helper access to packet: test19, pkt end as input", + "helper access to packet: test20, pkt end as input", .insns = { BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1, offsetof(struct __sk_buff, data)), @@@ -3293,7 -3263,7 +3294,7 @@@ .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { - "helper access to packet: test20, wrong reg", + "helper access to packet: test21, wrong reg", .insns = { BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1, offsetof(struct __sk_buff, data)), @@@ -3353,7 -3323,7 +3354,7 @@@ BPF_EXIT_INSN(), }, .fixup_map2 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", + .errstr_unpriv = "R0 leaks addr", .result_unpriv = REJECT, .result = ACCEPT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, @@@ -3377,7 -3347,7 +3378,7 @@@ BPF_EXIT_INSN(), }, .fixup_map2 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", + .errstr_unpriv = "R0 leaks addr", .result_unpriv = REJECT, .result = ACCEPT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, @@@ -3405,7 -3375,7 +3406,7 @@@ BPF_EXIT_INSN(), }, .fixup_map2 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", + .errstr_unpriv = "R0 leaks addr", .result_unpriv = REJECT, .result = ACCEPT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, @@@ -3446,7 -3416,9 +3447,7 @@@ BPF_EXIT_INSN(), }, .fixup_map2 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", .errstr = "R0 min value is outside of the array range", - .result_unpriv = REJECT, .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, @@@ -3468,7 -3440,9 +3469,7 @@@ BPF_EXIT_INSN(), }, .fixup_map2 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", - .errstr = "R0 min value is negative, either use unsigned index or do a if (index >=0) check.", - .result_unpriv = REJECT, + .errstr = "R0 unbounded memory access, make sure to bounds check any array access into a map", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, @@@ -3482,7 -3456,7 +3483,7 @@@ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), - BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0), BPF_MOV32_IMM(BPF_REG_2, MAX_ENTRIES), BPF_JMP_REG(BPF_JSGT, BPF_REG_2, BPF_REG_1, 1), BPF_MOV32_IMM(BPF_REG_1, 0), @@@ -3493,8 -3467,8 +3494,8 @@@ BPF_EXIT_INSN(), }, .fixup_map2 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", - .errstr = "R0 min value is negative, either use unsigned index or do a if (index >=0) check.", + .errstr_unpriv = "R0 leaks addr", + .errstr = "R0 unbounded memory access", .result_unpriv = REJECT, .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, @@@ -3520,7 -3494,7 +3521,7 @@@ BPF_EXIT_INSN(), }, .fixup_map2 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", + .errstr_unpriv = "R0 leaks addr", .errstr = "invalid access to map value, value_size=48 off=44 size=8", .result_unpriv = REJECT, .result = REJECT, @@@ -3550,8 -3524,8 +3551,8 @@@ BPF_EXIT_INSN(), }, .fixup_map2 = { 3, 11 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", - .errstr = "R0 min value is negative, either use unsigned index or do a if (index >=0) check.", + .errstr_unpriv = "R0 pointer += pointer", + .errstr = "R0 invalid mem access 'inv'", .result_unpriv = REJECT, .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, @@@ -3693,6 -3667,34 +3694,6 @@@ .prog_type = BPF_PROG_TYPE_SCHED_CLS }, { - "multiple registers share map_lookup_elem bad reg type", - .insns = { - BPF_MOV64_IMM(BPF_REG_1, 10), - BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, - BPF_FUNC_map_lookup_elem), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), - BPF_MOV64_REG(BPF_REG_3, BPF_REG_0), - BPF_MOV64_REG(BPF_REG_4, BPF_REG_0), - BPF_MOV64_REG(BPF_REG_5, BPF_REG_0), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), - BPF_MOV64_IMM(BPF_REG_1, 1), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), - BPF_MOV64_IMM(BPF_REG_1, 2), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_3, 0, 1), - BPF_ST_MEM(BPF_DW, BPF_REG_3, 0, 0), - BPF_MOV64_IMM(BPF_REG_1, 3), - BPF_EXIT_INSN(), - }, - .fixup_map1 = { 4 }, - .result = REJECT, - .errstr = "R3 invalid mem access 'inv'", - .prog_type = BPF_PROG_TYPE_SCHED_CLS - }, - { "invalid map access from else condition", .insns = { BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), @@@ -3710,9 -3712,9 +3711,9 @@@ BPF_EXIT_INSN(), }, .fixup_map2 = { 3 }, - .errstr = "R0 unbounded memory access, make sure to bounds check any array access into a map", + .errstr = "R0 unbounded memory access", .result = REJECT, - .errstr_unpriv = "R0 pointer arithmetic prohibited", + .errstr_unpriv = "R0 leaks addr", .result_unpriv = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, @@@ -4090,7 -4092,7 +4091,7 @@@ BPF_EXIT_INSN(), }, .fixup_map2 = { 3 }, - .errstr = "invalid access to map value, value_size=48 off=0 size=-8", + .errstr = "R2 min value is negative", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@@ -4156,7 -4158,7 +4157,7 @@@ BPF_EXIT_INSN(), }, .fixup_map2 = { 3 }, - .errstr = "R1 min value is outside of the array range", + .errstr = "invalid access to map value, value_size=48 off=4 size=0", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@@ -4202,7 -4204,7 +4203,7 @@@ BPF_EXIT_INSN(), }, .fixup_map2 = { 3 }, - .errstr = "invalid access to map value, value_size=48 off=4 size=-8", + .errstr = "R2 min value is negative", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@@ -4224,7 -4226,7 +4225,7 @@@ BPF_EXIT_INSN(), }, .fixup_map2 = { 3 }, - .errstr = "R1 min value is outside of the array range", + .errstr = "R2 min value is negative", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@@ -4340,7 -4342,7 +4341,7 @@@ BPF_EXIT_INSN(), }, .fixup_map2 = { 3 }, - .errstr = "invalid access to map value, value_size=48 off=4 size=-8", + .errstr = "R2 min value is negative", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@@ -4363,7 -4365,7 +4364,7 @@@ BPF_EXIT_INSN(), }, .fixup_map2 = { 3 }, - .errstr = "R1 min value is outside of the array range", + .errstr = "R2 min value is negative", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@@ -4451,13 -4453,13 +4452,13 @@@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), - BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_2, 1), BPF_MOV64_IMM(BPF_REG_3, 0), BPF_EMIT_CALL(BPF_FUNC_probe_read), BPF_EXIT_INSN(), }, .fixup_map2 = { 3 }, - .errstr = "R1 min value is negative, either use unsigned index or do a if (index >=0) check", + .errstr = "R1 unbounded memory access", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@@ -4577,7 -4579,7 +4578,7 @@@ BPF_EXIT_INSN(), }, .fixup_map2 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", + .errstr_unpriv = "R0 leaks addr", .result = ACCEPT, .result_unpriv = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, @@@ -4605,7 -4607,7 +4606,7 @@@ BPF_EXIT_INSN(), }, .fixup_map2 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", + .errstr_unpriv = "R0 leaks addr", .result = ACCEPT, .result_unpriv = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, @@@ -4624,7 -4626,7 +4625,7 @@@ BPF_EXIT_INSN(), }, .fixup_map2 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", + .errstr_unpriv = "R0 bitwise operator &= on pointer", .errstr = "invalid mem access 'inv'", .result = REJECT, .result_unpriv = REJECT, @@@ -4643,7 -4645,7 +4644,7 @@@ BPF_EXIT_INSN(), }, .fixup_map2 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", + .errstr_unpriv = "R0 32-bit pointer arithmetic prohibited", .errstr = "invalid mem access 'inv'", .result = REJECT, .result_unpriv = REJECT, @@@ -4662,7 -4664,7 +4663,7 @@@ BPF_EXIT_INSN(), }, .fixup_map2 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", + .errstr_unpriv = "R0 pointer arithmetic with /= operator", .errstr = "invalid mem access 'inv'", .result = REJECT, .result_unpriv = REJECT, @@@ -4705,8 -4707,10 +4706,8 @@@ BPF_EXIT_INSN(), }, .fixup_map2 = { 3 }, - .errstr_unpriv = "R0 invalid mem access 'inv'", .errstr = "R0 invalid mem access 'inv'", .result = REJECT, - .result_unpriv = REJECT, }, { "map element value is preserved across register spilling", @@@ -4728,7 -4732,7 +4729,7 @@@ BPF_EXIT_INSN(), }, .fixup_map2 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", + .errstr_unpriv = "R0 leaks addr", .result = ACCEPT, .result_unpriv = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, @@@ -4910,8 -4914,7 +4911,8 @@@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .errstr = "R2 unbounded memory access", + /* because max wasn't checked, signed min is negative */ + .errstr = "R2 min value is negative, either use unsigned or 'var &= const'", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@@ -5060,20 -5063,6 +5061,20 @@@ .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, { + "helper access to variable memory: size = 0 allowed on NULL", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_MOV64_IMM(BPF_REG_5, 0), + BPF_EMIT_CALL(BPF_FUNC_csum_diff), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { "helper access to variable memory: size > 0 not allowed on NULL", .insns = { BPF_MOV64_IMM(BPF_REG_1, 0), @@@ -5087,7 -5076,7 +5088,7 @@@ BPF_EMIT_CALL(BPF_FUNC_csum_diff), BPF_EXIT_INSN(), }, - .errstr = "R1 type=imm expected=fp", + .errstr = "R1 type=inv expected=fp", .result = REJECT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, @@@ -5172,7 -5161,7 +5173,7 @@@ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), - BPF_MOV64_IMM(BPF_REG_1, 6), + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), BPF_ALU64_IMM(BPF_AND, BPF_REG_1, -4), BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 2), BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), @@@ -5181,8 -5170,10 +5182,8 @@@ BPF_EXIT_INSN(), }, .fixup_map2 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", - .errstr = "R0 min value is negative, either use unsigned index or do a if (index >=0) check.", + .errstr = "R0 max value is outside of the array range", .result = REJECT, - .result_unpriv = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { @@@ -5211,8 -5202,10 +5212,8 @@@ BPF_EXIT_INSN(), }, .fixup_map2 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", - .errstr = "R0 min value is negative, either use unsigned index or do a if (index >=0) check.", + .errstr = "R0 max value is outside of the array range", .result = REJECT, - .result_unpriv = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { @@@ -5259,7 -5252,7 +5260,7 @@@ }, .fixup_map_in_map = { 3 }, .errstr = "R1 type=inv expected=map_ptr", - .errstr_unpriv = "R1 pointer arithmetic prohibited", + .errstr_unpriv = "R1 pointer arithmetic on CONST_PTR_TO_MAP prohibited", .result = REJECT, }, { @@@ -5430,7 -5423,7 +5431,7 @@@ "check bpf_perf_event_data->sample_period byte load permitted", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), - #ifdef __LITTLE_ENDIAN + #if __BYTE_ORDER == __LITTLE_ENDIAN BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, offsetof(struct bpf_perf_event_data, sample_period)), #else @@@ -5446,7 -5439,7 +5447,7 @@@ "check bpf_perf_event_data->sample_period half load permitted", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), - #ifdef __LITTLE_ENDIAN + #if __BYTE_ORDER == __LITTLE_ENDIAN BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, offsetof(struct bpf_perf_event_data, sample_period)), #else @@@ -5462,7 -5455,7 +5463,7 @@@ "check bpf_perf_event_data->sample_period word load permitted", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), - #ifdef __LITTLE_ENDIAN + #if __BYTE_ORDER == __LITTLE_ENDIAN BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, offsetof(struct bpf_perf_event_data, sample_period)), #else @@@ -5489,7 -5482,7 +5490,7 @@@ "check skb->data half load not permitted", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), - #ifdef __LITTLE_ENDIAN + #if __BYTE_ORDER == __LITTLE_ENDIAN BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, offsetof(struct __sk_buff, data)), #else @@@ -5505,7 -5498,7 +5506,7 @@@ "check skb->tc_classid half load not permitted for lwt prog", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), - #ifdef __LITTLE_ENDIAN + #if __BYTE_ORDER == __LITTLE_ENDIAN BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, offsetof(struct __sk_buff, tc_classid)), #else @@@ -5539,8 -5532,10 +5540,8 @@@ BPF_EXIT_INSN(), }, .fixup_map1 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", .errstr = "R0 min value is negative", .result = REJECT, - .result_unpriv = REJECT, }, { "bounds checks mixing signed and unsigned", @@@ -5563,8 -5558,10 +5564,8 @@@ BPF_EXIT_INSN(), }, .fixup_map1 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", .errstr = "R0 min value is negative", .result = REJECT, - .result_unpriv = REJECT, }, { "bounds checks mixing signed and unsigned, variant 2", @@@ -5589,8 -5586,10 +5590,8 @@@ BPF_EXIT_INSN(), }, .fixup_map1 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", .errstr = "R8 invalid mem access 'inv'", .result = REJECT, - .result_unpriv = REJECT, }, { "bounds checks mixing signed and unsigned, variant 3", @@@ -5614,8 -5613,10 +5615,8 @@@ BPF_EXIT_INSN(), }, .fixup_map1 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", .errstr = "R8 invalid mem access 'inv'", .result = REJECT, - .result_unpriv = REJECT, }, { "bounds checks mixing signed and unsigned, variant 4", @@@ -5638,7 -5639,10 +5639,7 @@@ BPF_EXIT_INSN(), }, .fixup_map1 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", - .errstr = "R0 min value is negative", - .result = REJECT, - .result_unpriv = REJECT, + .result = ACCEPT, }, { "bounds checks mixing signed and unsigned, variant 5", @@@ -5662,8 -5666,10 +5663,8 @@@ BPF_EXIT_INSN(), }, .fixup_map1 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", - .errstr = "R0 invalid mem access", + .errstr = "R0 min value is negative", .result = REJECT, - .result_unpriv = REJECT, }, { "bounds checks mixing signed and unsigned, variant 6", @@@ -5684,8 -5690,10 +5685,8 @@@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .errstr_unpriv = "R4 min value is negative, either use unsigned", .errstr = "R4 min value is negative, either use unsigned", .result = REJECT, - .result_unpriv = REJECT, }, { "bounds checks mixing signed and unsigned, variant 7", @@@ -5708,7 -5716,10 +5709,7 @@@ BPF_EXIT_INSN(), }, .fixup_map1 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", - .errstr = "R0 min value is negative", - .result = REJECT, - .result_unpriv = REJECT, + .result = ACCEPT, }, { "bounds checks mixing signed and unsigned, variant 8", @@@ -5719,6 -5730,32 +5720,6 @@@ BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), - BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), - BPF_MOV64_IMM(BPF_REG_2, 1024 * 1024 * 1024 + 1), - BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, 3), - BPF_JMP_IMM(BPF_JSGT, BPF_REG_1, 1, 2), - BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), - BPF_ST_MEM(BPF_B, BPF_REG_0, 0, 0), - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN(), - }, - .fixup_map1 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", - .errstr = "R0 min value is negative", - .result = REJECT, - .result_unpriv = REJECT, - }, - { - "bounds checks mixing signed and unsigned, variant 9", - .insns = { - BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, - BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), @@@ -5733,11 -5770,13 +5734,11 @@@ BPF_EXIT_INSN(), }, .fixup_map1 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", .errstr = "R0 min value is negative", .result = REJECT, - .result_unpriv = REJECT, }, { - "bounds checks mixing signed and unsigned, variant 10", + "bounds checks mixing signed and unsigned, variant 9", .insns = { BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), @@@ -5759,10 -5798,13 +5760,10 @@@ BPF_EXIT_INSN(), }, .fixup_map1 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", - .errstr = "R0 min value is negative", - .result = REJECT, - .result_unpriv = REJECT, + .result = ACCEPT, }, { - "bounds checks mixing signed and unsigned, variant 11", + "bounds checks mixing signed and unsigned, variant 10", .insns = { BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), @@@ -5784,11 -5826,13 +5785,11 @@@ BPF_EXIT_INSN(), }, .fixup_map1 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", .errstr = "R0 min value is negative", .result = REJECT, - .result_unpriv = REJECT, }, { - "bounds checks mixing signed and unsigned, variant 12", + "bounds checks mixing signed and unsigned, variant 11", .insns = { BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), @@@ -5811,11 -5855,13 +5812,11 @@@ BPF_EXIT_INSN(), }, .fixup_map1 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", .errstr = "R0 min value is negative", .result = REJECT, - .result_unpriv = REJECT, }, { - "bounds checks mixing signed and unsigned, variant 13", + "bounds checks mixing signed and unsigned, variant 12", .insns = { BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), @@@ -5837,11 -5883,13 +5838,11 @@@ BPF_EXIT_INSN(), }, .fixup_map1 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", .errstr = "R0 min value is negative", .result = REJECT, - .result_unpriv = REJECT, }, { - "bounds checks mixing signed and unsigned, variant 14", + "bounds checks mixing signed and unsigned, variant 13", .insns = { BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), @@@ -5866,11 -5914,13 +5867,11 @@@ BPF_EXIT_INSN(), }, .fixup_map1 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", .errstr = "R0 min value is negative", .result = REJECT, - .result_unpriv = REJECT, }, { - "bounds checks mixing signed and unsigned, variant 15", + "bounds checks mixing signed and unsigned, variant 14", .insns = { BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_1, offsetof(struct __sk_buff, mark)), @@@ -5896,11 -5946,13 +5897,11 @@@ BPF_JMP_IMM(BPF_JA, 0, 0, -7), }, .fixup_map1 = { 4 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", .errstr = "R0 min value is negative", .result = REJECT, - .result_unpriv = REJECT, }, { - "bounds checks mixing signed and unsigned, variant 16", + "bounds checks mixing signed and unsigned, variant 15", .insns = { BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), @@@ -5924,13 -5976,13 +5925,13 @@@ BPF_EXIT_INSN(), }, .fixup_map1 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", + .errstr_unpriv = "R0 pointer comparison prohibited", .errstr = "R0 min value is negative", .result = REJECT, .result_unpriv = REJECT, }, { - "subtraction bounds (map value)", + "subtraction bounds (map value) variant 1", .insns = { BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), @@@ -5952,74 -6004,10 +5953,74 @@@ BPF_EXIT_INSN(), }, .fixup_map1 = { 3 }, - .errstr_unpriv = "R0 pointer arithmetic prohibited", + .errstr = "R0 max value is outside of the array range", + .result = REJECT, + }, + { + "subtraction bounds (map value) variant 2", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JGT, BPF_REG_1, 0xff, 6), + BPF_LDX_MEM(BPF_B, BPF_REG_3, BPF_REG_0, 1), + BPF_JMP_IMM(BPF_JGT, BPF_REG_3, 0xff, 4), + BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_3), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 3 }, .errstr = "R0 min value is negative, either use unsigned index or do a if (index >=0) check.", .result = REJECT, - .result_unpriv = REJECT, + }, + { + "variable-offset ctx access", + .insns = { + /* Get an unknown value */ + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0), + /* Make it small and 4-byte aligned */ + BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 4), + /* add it to skb. We now have either &skb->len or + * &skb->pkt_type, but we don't know which + */ + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2), + /* dereference it */ + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .errstr = "variable ctx access var_off=(0x0; 0x4)", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_LWT_IN, + }, + { + "variable-offset stack access", + .insns = { + /* Fill the top 8 bytes of the stack */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + /* Get an unknown value */ + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0), + /* Make it small and 4-byte aligned */ + BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 4), + BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, 8), + /* add it to fp. We now have either fp-4 or fp-8, but + * we don't know which + */ + BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_10), + /* dereference it */ + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 0), + BPF_EXIT_INSN(), + }, + .errstr = "variable stack access var_off=(0xfffffffffffffff8; 0x4)", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_LWT_IN, }, };
linux-merge@lists.open-mesh.org