The following commit has been merged in the master branch:
commit d489ded1a3690d7eca8633575cba3f7dac8484c7
Merge: 86dd9868b8788a9063893a97649594af93cd5aa6 3af409ca278d4a8d50e91f9f7c4c33b175645cf3
Author: David S. Miller <davem@davemloft.net>
Date:   Tue Feb 16 17:30:20 2021 -0800
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
diff --combined Documentation/networking/ip-sysctl.rst index 581bfce86dca,1b7f8debada6..c7952ac5bd2f --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@@ -178,27 -178,6 +178,27 @@@ min_adv_mss - INTEGE The advertised MSS depends on the first hop route MTU, but will never be lower than this setting.
+fib_notify_on_flag_change - INTEGER + Whether to emit RTM_NEWROUTE notifications whenever RTM_F_OFFLOAD/ + RTM_F_TRAP/RTM_F_OFFLOAD_FAILED flags are changed. + + After installing a route to the kernel, user space receives an + acknowledgment, which means the route was installed in the kernel, + but not necessarily in hardware. + It is also possible for a route already installed in hardware to change + its action and therefore its flags. For example, a host route that is + trapping packets can be "promoted" to perform decapsulation following + the installation of an IPinIP/VXLAN tunnel. + The notifications will indicate to user-space the state of the route. + + Default: 0 (Do not emit notifications.) + + Possible values: + + - 0 - Do not emit notifications. + - 1 - Emit notifications. + - 2 - Emit notifications only for RTM_F_OFFLOAD_FAILED flag change. + IP Fragmentation:
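A minimal user-space sketch of driving this knob, assuming the usual procfs mapping of net.ipv4.fib_notify_on_flag_change (the value 2 selects notifications only for RTM_F_OFFLOAD_FAILED changes); this is illustrative only and not part of the patch:

/* Sketch only: write one of the documented values (0, 1 or 2) to the
 * IPv4 fib_notify_on_flag_change sysctl via procfs.
 */
#include <stdio.h>

int main(void)
{
	const char *path = "/proc/sys/net/ipv4/fib_notify_on_flag_change";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fprintf(f, "2\n");	/* notify only on RTM_F_OFFLOAD_FAILED changes */
	return fclose(f) ? 1 : 0;
}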
ipfrag_high_thresh - LONG INTEGER @@@ -651,16 -630,15 +651,15 @@@ tcp_rmem - vector of 3 INTEGERs: min, d
default: initial size of receive buffer used by TCP sockets. This value overrides net.core.rmem_default used by other protocols. - Default: 87380 bytes. This value results in window of 65535 with - default setting of tcp_adv_win_scale and tcp_app_win:0 and a bit - less for default tcp_app_win. See below about these variables. + Default: 131072 bytes. + This value results in initial window of 65535.
max: maximal size of receive buffer allowed for automatically selected receiver buffers for TCP socket. This value does not override net.core.rmem_max. Calling setsockopt() with SO_RCVBUF disables automatic tuning of that socket's receive buffer size, in which case this value is ignored. - Default: between 87380B and 6MB, depending on RAM size. + Default: between 131072 and 6MB, depending on RAM size.
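A short illustrative sketch of the autotuning note above: a fresh TCP socket starts from the tcp_rmem default, and an explicit setsockopt(SO_RCVBUF) pins the buffer (the kernel may double or clamp the requested value) and turns receive autotuning off for that socket. The 1 MiB request below is only an example value:

/* Query the default receive buffer, then pin it explicitly. */
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int val = 0;
	socklen_t len = sizeof(val);

	if (fd < 0)
		return 1;
	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);
	printf("default SO_RCVBUF: %d bytes\n", val);

	val = 1 << 20;	/* request ~1 MiB; disables autotuning for this socket */
	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);
	printf("pinned SO_RCVBUF:  %d bytes\n", val);
	close(fd);
	return 0;
}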
tcp_sack - BOOLEAN Enable select acknowledgments (SACKS). @@@ -1446,25 -1424,6 +1445,25 @@@ rp_filter - INTEGE Default value is 0. Note that some distributions enable it in startup scripts.
+src_valid_mark - BOOLEAN + - 0 - The fwmark of the packet is not included in reverse path + route lookup. This allows for asymmetric routing configurations + utilizing the fwmark in only one direction, e.g., transparent + proxying. + + - 1 - The fwmark of the packet is included in reverse path route + lookup. This permits rp_filter to function when the fwmark is + used for routing traffic in both directions. + + This setting also affects the utilization of fwmark when + performing source address selection for ICMP replies, or + determining addresses stored for the IPOPT_TS_TSANDADDR and + IPOPT_RR IP options. + + The max value from conf/{all,interface}/src_valid_mark is used. + + Default value is 0. + arp_filter - BOOLEAN - 1 - Allows you to have multiple network interfaces on the same subnet, and have the ARPs for each interface be answered @@@ -1815,27 -1774,6 +1814,27 @@@ nexthop_compat_mode - BOOLEA and extraneous notifications. Default: true (backward compat mode)
+fib_notify_on_flag_change - INTEGER + Whether to emit RTM_NEWROUTE notifications whenever RTM_F_OFFLOAD/ + RTM_F_TRAP/RTM_F_OFFLOAD_FAILED flags are changed. + + After installing a route to the kernel, user space receives an + acknowledgment, which means the route was installed in the kernel, + but not necessarily in hardware. + It is also possible for a route already installed in hardware to change + its action and therefore its flags. For example, a host route that is + trapping packets can be "promoted" to perform decapsulation following + the installation of an IPinIP/VXLAN tunnel. + The notifications will indicate to user-space the state of the route. + + Default: 0 (Do not emit notifications.) + + Possible values: + + - 0 - Do not emit notifications. + - 1 - Emit notifications. + - 2 - Emit notifications only for RTM_F_OFFLOAD_FAILED flag change. + IPv6 Fragmentation:
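On the consumer side, these notifications are ordinary rtnetlink messages; a rough listener sketch (error handling and attribute parsing trimmed; the flag constants such as RTM_F_OFFLOAD and RTM_F_OFFLOAD_FAILED come from <linux/rtnetlink.h>):

/* Subscribe to the IPv4/IPv6 route groups and print the rtm_flags of
 * every RTM_NEWROUTE message, including the flag-change notifications
 * enabled by fib_notify_on_flag_change.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

int main(void)
{
	struct sockaddr_nl addr = {
		.nl_family = AF_NETLINK,
		.nl_groups = RTMGRP_IPV4_ROUTE | RTMGRP_IPV6_ROUTE,
	};
	char buf[8192];
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0 || bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		return 1;

	for (;;) {
		int len = recv(fd, buf, sizeof(buf), 0);
		struct nlmsghdr *nh;

		if (len <= 0)
			break;
		for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
		     nh = NLMSG_NEXT(nh, len)) {
			struct rtmsg *rtm = NLMSG_DATA(nh);

			if (nh->nlmsg_type == RTM_NEWROUTE)
				printf("RTM_NEWROUTE rtm_flags 0x%x\n",
				       rtm->rtm_flags);
		}
	}
	close(fd);
	return 0;
}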
ip6frag_high_thresh - INTEGER @@@ -1944,16 -1882,6 +1943,16 @@@ accept_ra_defrtr - BOOLEA - enabled if accept_ra is enabled. - disabled if accept_ra is disabled.
+ra_defrtr_metric - UNSIGNED INTEGER + Route metric for default route learned in Router Advertisement. This value + will be assigned as metric for the default route learned via IPv6 Router + Advertisement. Takes effect only if accept_ra_defrtr is enabled. + + Possible values: + 1 to 0xFFFFFFFF + + Default: IP6_RT_PRIO_USER i.e. 1024. + accept_ra_from_local - BOOLEAN Accept RA with source-address that is found on local machine if the RA is otherwise proper and able to be accepted. diff --combined drivers/net/ethernet/amd/xgbe/xgbe-drv.c index 99b6d5a9f1d9,395eb0b52680..4f714f874c4f --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@@ -1368,6 -1368,7 +1368,7 @@@ static void xgbe_stop(struct xgbe_prv_d return;
netif_tx_stop_all_queues(netdev); + netif_carrier_off(pdata->netdev);
xgbe_stop_timers(pdata); flush_workqueue(pdata->dev_workqueue); @@@ -2295,6 -2296,8 +2296,6 @@@ static const struct net_device_ops xgbe .ndo_setup_tc = xgbe_setup_tc, .ndo_fix_features = xgbe_fix_features, .ndo_set_features = xgbe_set_features, - .ndo_udp_tunnel_add = udp_tunnel_nic_add_port, - .ndo_udp_tunnel_del = udp_tunnel_nic_del_port, .ndo_features_check = xgbe_features_check, };
diff --combined drivers/net/ethernet/broadcom/bnxt/bnxt.c index d0f3f68faa91,1c96b7ba24f2..a680fd9c68ea --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@@ -255,9 -255,7 +255,9 @@@ static const u16 bnxt_async_events_arr[ ASYNC_EVENT_CMPL_EVENT_ID_PORT_PHY_CFG_CHANGE, ASYNC_EVENT_CMPL_EVENT_ID_RESET_NOTIFY, ASYNC_EVENT_CMPL_EVENT_ID_ERROR_RECOVERY, + ASYNC_EVENT_CMPL_EVENT_ID_DEBUG_NOTIFICATION, ASYNC_EVENT_CMPL_EVENT_ID_RING_MONITOR_MSG, + ASYNC_EVENT_CMPL_EVENT_ID_ECHO_REQUEST, };
static struct workqueue_struct *bnxt_pf_wq; @@@ -1267,7 -1265,8 +1267,7 @@@ static void bnxt_tpa_start(struct bnxt } else { tpa_info->hash_type = PKT_HASH_TYPE_NONE; tpa_info->gso_type = 0; - if (netif_msg_rx_err(bp)) - netdev_warn(bp->dev, "TPA packet without valid hash\n"); + netif_warn(bp, rx_err, bp->dev, "TPA packet without valid hash\n"); } tpa_info->flags2 = le32_to_cpu(tpa_start1->rx_tpa_start_cmp_flags2); tpa_info->metadata = le32_to_cpu(tpa_start1->rx_tpa_start_cmp_metadata); @@@ -2022,9 -2021,10 +2022,9 @@@ static int bnxt_async_event_process(str goto async_event_process_exit; set_bit(BNXT_RESET_TASK_SILENT_SP_EVENT, &bp->sp_event); break; - case ASYNC_EVENT_CMPL_EVENT_ID_RESET_NOTIFY: - if (netif_msg_hw(bp)) - netdev_warn(bp->dev, "Received RESET_NOTIFY event, data1: 0x%x, data2: 0x%x\n", - data1, data2); + case ASYNC_EVENT_CMPL_EVENT_ID_RESET_NOTIFY: { + char *fatal_str = "non-fatal"; + if (!bp->fw_health) goto async_event_process_exit;
@@@ -2036,17 -2036,14 +2036,17 @@@ if (!bp->fw_reset_max_dsecs) bp->fw_reset_max_dsecs = BNXT_DFLT_FW_RST_MAX_DSECS; if (EVENT_DATA1_RESET_NOTIFY_FATAL(data1)) { - netdev_warn(bp->dev, "Firmware fatal reset event received\n"); + fatal_str = "fatal"; set_bit(BNXT_STATE_FW_FATAL_COND, &bp->state); - } else { - netdev_warn(bp->dev, "Firmware non-fatal reset event received, max wait time %d msec\n", - bp->fw_reset_max_dsecs * 100); } + netif_warn(bp, hw, bp->dev, + "Firmware %s reset event, data1: 0x%x, data2: 0x%x, min wait %u ms, max wait %u ms\n", + fatal_str, data1, data2, + bp->fw_reset_min_dsecs * 100, + bp->fw_reset_max_dsecs * 100); set_bit(BNXT_FW_RESET_NOTIFY_SP_EVENT, &bp->sp_event); break; + } case ASYNC_EVENT_CMPL_EVENT_ID_ERROR_RECOVERY: { struct bnxt_fw_health *fw_health = bp->fw_health;
@@@ -2055,11 -2052,16 +2055,11 @@@
fw_health->enabled = EVENT_DATA1_RECOVERY_ENABLED(data1); fw_health->master = EVENT_DATA1_RECOVERY_MASTER_FUNC(data1); - if (!fw_health->enabled) + if (!fw_health->enabled) { + netif_info(bp, drv, bp->dev, + "Error recovery info: error recovery[0]\n"); break; - - if (netif_msg_drv(bp)) - netdev_info(bp->dev, "Error recovery info: error recovery[%d], master[%d], reset count[0x%x], health status: 0x%x\n", - fw_health->enabled, fw_health->master, - bnxt_fw_health_readl(bp, - BNXT_FW_RESET_CNT_REG), - bnxt_fw_health_readl(bp, - BNXT_FW_HEALTH_REG)); + } fw_health->tmr_multiplier = DIV_ROUND_UP(fw_health->polling_dsecs * HZ, bp->current_interval * 10); @@@ -2068,17 -2070,8 +2068,17 @@@ bnxt_fw_health_readl(bp, BNXT_FW_HEARTBEAT_REG); fw_health->last_fw_reset_cnt = bnxt_fw_health_readl(bp, BNXT_FW_RESET_CNT_REG); + netif_info(bp, drv, bp->dev, + "Error recovery info: error recovery[1], master[%d], reset count[%u], health status: 0x%x\n", + fw_health->master, fw_health->last_fw_reset_cnt, + bnxt_fw_health_readl(bp, BNXT_FW_HEALTH_REG)); goto async_event_process_exit; } + case ASYNC_EVENT_CMPL_EVENT_ID_DEBUG_NOTIFICATION: + netif_notice(bp, hw, bp->dev, + "Received firmware debug notification, data1: 0x%x, data2: 0x%x\n", + data1, data2); + goto async_event_process_exit; case ASYNC_EVENT_CMPL_EVENT_ID_RING_MONITOR_MSG: { struct bnxt_rx_ring_info *rxr; u16 grp_idx; @@@ -2101,20 -2094,6 +2101,20 @@@ bnxt_sched_reset(bp, rxr); goto async_event_process_exit; } + case ASYNC_EVENT_CMPL_EVENT_ID_ECHO_REQUEST: { + struct bnxt_fw_health *fw_health = bp->fw_health; + + netif_notice(bp, hw, bp->dev, + "Received firmware echo request, data1: 0x%x, data2: 0x%x\n", + data1, data2); + if (fw_health) { + fw_health->echo_req_data1 = data1; + fw_health->echo_req_data2 = data2; + set_bit(BNXT_FW_ECHO_REQUEST_SP_EVENT, &bp->sp_event); + break; + } + goto async_event_process_exit; + } default: goto async_event_process_exit; } @@@ -2415,10 -2394,6 +2415,10 @@@ static int bnxt_poll(struct napi_struc struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; int work_done = 0;
+ if (unlikely(test_bit(BNXT_STATE_FW_FATAL_COND, &bp->state))) { + napi_complete(napi); + return 0; + } while (1) { work_done += bnxt_poll_work(bp, cpr, budget - work_done);
@@@ -2493,10 -2468,6 +2493,10 @@@ static int bnxt_poll_p5(struct napi_str int work_done = 0; u32 cons;
+ if (unlikely(test_bit(BNXT_STATE_FW_FATAL_COND, &bp->state))) { + napi_complete(napi); + return 0; + } if (cpr->has_more_work) { cpr->has_more_work = 0; work_done = __bnxt_poll_cqs(bp, bnapi, budget); @@@ -2704,23 -2675,6 +2704,23 @@@ static void bnxt_free_skbs(struct bnxt bnxt_free_rx_skbs(bp); }
+static void bnxt_init_ctx_mem(struct bnxt_mem_init *mem_init, void *p, int len) +{ + u8 init_val = mem_init->init_val; + u16 offset = mem_init->offset; + u8 *p2 = p; + int i; + + if (!init_val) + return; + if (offset == BNXT_MEM_INVALID_OFFSET) { + memset(p, init_val, len); + return; + } + for (i = 0; i < len; i += mem_init->size) + *(p2 + i + offset) = init_val; +} + static void bnxt_free_ring(struct bnxt *bp, struct bnxt_ring_mem_info *rmem) { struct pci_dev *pdev = bp->pdev; @@@ -2780,9 -2734,9 +2780,9 @@@ static int bnxt_alloc_ring(struct bnxt if (!rmem->pg_arr[i]) return -ENOMEM;
- if (rmem->init_val) - memset(rmem->pg_arr[i], rmem->init_val, - rmem->page_size); + if (rmem->mem_init) + bnxt_init_ctx_mem(rmem->mem_init, rmem->pg_arr[i], + rmem->page_size); if (rmem->nr_pages > 1 || rmem->depth > 0) { if (i == rmem->nr_pages - 2 && (rmem->flags & BNXT_RMEM_RING_PTE_FLAG)) @@@ -4318,9 -4272,6 +4318,9 @@@ static void bnxt_disable_int_sync(struc { int i;
+ if (!bp->irq_tbl) + return; + atomic_inc(&bp->intr_sem);
bnxt_disable_int(bp); @@@ -4474,8 -4425,6 +4474,8 @@@ static int bnxt_hwrm_do_send_msg(struc
if (!timeout) timeout = DFLT_HWRM_CMD_TIMEOUT; + /* Limit timeout to an upper limit */ + timeout = min(timeout, HWRM_CMD_MAX_TIMEOUT); /* convert timeout to usec */ timeout *= 1000;
@@@ -6783,39 -6732,6 +6783,39 @@@ func_qcfg_exit return rc; }
+static void bnxt_init_ctx_initializer(struct bnxt_ctx_mem_info *ctx, + struct hwrm_func_backing_store_qcaps_output *resp) +{ + struct bnxt_mem_init *mem_init; + u16 init_mask; + u8 init_val; + u8 *offset; + int i; + + init_val = resp->ctx_kind_initializer; + init_mask = le16_to_cpu(resp->ctx_init_mask); + offset = &resp->qp_init_offset; + mem_init = &ctx->mem_init[BNXT_CTX_MEM_INIT_QP]; + for (i = 0; i < BNXT_CTX_MEM_INIT_MAX; i++, mem_init++, offset++) { + mem_init->init_val = init_val; + mem_init->offset = BNXT_MEM_INVALID_OFFSET; + if (!init_mask) + continue; + if (i == BNXT_CTX_MEM_INIT_STAT) + offset = &resp->stat_init_offset; + if (init_mask & (1 << i)) + mem_init->offset = *offset * 4; + else + mem_init->init_val = 0; + } + ctx->mem_init[BNXT_CTX_MEM_INIT_QP].size = ctx->qp_entry_size; + ctx->mem_init[BNXT_CTX_MEM_INIT_SRQ].size = ctx->srq_entry_size; + ctx->mem_init[BNXT_CTX_MEM_INIT_CQ].size = ctx->cq_entry_size; + ctx->mem_init[BNXT_CTX_MEM_INIT_VNIC].size = ctx->vnic_entry_size; + ctx->mem_init[BNXT_CTX_MEM_INIT_STAT].size = ctx->stat_entry_size; + ctx->mem_init[BNXT_CTX_MEM_INIT_MRAV].size = ctx->mrav_entry_size; +} + static int bnxt_hwrm_func_backing_store_qcaps(struct bnxt *bp) { struct hwrm_func_backing_store_qcaps_input req = {0}; @@@ -6870,9 -6786,7 +6870,9 @@@ le16_to_cpu(resp->mrav_num_entries_units); ctx->tim_entry_size = le16_to_cpu(resp->tim_entry_size); ctx->tim_max_entries = le32_to_cpu(resp->tim_max_entries); - ctx->ctx_kind_initializer = resp->ctx_kind_initializer; + + bnxt_init_ctx_initializer(ctx, resp); + ctx->tqm_fp_rings_count = resp->tqm_fp_rings_count; if (!ctx->tqm_fp_rings_count) ctx->tqm_fp_rings_count = bp->max_q; @@@ -6902,9 -6816,6 +6902,9 @@@ static void bnxt_hwrm_set_pg_attr(struc { u8 pg_size = 0;
+ if (!rmem->nr_pages) + return; + if (BNXT_PAGE_SHIFT == 13) pg_size = 1 << 4; else if (BNXT_PAGE_SIZE == 16) @@@ -6934,7 -6845,6 +6934,7 @@@ static int bnxt_hwrm_func_backing_store struct hwrm_func_backing_store_cfg_input req = {0}; struct bnxt_ctx_mem_info *ctx = bp->ctx; struct bnxt_ctx_pg_info *ctx_pg; + u32 req_len = sizeof(req); __le32 *num_entries; __le64 *pg_dir; u32 flags = 0; @@@ -6945,8 -6855,6 +6945,8 @@@ if (!ctx) return 0;
+ if (req_len > bp->hwrm_max_ext_req_len) + req_len = BNXT_BACKING_STORE_CFG_LEGACY_LEN; bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_BACKING_STORE_CFG, -1, -1); req.enables = cpu_to_le32(enables);
@@@ -7030,7 -6938,7 +7030,7 @@@ bnxt_hwrm_set_pg_attr(&ctx_pg->ring_mem, pg_attr, pg_dir); } req.flags = cpu_to_le32(flags); - return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + return hwrm_send_message(bp, &req, req_len, HWRM_CMD_TIMEOUT); }
static int bnxt_alloc_ctx_mem_blk(struct bnxt *bp, @@@ -7049,7 -6957,7 +7049,7 @@@
static int bnxt_alloc_ctx_pg_tbls(struct bnxt *bp, struct bnxt_ctx_pg_info *ctx_pg, u32 mem_size, - u8 depth, bool use_init_val) + u8 depth, struct bnxt_mem_init *mem_init) { struct bnxt_ring_mem_info *rmem = &ctx_pg->ring_mem; int rc; @@@ -7087,7 -6995,8 +7087,7 @@@ rmem->pg_tbl_map = ctx_pg->ctx_dma_arr[i]; rmem->depth = 1; rmem->nr_pages = MAX_CTX_PAGES; - if (use_init_val) - rmem->init_val = bp->ctx->ctx_kind_initializer; + rmem->mem_init = mem_init; if (i == (nr_tbls - 1)) { int rem = ctx_pg->nr_pages % MAX_CTX_PAGES;
@@@ -7102,7 -7011,8 +7102,7 @@@ rmem->nr_pages = DIV_ROUND_UP(mem_size, BNXT_PAGE_SIZE); if (rmem->nr_pages > 1 || depth) rmem->depth = 1; - if (use_init_val) - rmem->init_val = bp->ctx->ctx_kind_initializer; + rmem->mem_init = mem_init; rc = bnxt_alloc_ctx_mem_blk(bp, ctx_pg); } return rc; @@@ -7166,7 -7076,6 +7166,7 @@@ static int bnxt_alloc_ctx_mem(struct bn { struct bnxt_ctx_pg_info *ctx_pg; struct bnxt_ctx_mem_info *ctx; + struct bnxt_mem_init *init; u32 mem_size, ena, entries; u32 entries_sp, min; u32 num_mr, num_ah; @@@ -7194,54 -7103,39 +7194,54 @@@ ctx_pg = &ctx->qp_mem; ctx_pg->entries = ctx->qp_min_qp1_entries + ctx->qp_max_l2_entries + extra_qps; - mem_size = ctx->qp_entry_size * ctx_pg->entries; - rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, pg_lvl, true); - if (rc) - return rc; + if (ctx->qp_entry_size) { + mem_size = ctx->qp_entry_size * ctx_pg->entries; + init = &ctx->mem_init[BNXT_CTX_MEM_INIT_QP]; + rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, pg_lvl, init); + if (rc) + return rc; + }
ctx_pg = &ctx->srq_mem; ctx_pg->entries = ctx->srq_max_l2_entries + extra_srqs; - mem_size = ctx->srq_entry_size * ctx_pg->entries; - rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, pg_lvl, true); - if (rc) - return rc; + if (ctx->srq_entry_size) { + mem_size = ctx->srq_entry_size * ctx_pg->entries; + init = &ctx->mem_init[BNXT_CTX_MEM_INIT_SRQ]; + rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, pg_lvl, init); + if (rc) + return rc; + }
ctx_pg = &ctx->cq_mem; ctx_pg->entries = ctx->cq_max_l2_entries + extra_qps * 2; - mem_size = ctx->cq_entry_size * ctx_pg->entries; - rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, pg_lvl, true); - if (rc) - return rc; + if (ctx->cq_entry_size) { + mem_size = ctx->cq_entry_size * ctx_pg->entries; + init = &ctx->mem_init[BNXT_CTX_MEM_INIT_CQ]; + rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, pg_lvl, init); + if (rc) + return rc; + }
ctx_pg = &ctx->vnic_mem; ctx_pg->entries = ctx->vnic_max_vnic_entries + ctx->vnic_max_ring_table_entries; - mem_size = ctx->vnic_entry_size * ctx_pg->entries; - rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 1, true); - if (rc) - return rc; + if (ctx->vnic_entry_size) { + mem_size = ctx->vnic_entry_size * ctx_pg->entries; + init = &ctx->mem_init[BNXT_CTX_MEM_INIT_VNIC]; + rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 1, init); + if (rc) + return rc; + }
ctx_pg = &ctx->stat_mem; ctx_pg->entries = ctx->stat_max_entries; - mem_size = ctx->stat_entry_size * ctx_pg->entries; - rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 1, true); - if (rc) - return rc; + if (ctx->stat_entry_size) { + mem_size = ctx->stat_entry_size * ctx_pg->entries; + init = &ctx->mem_init[BNXT_CTX_MEM_INIT_STAT]; + rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 1, init); + if (rc) + return rc; + }
ena = 0; if (!(bp->flags & BNXT_FLAG_ROCE_CAP)) @@@ -7254,13 -7148,10 +7254,13 @@@ num_mr = 1024 * 256; num_ah = 1024 * 128; ctx_pg->entries = num_mr + num_ah; - mem_size = ctx->mrav_entry_size * ctx_pg->entries; - rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 2, true); - if (rc) - return rc; + if (ctx->mrav_entry_size) { + mem_size = ctx->mrav_entry_size * ctx_pg->entries; + init = &ctx->mem_init[BNXT_CTX_MEM_INIT_MRAV]; + rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 2, init); + if (rc) + return rc; + } ena = FUNC_BACKING_STORE_CFG_REQ_ENABLES_MRAV; if (ctx->mrav_num_entries_units) ctx_pg->entries = @@@ -7269,12 -7160,10 +7269,12 @@@
ctx_pg = &ctx->tim_mem; ctx_pg->entries = ctx->qp_mem.entries; - mem_size = ctx->tim_entry_size * ctx_pg->entries; - rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 1, false); - if (rc) - return rc; + if (ctx->tim_entry_size) { + mem_size = ctx->tim_entry_size * ctx_pg->entries; + rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 1, NULL); + if (rc) + return rc; + } ena |= FUNC_BACKING_STORE_CFG_REQ_ENABLES_TIM;
skip_rdma: @@@ -7288,13 -7177,10 +7288,13 @@@ for (i = 0; i < ctx->tqm_fp_rings_count + 1; i++) { ctx_pg = ctx->tqm_mem[i]; ctx_pg->entries = i ? entries : entries_sp; - mem_size = ctx->tqm_entry_size * ctx_pg->entries; - rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 1, false); - if (rc) - return rc; + if (ctx->tqm_entry_size) { + mem_size = ctx->tqm_entry_size * ctx_pg->entries; + rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 1, + NULL); + if (rc) + return rc; + } ena |= FUNC_BACKING_STORE_CFG_REQ_ENABLES_TQM_SP << i; } ena |= FUNC_BACKING_STORE_CFG_REQ_DFLT_ENABLES; @@@ -7552,22 -7438,9 +7552,22 @@@ static void bnxt_try_map_fw_health_reg(
sig = readl(hs + offsetof(struct hcomm_status, sig_ver)); if ((sig & HCOMM_STATUS_SIGNATURE_MASK) != HCOMM_STATUS_SIGNATURE_VAL) { - if (bp->fw_health) - bp->fw_health->status_reliable = false; - return; + if (!bp->chip_num) { + __bnxt_map_fw_health_reg(bp, BNXT_GRC_REG_BASE); + bp->chip_num = readl(bp->bar0 + + BNXT_FW_HEALTH_WIN_BASE + + BNXT_GRC_REG_CHIP_NUM); + } + if (!BNXT_CHIP_P5(bp)) { + if (bp->fw_health) + bp->fw_health->status_reliable = false; + return; + } + status_loc = BNXT_GRC_REG_STATUS_P5 | + BNXT_FW_HEALTH_REG_TYPE_BAR0; + } else { + status_loc = readl(hs + offsetof(struct hcomm_status, + fw_status_loc)); }
if (__bnxt_alloc_fw_health(bp)) { @@@ -7575,6 -7448,7 +7575,6 @@@ return; }
- status_loc = readl(hs + offsetof(struct hcomm_status, fw_status_loc)); bp->fw_health->regs[BNXT_FW_HEALTH_REG] = status_loc; reg_type = BNXT_FW_HEALTH_REG_TYPE(status_loc); if (reg_type == BNXT_FW_HEALTH_REG_TYPE_GRC) { @@@ -8729,7 -8603,7 +8729,7 @@@ msix_setup_exit
static int bnxt_init_inta(struct bnxt *bp) { - bp->irq_tbl = kcalloc(1, sizeof(struct bnxt_irq), GFP_KERNEL); + bp->irq_tbl = kzalloc(sizeof(struct bnxt_irq), GFP_KERNEL); if (!bp->irq_tbl) return -ENOMEM;
@@@ -8937,8 -8811,7 +8937,8 @@@ static void bnxt_disable_napi(struct bn { int i;
- if (!bp->bnapi) + if (!bp->bnapi || + test_and_set_bit(BNXT_STATE_NAPI_DISABLED, &bp->state)) return;
for (i = 0; i < bp->cp_nr_rings; i++) { @@@ -8955,7 -8828,6 +8955,7 @@@ static void bnxt_enable_napi(struct bnx { int i;
+ clear_bit(BNXT_STATE_NAPI_DISABLED, &bp->state); for (i = 0; i < bp->cp_nr_rings; i++) { struct bnxt_napi *bnapi = bp->bnapi[i]; struct bnxt_cp_ring_info *cpr; @@@ -8984,9 -8856,10 +8984,10 @@@ void bnxt_tx_disable(struct bnxt *bp txr->dev_state = BNXT_DEV_STATE_CLOSING; } } + /* Drop carrier first to prevent TX timeout */ + netif_carrier_off(bp->dev); /* Stop all TX queues */ netif_tx_disable(bp->dev); - netif_carrier_off(bp->dev); }
void bnxt_tx_enable(struct bnxt *bp) @@@ -9462,60 -9335,13 +9463,60 @@@ static int bnxt_hwrm_shutdown_link(stru
static int bnxt_fw_init_one(struct bnxt *bp);
+static int bnxt_fw_reset_via_optee(struct bnxt *bp) +{ +#ifdef CONFIG_TEE_BNXT_FW + int rc = tee_bnxt_fw_load(); + + if (rc) + netdev_err(bp->dev, "Failed FW reset via OP-TEE, rc=%d\n", rc); + + return rc; +#else + netdev_err(bp->dev, "OP-TEE not supported\n"); + return -ENODEV; +#endif +} + +static int bnxt_try_recover_fw(struct bnxt *bp) +{ + if (bp->fw_health && bp->fw_health->status_reliable) { + int retry = 0, rc; + u32 sts; + + mutex_lock(&bp->hwrm_cmd_lock); + do { + rc = __bnxt_hwrm_ver_get(bp, true); + sts = bnxt_fw_health_readl(bp, BNXT_FW_HEALTH_REG); + if (!sts || !BNXT_FW_IS_BOOTING(sts)) + break; + retry++; + } while (rc == -EBUSY && retry < BNXT_FW_RETRY); + mutex_unlock(&bp->hwrm_cmd_lock); + + if (!BNXT_FW_IS_HEALTHY(sts)) { + netdev_err(bp->dev, + "Firmware not responding, status: 0x%x\n", + sts); + rc = -ENODEV; + } + if (sts & FW_STATUS_REG_CRASHED_NO_MASTER) { + netdev_warn(bp->dev, "Firmware recover via OP-TEE requested\n"); + return bnxt_fw_reset_via_optee(bp); + } + return rc; + } + + return -ENODEV; +} + static int bnxt_hwrm_if_change(struct bnxt *bp, bool up) { struct hwrm_func_drv_if_change_output *resp = bp->hwrm_cmd_resp_addr; struct hwrm_func_drv_if_change_input req = {0}; bool resc_reinit = false, fw_reset = false; + int rc, retry = 0; u32 flags = 0; - int rc;
if (!(bp->fw_cap & BNXT_FW_CAP_IF_CHANGE)) return 0; @@@ -9524,25 -9350,10 +9525,25 @@@ if (up) req.flags = cpu_to_le32(FUNC_DRV_IF_CHANGE_REQ_FLAGS_UP); mutex_lock(&bp->hwrm_cmd_lock); - rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + while (retry < BNXT_FW_IF_RETRY) { + rc = _hwrm_send_message(bp, &req, sizeof(req), + HWRM_CMD_TIMEOUT); + if (rc != -EAGAIN) + break; + + msleep(50); + retry++; + } if (!rc) flags = le32_to_cpu(resp->flags); mutex_unlock(&bp->hwrm_cmd_lock); + + if (rc == -EAGAIN) + return rc; + if (rc && up) { + rc = bnxt_try_recover_fw(bp); + fw_reset = true; + } if (rc) return rc;
@@@ -9882,25 -9693,6 +9883,25 @@@ static void bnxt_preset_reg_win(struct
static int bnxt_init_dflt_ring_mode(struct bnxt *bp);
+static int bnxt_reinit_after_abort(struct bnxt *bp) +{ + int rc; + + if (test_bit(BNXT_STATE_IN_FW_RESET, &bp->state)) + return -EBUSY; + + rc = bnxt_fw_init_one(bp); + if (!rc) { + bnxt_clear_int_mode(bp); + rc = bnxt_init_int_mode(bp); + if (!rc) { + clear_bit(BNXT_STATE_ABORT_ERR, &bp->state); + set_bit(BNXT_STATE_FW_RESET_DET, &bp->state); + } + } + return rc; +} + static int __bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init) { int rc = 0; @@@ -10059,14 -9851,8 +10060,14 @@@ static int bnxt_open(struct net_device int rc;
if (test_bit(BNXT_STATE_ABORT_ERR, &bp->state)) { - netdev_err(bp->dev, "A previous firmware reset did not complete, aborting\n"); - return -ENODEV; + rc = bnxt_reinit_after_abort(bp); + if (rc) { + if (rc == -EBUSY) + netdev_err(bp->dev, "A previous firmware reset has not completed, aborting\n"); + else + netdev_err(bp->dev, "Failed to reinitialize after aborted firmware reset\n"); + return -ENODEV; + } }
rc = bnxt_hwrm_if_change(bp, true); @@@ -11003,23 -10789,11 +11004,23 @@@ static void bnxt_rx_ring_reset(struct b static void bnxt_fw_reset_close(struct bnxt *bp) { bnxt_ulp_stop(bp); - /* When firmware is fatal state, disable PCI device to prevent - * any potential bad DMAs before freeing kernel memory. + /* When firmware is in fatal state, quiesce device and disable + * bus master to prevent any potential bad DMAs before freeing + * kernel memory. */ - if (test_bit(BNXT_STATE_FW_FATAL_COND, &bp->state)) + if (test_bit(BNXT_STATE_FW_FATAL_COND, &bp->state)) { + u16 val = 0; + + pci_read_config_word(bp->pdev, PCI_SUBSYSTEM_ID, &val); + if (val == 0xffff) + bp->fw_reset_min_dsecs = 0; + bnxt_tx_disable(bp); + bnxt_disable_napi(bp); + bnxt_disable_int_sync(bp); + bnxt_free_irq(bp); + bnxt_clear_int_mode(bp); pci_disable_device(bp->pdev); + } __bnxt_close_nic(bp, true, false); bnxt_clear_int_mode(bp); bnxt_hwrm_func_drv_unrgtr(bp); @@@ -11224,17 -10998,6 +11225,17 @@@ static void bnxt_init_ethtool_link_sett link_info->req_flow_ctrl = link_info->force_pause_setting; }
+static void bnxt_fw_echo_reply(struct bnxt *bp) +{ + struct bnxt_fw_health *fw_health = bp->fw_health; + struct hwrm_func_echo_response_input req = {0}; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_ECHO_RESPONSE, -1, -1); + req.event_data1 = cpu_to_le32(fw_health->echo_req_data1); + req.event_data2 = cpu_to_le32(fw_health->echo_req_data2); + hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); +} + static void bnxt_sp_task(struct work_struct *work) { struct bnxt *bp = container_of(work, struct bnxt, sp_task); @@@ -11302,9 -11065,6 +11303,9 @@@ if (test_and_clear_bit(BNXT_RING_COAL_NOW_SP_EVENT, &bp->sp_event)) bnxt_chk_missed_irq(bp);
+ if (test_and_clear_bit(BNXT_FW_ECHO_REQUEST_SP_EVENT, &bp->sp_event)) + bnxt_fw_echo_reply(bp); + /* These functions below will clear BNXT_STATE_IN_SP_TASK. They * must be the last functions to be called before exiting. */ @@@ -11421,6 -11181,21 +11422,6 @@@ static void bnxt_init_dflt_coal(struct bp->stats_coal_ticks = BNXT_DEF_STATS_COAL_TICKS; }
-static int bnxt_fw_reset_via_optee(struct bnxt *bp) -{ -#ifdef CONFIG_TEE_BNXT_FW - int rc = tee_bnxt_fw_load(); - - if (rc) - netdev_err(bp->dev, "Failed FW reset via OP-TEE, rc=%d\n", rc); - - return rc; -#else - netdev_err(bp->dev, "OP-TEE not supported\n"); - return -ENODEV; -#endif -} - static int bnxt_fw_init_one_p1(struct bnxt *bp) { int rc; @@@ -11429,10 -11204,19 +11430,10 @@@ rc = bnxt_hwrm_ver_get(bp); bnxt_try_map_fw_health_reg(bp); if (rc) { - if (bp->fw_health && bp->fw_health->status_reliable) { - u32 sts = bnxt_fw_health_readl(bp, BNXT_FW_HEALTH_REG); - - netdev_err(bp->dev, - "Firmware not responding, status: 0x%x\n", - sts); - if (sts & FW_STATUS_REG_CRASHED_NO_MASTER) { - netdev_warn(bp->dev, "Firmware recover via OP-TEE requested\n"); - rc = bnxt_fw_reset_via_optee(bp); - if (!rc) - rc = bnxt_hwrm_ver_get(bp); - } - } + rc = bnxt_try_recover_fw(bp); + if (rc) + return rc; + rc = bnxt_hwrm_ver_get(bp); if (rc) return rc; } @@@ -11632,12 -11416,6 +11633,12 @@@ static void bnxt_reset_all(struct bnxt bp->fw_reset_timestamp = jiffies; }
+static bool bnxt_fw_reset_timeout(struct bnxt *bp) +{ + return time_after(jiffies, bp->fw_reset_timestamp + + (bp->fw_reset_max_dsecs * HZ / 10)); +} + static void bnxt_fw_reset_task(struct work_struct *work) { struct bnxt *bp = container_of(work, struct bnxt, fw_reset_task.work); @@@ -11659,7 -11437,8 +11660,7 @@@ bp->fw_reset_timestamp)); goto fw_reset_abort; } else if (n > 0) { - if (time_after(jiffies, bp->fw_reset_timestamp + - (bp->fw_reset_max_dsecs * HZ / 10))) { + if (bnxt_fw_reset_timeout(bp)) { clear_bit(BNXT_STATE_IN_FW_RESET, &bp->state); bp->fw_reset_state = 0; netdev_err(bp->dev, "Firmware reset aborted, bnxt_get_registered_vfs() returns %d\n", @@@ -11688,7 -11467,8 +11689,7 @@@
val = bnxt_fw_health_readl(bp, BNXT_FW_HEALTH_REG); if (!(val & BNXT_FW_STATUS_SHUTDOWN) && - !time_after(jiffies, bp->fw_reset_timestamp + - (bp->fw_reset_max_dsecs * HZ / 10))) { + !bnxt_fw_reset_timeout(bp)) { bnxt_queue_fw_reset_work(bp, HZ / 5); return; } @@@ -11712,20 -11492,6 +11713,20 @@@ if (test_bit(BNXT_STATE_FW_FATAL_COND, &bp->state)) { u32 val;
+ if (!bp->fw_reset_min_dsecs) { + u16 val; + + pci_read_config_word(bp->pdev, PCI_SUBSYSTEM_ID, + &val); + if (val == 0xffff) { + if (bnxt_fw_reset_timeout(bp)) { + netdev_err(bp->dev, "Firmware reset aborted, PCI config space invalid\n"); + goto fw_reset_abort; + } + bnxt_queue_fw_reset_work(bp, HZ / 1000); + return; + } + } val = bnxt_fw_health_readl(bp, BNXT_FW_RESET_INPROG_REG); if (val) @@@ -11744,7 -11510,8 +11745,7 @@@ bp->hwrm_cmd_timeout = SHORT_HWRM_CMD_TIMEOUT; rc = __bnxt_hwrm_ver_get(bp, true); if (rc) { - if (time_after(jiffies, bp->fw_reset_timestamp + - (bp->fw_reset_max_dsecs * HZ / 10))) { + if (bnxt_fw_reset_timeout(bp)) { netdev_err(bp->dev, "Firmware reset aborted\n"); goto fw_reset_abort_status; } @@@ -12325,6 -12092,8 +12326,6 @@@ static const struct net_device_ops bnxt #ifdef CONFIG_RFS_ACCEL .ndo_rx_flow_steer = bnxt_rx_flow_steer, #endif - .ndo_udp_tunnel_add = udp_tunnel_nic_add_port, - .ndo_udp_tunnel_del = udp_tunnel_nic_del_port, .ndo_bpf = bnxt_xdp, .ndo_xdp_xmit = bnxt_xdp_xmit, .ndo_bridge_getlink = bnxt_bridge_getlink, @@@ -12776,6 -12545,9 +12777,6 @@@ static int bnxt_init_one(struct pci_de dev->ethtool_ops = &bnxt_ethtool_ops; pci_set_drvdata(pdev, dev);
- if (BNXT_PF(bp)) - bnxt_vpd_read_info(bp); - rc = bnxt_alloc_hwrm_resources(bp); if (rc) goto init_err_pci_clean; @@@ -12787,9 -12559,6 +12788,9 @@@ if (rc) goto init_err_pci_clean;
+ if (BNXT_PF(bp)) + bnxt_vpd_read_info(bp); + if (BNXT_CHIP_P5(bp)) { bp->flags |= BNXT_FLAG_CHIP_P5; if (BNXT_CHIP_SR2(bp)) diff --combined drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c index 90a31b4a3020,a9bcf887d2fb..64381be935a8 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@@ -44,20 -44,21 +44,20 @@@ static int bnxt_fw_reporter_diagnose(st struct netlink_ext_ack *extack) { struct bnxt *bp = devlink_health_reporter_priv(reporter); - u32 val, health_status; + u32 val; int rc;
if (test_bit(BNXT_STATE_IN_FW_RESET, &bp->state)) return 0;
val = bnxt_fw_health_readl(bp, BNXT_FW_HEALTH_REG); - health_status = val & 0xffff;
- if (health_status < BNXT_FW_STATUS_HEALTHY) { + if (BNXT_FW_IS_BOOTING(val)) { rc = devlink_fmsg_string_pair_put(fmsg, "Description", "Not yet completed initialization"); if (rc) return rc; - } else if (health_status > BNXT_FW_STATUS_HEALTHY) { + } else if (BNXT_FW_IS_ERR(val)) { rc = devlink_fmsg_string_pair_put(fmsg, "Description", "Encountered fatal error and cannot recover"); if (rc) @@@ -471,8 -472,8 +471,8 @@@ static int bnxt_dl_info_get(struct devl if (BNXT_PF(bp) && !bnxt_hwrm_get_nvm_cfg_ver(bp, &nvm_cfg_ver)) { u32 ver = nvm_cfg_ver.vu32;
- sprintf(buf, "%X.%X.%X", (ver >> 16) & 0xF, (ver >> 8) & 0xF, - ver & 0xF); + sprintf(buf, "%d.%d.%d", (ver >> 16) & 0xf, (ver >> 8) & 0xf, + ver & 0xf); rc = bnxt_dl_info_put(bp, req, BNXT_VERSION_STORED, DEVLINK_INFO_VERSION_GENERIC_FW_PSID, buf); diff --combined drivers/net/ethernet/chelsio/cxgb4/sge.c index 550cc065649f,3334c9e2152a..256fae15e032 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@@ -1600,8 -1600,7 +1600,8 @@@ static netdev_tx_t cxgb4_eth_xmit(struc * has opened up. */ eth_txq_stop(q); - wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F; + if (chip_ver > CHELSIO_T5) + wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F; }
wr = (void *)&q->q.desc[q->q.pidx]; @@@ -1833,7 -1832,6 +1833,7 @@@ static netdev_tx_t cxgb4_vf_eth_xmit(st struct adapter *adapter; int qidx, credits, ret; size_t fw_hdr_copy_len; + unsigned int chip_ver; u64 cntrl, *end; u32 wr_mid;
@@@ -1898,7 -1896,6 +1898,7 @@@ goto out_free; }
+ chip_ver = CHELSIO_CHIP_VERSION(adapter->params.chip); wr_mid = FW_WR_LEN16_V(DIV_ROUND_UP(flits, 2)); if (unlikely(credits < ETHTXQ_STOP_THRES)) { /* After we're done injecting the Work Request for this @@@ -1910,8 -1907,7 +1910,8 @@@ * has opened up. */ eth_txq_stop(txq); - wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F; + if (chip_ver > CHELSIO_T5) + wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F; }
/* Start filling in our Work Request. Note that we do _not_ handle @@@ -1964,7 -1960,7 +1964,7 @@@ */ cpl = (void *)(lso + 1);
- if (CHELSIO_CHIP_VERSION(adapter->params.chip) <= CHELSIO_T5) + if (chip_ver <= CHELSIO_T5) cntrl = TXPKT_ETHHDR_LEN_V(eth_xtra_len); else cntrl = T6_TXPKT_ETHHDR_LEN_V(eth_xtra_len); @@@ -2846,17 -2842,22 +2846,22 @@@ int t4_mgmt_tx(struct adapter *adap, st * @skb: the packet * * Returns true if a packet can be sent as an offload WR with immediate - * data. We currently use the same limit as for Ethernet packets. + * data. + * FW_OFLD_TX_DATA_WR limits the payload to 255 bytes due to 8-bit field. + * However, FW_ULPTX_WR commands have a 256 byte immediate only + * payload limit. */ static inline int is_ofld_imm(const struct sk_buff *skb) { struct work_request_hdr *req = (struct work_request_hdr *)skb->data; unsigned long opcode = FW_WR_OP_G(ntohl(req->wr_hi));
- if (opcode == FW_CRYPTO_LOOKASIDE_WR) + if (unlikely(opcode == FW_ULPTX_WR)) + return skb->len <= MAX_IMM_ULPTX_WR_LEN; + else if (opcode == FW_CRYPTO_LOOKASIDE_WR) return skb->len <= SGE_MAX_WR_LEN; else - return skb->len <= MAX_IMM_TX_PKT_LEN; + return skb->len <= MAX_IMM_OFLD_TX_DATA_WR_LEN; }
/** @@@ -3602,25 -3603,6 +3607,25 @@@ static void t4_tx_completion_handler(st }
txq = &s->ethtxq[pi->first_qset + rspq->idx]; + + /* We've got the Hardware Consumer Index Update in the Egress Update + * message. These Egress Update messages will be our sole CIDX Updates + * we get since we don't want to chew up PCIe bandwidth for both Ingress + * Messages and Status Page writes. However, The code which manages + * reclaiming successfully DMA'ed TX Work Requests uses the CIDX value + * stored in the Status Page at the end of the TX Queue. It's easiest + * to simply copy the CIDX Update value from the Egress Update message + * to the Status Page. Also note that no Endian issues need to be + * considered here since both are Big Endian and we're just copying + * bytes consistently ... + */ + if (CHELSIO_CHIP_VERSION(adapter->params.chip) <= CHELSIO_T5) { + struct cpl_sge_egr_update *egr; + + egr = (struct cpl_sge_egr_update *)rsp; + WRITE_ONCE(txq->q.stat->cidx, egr->cidx); + } + t4_sge_eth_txq_egress_update(adapter, txq, -1); }
@@@ -4606,15 -4588,11 +4611,15 @@@ int t4_sge_alloc_eth_txq(struct adapte * write the CIDX Updates into the Status Page at the end of the * TX Queue. */ - c.autoequiqe_to_viid = htonl(FW_EQ_ETH_CMD_AUTOEQUEQE_F | + c.autoequiqe_to_viid = htonl(((chip_ver <= CHELSIO_T5) ? + FW_EQ_ETH_CMD_AUTOEQUIQE_F : + FW_EQ_ETH_CMD_AUTOEQUEQE_F) | FW_EQ_ETH_CMD_VIID_V(pi->viid));
c.fetchszm_to_iqid = - htonl(FW_EQ_ETH_CMD_HOSTFCMODE_V(HOSTFCMODE_STATUS_PAGE_X) | + htonl(FW_EQ_ETH_CMD_HOSTFCMODE_V((chip_ver <= CHELSIO_T5) ? + HOSTFCMODE_INGRESS_QUEUE_X : + HOSTFCMODE_STATUS_PAGE_X) | FW_EQ_ETH_CMD_PCIECHN_V(pi->tx_chan) | FW_EQ_ETH_CMD_FETCHRO_F | FW_EQ_ETH_CMD_IQID_V(iqid));
@@@ -4625,7 -4603,6 +4630,7 @@@ : FETCHBURSTMIN_64B_T6_X) | FW_EQ_ETH_CMD_FBMAX_V(FETCHBURSTMAX_512B_X) | FW_EQ_ETH_CMD_CIDXFTHRESH_V(CIDXFLUSHTHRESH_32_X) | + FW_EQ_ETH_CMD_CIDXFTHRESHO_V(chip_ver == CHELSIO_T5) | FW_EQ_ETH_CMD_EQSIZE_V(nentries));
c.eqaddr = cpu_to_be64(txq->q.phys_addr); diff --combined drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 19f74d4cbb4e,f1c2b3c7f7e9..492943bb9c48 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@@ -350,7 -350,7 +350,7 @@@ static u32 dpaa2_eth_run_xdp(struct dpa struct bpf_prog *xdp_prog; struct xdp_buff xdp; u32 xdp_act = XDP_PASS; - int err; + int err, offset;
rcu_read_lock();
@@@ -358,10 -358,14 +358,10 @@@ if (!xdp_prog) goto out;
- xdp.data = vaddr + dpaa2_fd_get_offset(fd); - xdp.data_end = xdp.data + dpaa2_fd_get_len(fd); - xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM; - xdp_set_data_meta_invalid(&xdp); - xdp.rxq = &ch->xdp_rxq; - - xdp.frame_sz = DPAA2_ETH_RX_BUF_RAW_SIZE - - (dpaa2_fd_get_offset(fd) - XDP_PACKET_HEADROOM); + offset = dpaa2_fd_get_offset(fd) - XDP_PACKET_HEADROOM; + xdp_init_buff(&xdp, DPAA2_ETH_RX_BUF_RAW_SIZE - offset, &ch->xdp_rxq); + xdp_prepare_buff(&xdp, vaddr + offset, XDP_PACKET_HEADROOM, + dpaa2_fd_get_len(fd), false);
xdp_act = bpf_prog_run_xdp(xdp_prog, &xdp);
@@@ -395,10 -399,20 +395,20 @@@ xdp.frame_sz = DPAA2_ETH_RX_BUF_RAW_SIZE;
err = xdp_do_redirect(priv->net_dev, &xdp, xdp_prog); - if (unlikely(err)) + if (unlikely(err)) { + addr = dma_map_page(priv->net_dev->dev.parent, + virt_to_page(vaddr), 0, + priv->rx_buf_size, DMA_BIDIRECTIONAL); + if (unlikely(dma_mapping_error(priv->net_dev->dev.parent, addr))) { + free_pages((unsigned long)vaddr, 0); + } else { + ch->buf_count++; + dpaa2_eth_xdp_release_buf(priv, ch, addr); + } ch->stats.xdp_drop++; - else + } else { ch->stats.xdp_redirect++; + } break; }
@@@ -764,11 -778,12 +774,11 @@@ static int dpaa2_eth_build_sg_fd(struc /* Prepare the HW SGT structure */ sgt_buf_size = priv->tx_data_offset + sizeof(struct dpaa2_sg_entry) * num_dma_bufs; - sgt_buf = napi_alloc_frag(sgt_buf_size + DPAA2_ETH_TX_BUF_ALIGN); + sgt_buf = napi_alloc_frag_align(sgt_buf_size, DPAA2_ETH_TX_BUF_ALIGN); if (unlikely(!sgt_buf)) { err = -ENOMEM; goto sgt_buf_alloc_failed; } - sgt_buf = PTR_ALIGN(sgt_buf, DPAA2_ETH_TX_BUF_ALIGN); memset(sgt_buf, 0, sgt_buf_size);
sgt = (struct dpaa2_sg_entry *)(sgt_buf + priv->tx_data_offset); @@@ -1257,22 -1272,6 +1267,22 @@@ static void dpaa2_eth_tx_conf(struct dp percpu_stats->tx_errors++; }
+static int dpaa2_eth_set_rx_vlan_filtering(struct dpaa2_eth_priv *priv, + bool enable) +{ + int err; + + err = dpni_enable_vlan_filter(priv->mc_io, 0, priv->mc_token, enable); + + if (err) { + netdev_err(priv->net_dev, + "dpni_enable_vlan_filter failed\n"); + return err; + } + + return 0; +} + static int dpaa2_eth_set_rx_csum(struct dpaa2_eth_priv *priv, bool enable) { int err; @@@ -1659,7 -1658,7 +1669,7 @@@ set_cgtd * CG taildrop threshold, so it won't interfere with it; we also * want frames in non-PFC enabled traffic classes to be kept in check) */ - td.enable = !tx_pause || (tx_pause && pfc); + td.enable = !tx_pause || pfc; if (priv->rx_cgtd_enabled == td.enable) return;
@@@ -1702,7 -1701,7 +1712,7 @@@ static int dpaa2_eth_link_state_update( /* When we manage the MAC/PHY using phylink there is no need * to manually update the netif_carrier. */ - if (priv->mac) + if (dpaa2_eth_is_type_phy(priv)) goto out;
/* Check link state; speed / duplex changes are not treated yet */ @@@ -1741,7 -1740,7 +1751,7 @@@ static int dpaa2_eth_open(struct net_de priv->dpbp_dev->obj_desc.id, priv->bpid); }
- if (!priv->mac) { + if (!dpaa2_eth_is_type_phy(priv)) { /* We'll only start the txqs when the link is actually ready; * make sure we don't race against the link up notification, * which may come immediately after dpni_enable(); @@@ -1763,7 -1762,7 +1773,7 @@@ goto enable_err; }
- if (priv->mac) + if (dpaa2_eth_is_type_phy(priv)) phylink_start(priv->mac->phylink);
return 0; @@@ -1837,11 -1836,11 +1847,11 @@@ static int dpaa2_eth_stop(struct net_de int dpni_enabled = 0; int retries = 10;
- if (!priv->mac) { + if (dpaa2_eth_is_type_phy(priv)) { + phylink_stop(priv->mac->phylink); + } else { netif_tx_stop_all_queues(net_dev); netif_carrier_off(net_dev); - } else { - phylink_stop(priv->mac->phylink); }
/* On dpni_disable(), the MC firmware will: @@@ -1963,43 -1962,6 +1973,43 @@@ static void dpaa2_eth_add_mc_hw_addr(co } }
+static int dpaa2_eth_rx_add_vid(struct net_device *net_dev, + __be16 vlan_proto, u16 vid) +{ + struct dpaa2_eth_priv *priv = netdev_priv(net_dev); + int err; + + err = dpni_add_vlan_id(priv->mc_io, 0, priv->mc_token, + vid, 0, 0, 0); + + if (err) { + netdev_warn(priv->net_dev, + "Could not add the vlan id %u\n", + vid); + return err; + } + + return 0; +} + +static int dpaa2_eth_rx_kill_vid(struct net_device *net_dev, + __be16 vlan_proto, u16 vid) +{ + struct dpaa2_eth_priv *priv = netdev_priv(net_dev); + int err; + + err = dpni_remove_vlan_id(priv->mc_io, 0, priv->mc_token, vid); + + if (err) { + netdev_warn(priv->net_dev, + "Could not remove the vlan id %u\n", + vid); + return err; + } + + return 0; +} + static void dpaa2_eth_set_rx_mode(struct net_device *net_dev) { struct dpaa2_eth_priv *priv = netdev_priv(net_dev); @@@ -2106,13 -2068,6 +2116,13 @@@ static int dpaa2_eth_set_features(struc bool enable; int err;
+ if (changed & NETIF_F_HW_VLAN_CTAG_FILTER) { + enable = !!(features & NETIF_F_HW_VLAN_CTAG_FILTER); + err = dpaa2_eth_set_rx_vlan_filtering(priv, enable); + if (err) + return err; + } + if (changed & NETIF_F_RXCSUM) { enable = !!(features & NETIF_F_RXCSUM); err = dpaa2_eth_set_rx_csum(priv, enable); @@@ -2170,7 -2125,7 +2180,7 @@@ static int dpaa2_eth_ioctl(struct net_d if (cmd == SIOCSHWTSTAMP) return dpaa2_eth_ts_ioctl(dev, rq, cmd);
- if (priv->mac) + if (dpaa2_eth_is_type_phy(priv)) return phylink_mii_ioctl(priv->mac->phylink, rq, cmd);
return -EOPNOTSUPP; @@@ -2562,8 -2517,6 +2572,8 @@@ static const struct net_device_ops dpaa .ndo_bpf = dpaa2_eth_xdp, .ndo_xdp_xmit = dpaa2_eth_xdp_xmit, .ndo_setup_tc = dpaa2_eth_setup_tc, + .ndo_vlan_rx_add_vid = dpaa2_eth_rx_add_vid, + .ndo_vlan_rx_kill_vid = dpaa2_eth_rx_kill_vid };
static void dpaa2_eth_cdan_cb(struct dpaa2_io_notification_ctx *ctx) @@@ -4072,9 -4025,6 +4082,9 @@@ static int dpaa2_eth_netdev_init(struc NETIF_F_LLTX | NETIF_F_HW_TC; net_dev->hw_features = net_dev->features;
+ if (priv->dpni_attrs.vlan_filter_entries) + net_dev->hw_features |= NETIF_F_HW_VLAN_CTAG_FILTER; + return 0; }
@@@ -4102,11 -4052,10 +4112,11 @@@ static int dpaa2_eth_connect_mac(struc
dpni_dev = to_fsl_mc_device(priv->net_dev->dev.parent); dpmac_dev = fsl_mc_get_endpoint(dpni_dev); - if (IS_ERR_OR_NULL(dpmac_dev) || dpmac_dev->dev.type != &fsl_mc_bus_dpmac_type) - return 0;
- if (dpaa2_mac_is_type_fixed(dpmac_dev, priv->mc_io)) + if (PTR_ERR(dpmac_dev) == -EPROBE_DEFER) + return PTR_ERR(dpmac_dev); + + if (IS_ERR(dpmac_dev) || dpmac_dev->dev.type != &fsl_mc_bus_dpmac_type) return 0;
mac = kzalloc(sizeof(struct dpaa2_mac), GFP_KERNEL); @@@ -4117,38 -4066,23 +4127,38 @@@ mac->mc_io = priv->mc_io; mac->net_dev = priv->net_dev;
- err = dpaa2_mac_connect(mac); - if (err) { - netdev_err(priv->net_dev, "Error connecting to the MAC endpoint\n"); - kfree(mac); - return err; - } + err = dpaa2_mac_open(mac); + if (err) + goto err_free_mac; priv->mac = mac;
+ if (dpaa2_eth_is_type_phy(priv)) { + err = dpaa2_mac_connect(mac); + if (err) { + netdev_err(priv->net_dev, "Error connecting to the MAC endpoint\n"); + goto err_close_mac; + } + } + return 0; + +err_close_mac: + dpaa2_mac_close(mac); + priv->mac = NULL; +err_free_mac: + kfree(mac); + return err; }
static void dpaa2_eth_disconnect_mac(struct dpaa2_eth_priv *priv) { - if (!priv->mac) + if (dpaa2_eth_is_type_phy(priv)) + dpaa2_mac_disconnect(priv->mac); + + if (!dpaa2_eth_has_mac(priv)) return;
- dpaa2_mac_disconnect(priv->mac); + dpaa2_mac_close(priv->mac); kfree(priv->mac); priv->mac = NULL; } @@@ -4177,7 -4111,7 +4187,7 @@@ static irqreturn_t dpni_irq0_handler_th dpaa2_eth_update_tx_fqids(priv);
rtnl_lock(); - if (priv->mac) + if (dpaa2_eth_has_mac(priv)) dpaa2_eth_disconnect_mac(priv); else dpaa2_eth_connect_mac(priv); diff --combined drivers/net/ethernet/ibm/ibmvnic.c index 927d5f36d308,13ae7eee7ef5..5cf7e5a367f0 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@@ -115,7 -115,7 +115,7 @@@ struct ibmvnic_stat
#define IBMVNIC_STAT_OFF(stat) (offsetof(struct ibmvnic_adapter, stats) + \ offsetof(struct ibmvnic_statistics, stat)) -#define IBMVNIC_GET_STAT(a, off) (*((u64 *)(((unsigned long)(a)) + off))) +#define IBMVNIC_GET_STAT(a, off) (*((u64 *)(((unsigned long)(a)) + (off))))
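The hunk above only adds parentheses around off, which is standard macro hygiene: with an offset argument of lower precedence than +, the old expansion would group the addition with part of the argument. A standalone illustration with made-up values (not driver code):

/* Without the inner parentheses, "base + i << 3" parses as
 * "(base + i) << 3" rather than "base + (i << 3)".
 */
#include <stdio.h>

#define OFF_BAD(base, off)	((unsigned long)(base) + off)
#define OFF_GOOD(base, off)	((unsigned long)(base) + (off))

int main(void)
{
	unsigned long base = 0x1000;
	unsigned long i = 1;

	printf("bad:  0x%lx\n", OFF_BAD(base, i << 3));		/* 0x8008 */
	printf("good: 0x%lx\n", OFF_GOOD(base, i << 3));	/* 0x1008 */
	return 0;
}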
static const struct ibmvnic_stat ibmvnic_stats[] = { {"rx_packets", IBMVNIC_STAT_OFF(rx_packets)}, @@@ -247,18 -247,51 +247,23 @@@ static void free_long_term_buff(struct if (!ltb->buff) return;
+ /* VIOS automatically unmaps the long term buffer at remote + * end for the following resets: + * FAILOVER, MOBILITY, TIMEOUT. + */ if (adapter->reset_reason != VNIC_RESET_FAILOVER && - adapter->reset_reason != VNIC_RESET_MOBILITY) + adapter->reset_reason != VNIC_RESET_MOBILITY && + adapter->reset_reason != VNIC_RESET_TIMEOUT) send_request_unmap(adapter, ltb->map_id); dma_free_coherent(dev, ltb->size, ltb->buff, ltb->addr); }
-static int reset_long_term_buff(struct ibmvnic_adapter *adapter, - struct ibmvnic_long_term_buff *ltb) +static int reset_long_term_buff(struct ibmvnic_long_term_buff *ltb) { - struct device *dev = &adapter->vdev->dev; - int rc; + if (!ltb->buff) + return -EINVAL;
memset(ltb->buff, 0, ltb->size); - - mutex_lock(&adapter->fw_lock); - adapter->fw_done_rc = 0; - - reinit_completion(&adapter->fw_done); - rc = send_request_map(adapter, ltb->addr, ltb->size, ltb->map_id); - if (rc) { - mutex_unlock(&adapter->fw_lock); - return rc; - } - - rc = ibmvnic_wait_for_completion(adapter, &adapter->fw_done, 10000); - if (rc) { - dev_info(dev, - "Reset failed, long term map request timed out or aborted\n"); - mutex_unlock(&adapter->fw_lock); - return rc; - } - - if (adapter->fw_done_rc) { - dev_info(dev, - "Reset failed, attempting to free and reallocate buffer\n"); - free_long_term_buff(adapter, ltb); - mutex_unlock(&adapter->fw_lock); - return alloc_long_term_buff(adapter, ltb, ltb->size); - } - mutex_unlock(&adapter->fw_lock); return 0; }
@@@ -480,7 -513,8 +485,7 @@@ static int reset_rx_pools(struct ibmvni rx_pool->size * rx_pool->buff_size); } else { - rc = reset_long_term_buff(adapter, - &rx_pool->long_term_buff); + rc = reset_long_term_buff(&rx_pool->long_term_buff); }
if (rc) @@@ -603,11 -637,12 +608,11 @@@ static int init_rx_pools(struct net_dev return 0; }
-static int reset_one_tx_pool(struct ibmvnic_adapter *adapter, - struct ibmvnic_tx_pool *tx_pool) +static int reset_one_tx_pool(struct ibmvnic_tx_pool *tx_pool) { int rc, i;
- rc = reset_long_term_buff(adapter, &tx_pool->long_term_buff); + rc = reset_long_term_buff(&tx_pool->long_term_buff); if (rc) return rc;
@@@ -634,10 -669,10 +639,10 @@@ static int reset_tx_pools(struct ibmvni
tx_scrqs = adapter->num_active_tx_pools; for (i = 0; i < tx_scrqs; i++) { - rc = reset_one_tx_pool(adapter, &adapter->tso_pool[i]); + rc = reset_one_tx_pool(&adapter->tso_pool[i]); if (rc) return rc; - rc = reset_one_tx_pool(adapter, &adapter->tx_pool[i]); + rc = reset_one_tx_pool(&adapter->tx_pool[i]); if (rc) return rc; } @@@ -1191,7 -1226,8 +1196,7 @@@ static int ibmvnic_open(struct net_devi rc = __ibmvnic_open(netdev);
out: - /* - * If open fails due to a pending failover, set device state and + /* If open fails due to a pending failover, set device state and * return. Device operation will be handled by reset routine. */ if (rc && adapter->failover_pending) { @@@ -1322,10 -1358,8 +1327,8 @@@ static int __ibmvnic_close(struct net_d
adapter->state = VNIC_CLOSING; rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_DN); - if (rc) - return rc; adapter->state = VNIC_CLOSED; - return 0; + return rc; }
static int ibmvnic_close(struct net_device *netdev) @@@ -1353,10 -1387,10 +1356,10 @@@
/** * build_hdr_data - creates L2/L3/L4 header data buffer - * @hdr_field - bitfield determining needed headers - * @skb - socket buffer - * @hdr_len - array of header lengths - * @tot_len - total length of data + * @hdr_field: bitfield determining needed headers + * @skb: socket buffer + * @hdr_len: array of header lengths + * @hdr_data: buffer to write the header to * * Reads hdr_field to determine which headers are needed by firmware. * Builds a buffer containing these headers. Saves individual header @@@ -1413,11 -1447,11 +1416,11 @@@ static int build_hdr_data(u8 hdr_field
/** * create_hdr_descs - create header and header extension descriptors - * @hdr_field - bitfield determining needed headers - * @data - buffer containing header data - * @len - length of data buffer - * @hdr_len - array of individual header lengths - * @scrq_arr - descriptor array + * @hdr_field: bitfield determining needed headers + * @hdr_data: buffer containing header data + * @len: length of data buffer + * @hdr_len: array of individual header lengths + * @scrq_arr: descriptor array * * Creates header and, if needed, header extension descriptors and * places them in a descriptor array, scrq_arr @@@ -1465,9 -1499,10 +1468,9 @@@ static int create_hdr_descs(u8 hdr_fiel
/** * build_hdr_descs_arr - build a header descriptor array - * @skb - socket buffer - * @num_entries - number of descriptors to be sent - * @subcrq - first TX descriptor - * @hdr_field - bit field determining which headers will be sent + * @txbuff: tx buffer + * @num_entries: number of descriptors to be sent + * @hdr_field: bit field determining which headers will be sent * * This function will build a TX descriptor array with applicable * L2/L3/L4 packet header descriptors to be sent by send_subcrq_indirect. @@@ -1670,6 -1705,9 +1673,9 @@@ static netdev_tx_t ibmvnic_xmit(struct skb_copy_from_linear_data(skb, dst, skb->len); }
+ /* post changes to long_term_buff *dst before VIOS accessing it */ + dma_wmb(); + tx_pool->consumer_index = (tx_pool->consumer_index + 1) % tx_pool->num_buffers;
@@@ -1893,7 -1931,93 +1899,7 @@@ static int ibmvnic_set_mac(struct net_d return rc; }
-/** - * do_change_param_reset returns zero if we are able to keep processing reset - * events, or non-zero if we hit a fatal error and must halt. - */ -static int do_change_param_reset(struct ibmvnic_adapter *adapter, - struct ibmvnic_rwi *rwi, - u32 reset_state) -{ - struct net_device *netdev = adapter->netdev; - int i, rc; - - netdev_dbg(adapter->netdev, "Change param resetting driver (%d)\n", - rwi->reset_reason); - - netif_carrier_off(netdev); - adapter->reset_reason = rwi->reset_reason; - - ibmvnic_cleanup(netdev); - - if (reset_state == VNIC_OPEN) { - rc = __ibmvnic_close(netdev); - if (rc) - goto out; - } - - release_resources(adapter); - release_sub_crqs(adapter, 1); - release_crq_queue(adapter); - - adapter->state = VNIC_PROBED; - - rc = init_crq_queue(adapter); - - if (rc) { - netdev_err(adapter->netdev, - "Couldn't initialize crq. rc=%d\n", rc); - return rc; - } - - rc = ibmvnic_reset_init(adapter, true); - if (rc) { - rc = IBMVNIC_INIT_FAILED; - goto out; - } - - /* If the adapter was in PROBE state prior to the reset, - * exit here. - */ - if (reset_state == VNIC_PROBED) - goto out; - - rc = ibmvnic_login(netdev); - if (rc) { - goto out; - } - - rc = init_resources(adapter); - if (rc) - goto out; - - ibmvnic_disable_irqs(adapter); - - adapter->state = VNIC_CLOSED; - - if (reset_state == VNIC_CLOSED) - return 0; - - rc = __ibmvnic_open(netdev); - if (rc) { - rc = IBMVNIC_OPEN_FAILED; - goto out; - } - - /* refresh device's multicast list */ - ibmvnic_set_multi(netdev); - - /* kick napi */ - for (i = 0; i < adapter->req_rx_queues; i++) - napi_schedule(&adapter->napi[i]); - -out: - if (rc) - adapter->state = reset_state; - return rc; -} - -/** +/* * do_reset returns zero if we are able to keep processing reset events, or * non-zero if we hit a fatal error and must halt. */ @@@ -1910,12 -2034,9 +1916,12 @@@ static int do_reset(struct ibmvnic_adap adapter->state, adapter->failover_pending, rwi->reset_reason, reset_state);
- rtnl_lock(); - /* - * Now that we have the rtnl lock, clear any pending failover. + adapter->reset_reason = rwi->reset_reason; + /* requestor of VNIC_RESET_CHANGE_PARAM already has the rtnl lock */ + if (!(adapter->reset_reason == VNIC_RESET_CHANGE_PARAM)) + rtnl_lock(); + + /* Now that we have the rtnl lock, clear any pending failover. * This will ensure ibmvnic_open() has either completed or will * block until failover is complete. */ @@@ -1923,6 -2044,7 +1929,6 @@@ adapter->failover_pending = false;
netif_carrier_off(netdev); - adapter->reset_reason = rwi->reset_reason;
old_num_rx_queues = adapter->req_rx_queues; old_num_tx_queues = adapter->req_tx_queues; @@@ -1934,37 -2056,25 +1940,37 @@@ if (reset_state == VNIC_OPEN && adapter->reset_reason != VNIC_RESET_MOBILITY && adapter->reset_reason != VNIC_RESET_FAILOVER) { - adapter->state = VNIC_CLOSING; + if (adapter->reset_reason == VNIC_RESET_CHANGE_PARAM) { + rc = __ibmvnic_close(netdev); + if (rc) + goto out; + } else { + adapter->state = VNIC_CLOSING;
- /* Release the RTNL lock before link state change and - * re-acquire after the link state change to allow - * linkwatch_event to grab the RTNL lock and run during - * a reset. - */ - rtnl_unlock(); - rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_DN); - rtnl_lock(); - if (rc) - goto out; + /* Release the RTNL lock before link state change and + * re-acquire after the link state change to allow + * linkwatch_event to grab the RTNL lock and run during + * a reset. + */ + rtnl_unlock(); + rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_DN); + rtnl_lock(); + if (rc) + goto out;
- if (adapter->state != VNIC_CLOSING) { - rc = -1; - goto out; + if (adapter->state != VNIC_CLOSING) { + rc = -1; + goto out; + } + + adapter->state = VNIC_CLOSED; } + }
- adapter->state = VNIC_CLOSED; + if (adapter->reset_reason == VNIC_RESET_CHANGE_PARAM) { + release_resources(adapter); + release_sub_crqs(adapter, 1); + release_crq_queue(adapter); }
if (adapter->reset_reason != VNIC_RESET_NON_FATAL) { @@@ -1973,9 -2083,7 +1979,9 @@@ */ adapter->state = VNIC_PROBED;
- if (adapter->reset_reason == VNIC_RESET_MOBILITY) { + if (adapter->reset_reason == VNIC_RESET_CHANGE_PARAM) { + rc = init_crq_queue(adapter); + } else if (adapter->reset_reason == VNIC_RESET_MOBILITY) { rc = ibmvnic_reenable_crq_queue(adapter); release_sub_crqs(adapter, 1); } else { @@@ -2010,14 -2118,11 +2016,14 @@@ }
rc = ibmvnic_login(netdev); - if (rc) { + if (rc) goto out; - }
- if (adapter->req_rx_queues != old_num_rx_queues || + if (adapter->reset_reason == VNIC_RESET_CHANGE_PARAM) { + rc = init_resources(adapter); + if (rc) + goto out; + } else if (adapter->req_rx_queues != old_num_rx_queues || adapter->req_tx_queues != old_num_tx_queues || adapter->req_rx_add_entries_per_subcrq != old_num_rx_slots || @@@ -2039,14 -2144,14 +2045,14 @@@ rc = reset_tx_pools(adapter); if (rc) { netdev_dbg(adapter->netdev, "reset tx pools failed (%d)\n", - rc); + rc); goto out; }
rc = reset_rx_pools(adapter); if (rc) { netdev_dbg(adapter->netdev, "reset rx pools failed (%d)\n", - rc); + rc); goto out; } } @@@ -2082,9 -2187,7 +2088,9 @@@ out /* restore the adapter state if reset failed */ if (rc) adapter->state = reset_state; - rtnl_unlock(); + /* requestor of VNIC_RESET_CHANGE_PARAM should still hold the rtnl lock */ + if (!(adapter->reset_reason == VNIC_RESET_CHANGE_PARAM)) + rtnl_unlock();
netdev_dbg(adapter->netdev, "[S:%d FOP:%d] Reset done, rc %d\n", adapter->state, adapter->failover_pending, rc); @@@ -2215,8 -2318,12 +2221,8 @@@ static void __ibmvnic_reset(struct work } spin_unlock_irqrestore(&adapter->state_lock, flags);
- if (rwi->reset_reason == VNIC_RESET_CHANGE_PARAM) { - /* CHANGE_PARAM requestor holds rtnl_lock */ - rc = do_change_param_reset(adapter, rwi, reset_state); - } else if (adapter->force_reset_recovery) { - /* - * Since we are doing a hard reset now, clear the + if (adapter->force_reset_recovery) { + /* Since we are doing a hard reset now, clear the * failover_pending flag so we don't ignore any * future MOBILITY or other resets. */ @@@ -2288,7 -2395,10 +2294,8 @@@ static int ibmvnic_reset(struct ibmvnic unsigned long flags; int ret;
- /* If failover is pending don't schedule any other reset. - spin_lock_irqsave(&adapter->rwi_lock, flags); - + /* + * If failover is pending don't schedule any other reset. * Instead let the failover complete. If there is already a * a failover reset scheduled, we will detect and drop the * duplicate reset when walking the ->rwi_list below. @@@ -2303,19 -2413,15 +2310,16 @@@
if (adapter->state == VNIC_PROBING) { netdev_warn(netdev, "Adapter reset during probe\n"); - ret = adapter->init_done_rc = EAGAIN; + adapter->init_done_rc = EAGAIN; + ret = EAGAIN; goto err; }
- spin_lock_irqsave(&adapter->rwi_lock, flags); - list_for_each(entry, &adapter->rwi_list) { tmp = list_entry(entry, struct ibmvnic_rwi, list); if (tmp->reset_reason == reason) { netdev_dbg(netdev, "Skipping matching reset, reason=%d\n", reason); - spin_unlock_irqrestore(&adapter->rwi_lock, flags); ret = EBUSY; goto err; } @@@ -2323,8 -2429,6 +2327,6 @@@
rwi = kzalloc(sizeof(*rwi), GFP_ATOMIC); if (!rwi) { - spin_unlock_irqrestore(&adapter->rwi_lock, flags); - ibmvnic_close(netdev); ret = ENOMEM; goto err; } @@@ -2337,12 -2441,17 +2339,17 @@@ } rwi->reset_reason = reason; list_add_tail(&rwi->list, &adapter->rwi_list); - spin_unlock_irqrestore(&adapter->rwi_lock, flags); netdev_dbg(adapter->netdev, "Scheduling reset (reason %d)\n", reason); schedule_work(&adapter->ibmvnic_reset);
- return 0; + ret = 0; err: + /* ibmvnic_close() below can block, so drop the lock first */ + spin_unlock_irqrestore(&adapter->rwi_lock, flags); + + if (ret == ENOMEM) + ibmvnic_close(netdev); + return -ret; }
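The reordered error path above follows the usual rule that nothing which may sleep is called while a spinlock is held: rwi_lock is dropped first, and only then is ibmvnic_close() invoked for the ENOMEM case. A rough user-space analogue of that discipline, with a pthread mutex in place of rwi_lock and a sleeping helper in place of ibmvnic_close() (all names below are invented for the sketch):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static int pending_work;            /* stands in for the rwi_list bookkeeping */

/* Stand-in for a call that may block (as ibmvnic_close() can). */
static void blocking_cleanup(void)
{
    sleep(1);
    puts("cleanup done");
}

static int schedule_work_item(void)
{
    int ret = 0;
    void *item;

    pthread_mutex_lock(&list_lock);
    item = malloc(64);              /* may fail, like a GFP_ATOMIC allocation */
    if (!item) {
        ret = -1;
    } else {
        pending_work++;             /* "list_add_tail" equivalent */
        free(item);
    }
    /* Drop the lock before doing anything that can block. */
    pthread_mutex_unlock(&list_lock);

    if (ret)
        blocking_cleanup();         /* safe: no lock held here */
    return ret;
}

int main(void)
{
    return schedule_work_item() ? 1 : 0;
}

The only point of the shape is that the blocking call sits outside the critical section, exactly as in the rewritten tail of ibmvnic_reset() above.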
@@@ -2410,9 -2519,16 +2417,9 @@@ restart_poll
if (!pending_scrq(adapter, rx_scrq)) break; - /* The queue entry at the current index is peeked at above - * to determine that there is a valid descriptor awaiting - * processing. We want to be sure that the current slot - * holds a valid descriptor before reading its contents. - */ - dma_rmb(); next = ibmvnic_next_scrq(adapter, rx_scrq); - rx_buff = - (struct ibmvnic_rx_buff *)be64_to_cpu(next-> - rx_comp.correlator); + rx_buff = (struct ibmvnic_rx_buff *) + be64_to_cpu(next->rx_comp.correlator); /* do error checking */ if (next->rx_comp.rc) { netdev_dbg(netdev, "rx buffer returned with rc %x\n", @@@ -2433,6 -2549,8 +2440,8 @@@ offset = be16_to_cpu(next->rx_comp.off_frame_data); flags = next->rx_comp.flags; skb = rx_buff->skb; + /* load long_term_buff before copying to skb */ + dma_rmb(); skb_copy_to_linear_data(skb, rx_buff->data + offset, length);
@@@ -2475,6 -2593,7 +2484,6 @@@ if (napi_complete_done(napi, frames_processed)) { enable_scrq_irq(adapter, rx_scrq); if (pending_scrq(adapter, rx_scrq)) { - rmb(); if (napi_reschedule(napi)) { disable_scrq_irq(adapter, rx_scrq); goto restart_poll; @@@ -2603,9 -2722,9 +2612,9 @@@ static void ibmvnic_get_drvinfo(struct { struct ibmvnic_adapter *adapter = netdev_priv(netdev);
- strlcpy(info->driver, ibmvnic_driver_name, sizeof(info->driver)); - strlcpy(info->version, IBMVNIC_DRIVER_VERSION, sizeof(info->version)); - strlcpy(info->fw_version, adapter->fw_version, + strscpy(info->driver, ibmvnic_driver_name, sizeof(info->driver)); + strscpy(info->version, IBMVNIC_DRIVER_VERSION, sizeof(info->version)); + strscpy(info->fw_version, adapter->fw_version, sizeof(info->fw_version)); }
@@@ -2717,6 -2836,7 +2726,6 @@@ static int ibmvnic_set_channels(struct channels->rx_count, channels->tx_count, adapter->req_rx_queues, adapter->req_tx_queues); return ret; - }
static void ibmvnic_get_strings(struct net_device *dev, u32 stringset, u8 *data) @@@ -2805,8 -2925,8 +2814,8 @@@ static void ibmvnic_get_ethtool_stats(s return;
for (i = 0; i < ARRAY_SIZE(ibmvnic_stats); i++) - data[i] = be64_to_cpu(IBMVNIC_GET_STAT(adapter, - ibmvnic_stats[i].offset)); + data[i] = be64_to_cpu(IBMVNIC_GET_STAT + (adapter, ibmvnic_stats[i].offset));
for (j = 0; j < adapter->req_tx_queues; j++) { data[i] = adapter->tx_stats_buffers[j].packets; @@@ -2846,7 -2966,6 +2855,7 @@@ static int ibmvnic_set_priv_flags(struc
return 0; } + static const struct ethtool_ops ibmvnic_ethtool_ops = { .get_drvinfo = ibmvnic_get_drvinfo, .get_msglevel = ibmvnic_get_msglevel, @@@ -3116,7 -3235,7 +3125,7 @@@ static int enable_scrq_irq(struct ibmvn /* H_EOI would fail with rc = H_FUNCTION when running * in XIVE mode which is expected, but not an error. */ - if (rc && (rc != H_FUNCTION)) + if (rc && rc != H_FUNCTION) dev_err(dev, "H_EOI FAILED irq 0x%llx. rc=%ld\n", val, rc); } @@@ -3147,6 -3266,13 +3156,6 @@@ restart_loop int total_bytes = 0; int num_packets = 0;
- /* The queue entry at the current index is peeked at above - * to determine that there is a valid descriptor awaiting - * processing. We want to be sure that the current slot - * holds a valid descriptor before reading its contents. - */ - dma_rmb(); - next = ibmvnic_next_scrq(adapter, scrq); for (i = 0; i < next->tx_comp.num_comps; i++) { if (next->tx_comp.rcs[i]) @@@ -3520,16 -3646,11 +3529,16 @@@ static int pending_scrq(struct ibmvnic_ struct ibmvnic_sub_crq_queue *scrq) { union sub_crq *entry = &scrq->msgs[scrq->cur]; + int rc;
- if (entry->generic.first & IBMVNIC_CRQ_CMD_RSP) - return 1; - else - return 0; + rc = !!(entry->generic.first & IBMVNIC_CRQ_CMD_RSP); + + /* Ensure that the SCRQ valid flag is loaded prior to loading the + * contents of the SCRQ descriptor + */ + dma_rmb(); + + return rc; }
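The dma_rmb() added to pending_scrq() enforces the classic flag-then-payload ordering: the valid bit (IBMVNIC_CRQ_CMD_RSP) is observed first, and only afterwards may the descriptor body be read, matching the comment now carried at the call sites. A rough user-space analogue with C11 atomics, where release/acquire plays the role of dma_wmb()/dma_rmb() (the producer/consumer names are illustrative only):

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

struct desc {
    int payload;
    atomic_int valid;   /* set last by the producer */
};

static struct desc slot;

static void *producer(void *arg)
{
    slot.payload = 42;                              /* fill the descriptor */
    atomic_store_explicit(&slot.valid, 1,
                          memory_order_release);    /* ~ dma_wmb() + valid bit */
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, producer, NULL);

    /* ~ pending_scrq(): test the valid flag with acquire semantics
     * (the counterpart of dma_rmb()) before touching the contents.
     */
    while (!atomic_load_explicit(&slot.valid, memory_order_acquire))
        ;
    printf("payload %d\n", slot.payload);           /* guaranteed to see 42 */

    pthread_join(t, NULL);
    return 0;
}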
static union sub_crq *ibmvnic_next_scrq(struct ibmvnic_adapter *adapter, @@@ -3548,8 -3669,8 +3557,8 @@@ } spin_unlock_irqrestore(&scrq->lock, flags);
- /* Ensure that the entire buffer descriptor has been - * loaded before reading its contents + /* Ensure that the SCRQ valid flag is loaded prior to loading the + * contents of the SCRQ descriptor */ dma_rmb();
@@@ -3599,7 -3720,7 +3608,7 @@@ static int send_subcrq_indirect(struct int rc;
/* Make sure the hypervisor sees the complete request */ - mb(); + dma_wmb(); rc = plpar_hcall_norets(H_SEND_SUB_CRQ_INDIRECT, ua, cpu_to_be64(remote_handle), ioba, num_entries); @@@ -3619,8 -3740,8 +3628,8 @@@ static int ibmvnic_send_crq(struct ibmv int rc;
netdev_dbg(adapter->netdev, "Sending CRQ: %016lx %016lx\n", - (unsigned long int)cpu_to_be64(u64_crq[0]), - (unsigned long int)cpu_to_be64(u64_crq[1])); + (unsigned long)cpu_to_be64(u64_crq[0]), + (unsigned long)cpu_to_be64(u64_crq[1]));
if (!adapter->crq.active && crq->generic.first != IBMVNIC_CRQ_INIT_CMD) { @@@ -3629,7 -3750,7 +3638,7 @@@ }
/* Make sure the hypervisor sees the complete request */ - mb(); + dma_wmb();
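Relaxing mb() to dma_wmb() keeps just the guarantee the comment asks for: every store that fills the CRQ descriptor must be visible before the notifying hypercall that follows, while loads need no ordering here. A fence-based user-space sketch of that publish step (struct and variable names are made up; the doorbell store stands in for plpar_hcall_norets()):

#include <stdatomic.h>
#include <string.h>

struct crq_msg {
    unsigned long long w[2];
};

static struct crq_msg queue_slot;
static atomic_int doorbell;

/* Publish one message: fill the slot, order the stores, then notify. */
static void send_msg(const struct crq_msg *msg)
{
    memcpy(&queue_slot, msg, sizeof(queue_slot));   /* descriptor stores */
    atomic_thread_fence(memory_order_release);      /* ~ dma_wmb() */
    atomic_store_explicit(&doorbell, 1,
                          memory_order_relaxed);    /* ~ the H_SEND_CRQ hcall */
}

int main(void)
{
    struct crq_msg m = { .w = { 0x8001ULL << 56, 0 } };

    send_msg(&m);
    return 0;
}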
rc = plpar_hcall_norets(H_SEND_CRQ, ua, cpu_to_be64(u64_crq[0]), @@@ -3825,15 -3946,15 +3834,15 @@@ static int send_login(struct ibmvnic_ad
for (i = 0; i < adapter->req_tx_queues; i++) { if (adapter->tx_scrq[i]) { - tx_list_p[i] = cpu_to_be64(adapter->tx_scrq[i]-> - crq_num); + tx_list_p[i] = + cpu_to_be64(adapter->tx_scrq[i]->crq_num); } }
for (i = 0; i < adapter->req_rx_queues; i++) { if (adapter->rx_scrq[i]) { - rx_list_p[i] = cpu_to_be64(adapter->rx_scrq[i]-> - crq_num); + rx_list_p[i] = + cpu_to_be64(adapter->rx_scrq[i]->crq_num); } }
@@@ -3849,7 -3970,7 +3858,7 @@@ netdev_dbg(adapter->netdev, "Login Buffer:\n"); for (i = 0; i < (adapter->login_buf_sz - 1) / 8 + 1; i++) { netdev_dbg(adapter->netdev, "%016lx\n", - ((unsigned long int *)(adapter->login_buf))[i]); + ((unsigned long *)(adapter->login_buf))[i]); }
memset(&crq, 0, sizeof(crq)); @@@ -4217,7 -4338,7 +4226,7 @@@ static void handle_query_ip_offload_rsp netdev_dbg(adapter->netdev, "Query IP Offload Buffer:\n"); for (i = 0; i < (sizeof(adapter->ip_offload_buf) - 1) / 8 + 1; i++) netdev_dbg(adapter->netdev, "%016lx\n", - ((unsigned long int *)(buf))[i]); + ((unsigned long *)(buf))[i]);
netdev_dbg(adapter->netdev, "ipv4_chksum = %d\n", buf->ipv4_chksum); netdev_dbg(adapter->netdev, "ipv6_chksum = %d\n", buf->ipv6_chksum); @@@ -4376,8 -4497,8 +4385,8 @@@ static void handle_request_cap_rsp(unio case PARTIALSUCCESS: dev_info(dev, "req=%lld, rsp=%ld in %s queue, retrying.\n", *req_value, - (long int)be64_to_cpu(crq->request_capability_rsp. - number), name); + (long)be64_to_cpu(crq->request_capability_rsp.number), + name);
if (be16_to_cpu(crq->request_capability_rsp.capability) == REQ_MTU) { @@@ -4447,7 -4568,7 +4456,7 @@@ static int handle_login_rsp(union ibmvn netdev_dbg(adapter->netdev, "Login Response Buffer:\n"); for (i = 0; i < (adapter->login_rsp_buf_sz - 1) / 8 + 1; i++) { netdev_dbg(adapter->netdev, "%016lx\n", - ((unsigned long int *)(adapter->login_rsp_buf))[i]); + ((unsigned long *)(adapter->login_rsp_buf))[i]); }
/* Sanity checks */ @@@ -4790,8 -4911,8 +4799,8 @@@ static void ibmvnic_handle_crq(union ib long rc;
netdev_dbg(netdev, "Handling CRQ: %016lx %016lx\n", - (unsigned long int)cpu_to_be64(u64_crq[0]), - (unsigned long int)cpu_to_be64(u64_crq[1])); + (unsigned long)cpu_to_be64(u64_crq[0]), + (unsigned long)cpu_to_be64(u64_crq[1])); switch (gen_crq->first) { case IBMVNIC_CRQ_INIT_RSP: switch (gen_crq->cmd) { @@@ -5265,6 -5386,8 +5274,6 @@@ static int ibmvnic_probe(struct vio_de netdev->ethtool_ops = &ibmvnic_ethtool_ops; SET_NETDEV_DEV(netdev, &dev->dev);
- spin_lock_init(&adapter->stats_lock); - INIT_WORK(&adapter->ibmvnic_reset, __ibmvnic_reset); INIT_DELAYED_WORK(&adapter->ibmvnic_delayed_reset, __ibmvnic_delayed_reset); @@@ -5346,7 -5469,18 +5355,18 @@@ static int ibmvnic_remove(struct vio_de unsigned long flags;
spin_lock_irqsave(&adapter->state_lock, flags); + + /* If ibmvnic_reset() is scheduling a reset, wait for it to + * finish. Then, set the state to REMOVING to prevent it from + * scheduling any more work and to have reset functions ignore + * any resets that have already been scheduled. Drop the lock + * after setting state, so __ibmvnic_reset() which is called + * from the flush_work() below, can make progress. + */ + spin_lock_irqsave(&adapter->rwi_lock, flags); adapter->state = VNIC_REMOVING; + spin_unlock_irqrestore(&adapter->rwi_lock, flags); + spin_unlock_irqrestore(&adapter->state_lock, flags);
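The comment block added to ibmvnic_remove() describes a flag-then-flush shutdown: VNIC_REMOVING is set while rwi_lock is held, so ibmvnic_reset() can no longer queue work past that point, the locks are released, and the flush_work() that follows drains anything already queued. A compact user-space model of that ordering, with a pthread worker standing in for the workqueue (every name here is invented for the illustration):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t sched_lock = PTHREAD_MUTEX_INITIALIZER;
static bool removing;
static bool work_queued;
static pthread_t worker;

static void *reset_work(void *arg)
{
    puts("reset work ran");
    return NULL;
}

/* ~ ibmvnic_reset(): refuse to queue once the device is being removed. */
static int queue_reset(void)
{
    int rc = 0;

    pthread_mutex_lock(&sched_lock);
    if (removing) {
        rc = -1;
    } else if (!work_queued) {
        pthread_create(&worker, NULL, reset_work, NULL);
        work_queued = true;
    }
    pthread_mutex_unlock(&sched_lock);
    return rc;
}

/* ~ ibmvnic_remove(): mark REMOVING under the lock, then flush. */
static void remove_device(void)
{
    pthread_mutex_lock(&sched_lock);
    removing = true;                    /* no new work after this point */
    pthread_mutex_unlock(&sched_lock);

    if (work_queued)
        pthread_join(worker, NULL);     /* ~ flush_work() */
}

int main(void)
{
    queue_reset();
    remove_device();
    printf("late reset rejected: %d\n", queue_reset());
    return 0;
}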
flush_work(&adapter->ibmvnic_reset); diff --combined drivers/net/ethernet/ibm/ibmvnic.h index 270d1cac86a4,72fea3b1c87d..e4dcc63b9710 --- a/drivers/net/ethernet/ibm/ibmvnic.h +++ b/drivers/net/ethernet/ibm/ibmvnic.h @@@ -31,7 -31,7 +31,7 @@@ #define IBMVNIC_BUFFS_PER_POOL 100 #define IBMVNIC_MAX_QUEUES 16 #define IBMVNIC_MAX_QUEUE_SZ 4096 - #define IBMVNIC_MAX_IND_DESCS 128 + #define IBMVNIC_MAX_IND_DESCS 16 #define IBMVNIC_IND_ARR_SZ (IBMVNIC_MAX_IND_DESCS * 32)
#define IBMVNIC_TSO_BUF_SZ 65536 @@@ -845,7 -845,6 +845,7 @@@ struct ibmvnic_crq_queue union ibmvnic_crq *msgs; int size, cur; dma_addr_t msg_token; + /* Used for serialization of msgs, cur */ spinlock_t lock; bool active; char name[32]; @@@ -877,7 -876,6 +877,7 @@@ struct ibmvnic_sub_crq_queue unsigned int irq; unsigned int pool_index; int scrq_num; + /* Used for serialization of msgs, cur */ spinlock_t lock; struct sk_buff *rx_skb_top; struct ibmvnic_adapter *adapter; @@@ -987,6 -985,7 +987,6 @@@ struct ibmvnic_adapter struct ibmvnic_statistics stats; dma_addr_t stats_token; struct completion stats_done; - spinlock_t stats_lock; int replenish_no_mem; int replenish_add_buff_success; int replenish_add_buff_failure; @@@ -1081,12 -1080,10 +1081,14 @@@
struct tasklet_struct tasklet; enum vnic_state state; + /* Used for serializatin of state field */ + spinlock_t state_lock; enum ibmvnic_reset_reason reset_reason; + /* when taking both state and rwi locks, take state lock first */ + spinlock_t rwi_lock; struct list_head rwi_list; + /* Used for serialization of rwi_list */ + spinlock_t rwi_lock; struct work_struct ibmvnic_reset; struct delayed_work ibmvnic_delayed_reset; unsigned long resetting; @@@ -1100,4 -1097,9 +1102,4 @@@
struct ibmvnic_tunables desired; struct ibmvnic_tunables fallback; - - /* Used for serialization of state field. When taking both state - * and rwi locks, take state lock first. - */ - spinlock_t state_lock; }; diff --combined drivers/net/ethernet/mellanox/mlx5/core/devlink.c index aa76a6e0dae8,41474e42a819..d7d8a68ef23d --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@@ -7,8 -7,6 +7,8 @@@ #include "fw_reset.h" #include "fs_core.h" #include "eswitch.h" +#include "sf/dev/dev.h" +#include "sf/sf.h"
static int mlx5_devlink_flash_update(struct devlink *devlink, struct devlink_flash_update_params *params, @@@ -129,18 -127,12 +129,23 @@@ static int mlx5_devlink_reload_down(str struct netlink_ext_ack *extack) { struct mlx5_core_dev *dev = devlink_priv(devlink); + bool sf_dev_allocated; + + sf_dev_allocated = mlx5_sf_dev_allocated(dev); + if (sf_dev_allocated) { + /* Reload results in deleting SF device which further results in + * unregistering devlink instance while holding devlink_mutext. + * Hence, do not support reload. + */ + NL_SET_ERR_MSG_MOD(extack, "reload is unsupported when SFs are allocated\n"); + return -EOPNOTSUPP; + }
+ if (mlx5_lag_is_active(dev)) { + NL_SET_ERR_MSG_MOD(extack, "reload is unsupported in Lag mode\n"); + return -EOPNOTSUPP; + } + switch (action) { case DEVLINK_RELOAD_ACTION_DRIVER_REINIT: mlx5_unload_one(dev, false); @@@ -181,91 -173,6 +186,91 @@@ static int mlx5_devlink_reload_up(struc return 0; }
+static struct mlx5_devlink_trap *mlx5_find_trap_by_id(struct mlx5_core_dev *dev, int trap_id) +{ + struct mlx5_devlink_trap *dl_trap; + + list_for_each_entry(dl_trap, &dev->priv.traps, list) + if (dl_trap->trap.id == trap_id) + return dl_trap; + + return NULL; +} + +static int mlx5_devlink_trap_init(struct devlink *devlink, const struct devlink_trap *trap, + void *trap_ctx) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + struct mlx5_devlink_trap *dl_trap; + + dl_trap = kzalloc(sizeof(*dl_trap), GFP_KERNEL); + if (!dl_trap) + return -ENOMEM; + + dl_trap->trap.id = trap->id; + dl_trap->trap.action = DEVLINK_TRAP_ACTION_DROP; + dl_trap->item = trap_ctx; + + if (mlx5_find_trap_by_id(dev, trap->id)) { + kfree(dl_trap); + mlx5_core_err(dev, "Devlink trap: Trap 0x%x already found", trap->id); + return -EEXIST; + } + + list_add_tail(&dl_trap->list, &dev->priv.traps); + return 0; +} + +static void mlx5_devlink_trap_fini(struct devlink *devlink, const struct devlink_trap *trap, + void *trap_ctx) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + struct mlx5_devlink_trap *dl_trap; + + dl_trap = mlx5_find_trap_by_id(dev, trap->id); + if (!dl_trap) { + mlx5_core_err(dev, "Devlink trap: Missing trap id 0x%x", trap->id); + return; + } + list_del(&dl_trap->list); + kfree(dl_trap); +} + +static int mlx5_devlink_trap_action_set(struct devlink *devlink, + const struct devlink_trap *trap, + enum devlink_trap_action action, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + enum devlink_trap_action action_orig; + struct mlx5_devlink_trap *dl_trap; + int err = 0; + + dl_trap = mlx5_find_trap_by_id(dev, trap->id); + if (!dl_trap) { + mlx5_core_err(dev, "Devlink trap: Set action on invalid trap id 0x%x", trap->id); + err = -EINVAL; + goto out; + } + + if (action != DEVLINK_TRAP_ACTION_DROP && action != DEVLINK_TRAP_ACTION_TRAP) { + err = -EOPNOTSUPP; + goto out; + } + + if (action == dl_trap->trap.action) + goto out; + + action_orig = dl_trap->trap.action; + dl_trap->trap.action = action; + err = mlx5_blocking_notifier_call_chain(dev, MLX5_DRIVER_EVENT_TYPE_TRAP, + &dl_trap->trap); + if (err) + dl_trap->trap.action = action_orig; +out: + return err; +} + static const struct devlink_ops mlx5_devlink_ops = { #ifdef CONFIG_MLX5_ESWITCH .eswitch_mode_set = mlx5_devlink_eswitch_mode_set, @@@ -276,12 -183,6 +281,12 @@@ .eswitch_encap_mode_get = mlx5_devlink_eswitch_encap_mode_get, .port_function_hw_addr_get = mlx5_devlink_port_function_hw_addr_get, .port_function_hw_addr_set = mlx5_devlink_port_function_hw_addr_set, +#endif +#ifdef CONFIG_MLX5_SF_MANAGER + .port_new = mlx5_devlink_sf_port_new, + .port_del = mlx5_devlink_sf_port_del, + .port_fn_state_get = mlx5_devlink_sf_port_fn_state_get, + .port_fn_state_set = mlx5_devlink_sf_port_fn_state_set, #endif .flash_update = mlx5_devlink_flash_update, .info_get = mlx5_devlink_info_get, @@@ -290,59 -191,8 +295,59 @@@ .reload_limits = BIT(DEVLINK_RELOAD_LIMIT_NO_RESET), .reload_down = mlx5_devlink_reload_down, .reload_up = mlx5_devlink_reload_up, + .trap_init = mlx5_devlink_trap_init, + .trap_fini = mlx5_devlink_trap_fini, + .trap_action_set = mlx5_devlink_trap_action_set, };
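mlx5_devlink_trap_action_set() applies the requested action optimistically, notifies the rest of the driver through the blocking notifier chain, and restores the saved action if any consumer refuses, so the cached state never drifts from what was actually accepted. (User space reaches this path through the devlink trap interface, roughly `devlink trap set <dev> trap <name> action trap|drop`.) A minimal stand-alone rendering of that set-then-rollback pattern, with a dummy consumer in place of the notifier chain:

#include <stdio.h>

enum trap_action { ACTION_DROP, ACTION_TRAP };

struct trap_state {
    enum trap_action action;
};

/* Stand-in for mlx5_blocking_notifier_call_chain(): pretend the
 * consumer rejects ACTION_TRAP to exercise the rollback path.
 */
static int notify_consumers(const struct trap_state *t)
{
    return t->action == ACTION_TRAP ? -1 : 0;
}

static int trap_action_set(struct trap_state *t, enum trap_action new_action)
{
    enum trap_action orig = t->action;
    int err;

    if (new_action == t->action)
        return 0;                   /* nothing to do */

    t->action = new_action;         /* optimistic update */
    err = notify_consumers(t);
    if (err)
        t->action = orig;           /* roll back on failure */
    return err;
}

int main(void)
{
    struct trap_state t = { .action = ACTION_DROP };

    if (trap_action_set(&t, ACTION_TRAP))
        printf("rejected, action stays %d\n", t.action);
    return 0;
}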
+void mlx5_devlink_trap_report(struct mlx5_core_dev *dev, int trap_id, struct sk_buff *skb, + struct devlink_port *dl_port) +{ + struct devlink *devlink = priv_to_devlink(dev); + struct mlx5_devlink_trap *dl_trap; + + dl_trap = mlx5_find_trap_by_id(dev, trap_id); + if (!dl_trap) { + mlx5_core_err(dev, "Devlink trap: Report on invalid trap id 0x%x", trap_id); + return; + } + + if (dl_trap->trap.action != DEVLINK_TRAP_ACTION_TRAP) { + mlx5_core_dbg(dev, "Devlink trap: Trap id %d has action %d", trap_id, + dl_trap->trap.action); + return; + } + devlink_trap_report(devlink, skb, dl_trap->item, dl_port, NULL); +} + +int mlx5_devlink_trap_get_num_active(struct mlx5_core_dev *dev) +{ + struct mlx5_devlink_trap *dl_trap; + int count = 0; + + list_for_each_entry(dl_trap, &dev->priv.traps, list) + if (dl_trap->trap.action == DEVLINK_TRAP_ACTION_TRAP) + count++; + + return count; +} + +int mlx5_devlink_traps_get_action(struct mlx5_core_dev *dev, int trap_id, + enum devlink_trap_action *action) +{ + struct mlx5_devlink_trap *dl_trap; + + dl_trap = mlx5_find_trap_by_id(dev, trap_id); + if (!dl_trap) { + mlx5_core_err(dev, "Devlink trap: Get action on invalid trap id 0x%x", + trap_id); + return -EINVAL; + } + + *action = dl_trap->trap.action; + return 0; +} + struct devlink *mlx5_devlink_alloc(void) { return devlink_alloc(&mlx5_devlink_ops, sizeof(struct mlx5_core_dev)); @@@ -428,6 -278,10 +433,10 @@@ static int mlx5_devlink_enable_roce_val NL_SET_ERR_MSG_MOD(extack, "Device doesn't support RoCE"); return -EOPNOTSUPP; } + if (mlx5_core_is_mp_slave(dev) || mlx5_lag_is_active(dev)) { + NL_SET_ERR_MSG_MOD(extack, "Multi port slave/Lag device can't configure RoCE"); + return -EOPNOTSUPP; + }
return 0; } @@@ -513,49 -367,6 +522,49 @@@ static void mlx5_devlink_set_params_ini #endif }
+#define MLX5_TRAP_DROP(_id, _group_id) \ + DEVLINK_TRAP_GENERIC(DROP, DROP, _id, \ + DEVLINK_TRAP_GROUP_GENERIC_ID_##_group_id, \ + DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT) + +static const struct devlink_trap mlx5_traps_arr[] = { + MLX5_TRAP_DROP(INGRESS_VLAN_FILTER, L2_DROPS), + MLX5_TRAP_DROP(DMAC_FILTER, L2_DROPS), +}; + +static const struct devlink_trap_group mlx5_trap_groups_arr[] = { + DEVLINK_TRAP_GROUP_GENERIC(L2_DROPS, 0), +}; + +static int mlx5_devlink_traps_register(struct devlink *devlink) +{ + struct mlx5_core_dev *core_dev = devlink_priv(devlink); + int err; + + err = devlink_trap_groups_register(devlink, mlx5_trap_groups_arr, + ARRAY_SIZE(mlx5_trap_groups_arr)); + if (err) + return err; + + err = devlink_traps_register(devlink, mlx5_traps_arr, ARRAY_SIZE(mlx5_traps_arr), + &core_dev->priv); + if (err) + goto err_trap_group; + return 0; + +err_trap_group: + devlink_trap_groups_unregister(devlink, mlx5_trap_groups_arr, + ARRAY_SIZE(mlx5_trap_groups_arr)); + return err; +} + +static void mlx5_devlink_traps_unregister(struct devlink *devlink) +{ + devlink_traps_unregister(devlink, mlx5_traps_arr, ARRAY_SIZE(mlx5_traps_arr)); + devlink_trap_groups_unregister(devlink, mlx5_trap_groups_arr, + ARRAY_SIZE(mlx5_trap_groups_arr)); +} + int mlx5_devlink_register(struct devlink *devlink, struct device *dev) { int err; @@@ -570,16 -381,8 +579,16 @@@ goto params_reg_err; mlx5_devlink_set_params_init_values(devlink); devlink_params_publish(devlink); + + err = mlx5_devlink_traps_register(devlink); + if (err) + goto traps_reg_err; + return 0;
+traps_reg_err: + devlink_params_unregister(devlink, mlx5_devlink_params, + ARRAY_SIZE(mlx5_devlink_params)); params_reg_err: devlink_unregister(devlink); return err; @@@ -587,7 -390,6 +596,7 @@@
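The new traps_reg_err label slots into the existing goto-style unwinding: each failure label undoes exactly the steps that succeeded before it, in reverse order of setup. A self-contained illustration of that shape, with placeholder register_/unregister_ helpers (none of these names exist in mlx5):

#include <stdio.h>

static int  register_core(void)     { puts("core registered");   return 0; }
static void unregister_core(void)   { puts("core unregistered"); }
static int  register_params(void)   { puts("params registered"); return 0; }
static void unregister_params(void) { puts("params unregistered"); }
static int  register_traps(void)    { puts("traps registered");  return -1; /* simulate failure */ }

static int register_all(void)
{
    int err;

    err = register_core();
    if (err)
        return err;

    err = register_params();
    if (err)
        goto err_params;

    err = register_traps();
    if (err)
        goto err_traps;

    return 0;

err_traps:
    unregister_params();    /* undo in reverse order of setup */
err_params:
    unregister_core();
    return err;
}

int main(void)
{
    return register_all() ? 1 : 0;
}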
void mlx5_devlink_unregister(struct devlink *devlink) { + mlx5_devlink_traps_unregister(devlink); devlink_params_unregister(devlink, mlx5_devlink_params, ARRAY_SIZE(mlx5_devlink_params)); devlink_unregister(devlink); diff --combined drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index 0b503ebe59ec,24e2c0d955b9..f3f6eb081948 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@@ -12,6 -12,7 +12,7 @@@ #include <net/flow_offload.h> #include <net/netfilter/nf_flow_table.h> #include <linux/workqueue.h> + #include <linux/refcount.h> #include <linux/xarray.h>
#include "lib/fs_chains.h" @@@ -27,7 -28,6 +28,7 @@@ #define MLX5_CT_STATE_ESTABLISHED_BIT BIT(1) #define MLX5_CT_STATE_TRK_BIT BIT(2) #define MLX5_CT_STATE_NAT_BIT BIT(3) +#define MLX5_CT_STATE_REPLY_BIT BIT(4)
#define MLX5_FTE_ID_BITS (mlx5e_tc_attr_to_reg_mappings[FTEID_TO_REG].mlen * 8) #define MLX5_FTE_ID_MAX GENMASK(MLX5_FTE_ID_BITS - 1, 0) @@@ -52,11 -52,11 +53,11 @@@ struct mlx5_tc_ct_priv struct mlx5_flow_table *ct_nat; struct mlx5_flow_table *post_ct; struct mutex control_lock; /* guards parallel adds/dels */ - struct mutex shared_counter_lock; struct mapping_ctx *zone_mapping; struct mapping_ctx *labels_mapping; enum mlx5_flow_namespace_type ns_type; struct mlx5_fs_chains *chains; + spinlock_t ht_lock; /* protects ft entries */ };
struct mlx5_ct_flow { @@@ -125,6 -125,10 +126,10 @@@ struct mlx5_ct_counter bool is_shared; };
+ enum { + MLX5_CT_ENTRY_FLAG_VALID, + }; + struct mlx5_ct_entry { struct rhash_head node; struct rhash_head tuple_node; @@@ -135,6 -139,12 +140,12 @@@ struct mlx5_ct_tuple tuple; struct mlx5_ct_tuple tuple_nat; struct mlx5_ct_zone_rule zone_rules[2]; + + struct mlx5_tc_ct_priv *ct_priv; + struct work_struct work; + + refcount_t refcnt; + unsigned long flags; };
static const struct rhashtable_params cts_ht_params = { @@@ -642,7 -652,6 +653,7 @@@ mlx5_tc_ct_entry_create_mod_hdr(struct }
ct_state |= MLX5_CT_STATE_ESTABLISHED_BIT | MLX5_CT_STATE_TRK_BIT; + ct_state |= meta->ct_metadata.orig_dir ? 0 : MLX5_CT_STATE_REPLY_BIT; err = mlx5_tc_ct_entry_set_registers(ct_priv, &mod_acts, ct_state, meta->ct_metadata.mark, @@@ -711,11 -720,11 +722,11 @@@ mlx5_tc_ct_entry_add_rule(struct mlx5_t attr->outer_match_level = MLX5_MATCH_L4; attr->counter = entry->counter->counter; attr->flags |= MLX5_ESW_ATTR_FLAG_NO_IN_PORT; + if (ct_priv->ns_type == MLX5_FLOW_NAMESPACE_FDB) + attr->esw_attr->in_mdev = priv->mdev;
mlx5_tc_ct_set_tuple_match(netdev_priv(ct_priv->netdev), spec, flow_rule); - mlx5e_tc_match_to_reg_match(spec, ZONE_TO_REG, - entry->tuple.zone & MLX5_CT_ZONE_MASK, - MLX5_CT_ZONE_MASK); + mlx5e_tc_match_to_reg_match(spec, ZONE_TO_REG, entry->tuple.zone, MLX5_CT_ZONE_MASK);
zone_rule->rule = mlx5_tc_rule_insert(priv, spec, attr); if (IS_ERR(zone_rule->rule)) { @@@ -742,6 -751,87 +753,87 @@@ err_attr return err; }
+ static bool + mlx5_tc_ct_entry_valid(struct mlx5_ct_entry *entry) + { + return test_bit(MLX5_CT_ENTRY_FLAG_VALID, &entry->flags); + } + + static struct mlx5_ct_entry * + mlx5_tc_ct_entry_get(struct mlx5_tc_ct_priv *ct_priv, struct mlx5_ct_tuple *tuple) + { + struct mlx5_ct_entry *entry; + + entry = rhashtable_lookup_fast(&ct_priv->ct_tuples_ht, tuple, + tuples_ht_params); + if (entry && mlx5_tc_ct_entry_valid(entry) && + refcount_inc_not_zero(&entry->refcnt)) { + return entry; + } else if (!entry) { + entry = rhashtable_lookup_fast(&ct_priv->ct_tuples_nat_ht, + tuple, tuples_nat_ht_params); + if (entry && mlx5_tc_ct_entry_valid(entry) && + refcount_inc_not_zero(&entry->refcnt)) + return entry; + } + + return entry ? ERR_PTR(-EINVAL) : NULL; + } + + static void mlx5_tc_ct_entry_remove_from_tuples(struct mlx5_ct_entry *entry) + { + struct mlx5_tc_ct_priv *ct_priv = entry->ct_priv; + + rhashtable_remove_fast(&ct_priv->ct_tuples_nat_ht, + &entry->tuple_nat_node, + tuples_nat_ht_params); + rhashtable_remove_fast(&ct_priv->ct_tuples_ht, &entry->tuple_node, + tuples_ht_params); + } + + static void mlx5_tc_ct_entry_del(struct mlx5_ct_entry *entry) + { + struct mlx5_tc_ct_priv *ct_priv = entry->ct_priv; + + mlx5_tc_ct_entry_del_rules(ct_priv, entry); + + spin_lock_bh(&ct_priv->ht_lock); + mlx5_tc_ct_entry_remove_from_tuples(entry); + spin_unlock_bh(&ct_priv->ht_lock); + + mlx5_tc_ct_counter_put(ct_priv, entry); + kfree(entry); + } + + static void + mlx5_tc_ct_entry_put(struct mlx5_ct_entry *entry) + { + if (!refcount_dec_and_test(&entry->refcnt)) + return; + + mlx5_tc_ct_entry_del(entry); + } + + static void mlx5_tc_ct_entry_del_work(struct work_struct *work) + { + struct mlx5_ct_entry *entry = container_of(work, struct mlx5_ct_entry, work); + + mlx5_tc_ct_entry_del(entry); + } + + static void + __mlx5_tc_ct_entry_put(struct mlx5_ct_entry *entry) + { + struct mlx5e_priv *priv; + + if (!refcount_dec_and_test(&entry->refcnt)) + return; + + priv = netdev_priv(entry->ct_priv->netdev); + INIT_WORK(&entry->work, mlx5_tc_ct_entry_del_work); + queue_work(priv->wq, &entry->work); + } + static struct mlx5_ct_counter * mlx5_tc_ct_counter_create(struct mlx5_tc_ct_priv *ct_priv) { @@@ -772,6 -862,7 +864,6 @@@ mlx5_tc_ct_shared_counter_get(struct ml struct mlx5_ct_counter *shared_counter; struct mlx5_ct_entry *rev_entry; __be16 tmp_port; - int ret;
/* get the reversed tuple */ tmp_port = rev_tuple.port.src; @@@ -793,20 -884,32 +885,30 @@@ }
/* Use the same counter as the reverse direction */ - mutex_lock(&ct_priv->shared_counter_lock); - rev_entry = rhashtable_lookup_fast(&ct_priv->ct_tuples_ht, &rev_tuple, - tuples_ht_params); - if (rev_entry) { - if (refcount_inc_not_zero(&rev_entry->counter->refcount)) { - mutex_unlock(&ct_priv->shared_counter_lock); - return rev_entry->counter; - } + spin_lock_bh(&ct_priv->ht_lock); + rev_entry = mlx5_tc_ct_entry_get(ct_priv, &rev_tuple); + + if (IS_ERR(rev_entry)) { + spin_unlock_bh(&ct_priv->ht_lock); + goto create_counter; } - mutex_unlock(&ct_priv->shared_counter_lock); + + if (rev_entry && refcount_inc_not_zero(&rev_entry->counter->refcount)) { + ct_dbg("Using shared counter entry=0x%p rev=0x%p\n", entry, rev_entry); + shared_counter = rev_entry->counter; + spin_unlock_bh(&ct_priv->ht_lock); + + mlx5_tc_ct_entry_put(rev_entry); + return shared_counter; + } + + spin_unlock_bh(&ct_priv->ht_lock); + + create_counter:
shared_counter = mlx5_tc_ct_counter_create(ct_priv); - if (IS_ERR(shared_counter)) { - ret = PTR_ERR(shared_counter); - return ERR_PTR(ret); - } + if (IS_ERR(shared_counter)) + return shared_counter;
shared_counter->is_shared = true; refcount_set(&shared_counter->refcount, 1); @@@ -865,10 -968,14 +967,14 @@@ mlx5_tc_ct_block_flow_offload_add(struc if (!meta_action) return -EOPNOTSUPP;
- entry = rhashtable_lookup_fast(&ft->ct_entries_ht, &cookie, - cts_ht_params); - if (entry) - return 0; + spin_lock_bh(&ct_priv->ht_lock); + entry = rhashtable_lookup_fast(&ft->ct_entries_ht, &cookie, cts_ht_params); + if (entry && refcount_inc_not_zero(&entry->refcnt)) { + spin_unlock_bh(&ct_priv->ht_lock); + mlx5_tc_ct_entry_put(entry); + return -EEXIST; + } + spin_unlock_bh(&ct_priv->ht_lock);
entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) @@@ -877,6 -984,8 +983,8 @@@ entry->tuple.zone = ft->zone; entry->cookie = flow->cookie; entry->restore_cookie = meta_action->ct_metadata.cookie; + refcount_set(&entry->refcnt, 2); + entry->ct_priv = ct_priv;
err = mlx5_tc_ct_rule_to_tuple(&entry->tuple, flow_rule); if (err) @@@ -887,35 -996,40 +995,40 @@@ if (err) goto err_set;
- err = rhashtable_insert_fast(&ct_priv->ct_tuples_ht, - &entry->tuple_node, - tuples_ht_params); + spin_lock_bh(&ct_priv->ht_lock); + + err = rhashtable_lookup_insert_fast(&ft->ct_entries_ht, &entry->node, + cts_ht_params); + if (err) + goto err_entries; + + err = rhashtable_lookup_insert_fast(&ct_priv->ct_tuples_ht, + &entry->tuple_node, + tuples_ht_params); if (err) goto err_tuple;
if (memcmp(&entry->tuple, &entry->tuple_nat, sizeof(entry->tuple))) { - err = rhashtable_insert_fast(&ct_priv->ct_tuples_nat_ht, - &entry->tuple_nat_node, - tuples_nat_ht_params); + err = rhashtable_lookup_insert_fast(&ct_priv->ct_tuples_nat_ht, + &entry->tuple_nat_node, + tuples_nat_ht_params); if (err) goto err_tuple_nat; } + spin_unlock_bh(&ct_priv->ht_lock);
err = mlx5_tc_ct_entry_add_rules(ct_priv, flow_rule, entry, ft->zone_restore_id); if (err) goto err_rules;
- err = rhashtable_insert_fast(&ft->ct_entries_ht, &entry->node, - cts_ht_params); - if (err) - goto err_insert; + set_bit(MLX5_CT_ENTRY_FLAG_VALID, &entry->flags); + mlx5_tc_ct_entry_put(entry); /* this function reference */
return 0;
- err_insert: - mlx5_tc_ct_entry_del_rules(ct_priv, entry); err_rules: + spin_lock_bh(&ct_priv->ht_lock); if (mlx5_tc_ct_entry_has_nat(entry)) rhashtable_remove_fast(&ct_priv->ct_tuples_nat_ht, &entry->tuple_nat_node, tuples_nat_ht_params); @@@ -924,47 -1038,43 +1037,43 @@@ err_tuple_nat &entry->tuple_node, tuples_ht_params); err_tuple: + rhashtable_remove_fast(&ft->ct_entries_ht, + &entry->node, + cts_ht_params); + err_entries: + spin_unlock_bh(&ct_priv->ht_lock); err_set: kfree(entry); - netdev_warn(ct_priv->netdev, - "Failed to offload ct entry, err: %d\n", err); + if (err != -EEXIST) + netdev_warn(ct_priv->netdev, "Failed to offload ct entry, err: %d\n", err); return err; }
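The rewritten offload_add path sets the entry's refcount to 2 up front, one reference owned by the hash tables and one held by the function itself, inserts the entry under ht_lock, marks it MLX5_CT_ENTRY_FLAG_VALID only once the hardware rules exist, and finally drops its own reference; readers then use a "take a reference only if still valid and alive" lookup. A simplified user-space sketch of that lifetime scheme with C11 atomics, collapsing the rhashtables to a single slot (names are illustrative):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
    atomic_int refcnt;
    atomic_bool valid;      /* set only after the entry is fully set up */
    int payload;
};

static struct entry *table_slot;    /* stands in for the rhashtables */

static void entry_put(struct entry *e)
{
    if (atomic_fetch_sub(&e->refcnt, 1) == 1) {
        puts("freeing entry");
        free(e);
    }
}

/* "inc_not_zero": take a reference only if the entry is valid and alive. */
static struct entry *entry_get(struct entry *e)
{
    int old;

    if (!e || !atomic_load(&e->valid))
        return NULL;
    old = atomic_load(&e->refcnt);
    while (old != 0) {
        if (atomic_compare_exchange_weak(&e->refcnt, &old, old + 1))
            return e;
    }
    return NULL;
}

static struct entry *entry_add(int payload)
{
    struct entry *e = calloc(1, sizeof(*e));

    if (!e)
        return NULL;
    atomic_init(&e->refcnt, 2);     /* one ref for the table, one for us */
    e->payload = payload;
    table_slot = e;                 /* "insert" */
    atomic_store(&e->valid, true);  /* publish only when fully set up */
    entry_put(e);                   /* drop this function's reference */
    return table_slot;
}

int main(void)
{
    struct entry *e, *ref;

    e = entry_add(7);
    if (!e)
        return 1;
    ref = entry_get(e);
    if (ref) {
        printf("payload %d\n", ref->payload);
        entry_put(ref);
    }
    table_slot = NULL;
    entry_put(e);                   /* table teardown drops the last reference */
    return 0;
}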
- static void - mlx5_tc_ct_del_ft_entry(struct mlx5_tc_ct_priv *ct_priv, - struct mlx5_ct_entry *entry) - { - mlx5_tc_ct_entry_del_rules(ct_priv, entry); - mutex_lock(&ct_priv->shared_counter_lock); - if (mlx5_tc_ct_entry_has_nat(entry)) - rhashtable_remove_fast(&ct_priv->ct_tuples_nat_ht, - &entry->tuple_nat_node, - tuples_nat_ht_params); - rhashtable_remove_fast(&ct_priv->ct_tuples_ht, &entry->tuple_node, - tuples_ht_params); - mutex_unlock(&ct_priv->shared_counter_lock); - mlx5_tc_ct_counter_put(ct_priv, entry); - - } - static int mlx5_tc_ct_block_flow_offload_del(struct mlx5_ct_ft *ft, struct flow_cls_offload *flow) { + struct mlx5_tc_ct_priv *ct_priv = ft->ct_priv; unsigned long cookie = flow->cookie; struct mlx5_ct_entry *entry;
- entry = rhashtable_lookup_fast(&ft->ct_entries_ht, &cookie, - cts_ht_params); - if (!entry) + spin_lock_bh(&ct_priv->ht_lock); + entry = rhashtable_lookup_fast(&ft->ct_entries_ht, &cookie, cts_ht_params); + if (!entry) { + spin_unlock_bh(&ct_priv->ht_lock); return -ENOENT; + }
- mlx5_tc_ct_del_ft_entry(ft->ct_priv, entry); - WARN_ON(rhashtable_remove_fast(&ft->ct_entries_ht, - &entry->node, - cts_ht_params)); - kfree(entry); + if (!mlx5_tc_ct_entry_valid(entry)) { + spin_unlock_bh(&ct_priv->ht_lock); + return -EINVAL; + } + + rhashtable_remove_fast(&ft->ct_entries_ht, &entry->node, cts_ht_params); + mlx5_tc_ct_entry_remove_from_tuples(entry); + spin_unlock_bh(&ct_priv->ht_lock); + + mlx5_tc_ct_entry_put(entry);
return 0; } @@@ -973,19 -1083,30 +1082,30 @@@ static in mlx5_tc_ct_block_flow_offload_stats(struct mlx5_ct_ft *ft, struct flow_cls_offload *f) { + struct mlx5_tc_ct_priv *ct_priv = ft->ct_priv; unsigned long cookie = f->cookie; struct mlx5_ct_entry *entry; u64 lastuse, packets, bytes;
- entry = rhashtable_lookup_fast(&ft->ct_entries_ht, &cookie, - cts_ht_params); - if (!entry) + spin_lock_bh(&ct_priv->ht_lock); + entry = rhashtable_lookup_fast(&ft->ct_entries_ht, &cookie, cts_ht_params); + if (!entry) { + spin_unlock_bh(&ct_priv->ht_lock); return -ENOENT; + } + + if (!mlx5_tc_ct_entry_valid(entry) || !refcount_inc_not_zero(&entry->refcnt)) { + spin_unlock_bh(&ct_priv->ht_lock); + return -EINVAL; + } + + spin_unlock_bh(&ct_priv->ht_lock);
mlx5_fc_query_cached(entry->counter->counter, &bytes, &packets, &lastuse); flow_stats_update(&f->stats, bytes, packets, 0, lastuse, FLOW_ACTION_HW_STATS_DELAYED);
+ mlx5_tc_ct_entry_put(entry); return 0; }
@@@ -1087,8 -1208,8 +1207,8 @@@ mlx5_tc_ct_match_add(struct mlx5_tc_ct_ struct netlink_ext_ack *extack) { struct flow_rule *rule = flow_cls_offload_flow_rule(f); + bool trk, est, untrk, unest, new, rpl, unrpl; struct flow_dissector_key_ct *mask, *key; - bool trk, est, untrk, unest, new; u32 ctstate = 0, ctstate_mask = 0; u16 ct_state_on, ct_state_off; u16 ct_state, ct_state_mask; @@@ -1114,10 -1235,9 +1234,10 @@@
if (ct_state_mask & ~(TCA_FLOWER_KEY_CT_FLAGS_TRACKED | TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED | - TCA_FLOWER_KEY_CT_FLAGS_NEW)) { + TCA_FLOWER_KEY_CT_FLAGS_NEW | + TCA_FLOWER_KEY_CT_FLAGS_REPLY)) { NL_SET_ERR_MSG_MOD(extack, - "only ct_state trk, est and new are supported for offload"); + "only ct_state trk, est, new and rpl are supported for offload"); return -EOPNOTSUPP; }
@@@ -1126,17 -1246,13 +1246,17 @@@ trk = ct_state_on & TCA_FLOWER_KEY_CT_FLAGS_TRACKED; new = ct_state_on & TCA_FLOWER_KEY_CT_FLAGS_NEW; est = ct_state_on & TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED; + rpl = ct_state_on & TCA_FLOWER_KEY_CT_FLAGS_REPLY; untrk = ct_state_off & TCA_FLOWER_KEY_CT_FLAGS_TRACKED; unest = ct_state_off & TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED; + unrpl = ct_state_off & TCA_FLOWER_KEY_CT_FLAGS_REPLY;
ctstate |= trk ? MLX5_CT_STATE_TRK_BIT : 0; ctstate |= est ? MLX5_CT_STATE_ESTABLISHED_BIT : 0; + ctstate |= rpl ? MLX5_CT_STATE_REPLY_BIT : 0; ctstate_mask |= (untrk || trk) ? MLX5_CT_STATE_TRK_BIT : 0; ctstate_mask |= (unest || est) ? MLX5_CT_STATE_ESTABLISHED_BIT : 0; + ctstate_mask |= (unrpl || rpl) ? MLX5_CT_STATE_REPLY_BIT : 0;
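With the REPLY bit added, the value/mask pair is assembled the same way as for trk/est: the value carries the bits that must be set, and the mask carries every bit the filter mentions, asserted or negated. Assuming the bit assignments above (TRK=BIT(2), EST=BIT(1), REPLY=BIT(4)), a filter along the lines of ct_state +trk+est-rpl should come out as value 0x6, mask 0x16; a tiny stand-alone check of that arithmetic:

#include <stdio.h>

#define CT_STATE_ESTABLISHED_BIT (1u << 1)  /* 0x02 */
#define CT_STATE_TRK_BIT         (1u << 2)  /* 0x04 */
#define CT_STATE_REPLY_BIT       (1u << 4)  /* 0x10 */

int main(void)
{
    /* tc filter ... ct_state +trk+est-rpl */
    int trk = 1, est = 1, rpl = 0;          /* asserted flags */
    int untrk = 0, unest = 0, unrpl = 1;    /* negated flags */
    unsigned int ctstate = 0, ctstate_mask = 0;

    ctstate |= trk ? CT_STATE_TRK_BIT : 0;
    ctstate |= est ? CT_STATE_ESTABLISHED_BIT : 0;
    ctstate |= rpl ? CT_STATE_REPLY_BIT : 0;
    ctstate_mask |= (untrk || trk) ? CT_STATE_TRK_BIT : 0;
    ctstate_mask |= (unest || est) ? CT_STATE_ESTABLISHED_BIT : 0;
    ctstate_mask |= (unrpl || rpl) ? CT_STATE_REPLY_BIT : 0;

    printf("value 0x%x mask 0x%x\n", ctstate, ctstate_mask);   /* 0x6 0x16 */
    return 0;
}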
if (new) { NL_SET_ERR_MSG_MOD(extack, @@@ -1251,8 -1367,9 +1371,8 @@@ static int tc_ct_pre_ct_add_rules(struc pre_ct->flow_rule = rule;
/* add miss rule */ - memset(spec, 0, sizeof(*spec)); dest.ft = nat ? ct_priv->ct_nat : ct_priv->ct; - rule = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1); + rule = mlx5_add_flow_rules(ft, NULL, &flow_act, &dest, 1); if (IS_ERR(rule)) { err = PTR_ERR(rule); ct_dbg("Failed to add pre ct miss rule zone %d", zone); @@@ -1481,11 -1598,9 +1601,9 @@@ err_mapping static void mlx5_tc_ct_flush_ft_entry(void *ptr, void *arg) { - struct mlx5_tc_ct_priv *ct_priv = arg; struct mlx5_ct_entry *entry = ptr;
- mlx5_tc_ct_del_ft_entry(ct_priv, entry); - kfree(entry); + mlx5_tc_ct_entry_put(entry); }
static void @@@ -1763,6 -1878,7 +1881,6 @@@ __mlx5_tc_ct_flow_offload_clear(struct goto err_set_registers; }
- dealloc_mod_hdr_actions(mod_acts); pre_ct_attr->modify_hdr = mod_hdr; pre_ct_attr->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
@@@ -1962,6 -2078,7 +2080,7 @@@ mlx5_tc_ct_init(struct mlx5e_priv *priv goto err_mapping_labels; }
+ spin_lock_init(&ct_priv->ht_lock); ct_priv->ns_type = ns_type; ct_priv->chains = chains; ct_priv->netdev = priv->netdev; @@@ -1996,7 -2113,6 +2115,6 @@@
idr_init(&ct_priv->fte_ids); mutex_init(&ct_priv->control_lock); - mutex_init(&ct_priv->shared_counter_lock); rhashtable_init(&ct_priv->zone_ht, &zone_params); rhashtable_init(&ct_priv->ct_tuples_ht, &tuples_ht_params); rhashtable_init(&ct_priv->ct_tuples_nat_ht, &tuples_nat_ht_params); @@@ -2039,7 -2155,6 +2157,6 @@@ mlx5_tc_ct_clean(struct mlx5_tc_ct_pri rhashtable_destroy(&ct_priv->ct_tuples_nat_ht); rhashtable_destroy(&ct_priv->zone_ht); mutex_destroy(&ct_priv->control_lock); - mutex_destroy(&ct_priv->shared_counter_lock); idr_destroy(&ct_priv->fte_ids); kfree(ct_priv); } @@@ -2061,14 -2176,22 +2178,22 @@@ mlx5e_tc_ct_restore_flow(struct mlx5_tc if (!mlx5_tc_ct_skb_to_tuple(skb, &tuple, zone)) return false;
- entry = rhashtable_lookup_fast(&ct_priv->ct_tuples_ht, &tuple, - tuples_ht_params); - if (!entry) - entry = rhashtable_lookup_fast(&ct_priv->ct_tuples_nat_ht, - &tuple, tuples_nat_ht_params); - if (!entry) + spin_lock(&ct_priv->ht_lock); + + entry = mlx5_tc_ct_entry_get(ct_priv, &tuple); + if (!entry) { + spin_unlock(&ct_priv->ht_lock); + return false; + } + + if (IS_ERR(entry)) { + spin_unlock(&ct_priv->ht_lock); return false; + } + spin_unlock(&ct_priv->ht_lock);
tcf_ct_flow_table_restore_skb(skb, entry->restore_cookie); + __mlx5_tc_ct_entry_put(entry); + return true; } diff --combined drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h index 959bb6cd7203,ff81b69a59a9..cc0efac7b812 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h @@@ -85,7 -85,7 +85,7 @@@ mlx5e_tx_tunnel_accel(struct sk_buff *s }
mlx5e_set_eseg_swp(skb, eseg, &swp_spec); - if (skb_vlan_tag_present(skb) && ihs) + if (skb_vlan_tag_present(skb) && ihs) mlx5e_eseg_swp_offsets_add_vlan(eseg); }
@@@ -144,9 -144,9 +144,9 @@@ static inline bool mlx5e_accel_tx_is_ip { #ifdef CONFIG_MLX5_EN_IPSEC return mlx5e_ipsec_is_tx_flow(&state->ipsec); -#endif - +#else return false; +#endif }
static inline unsigned int mlx5e_accel_tx_ids_len(struct mlx5e_txqsq *sq, @@@ -173,7 -173,7 +173,7 @@@ static inline bool mlx5e_accel_tx_eseg( #endif
#if IS_ENABLED(CONFIG_GENEVE) - if (skb->encapsulation) + if (skb->encapsulation && skb->ip_summed == CHECKSUM_PARTIAL) mlx5e_tx_tunnel_accel(skb, eseg, ihs); #endif
diff --combined drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 5e9474dba4e5,8612c388db7d..abdf721bb264 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@@ -447,17 -447,6 +447,17 @@@ int mlx5e_ethtool_set_channels(struct m goto out; }
+ /* Don't allow changing the number of channels if HTB offload is active, + * because the numeration of the QoS SQs will change, while per-queue + * qdiscs are attached. + */ + if (priv->htb.maj_id) { + err = -EINVAL; + netdev_err(priv->netdev, "%s: HTB offload is active, cannot change the number of channels\n", + __func__); + goto out; + } + new_channels.params = *cur_params; new_channels.params.num_channels = count;
@@@ -536,7 -525,7 +536,7 @@@ static int mlx5e_get_coalesce(struct ne #define MLX5E_MAX_COAL_FRAMES MLX5_MAX_CQ_COUNT
static void - mlx5e_set_priv_channels_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesce *coal) + mlx5e_set_priv_channels_tx_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesce *coal) { struct mlx5_core_dev *mdev = priv->mdev; int tc; @@@ -551,6 -540,17 +551,17 @@@ coal->tx_coalesce_usecs, coal->tx_max_coalesced_frames); } + } + } + + static void + mlx5e_set_priv_channels_rx_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesce *coal) + { + struct mlx5_core_dev *mdev = priv->mdev; + int i; + + for (i = 0; i < priv->channels.num; ++i) { + struct mlx5e_channel *c = priv->channels.c[i];
mlx5_core_modify_cq_moderation(mdev, &c->rq.cq.mcq, coal->rx_coalesce_usecs, @@@ -597,21 -597,9 +608,9 @@@ int mlx5e_ethtool_set_coalesce(struct m tx_moder->pkts = coal->tx_max_coalesced_frames; new_channels.params.tx_dim_enabled = !!coal->use_adaptive_tx_coalesce;
- if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { - priv->channels.params = new_channels.params; - goto out; - } - /* we are opened */ - reset_rx = !!coal->use_adaptive_rx_coalesce != priv->channels.params.rx_dim_enabled; reset_tx = !!coal->use_adaptive_tx_coalesce != priv->channels.params.tx_dim_enabled;
- if (!reset_rx && !reset_tx) { - mlx5e_set_priv_channels_coalesce(priv, coal); - priv->channels.params = new_channels.params; - goto out; - } - if (reset_rx) { u8 mode = MLX5E_GET_PFLAG(&new_channels.params, MLX5E_PFLAG_RX_CQE_BASED_MODER); @@@ -625,6 -613,20 +624,20 @@@ mlx5e_reset_tx_moderation(&new_channels.params, mode); }
+ if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { + priv->channels.params = new_channels.params; + goto out; + } + + if (!reset_rx && !reset_tx) { + if (!coal->use_adaptive_rx_coalesce) + mlx5e_set_priv_channels_rx_coalesce(priv, coal); + if (!coal->use_adaptive_tx_coalesce) + mlx5e_set_priv_channels_tx_coalesce(priv, coal); + priv->channels.params = new_channels.params; + goto out; + } + err = mlx5e_safe_switch_channels(priv, &new_channels, NULL, NULL);
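The restructured flow computes reset_rx/reset_tx first (does either direction toggle between adaptive DIM and static moderation?), and only such a toggle still requires mlx5e_safe_switch_channels(); otherwise the new static values are written straight to the per-queue CQs, and a direction left in adaptive mode is not touched at all. A condensed model of that decision, mirroring the checks above (the helper and field names are simplified stand-ins):

#include <stdbool.h>
#include <stdio.h>

struct coal_req {
    bool use_adaptive_rx, use_adaptive_tx;
};

struct chan_params {
    bool rx_dim_enabled, tx_dim_enabled;
};

static const char *decide(const struct coal_req *req,
                          const struct chan_params *cur, bool opened)
{
    bool reset_rx = req->use_adaptive_rx != cur->rx_dim_enabled;
    bool reset_tx = req->use_adaptive_tx != cur->tx_dim_enabled;

    if (!opened)
        return "store params only";
    if (!reset_rx && !reset_tx)
        return "update per-queue CQ moderation in place";
    return "safe switch channels";
}

int main(void)
{
    struct chan_params cur = { .rx_dim_enabled = true, .tx_dim_enabled = false };
    struct coal_req req = { .use_adaptive_rx = true, .use_adaptive_tx = false };

    /* Same adaptive settings as before: no channel restart needed. */
    puts(decide(&req, &cur, true));

    req.use_adaptive_rx = false;    /* adaptive -> static toggles a reset */
    puts(decide(&req, &cur, true));
    return 0;
}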
out: @@@ -1983,16 -1985,6 +1996,16 @@@ static int set_pflag_tx_port_ts(struct if (!MLX5_CAP_GEN(mdev, ts_cqe_to_dest_cqn)) return -EOPNOTSUPP;
+ /* Don't allow changing the PTP state if HTB offload is active, because + * the numeration of the QoS SQs will change, while per-queue qdiscs are + * attached. + */ + if (priv->htb.maj_id) { + netdev_err(priv->netdev, "%s: HTB offload is active, cannot change the PTP state\n", + __func__); + return -EINVAL; + } + new_channels.params = priv->channels.params; MLX5E_SET_PFLAG(&new_channels.params, MLX5E_PFLAG_TX_PORT_TS, enable); /* No need to verify SQ stop room as diff --combined drivers/net/ethernet/mellanox/mlx5/core/en_main.c index c8866c14b8a3,a2e0b548bf57..39acbc83682d --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@@ -65,8 -65,7 +65,9 @@@ #include "en/devlink.h" #include "lib/mlx5.h" #include "en/ptp.h" +#include "qos.h" +#include "en/trap.h" + #include "fpga/ipsec.h"
bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev) { @@@ -108,7 -107,7 +109,7 @@@ bool mlx5e_striding_rq_possible(struct if (!mlx5e_check_fragmented_striding_rq_cap(mdev)) return false;
- if (MLX5_IPSEC_DEV(mdev)) + if (mlx5_fpga_is_ipsec_device(mdev)) return false;
if (params->xdp_prog) { @@@ -213,33 -212,6 +214,33 @@@ static void mlx5e_disable_async_events( mlx5_notifier_unregister(priv->mdev, &priv->events_nb); }
+static int blocking_event(struct notifier_block *nb, unsigned long event, void *data) +{ + struct mlx5e_priv *priv = container_of(nb, struct mlx5e_priv, blocking_events_nb); + int err; + + switch (event) { + case MLX5_DRIVER_EVENT_TYPE_TRAP: + err = mlx5e_handle_trap_event(priv, data); + break; + default: + netdev_warn(priv->netdev, "Sync event: Unknown event %ld\n", event); + err = -EINVAL; + } + return err; +} + +static void mlx5e_enable_blocking_events(struct mlx5e_priv *priv) +{ + priv->blocking_events_nb.notifier_call = blocking_event; + mlx5_blocking_notifier_register(priv->mdev, &priv->blocking_events_nb); +} + +static void mlx5e_disable_blocking_events(struct mlx5e_priv *priv) +{ + mlx5_blocking_notifier_unregister(priv->mdev, &priv->blocking_events_nb); +} + static inline void mlx5e_build_umr_wqe(struct mlx5e_rq *rq, struct mlx5e_icosq *sq, struct mlx5e_umr_wqe *wqe) @@@ -371,11 -343,13 +372,11 @@@ static void mlx5e_init_frags_partition( prev->last_in_page = true; }
-static int mlx5e_init_di_list(struct mlx5e_rq *rq, - int wq_sz, int cpu) +int mlx5e_init_di_list(struct mlx5e_rq *rq, int wq_sz, int node) { int len = wq_sz << rq->wqe.info.log_num_frags;
- rq->wqe.di = kvzalloc_node(array_size(len, sizeof(*rq->wqe.di)), - GFP_KERNEL, cpu_to_node(cpu)); + rq->wqe.di = kvzalloc_node(array_size(len, sizeof(*rq->wqe.di)), GFP_KERNEL, node); if (!rq->wqe.di) return -ENOMEM;
@@@ -384,7 -358,7 +385,7 @@@ return 0; }
-static void mlx5e_free_di_list(struct mlx5e_rq *rq) +void mlx5e_free_di_list(struct mlx5e_rq *rq) { kvfree(rq->wqe.di); } @@@ -449,9 -423,6 +450,9 @@@ static int mlx5e_alloc_rq(struct mlx5e_ rq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); rq->xdpsq = &c->rq_xdpsq; rq->xsk_pool = xsk_pool; + rq->ptp_cyc2time = mlx5_is_real_time_rq(mdev) ? + mlx5_real_time_cyc2time : + mlx5_timecounter_cyc2time;
if (rq->xsk_pool) rq->stats = &c->priv->channel_stats[c->ix].xskrq; @@@ -529,7 -500,7 +530,7 @@@ goto err_rq_wq_destroy; }
- err = mlx5e_init_di_list(rq, wq_sz, c->cpu); + err = mlx5e_init_di_list(rq, wq_sz, cpu_to_node(c->cpu)); if (err) goto err_rq_frags;
@@@ -680,10 -651,11 +681,10 @@@ static void mlx5e_free_rq(struct mlx5e_ mlx5_wq_destroy(&rq->wq_ctrl); }
-static int mlx5e_create_rq(struct mlx5e_rq *rq, - struct mlx5e_rq_param *param) +int mlx5e_create_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param) { struct mlx5_core_dev *mdev = rq->mdev; - + u8 ts_format; void *in; void *rqc; void *wq; @@@ -696,9 -668,6 +697,9 @@@ if (!in) return -ENOMEM;
+ ts_format = mlx5_is_real_time_rq(mdev) ? + MLX5_RQC_TIMESTAMP_FORMAT_REAL_TIME : + MLX5_RQC_TIMESTAMP_FORMAT_FREE_RUNNING; rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); wq = MLX5_ADDR_OF(rqc, rqc, wq);
@@@ -706,7 -675,6 +707,7 @@@
MLX5_SET(rqc, rqc, cqn, rq->cq.mcq.cqn); MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); + MLX5_SET(rqc, rqc, ts_format, ts_format); MLX5_SET(wq, wq, log_wq_pg_sz, rq->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(wq, wq, dbr_addr, rq->wq_ctrl.db.dma); @@@ -807,7 -775,7 +808,7 @@@ static int mlx5e_modify_rq_vsd(struct m return err; }
-static void mlx5e_destroy_rq(struct mlx5e_rq *rq) +void mlx5e_destroy_rq(struct mlx5e_rq *rq) { mlx5_core_destroy_rq(rq->mdev, rq->rqn); } @@@ -947,7 -915,7 +948,7 @@@ void mlx5e_activate_rq(struct mlx5e_rq void mlx5e_deactivate_rq(struct mlx5e_rq *rq) { clear_bit(MLX5E_RQ_STATE_ENABLED, &rq->state); - synchronize_rcu(); /* Sync with NAPI to prevent mlx5e_post_rx_wqes. */ + synchronize_net(); /* Sync with NAPI to prevent mlx5e_post_rx_wqes. */ }
void mlx5e_close_rq(struct mlx5e_rq *rq) @@@ -1176,6 -1144,7 +1177,6 @@@ static int mlx5e_alloc_txqsq(struct mlx sq->uar_map = mdev->mlx5e_res.bfreg.map; sq->min_inline_mode = params->tx_min_inline_mode; sq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); - sq->stats = &c->priv->channel_stats[c->ix].sq[tc]; INIT_WORK(&sq->recover_work, mlx5e_tx_err_cqe_work); if (!MLX5_CAP_ETH(mdev, wqe_vlan_insert)) set_bit(MLX5E_SQ_STATE_VLAN_NEED_L2_INLINE, &sq->state); @@@ -1186,9 -1155,6 +1187,9 @@@ if (param->is_mpw) set_bit(MLX5E_SQ_STATE_MPWQE, &sq->state); sq->stop_room = param->stop_room; + sq->ptp_cyc2time = mlx5_is_real_time_sq(mdev) ? + mlx5_real_time_cyc2time : + mlx5_timecounter_cyc2time;
param->wq.db_numa_node = cpu_to_node(c->cpu); err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, wq, &sq->wq_ctrl); @@@ -1222,7 -1188,6 +1223,7 @@@ static int mlx5e_create_sq(struct mlx5_ struct mlx5e_create_sq_param *csp, u32 *sqn) { + u8 ts_format; void *in; void *sqc; void *wq; @@@ -1235,9 -1200,6 +1236,9 @@@ if (!in) return -ENOMEM;
+ ts_format = mlx5_is_real_time_sq(mdev) ? + MLX5_SQC_TIMESTAMP_FORMAT_REAL_TIME : + MLX5_SQC_TIMESTAMP_FORMAT_FREE_RUNNING; sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); wq = MLX5_ADDR_OF(sqc, sqc, wq);
@@@ -1246,8 -1208,6 +1247,8 @@@ MLX5_SET(sqc, sqc, tis_num_0, csp->tisn); MLX5_SET(sqc, sqc, cqn, csp->cqn); MLX5_SET(sqc, sqc, ts_cqe_to_dest_cqn, csp->ts_cqe_to_dest_cqn); + MLX5_SET(sqc, sqc, ts_format, ts_format); +
if (MLX5_CAP_ETH(mdev, wqe_inline_mode) == MLX5_CAP_INLINE_MODE_VPORT_CONTEXT) MLX5_SET(sqc, sqc, min_wqe_inline_mode, csp->min_inline_mode); @@@ -1274,7 -1234,6 +1275,7 @@@ int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn, struct mlx5e_modify_sq_param *p) { + u64 bitmask = 0; void *in; void *sqc; int inlen; @@@ -1290,14 -1249,9 +1291,14 @@@ MLX5_SET(modify_sq_in, in, sq_state, p->curr_state); MLX5_SET(sqc, sqc, state, p->next_state); if (p->rl_update && p->next_state == MLX5_SQC_STATE_RDY) { - MLX5_SET64(modify_sq_in, in, modify_bitmask, 1); - MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, p->rl_index); + bitmask |= 1; + MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, p->rl_index); + } + if (p->qos_update && p->next_state == MLX5_SQC_STATE_RDY) { + bitmask |= 1 << 2; + MLX5_SET(sqc, sqc, qos_queue_group_id, p->qos_queue_group_id); } + MLX5_SET64(modify_sq_in, in, modify_bitmask, bitmask);
err = mlx5_core_modify_sq(mdev, sqn, in);
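The modify bitmask is now accumulated rather than hard-coded: per the code above, bit 0 is set when the packet-pacing rate-limit index changes and bit 2 when the QoS queue group changes, so an update touching both fields programs 0x5. A short stand-alone check of that arithmetic:

#include <stdio.h>

int main(void)
{
    unsigned long long bitmask = 0;
    int rl_update = 1, qos_update = 1;  /* both fields change */

    if (rl_update)
        bitmask |= 1;           /* packet_pacing_rate_limit_index */
    if (qos_update)
        bitmask |= 1 << 2;      /* qos_queue_group_id */

    printf("modify_bitmask = 0x%llx\n", bitmask);   /* 0x5 */
    return 0;
}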
@@@ -1314,7 -1268,6 +1315,7 @@@ static void mlx5e_destroy_sq(struct mlx int mlx5e_create_sq_rdy(struct mlx5_core_dev *mdev, struct mlx5e_sq_param *param, struct mlx5e_create_sq_param *csp, + u16 qos_queue_group_id, u32 *sqn) { struct mlx5e_modify_sq_param msp = {0}; @@@ -1326,10 -1279,6 +1327,10 @@@
msp.curr_state = MLX5_SQC_STATE_RST; msp.next_state = MLX5_SQC_STATE_RDY; + if (qos_queue_group_id) { + msp.qos_update = true; + msp.qos_queue_group_id = qos_queue_group_id; + } err = mlx5e_modify_sq(mdev, *sqn, &msp); if (err) mlx5e_destroy_sq(mdev, *sqn); @@@ -1340,9 -1289,13 +1341,9 @@@ static int mlx5e_set_sq_maxrate(struct net_device *dev, struct mlx5e_txqsq *sq, u32 rate);
-static int mlx5e_open_txqsq(struct mlx5e_channel *c, - u32 tisn, - int txq_ix, - struct mlx5e_params *params, - struct mlx5e_sq_param *param, - struct mlx5e_txqsq *sq, - int tc) +int mlx5e_open_txqsq(struct mlx5e_channel *c, u32 tisn, int txq_ix, + struct mlx5e_params *params, struct mlx5e_sq_param *param, + struct mlx5e_txqsq *sq, int tc, u16 qos_queue_group_id, u16 qos_qid) { struct mlx5e_create_sq_param csp = {}; u32 tx_rate; @@@ -1352,17 -1305,12 +1353,17 @@@ if (err) return err;
+ if (qos_queue_group_id) + sq->stats = c->priv->htb.qos_sq_stats[qos_qid]; + else + sq->stats = &c->priv->channel_stats[c->ix].sq[tc]; + csp.tisn = tisn; csp.tis_lst_sz = 1; csp.cqn = sq->cq.mcq.cqn; csp.wq_ctrl = &sq->wq_ctrl; csp.min_inline_mode = sq->min_inline_mode; - err = mlx5e_create_sq_rdy(c->mdev, param, &csp, &sq->sqn); + err = mlx5e_create_sq_rdy(c->mdev, param, &csp, qos_queue_group_id, &sq->sqn); if (err) goto err_free_txqsq;
@@@ -1401,7 -1349,7 +1402,7 @@@ void mlx5e_deactivate_txqsq(struct mlx5 struct mlx5_wq_cyc *wq = &sq->wq;
clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); - synchronize_rcu(); /* Sync with NAPI to prevent netif_tx_wake_queue. */ + synchronize_net(); /* Sync with NAPI to prevent netif_tx_wake_queue. */
mlx5e_tx_disable_queue(sq->txq);
@@@ -1419,7 -1367,7 +1420,7 @@@ } }
-static void mlx5e_close_txqsq(struct mlx5e_txqsq *sq) +void mlx5e_close_txqsq(struct mlx5e_txqsq *sq) { struct mlx5_core_dev *mdev = sq->mdev; struct mlx5_rate_limit rl = {0}; @@@ -1456,7 -1404,7 +1457,7 @@@ int mlx5e_open_icosq(struct mlx5e_chann csp.cqn = sq->cq.mcq.cqn; csp.wq_ctrl = &sq->wq_ctrl; csp.min_inline_mode = params->tx_min_inline_mode; - err = mlx5e_create_sq_rdy(c->mdev, param, &csp, &sq->sqn); + err = mlx5e_create_sq_rdy(c->mdev, param, &csp, 0, &sq->sqn); if (err) goto err_free_icosq;
@@@ -1476,7 -1424,7 +1477,7 @@@ void mlx5e_activate_icosq(struct mlx5e_ void mlx5e_deactivate_icosq(struct mlx5e_icosq *icosq) { clear_bit(MLX5E_SQ_STATE_ENABLED, &icosq->state); - synchronize_rcu(); /* Sync with NAPI. */ + synchronize_net(); /* Sync with NAPI. */ }
void mlx5e_close_icosq(struct mlx5e_icosq *sq) @@@ -1505,7 -1453,7 +1506,7 @@@ int mlx5e_open_xdpsq(struct mlx5e_chann csp.wq_ctrl = &sq->wq_ctrl; csp.min_inline_mode = sq->min_inline_mode; set_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); - err = mlx5e_create_sq_rdy(c->mdev, param, &csp, &sq->sqn); + err = mlx5e_create_sq_rdy(c->mdev, param, &csp, 0, &sq->sqn); if (err) goto err_free_xdpsq;
@@@ -1555,7 -1503,7 +1556,7 @@@ void mlx5e_close_xdpsq(struct mlx5e_xdp struct mlx5e_channel *c = sq->channel;
clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); - synchronize_rcu(); /* Sync with NAPI. */ + synchronize_net(); /* Sync with NAPI. */
mlx5e_destroy_sq(c->mdev, sq->sqn); mlx5e_free_xdpsq_descs(sq); @@@ -1756,7 -1704,7 +1757,7 @@@ static int mlx5e_open_sqs(struct mlx5e_ int txq_ix = c->ix + tc * params->num_channels;
err = mlx5e_open_txqsq(c, c->priv->tisn[c->lag_port][tc], txq_ix, - params, &cparam->txq_sq, &c->sq[tc], tc); + params, &cparam->txq_sq, &c->sq[tc], tc, 0, 0); if (err) goto err_close_sqs; } @@@ -1879,12 -1827,12 +1880,12 @@@ static int mlx5e_open_queues(struct mlx
mlx5e_build_create_cq_param(&ccp, c);
- err = mlx5e_open_cq(c->priv, icocq_moder, &cparam->icosq.cqp, &ccp, + err = mlx5e_open_cq(c->priv, icocq_moder, &cparam->async_icosq.cqp, &ccp, &c->async_icosq.cq); if (err) return err;
- err = mlx5e_open_cq(c->priv, icocq_moder, &cparam->async_icosq.cqp, &ccp, + err = mlx5e_open_cq(c->priv, icocq_moder, &cparam->icosq.cqp, &ccp, &c->icosq.cq); if (err) goto err_close_async_icosq_cq; @@@ -1908,11 -1856,13 +1909,11 @@@ if (err) goto err_close_rx_cq;
- napi_enable(&c->napi); - spin_lock_init(&c->async_icosq_lock);
err = mlx5e_open_icosq(c, params, &cparam->async_icosq, &c->async_icosq); if (err) - goto err_disable_napi; + goto err_close_xdpsq_cq;
err = mlx5e_open_icosq(c, params, &cparam->icosq, &c->icosq); if (err) @@@ -1955,7 -1905,9 +1956,7 @@@ err_close_icosq err_close_async_icosq: mlx5e_close_icosq(&c->async_icosq);
-err_disable_napi: - napi_disable(&c->napi); - +err_close_xdpsq_cq: if (c->xdp) mlx5e_close_cq(&c->rq_xdpsq.cq);
@@@ -1986,6 -1938,7 +1987,6 @@@ static void mlx5e_close_queues(struct m mlx5e_close_sqs(c); mlx5e_close_icosq(&c->icosq); mlx5e_close_icosq(&c->async_icosq); - napi_disable(&c->napi); if (c->xdp) mlx5e_close_cq(&c->rq_xdpsq.cq); mlx5e_close_cq(&c->rq.cq); @@@ -2070,8 -2023,6 +2071,8 @@@ static void mlx5e_activate_channel(stru { int tc;
+ napi_enable(&c->napi); + for (tc = 0; tc < c->num_tc; tc++) mlx5e_activate_txqsq(&c->sq[tc]); mlx5e_activate_icosq(&c->icosq); @@@ -2094,9 -2045,6 +2095,9 @@@ static void mlx5e_deactivate_channel(st mlx5e_deactivate_icosq(&c->icosq); for (tc = 0; tc < c->num_tc; tc++) mlx5e_deactivate_txqsq(&c->sq[tc]); + mlx5e_qos_deactivate_queues(c); + + napi_disable(&c->napi); }
static void mlx5e_close_channel(struct mlx5e_channel *c) @@@ -2104,7 -2052,6 +2105,7 @@@ if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state)) mlx5e_close_xsk(c); mlx5e_close_queues(c); + mlx5e_qos_close_queues(c); netif_napi_del(&c->napi);
kvfree(c); @@@ -2122,8 -2069,10 +2123,13 @@@ static void mlx5e_build_rq_frags_info(s u32 buf_size = 0; int i;
++<<<<<<< HEAD + if (MLX5_IPSEC_DEV(mdev)) ++======= + #ifdef CONFIG_MLX5_EN_IPSEC + if (mlx5_fpga_is_ipsec_device(mdev)) ++>>>>>>> 3af409ca278d4a8d50e91f9f7c4c33b175645cf3 byte_count += MLX5E_METADATA_ETHER_LEN; -#endif
if (mlx5e_rx_is_linear_skb(params, xsk)) { int frag_stride; @@@ -2252,8 -2201,9 +2258,8 @@@ void mlx5e_build_sq_param_common(struc param->wq.buf_numa_node = dev_to_node(mlx5_core_dma_dev(priv->mdev)); }
-static void mlx5e_build_sq_param(struct mlx5e_priv *priv, - struct mlx5e_params *params, - struct mlx5e_sq_param *param) +void mlx5e_build_sq_param(struct mlx5e_priv *priv, struct mlx5e_params *params, + struct mlx5e_sq_param *param) { void *sqc = param->sqc; void *wq = MLX5_ADDR_OF(sqc, sqc, wq); @@@ -2432,18 -2382,10 +2438,18 @@@ int mlx5e_open_channels(struct mlx5e_pr goto err_close_channels; }
+ err = mlx5e_qos_open_queues(priv, chs); + if (err) + goto err_close_ptp; + mlx5e_health_channels_update(priv); kvfree(cparam); return 0;
+err_close_ptp: + if (chs->port_ptp) + mlx5e_port_ptp_close(chs->port_ptp); + err_close_channels: for (i--; i >= 0; i--) mlx5e_close_channel(chs->c[i]); @@@ -2976,31 -2918,11 +2982,31 @@@ static void mlx5e_netdev_set_tcs(struc netdev_set_tc_queue(netdev, tc, nch, 0); }
+int mlx5e_update_tx_netdev_queues(struct mlx5e_priv *priv) +{ + int qos_queues, nch, ntc, num_txqs, err; + + qos_queues = mlx5e_qos_cur_leaf_nodes(priv); + + nch = priv->channels.params.num_channels; + ntc = priv->channels.params.num_tc; + num_txqs = nch * ntc + qos_queues; + if (MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_TX_PORT_TS)) + num_txqs += ntc; + + mlx5e_dbg(DRV, priv, "Setting num_txqs %d\n", num_txqs); + err = netif_set_real_num_tx_queues(priv->netdev, num_txqs); + if (err) + netdev_warn(priv->netdev, "netif_set_real_num_tx_queues failed, %d\n", err); + + return err; +} + static int mlx5e_update_netdev_queues(struct mlx5e_priv *priv) { struct net_device *netdev = priv->netdev; - int num_txqs, num_rxqs, nch, ntc; int old_num_txqs, old_ntc; + int num_rxqs, nch, ntc; int err;
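mlx5e_update_tx_netdev_queues() sizes the real TX queue count as channels times TCs, plus the QoS SQs reported by mlx5e_qos_cur_leaf_nodes(), plus one extra set of TC queues when the MLX5E_PFLAG_TX_PORT_TS pflag is on. With, say, 8 channels, 2 TCs, 5 QoS queues and PTP enabled that works out to 8*2 + 5 + 2 = 23 (the numbers are only an example). The same arithmetic as a stand-alone snippet:

#include <stdbool.h>
#include <stdio.h>

static int num_txqs(int nch, int ntc, int qos_queues, bool tx_port_ts)
{
    int n = nch * ntc + qos_queues;

    if (tx_port_ts)     /* TX port timestamping adds one queue per TC */
        n += ntc;
    return n;
}

int main(void)
{
    printf("%d\n", num_txqs(8, 2, 5, true));    /* 8*2 + 5 + 2 = 23 */
    return 0;
}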
old_num_txqs = netdev->real_num_tx_queues; @@@ -3008,13 -2930,18 +3014,13 @@@
nch = priv->channels.params.num_channels; ntc = priv->channels.params.num_tc; - num_txqs = nch * ntc; - if (MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_TX_PORT_TS)) - num_txqs += ntc; num_rxqs = nch * priv->profile->rq_groups;
mlx5e_netdev_set_tcs(netdev, nch, ntc);
- err = netif_set_real_num_tx_queues(netdev, num_txqs); - if (err) { - netdev_warn(netdev, "netif_set_real_num_tx_queues failed, %d\n", err); + err = mlx5e_update_tx_netdev_queues(priv); + if (err) goto err_tcs; - } err = netif_set_real_num_rx_queues(netdev, num_rxqs); if (err) { netdev_warn(netdev, "netif_set_real_num_rx_queues failed, %d\n", err); @@@ -3118,7 -3045,6 +3124,7 @@@ void mlx5e_activate_priv_channels(struc mlx5e_update_num_tc_x_num_ch(priv); mlx5e_build_txq_maps(priv); mlx5e_activate_channels(&priv->channels); + mlx5e_qos_activate_queues(priv); mlx5e_xdp_tx_enable(priv); netif_tx_start_all_queues(priv->netdev);
@@@ -3261,7 -3187,6 +3267,7 @@@ int mlx5e_open_locked(struct net_devic
priv->profile->update_rx(priv); mlx5e_activate_priv_channels(priv); + mlx5e_apply_traps(priv, true); if (priv->profile->update_carrier) priv->profile->update_carrier(priv);
@@@ -3297,7 -3222,6 +3303,7 @@@ int mlx5e_close_locked(struct net_devic if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) return 0;
+ mlx5e_apply_traps(priv, false); clear_bit(MLX5E_STATE_OPENED, &priv->state);
netif_carrier_off(priv->netdev); @@@ -3687,14 -3611,6 +3693,14 @@@ static int mlx5e_setup_tc_mqprio(struc
mutex_lock(&priv->state_lock);
+ /* MQPRIO is another toplevel qdisc that can't be attached + * simultaneously with the offloaded HTB. + */ + if (WARN_ON(priv->htb.maj_id)) { + err = -EINVAL; + goto out; + } + new_channels.params = priv->channels.params; new_channels.params.num_tc = tc ? tc : 1;
@@@ -3720,55 -3636,12 +3726,55 @@@ out return err; }
+static int mlx5e_setup_tc_htb(struct mlx5e_priv *priv, struct tc_htb_qopt_offload *htb) +{ + int res; + + switch (htb->command) { + case TC_HTB_CREATE: + return mlx5e_htb_root_add(priv, htb->parent_classid, htb->classid, + htb->extack); + case TC_HTB_DESTROY: + return mlx5e_htb_root_del(priv); + case TC_HTB_LEAF_ALLOC_QUEUE: + res = mlx5e_htb_leaf_alloc_queue(priv, htb->classid, htb->parent_classid, + htb->rate, htb->ceil, htb->extack); + if (res < 0) + return res; + htb->qid = res; + return 0; + case TC_HTB_LEAF_TO_INNER: + return mlx5e_htb_leaf_to_inner(priv, htb->parent_classid, htb->classid, + htb->rate, htb->ceil, htb->extack); + case TC_HTB_LEAF_DEL: + return mlx5e_htb_leaf_del(priv, htb->classid, &htb->moved_qid, &htb->qid, + htb->extack); + case TC_HTB_LEAF_DEL_LAST: + case TC_HTB_LEAF_DEL_LAST_FORCE: + return mlx5e_htb_leaf_del_last(priv, htb->classid, + htb->command == TC_HTB_LEAF_DEL_LAST_FORCE, + htb->extack); + case TC_HTB_NODE_MODIFY: + return mlx5e_htb_node_modify(priv, htb->classid, htb->rate, htb->ceil, + htb->extack); + case TC_HTB_LEAF_QUERY_QUEUE: + res = mlx5e_get_txq_by_classid(priv, htb->classid); + if (res < 0) + return res; + htb->qid = res; + return 0; + default: + return -EOPNOTSUPP; + } +} + static LIST_HEAD(mlx5e_block_cb_list);
static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data) { struct mlx5e_priv *priv = netdev_priv(dev); + int err;
switch (type) { case TC_SETUP_BLOCK: { @@@ -3782,11 -3655,6 +3788,11 @@@ } case TC_SETUP_QDISC_MQPRIO: return mlx5e_setup_tc_mqprio(priv, type_data); + case TC_SETUP_QDISC_HTB: + mutex_lock(&priv->state_lock); + err = mlx5e_setup_tc_htb(priv, type_data); + mutex_unlock(&priv->state_lock); + return err; default: return -EOPNOTSUPP; } @@@ -3902,7 -3770,7 +3908,7 @@@ static int set_feature_lro(struct net_d mutex_lock(&priv->state_lock);
if (enable && priv->xsk.refcnt) { - netdev_warn(netdev, "LRO is incompatible with AF_XDP (%hu XSKs are active)\n", + netdev_warn(netdev, "LRO is incompatible with AF_XDP (%u XSKs are active)\n", priv->xsk.refcnt); err = -EINVAL; goto out; @@@ -3956,25 -3824,20 +3962,25 @@@ static int set_feature_cvlan_filter(str return 0; }
-#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) -static int set_feature_tc_num_filters(struct net_device *netdev, bool enable) +static int set_feature_hw_tc(struct net_device *netdev, bool enable) { struct mlx5e_priv *priv = netdev_priv(netdev);
+#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) if (!enable && mlx5e_tc_num_filters(priv, MLX5_TC_FLAG(NIC_OFFLOAD))) { netdev_err(netdev, "Active offloaded tc filters, can't turn hw_tc_offload off\n"); return -EINVAL; } +#endif + + if (!enable && priv->htb.maj_id) { + netdev_err(netdev, "Active HTB offload, can't turn hw_tc_offload off\n"); + return -EINVAL; + }
return 0; } -#endif
static int set_feature_rx_all(struct net_device *netdev, bool enable) { @@@ -4072,7 -3935,9 +4078,7 @@@ int mlx5e_set_features(struct net_devic err |= MLX5E_HANDLE_FEATURE(NETIF_F_LRO, set_feature_lro); err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_VLAN_CTAG_FILTER, set_feature_cvlan_filter); -#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) - err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_TC, set_feature_tc_num_filters); -#endif + err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_TC, set_feature_hw_tc); err |= MLX5E_HANDLE_FEATURE(NETIF_F_RXALL, set_feature_rx_all); err |= MLX5E_HANDLE_FEATURE(NETIF_F_RXFCS, set_feature_rx_fcs); err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_VLAN_CTAG_RX, set_feature_rx_vlan); @@@ -4105,7 -3970,6 +4111,7 @@@ static netdev_features_t mlx5e_fix_feat if (!params->vlan_strip_disable) netdev_warn(netdev, "Dropping C-tag vlan stripping offload due to S-tag vlan\n"); } + if (!MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ)) { if (features & NETIF_F_LRO) { netdev_warn(netdev, "Disabling LRO, not supported in legacy RQ\n"); @@@ -4153,7 -4017,7 +4159,7 @@@ static bool mlx5e_xsk_validate_mtu(stru max_mtu_page = mlx5e_xdp_max_mtu(new_params, &xsk); max_mtu = min(max_mtu_frame, max_mtu_page);
- netdev_err(netdev, "MTU %d is too big for an XSK running on channel %hu. Try MTU <= %d\n", + netdev_err(netdev, "MTU %d is too big for an XSK running on channel %u. Try MTU <= %d\n", new_params->sw_mtu, ix, max_mtu); return false; } @@@ -4530,8 -4394,10 +4536,8 @@@ netdev_features_t mlx5e_features_check( features = vlan_features_check(skb, features); features = vxlan_features_check(skb, features);
-#ifdef CONFIG_MLX5_EN_IPSEC if (mlx5e_ipsec_feature_check(skb, netdev, features)) return features; -#endif
/* Validate if the tunneled packet is being offloaded by HW */ if (skb->encapsulation && @@@ -4590,8 -4456,9 +4596,9 @@@ static int mlx5e_xdp_allowed(struct mlx return -EINVAL; }
- if (MLX5_IPSEC_DEV(priv->mdev)) { - netdev_warn(netdev, "can't set XDP with IPSec offload\n"); + if (mlx5_fpga_is_ipsec_device(priv->mdev)) { + netdev_warn(netdev, + "XDP is not available on Innova cards with IPsec support\n"); return -EINVAL; }
@@@ -4774,6 -4641,8 +4781,6 @@@ const struct net_device_ops mlx5e_netde .ndo_change_mtu = mlx5e_change_nic_mtu, .ndo_do_ioctl = mlx5e_ioctl, .ndo_set_tx_maxrate = mlx5e_set_tx_maxrate, - .ndo_udp_tunnel_add = udp_tunnel_nic_add_port, - .ndo_udp_tunnel_del = udp_tunnel_nic_del_port, .ndo_features_check = mlx5e_features_check, .ndo_tx_timeout = mlx5e_tx_timeout, .ndo_bpf = mlx5e_xdp, @@@ -4941,15 -4810,15 +4948,15 @@@ void mlx5e_build_rss_params(struct mlx5 tirc_default_config[tt].rx_hash_fields; }
-void mlx5e_build_nic_params(struct mlx5e_priv *priv, - struct mlx5e_xsk *xsk, - struct mlx5e_rss_params *rss_params, - struct mlx5e_params *params, - u16 mtu) +void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, u16 mtu) { + struct mlx5e_rss_params *rss_params = &priv->rss_params; + struct mlx5e_params *params = &priv->channels.params; struct mlx5_core_dev *mdev = priv->mdev; u8 rx_cq_period_mode;
+ priv->max_nch = mlx5e_calc_max_nch(priv, priv->profile); + params->sw_mtu = mtu; params->hard_mtu = MLX5E_ETH_HARD_MTU; params->num_channels = min_t(unsigned int, MLX5E_MAX_NUM_CHANNELS / 2, @@@ -5007,11 -4876,6 +5014,11 @@@
/* AF_XDP */ params->xsk = xsk; + + /* Do not update netdev->features directly in here; + * on mlx5e_attach_netdev() we will call mlx5e_update_features(). + * To update netdev->features, please modify mlx5e_fix_features() instead. + */ }
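A side note on the comment just added (the sketch below is hypothetical, not from the patch): netdev_update_features() re-runs the driver's .ndo_fix_features callback, so feature policy expressed there is re-applied on every update, whereas a direct write to netdev->features in the params builder would later be overwritten. A minimal .ndo_fix_features-style callback for an imaginary driver:

/* Hypothetical driver callback, for illustration only. */
static netdev_features_t foo_fix_features(struct net_device *netdev,
					  netdev_features_t features)
{
	struct foo_priv *priv = netdev_priv(netdev);	/* hypothetical priv */

	if (!priv->striding_rq_enabled)			/* hypothetical flag */
		features &= ~NETIF_F_LRO;		/* LRO needs striding RQ */

	return features;
}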
static void mlx5e_set_netdev_dev_addr(struct net_device *netdev) @@@ -5113,6 -4977,8 +5120,6 @@@ static void mlx5e_build_nic_netdev(stru netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_FILTER; netdev->hw_features |= NETIF_F_HW_VLAN_STAG_TX;
- mlx5e_vxlan_set_netdev_info(priv); - if (mlx5e_tunnel_any_tx_proto_supported(mdev)) { netdev->hw_enc_features |= NETIF_F_HW_CSUM; netdev->hw_enc_features |= NETIF_F_TSO; @@@ -5162,12 -5028,18 +5169,12 @@@ netdev->hw_features |= NETIF_F_RXFCS;
netdev->features = netdev->hw_features; - if (!priv->channels.params.lro_en) - netdev->features &= ~NETIF_F_LRO;
+ /* Defaults */ if (fcs_enabled) netdev->features &= ~NETIF_F_RXALL; - - if (!priv->channels.params.scatter_fcs_en) - netdev->features &= ~NETIF_F_RXFCS; - - /* prefere CQE compression over rxhash */ - if (MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_RX_CQE_COMPRESS)) - netdev->features &= ~NETIF_F_RXHASH; + netdev->features &= ~NETIF_F_LRO; + netdev->features &= ~NETIF_F_RXFCS;
#define FT_CAP(f) MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_receive.f) if (FT_CAP(flow_modify_en) && @@@ -5181,8 -5053,6 +5188,8 @@@ netdev->hw_features |= NETIF_F_NTUPLE; #endif } + if (mlx5_qos_is_supported(mdev)) + netdev->features |= NETIF_F_HW_TC;
netdev->features |= NETIF_F_HIGHDMA; netdev->features |= NETIF_F_HW_VLAN_STAG_FILTER; @@@ -5233,28 -5103,33 +5240,28 @@@ void mlx5e_destroy_q_counters(struct ml }
static int mlx5e_nic_init(struct mlx5_core_dev *mdev, - struct net_device *netdev, - const struct mlx5e_profile *profile, - void *ppriv) + struct net_device *netdev) { struct mlx5e_priv *priv = netdev_priv(netdev); - struct mlx5e_rss_params *rss = &priv->rss_params; int err;
- err = mlx5e_netdev_init(netdev, priv, mdev, profile, ppriv); - if (err) - return err; - - mlx5e_build_nic_params(priv, &priv->xsk, rss, &priv->channels.params, - netdev->mtu); + mlx5e_build_nic_params(priv, &priv->xsk, netdev->mtu); + mlx5e_vxlan_set_netdev_info(priv);
mlx5e_timestamp_init(priv);
err = mlx5e_ipsec_init(priv); if (err) mlx5_core_err(mdev, "IPSec initialization failed, %d\n", err); + err = mlx5e_tls_init(priv); if (err) mlx5_core_err(mdev, "TLS initialization failed, %d\n", err); - mlx5e_build_nic_netdev(netdev); + err = mlx5e_devlink_port_register(priv); if (err) mlx5_core_err(mdev, "mlx5e_devlink_port_register failed, %d\n", err); + mlx5e_health_create_reporters(priv);
return 0; @@@ -5266,6 -5141,7 +5273,6 @@@ static void mlx5e_nic_cleanup(struct ml mlx5e_devlink_port_unregister(priv); mlx5e_tls_cleanup(priv); mlx5e_ipsec_cleanup(priv); - mlx5e_netdev_cleanup(priv->netdev, priv); }
static int mlx5e_init_nic_rx(struct mlx5e_priv *priv) @@@ -5394,7 -5270,6 +5401,7 @@@ static void mlx5e_nic_enable(struct mlx mlx5_lag_add(mdev, netdev);
mlx5e_enable_async_events(priv); + mlx5e_enable_blocking_events(priv); if (mlx5e_monitor_counter_supported(priv)) mlx5e_monitor_counter_init(priv);
@@@ -5432,12 -5307,6 +5439,12 @@@ static void mlx5e_nic_disable(struct ml if (mlx5e_monitor_counter_supported(priv)) mlx5e_monitor_counter_cleanup(priv);
+ mlx5e_disable_blocking_events(priv); + if (priv->en_trap) { + mlx5e_deactivate_trap(priv); + mlx5e_close_trap(priv->en_trap); + priv->en_trap = NULL; + } mlx5e_disable_async_events(priv); mlx5_lag_remove(mdev); mlx5_vxlan_reset_to_default(mdev->vxlan); @@@ -5468,23 -5337,27 +5475,23 @@@ static const struct mlx5e_profile mlx5e };
/* mlx5e generic netdev management API (move to en_common.c) */ - -/* mlx5e_netdev_init/cleanup must be called from profile->init/cleanup callbacks */ -int mlx5e_netdev_init(struct net_device *netdev, - struct mlx5e_priv *priv, - struct mlx5_core_dev *mdev, - const struct mlx5e_profile *profile, - void *ppriv) +int mlx5e_priv_init(struct mlx5e_priv *priv, + struct net_device *netdev, + struct mlx5_core_dev *mdev) { + memset(priv, 0, sizeof(*priv)); + /* priv init */ priv->mdev = mdev; priv->netdev = netdev; - priv->profile = profile; - priv->ppriv = ppriv; priv->msglevel = MLX5E_MSG_LEVEL; - priv->max_nch = netdev->num_rx_queues / max_t(u8, profile->rq_groups, 1); priv->max_opened_tc = 1;
if (!alloc_cpumask_var(&priv->scratchpad.cpumask, GFP_KERNEL)) return -ENOMEM;
mutex_init(&priv->state_lock); + hash_init(priv->htb.qos_tc2node); INIT_WORK(&priv->update_carrier_work, mlx5e_update_carrier_work); INIT_WORK(&priv->set_rx_mode_work, mlx5e_set_rx_mode_work); INIT_WORK(&priv->tx_timeout_work, mlx5e_tx_timeout_work); @@@ -5494,6 -5367,9 +5501,6 @@@ if (!priv->wq) goto err_free_cpumask;
- /* netdev init */ - netif_carrier_off(netdev); - return 0;
err_free_cpumask: @@@ -5502,39 -5378,38 +5509,39 @@@ return -ENOMEM; }
-void mlx5e_netdev_cleanup(struct net_device *netdev, struct mlx5e_priv *priv) +void mlx5e_priv_cleanup(struct mlx5e_priv *priv) { + int i; + destroy_workqueue(priv->wq); free_cpumask_var(priv->scratchpad.cpumask); + + for (i = 0; i < priv->htb.max_qos_sqs; i++) + kfree(priv->htb.qos_sq_stats[i]); + kvfree(priv->htb.qos_sq_stats); }
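For completeness (hypothetical names, not from the patch): the cleanup above follows the common idiom of a kvcalloc'ed array of pointers whose entries are kzalloc'ed individually, so teardown frees each entry before the array itself. A paired allocator might look like the sketch below; the driver presumably fills this array lazily as QoS SQs are opened.

/* Sketch of the matching allocation idiom (assumed layout and names). */
struct example_sq_stats { u64 packets; u64 bytes; };

static struct example_sq_stats **example_alloc_stats(unsigned int n)
{
	struct example_sq_stats **arr;
	unsigned int i;

	arr = kvcalloc(n, sizeof(*arr), GFP_KERNEL);
	if (!arr)
		return NULL;

	for (i = 0; i < n; i++) {
		arr[i] = kzalloc(sizeof(*arr[i]), GFP_KERNEL);
		if (!arr[i])
			goto err_free;
	}
	return arr;

err_free:
	while (i--)
		kfree(arr[i]);
	kvfree(arr);
	return NULL;
}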
-struct net_device *mlx5e_create_netdev(struct mlx5_core_dev *mdev, - const struct mlx5e_profile *profile, - int nch, - void *ppriv) +struct net_device * +mlx5e_create_netdev(struct mlx5_core_dev *mdev, unsigned int txqs, unsigned int rxqs) { struct net_device *netdev; - unsigned int ptp_txqs = 0; int err;
- if (MLX5_CAP_GEN(mdev, ts_cqe_to_dest_cqn)) - ptp_txqs = profile->max_tc; - - netdev = alloc_etherdev_mqs(sizeof(struct mlx5e_priv), - nch * profile->max_tc + ptp_txqs, - nch * profile->rq_groups); + netdev = alloc_etherdev_mqs(sizeof(struct mlx5e_priv), txqs, rxqs); if (!netdev) { mlx5_core_err(mdev, "alloc_etherdev_mqs() failed\n"); return NULL; }
- err = profile->init(mdev, netdev, profile, ppriv); + err = mlx5e_priv_init(netdev_priv(netdev), netdev, mdev); if (err) { - mlx5_core_err(mdev, "failed to init mlx5e profile %d\n", err); + mlx5_core_err(mdev, "mlx5e_priv_init failed, err=%d\n", err); goto err_free_netdev; }
+ netif_carrier_off(netdev); + dev_net_set(netdev, mlx5_core_net(mdev)); + return netdev;
err_free_netdev: @@@ -5543,23 -5418,14 +5550,23 @@@ return NULL; }
+static void mlx5e_update_features(struct net_device *netdev) +{ + if (netdev->reg_state != NETREG_REGISTERED) + return; /* features will be updated on netdev registration */ + + rtnl_lock(); + netdev_update_features(netdev); + rtnl_unlock(); +} + int mlx5e_attach_netdev(struct mlx5e_priv *priv) { const bool take_rtnl = priv->netdev->reg_state == NETREG_REGISTERED; - const struct mlx5e_profile *profile; + const struct mlx5e_profile *profile = priv->profile; int max_nch; int err;
- profile = priv->profile; clear_bit(MLX5E_STATE_DESTROYING, &priv->state);
/* max number of channels may have changed */ @@@ -5599,8 -5465,6 +5606,8 @@@ if (profile->enable) profile->enable(priv);
+ mlx5e_update_features(priv->netdev); + return 0;
err_cleanup_tx: @@@ -5627,76 -5491,13 +5634,76 @@@ void mlx5e_detach_netdev(struct mlx5e_p cancel_work_sync(&priv->update_stats_work); }
+static int +mlx5e_netdev_attach_profile(struct mlx5e_priv *priv, + const struct mlx5e_profile *new_profile, void *new_ppriv) +{ + struct net_device *netdev = priv->netdev; + struct mlx5_core_dev *mdev = priv->mdev; + int err; + + err = mlx5e_priv_init(priv, netdev, mdev); + if (err) { + mlx5_core_err(mdev, "mlx5e_priv_init failed, err=%d\n", err); + return err; + } + netif_carrier_off(netdev); + priv->profile = new_profile; + priv->ppriv = new_ppriv; + err = new_profile->init(priv->mdev, priv->netdev); + if (err) + return err; + err = mlx5e_attach_netdev(priv); + if (err) + new_profile->cleanup(priv); + return err; +} + +int mlx5e_netdev_change_profile(struct mlx5e_priv *priv, + const struct mlx5e_profile *new_profile, void *new_ppriv) +{ + unsigned int new_max_nch = mlx5e_calc_max_nch(priv, new_profile); + const struct mlx5e_profile *orig_profile = priv->profile; + void *orig_ppriv = priv->ppriv; + int err, rollback_err; + + /* sanity */ + if (new_max_nch != priv->max_nch) { + netdev_warn(priv->netdev, + "%s: Replacing profile with different max channels\n", + __func__); + return -EINVAL; + } + + /* cleanup old profile */ + mlx5e_detach_netdev(priv); + priv->profile->cleanup(priv); + mlx5e_priv_cleanup(priv); + + err = mlx5e_netdev_attach_profile(priv, new_profile, new_ppriv); + if (err) { /* roll back to original profile */ + netdev_warn(priv->netdev, "%s: new profile init failed, %d\n", + __func__, err); + goto rollback; + } + + return 0; + +rollback: + rollback_err = mlx5e_netdev_attach_profile(priv, orig_profile, orig_ppriv); + if (rollback_err) { + netdev_err(priv->netdev, + "%s: failed to rollback to orig profile, %d\n", + __func__, rollback_err); + } + return err; +} + void mlx5e_destroy_netdev(struct mlx5e_priv *priv) { - const struct mlx5e_profile *profile = priv->profile; struct net_device *netdev = priv->netdev;
- if (profile->cleanup) - profile->cleanup(priv); + mlx5e_priv_cleanup(priv); free_netdev(netdev); }
@@@ -5742,48 -5543,28 +5749,48 @@@ static int mlx5e_probe(struct auxiliary const struct auxiliary_device_id *id) { struct mlx5_adev *edev = container_of(adev, struct mlx5_adev, adev); + const struct mlx5e_profile *profile = &mlx5e_nic_profile; struct mlx5_core_dev *mdev = edev->mdev; struct net_device *netdev; pm_message_t state = {}; - void *priv; + unsigned int txqs, rxqs, ptp_txqs = 0; + struct mlx5e_priv *priv; + int qos_sqs = 0; int err; int nch;
+ if (MLX5_CAP_GEN(mdev, ts_cqe_to_dest_cqn)) + ptp_txqs = profile->max_tc; + + if (mlx5_qos_is_supported(mdev)) + qos_sqs = mlx5e_qos_max_leaf_nodes(mdev); + nch = mlx5e_get_max_num_channels(mdev); - netdev = mlx5e_create_netdev(mdev, &mlx5e_nic_profile, nch, NULL); + txqs = nch * profile->max_tc + ptp_txqs + qos_sqs; + rxqs = nch * profile->rq_groups; + netdev = mlx5e_create_netdev(mdev, txqs, rxqs); if (!netdev) { mlx5_core_err(mdev, "mlx5e_create_netdev failed\n"); return -ENOMEM; }
- dev_net_set(netdev, mlx5_core_net(mdev)); + mlx5e_build_nic_netdev(netdev); + priv = netdev_priv(netdev); dev_set_drvdata(&adev->dev, priv);
+ priv->profile = profile; + priv->ppriv = NULL; + err = profile->init(mdev, netdev); + if (err) { + mlx5_core_err(mdev, "mlx5e_nic_profile init failed, %d\n", err); + goto err_destroy_netdev; + } + err = mlx5e_resume(adev); if (err) { mlx5_core_err(mdev, "mlx5e_resume failed, %d\n", err); - goto err_destroy_netdev; + goto err_profile_cleanup; }
err = register_netdev(netdev); @@@ -5799,8 -5580,6 +5806,8 @@@
err_resume: mlx5e_suspend(adev, state); +err_profile_cleanup: + profile->cleanup(priv); err_destroy_netdev: mlx5e_destroy_netdev(priv); return err; @@@ -5814,7 -5593,6 +5821,7 @@@ static void mlx5e_remove(struct auxilia mlx5e_dcbnl_delete_app(priv); unregister_netdev(priv->netdev); mlx5e_suspend(adev, state); + priv->profile->cleanup(priv); mlx5e_destroy_netdev(priv); }
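Tying the probe-time sizing together (illustrative numbers only, not from the patch): alloc_etherdev_mqs() has to be sized for the worst case, so the probe path now adds the PTP SQs and the maximum number of HTB/QoS leaf nodes on top of the per-channel TC queues, while the runtime code above trims the actual counts with netif_set_real_num_{tx,rx}_queues().

/* Worked example with made-up capability values:
 *	nch       = 32	(mlx5e_get_max_num_channels())
 *	max_tc    = 8
 *	ptp_txqs  = 8	(ts_cqe_to_dest_cqn supported)
 *	qos_sqs   = 64	(mlx5e_qos_max_leaf_nodes())
 *	rq_groups = 2
 *
 *	txqs = 32 * 8 + 8 + 64 = 328
 *	rxqs = 32 * 2          = 64
 */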
diff --combined drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index fac96ea819a1,4864deed9dc9..1b6ad94ebb10 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@@ -47,11 -47,11 +47,11 @@@ #include "fpga/ipsec.h" #include "en_accel/ipsec_rxtx.h" #include "en_accel/tls_rxtx.h" -#include "lib/clock.h" #include "en/xdp.h" #include "en/xsk/rx.h" #include "en/health.h" #include "en/params.h" +#include "devlink.h"
static struct sk_buff * mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, @@@ -212,6 -212,11 +212,6 @@@ static inline u32 mlx5e_decompress_cqes return mlx5e_decompress_cqes_cont(rq, wq, 1, budget_rem) - 1; }
-static inline bool mlx5e_page_is_reserved(struct page *page) -{ - return page_is_pfmemalloc(page) || page_to_nid(page) != numa_mem_id(); -} - static inline bool mlx5e_rx_cache_put(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info) { @@@ -224,7 -229,7 +224,7 @@@ return false; }
- if (unlikely(mlx5e_page_is_reserved(dma_info->page))) { + if (!dev_page_is_reusable(dma_info->page)) { stats->cache_waive++; return false; } @@@ -1061,8 -1066,9 +1061,8 @@@ static inline void mlx5e_build_rx_skb(s }
if (unlikely(mlx5e_rx_hw_stamp(rq->tstamp))) - skb_hwtstamps(skb)->hwtstamp = - mlx5_timecounter_cyc2time(rq->clock, get_cqe_ts(cqe)); - + skb_hwtstamps(skb)->hwtstamp = mlx5e_cqe_ts_to_ns(rq->ptp_cyc2time, + rq->clock, get_cqe_ts(cqe)); skb_record_rx_queue(skb, rq->ix);
if (likely(netdev->features & NETIF_F_RXHASH)) @@@ -1120,8 -1126,12 +1120,8 @@@ struct sk_buff *mlx5e_build_linear_skb( static void mlx5e_fill_xdp_buff(struct mlx5e_rq *rq, void *va, u16 headroom, u32 len, struct xdp_buff *xdp) { - xdp->data_hard_start = va; - xdp->data = va + headroom; - xdp_set_data_meta_invalid(xdp); - xdp->data_end = xdp->data + len; - xdp->rxq = &rq->xdp_rxq; - xdp->frame_sz = rq->buff.frame0_sz; + xdp_init_buff(xdp, rq->buff.frame0_sz, &rq->xdp_rxq); + xdp_prepare_buff(xdp, va, headroom, len, false); }
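For readers tracking the XDP change above: the helper pair appears to map one-to-one onto the removed open-coded assignments, with the final `false` argument standing in for xdp_set_data_meta_invalid().

/* Rough equivalence, based on the removed lines above (sketch):
 *	xdp_init_buff(xdp, frame_sz, rxq)	~  xdp->frame_sz = frame_sz;
 *						   xdp->rxq = rxq;
 *	xdp_prepare_buff(xdp, va,		~  xdp->data_hard_start = va;
 *			 headroom, len, false)	   xdp->data = va + headroom;
 *						   xdp->data_end = xdp->data + len;
 *						   xdp_set_data_meta_invalid(xdp);
 */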
static struct sk_buff * @@@ -1665,8 -1675,9 +1665,8 @@@ static inline void mlx5i_complete_rx_cq }
if (unlikely(mlx5e_rx_hw_stamp(tstamp))) - skb_hwtstamps(skb)->hwtstamp = - mlx5_timecounter_cyc2time(rq->clock, get_cqe_ts(cqe)); - + skb_hwtstamps(skb)->hwtstamp = mlx5e_cqe_ts_to_ns(rq->ptp_cyc2time, + rq->clock, get_cqe_ts(cqe)); skb_record_rx_queue(skb, rq->ix);
if (likely(netdev->features & NETIF_F_RXHASH)) @@@ -1783,10 -1794,12 +1783,10 @@@ int mlx5e_rq_set_handlers(struct mlx5e_ rq->dealloc_wqe = mlx5e_dealloc_rx_mpwqe;
rq->handle_rx_cqe = priv->profile->rx_handlers->handle_rx_cqe_mpwqe; - if (MLX5_IPSEC_DEV(mdev)) { - netdev_err(netdev, "MPWQE RQ with IPSec offload not supported\n"); -#ifdef CONFIG_MLX5_EN_IPSEC + if (mlx5_fpga_is_ipsec_device(mdev)) { + netdev_err(netdev, "MPWQE RQ with Innova IPSec offload not supported\n"); return -EINVAL; } -#endif if (!rq->handle_rx_cqe) { netdev_err(netdev, "RX handler of MPWQE RQ is not set\n"); return -EINVAL; @@@ -1816,48 -1829,3 +1816,48 @@@
return 0; } + +static void mlx5e_trap_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) +{ + struct mlx5e_priv *priv = netdev_priv(rq->netdev); + struct mlx5_wq_cyc *wq = &rq->wqe.wq; + struct mlx5e_wqe_frag_info *wi; + struct sk_buff *skb; + u32 cqe_bcnt; + u16 trap_id; + u16 ci; + + trap_id = get_cqe_flow_tag(cqe); + ci = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter)); + wi = get_frag(rq, ci); + cqe_bcnt = be32_to_cpu(cqe->byte_cnt); + + if (unlikely(MLX5E_RX_ERR_CQE(cqe))) { + rq->stats->wqe_err++; + goto free_wqe; + } + + skb = mlx5e_skb_from_cqe_nonlinear(rq, cqe, wi, cqe_bcnt); + if (!skb) + goto free_wqe; + + mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb); + skb_push(skb, ETH_HLEN); + + mlx5_devlink_trap_report(rq->mdev, trap_id, skb, &priv->dl_port); + dev_kfree_skb_any(skb); + +free_wqe: + mlx5e_free_rx_wqe(rq, wi, false); + mlx5_wq_cyc_pop(wq); +} + +void mlx5e_rq_set_trap_handlers(struct mlx5e_rq *rq, struct mlx5e_params *params) +{ + rq->wqe.skb_from_cqe = mlx5e_rx_is_linear_skb(params, NULL) ? + mlx5e_skb_from_cqe_linear : + mlx5e_skb_from_cqe_nonlinear; + rq->post_wqes = mlx5e_post_rx_wqes; + rq->dealloc_wqe = mlx5e_dealloc_rx_wqe; + rq->handle_rx_cqe = mlx5e_trap_handle_rx_cqe; +} diff --combined drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 9f126054d371,717fbaa6ce73..0da69b98f38f --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@@ -63,8 -63,6 +63,8 @@@ #include "en/mapping.h" #include "en/tc_ct.h" #include "en/mod_hdr.h" +#include "en/tc_priv.h" +#include "en/tc_tun_encap.h" #include "lib/devcom.h" #include "lib/geneve.h" #include "lib/fs_chains.h" @@@ -73,6 -71,90 +73,6 @@@
#define nic_chains(priv) ((priv)->fs.tc.chains) #define MLX5_MH_ACT_SZ MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto) -#define MLX5E_TC_FLOW_BASE (MLX5E_TC_FLAG_LAST_EXPORTED_BIT + 1) - -enum { - MLX5E_TC_FLOW_FLAG_INGRESS = MLX5E_TC_FLAG_INGRESS_BIT, - MLX5E_TC_FLOW_FLAG_EGRESS = MLX5E_TC_FLAG_EGRESS_BIT, - MLX5E_TC_FLOW_FLAG_ESWITCH = MLX5E_TC_FLAG_ESW_OFFLOAD_BIT, - MLX5E_TC_FLOW_FLAG_FT = MLX5E_TC_FLAG_FT_OFFLOAD_BIT, - MLX5E_TC_FLOW_FLAG_NIC = MLX5E_TC_FLAG_NIC_OFFLOAD_BIT, - MLX5E_TC_FLOW_FLAG_OFFLOADED = MLX5E_TC_FLOW_BASE, - MLX5E_TC_FLOW_FLAG_HAIRPIN = MLX5E_TC_FLOW_BASE + 1, - MLX5E_TC_FLOW_FLAG_HAIRPIN_RSS = MLX5E_TC_FLOW_BASE + 2, - MLX5E_TC_FLOW_FLAG_SLOW = MLX5E_TC_FLOW_BASE + 3, - MLX5E_TC_FLOW_FLAG_DUP = MLX5E_TC_FLOW_BASE + 4, - MLX5E_TC_FLOW_FLAG_NOT_READY = MLX5E_TC_FLOW_BASE + 5, - MLX5E_TC_FLOW_FLAG_DELETED = MLX5E_TC_FLOW_BASE + 6, - MLX5E_TC_FLOW_FLAG_CT = MLX5E_TC_FLOW_BASE + 7, - MLX5E_TC_FLOW_FLAG_L3_TO_L2_DECAP = MLX5E_TC_FLOW_BASE + 8, -}; - -#define MLX5E_TC_MAX_SPLITS 1 - -/* Helper struct for accessing a struct containing list_head array. - * Containing struct - * |- Helper array - * [0] Helper item 0 - * |- list_head item 0 - * |- index (0) - * [1] Helper item 1 - * |- list_head item 1 - * |- index (1) - * To access the containing struct from one of the list_head items: - * 1. Get the helper item from the list_head item using - * helper item = - * container_of(list_head item, helper struct type, list_head field) - * 2. Get the contining struct from the helper item and its index in the array: - * containing struct = - * container_of(helper item, containing struct type, helper field[index]) - */ -struct encap_flow_item { - struct mlx5e_encap_entry *e; /* attached encap instance */ - struct list_head list; - int index; -}; - -struct mlx5e_tc_flow { - struct rhash_head node; - struct mlx5e_priv *priv; - u64 cookie; - unsigned long flags; - struct mlx5_flow_handle *rule[MLX5E_TC_MAX_SPLITS + 1]; - - /* flows sharing the same reformat object - currently mpls decap */ - struct list_head l3_to_l2_reformat; - struct mlx5e_decap_entry *decap_reformat; - - /* Flow can be associated with multiple encap IDs. - * The number of encaps is bounded by the number of supported - * destinations. - */ - struct encap_flow_item encaps[MLX5_MAX_FLOW_FWD_VPORTS]; - struct mlx5e_tc_flow *peer_flow; - struct mlx5e_mod_hdr_handle *mh; /* attached mod header instance */ - struct mlx5e_hairpin_entry *hpe; /* attached hairpin instance */ - struct list_head hairpin; /* flows sharing the same hairpin */ - struct list_head peer; /* flows with peer flow */ - struct list_head unready; /* flows not ready to be offloaded (e.g due to missing route) */ - struct net_device *orig_dev; /* netdev adding flow first */ - int tmp_efi_index; - struct list_head tmp_list; /* temporary flow list used by neigh update */ - refcount_t refcnt; - struct rcu_head rcu_head; - struct completion init_done; - int tunnel_id; /* the mapped tunnel id of this flow */ - struct mlx5_flow_attr *attr; -}; - -struct mlx5e_tc_flow_parse_attr { - const struct ip_tunnel_info *tun_info[MLX5_MAX_FLOW_FWD_VPORTS]; - struct net_device *filter_dev; - struct mlx5_flow_spec spec; - struct mlx5e_tc_mod_hdr_acts mod_hdr_acts; - int mirred_ifindex[MLX5_MAX_FLOW_FWD_VPORTS]; - struct ethhdr eth; -};
#define MLX5E_TC_TABLE_NUM_GROUPS 4 #define MLX5E_TC_TABLE_MAX_GROUP_SIZE BIT(18) @@@ -83,15 -165,10 +83,15 @@@ struct mlx5e_tc_attr_to_reg_mapping mlx .moffset = 0, .mlen = 2, }, + [VPORT_TO_REG] = { + .mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_C_0, + .moffset = 2, + .mlen = 2, + }, [TUNNEL_TO_REG] = { .mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_C_1, .moffset = 1, - .mlen = 3, + .mlen = ((ESW_TUN_OPTS_BITS + ESW_TUN_ID_BITS) / 8), .soffset = MLX5_BYTE_OFF(fte_match_param, misc_parameters_2.metadata_reg_c_1), }, @@@ -113,14 -190,6 +113,14 @@@ [NIC_ZONE_RESTORE_TO_REG] = nic_zone_restore_to_reg_ct, };
+/* To avoid a false lock dependency warning, give the tc_ht lock + * a class different from the lock class of the ht used when deleting + * the last flow from a group and then deleting the group: that path reaches + * del_sw_flow_group(), which calls rhashtable_destroy() on fg->ftes_hash and + * takes that ht->mutex, which is a different ht->mutex than the one here. + */ +static struct lock_class_key tc_ht_lock_key; + static void mlx5e_put_flow_tunnel_id(struct mlx5e_tc_flow *flow);
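The dedicated lock class declared above is attached with lockdep_set_class() in the mlx5e_tc_nic_init() and mlx5e_tc_esw_init() hunks further down. A minimal sketch of the general idiom, with hypothetical names:

/* Hypothetical example of the idiom: give one rhashtable's mutex its own
 * lockdep class so it is not conflated with other ht->mutex instances
 * taken in the same call chain (e.g. via rhashtable_destroy()).
 */
static struct lock_class_key example_ht_lock_key;

static int example_ht_init(struct rhashtable *ht,
			   const struct rhashtable_params *params)
{
	int err = rhashtable_init(ht, params);

	if (!err)
		lockdep_set_class(&ht->mutex, &example_ht_lock_key);
	return err;
}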
void @@@ -170,11 -239,11 +170,11 @@@ mlx5e_tc_match_to_reg_get_match(struct }
int -mlx5e_tc_match_to_reg_set(struct mlx5_core_dev *mdev, - struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts, - enum mlx5_flow_namespace_type ns, - enum mlx5e_tc_attr_to_reg type, - u32 data) +mlx5e_tc_match_to_reg_set_and_get_id(struct mlx5_core_dev *mdev, + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts, + enum mlx5_flow_namespace_type ns, + enum mlx5e_tc_attr_to_reg type, + u32 data) { int moffset = mlx5e_tc_attr_to_reg_mappings[type].moffset; int mfield = mlx5e_tc_attr_to_reg_mappings[type].mfield; @@@ -198,10 -267,9 +198,10 @@@ MLX5_SET(set_action_in, modact, offset, moffset * 8); MLX5_SET(set_action_in, modact, length, mlen * 8); MLX5_SET(set_action_in, modact, data, data); + err = mod_hdr_acts->num_actions; mod_hdr_acts->num_actions++;
- return 0; + return err; }
static struct mlx5_tc_ct_priv * @@@ -250,41 -318,6 +250,41 @@@ mlx5_tc_rule_delete(struct mlx5e_priv * mlx5e_del_offloaded_nic_rule(priv, rule, attr); }
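The next hunk keeps the old mlx5e_tc_match_to_reg_set() behaviour as a thin wrapper around the renamed _set_and_get_id() variant above, and adds mlx5e_tc_match_to_reg_mod_hdr_change(), which rewrites an already-emitted SET action in place. A usage sketch (the caller and the values are hypothetical):

/* Hypothetical caller: remember the action index returned by
 * ..._set_and_get_id() so the same register-set action can later be
 * rewritten in place with ..._mod_hdr_change(), without rebuilding the
 * whole modify-header action list.
 */
static int example_set_then_update_vport_reg(struct mlx5_core_dev *mdev,
					     struct mlx5e_tc_mod_hdr_acts *acts,
					     u32 first_vport, u32 new_vport)
{
	int act_id;

	act_id = mlx5e_tc_match_to_reg_set_and_get_id(mdev, acts,
						      MLX5_FLOW_NAMESPACE_FDB,
						      VPORT_TO_REG, first_vport);
	if (act_id < 0)
		return act_id;

	/* ... later, when the destination vport must change ... */
	mlx5e_tc_match_to_reg_mod_hdr_change(mdev, acts, VPORT_TO_REG,
					     act_id, new_vport);
	return 0;
}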
+int +mlx5e_tc_match_to_reg_set(struct mlx5_core_dev *mdev, + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts, + enum mlx5_flow_namespace_type ns, + enum mlx5e_tc_attr_to_reg type, + u32 data) +{ + int ret = mlx5e_tc_match_to_reg_set_and_get_id(mdev, mod_hdr_acts, ns, type, data); + + return ret < 0 ? ret : 0; +} + +void mlx5e_tc_match_to_reg_mod_hdr_change(struct mlx5_core_dev *mdev, + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts, + enum mlx5e_tc_attr_to_reg type, + int act_id, u32 data) +{ + int moffset = mlx5e_tc_attr_to_reg_mappings[type].moffset; + int mfield = mlx5e_tc_attr_to_reg_mappings[type].mfield; + int mlen = mlx5e_tc_attr_to_reg_mappings[type].mlen; + char *modact; + + modact = mod_hdr_acts->actions + (act_id * MLX5_MH_ACT_SZ); + + /* Firmware has 5bit length field and 0 means 32bits */ + if (mlen == 4) + mlen = 0; + + MLX5_SET(set_action_in, modact, action_type, MLX5_ACTION_TYPE_SET); + MLX5_SET(set_action_in, modact, field, mfield); + MLX5_SET(set_action_in, modact, offset, moffset * 8); + MLX5_SET(set_action_in, modact, length, mlen * 8); + MLX5_SET(set_action_in, modact, data, data); +} + struct mlx5e_hairpin { struct mlx5_hairpin *pair;
@@@ -322,14 -355,15 +322,14 @@@ struct mlx5e_hairpin_entry static void mlx5e_tc_del_flow(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow);
-static struct mlx5e_tc_flow *mlx5e_flow_get(struct mlx5e_tc_flow *flow) +struct mlx5e_tc_flow *mlx5e_flow_get(struct mlx5e_tc_flow *flow) { if (!flow || !refcount_inc_not_zero(&flow->refcnt)) return ERR_PTR(-EINVAL); return flow; }
-static void mlx5e_flow_put(struct mlx5e_priv *priv, - struct mlx5e_tc_flow *flow) +void mlx5e_flow_put(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow) { if (refcount_dec_and_test(&flow->refcnt)) { mlx5e_tc_del_flow(priv, flow); @@@ -337,6 -371,48 +337,6 @@@ } }
-static void __flow_flag_set(struct mlx5e_tc_flow *flow, unsigned long flag) -{ - /* Complete all memory stores before setting bit. */ - smp_mb__before_atomic(); - set_bit(flag, &flow->flags); -} - -#define flow_flag_set(flow, flag) __flow_flag_set(flow, MLX5E_TC_FLOW_FLAG_##flag) - -static bool __flow_flag_test_and_set(struct mlx5e_tc_flow *flow, - unsigned long flag) -{ - /* test_and_set_bit() provides all necessary barriers */ - return test_and_set_bit(flag, &flow->flags); -} - -#define flow_flag_test_and_set(flow, flag) \ - __flow_flag_test_and_set(flow, \ - MLX5E_TC_FLOW_FLAG_##flag) - -static void __flow_flag_clear(struct mlx5e_tc_flow *flow, unsigned long flag) -{ - /* Complete all memory stores before clearing bit. */ - smp_mb__before_atomic(); - clear_bit(flag, &flow->flags); -} - -#define flow_flag_clear(flow, flag) __flow_flag_clear(flow, \ - MLX5E_TC_FLOW_FLAG_##flag) - -static bool __flow_flag_test(struct mlx5e_tc_flow *flow, unsigned long flag) -{ - bool ret = test_bit(flag, &flow->flags); - - /* Read fields of flow structure only after checking flags. */ - smp_mb__after_atomic(); - return ret; -} - -#define flow_flag_test(flow, flag) __flow_flag_test(flow, \ - MLX5E_TC_FLOW_FLAG_##flag) - bool mlx5e_is_eswitch_flow(struct mlx5e_tc_flow *flow) { return flow_flag_test(flow, ESWITCH); @@@ -347,7 -423,7 +347,7 @@@ static bool mlx5e_is_ft_flow(struct mlx return flow_flag_test(flow, FT); }
-static bool mlx5e_is_offloaded_flow(struct mlx5e_tc_flow *flow) +bool mlx5e_is_offloaded_flow(struct mlx5e_tc_flow *flow) { return flow_flag_test(flow, OFFLOADED); } @@@ -1062,7 -1138,23 +1062,7 @@@ static void mlx5e_tc_del_nic_flow(struc kfree(flow->attr); }
-static void mlx5e_detach_encap(struct mlx5e_priv *priv, - struct mlx5e_tc_flow *flow, int out_index); - -static int mlx5e_attach_encap(struct mlx5e_priv *priv, - struct mlx5e_tc_flow *flow, - struct net_device *mirred_dev, - int out_index, - struct netlink_ext_ack *extack, - struct net_device **encap_dev, - bool *encap_valid); -static int mlx5e_attach_decap(struct mlx5e_priv *priv, - struct mlx5e_tc_flow *flow, - struct netlink_ext_ack *extack); -static void mlx5e_detach_decap(struct mlx5e_priv *priv, - struct mlx5e_tc_flow *flow); - -static struct mlx5_flow_handle * +struct mlx5_flow_handle * mlx5e_tc_offload_fdb_rules(struct mlx5_eswitch *esw, struct mlx5e_tc_flow *flow, struct mlx5_flow_spec *spec, @@@ -1097,9 -1189,10 +1097,9 @@@ return rule; }
-static void -mlx5e_tc_unoffload_fdb_rules(struct mlx5_eswitch *esw, - struct mlx5e_tc_flow *flow, - struct mlx5_flow_attr *attr) +void mlx5e_tc_unoffload_fdb_rules(struct mlx5_eswitch *esw, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_attr *attr) { flow_flag_clear(flow, OFFLOADED);
@@@ -1118,7 -1211,7 +1118,7 @@@ offload_rule_0 mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], attr); }
-static struct mlx5_flow_handle * +struct mlx5_flow_handle * mlx5e_tc_offload_to_slow_path(struct mlx5_eswitch *esw, struct mlx5e_tc_flow *flow, struct mlx5_flow_spec *spec) @@@ -1144,8 -1237,9 +1144,8 @@@ return rule; }
-static void -mlx5e_tc_unoffload_from_slow_path(struct mlx5_eswitch *esw, - struct mlx5e_tc_flow *flow) +void mlx5e_tc_unoffload_from_slow_path(struct mlx5_eswitch *esw, + struct mlx5e_tc_flow *flow) { struct mlx5_flow_attr *slow_attr;
@@@ -1213,63 -1307,6 +1213,63 @@@ static void remove_unready_flow(struct mutex_unlock(&uplink_priv->unready_flows_lock); }
+static bool same_hw_devs(struct mlx5e_priv *priv, struct mlx5e_priv *peer_priv); + +bool mlx5e_tc_is_vf_tunnel(struct net_device *out_dev, struct net_device *route_dev) +{ + struct mlx5_core_dev *out_mdev, *route_mdev; + struct mlx5e_priv *out_priv, *route_priv; + + out_priv = netdev_priv(out_dev); + out_mdev = out_priv->mdev; + route_priv = netdev_priv(route_dev); + route_mdev = route_priv->mdev; + + if (out_mdev->coredev_type != MLX5_COREDEV_PF || + route_mdev->coredev_type != MLX5_COREDEV_VF) + return false; + + return same_hw_devs(out_priv, route_priv); +} + +int mlx5e_tc_query_route_vport(struct net_device *out_dev, struct net_device *route_dev, u16 *vport) +{ + struct mlx5e_priv *out_priv, *route_priv; + struct mlx5_core_dev *route_mdev; + struct mlx5_eswitch *esw; + u16 vhca_id; + int err; + + out_priv = netdev_priv(out_dev); + esw = out_priv->mdev->priv.eswitch; + route_priv = netdev_priv(route_dev); + route_mdev = route_priv->mdev; + + vhca_id = MLX5_CAP_GEN(route_mdev, vhca_id); + err = mlx5_eswitch_vhca_id_to_vport(esw, vhca_id, vport); + return err; +} + +int mlx5e_tc_add_flow_mod_hdr(struct mlx5e_priv *priv, + struct mlx5e_tc_flow_parse_attr *parse_attr, + struct mlx5e_tc_flow *flow) +{ + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts = &parse_attr->mod_hdr_acts; + struct mlx5_modify_hdr *mod_hdr; + + mod_hdr = mlx5_modify_header_alloc(priv->mdev, + get_flow_name_space(flow), + mod_hdr_acts->num_actions, + mod_hdr_acts->actions); + if (IS_ERR(mod_hdr)) + return PTR_ERR(mod_hdr); + + WARN_ON(flow->attr->modify_hdr); + flow->attr->modify_hdr = mod_hdr; + + return 0; +} + static int mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow, @@@ -1279,15 -1316,21 +1279,15 @@@ struct net_device *out_dev, *encap_dev = NULL; struct mlx5e_tc_flow_parse_attr *parse_attr; struct mlx5_flow_attr *attr = flow->attr; + bool vf_tun = false, encap_valid = true; struct mlx5_esw_flow_attr *esw_attr; struct mlx5_fc *counter = NULL; struct mlx5e_rep_priv *rpriv; struct mlx5e_priv *out_priv; - bool encap_valid = true; u32 max_prio, max_chain; int err = 0; int out_index;
- if (!mlx5_chains_prios_supported(esw_chains(esw)) && attr->prio != 1) { - NL_SET_ERR_MSG_MOD(extack, - "E-switch priorities unsupported, upgrade FW"); - return -EOPNOTSUPP; - } - /* We check chain range only for tc flows. * For ft flows, we checked attr->chain was originally 0 and set it to * FDB_FT_CHAIN which is outside tc range. @@@ -1297,28 -1340,20 +1297,28 @@@ if (!mlx5e_is_ft_flow(flow) && attr->chain > max_chain) { NL_SET_ERR_MSG_MOD(extack, "Requested chain is out of supported range"); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto err_out; }
max_prio = mlx5_chains_get_prio_range(esw_chains(esw)); if (attr->prio > max_prio) { NL_SET_ERR_MSG_MOD(extack, "Requested priority is out of supported range"); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto err_out; + } + + if (flow_flag_test(flow, TUN_RX)) { + err = mlx5e_attach_decap_route(priv, flow); + if (err) + goto err_out; }
if (flow_flag_test(flow, L3_TO_L2_DECAP)) { err = mlx5e_attach_decap(priv, flow, extack); if (err) - return err; + goto err_out; }
parse_attr = attr->parse_attr; @@@ -1336,11 -1371,8 +1336,11 @@@ err = mlx5e_attach_encap(priv, flow, out_dev, out_index, extack, &encap_dev, &encap_valid); if (err) - return err; + goto err_out;
+ if (esw_attr->dests[out_index].flags & + MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE) + vf_tun = true; out_priv = netdev_priv(encap_dev); rpriv = out_priv->ppriv; esw_attr->dests[out_index].rep = rpriv->rep; @@@ -1349,27 -1381,20 +1349,27 @@@
err = mlx5_eswitch_add_vlan_action(esw, attr); if (err) - return err; + goto err_out;
if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR && !(attr->ct_attr.ct_action & TCA_CT_ACT_CLEAR)) { - err = mlx5e_attach_mod_hdr(priv, flow, parse_attr); - dealloc_mod_hdr_actions(&parse_attr->mod_hdr_acts); - if (err) - return err; + if (vf_tun) { + err = mlx5e_tc_add_flow_mod_hdr(priv, parse_attr, flow); + if (err) + goto err_out; + } else { + err = mlx5e_attach_mod_hdr(priv, flow, parse_attr); + if (err) + goto err_out; + } }
if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { counter = mlx5_fc_create(esw_attr->counter_dev, true); - if (IS_ERR(counter)) - return PTR_ERR(counter); + if (IS_ERR(counter)) { + err = PTR_ERR(counter); + goto err_out; + }
attr->counter = counter; } @@@ -1383,17 -1408,12 +1383,17 @@@ else flow->rule[0] = mlx5e_tc_offload_fdb_rules(esw, flow, &parse_attr->spec, attr);
- if (IS_ERR(flow->rule[0])) - return PTR_ERR(flow->rule[0]); - else - flow_flag_set(flow, OFFLOADED); + if (IS_ERR(flow->rule[0])) { + err = PTR_ERR(flow->rule[0]); + goto err_out; + } + flow_flag_set(flow, OFFLOADED);
return 0; + +err_out: + flow_flag_set(flow, FAILED); + return err; }
static bool mlx5_flow_has_geneve_opt(struct mlx5e_tc_flow *flow) @@@ -1414,11 -1434,8 +1414,11 @@@ static void mlx5e_tc_del_fdb_flow(struc { struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; struct mlx5_flow_attr *attr = flow->attr; + struct mlx5_esw_flow_attr *esw_attr; + bool vf_tun = false; int out_index;
+ esw_attr = attr->esw_attr; mlx5e_put_flow_tunnel_id(flow);
if (flow_flag_test(flow, NOT_READY)) @@@ -1436,33 -1453,20 +1436,33 @@@
mlx5_eswitch_del_vlan_action(esw, attr);
- for (out_index = 0; out_index < MLX5_MAX_FLOW_FWD_VPORTS; out_index++) - if (attr->esw_attr->dests[out_index].flags & MLX5_ESW_DEST_ENCAP) { + if (flow->decap_route) + mlx5e_detach_decap_route(priv, flow); + + for (out_index = 0; out_index < MLX5_MAX_FLOW_FWD_VPORTS; out_index++) { + if (esw_attr->dests[out_index].flags & + MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE) + vf_tun = true; + if (esw_attr->dests[out_index].flags & MLX5_ESW_DEST_ENCAP) { mlx5e_detach_encap(priv, flow, out_index); kfree(attr->parse_attr->tun_info[out_index]); } - kvfree(attr->parse_attr); + }
mlx5_tc_ct_match_del(get_ct_priv(priv), &flow->attr->ct_attr);
- if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) - mlx5e_detach_mod_hdr(priv, flow); + if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) { + dealloc_mod_hdr_actions(&attr->parse_attr->mod_hdr_acts); + if (vf_tun && attr->modify_hdr) + mlx5_modify_header_dealloc(priv->mdev, attr->modify_hdr); + else + mlx5e_detach_mod_hdr(priv, flow); + } + kvfree(attr->parse_attr); + kvfree(attr->esw_attr->rx_tun_attr);
if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) - mlx5_fc_destroy(attr->esw_attr->counter_dev, attr->counter); + mlx5_fc_destroy(esw_attr->counter_dev, attr->counter);
if (flow_flag_test(flow, L3_TO_L2_DECAP)) mlx5e_detach_decap(priv, flow); @@@ -1470,13 -1474,141 +1470,13 @@@ kfree(flow->attr); }
-void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv, - struct mlx5e_encap_entry *e, - struct list_head *flow_list) -{ - struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - struct mlx5_esw_flow_attr *esw_attr; - struct mlx5_flow_handle *rule; - struct mlx5_flow_attr *attr; - struct mlx5_flow_spec *spec; - struct mlx5e_tc_flow *flow; - int err; - - e->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev, - e->reformat_type, - e->encap_size, e->encap_header, - MLX5_FLOW_NAMESPACE_FDB); - if (IS_ERR(e->pkt_reformat)) { - mlx5_core_warn(priv->mdev, "Failed to offload cached encapsulation header, %lu\n", - PTR_ERR(e->pkt_reformat)); - return; - } - e->flags |= MLX5_ENCAP_ENTRY_VALID; - mlx5e_rep_queue_neigh_stats_work(priv); - - list_for_each_entry(flow, flow_list, tmp_list) { - bool all_flow_encaps_valid = true; - int i; - - if (!mlx5e_is_offloaded_flow(flow)) - continue; - attr = flow->attr; - esw_attr = attr->esw_attr; - spec = &attr->parse_attr->spec; - - esw_attr->dests[flow->tmp_efi_index].pkt_reformat = e->pkt_reformat; - esw_attr->dests[flow->tmp_efi_index].flags |= MLX5_ESW_DEST_ENCAP_VALID; - /* Flow can be associated with multiple encap entries. - * Before offloading the flow verify that all of them have - * a valid neighbour. - */ - for (i = 0; i < MLX5_MAX_FLOW_FWD_VPORTS; i++) { - if (!(esw_attr->dests[i].flags & MLX5_ESW_DEST_ENCAP)) - continue; - if (!(esw_attr->dests[i].flags & MLX5_ESW_DEST_ENCAP_VALID)) { - all_flow_encaps_valid = false; - break; - } - } - /* Do not offload flows with unresolved neighbors */ - if (!all_flow_encaps_valid) - continue; - /* update from slow path rule to encap rule */ - rule = mlx5e_tc_offload_fdb_rules(esw, flow, spec, attr); - if (IS_ERR(rule)) { - err = PTR_ERR(rule); - mlx5_core_warn(priv->mdev, "Failed to update cached encapsulation flow, %d\n", - err); - continue; - } - - mlx5e_tc_unoffload_from_slow_path(esw, flow); - flow->rule[0] = rule; - /* was unset when slow path rule removed */ - flow_flag_set(flow, OFFLOADED); - } -} - -void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv, - struct mlx5e_encap_entry *e, - struct list_head *flow_list) -{ - struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - struct mlx5_esw_flow_attr *esw_attr; - struct mlx5_flow_handle *rule; - struct mlx5_flow_attr *attr; - struct mlx5_flow_spec *spec; - struct mlx5e_tc_flow *flow; - int err; - - list_for_each_entry(flow, flow_list, tmp_list) { - if (!mlx5e_is_offloaded_flow(flow)) - continue; - attr = flow->attr; - esw_attr = attr->esw_attr; - spec = &attr->parse_attr->spec; - - /* update from encap rule to slow path rule */ - rule = mlx5e_tc_offload_to_slow_path(esw, flow, spec); - /* mark the flow's encap dest as non-valid */ - esw_attr->dests[flow->tmp_efi_index].flags &= ~MLX5_ESW_DEST_ENCAP_VALID; - - if (IS_ERR(rule)) { - err = PTR_ERR(rule); - mlx5_core_warn(priv->mdev, "Failed to update slow path (encap) flow, %d\n", - err); - continue; - } - - mlx5e_tc_unoffload_fdb_rules(esw, flow, attr); - flow->rule[0] = rule; - /* was unset when fast path rule removed */ - flow_flag_set(flow, OFFLOADED); - } - - /* we know that the encap is valid */ - e->flags &= ~MLX5_ENCAP_ENTRY_VALID; - mlx5_packet_reformat_dealloc(priv->mdev, e->pkt_reformat); -} - -static struct mlx5_fc *mlx5e_tc_get_counter(struct mlx5e_tc_flow *flow) +struct mlx5_fc *mlx5e_tc_get_counter(struct mlx5e_tc_flow *flow) { return flow->attr->counter; }
-/* Takes reference to all flows attached to encap and adds the flows to - * flow_list using 'tmp_list' list_head in mlx5e_tc_flow. - */ -void mlx5e_take_all_encap_flows(struct mlx5e_encap_entry *e, struct list_head *flow_list) -{ - struct encap_flow_item *efi; - struct mlx5e_tc_flow *flow; - - list_for_each_entry(efi, &e->flows, list) { - flow = container_of(efi, struct mlx5e_tc_flow, encaps[efi->index]); - if (IS_ERR(mlx5e_flow_get(flow))) - continue; - wait_for_completion(&flow->init_done); - - flow->tmp_efi_index = efi->index; - list_add(&flow->tmp_list, flow_list); - } -} - /* Iterate over tmp_list of flows attached to flow_list head. */ -void mlx5e_put_encap_flow_list(struct mlx5e_priv *priv, struct list_head *flow_list) +void mlx5e_put_flow_list(struct mlx5e_priv *priv, struct list_head *flow_list) { struct mlx5e_tc_flow *flow, *tmp;
@@@ -1484,6 -1616,222 +1484,6 @@@ mlx5e_flow_put(priv, flow); }
-static struct mlx5e_encap_entry * -mlx5e_get_next_valid_encap(struct mlx5e_neigh_hash_entry *nhe, - struct mlx5e_encap_entry *e) -{ - struct mlx5e_encap_entry *next = NULL; - -retry: - rcu_read_lock(); - - /* find encap with non-zero reference counter value */ - for (next = e ? - list_next_or_null_rcu(&nhe->encap_list, - &e->encap_list, - struct mlx5e_encap_entry, - encap_list) : - list_first_or_null_rcu(&nhe->encap_list, - struct mlx5e_encap_entry, - encap_list); - next; - next = list_next_or_null_rcu(&nhe->encap_list, - &next->encap_list, - struct mlx5e_encap_entry, - encap_list)) - if (mlx5e_encap_take(next)) - break; - - rcu_read_unlock(); - - /* release starting encap */ - if (e) - mlx5e_encap_put(netdev_priv(e->out_dev), e); - if (!next) - return next; - - /* wait for encap to be fully initialized */ - wait_for_completion(&next->res_ready); - /* continue searching if encap entry is not in valid state after completion */ - if (!(next->flags & MLX5_ENCAP_ENTRY_VALID)) { - e = next; - goto retry; - } - - return next; -} - -void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe) -{ - struct mlx5e_neigh *m_neigh = &nhe->m_neigh; - struct mlx5e_encap_entry *e = NULL; - struct mlx5e_tc_flow *flow; - struct mlx5_fc *counter; - struct neigh_table *tbl; - bool neigh_used = false; - struct neighbour *n; - u64 lastuse; - - if (m_neigh->family == AF_INET) - tbl = &arp_tbl; -#if IS_ENABLED(CONFIG_IPV6) - else if (m_neigh->family == AF_INET6) - tbl = ipv6_stub->nd_tbl; -#endif - else - return; - - /* mlx5e_get_next_valid_encap() releases previous encap before returning - * next one. - */ - while ((e = mlx5e_get_next_valid_encap(nhe, e)) != NULL) { - struct mlx5e_priv *priv = netdev_priv(e->out_dev); - struct encap_flow_item *efi, *tmp; - struct mlx5_eswitch *esw; - LIST_HEAD(flow_list); - - esw = priv->mdev->priv.eswitch; - mutex_lock(&esw->offloads.encap_tbl_lock); - list_for_each_entry_safe(efi, tmp, &e->flows, list) { - flow = container_of(efi, struct mlx5e_tc_flow, - encaps[efi->index]); - if (IS_ERR(mlx5e_flow_get(flow))) - continue; - list_add(&flow->tmp_list, &flow_list); - - if (mlx5e_is_offloaded_flow(flow)) { - counter = mlx5e_tc_get_counter(flow); - lastuse = mlx5_fc_query_lastuse(counter); - if (time_after((unsigned long)lastuse, nhe->reported_lastuse)) { - neigh_used = true; - break; - } - } - } - mutex_unlock(&esw->offloads.encap_tbl_lock); - - mlx5e_put_encap_flow_list(priv, &flow_list); - if (neigh_used) { - /* release current encap before breaking the loop */ - mlx5e_encap_put(priv, e); - break; - } - } - - trace_mlx5e_tc_update_neigh_used_value(nhe, neigh_used); - - if (neigh_used) { - nhe->reported_lastuse = jiffies; - - /* find the relevant neigh according to the cached device and - * dst ip pair - */ - n = neigh_lookup(tbl, &m_neigh->dst_ip, m_neigh->dev); - if (!n) - return; - - neigh_event_send(n, NULL); - neigh_release(n); - } -} - -static void mlx5e_encap_dealloc(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e) -{ - WARN_ON(!list_empty(&e->flows)); - - if (e->compl_result > 0) { - mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e); - - if (e->flags & MLX5_ENCAP_ENTRY_VALID) - mlx5_packet_reformat_dealloc(priv->mdev, e->pkt_reformat); - } - - kfree(e->tun_info); - kfree(e->encap_header); - kfree_rcu(e, rcu); -} - -static void mlx5e_decap_dealloc(struct mlx5e_priv *priv, - struct mlx5e_decap_entry *d) -{ - WARN_ON(!list_empty(&d->flows)); - - if (!d->compl_result) - mlx5_packet_reformat_dealloc(priv->mdev, d->pkt_reformat); - - kfree_rcu(d, rcu); 
-} - -void mlx5e_encap_put(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e) -{ - struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - - if (!refcount_dec_and_mutex_lock(&e->refcnt, &esw->offloads.encap_tbl_lock)) - return; - hash_del_rcu(&e->encap_hlist); - mutex_unlock(&esw->offloads.encap_tbl_lock); - - mlx5e_encap_dealloc(priv, e); -} - -static void mlx5e_decap_put(struct mlx5e_priv *priv, struct mlx5e_decap_entry *d) -{ - struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - - if (!refcount_dec_and_mutex_lock(&d->refcnt, &esw->offloads.decap_tbl_lock)) - return; - hash_del_rcu(&d->hlist); - mutex_unlock(&esw->offloads.decap_tbl_lock); - - mlx5e_decap_dealloc(priv, d); -} - -static void mlx5e_detach_encap(struct mlx5e_priv *priv, - struct mlx5e_tc_flow *flow, int out_index) -{ - struct mlx5e_encap_entry *e = flow->encaps[out_index].e; - struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - - /* flow wasn't fully initialized */ - if (!e) - return; - - mutex_lock(&esw->offloads.encap_tbl_lock); - list_del(&flow->encaps[out_index].list); - flow->encaps[out_index].e = NULL; - if (!refcount_dec_and_test(&e->refcnt)) { - mutex_unlock(&esw->offloads.encap_tbl_lock); - return; - } - hash_del_rcu(&e->encap_hlist); - mutex_unlock(&esw->offloads.encap_tbl_lock); - - mlx5e_encap_dealloc(priv, e); -} - -static void mlx5e_detach_decap(struct mlx5e_priv *priv, - struct mlx5e_tc_flow *flow) -{ - struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - struct mlx5e_decap_entry *d = flow->decap_reformat; - - if (!d) - return; - - mutex_lock(&esw->offloads.decap_tbl_lock); - list_del(&flow->l3_to_l2_reformat); - flow->decap_reformat = NULL; - - if (!refcount_dec_and_test(&d->refcnt)) { - mutex_unlock(&esw->offloads.decap_tbl_lock); - return; - } - hash_del_rcu(&d->hlist); - mutex_unlock(&esw->offloads.decap_tbl_lock); - - mlx5e_decap_dealloc(priv, d); -} - static void __mlx5e_tc_del_fdb_peer_flow(struct mlx5e_tc_flow *flow) { struct mlx5_eswitch *esw = flow->priv->mdev->priv.eswitch; @@@ -1741,29 -2089,6 +1741,29 @@@ void mlx5e_tc_set_ethertype(struct mlx5 } }
+u8 mlx5e_tc_get_ip_version(struct mlx5_flow_spec *spec, bool outer) +{ + void *headers_v; + u16 ethertype; + u8 ip_version; + + if (outer) + headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers); + else + headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, inner_headers); + + ip_version = MLX5_GET(fte_match_set_lyr_2_4, headers_v, ip_version); + /* Return ip_version converted from ethertype anyway */ + if (!ip_version) { + ethertype = MLX5_GET(fte_match_set_lyr_2_4, headers_v, ethertype); + if (ethertype == ETH_P_IP || ethertype == ETH_P_ARP) + ip_version = 4; + else if (ethertype == ETH_P_IPV6) + ip_version = 6; + } + return ip_version; +} + static int parse_tunnel_attr(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow, struct mlx5_flow_spec *spec, @@@ -1772,7 -2097,6 +1772,7 @@@ u8 *match_level, bool *match_inner) { + struct mlx5e_tc_tunnel *tunnel = mlx5e_get_tc_tun(filter_dev); struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; struct netlink_ext_ack *extack = f->common.extack; bool needs_mapping, sets_mapping; @@@ -1810,31 -2134,6 +1810,31 @@@ */ if (!netif_is_bareudp(filter_dev)) flow->attr->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP; + err = mlx5e_tc_set_attr_rx_tun(flow, spec); + if (err) + return err; + } else if (tunnel && tunnel->tunnel_type == MLX5E_TC_TUNNEL_TYPE_VXLAN) { + struct mlx5_flow_spec *tmp_spec; + + tmp_spec = kvzalloc(sizeof(*tmp_spec), GFP_KERNEL); + if (!tmp_spec) { + NL_SET_ERR_MSG_MOD(extack, "Failed to allocate memory for vxlan tmp spec"); + netdev_warn(priv->netdev, "Failed to allocate memory for vxlan tmp spec"); + return -ENOMEM; + } + memcpy(tmp_spec, spec, sizeof(*tmp_spec)); + + err = mlx5e_tc_tun_parse(filter_dev, priv, tmp_spec, f, match_level); + if (err) { + kvfree(tmp_spec); + NL_SET_ERR_MSG_MOD(extack, "Failed to parse tunnel attributes"); + netdev_warn(priv->netdev, "Failed to parse tunnel attributes"); + return err; + } + err = mlx5e_tc_set_attr_rx_tun(flow, tmp_spec); + kvfree(tmp_spec); + if (err) + return err; }
if (!needs_mapping && !sets_mapping) @@@ -3283,6 -3582,35 +3283,6 @@@ static int parse_tc_nic_actions(struct return 0; }
-struct encap_key { - const struct ip_tunnel_key *ip_tun_key; - struct mlx5e_tc_tunnel *tc_tunnel; -}; - -static inline int cmp_encap_info(struct encap_key *a, - struct encap_key *b) -{ - return memcmp(a->ip_tun_key, b->ip_tun_key, sizeof(*a->ip_tun_key)) || - a->tc_tunnel->tunnel_type != b->tc_tunnel->tunnel_type; -} - -static inline int cmp_decap_info(struct mlx5e_decap_key *a, - struct mlx5e_decap_key *b) -{ - return memcmp(&a->key, &b->key, sizeof(b->key)); -} - -static inline int hash_encap_info(struct encap_key *key) -{ - return jhash(key->ip_tun_key, sizeof(*key->ip_tun_key), - key->tc_tunnel->tunnel_type); -} - -static inline int hash_decap_info(struct mlx5e_decap_key *key) -{ - return jhash(&key->key, sizeof(key->key), 0); -} - static bool is_merged_eswitch_vfs(struct mlx5e_priv *priv, struct net_device *peer_netdev) { @@@ -3296,6 -3624,277 +3296,6 @@@ same_hw_devs(priv, peer_priv)); }
-bool mlx5e_encap_take(struct mlx5e_encap_entry *e) -{ - return refcount_inc_not_zero(&e->refcnt); -} - -static bool mlx5e_decap_take(struct mlx5e_decap_entry *e) -{ - return refcount_inc_not_zero(&e->refcnt); -} - -static struct mlx5e_encap_entry * -mlx5e_encap_get(struct mlx5e_priv *priv, struct encap_key *key, - uintptr_t hash_key) -{ - struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - struct mlx5e_encap_entry *e; - struct encap_key e_key; - - hash_for_each_possible_rcu(esw->offloads.encap_tbl, e, - encap_hlist, hash_key) { - e_key.ip_tun_key = &e->tun_info->key; - e_key.tc_tunnel = e->tunnel; - if (!cmp_encap_info(&e_key, key) && - mlx5e_encap_take(e)) - return e; - } - - return NULL; -} - -static struct mlx5e_decap_entry * -mlx5e_decap_get(struct mlx5e_priv *priv, struct mlx5e_decap_key *key, - uintptr_t hash_key) -{ - struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - struct mlx5e_decap_key r_key; - struct mlx5e_decap_entry *e; - - hash_for_each_possible_rcu(esw->offloads.decap_tbl, e, - hlist, hash_key) { - r_key = e->key; - if (!cmp_decap_info(&r_key, key) && - mlx5e_decap_take(e)) - return e; - } - return NULL; -} - -static struct ip_tunnel_info *dup_tun_info(const struct ip_tunnel_info *tun_info) -{ - size_t tun_size = sizeof(*tun_info) + tun_info->options_len; - - return kmemdup(tun_info, tun_size, GFP_KERNEL); -} - -static bool is_duplicated_encap_entry(struct mlx5e_priv *priv, - struct mlx5e_tc_flow *flow, - int out_index, - struct mlx5e_encap_entry *e, - struct netlink_ext_ack *extack) -{ - int i; - - for (i = 0; i < out_index; i++) { - if (flow->encaps[i].e != e) - continue; - NL_SET_ERR_MSG_MOD(extack, "can't duplicate encap action"); - netdev_err(priv->netdev, "can't duplicate encap action\n"); - return true; - } - - return false; -} - -static int mlx5e_attach_encap(struct mlx5e_priv *priv, - struct mlx5e_tc_flow *flow, - struct net_device *mirred_dev, - int out_index, - struct netlink_ext_ack *extack, - struct net_device **encap_dev, - bool *encap_valid) -{ - struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - struct mlx5e_tc_flow_parse_attr *parse_attr; - struct mlx5_flow_attr *attr = flow->attr; - const struct ip_tunnel_info *tun_info; - struct encap_key key; - struct mlx5e_encap_entry *e; - unsigned short family; - uintptr_t hash_key; - int err = 0; - - parse_attr = attr->parse_attr; - tun_info = parse_attr->tun_info[out_index]; - family = ip_tunnel_info_af(tun_info); - key.ip_tun_key = &tun_info->key; - key.tc_tunnel = mlx5e_get_tc_tun(mirred_dev); - if (!key.tc_tunnel) { - NL_SET_ERR_MSG_MOD(extack, "Unsupported tunnel"); - return -EOPNOTSUPP; - } - - hash_key = hash_encap_info(&key); - - mutex_lock(&esw->offloads.encap_tbl_lock); - e = mlx5e_encap_get(priv, &key, hash_key); - - /* must verify if encap is valid or not */ - if (e) { - /* Check that entry was not already attached to this flow */ - if (is_duplicated_encap_entry(priv, flow, out_index, e, extack)) { - err = -EOPNOTSUPP; - goto out_err; - } - - mutex_unlock(&esw->offloads.encap_tbl_lock); - wait_for_completion(&e->res_ready); - - /* Protect against concurrent neigh update. 
*/ - mutex_lock(&esw->offloads.encap_tbl_lock); - if (e->compl_result < 0) { - err = -EREMOTEIO; - goto out_err; - } - goto attach_flow; - } - - e = kzalloc(sizeof(*e), GFP_KERNEL); - if (!e) { - err = -ENOMEM; - goto out_err; - } - - refcount_set(&e->refcnt, 1); - init_completion(&e->res_ready); - - tun_info = dup_tun_info(tun_info); - if (!tun_info) { - err = -ENOMEM; - goto out_err_init; - } - e->tun_info = tun_info; - err = mlx5e_tc_tun_init_encap_attr(mirred_dev, priv, e, extack); - if (err) - goto out_err_init; - - INIT_LIST_HEAD(&e->flows); - hash_add_rcu(esw->offloads.encap_tbl, &e->encap_hlist, hash_key); - mutex_unlock(&esw->offloads.encap_tbl_lock); - - if (family == AF_INET) - err = mlx5e_tc_tun_create_header_ipv4(priv, mirred_dev, e); - else if (family == AF_INET6) - err = mlx5e_tc_tun_create_header_ipv6(priv, mirred_dev, e); - - /* Protect against concurrent neigh update. */ - mutex_lock(&esw->offloads.encap_tbl_lock); - complete_all(&e->res_ready); - if (err) { - e->compl_result = err; - goto out_err; - } - e->compl_result = 1; - -attach_flow: - flow->encaps[out_index].e = e; - list_add(&flow->encaps[out_index].list, &e->flows); - flow->encaps[out_index].index = out_index; - *encap_dev = e->out_dev; - if (e->flags & MLX5_ENCAP_ENTRY_VALID) { - attr->esw_attr->dests[out_index].pkt_reformat = e->pkt_reformat; - attr->esw_attr->dests[out_index].flags |= MLX5_ESW_DEST_ENCAP_VALID; - *encap_valid = true; - } else { - *encap_valid = false; - } - mutex_unlock(&esw->offloads.encap_tbl_lock); - - return err; - -out_err: - mutex_unlock(&esw->offloads.encap_tbl_lock); - if (e) - mlx5e_encap_put(priv, e); - return err; - -out_err_init: - mutex_unlock(&esw->offloads.encap_tbl_lock); - kfree(tun_info); - kfree(e); - return err; -} - -static int mlx5e_attach_decap(struct mlx5e_priv *priv, - struct mlx5e_tc_flow *flow, - struct netlink_ext_ack *extack) -{ - struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - struct mlx5_esw_flow_attr *attr = flow->attr->esw_attr; - struct mlx5e_tc_flow_parse_attr *parse_attr; - struct mlx5e_decap_entry *d; - struct mlx5e_decap_key key; - uintptr_t hash_key; - int err = 0; - - parse_attr = flow->attr->parse_attr; - if (sizeof(parse_attr->eth) > MLX5_CAP_ESW(priv->mdev, max_encap_header_size)) { - NL_SET_ERR_MSG_MOD(extack, - "encap header larger than max supported"); - return -EOPNOTSUPP; - } - - key.key = parse_attr->eth; - hash_key = hash_decap_info(&key); - mutex_lock(&esw->offloads.decap_tbl_lock); - d = mlx5e_decap_get(priv, &key, hash_key); - if (d) { - mutex_unlock(&esw->offloads.decap_tbl_lock); - wait_for_completion(&d->res_ready); - mutex_lock(&esw->offloads.decap_tbl_lock); - if (d->compl_result) { - err = -EREMOTEIO; - goto out_free; - } - goto found; - } - - d = kzalloc(sizeof(*d), GFP_KERNEL); - if (!d) { - err = -ENOMEM; - goto out_err; - } - - d->key = key; - refcount_set(&d->refcnt, 1); - init_completion(&d->res_ready); - INIT_LIST_HEAD(&d->flows); - hash_add_rcu(esw->offloads.decap_tbl, &d->hlist, hash_key); - mutex_unlock(&esw->offloads.decap_tbl_lock); - - d->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev, - MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2, - sizeof(parse_attr->eth), - &parse_attr->eth, - MLX5_FLOW_NAMESPACE_FDB); - if (IS_ERR(d->pkt_reformat)) { - err = PTR_ERR(d->pkt_reformat); - d->compl_result = err; - } - mutex_lock(&esw->offloads.decap_tbl_lock); - complete_all(&d->res_ready); - if (err) - goto out_free; - -found: - flow->decap_reformat = d; - attr->decap_pkt_reformat = d->pkt_reformat; - 
list_add(&flow->l3_to_l2_reformat, &d->flows); - mutex_unlock(&esw->offloads.decap_tbl_lock); - return 0; - -out_free: - mutex_unlock(&esw->offloads.decap_tbl_lock); - mlx5e_decap_put(priv, d); - return err; - -out_err: - mutex_unlock(&esw->offloads.decap_tbl_lock); - return err; -} - static int parse_tc_vlan_action(struct mlx5e_priv *priv, const struct flow_action_entry *act, struct mlx5_esw_flow_attr *attr, @@@ -3648,8 -4247,7 +3648,8 @@@ static int parse_tc_fdb_actions(struct if (encap) { parse_attr->mirred_ifindex[esw_attr->out_count] = out_dev->ifindex; - parse_attr->tun_info[esw_attr->out_count] = dup_tun_info(info); + parse_attr->tun_info[esw_attr->out_count] = + mlx5e_dup_tun_info(info); if (!parse_attr->tun_info[esw_attr->out_count]) return -ENOMEM; encap = false; @@@ -3786,9 -4384,6 +3786,9 @@@ } }
+ /* always set IP version for indirect table handling */ + attr->ip_version = mlx5e_tc_get_ip_version(&parse_attr->spec, true); + if (MLX5_CAP_GEN(esw->dev, prio_tag_required) && action & MLX5_FLOW_CONTEXT_ACTION_VLAN_POP) { /* For prio tag mode, replace vlan pop with rewrite vlan prio @@@ -4069,6 -4664,7 +4069,6 @@@ __mlx5e_add_fdb_flow(struct mlx5e_priv return flow;
err_free: - dealloc_mod_hdr_actions(&parse_attr->mod_hdr_acts); mlx5e_flow_put(priv, flow); out: return ERR_PTR(err); @@@ -4213,7 -4809,6 +4213,7 @@@ mlx5e_add_nic_flow(struct mlx5e_priv *p return 0;
err_free: + flow_flag_set(flow, FAILED); dealloc_mod_hdr_actions(&parse_attr->mod_hdr_acts); mlx5e_flow_put(priv, flow); out: @@@ -4445,7 -5040,7 +4445,7 @@@ static int apply_police_params(struct m */ if (rate) { rate = (rate * BITS_PER_BYTE) + 500000; - rate_mbps = max_t(u32, do_div(rate, 1000000), 1); + rate_mbps = max_t(u64, do_div(rate, 1000000), 1); }
err = mlx5_esw_modify_vport_rate(esw, vport_num, rate_mbps); @@@ -4626,8 -5221,6 +4626,8 @@@ int mlx5e_tc_nic_init(struct mlx5e_pri if (err) return err;
+ lockdep_set_class(&tc->ht.mutex, &tc_ht_lock_key); + if (MLX5_CAP_FLOWTABLE_NIC_RX(priv->mdev, ignore_flow_level)) { attr.flags = MLX5_CHAINS_AND_PRIOS_SUPPORTED | MLX5_CHAINS_IGNORE_FLOW_LEVEL_SUPPORTED; @@@ -4735,8 -5328,7 +4735,8 @@@ int mlx5e_tc_esw_init(struct rhashtabl } uplink_priv->tunnel_mapping = mapping;
- mapping = mapping_create(sz_enc_opts, ENC_OPTS_BITS_MASK, true); + /* 0xFFF is reserved for stack devices slow path table mark */ + mapping = mapping_create(sz_enc_opts, ENC_OPTS_BITS_MASK - 1, true); if (IS_ERR(mapping)) { err = PTR_ERR(mapping); goto err_enc_opts_mapping; @@@ -4747,18 -5339,8 +4747,18 @@@ if (err) goto err_ht_init;
- return err; + lockdep_set_class(&tc_ht->mutex, &tc_ht_lock_key); + + uplink_priv->encap = mlx5e_tc_tun_init(priv); + if (IS_ERR(uplink_priv->encap)) { + err = PTR_ERR(uplink_priv->encap); + goto err_register_fib_notifier; + }
+ return 0; + +err_register_fib_notifier: + rhashtable_destroy(tc_ht); err_ht_init: mapping_destroy(uplink_priv->tunnel_enc_opts_mapping); err_enc_opts_mapping: @@@ -4775,11 -5357,10 +4775,11 @@@ void mlx5e_tc_esw_cleanup(struct rhasht { struct mlx5_rep_uplink_priv *uplink_priv;
- rhashtable_free_and_destroy(tc_ht, _mlx5e_tc_del_flow, NULL); - uplink_priv = container_of(tc_ht, struct mlx5_rep_uplink_priv, tc_ht);
+ rhashtable_free_and_destroy(tc_ht, _mlx5e_tc_del_flow, NULL); + mlx5e_tc_tun_cleanup(uplink_priv->encap); + mapping_destroy(uplink_priv->tunnel_enc_opts_mapping); mapping_destroy(uplink_priv->tunnel_mapping);
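The mlx5e_tc_esw_init()/mlx5e_tc_esw_cleanup() hunks above extend the usual goto-unwind error handling: each newly acquired resource gets its own label, and a failure releases everything acquired so far in reverse order. A minimal stand-alone sketch of that idiom (the acquire/release names are illustrative, not mlx5 functions):

#include <stdio.h>
#include <stdlib.h>

/* Illustrative resources standing in for the mapping/hashtable/encap state. */
static int acquire_a(void) { return 0; }           /* 0 on success */
static int acquire_b(void) { return 0; }
static int acquire_c(void) { return -1; }          /* pretend this step fails */
static void release_a(void) { puts("release a"); }
static void release_b(void) { puts("release b"); }

static int init_all(void)
{
        int err;

        err = acquire_a();
        if (err)
                return err;

        err = acquire_b();
        if (err)
                goto err_b;             /* undo a only */

        err = acquire_c();
        if (err)
                goto err_c;             /* undo b, then a */

        return 0;

err_c:
        release_b();
err_b:
        release_a();
        return err;
}

int main(void)
{
        return init_all() ? EXIT_FAILURE : EXIT_SUCCESS;
}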
@@@ -4879,7 -5460,7 +4879,7 @@@ bool mlx5e_tc_update_skb(struct mlx5_cq tc_skb_ext->chain = chain;
zone_restore_id = (reg_b >> REG_MAPPING_SHIFT(NIC_ZONE_RESTORE_TO_REG)) & - ZONE_RESTORE_MAX; + ESW_ZONE_ID_MASK;
if (!mlx5e_tc_ct_restore_flow(tc->ct, skb, zone_restore_id)) diff --combined drivers/net/ethernet/mellanox/mlx5/core/main.c index e4c9627485aa,ba1a4ae28097..2f2c352f301e --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@@ -73,9 -73,6 +73,9 @@@ #include "ecpf.h" #include "lib/hv_vhca.h" #include "diag/rsc_dump.h" +#include "sf/vhca_event.h" +#include "sf/dev/dev.h" +#include "sf/sf.h"
MODULE_AUTHOR("Eli Cohen eli@mellanox.com"); MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) core driver"); @@@ -85,6 -82,7 +85,6 @@@ unsigned int mlx5_core_debug_mask module_param_named(debug_mask, mlx5_core_debug_mask, uint, 0644); MODULE_PARM_DESC(debug_mask, "debug mask: 1 = dump cmd data, 2 = dump cmd exec time, 3 = both. Default=0");
-#define MLX5_DEFAULT_PROF 2 static unsigned int prof_sel = MLX5_DEFAULT_PROF; module_param_named(prof_sel, prof_sel, uint, 0444); MODULE_PARM_DESC(prof_sel, "profile selector. Valid range 0 - 2"); @@@ -569,8 -567,6 +569,8 @@@ static int handle_hca_cap(struct mlx5_c if (MLX5_CAP_GEN_MAX(dev, mkey_by_name)) MLX5_SET(cmd_hca_cap, set_hca_cap, mkey_by_name, 1);
+ mlx5_vhca_state_cap_handle(dev, set_hca_cap); + return set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE); }
@@@ -888,24 -884,6 +888,24 @@@ static int mlx5_init_once(struct mlx5_c goto err_eswitch_cleanup; }
+ err = mlx5_vhca_event_init(dev); + if (err) { + mlx5_core_err(dev, "Failed to init vhca event notifier %d\n", err); + goto err_fpga_cleanup; + } + + err = mlx5_sf_hw_table_init(dev); + if (err) { + mlx5_core_err(dev, "Failed to init SF HW table %d\n", err); + goto err_sf_hw_table_cleanup; + } + + err = mlx5_sf_table_init(dev); + if (err) { + mlx5_core_err(dev, "Failed to init SF table %d\n", err); + goto err_sf_table_cleanup; + } + dev->dm = mlx5_dm_create(dev); if (IS_ERR(dev->dm)) mlx5_core_warn(dev, "Failed to init device memory%d\n", err); @@@ -916,12 -894,6 +916,12 @@@
return 0;
+err_sf_table_cleanup: + mlx5_sf_hw_table_cleanup(dev); +err_sf_hw_table_cleanup: + mlx5_vhca_event_cleanup(dev); +err_fpga_cleanup: + mlx5_fpga_cleanup(dev); err_eswitch_cleanup: mlx5_eswitch_cleanup(dev->priv.eswitch); err_sriov_cleanup: @@@ -953,9 -925,6 +953,9 @@@ static void mlx5_cleanup_once(struct ml mlx5_hv_vhca_destroy(dev->hv_vhca); mlx5_fw_tracer_destroy(dev->tracer); mlx5_dm_cleanup(dev); + mlx5_sf_table_cleanup(dev); + mlx5_sf_hw_table_cleanup(dev); + mlx5_vhca_event_cleanup(dev); mlx5_fpga_cleanup(dev); mlx5_eswitch_cleanup(dev->priv.eswitch); mlx5_sriov_cleanup(dev); @@@ -1160,14 -1129,6 +1160,14 @@@ static int mlx5_load(struct mlx5_core_d goto err_sriov; }
+ mlx5_vhca_event_start(dev); + + err = mlx5_sf_hw_table_create(dev); + if (err) { + mlx5_core_err(dev, "sf table create failed %d\n", err); + goto err_vhca; + } + err = mlx5_ec_init(dev); if (err) { mlx5_core_err(dev, "Failed to init embedded CPU\n"); @@@ -1180,16 -1141,11 +1180,16 @@@ goto err_sriov; }
+ mlx5_sf_dev_table_create(dev); + return 0;
err_sriov: mlx5_ec_cleanup(dev); err_ec: + mlx5_sf_hw_table_destroy(dev); +err_vhca: + mlx5_vhca_event_stop(dev); mlx5_cleanup_fs(dev); err_fs: mlx5_accel_tls_cleanup(dev); @@@ -1215,11 -1171,8 +1215,11 @@@ err_irq_table
static void mlx5_unload(struct mlx5_core_dev *dev) { + mlx5_sf_dev_table_destroy(dev); mlx5_sriov_detach(dev); mlx5_ec_cleanup(dev); + mlx5_sf_hw_table_destroy(dev); + mlx5_vhca_event_stop(dev); mlx5_cleanup_fs(dev); mlx5_accel_ipsec_cleanup(dev); mlx5_accel_tls_cleanup(dev); @@@ -1330,7 -1283,7 +1330,7 @@@ out mutex_unlock(&dev->intf_state_mutex); }
-static int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx) +int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx) { struct mlx5_priv *priv = &dev->priv; int err; @@@ -1352,8 -1305,6 +1352,8 @@@
priv->dbg_root = debugfs_create_dir(dev_name(dev->device), mlx5_debugfs_root); + INIT_LIST_HEAD(&priv->traps); + err = mlx5_health_init(dev); if (err) goto err_health_init; @@@ -1382,7 -1333,7 +1382,7 @@@ err_health_init return err; }
-static void mlx5_mdev_uninit(struct mlx5_core_dev *dev) +void mlx5_mdev_uninit(struct mlx5_core_dev *dev) { struct mlx5_priv *priv = &dev->priv;
@@@ -1445,7 -1396,8 +1445,8 @@@ static int init_one(struct pci_dev *pde dev_err(&pdev->dev, "mlx5_crdump_enable failed with error code %d\n", err);
pci_save_state(pdev); - devlink_reload_enable(devlink); + if (!mlx5_core_is_mp_slave(dev)) + devlink_reload_enable(devlink); return 0;
err_load_one: @@@ -1725,10 -1677,6 +1726,10 @@@ static int __init init(void if (err) goto err_debug;
+ err = mlx5_sf_driver_register(); + if (err) + goto err_sf; + #ifdef CONFIG_MLX5_CORE_EN err = mlx5e_init(); if (err) { @@@ -1739,8 -1687,6 +1740,8 @@@
return 0;
+err_sf: + pci_unregister_driver(&mlx5_core_driver); err_debug: mlx5_unregister_debugfs(); return err; @@@ -1751,7 -1697,6 +1752,7 @@@ static void __exit cleanup(void #ifdef CONFIG_MLX5_CORE_EN mlx5e_cleanup(); #endif + mlx5_sf_driver_unregister(); pci_unregister_driver(&mlx5_core_driver); mlx5_unregister_debugfs(); } diff --combined drivers/net/ethernet/realtek/r8169_main.c index cbc30df4e08a,e7a59dc5fe49..9ce98e3d3f9f --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@@ -28,7 -28,6 +28,7 @@@ #include <linux/bitfield.h> #include <linux/prefetch.h> #include <linux/ipv6.h> +#include <asm/unaligned.h> #include <net/ip6_checksum.h>
#include "r8169.h" @@@ -146,7 -145,6 +146,7 @@@ static const struct [RTL_GIGA_MAC_VER_50] = {"RTL8168ep/8111ep" }, [RTL_GIGA_MAC_VER_51] = {"RTL8168ep/8111ep" }, [RTL_GIGA_MAC_VER_52] = {"RTL8168fp/RTL8117", FIRMWARE_8168FP_3}, + [RTL_GIGA_MAC_VER_53] = {"RTL8168fp/RTL8117", }, [RTL_GIGA_MAC_VER_60] = {"RTL8125A" }, [RTL_GIGA_MAC_VER_61] = {"RTL8125A", FIRMWARE_8125A_3}, /* reserve 62 for CFG_METHOD_4 in the vendor driver */ @@@ -262,9 -260,6 +262,9 @@@ enum rtl8168_8101_registers #define CSIAR_BYTE_ENABLE 0x0000f000 #define CSIAR_ADDR_MASK 0x00000fff PMCH = 0x6f, +#define D3COLD_NO_PLL_DOWN BIT(7) +#define D3HOT_NO_PLL_DOWN BIT(6) +#define D3_NO_PLL_DOWN (BIT(7) | BIT(6)) EPHYAR = 0x80, #define EPHYAR_FLAG 0x80000000 #define EPHYAR_WRITE_CMD 0x80000000 @@@ -534,9 -529,6 +534,9 @@@ enum rtl_rx_desc_bit IPFail = (1 << 16), /* IP checksum failed */ UDPFail = (1 << 15), /* UDP/IP checksum failed */ TCPFail = (1 << 14), /* TCP/IP checksum failed */ + +#define RxCSFailMask (IPFail | UDPFail | TCPFail) + RxVlanTag = (1 << 16), /* VLAN tag available */ };
@@@ -592,12 -584,6 +592,12 @@@ enum rtl_flag RTL_FLAG_MAX };
+enum rtl_dash_type { + RTL_DASH_NONE, + RTL_DASH_DP, + RTL_DASH_EP, +}; + struct rtl8169_private { void __iomem *mmio_addr; /* memory map physical address */ struct pci_dev *pci_dev; @@@ -605,7 -591,6 +605,7 @@@ struct phy_device *phydev; struct napi_struct napi; enum mac_version mac_version; + enum rtl_dash_type dash_type; u32 cur_rx; /* Index into the Rx descriptor buffer of next Rx pkt. */ u32 cur_tx; /* Index into the Tx descriptor buffer of next Rx pkt. */ u32 dirty_tx; @@@ -697,7 -682,7 +697,7 @@@ static bool rtl_is_8168evl_up(struct rt { return tp->mac_version >= RTL_GIGA_MAC_VER_34 && tp->mac_version != RTL_GIGA_MAC_VER_39 && - tp->mac_version <= RTL_GIGA_MAC_VER_52; + tp->mac_version <= RTL_GIGA_MAC_VER_53; }
static bool rtl_supports_eee(struct rtl8169_private *tp) @@@ -761,77 -746,14 +761,77 @@@ static const struct rtl_cond name = \ static bool name ## _check(struct rtl8169_private *tp)
-static bool rtl_ocp_reg_failure(struct rtl8169_private *tp, u32 reg) +static void r8168fp_adjust_ocp_cmd(struct rtl8169_private *tp, u32 *cmd, int type) { - if (reg & 0xffff0001) { - if (net_ratelimit()) - netdev_err(tp->dev, "Invalid ocp reg %x!\n", reg); - return true; - } - return false; + /* based on RTL8168FP_OOBMAC_BASE in vendor driver */ + if (type == ERIAR_OOB && + (tp->mac_version == RTL_GIGA_MAC_VER_52 || + tp->mac_version == RTL_GIGA_MAC_VER_53)) + *cmd |= 0x7f0 << 18; +} + +DECLARE_RTL_COND(rtl_eriar_cond) +{ + return RTL_R32(tp, ERIAR) & ERIAR_FLAG; +} + +static void _rtl_eri_write(struct rtl8169_private *tp, int addr, u32 mask, + u32 val, int type) +{ + u32 cmd = ERIAR_WRITE_CMD | type | mask | addr; + + if (WARN(addr & 3 || !mask, "addr: 0x%x, mask: 0x%08x\n", addr, mask)) + return; + + RTL_W32(tp, ERIDR, val); + r8168fp_adjust_ocp_cmd(tp, &cmd, type); + RTL_W32(tp, ERIAR, cmd); + + rtl_loop_wait_low(tp, &rtl_eriar_cond, 100, 100); +} + +static void rtl_eri_write(struct rtl8169_private *tp, int addr, u32 mask, + u32 val) +{ + _rtl_eri_write(tp, addr, mask, val, ERIAR_EXGMAC); +} + +static u32 _rtl_eri_read(struct rtl8169_private *tp, int addr, int type) +{ + u32 cmd = ERIAR_READ_CMD | type | ERIAR_MASK_1111 | addr; + + r8168fp_adjust_ocp_cmd(tp, &cmd, type); + RTL_W32(tp, ERIAR, cmd); + + return rtl_loop_wait_high(tp, &rtl_eriar_cond, 100, 100) ? + RTL_R32(tp, ERIDR) : ~0; +} + +static u32 rtl_eri_read(struct rtl8169_private *tp, int addr) +{ + return _rtl_eri_read(tp, addr, ERIAR_EXGMAC); +} + +static void rtl_w0w1_eri(struct rtl8169_private *tp, int addr, u32 p, u32 m) +{ + u32 val = rtl_eri_read(tp, addr); + + rtl_eri_write(tp, addr, ERIAR_MASK_1111, (val & ~m) | p); +} + +static void rtl_eri_set_bits(struct rtl8169_private *tp, int addr, u32 p) +{ + rtl_w0w1_eri(tp, addr, p, 0); +} + +static void rtl_eri_clear_bits(struct rtl8169_private *tp, int addr, u32 m) +{ + rtl_w0w1_eri(tp, addr, 0, m); +} + +static bool rtl_ocp_reg_failure(u32 reg) +{ + return WARN_ONCE(reg & 0xffff0001, "Invalid ocp reg %x!\n", reg); }
DECLARE_RTL_COND(rtl_ocp_gphy_cond) @@@ -841,7 -763,7 +841,7 @@@
static void r8168_phy_ocp_write(struct rtl8169_private *tp, u32 reg, u32 data) { - if (rtl_ocp_reg_failure(tp, reg)) + if (rtl_ocp_reg_failure(reg)) return;
RTL_W32(tp, GPHY_OCP, OCPAR_FLAG | (reg << 15) | data); @@@ -851,7 -773,7 +851,7 @@@
static int r8168_phy_ocp_read(struct rtl8169_private *tp, u32 reg) { - if (rtl_ocp_reg_failure(tp, reg)) + if (rtl_ocp_reg_failure(reg)) return 0;
RTL_W32(tp, GPHY_OCP, reg << 15); @@@ -862,7 -784,7 +862,7 @@@
static void r8168_mac_ocp_write(struct rtl8169_private *tp, u32 reg, u32 data) { - if (rtl_ocp_reg_failure(tp, reg)) + if (rtl_ocp_reg_failure(reg)) return;
RTL_W32(tp, OCPDR, OCPAR_FLAG | (reg << 15) | data); @@@ -870,7 -792,7 +870,7 @@@
static u16 r8168_mac_ocp_read(struct rtl8169_private *tp, u32 reg) { - if (rtl_ocp_reg_failure(tp, reg)) + if (rtl_ocp_reg_failure(reg)) return 0;
RTL_W32(tp, OCPDR, reg << 15); @@@ -886,25 -808,6 +886,25 @@@ static void r8168_mac_ocp_modify(struc r8168_mac_ocp_write(tp, reg, (data & ~mask) | set); }
+/* Work around a hw issue with RTL8168g PHY, the quirk disables + * PHY MCU interrupts before PHY power-down. + */ +static void rtl8168g_phy_suspend_quirk(struct rtl8169_private *tp, int value) +{ + switch (tp->mac_version) { + case RTL_GIGA_MAC_VER_40: + case RTL_GIGA_MAC_VER_41: + case RTL_GIGA_MAC_VER_49: + if (value & BMCR_RESET || !(value & BMCR_PDOWN)) + rtl_eri_set_bits(tp, 0x1a8, 0xfc000000); + else + rtl_eri_clear_bits(tp, 0x1a8, 0xfc000000); + break; + default: + break; + } +}; + static void r8168g_mdio_write(struct rtl8169_private *tp, int reg, int value) { if (reg == 0x1f) { @@@ -915,9 -818,6 +915,9 @@@ if (tp->ocp_base != OCP_STD_PHY_BASE) reg -= 0x10;
+ if (tp->ocp_base == OCP_STD_PHY_BASE && reg == MII_BMCR) + rtl8168g_phy_suspend_quirk(tp, value); + r8168_phy_ocp_write(tp, tp->ocp_base + reg * 2, value); }
@@@ -1109,6 -1009,70 +1109,6 @@@ static u16 rtl_ephy_read(struct rtl8169 RTL_R32(tp, EPHYAR) & EPHYAR_DATA_MASK : ~0; }
-static void r8168fp_adjust_ocp_cmd(struct rtl8169_private *tp, u32 *cmd, int type) -{ - /* based on RTL8168FP_OOBMAC_BASE in vendor driver */ - if (tp->mac_version == RTL_GIGA_MAC_VER_52 && type == ERIAR_OOB) - *cmd |= 0x7f0 << 18; -} - -DECLARE_RTL_COND(rtl_eriar_cond) -{ - return RTL_R32(tp, ERIAR) & ERIAR_FLAG; -} - -static void _rtl_eri_write(struct rtl8169_private *tp, int addr, u32 mask, - u32 val, int type) -{ - u32 cmd = ERIAR_WRITE_CMD | type | mask | addr; - - BUG_ON((addr & 3) || (mask == 0)); - RTL_W32(tp, ERIDR, val); - r8168fp_adjust_ocp_cmd(tp, &cmd, type); - RTL_W32(tp, ERIAR, cmd); - - rtl_loop_wait_low(tp, &rtl_eriar_cond, 100, 100); -} - -static void rtl_eri_write(struct rtl8169_private *tp, int addr, u32 mask, - u32 val) -{ - _rtl_eri_write(tp, addr, mask, val, ERIAR_EXGMAC); -} - -static u32 _rtl_eri_read(struct rtl8169_private *tp, int addr, int type) -{ - u32 cmd = ERIAR_READ_CMD | type | ERIAR_MASK_1111 | addr; - - r8168fp_adjust_ocp_cmd(tp, &cmd, type); - RTL_W32(tp, ERIAR, cmd); - - return rtl_loop_wait_high(tp, &rtl_eriar_cond, 100, 100) ? - RTL_R32(tp, ERIDR) : ~0; -} - -static u32 rtl_eri_read(struct rtl8169_private *tp, int addr) -{ - return _rtl_eri_read(tp, addr, ERIAR_EXGMAC); -} - -static void rtl_w0w1_eri(struct rtl8169_private *tp, int addr, u32 p, u32 m) -{ - u32 val = rtl_eri_read(tp, addr); - - rtl_eri_write(tp, addr, ERIAR_MASK_1111, (val & ~m) | p); -} - -static void rtl_eri_set_bits(struct rtl8169_private *tp, int addr, u32 p) -{ - rtl_w0w1_eri(tp, addr, p, 0); -} - -static void rtl_eri_clear_bits(struct rtl8169_private *tp, int addr, u32 m) -{ - rtl_w0w1_eri(tp, addr, 0, m); -} - static u32 r8168dp_ocp_read(struct rtl8169_private *tp, u16 reg) { RTL_W32(tp, OCPAR, 0x0fu << 12 | (reg & 0x0fff)); @@@ -1194,10 -1158,19 +1194,10 @@@ static void rtl8168ep_driver_start(stru
static void rtl8168_driver_start(struct rtl8169_private *tp) { - switch (tp->mac_version) { - case RTL_GIGA_MAC_VER_27: - case RTL_GIGA_MAC_VER_28: - case RTL_GIGA_MAC_VER_31: + if (tp->dash_type == RTL_DASH_DP) rtl8168dp_driver_start(tp); - break; - case RTL_GIGA_MAC_VER_49 ... RTL_GIGA_MAC_VER_52: + else rtl8168ep_driver_start(tp); - break; - default: - BUG(); - break; - } }
static void rtl8168dp_driver_stop(struct rtl8169_private *tp) @@@ -1216,52 -1189,44 +1216,52 @@@ static void rtl8168ep_driver_stop(struc
static void rtl8168_driver_stop(struct rtl8169_private *tp) { - switch (tp->mac_version) { - case RTL_GIGA_MAC_VER_27: - case RTL_GIGA_MAC_VER_28: - case RTL_GIGA_MAC_VER_31: + if (tp->dash_type == RTL_DASH_DP) rtl8168dp_driver_stop(tp); - break; - case RTL_GIGA_MAC_VER_49 ... RTL_GIGA_MAC_VER_52: + else rtl8168ep_driver_stop(tp); - break; - default: - BUG(); - break; - } }
static bool r8168dp_check_dash(struct rtl8169_private *tp) { u16 reg = rtl8168_get_ocp_reg(tp);
- return !!(r8168dp_ocp_read(tp, reg) & 0x00008000); + return r8168dp_ocp_read(tp, reg) & BIT(15); }
static bool r8168ep_check_dash(struct rtl8169_private *tp) { - return r8168ep_ocp_read(tp, 0x128) & 0x00000001; + return r8168ep_ocp_read(tp, 0x128) & BIT(0); }
-static bool r8168_check_dash(struct rtl8169_private *tp) +static enum rtl_dash_type rtl_check_dash(struct rtl8169_private *tp) { switch (tp->mac_version) { case RTL_GIGA_MAC_VER_27: case RTL_GIGA_MAC_VER_28: case RTL_GIGA_MAC_VER_31: - return r8168dp_check_dash(tp); - case RTL_GIGA_MAC_VER_49 ... RTL_GIGA_MAC_VER_52: - return r8168ep_check_dash(tp); + return r8168dp_check_dash(tp) ? RTL_DASH_DP : RTL_DASH_NONE; + case RTL_GIGA_MAC_VER_49 ... RTL_GIGA_MAC_VER_53: + return r8168ep_check_dash(tp) ? RTL_DASH_EP : RTL_DASH_NONE; default: - return false; + return RTL_DASH_NONE; + } +} + +static void rtl_set_d3_pll_down(struct rtl8169_private *tp, bool enable) +{ + switch (tp->mac_version) { + case RTL_GIGA_MAC_VER_25 ... RTL_GIGA_MAC_VER_26: + case RTL_GIGA_MAC_VER_29 ... RTL_GIGA_MAC_VER_30: + case RTL_GIGA_MAC_VER_32 ... RTL_GIGA_MAC_VER_37: + case RTL_GIGA_MAC_VER_39 ... RTL_GIGA_MAC_VER_63: + if (enable) + RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) & ~D3_NO_PLL_DOWN); + else + RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) | D3_NO_PLL_DOWN); + break; + default: + break; } }
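The DASH rework above trades a boolean probe, re-run at every call site, for an enum that rtl_check_dash() fills in once and later paths simply read from tp->dash_type. A small sketch of the same probe-once-then-dispatch pattern, with made-up names and hardware IDs:

#include <stdio.h>

/* Sketch only: struct nic and its fields are illustrative, not r8169's. */
enum dash_type { DASH_NONE, DASH_DP, DASH_EP };

struct nic {
        int hw_generation;              /* stand-in for tp->mac_version */
        enum dash_type dash_type;       /* probed once at init, then just read */
};

static enum dash_type nic_probe_dash(const struct nic *nic)
{
        /* stand-in for the per-MAC-version register reads */
        if (nic->hw_generation == 27)
                return DASH_DP;
        if (nic->hw_generation == 49)
                return DASH_EP;
        return DASH_NONE;
}

static const char *nic_dash_name(const struct nic *nic)
{
        /* callers dispatch on the cached type; no BUG() default needed */
        switch (nic->dash_type) {
        case DASH_DP: return "DASH (DP firmware interface)";
        case DASH_EP: return "DASH (EP firmware interface)";
        default:      return "no DASH";
        }
}

int main(void)
{
        struct nic nic = { .hw_generation = 49 };

        nic.dash_type = nic_probe_dash(&nic);   /* done once in init/probe */
        printf("%s\n", nic_dash_name(&nic));
        return 0;
}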
@@@ -1431,7 -1396,6 +1431,7 @@@ static void __rtl8169_set_wol(struct rt rtl_lock_config_regs(tp);
device_set_wakeup_enable(tp_to_dev(tp), wolopts); + rtl_set_d3_pll_down(tp, !wolopts); tp->dev->wol_enabled = wolopts ? 1 : 0; }
@@@ -1966,7 -1930,6 +1966,7 @@@ static enum mac_version rtl8169_get_mac { 0x7c8, 0x608, RTL_GIGA_MAC_VER_61 },
/* RTL8117 */ + { 0x7cf, 0x54b, RTL_GIGA_MAC_VER_53 }, { 0x7cf, 0x54a, RTL_GIGA_MAC_VER_52 },
/* 8168EP family. */ @@@ -1999,11 -1962,7 +1999,11 @@@ { 0x7c8, 0x280, RTL_GIGA_MAC_VER_26 },
/* 8168DP family. */ - { 0x7cf, 0x288, RTL_GIGA_MAC_VER_27 }, + /* It seems this early RTL8168dp version never made it to + * the wild. Let's see whether somebody complains, if not + * we'll remove support for this chip version completely. + * { 0x7cf, 0x288, RTL_GIGA_MAC_VER_27 }, + */ { 0x7cf, 0x28a, RTL_GIGA_MAC_VER_28 }, { 0x7cf, 0x28b, RTL_GIGA_MAC_VER_31 },
@@@ -2037,12 -1996,9 +2037,12 @@@ { 0x7c8, 0x348, RTL_GIGA_MAC_VER_09 }, { 0x7c8, 0x248, RTL_GIGA_MAC_VER_09 }, { 0x7c8, 0x340, RTL_GIGA_MAC_VER_16 }, - /* FIXME: where did these entries come from ? -- FR */ - { 0xfc8, 0x388, RTL_GIGA_MAC_VER_13 }, - { 0xfc8, 0x308, RTL_GIGA_MAC_VER_13 }, + /* FIXME: where did these entries come from ? -- FR + * Not even r8101 vendor driver knows these id's, + * so let's disable detection for now. -- HK + * { 0xfc8, 0x388, RTL_GIGA_MAC_VER_13 }, + * { 0xfc8, 0x308, RTL_GIGA_MAC_VER_13 }, + */
/* 8110 family. */ { 0xfc8, 0x980, RTL_GIGA_MAC_VER_06 }, @@@ -2125,12 -2081,18 +2125,12 @@@ static void rtl8125b_config_eee_mac(str r8168_mac_ocp_modify(tp, 0xe040, 0, BIT(1) | BIT(0)); }
-static void rtl_rar_exgmac_set(struct rtl8169_private *tp, u8 *addr) +static void rtl_rar_exgmac_set(struct rtl8169_private *tp, const u8 *addr) { - const u16 w[] = { - addr[0] | (addr[1] << 8), - addr[2] | (addr[3] << 8), - addr[4] | (addr[5] << 8) - }; - - rtl_eri_write(tp, 0xe0, ERIAR_MASK_1111, w[0] | (w[1] << 16)); - rtl_eri_write(tp, 0xe4, ERIAR_MASK_1111, w[2]); - rtl_eri_write(tp, 0xf0, ERIAR_MASK_1111, w[0] << 16); - rtl_eri_write(tp, 0xf4, ERIAR_MASK_1111, w[1] | (w[2] << 16)); + rtl_eri_write(tp, 0xe0, ERIAR_MASK_1111, get_unaligned_le32(addr)); + rtl_eri_write(tp, 0xe4, ERIAR_MASK_1111, get_unaligned_le16(addr + 4)); + rtl_eri_write(tp, 0xf0, ERIAR_MASK_1111, get_unaligned_le16(addr) << 16); + rtl_eri_write(tp, 0xf4, ERIAR_MASK_1111, get_unaligned_le32(addr + 2)); }
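The rtl_rar_exgmac_set() rewrite leans on the identity that a little-endian load of the first four MAC bytes equals addr[0] | addr[1] << 8 | addr[2] << 16 | addr[3] << 24, which is exactly what the removed w[] arithmetic built by hand. A short plain-C check of that equivalence; get_le32/get_le16 are portable stand-ins for the kernel's get_unaligned_le32()/get_unaligned_le16():

#include <stdint.h>
#include <stdio.h>

/* Portable stand-ins for the kernel's unaligned little-endian loads. */
static uint32_t get_le32(const uint8_t *p)
{
        return (uint32_t)p[0] | (uint32_t)p[1] << 8 |
               (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24;
}

static uint16_t get_le16(const uint8_t *p)
{
        return (uint16_t)(p[0] | p[1] << 8);
}

int main(void)
{
        const uint8_t addr[6] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };
        uint16_t w[3] = {
                addr[0] | addr[1] << 8,
                addr[2] | addr[3] << 8,
                addr[4] | addr[5] << 8,
        };

        /* old open-coded form vs. little-endian loads: both print the same value */
        printf("%08x %08x\n", (unsigned)(w[0] | w[1] << 16), (unsigned)get_le32(addr));
        printf("%04x %04x\n", (unsigned)w[2], (unsigned)get_le16(addr + 4));
        return 0;
}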
u16 rtl8168h_2_get_adc_bias_ioffset(struct rtl8169_private *tp) @@@ -2180,14 -2142,14 +2180,14 @@@ static void rtl8169_init_phy(struct rtl genphy_soft_reset(tp->phydev); }
-static void rtl_rar_set(struct rtl8169_private *tp, u8 *addr) +static void rtl_rar_set(struct rtl8169_private *tp, const u8 *addr) { rtl_unlock_config_regs(tp);
- RTL_W32(tp, MAC4, addr[4] | addr[5] << 8); + RTL_W32(tp, MAC4, get_unaligned_le16(addr + 4)); rtl_pci_commit(tp);
- RTL_W32(tp, MAC0, addr[0] | addr[1] << 8 | addr[2] << 16 | addr[3] << 24); + RTL_W32(tp, MAC0, get_unaligned_le32(addr)); rtl_pci_commit(tp);
if (tp->mac_version == RTL_GIGA_MAC_VER_34) @@@ -2210,16 -2172,28 +2210,16 @@@ static int rtl_set_mac_address(struct n return 0; }
-static void rtl_wol_suspend_quirk(struct rtl8169_private *tp) +static void rtl_wol_enable_rx(struct rtl8169_private *tp) { - switch (tp->mac_version) { - case RTL_GIGA_MAC_VER_25: - case RTL_GIGA_MAC_VER_26: - case RTL_GIGA_MAC_VER_29: - case RTL_GIGA_MAC_VER_30: - case RTL_GIGA_MAC_VER_32: - case RTL_GIGA_MAC_VER_33: - case RTL_GIGA_MAC_VER_34: - case RTL_GIGA_MAC_VER_37 ... RTL_GIGA_MAC_VER_63: + if (tp->mac_version >= RTL_GIGA_MAC_VER_25) RTL_W32(tp, RxConfig, RTL_R32(tp, RxConfig) | AcceptBroadcast | AcceptMulticast | AcceptMyPhys); - break; - default: - break; - } }
-static void rtl_pll_power_down(struct rtl8169_private *tp) +static void rtl_prepare_power_down(struct rtl8169_private *tp) { - if (r8168_check_dash(tp)) + if (tp->dash_type != RTL_DASH_NONE) return;
if (tp->mac_version == RTL_GIGA_MAC_VER_32 || @@@ -2228,10 -2202,68 +2228,35 @@@
if (device_may_wakeup(tp_to_dev(tp))) { phy_speed_down(tp->phydev, false); - rtl_wol_suspend_quirk(tp); - return; + rtl_wol_enable_rx(tp); } + + switch (tp->mac_version) { + case RTL_GIGA_MAC_VER_25 ... RTL_GIGA_MAC_VER_26: + case RTL_GIGA_MAC_VER_29 ... RTL_GIGA_MAC_VER_30: + case RTL_GIGA_MAC_VER_32 ... RTL_GIGA_MAC_VER_33: + case RTL_GIGA_MAC_VER_37: + case RTL_GIGA_MAC_VER_39: + case RTL_GIGA_MAC_VER_43: + case RTL_GIGA_MAC_VER_44: + case RTL_GIGA_MAC_VER_45: + case RTL_GIGA_MAC_VER_46: + case RTL_GIGA_MAC_VER_47: + case RTL_GIGA_MAC_VER_48: + case RTL_GIGA_MAC_VER_50 ... RTL_GIGA_MAC_VER_63: + RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) & ~0x80); + break; + case RTL_GIGA_MAC_VER_40: + case RTL_GIGA_MAC_VER_41: + case RTL_GIGA_MAC_VER_49: + rtl_eri_clear_bits(tp, 0x1a8, 0xfc000000); + RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) & ~0x80); + break; + default: + break; + } }
-static void rtl_pll_power_up(struct rtl8169_private *tp) -{ - switch (tp->mac_version) { - case RTL_GIGA_MAC_VER_25 ... RTL_GIGA_MAC_VER_26: - case RTL_GIGA_MAC_VER_29 ... RTL_GIGA_MAC_VER_30: - case RTL_GIGA_MAC_VER_32 ... RTL_GIGA_MAC_VER_33: - case RTL_GIGA_MAC_VER_37: - case RTL_GIGA_MAC_VER_39: - case RTL_GIGA_MAC_VER_43: - RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) | 0x80); - break; - case RTL_GIGA_MAC_VER_44: - case RTL_GIGA_MAC_VER_45: - case RTL_GIGA_MAC_VER_46: - case RTL_GIGA_MAC_VER_47: - case RTL_GIGA_MAC_VER_48: - case RTL_GIGA_MAC_VER_50 ... RTL_GIGA_MAC_VER_63: - RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) | 0xc0); - break; - case RTL_GIGA_MAC_VER_40: - case RTL_GIGA_MAC_VER_41: - case RTL_GIGA_MAC_VER_49: - RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) | 0xc0); - rtl_eri_set_bits(tp, 0x1a8, 0xfc000000); - break; - default: - break; - } - - phy_resume(tp->phydev); -} - static void rtl_init_rxcfg(struct rtl8169_private *tp) { switch (tp->mac_version) { @@@ -2244,7 -2276,7 +2269,7 @@@ case RTL_GIGA_MAC_VER_38: RTL_W32(tp, RxConfig, RX128_INT_EN | RX_MULTI_EN | RX_DMA_BURST); break; - case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_52: + case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_53: RTL_W32(tp, RxConfig, RX128_INT_EN | RX_MULTI_EN | RX_DMA_BURST | RX_EARLY_OFF); break; case RTL_GIGA_MAC_VER_60 ... RTL_GIGA_MAC_VER_63: @@@ -2310,14 -2342,13 +2335,14 @@@ static void r8168b_1_hw_jumbo_disable(s static void rtl_jumbo_config(struct rtl8169_private *tp) { bool jumbo = tp->dev->mtu > ETH_DATA_LEN; + int readrq = 4096;
rtl_unlock_config_regs(tp); switch (tp->mac_version) { case RTL_GIGA_MAC_VER_12: case RTL_GIGA_MAC_VER_17: if (jumbo) { - pcie_set_readrq(tp->pci_dev, 512); + readrq = 512; r8168b_1_hw_jumbo_enable(tp); } else { r8168b_1_hw_jumbo_disable(tp); @@@ -2325,7 -2356,7 +2350,7 @@@ break; case RTL_GIGA_MAC_VER_18 ... RTL_GIGA_MAC_VER_26: if (jumbo) { - pcie_set_readrq(tp->pci_dev, 512); + readrq = 512; r8168c_hw_jumbo_enable(tp); } else { r8168c_hw_jumbo_disable(tp); @@@ -2338,18 -2369,20 +2363,18 @@@ r8168dp_hw_jumbo_disable(tp); break; case RTL_GIGA_MAC_VER_31 ... RTL_GIGA_MAC_VER_33: - if (jumbo) { - pcie_set_readrq(tp->pci_dev, 512); + if (jumbo) r8168e_hw_jumbo_enable(tp); - } else { + else r8168e_hw_jumbo_disable(tp); - } break; default: break; } rtl_lock_config_regs(tp);
- if (!jumbo && pci_is_pcie(tp->pci_dev) && tp->supports_gmii) - pcie_set_readrq(tp->pci_dev, 4096); + if (pci_is_pcie(tp->pci_dev) && tp->supports_gmii) + pcie_set_readrq(tp->pci_dev, readrq); }
DECLARE_RTL_COND(rtl_chipcmd_cond) @@@ -2418,7 -2451,7 +2443,7 @@@ DECLARE_RTL_COND(rtl_rxtx_empty_cond_2 static void rtl_wait_txrx_fifo_empty(struct rtl8169_private *tp) { switch (tp->mac_version) { - case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_52: + case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_53: rtl_loop_wait_high(tp, &rtl_txcfg_empty_cond, 100, 42); rtl_loop_wait_high(tp, &rtl_rxtx_empty_cond, 100, 42); break; @@@ -3677,7 -3710,6 +3702,7 @@@ static void rtl_hw_config(struct rtl816 [RTL_GIGA_MAC_VER_50] = rtl_hw_start_8168ep_2, [RTL_GIGA_MAC_VER_51] = rtl_hw_start_8168ep_3, [RTL_GIGA_MAC_VER_52] = rtl_hw_start_8117, + [RTL_GIGA_MAC_VER_53] = rtl_hw_start_8117, [RTL_GIGA_MAC_VER_60] = rtl_hw_start_8125a_1, [RTL_GIGA_MAC_VER_61] = rtl_hw_start_8125a_2, [RTL_GIGA_MAC_VER_63] = rtl_hw_start_8125b, @@@ -4437,9 -4469,10 +4462,9 @@@ static inline int rtl8169_fragmented_fr
static inline void rtl8169_rx_csum(struct sk_buff *skb, u32 opts1) { - u32 status = opts1 & RxProtoMask; + u32 status = opts1 & (RxProtoMask | RxCSFailMask);
- if (((status == RxProtoTCP) && !(opts1 & TCPFail)) || - ((status == RxProtoUDP) && !(opts1 & UDPFail))) + if (status == RxProtoTCP || status == RxProtoUDP) skb->ip_summed = CHECKSUM_UNNECESSARY; else skb_checksum_none_assert(skb); @@@ -4553,10 -4586,8 +4578,10 @@@ static irqreturn_t rtl8169_interrupt(in rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_PENDING); }
- rtl_irq_disable(tp); - napi_schedule(&tp->napi); + if (napi_schedule_prep(&tp->napi)) { + rtl_irq_disable(tp); + __napi_schedule(&tp->napi); + } out: rtl_ack_events(tp, status);
@@@ -4588,10 -4619,10 +4613,10 @@@ static int rtl8169_poll(struct napi_str struct net_device *dev = tp->dev; int work_done;
- work_done = rtl_rx(dev, tp, budget); - rtl_tx(dev, tp, budget);
+ work_done = rtl_rx(dev, tp, budget); + if (work_done < budget && napi_complete_done(napi, work_done)) rtl_irq_enable(tp);
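The interrupt and poll hunks above follow the standard NAPI hand-off: the hard IRQ handler claims the poll with napi_schedule_prep() before masking device interrupts, and the poll routine reaps TX completions, processes RX within the budget, and re-enables interrupts only after napi_complete_done() confirms no further poll is pending. A condensed sketch of that pairing; struct nic_private and the nic_* helpers are placeholders, not r8169 code, and the fragment is not compilable outside the kernel:

/* Sketch of the NAPI hand-off used above; helper names are invented. */
static irqreturn_t nic_interrupt(int irq, void *dev_id)
{
        struct nic_private *np = dev_id;

        if (!nic_irq_pending(np))
                return IRQ_NONE;

        if (napi_schedule_prep(&np->napi)) {
                nic_irq_mask(np);               /* quiet the device ... */
                __napi_schedule(&np->napi);     /* ... and let the poller run */
        }
        return IRQ_HANDLED;
}

static int nic_poll(struct napi_struct *napi, int budget)
{
        struct nic_private *np = container_of(napi, struct nic_private, napi);
        int work_done;

        nic_clean_tx(np);                       /* TX completions first */
        work_done = nic_clean_rx(np, budget);   /* then RX, bounded by budget */

        if (work_done < budget && napi_complete_done(napi, work_done))
                nic_irq_unmask(np);             /* safe: nobody re-armed us */

        return work_done;
}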
@@@ -4648,12 -4679,12 +4673,12 @@@ static void rtl8169_down(struct rtl8169
rtl8169_cleanup(tp, true);
- rtl_pll_power_down(tp); + rtl_prepare_power_down(tp); }
static void rtl8169_up(struct rtl8169_private *tp) { - rtl_pll_power_up(tp); + phy_resume(tp->phydev); rtl8169_init_phy(tp); napi_enable(&tp->napi); set_bit(RTL_FLAG_TASK_ENABLED, tp->wk.flags); @@@ -4808,12 -4839,9 +4833,12 @@@ static void rtl8169_net_suspend(struct
#ifdef CONFIG_PM
-static int rtl8169_net_resume(struct rtl8169_private *tp) +static int rtl8169_runtime_resume(struct device *dev) { + struct rtl8169_private *tp = dev_get_drvdata(dev); + rtl_rar_set(tp, tp->dev->dev_addr); + __rtl8169_set_wol(tp, tp->saved_wolopts);
if (tp->TxDescArray) rtl8169_up(tp); @@@ -4847,7 -4875,7 +4872,7 @@@ static int __maybe_unused rtl8169_resum if (tp->mac_version == RTL_GIGA_MAC_VER_37) rtl_init_rxcfg(tp);
- return rtl8169_net_resume(tp); + return rtl8169_runtime_resume(device); }
static int rtl8169_runtime_suspend(struct device *device) @@@ -4867,6 -4895,15 +4892,6 @@@ return 0; }
-static int rtl8169_runtime_resume(struct device *device) -{ - struct rtl8169_private *tp = dev_get_drvdata(device); - - __rtl8169_set_wol(tp, tp->saved_wolopts); - - return rtl8169_net_resume(tp); -} - static int rtl8169_runtime_idle(struct device *device) { struct rtl8169_private *tp = dev_get_drvdata(device); @@@ -4914,10 -4951,12 +4939,10 @@@ static void rtl_shutdown(struct pci_de rtl_rar_set(tp, tp->dev->perm_addr);
if (system_state == SYSTEM_POWER_OFF) { - if (tp->saved_wolopts) { - rtl_wol_suspend_quirk(tp); + if (tp->saved_wolopts) rtl_wol_shutdown_quirk(tp); - }
- pci_wake_from_d3(pdev, true); + pci_wake_from_d3(pdev, tp->saved_wolopts); pci_set_power_state(pdev, PCI_D3hot); } } @@@ -4931,7 -4970,7 +4956,7 @@@ static void rtl_remove_one(struct pci_d
unregister_netdev(tp->dev);
- if (r8168_check_dash(tp)) + if (tp->dash_type != RTL_DASH_NONE) rtl8168_driver_stop(tp);
rtl_release_firmware(tp); @@@ -4999,12 -5038,16 +5024,12 @@@ static void rtl_read_mac_address(struc { /* Get MAC address */ if (rtl_is_8168evl_up(tp) && tp->mac_version != RTL_GIGA_MAC_VER_34) { - u32 value = rtl_eri_read(tp, 0xe0); - - mac_addr[0] = (value >> 0) & 0xff; - mac_addr[1] = (value >> 8) & 0xff; - mac_addr[2] = (value >> 16) & 0xff; - mac_addr[3] = (value >> 24) & 0xff; + u32 value;
+ value = rtl_eri_read(tp, 0xe0); + put_unaligned_le32(value, mac_addr); value = rtl_eri_read(tp, 0xe4); - mac_addr[4] = (value >> 0) & 0xff; - mac_addr[5] = (value >> 8) & 0xff; + put_unaligned_le16(value, mac_addr + 4); } else if (rtl_is_8125(tp)) { rtl_read_mac_from_reg(tp, mac_addr, MAC0_BKP); } @@@ -5056,7 -5099,7 +5081,7 @@@ static int r8169_mdio_register(struct r new_bus->name = "r8169"; new_bus->priv = tp; new_bus->parent = &pdev->dev; - new_bus->irq[0] = PHY_IGNORE_INTERRUPT; + new_bus->irq[0] = PHY_MAC_INTERRUPT; snprintf(new_bus->id, MII_BUS_ID_SIZE, "r8169-%x", pci_dev_id(pdev));
new_bus->read = r8169_mdio_read_reg; @@@ -5119,7 -5162,7 +5144,7 @@@ static void rtl_hw_init_8125(struct rtl static void rtl_hw_initialize(struct rtl8169_private *tp) { switch (tp->mac_version) { - case RTL_GIGA_MAC_VER_49 ... RTL_GIGA_MAC_VER_52: + case RTL_GIGA_MAC_VER_49 ... RTL_GIGA_MAC_VER_53: rtl8168ep_stop_cmac(tp); fallthrough; case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_48: @@@ -5285,14 -5328,12 +5310,14 @@@ static int rtl_init_one(struct pci_dev /* Identify chip attached to board */ chipset = rtl8169_get_mac_version(xid, tp->supports_gmii); if (chipset == RTL_GIGA_MAC_NONE) { - dev_err(&pdev->dev, "unknown chip XID %03x\n", xid); + dev_err(&pdev->dev, "unknown chip XID %03x, contact r8169 maintainers (see MAINTAINERS file)\n", xid); return -ENODEV; }
tp->mac_version = chipset;
+ tp->dash_type = rtl_check_dash(tp); + tp->cp_cmd = RTL_R16(tp, CPlusCmd) & CPCMD_MASK;
if (sizeof(dma_addr_t) > 4 && tp->mac_version >= RTL_GIGA_MAC_VER_18 && @@@ -5362,8 -5403,6 +5387,8 @@@ /* configure chip for default features */ rtl8169_set_features(dev, dev->features);
+ rtl_set_d3_pll_down(tp, true); + jumbo_max = rtl_jumbo_max(tp); if (jumbo_max) dev->max_mtu = jumbo_max; @@@ -5384,6 -5423,9 +5409,6 @@@ if (rc) return rc;
- /* chip gets powered up in rtl_open() */ - rtl_pll_power_down(tp); - rc = register_netdev(dev); if (rc) return rc; @@@ -5397,7 -5439,7 +5422,7 @@@ jumbo_max, tp->mac_version <= RTL_GIGA_MAC_VER_06 ? "ok" : "ko");
- if (r8168_check_dash(tp)) { + if (tp->dash_type != RTL_DASH_NONE) { netdev_info(dev, "DASH enabled\n"); rtl8168_driver_start(tp); } diff --combined drivers/net/ethernet/xilinx/xilinx_axienet_main.c index e8febfb1924d,b4a0bfce5b76..3a8775e0ca55 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@@ -1469,13 -1469,6 +1469,13 @@@ axienet_ethtools_set_link_ksettings(str return phylink_ethtool_ksettings_set(lp->phylink, cmd); }
+static int axienet_ethtools_nway_reset(struct net_device *dev) +{ + struct axienet_local *lp = netdev_priv(dev); + + return phylink_ethtool_nway_reset(lp->phylink); +} + static const struct ethtool_ops axienet_ethtool_ops = { .supported_coalesce_params = ETHTOOL_COALESCE_MAX_FRAMES, .get_drvinfo = axienet_ethtools_get_drvinfo, @@@ -1490,7 -1483,6 +1490,7 @@@ .set_coalesce = axienet_ethtools_set_coalesce, .get_link_ksettings = axienet_ethtools_get_link_ksettings, .set_link_ksettings = axienet_ethtools_set_link_ksettings, + .nway_reset = axienet_ethtools_nway_reset, };
static void axienet_validate(struct phylink_config *config, @@@ -1502,22 -1494,13 +1502,22 @@@ __ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
/* Only support the mode we are configured for */ - if (state->interface != PHY_INTERFACE_MODE_NA && - state->interface != lp->phy_mode) { - netdev_warn(ndev, "Cannot use PHY mode %s, supported: %s\n", - phy_modes(state->interface), - phy_modes(lp->phy_mode)); - bitmap_zero(supported, __ETHTOOL_LINK_MODE_MASK_NBITS); - return; + switch (state->interface) { + case PHY_INTERFACE_MODE_NA: + break; + case PHY_INTERFACE_MODE_1000BASEX: + case PHY_INTERFACE_MODE_SGMII: + if (lp->switch_x_sgmii) + break; + fallthrough; + default: + if (state->interface != lp->phy_mode) { + netdev_warn(ndev, "Cannot use PHY mode %s, supported: %s\n", + phy_modes(state->interface), + phy_modes(lp->phy_mode)); + bitmap_zero(supported, __ETHTOOL_LINK_MODE_MASK_NBITS); + return; + } }
phylink_set(mask, Autoneg); @@@ -1577,33 -1560,6 +1577,33 @@@ static void axienet_mac_an_restart(stru phylink_mii_c22_pcs_an_restart(lp->pcs_phy); }
+static int axienet_mac_prepare(struct phylink_config *config, unsigned int mode, + phy_interface_t iface) +{ + struct net_device *ndev = to_net_dev(config->dev); + struct axienet_local *lp = netdev_priv(ndev); + int ret; + + switch (iface) { + case PHY_INTERFACE_MODE_SGMII: + case PHY_INTERFACE_MODE_1000BASEX: + if (!lp->switch_x_sgmii) + return 0; + + ret = mdiobus_write(lp->pcs_phy->bus, + lp->pcs_phy->addr, + XLNX_MII_STD_SELECT_REG, + iface == PHY_INTERFACE_MODE_SGMII ? + XLNX_MII_STD_SELECT_SGMII : 0); + if (ret < 0) + netdev_warn(ndev, "Failed to switch PHY interface: %d\n", + ret); + return ret; + default: + return 0; + } +} + static void axienet_mac_config(struct phylink_config *config, unsigned int mode, const struct phylink_link_state *state) { @@@ -1681,7 -1637,6 +1681,7 @@@ static const struct phylink_mac_ops axi .validate = axienet_validate, .mac_pcs_get_state = axienet_mac_pcs_get_state, .mac_an_restart = axienet_mac_an_restart, + .mac_prepare = axienet_mac_prepare, .mac_config = axienet_mac_config, .mac_link_down = axienet_mac_link_down, .mac_link_up = axienet_mac_link_up, @@@ -1862,6 -1817,18 +1862,18 @@@ static int axienet_probe(struct platfor lp->options = XAE_OPTION_DEFAULTS; lp->rx_bd_num = RX_BD_NUM_DEFAULT; lp->tx_bd_num = TX_BD_NUM_DEFAULT; + + lp->clk = devm_clk_get_optional(&pdev->dev, NULL); + if (IS_ERR(lp->clk)) { + ret = PTR_ERR(lp->clk); + goto free_netdev; + } + ret = clk_prepare_enable(lp->clk); + if (ret) { + dev_err(&pdev->dev, "Unable to enable clock: %d\n", ret); + goto free_netdev; + } + /* Map device registers */ ethres = platform_get_resource(pdev, IORESOURCE_MEM, 0); lp->regs = devm_ioremap_resource(&pdev->dev, ethres); @@@ -1921,9 -1888,6 +1933,9 @@@ */ of_property_read_u32(pdev->dev.of_node, "xlnx,rxmem", &lp->rxmem);
+ lp->switch_x_sgmii = of_property_read_bool(pdev->dev.of_node, + "xlnx,switch-x-sgmii"); + /* Start with the proprietary, and broken phy_type */ ret = of_property_read_u32(pdev->dev.of_node, "xlnx,phy-type", &value); if (!ret) { @@@ -1953,12 -1917,6 +1965,12 @@@ if (ret) goto free_netdev; } + if (lp->switch_x_sgmii && lp->phy_mode != PHY_INTERFACE_MODE_SGMII && + lp->phy_mode != PHY_INTERFACE_MODE_1000BASEX) { + dev_err(&pdev->dev, "xlnx,switch-x-sgmii only supported with SGMII or 1000BaseX\n"); + ret = -EINVAL; + goto free_netdev; + }
/* Find the DMA node, map the DMA registers, and decode the DMA IRQs */ np = of_parse_phandle(pdev->dev.of_node, "axistream-connected", 0); @@@ -2046,20 -2004,6 +2058,6 @@@
lp->phy_node = of_parse_phandle(pdev->dev.of_node, "phy-handle", 0); if (lp->phy_node) { - lp->clk = devm_clk_get(&pdev->dev, NULL); - if (IS_ERR(lp->clk)) { - dev_warn(&pdev->dev, "Failed to get clock: %ld\n", - PTR_ERR(lp->clk)); - lp->clk = NULL; - } else { - ret = clk_prepare_enable(lp->clk); - if (ret) { - dev_err(&pdev->dev, "Unable to enable clock: %d\n", - ret); - goto free_netdev; - } - } - ret = axienet_mdio_setup(lp); if (ret) dev_warn(&pdev->dev, diff --combined drivers/net/ipa/ipa_main.c index c10e7340b031,eb1c8396bcdd..97c1b55405cb --- a/drivers/net/ipa/ipa_main.c +++ b/drivers/net/ipa/ipa_main.c @@@ -15,6 -15,7 +15,6 @@@ #include <linux/of.h> #include <linux/of_device.h> #include <linux/of_address.h> -#include <linux/remoteproc.h> #include <linux/qcom_scm.h> #include <linux/soc/qcom/mdt_loader.h>
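Back in the axienet probe changes above (just before the ipa_main.c diff starts), the clock is now looked up with devm_clk_get_optional() early in probe and enabled before the registers are mapped, rather than only when a phy-handle is present. A sketch of how that optional-clock pattern usually looks; mydev_probe and struct mydev are illustrative, not the axienet code:

#include <linux/clk.h>
#include <linux/err.h>
#include <linux/platform_device.h>

/* Sketch only: illustrative probe path, error handling shortened. */
struct mydev {
        struct clk *clk;
};

static int mydev_probe(struct platform_device *pdev)
{
        struct mydev *priv;
        int ret;

        priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
        if (!priv)
                return -ENOMEM;

        /* Returns NULL (not an error) when the DT node simply has no clock. */
        priv->clk = devm_clk_get_optional(&pdev->dev, NULL);
        if (IS_ERR(priv->clk))
                return PTR_ERR(priv->clk);

        ret = clk_prepare_enable(priv->clk);    /* a NULL clk is a no-op */
        if (ret) {
                dev_err(&pdev->dev, "Unable to enable clock: %d\n", ret);
                return ret;
        }

        /* ... map registers, request IRQs, register the netdev ... */
        return 0;
}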
@@@ -580,10 -581,10 +580,10 @@@ ipa_resource_config(struct ipa *ipa, co return -EINVAL;
for (i = 0; i < data->resource_src_count; i++) - ipa_resource_config_src(ipa, data->resource_src); + ipa_resource_config_src(ipa, &data->resource_src[i]);
for (i = 0; i < data->resource_dst_count; i++) - ipa_resource_config_dst(ipa, data->resource_dst); + ipa_resource_config_dst(ipa, &data->resource_dst[i]);
return 0; } @@@ -728,6 -729,19 +728,6 @@@ static const struct of_device_id ipa_ma }; MODULE_DEVICE_TABLE(of, ipa_match);
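The ipa_resource_config() fix above corrects a classic indexing slip: the loop passed the array base on every iteration, so only element 0 was ever configured. A tiny stand-alone reproduction of that bug class and its fix:

#include <stdio.h>

struct resource_limits { int min; int max; };

static void configure(const struct resource_limits *r)
{
        printf("min=%d max=%d\n", r->min, r->max);
}

int main(void)
{
        const struct resource_limits src[3] = { {1, 2}, {3, 4}, {5, 6} };
        int i;

        for (i = 0; i < 3; i++)
                configure(src);         /* bug: always configures element 0 */

        for (i = 0; i < 3; i++)
                configure(&src[i]);     /* fixed: configures the i-th element */

        return 0;
}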
-static phandle of_property_read_phandle(const struct device_node *np, - const char *name) -{ - struct property *prop; - int len = 0; - - prop = of_find_property(np, name, &len); - if (!prop || len != sizeof(__be32)) - return 0; - - return be32_to_cpup(prop->value); -} - /* Check things that can be validated at build time. This just * groups these things BUILD_BUG_ON() calls don't clutter the rest * of the code. @@@ -793,8 -807,10 +793,8 @@@ static int ipa_probe(struct platform_de struct device *dev = &pdev->dev; const struct ipa_data *data; struct ipa_clock *clock; - struct rproc *rproc; bool modem_init; struct ipa *ipa; - phandle ph; int ret;
ipa_validate_build(); @@@ -813,12 -829,25 +813,12 @@@ if (!qcom_scm_is_available()) return -EPROBE_DEFER;
- /* We rely on remoteproc to tell us about modem state changes */ - ph = of_property_read_phandle(dev->of_node, "modem-remoteproc"); - if (!ph) { - dev_err(dev, "DT missing "modem-remoteproc" property\n"); - return -EINVAL; - } - - rproc = rproc_get_by_phandle(ph); - if (!rproc) - return -EPROBE_DEFER; - /* The clock and interconnects might not be ready when we're * probed, so might return -EPROBE_DEFER. */ clock = ipa_clock_init(dev, data->clock_data); - if (IS_ERR(clock)) { - ret = PTR_ERR(clock); - goto err_rproc_put; - } + if (IS_ERR(clock)) + return PTR_ERR(clock);
/* No more EPROBE_DEFER. Allocate and initialize the IPA structure */ ipa = kzalloc(sizeof(*ipa), GFP_KERNEL); @@@ -829,9 -858,9 +829,9 @@@
ipa->pdev = pdev; dev_set_drvdata(dev, ipa); - ipa->modem_rproc = rproc; ipa->clock = clock; ipa->version = data->version; + init_completion(&ipa->completion);
ret = ipa_reg_init(ipa); if (ret) @@@ -906,6 -935,8 +906,6 @@@ err_kfree_ipa kfree(ipa); err_clock_exit: ipa_clock_exit(clock); -err_rproc_put: - rproc_put(rproc);
return ret; } @@@ -913,6 -944,7 +913,6 @@@ static int ipa_remove(struct platform_device *pdev) { struct ipa *ipa = dev_get_drvdata(&pdev->dev); - struct rproc *rproc = ipa->modem_rproc; struct ipa_clock *clock = ipa->clock; int ret;
@@@ -938,6 -970,7 +938,6 @@@ ipa_reg_exit(ipa); kfree(ipa); ipa_clock_exit(clock); - rproc_put(rproc);
return 0; } diff --combined drivers/net/phy/phy_device.c index d6ac3ed38197,71169e7d6177..ce495473cd5d --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@@ -300,50 -300,22 +300,22 @@@ static int mdio_bus_phy_resume(struct d
phydev->suspended_by_mdio_bus = 0;
- ret = phy_resume(phydev); + ret = phy_init_hw(phydev); if (ret < 0) return ret;
- no_resume: - if (phydev->attached_dev && phydev->adjust_link) - phy_start_machine(phydev); - - return 0; - } - - static int mdio_bus_phy_restore(struct device *dev) - { - struct phy_device *phydev = to_phy_device(dev); - struct net_device *netdev = phydev->attached_dev; - int ret; - - if (!netdev) - return 0; - - ret = phy_init_hw(phydev); + ret = phy_resume(phydev); if (ret < 0) return ret; - + no_resume: if (phydev->attached_dev && phydev->adjust_link) phy_start_machine(phydev);
return 0; }
- static const struct dev_pm_ops mdio_bus_phy_pm_ops = { - .suspend = mdio_bus_phy_suspend, - .resume = mdio_bus_phy_resume, - .freeze = mdio_bus_phy_suspend, - .thaw = mdio_bus_phy_resume, - .restore = mdio_bus_phy_restore, - }; - - #define MDIO_BUS_PHY_PM_OPS (&mdio_bus_phy_pm_ops) - - #else - - #define MDIO_BUS_PHY_PM_OPS NULL - + static SIMPLE_DEV_PM_OPS(mdio_bus_phy_pm_ops, mdio_bus_phy_suspend, + mdio_bus_phy_resume); #endif /* CONFIG_PM */
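The mdio_bus_phy_pm_ops change above collapses a hand-written dev_pm_ops (freeze/thaw pointing at the same suspend/resume callbacks, plus a separate restore path now folded into resume) into SIMPLE_DEV_PM_OPS, and replaces the MDIO_BUS_PHY_PM_OPS NULL-vs-pointer #ifdef with pm_ptr(), which evaluates to NULL when CONFIG_PM is disabled. A minimal sketch of the same wiring for a hypothetical platform driver; the mydrv_* names are invented, and the callbacks are marked __maybe_unused so the build stays clean without CONFIG_PM_SLEEP:

#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/pm.h>

/* Sketch only: illustrative driver, no real hardware behind it. */
static int __maybe_unused mydrv_suspend(struct device *dev)
{
        /* quiesce the hardware */
        return 0;
}

static int __maybe_unused mydrv_resume(struct device *dev)
{
        /* re-initialize the hardware */
        return 0;
}

/* Fills in suspend/resume/freeze/thaw/poweroff/restore from the two callbacks. */
static SIMPLE_DEV_PM_OPS(mydrv_pm_ops, mydrv_suspend, mydrv_resume);

static struct platform_driver mydrv_driver = {
        .driver = {
                .name = "mydrv",
                /* pm_ptr() yields NULL when CONFIG_PM is disabled */
                .pm = pm_ptr(&mydrv_pm_ops),
        },
};
module_platform_driver(mydrv_driver);
MODULE_LICENSE("GPL");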
/** @@@ -554,7 -526,7 +526,7 @@@ static const struct device_type mdio_bu .name = "PHY", .groups = phy_dev_groups, .release = phy_device_release, - .pm = MDIO_BUS_PHY_PM_OPS, + .pm = pm_ptr(&mdio_bus_phy_pm_ops), };
static int phy_request_driver_module(struct phy_device *dev, u32 phy_id) @@@ -606,7 -578,6 +578,7 @@@ struct phy_device *phy_device_create(st dev->pause = 0; dev->asym_pause = 0; dev->link = 0; + dev->port = PORT_TP; dev->interface = PHY_INTERFACE_MODE_GMII;
dev->autoneg = AUTONEG_ENABLE; @@@ -1144,10 -1115,19 +1116,19 @@@ int phy_init_hw(struct phy_device *phyd if (ret < 0) return ret;
- if (phydev->drv->config_init) + if (phydev->drv->config_init) { ret = phydev->drv->config_init(phydev); + if (ret < 0) + return ret; + }
- return ret; + if (phydev->drv->config_intr) { + ret = phydev->drv->config_intr(phydev); + if (ret < 0) + return ret; + } + + return 0; } EXPORT_SYMBOL(phy_init_hw);
@@@ -1167,8 -1147,8 +1148,8 @@@ char *phy_attached_info_irq(struct phy_ case PHY_POLL: irq_str = "POLL"; break; - case PHY_IGNORE_INTERRUPT: - irq_str = "IGNORE"; + case PHY_MAC_INTERRUPT: + irq_str = "MAC"; break; default: snprintf(irq_num, sizeof(irq_num), "%d", phydev->irq); @@@ -1377,8 -1357,6 +1358,8 @@@ int phy_attach_direct(struct net_devic
if (phydev->sfp_bus_attached) dev->sfp_bus = phydev->sfp_bus; + else if (dev->sfp_bus) + phydev->is_on_sfp_module = true; }
/* Some Ethernet drivers try to connect to a PHY device before @@@ -1406,14 -1384,6 +1387,14 @@@
phydev->state = PHY_READY;
+ /* Port is set to PORT_TP by default and the actual PHY driver will set + * it to different value depending on the PHY configuration. If we have + * the generic PHY driver we can't figure it out, thus set the old + * legacy PORT_MII value. + */ + if (using_genphy) + phydev->port = PORT_MII; + /* Initial carrier state is off as the phy is about to be * (re)initialized. */ @@@ -1751,7 -1721,7 +1732,7 @@@ int __phy_resume(struct phy_device *phy struct phy_driver *phydrv = phydev->drv; int ret;
- WARN_ON(!mutex_is_locked(&phydev->lock)); + lockdep_assert_held(&phydev->lock);
if (!phydrv || !phydrv->resume) return 0; diff --combined include/net/act_api.h index 761c0e331915,57be7c5d273b..2bf3092ae7ec --- a/include/net/act_api.h +++ b/include/net/act_api.h @@@ -166,6 -166,7 +166,7 @@@ int tcf_idr_create_from_flags(struct tc struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, u32 flags); + void tcf_idr_insert_many(struct tc_action *actions[]); void tcf_idr_cleanup(struct tc_action_net *tn, u32 index); int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, struct tc_action **a, int bind); @@@ -186,13 -187,10 +187,13 @@@ int tcf_action_init(struct net *net, st struct nlattr *est, char *name, int ovr, int bind, struct tc_action *actions[], size_t *attr_size, bool rtnl_held, struct netlink_ext_ack *extack); +struct tc_action_ops *tc_action_load_ops(char *name, struct nlattr *nla, + bool rtnl_held, + struct netlink_ext_ack *extack); struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, - bool rtnl_held, + struct tc_action_ops *ops, bool rtnl_held, struct netlink_ext_ack *extack); int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, int ref, bool terse); diff --combined include/uapi/linux/pkt_cls.h index afe6836e44b1,88f4bf0047e7..7ea59cfe1fa7 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@@ -591,8 -591,8 +591,9 @@@ enum TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED = 1 << 1, /* Part of an existing connection. */ TCA_FLOWER_KEY_CT_FLAGS_RELATED = 1 << 2, /* Related to an established connection. */ TCA_FLOWER_KEY_CT_FLAGS_TRACKED = 1 << 3, /* Conntrack has occurred. */ - + TCA_FLOWER_KEY_CT_FLAGS_INVALID = 1 << 4, /* Conntrack is invalid. */ + TCA_FLOWER_KEY_CT_FLAGS_REPLY = 1 << 5, /* Packet is in the reply direction. */ + __TCA_FLOWER_KEY_CT_FLAGS_MAX, };
enum { diff --combined kernel/bpf/verifier.c index 36d1e7339ede,20babdd06278..1dda9d81f12c --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@@ -228,12 -228,6 +228,12 @@@ static void bpf_map_key_store(struct bp (poisoned ? BPF_MAP_KEY_POISON : 0ULL); }
+static bool bpf_pseudo_call(const struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == BPF_PSEUDO_CALL; +} + struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; @@@ -1079,51 -1073,6 +1079,51 @@@ static void mark_reg_known_zero(struct __mark_reg_known_zero(regs + regno); }
+static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) +{ + switch (reg->type) { + case PTR_TO_MAP_VALUE_OR_NULL: { + const struct bpf_map *map = reg->map_ptr; + + if (map->inner_map_meta) { + reg->type = CONST_PTR_TO_MAP; + reg->map_ptr = map->inner_map_meta; + } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { + reg->type = PTR_TO_XDP_SOCK; + } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || + map->map_type == BPF_MAP_TYPE_SOCKHASH) { + reg->type = PTR_TO_SOCKET; + } else { + reg->type = PTR_TO_MAP_VALUE; + } + break; + } + case PTR_TO_SOCKET_OR_NULL: + reg->type = PTR_TO_SOCKET; + break; + case PTR_TO_SOCK_COMMON_OR_NULL: + reg->type = PTR_TO_SOCK_COMMON; + break; + case PTR_TO_TCP_SOCK_OR_NULL: + reg->type = PTR_TO_TCP_SOCK; + break; + case PTR_TO_BTF_ID_OR_NULL: + reg->type = PTR_TO_BTF_ID; + break; + case PTR_TO_MEM_OR_NULL: + reg->type = PTR_TO_MEM; + break; + case PTR_TO_RDONLY_BUF_OR_NULL: + reg->type = PTR_TO_RDONLY_BUF; + break; + case PTR_TO_RDWR_BUF_OR_NULL: + reg->type = PTR_TO_RDWR_BUF; + break; + default: + WARN_ON("unknown nullable register type"); + } +} + static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) { return type_is_pkt_pointer(reg->type); @@@ -1537,7 -1486,9 +1537,7 @@@ static int check_subprogs(struct bpf_ve
/* determine subprog starts. The end is one before the next starts */ for (i = 0; i < insn_cnt; i++) { - if (insn[i].code != (BPF_JMP | BPF_CALL)) - continue; - if (insn[i].src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn + i)) continue; if (!env->bpf_capable) { verbose(env, @@@ -2320,14 -2271,12 +2320,14 @@@ static void save_register_state(struct state->stack[spi].slot_type[i] = STACK_SPILL; }
-/* check_stack_read/write functions track spill/fill of registers, +/* check_stack_{read,write}_fixed_off functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ -static int check_stack_write(struct bpf_verifier_env *env, - struct bpf_func_state *state, /* func where register points to */ - int off, int size, int value_regno, int insn_idx) +static int check_stack_write_fixed_off(struct bpf_verifier_env *env, + /* stack frame we're writing to */ + struct bpf_func_state *state, + int off, int size, int value_regno, + int insn_idx) { struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; @@@ -2453,175 -2402,9 +2453,175 @@@ return 0; }
-static int check_stack_read(struct bpf_verifier_env *env, - struct bpf_func_state *reg_state /* func where register points to */, - int off, int size, int value_regno) +/* Write the stack: 'stack[ptr_regno + off] = value_regno'. 'ptr_regno' is + * known to contain a variable offset. + * This function checks whether the write is permitted and conservatively + * tracks the effects of the write, considering that each stack slot in the + * dynamic range is potentially written to. + * + * 'off' includes 'regno->off'. + * 'value_regno' can be -1, meaning that an unknown value is being written to + * the stack. + * + * Spilled pointers in range are not marked as written because we don't know + * what's going to be actually written. This means that read propagation for + * future reads cannot be terminated by this write. + * + * For privileged programs, uninitialized stack slots are considered + * initialized by this write (even though we don't know exactly what offsets + * are going to be written to). The idea is that we don't want the verifier to + * reject future reads that access slots written to through variable offsets. + */ +static int check_stack_write_var_off(struct bpf_verifier_env *env, + /* func where register points to */ + struct bpf_func_state *state, + int ptr_regno, int off, int size, + int value_regno, int insn_idx) +{ + struct bpf_func_state *cur; /* state of the current function */ + int min_off, max_off; + int i, err; + struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL; + bool writing_zero = false; + /* set if the fact that we're writing a zero is used to let any + * stack slots remain STACK_ZERO + */ + bool zero_used = false; + + cur = env->cur_state->frame[env->cur_state->curframe]; + ptr_reg = &cur->regs[ptr_regno]; + min_off = ptr_reg->smin_value + off; + max_off = ptr_reg->smax_value + off + size; + if (value_regno >= 0) + value_reg = &cur->regs[value_regno]; + if (value_reg && register_is_null(value_reg)) + writing_zero = true; + + err = realloc_func_state(state, round_up(-min_off, BPF_REG_SIZE), + state->acquired_refs, true); + if (err) + return err; + + + /* Variable offset writes destroy any spilled pointers in range. */ + for (i = min_off; i < max_off; i++) { + u8 new_type, *stype; + int slot, spi; + + slot = -i - 1; + spi = slot / BPF_REG_SIZE; + stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; + + if (!env->allow_ptr_leaks + && *stype != NOT_INIT + && *stype != SCALAR_VALUE) { + /* Reject the write if there's are spilled pointers in + * range. If we didn't reject here, the ptr status + * would be erased below (even though not all slots are + * actually overwritten), possibly opening the door to + * leaks. + */ + verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d", + insn_idx, i); + return -EINVAL; + } + + /* Erase all spilled pointers. */ + state->stack[spi].spilled_ptr.type = NOT_INIT; + + /* Update the slot type. */ + new_type = STACK_MISC; + if (writing_zero && *stype == STACK_ZERO) { + new_type = STACK_ZERO; + zero_used = true; + } + /* If the slot is STACK_INVALID, we check whether it's OK to + * pretend that it will be initialized by this write. The slot + * might not actually be written to, and so if we mark it as + * initialized future reads might leak uninitialized memory. + * For privileged programs, we will accept such reads to slots + * that may or may not be written because, if we're reject + * them, the error would be too confusing. 
+ */ + if (*stype == STACK_INVALID && !env->allow_uninit_stack) { + verbose(env, "uninit stack in range of var-offset write prohibited for !root; insn %d, off: %d", + insn_idx, i); + return -EINVAL; + } + *stype = new_type; + } + if (zero_used) { + /* backtracking doesn't work for STACK_ZERO yet. */ + err = mark_chain_precision(env, value_regno); + if (err) + return err; + } + return 0; +} + +/* When register 'dst_regno' is assigned some values from stack[min_off, + * max_off), we set the register's type according to the types of the + * respective stack slots. If all the stack values are known to be zeros, then + * so is the destination reg. Otherwise, the register is considered to be + * SCALAR. This function does not deal with register filling; the caller must + * ensure that all spilled registers in the stack range have been marked as + * read. + */ +static void mark_reg_stack_read(struct bpf_verifier_env *env, + /* func where src register points to */ + struct bpf_func_state *ptr_state, + int min_off, int max_off, int dst_regno) +{ + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + int i, slot, spi; + u8 *stype; + int zeros = 0; + + for (i = min_off; i < max_off; i++) { + slot = -i - 1; + spi = slot / BPF_REG_SIZE; + stype = ptr_state->stack[spi].slot_type; + if (stype[slot % BPF_REG_SIZE] != STACK_ZERO) + break; + zeros++; + } + if (zeros == max_off - min_off) { + /* any access_size read into register is zero extended, + * so the whole register == const_zero + */ + __mark_reg_const_zero(&state->regs[dst_regno]); + /* backtracking doesn't support STACK_ZERO yet, + * so mark it precise here, so that later + * backtracking can stop here. + * Backtracking may not need this if this register + * doesn't participate in pointer adjustment. + * Forward propagation of precise flag is not + * necessary either. This mark is only to stop + * backtracking. Any register that contributed + * to const 0 was marked precise before spill. + */ + state->regs[dst_regno].precise = true; + } else { + /* have read misc data from the stack */ + mark_reg_unknown(env, state->regs, dst_regno); + } + state->regs[dst_regno].live |= REG_LIVE_WRITTEN; +} + +/* Read the stack at 'off' and put the results into the register indicated by + * 'dst_regno'. It handles reg filling if the addressed stack slot is a + * spilled reg. + * + * 'dst_regno' can be -1, meaning that the read value is not going to a + * register. + * + * The access is assumed to be within the current stack bounds. + */ +static int check_stack_read_fixed_off(struct bpf_verifier_env *env, + /* func where src register points to */ + struct bpf_func_state *reg_state, + int off, int size, int dst_regno) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; @@@ -2629,6 -2412,11 +2629,6 @@@ struct bpf_reg_state *reg; u8 *stype;
- if (reg_state->allocated_stack <= slot) { - verbose(env, "invalid read from stack off %d+0 size %d\n", - off, size); - return -EACCES; - } stype = reg_state->stack[spi].slot_type; reg = ®_state->stack[spi].spilled_ptr;
@@@ -2639,9 -2427,9 +2639,9 @@@ verbose(env, "invalid size of register fill\n"); return -EACCES; } - if (value_regno >= 0) { - mark_reg_unknown(env, state->regs, value_regno); - state->regs[value_regno].live |= REG_LIVE_WRITTEN; + if (dst_regno >= 0) { + mark_reg_unknown(env, state->regs, dst_regno); + state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); return 0; @@@ -2653,16 -2441,16 +2653,16 @@@ } }
- if (value_regno >= 0) { + if (dst_regno >= 0) { /* restore register state from stack */ - state->regs[value_regno] = *reg; + state->regs[dst_regno] = *reg; /* mark reg as written since spilled pointer state likely * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions */ - state->regs[value_regno].live |= REG_LIVE_WRITTEN; + state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } else if (__is_pointer_value(env->allow_ptr_leaks, reg)) { - /* If value_regno==-1, the caller is asking us whether + /* If dst_regno==-1, the caller is asking us whether * it is acceptable to use this value as a SCALAR_VALUE * (e.g. for XADD). * We must not allow unprivileged callers to do that @@@ -2674,167 -2462,70 +2674,167 @@@ } mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); } else { - int zeros = 0; + u8 type;
for (i = 0; i < size; i++) { - if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC) + type = stype[(slot - i) % BPF_REG_SIZE]; + if (type == STACK_MISC) continue; - if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) { - zeros++; + if (type == STACK_ZERO) continue; - } verbose(env, "invalid read from stack off %d+%d size %d\n", off, i, size); return -EACCES; } mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); - if (value_regno >= 0) { - if (zeros == size) { - /* any size read into register is zero extended, - * so the whole register == const_zero - */ - __mark_reg_const_zero(&state->regs[value_regno]); - /* backtracking doesn't support STACK_ZERO yet, - * so mark it precise here, so that later - * backtracking can stop here. - * Backtracking may not need this if this register - * doesn't participate in pointer adjustment. - * Forward propagation of precise flag is not - * necessary either. This mark is only to stop - * backtracking. Any register that contributed - * to const 0 was marked precise before spill. - */ - state->regs[value_regno].precise = true; - } else { - /* have read misc data from the stack */ - mark_reg_unknown(env, state->regs, value_regno); - } - state->regs[value_regno].live |= REG_LIVE_WRITTEN; - } + if (dst_regno >= 0) + mark_reg_stack_read(env, reg_state, off, off + size, dst_regno); } return 0; }
-static int check_stack_access(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, - int off, int size) +enum stack_access_src { + ACCESS_DIRECT = 1, /* the access is performed by an instruction */ + ACCESS_HELPER = 2, /* the access is performed by a helper */ +}; + +static int check_stack_range_initialized(struct bpf_verifier_env *env, + int regno, int off, int access_size, + bool zero_size_allowed, + enum stack_access_src type, + struct bpf_call_arg_meta *meta); + +static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) +{ + return cur_regs(env) + regno; +} + +/* Read the stack at 'ptr_regno + off' and put the result into the register + * 'dst_regno'. + * 'off' includes the pointer register's fixed offset(i.e. 'ptr_regno.off'), + * but not its variable offset. + * 'size' is assumed to be <= reg size and the access is assumed to be aligned. + * + * As opposed to check_stack_read_fixed_off, this function doesn't deal with + * filling registers (i.e. reads of spilled register cannot be detected when + * the offset is not fixed). We conservatively mark 'dst_regno' as containing + * SCALAR_VALUE. That's why we assert that the 'ptr_regno' has a variable + * offset; for a fixed offset check_stack_read_fixed_off should be used + * instead. + */ +static int check_stack_read_var_off(struct bpf_verifier_env *env, + int ptr_regno, int off, int size, int dst_regno) { - /* Stack accesses must be at a fixed offset, so that we - * can determine what type of data were returned. See - * check_stack_read(). + /* The state of the source register. */ + struct bpf_reg_state *reg = reg_state(env, ptr_regno); + struct bpf_func_state *ptr_state = func(env, reg); + int err; + int min_off, max_off; + + /* Note that we pass a NULL meta, so raw access will not be permitted. */ - if (!tnum_is_const(reg->var_off)) { + err = check_stack_range_initialized(env, ptr_regno, off, size, + false, ACCESS_DIRECT, NULL); + if (err) + return err; + + min_off = reg->smin_value + off; + max_off = reg->smax_value + off; + mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno); + return 0; +} + +/* check_stack_read dispatches to check_stack_read_fixed_off or + * check_stack_read_var_off. + * + * The caller must ensure that the offset falls within the allocated stack + * bounds. + * + * 'dst_regno' is a register which will receive the value from the stack. It + * can be -1, meaning that the read value is not going to a register. + */ +static int check_stack_read(struct bpf_verifier_env *env, + int ptr_regno, int off, int size, + int dst_regno) +{ + struct bpf_reg_state *reg = reg_state(env, ptr_regno); + struct bpf_func_state *state = func(env, reg); + int err; + /* Some accesses are only permitted with a static offset. */ + bool var_off = !tnum_is_const(reg->var_off); + + /* The offset is required to be static when reads don't go to a + * register, in order to not leak pointers (see + * check_stack_read_fixed_off). + */ + if (dst_regno < 0 && var_off) { char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable stack access var_off=%s off=%d size=%d\n", + verbose(env, "variable offset stack pointer cannot be passed into helper function; var_off=%s off=%d size=%d\n", tn_buf, off, size); return -EACCES; } + /* Variable offset is prohibited for unprivileged mode for simplicity + * since it requires corresponding support in Spectre masking for stack + * ALU. See also retrieve_ptr_limit(). + */ + if (!env->bypass_spec_v1 && var_off) { + char tn_buf[48];
- if (off >= 0 || off < -MAX_BPF_STACK) { - verbose(env, "invalid stack off=%d size=%d\n", off, size); + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n", + ptr_regno, tn_buf); return -EACCES; }
- return 0; + if (!var_off) { + off += reg->var_off.value; + err = check_stack_read_fixed_off(env, state, off, size, + dst_regno); + } else { + /* Variable offset stack reads need more conservative handling + * than fixed offset ones. Note that dst_regno >= 0 on this + * branch. + */ + err = check_stack_read_var_off(env, ptr_regno, off, size, + dst_regno); + } + return err; +} + + +/* check_stack_write dispatches to check_stack_write_fixed_off or + * check_stack_write_var_off. + * + * 'ptr_regno' is the register used as a pointer into the stack. + * 'off' includes 'ptr_regno->off', but not its variable offset (if any). + * 'value_regno' is the register whose value we're writing to the stack. It can + * be -1, meaning that we're not writing from a register. + * + * The caller must ensure that the offset falls within the maximum stack size. + */ +static int check_stack_write(struct bpf_verifier_env *env, + int ptr_regno, int off, int size, + int value_regno, int insn_idx) +{ + struct bpf_reg_state *reg = reg_state(env, ptr_regno); + struct bpf_func_state *state = func(env, reg); + int err; + + if (tnum_is_const(reg->var_off)) { + off += reg->var_off.value; + err = check_stack_write_fixed_off(env, state, off, size, + value_regno, insn_idx); + } else { + /* Variable offset stack reads need more conservative handling + * than fixed offset ones. + */ + err = check_stack_write_var_off(env, state, + ptr_regno, off, size, + value_regno, insn_idx); + } + return err; }
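For illustration, the kind of program that exercises the variable-offset paths dispatched above is a bounded but non-constant index into an initialized stack buffer. A minimal sketch follows; the program name and section are illustrative and not part of this patch set, and a privileged loader is assumed since variable-offset stack access remains restricted for unprivileged programs.

/* Minimal sketch, not part of this patch: a bounded but non-constant stack
 * index.  The load below is a variable-offset stack read, routed through
 * check_stack_read_var_off() instead of being rejected outright.  The whole
 * buffer is zero-initialized so the accessed range is known to be initialized.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("socket")
int var_off_stack(struct __sk_buff *skb)
{
	char buf[16] = {};		/* initialized stack range */
	__u32 idx = skb->len & 0xf;	/* not a constant for the verifier, but bounded */

	return buf[idx];		/* load from fp + variable offset */
}

char _license[] SEC("license") = "GPL";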
static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, @@@ -3167,6 -2858,11 +3167,6 @@@ static int check_sock_access(struct bpf return -EACCES; }
-static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) -{ - return cur_regs(env) + regno; -} - static bool is_pointer_value(struct bpf_verifier_env *env, int regno) { return __is_pointer_value(env->allow_ptr_leaks, reg_state(env, regno)); @@@ -3285,8 -2981,8 +3285,8 @@@ static int check_ptr_alignment(struct b break; case PTR_TO_STACK: pointer_desc = "stack "; - /* The stack spill tracking logic in check_stack_write() - * and check_stack_read() relies on stack accesses being + /* The stack spill tracking logic in check_stack_write_fixed_off() + * and check_stack_read_fixed_off() relies on stack accesses being * aligned. */ strict = true; @@@ -3378,7 -3074,9 +3378,7 @@@ process_func continue_func: subprog_end = subprog[idx + 1].start; for (; i < subprog_end; i++) { - if (insn[i].code != (BPF_JMP | BPF_CALL)) - continue; - if (insn[i].src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn + i)) continue; /* remember insn and function to return to */ ret_insn[frame] = i + 1; @@@ -3702,91 -3400,6 +3702,91 @@@ static int check_ptr_to_map_access(stru return 0; }
+/* Check that the stack access at the given offset is within bounds. The + * maximum valid offset is -1. + * + * The minimum valid offset is -MAX_BPF_STACK for writes, and + * -state->allocated_stack for reads. + */ +static int check_stack_slot_within_bounds(int off, + struct bpf_func_state *state, + enum bpf_access_type t) +{ + int min_valid_off; + + if (t == BPF_WRITE) + min_valid_off = -MAX_BPF_STACK; + else + min_valid_off = -state->allocated_stack; + + if (off < min_valid_off || off > -1) + return -EACCES; + return 0; +} + +/* Check that the stack access at 'regno + off' falls within the maximum stack + * bounds. + * + * 'off' includes `regno->offset`, but not its dynamic part (if any). + */ +static int check_stack_access_within_bounds( + struct bpf_verifier_env *env, + int regno, int off, int access_size, + enum stack_access_src src, enum bpf_access_type type) +{ + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *reg = regs + regno; + struct bpf_func_state *state = func(env, reg); + int min_off, max_off; + int err; + char *err_extra; + + if (src == ACCESS_HELPER) + /* We don't know if helpers are reading or writing (or both). */ + err_extra = " indirect access to"; + else if (type == BPF_READ) + err_extra = " read from"; + else + err_extra = " write to"; + + if (tnum_is_const(reg->var_off)) { + min_off = reg->var_off.value + off; + if (access_size > 0) + max_off = min_off + access_size - 1; + else + max_off = min_off; + } else { + if (reg->smax_value >= BPF_MAX_VAR_OFF || + reg->smin_value <= -BPF_MAX_VAR_OFF) { + verbose(env, "invalid unbounded variable-offset%s stack R%d\n", + err_extra, regno); + return -EACCES; + } + min_off = reg->smin_value + off; + if (access_size > 0) + max_off = reg->smax_value + off + access_size - 1; + else + max_off = min_off; + } + + err = check_stack_slot_within_bounds(min_off, state, type); + if (!err) + err = check_stack_slot_within_bounds(max_off, state, type); + + if (err) { + if (tnum_is_const(reg->var_off)) { + verbose(env, "invalid%s stack R%d off=%d size=%d\n", + err_extra, regno, off, access_size); + } else { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "invalid variable-offset%s stack R%d var_off=%s size=%d\n", + err_extra, regno, tn_buf, access_size); + } + } + return err; +}
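A worked example of the bounds arithmetic above, with made-up register state: for a pointer r = fp + var where var is in [-24, -16], an 8-byte write at off = -8 gives min_off = -32 and max_off = -17, both of which must lie in [-MAX_BPF_STACK, -1] (or [-allocated_stack, -1] for a read). A standalone, purely illustrative sketch of that computation:

/* Illustration only, not verifier code: the min_off/max_off arithmetic above
 * for one made-up register state.
 */
#include <assert.h>

#define MAX_BPF_STACK 512

int main(void)
{
	/* r2 = fp + var with var in [-24, -16]; 8-byte write at off = -8 */
	long long smin = -24, smax = -16, off = -8, access_size = 8;
	long long min_off = smin + off;				/* -32 */
	long long max_off = smax + off + access_size - 1;	/* -17 */

	/* for a write, both extremes must satisfy -MAX_BPF_STACK <= slot <= -1 */
	assert(min_off >= -MAX_BPF_STACK && min_off <= -1);
	assert(max_off >= -MAX_BPF_STACK && max_off <= -1);
	return 0;
}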
/* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory @@@ -3902,8 -3515,8 +3902,8 @@@ static int check_mem_access(struct bpf_ }
} else if (reg->type == PTR_TO_STACK) { - off += reg->var_off.value; - err = check_stack_access(env, reg, off, size); + /* Basic bounds checks. */ + err = check_stack_access_within_bounds(env, regno, off, size, ACCESS_DIRECT, t); if (err) return err;
@@@ -3912,12 -3525,12 +3912,12 @@@ if (err) return err;
- if (t == BPF_WRITE) - err = check_stack_write(env, state, off, size, - value_regno, insn_idx); - else - err = check_stack_read(env, state, off, size, + if (t == BPF_READ) + err = check_stack_read(env, regno, off, size, value_regno); + else + err = check_stack_write(env, regno, off, size, + value_regno, insn_idx); } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose(env, "cannot write into packet\n"); @@@ -3993,30 -3606,13 +3993,30 @@@ return err; }
-static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) +static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) { + int load_reg; int err;
- if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || - insn->imm != 0) { - verbose(env, "BPF_XADD uses reserved fields\n"); + switch (insn->imm) { + case BPF_ADD: + case BPF_ADD | BPF_FETCH: + case BPF_AND: + case BPF_AND | BPF_FETCH: + case BPF_OR: + case BPF_OR | BPF_FETCH: + case BPF_XOR: + case BPF_XOR | BPF_FETCH: + case BPF_XCHG: + case BPF_CMPXCHG: + break; + default: + verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm); + return -EINVAL; + } + + if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) { + verbose(env, "invalid atomic operand size\n"); return -EINVAL; }
@@@ -4030,13 -3626,6 +4030,13 @@@ if (err) return err;
+ if (insn->imm == BPF_CMPXCHG) { + /* Check comparison of R0 with memory location */ + err = check_reg_arg(env, BPF_REG_0, SRC_OP); + if (err) + return err; + } + if (is_pointer_value(env, insn->src_reg)) { verbose(env, "R%d leaks addr into mem\n", insn->src_reg); return -EACCES; @@@ -4046,91 -3635,66 +4046,91 @@@ is_pkt_reg(env, insn->dst_reg) || is_flow_key_reg(env, insn->dst_reg) || is_sk_reg(env, insn->dst_reg)) { - verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", + verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str[reg_state(env, insn->dst_reg)->type]); return -EACCES; }
- /* check whether atomic_add can read the memory */ + if (insn->imm & BPF_FETCH) { + if (insn->imm == BPF_CMPXCHG) + load_reg = BPF_REG_0; + else + load_reg = insn->src_reg; + + /* check and record load of old value */ + err = check_reg_arg(env, load_reg, DST_OP); + if (err) + return err; + } else { + /* This instruction accesses a memory location but doesn't + * actually load it into a register. + */ + load_reg = -1; + } + + /* check whether we can read the memory */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, -1, true); + BPF_SIZE(insn->code), BPF_READ, load_reg, true); if (err) return err;
- /* check whether atomic_add can write into the same memory */ - return check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_WRITE, -1, true); -} - -static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno, - int off, int access_size, - bool zero_size_allowed) -{ - struct bpf_reg_state *reg = reg_state(env, regno); - - if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || - access_size < 0 || (access_size == 0 && !zero_size_allowed)) { - if (tnum_is_const(reg->var_off)) { - verbose(env, "invalid stack type R%d off=%d access_size=%d\n", - regno, off, access_size); - } else { - char tn_buf[48]; + /* check whether we can write into the same memory */ + err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, -1, true); + if (err) + return err;
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid stack type R%d var_off=%s access_size=%d\n", - regno, tn_buf, access_size); - } - return -EACCES; - } return 0; }
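The check_atomic() path above accepts the widened BPF_ATOMIC encodings (the fetch variants, BPF_XCHG and BPF_CMPXCHG). As a sketch of how these are typically reached from BPF C — assuming an LLVM new enough to emit them (roughly LLVM 12 with -mcpu=v3); the global counter and section name are illustrative:

/* Sketch only: C builtins that LLVM (assumed >= 12, -mcpu=v3) lowers to the
 * BPF_ATOMIC encodings accepted by check_atomic() above.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

static __u64 counter;

SEC("socket")
int atomics(struct __sk_buff *skb)
{
	__u64 old;

	__sync_fetch_and_add(&counter, 1);		   /* BPF_ADD (BPF_FETCH when the result is used) */
	old = __sync_val_compare_and_swap(&counter, 0, 1); /* BPF_CMPXCHG, old value returned in R0 */
	old = __sync_lock_test_and_set(&counter, 2);	   /* BPF_XCHG */

	return old != 0;
}

char _license[] SEC("license") = "GPL";

The fetch variants are also why the 32-bit zero-extension pass later in this diff has to pick the loaded register (R0 for BPF_CMPXCHG, src_reg otherwise) rather than dst_reg.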
-/* when register 'regno' is passed into function that will read 'access_size' - * bytes from that pointer, make sure that it's within stack boundary - * and all elements of stack are initialized. - * Unlike most pointer bounds-checking functions, this one doesn't take an - * 'off' argument, so it has to add in reg->off itself. +/* When register 'regno' is used to read the stack (either directly or through + * a helper function) make sure that it's within stack boundary and, depending + * on the access type, that all elements of the stack are initialized. + * + * 'off' includes 'regno->off', but not its dynamic part (if any). + * + * All registers that have been spilled on the stack in the slots within the + * read offsets are marked as read. */ -static int check_stack_boundary(struct bpf_verifier_env *env, int regno, - int access_size, bool zero_size_allowed, - struct bpf_call_arg_meta *meta) +static int check_stack_range_initialized( + struct bpf_verifier_env *env, int regno, int off, + int access_size, bool zero_size_allowed, + enum stack_access_src type, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = func(env, reg); int err, min_off, max_off, i, j, slot, spi; + char *err_extra = type == ACCESS_HELPER ? " indirect" : ""; + enum bpf_access_type bounds_check_type; + /* Some accesses can write anything into the stack, others are + * read-only. + */ + bool clobber = false; + + if (access_size == 0 && !zero_size_allowed) { + verbose(env, "invalid zero-sized read\n"); + return -EACCES; + } + + if (type == ACCESS_HELPER) { + /* The bounds checks for writes are more permissive than for + * reads. However, if raw_mode is not set, we'll do extra + * checks below. + */ + bounds_check_type = BPF_WRITE; + clobber = true; + } else { + bounds_check_type = BPF_READ; + } + err = check_stack_access_within_bounds(env, regno, off, access_size, + type, bounds_check_type); + if (err) + return err; +
if (tnum_is_const(reg->var_off)) { - min_off = max_off = reg->var_off.value + reg->off; - err = __check_stack_boundary(env, regno, min_off, access_size, - zero_size_allowed); - if (err) - return err; + min_off = max_off = reg->var_off.value + off; } else { /* Variable offset is prohibited for unprivileged mode for * simplicity since it requires corresponding support in @@@ -4141,8 -3705,8 +4141,8 @@@ char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "R%d indirect variable offset stack access prohibited for !root, var_off=%s\n", - regno, tn_buf); + verbose(env, "R%d%s variable offset stack access prohibited for !root, var_off=%s\n", + regno, err_extra, tn_buf); return -EACCES; } /* Only initialized buffer on stack is allowed to be accessed @@@ -4154,8 -3718,28 +4154,8 @@@ if (meta && meta->raw_mode) meta = NULL;
- if (reg->smax_value >= BPF_MAX_VAR_OFF || - reg->smax_value <= -BPF_MAX_VAR_OFF) { - verbose(env, "R%d unbounded indirect variable offset stack access\n", - regno); - return -EACCES; - } - min_off = reg->smin_value + reg->off; - max_off = reg->smax_value + reg->off; - err = __check_stack_boundary(env, regno, min_off, access_size, - zero_size_allowed); - if (err) { - verbose(env, "R%d min value is outside of stack bound\n", - regno); - return err; - } - err = __check_stack_boundary(env, regno, max_off, access_size, - zero_size_allowed); - if (err) { - verbose(env, "R%d max value is outside of stack bound\n", - regno); - return err; - } + min_off = reg->smin_value + off; + max_off = reg->smax_value + off; }
if (meta && meta->raw_mode) { @@@ -4175,10 -3759,8 +4175,10 @@@ if (*stype == STACK_MISC) goto mark; if (*stype == STACK_ZERO) { - /* helper can write anything into the stack */ - *stype = STACK_MISC; + if (clobber) { + /* helper can write anything into the stack */ + *stype = STACK_MISC; + } goto mark; }
@@@ -4189,24 -3771,22 +4189,24 @@@ if (state->stack[spi].slot_type[0] == STACK_SPILL && (state->stack[spi].spilled_ptr.type == SCALAR_VALUE || env->allow_ptr_leaks)) { - __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); - for (j = 0; j < BPF_REG_SIZE; j++) - state->stack[spi].slot_type[j] = STACK_MISC; + if (clobber) { + __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); + for (j = 0; j < BPF_REG_SIZE; j++) + state->stack[spi].slot_type[j] = STACK_MISC; + } goto mark; }
err: if (tnum_is_const(reg->var_off)) { - verbose(env, "invalid indirect read from stack off %d+%d size %d\n", - min_off, i - min_off, access_size); + verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n", + err_extra, regno, min_off, i - min_off, access_size); } else { char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid indirect read from stack var_off %s+%d size %d\n", - tn_buf, i - min_off, access_size); + verbose(env, "invalid%s read from stack R%d var_off %s+%d size %d\n", + err_extra, regno, tn_buf, i - min_off, access_size); } return -EACCES; mark: @@@ -4255,10 -3835,8 +4255,10 @@@ static int check_helper_mem_access(stru "rdwr", &env->prog->aux->max_rdwr_access); case PTR_TO_STACK: - return check_stack_boundary(env, regno, access_size, - zero_size_allowed, meta); + return check_stack_range_initialized( + env, + regno, reg->off, access_size, + zero_size_allowed, ACCESS_HELPER, meta); default: /* scalar_value or invalid ptr */ /* Allow zero-byte read from NULL, regardless of pointer type */ if (zero_size_allowed && access_size == 0 && @@@ -4272,29 -3850,6 +4272,29 @@@ } }
+int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + u32 regno, u32 mem_size) +{ + if (register_is_null(reg)) + return 0; + + if (reg_type_may_be_null(reg->type)) { + /* Assuming that the register contains a value check if the memory + * access is safe. Temporarily save and restore the register's state as + * the conversion shouldn't be visible to a caller. + */ + const struct bpf_reg_state saved_reg = *reg; + int rv; + + mark_ptr_not_null_reg(reg); + rv = check_helper_mem_access(env, regno, mem_size, true, NULL); + *reg = saved_reg; + return rv; + } + + return check_helper_mem_access(env, regno, mem_size, true, NULL); +} + /* Implementation details: * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL * Two bpf_map_lookups (even with the same key) will have different reg->id. @@@ -4766,7 -4321,7 +4766,7 @@@ skip_type_check err = mark_chain_precision(env, regno); } else if (arg_type_is_alloc_size(arg_type)) { if (!tnum_is_const(reg->var_off)) { - verbose(env, "R%d unbounded size, use 'var &= const' or 'if (var < const)'\n", + verbose(env, "R%d is not a known constant'\n", regno); return -EACCES; } @@@ -5279,9 -4834,8 +5279,9 @@@ static int check_func_call(struct bpf_v subprog); clear_caller_saved_regs(env, caller->regs);
- /* All global functions return SCALAR_VALUE */ + /* All global functions return a 64-bit SCALAR_VALUE */ mark_reg_unknown(env, caller->regs, BPF_REG_0); + caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
/* continue with next insn after call */ return 0; @@@ -5946,41 -5500,6 +5946,41 @@@ do_sim return !ret ? -EFAULT : 0; }
+/* check that stack access falls within stack limits and that 'reg' doesn't + * have a variable offset. + * + * Variable offset is prohibited for unprivileged mode for simplicity since it + * requires corresponding support in Spectre masking for stack ALU. See also + * retrieve_ptr_limit(). + * + * + * 'off' includes 'reg->off'. + */ +static int check_stack_access_for_ptr_arithmetic( + struct bpf_verifier_env *env, + int regno, + const struct bpf_reg_state *reg, + int off) +{ + if (!tnum_is_const(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "R%d variable stack access prohibited for !root, var_off=%s off=%d\n", + regno, tn_buf, off); + return -EACCES; + } + + if (off >= 0 || off < -MAX_BPF_STACK) { + verbose(env, "R%d stack pointer arithmetic goes out of range, " + "prohibited for !root; off=%d\n", regno, off); + return -EACCES; + } + + return 0; +} + + /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. * Caller should also handle BPF_MOV case separately. * If we return -EACCES, caller may want to try again treating pointer as a @@@ -6224,9 -5743,10 +6224,9 @@@ static int adjust_ptr_min_max_vals(stru "prohibited for !root\n", dst); return -EACCES; } else if (dst_reg->type == PTR_TO_STACK && - check_stack_access(env, dst_reg, dst_reg->off + - dst_reg->var_off.value, 1)) { - verbose(env, "R%d stack pointer arithmetic goes out of range, " - "prohibited for !root\n", dst); + check_stack_access_for_ptr_arithmetic( + env, dst, dst_reg, dst_reg->off + + dst_reg->var_off.value)) { return -EACCES; } } @@@ -6705,7 -6225,7 +6705,7 @@@ static void scalar32_min_max_rsh(struc * 3) the signed bounds cross zero, so they tell us nothing * about the result * If the value in dst_reg is known nonnegative, then again the - * unsigned bounts capture the signed bounds. + * unsigned bounds capture the signed bounds. * Thus, in all cases it suffices to blow away our signed bounds * and rely on inferring new ones from the unsigned bounds and * var_off of the result. @@@ -6736,7 -6256,7 +6736,7 @@@ static void scalar_min_max_rsh(struct b * 3) the signed bounds cross zero, so they tell us nothing * about the result * If the value in dst_reg is known nonnegative, then again the - * unsigned bounts capture the signed bounds. + * unsigned bounds capture the signed bounds. * Thus, in all cases it suffices to blow away our signed bounds * and rely on inferring new ones from the unsigned bounds and * var_off of the result. 
@@@ -7806,19 -7326,43 +7806,19 @@@ static void mark_ptr_or_null_reg(struc } if (is_null) { reg->type = SCALAR_VALUE; - } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - const struct bpf_map *map = reg->map_ptr; - - if (map->inner_map_meta) { - reg->type = CONST_PTR_TO_MAP; - reg->map_ptr = map->inner_map_meta; - } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { - reg->type = PTR_TO_XDP_SOCK; - } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || - map->map_type == BPF_MAP_TYPE_SOCKHASH) { - reg->type = PTR_TO_SOCKET; - } else { - reg->type = PTR_TO_MAP_VALUE; - } - } else if (reg->type == PTR_TO_SOCKET_OR_NULL) { - reg->type = PTR_TO_SOCKET; - } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) { - reg->type = PTR_TO_SOCK_COMMON; - } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { - reg->type = PTR_TO_TCP_SOCK; - } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) { - reg->type = PTR_TO_BTF_ID; - } else if (reg->type == PTR_TO_MEM_OR_NULL) { - reg->type = PTR_TO_MEM; - } else if (reg->type == PTR_TO_RDONLY_BUF_OR_NULL) { - reg->type = PTR_TO_RDONLY_BUF; - } else if (reg->type == PTR_TO_RDWR_BUF_OR_NULL) { - reg->type = PTR_TO_RDWR_BUF; - } - if (is_null) { /* We don't need id and ref_obj_id from this point * onwards anymore, thus we should better reset it, * so that state pruning has chances to take effect. */ reg->id = 0; reg->ref_obj_id = 0; - } else if (!reg_may_point_to_spin_lock(reg)) { + + return; + } + + mark_ptr_not_null_reg(reg); + + if (!reg_may_point_to_spin_lock(reg)) { /* For not-NULL ptr, reg->ref_obj_id will be reset * in release_reg_references(). * @@@ -8401,9 -7945,6 +8401,9 @@@ static int check_return_code(struct bpf env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME) range = tnum_range(1, 1); + if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND || + env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND) + range = tnum_range(0, 3); break; case BPF_PROG_TYPE_CGROUP_SKB: if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) { @@@ -9989,19 -9530,14 +9989,19 @@@ static int do_check(struct bpf_verifier } else if (class == BPF_STX) { enum bpf_reg_type *prev_dst_type, dst_reg_type;
- if (BPF_MODE(insn->code) == BPF_XADD) { - err = check_xadd(env, env->insn_idx, insn); + if (BPF_MODE(insn->code) == BPF_ATOMIC) { + err = check_atomic(env, env->insn_idx, insn); if (err) return err; env->insn_idx++; continue; }
+ if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) { + verbose(env, "BPF_STX uses reserved fields\n"); + return -EINVAL; + } + /* check src1 operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) @@@ -10173,36 -9709,6 +10173,36 @@@ process_bpf_exit return 0; }
+static int find_btf_percpu_datasec(struct btf *btf) +{ + const struct btf_type *t; + const char *tname; + int i, n; + + /* + * Both vmlinux and module each have their own ".data..percpu" + * DATASECs in BTF. So for module's case, we need to skip vmlinux BTF + * types to look at only module's own BTF types. + */ + n = btf_nr_types(btf); + if (btf_is_module(btf)) + i = btf_nr_types(btf_vmlinux); + else + i = 1; + + for(; i < n; i++) { + t = btf_type_by_id(btf, i); + if (BTF_INFO_KIND(t->info) != BTF_KIND_DATASEC) + continue; + + tname = btf_name_by_offset(btf, t->name_off); + if (!strcmp(tname, ".data..percpu")) + return i; + } + + return -ENOENT; +} + /* replace pseudo btf_id with kernel symbol address */ static int check_pseudo_btf_id(struct bpf_verifier_env *env, struct bpf_insn *insn, @@@ -10210,57 -9716,48 +10210,57 @@@ { const struct btf_var_secinfo *vsi; const struct btf_type *datasec; + struct btf_mod_pair *btf_mod; const struct btf_type *t; const char *sym_name; bool percpu = false; u32 type, id = insn->imm; + struct btf *btf; s32 datasec_id; u64 addr; - int i; - - if (!btf_vmlinux) { - verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); - return -EINVAL; - } + int i, btf_fd, err;
- if (insn[1].imm != 0) { - verbose(env, "reserved field (insn[1].imm) is used in pseudo_btf_id ldimm64 insn.\n"); - return -EINVAL; + btf_fd = insn[1].imm; + if (btf_fd) { + btf = btf_get_by_fd(btf_fd); + if (IS_ERR(btf)) { + verbose(env, "invalid module BTF object FD specified.\n"); + return -EINVAL; + } + } else { + if (!btf_vmlinux) { + verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); + return -EINVAL; + } + btf = btf_vmlinux; + btf_get(btf); }
- t = btf_type_by_id(btf_vmlinux, id); + t = btf_type_by_id(btf, id); if (!t) { verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id); - return -ENOENT; + err = -ENOENT; + goto err_put; }
if (!btf_type_is_var(t)) { - verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", - id); - return -EINVAL; + verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", id); + err = -EINVAL; + goto err_put; }
- sym_name = btf_name_by_offset(btf_vmlinux, t->name_off); + sym_name = btf_name_by_offset(btf, t->name_off); addr = kallsyms_lookup_name(sym_name); if (!addr) { verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n", sym_name); - return -ENOENT; + err = -ENOENT; + goto err_put; }
- datasec_id = btf_find_by_name_kind(btf_vmlinux, ".data..percpu", - BTF_KIND_DATASEC); + datasec_id = find_btf_percpu_datasec(btf); if (datasec_id > 0) { - datasec = btf_type_by_id(btf_vmlinux, datasec_id); + datasec = btf_type_by_id(btf, datasec_id); for_each_vsi(i, datasec, vsi) { if (vsi->type == id) { percpu = true; @@@ -10273,10 -9770,10 +10273,10 @@@ insn[1].imm = addr >> 32;
type = t->type; - t = btf_type_skip_modifiers(btf_vmlinux, type, NULL); + t = btf_type_skip_modifiers(btf, type, NULL); if (percpu) { aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID; - aux->btf_var.btf = btf_vmlinux; + aux->btf_var.btf = btf; aux->btf_var.btf_id = type; } else if (!btf_type_is_struct(t)) { const struct btf_type *ret; @@@ -10284,54 -9781,21 +10284,54 @@@ u32 tsize;
/* resolve the type size of ksym. */ - ret = btf_resolve_size(btf_vmlinux, t, &tsize); + ret = btf_resolve_size(btf, t, &tsize); if (IS_ERR(ret)) { - tname = btf_name_by_offset(btf_vmlinux, t->name_off); + tname = btf_name_by_offset(btf, t->name_off); verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n", tname, PTR_ERR(ret)); - return -EINVAL; + err = -EINVAL; + goto err_put; } aux->btf_var.reg_type = PTR_TO_MEM; aux->btf_var.mem_size = tsize; } else { aux->btf_var.reg_type = PTR_TO_BTF_ID; - aux->btf_var.btf = btf_vmlinux; + aux->btf_var.btf = btf; aux->btf_var.btf_id = type; } + + /* check whether we recorded this BTF (and maybe module) already */ + for (i = 0; i < env->used_btf_cnt; i++) { + if (env->used_btfs[i].btf == btf) { + btf_put(btf); + return 0; + } + } + + if (env->used_btf_cnt >= MAX_USED_BTFS) { + err = -E2BIG; + goto err_put; + } + + btf_mod = &env->used_btfs[env->used_btf_cnt]; + btf_mod->btf = btf; + btf_mod->module = NULL; + + /* if we reference variables from kernel module, bump its refcount */ + if (btf_is_module(btf)) { + btf_mod->module = btf_try_get_module(btf); + if (!btf_mod->module) { + err = -ENXIO; + goto err_put; + } + } + + env->used_btf_cnt++; + return 0; +err_put: + btf_put(btf); + return err; }
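check_pseudo_btf_id() above now resolves typed ksyms against module BTF as well as vmlinux BTF, and uses find_btf_percpu_datasec() to classify .data..percpu variables as PTR_TO_PERCPU_BTF_ID. A sketch of how such a ldimm64 usually originates in BPF C, modelled on the ksym selftests; vmlinux.h (generated by bpftool) is assumed to provide struct rq:

/* Sketch modelled on the ksym selftests: a typed __ksym extern becomes a
 * BPF_PSEUDO_BTF_ID ldimm64 resolved by check_pseudo_btf_id().  'runqueues'
 * lives in .data..percpu, so the resulting register is PTR_TO_PERCPU_BTF_ID
 * and must be turned into a usable pointer via bpf_per_cpu_ptr() first.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

extern const struct rq runqueues __ksym;

SEC("raw_tp/sys_enter")
int read_rq(void *ctx)
{
	struct rq *rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, 0);

	if (!rq)
		return 0;
	return rq->nr_running != 0;	/* direct BTF-typed read */
}

char _license[] SEC("license") = "GPL";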
static int check_map_prealloc(struct bpf_map *map) @@@ -10433,22 -9897,15 +10433,22 @@@ static int check_map_prog_compatibility case BPF_MAP_TYPE_HASH: case BPF_MAP_TYPE_LRU_HASH: case BPF_MAP_TYPE_ARRAY: + case BPF_MAP_TYPE_PERCPU_HASH: + case BPF_MAP_TYPE_PERCPU_ARRAY: + case BPF_MAP_TYPE_LRU_PERCPU_HASH: + case BPF_MAP_TYPE_ARRAY_OF_MAPS: + case BPF_MAP_TYPE_HASH_OF_MAPS: if (!is_preallocated_map(map)) { verbose(env, - "Sleepable programs can only use preallocated hash maps\n"); + "Sleepable programs can only use preallocated maps\n"); return -EINVAL; } break; + case BPF_MAP_TYPE_RINGBUF: + break; default: verbose(env, - "Sleepable programs can only use array and hash maps\n"); + "Sleepable programs can only use array, hash, and ringbuf maps\n"); return -EINVAL; }
@@@ -10485,6 -9942,13 +10485,6 @@@ static int resolve_pseudo_ldimm64(struc return -EINVAL; }
- if (BPF_CLASS(insn->code) == BPF_STX && - ((BPF_MODE(insn->code) != BPF_MEM && - BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) { - verbose(env, "BPF_STX uses reserved fields\n"); - return -EINVAL; - } - if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { struct bpf_insn_aux_data *aux; struct bpf_map *map; @@@ -10628,13 -10092,6 +10628,13 @@@ static void release_maps(struct bpf_ver env->used_map_cnt); }
+/* drop refcnt of maps used by the rejected program */ +static void release_btfs(struct bpf_verifier_env *env) +{ + __bpf_free_used_btfs(env->prog->aux, env->used_btfs, + env->used_btf_cnt); +} + /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) { @@@ -11006,7 -10463,6 +11006,7 @@@ static int opt_subreg_zext_lo32_rnd_hi3 for (i = 0; i < len; i++) { int adj_idx = i + delta; struct bpf_insn insn; + u8 load_reg;
insn = insns[adj_idx]; if (!aux[adj_idx].zext_dst) { @@@ -11049,27 -10505,9 +11049,27 @@@ if (!bpf_jit_needs_zext()) continue;
+ /* zext_dst means that we want to zero-extend whatever register + * the insn defines, which is dst_reg most of the time, with + * the notable exception of BPF_STX + BPF_ATOMIC + BPF_FETCH. + */ + if (BPF_CLASS(insn.code) == BPF_STX && + BPF_MODE(insn.code) == BPF_ATOMIC) { + /* BPF_STX + BPF_ATOMIC insns without BPF_FETCH do not + * define any registers, therefore zext_dst cannot be + * set. + */ + if (WARN_ON(!(insn.imm & BPF_FETCH))) + return -EINVAL; + load_reg = insn.imm == BPF_CMPXCHG ? BPF_REG_0 + : insn.src_reg; + } else { + load_reg = insn.dst_reg; + } + zext_patch[0] = insn; - zext_patch[1].dst_reg = insn.dst_reg; - zext_patch[1].src_reg = insn.dst_reg; + zext_patch[1].dst_reg = load_reg; + zext_patch[1].src_reg = load_reg; patch = zext_patch; patch_len = 2; apply_patch_buffer: @@@ -11285,7 -10723,8 +11285,7 @@@ static int jit_subprogs(struct bpf_veri return 0;
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; /* Upon error here we cannot fall back to interpreter but * need a hard reject of the program. Thus -EFAULT is @@@ -11326,7 -10765,7 +11326,7 @@@ /* BPF_PROG_RUN doesn't call subprogs directly, * hence main prog stats include the runtime of subprogs. * subprogs don't have IDs and not reachable via prog_get_next_id - * func[i]->aux->stats will never be accessed and stays NULL + * func[i]->stats will never be accessed and stays NULL */ func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER); if (!func[i]) @@@ -11414,7 -10853,8 +11414,7 @@@ for (i = 0; i < env->subprog_cnt; i++) { insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; subprog = insn->off; insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) - @@@ -11459,7 -10899,8 +11459,7 @@@ * later look the same as if they were interpreted only. */ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; insn->off = env->insn_aux_data[i].call_imm; subprog = find_subprog(env, i + insn->off + 1); @@@ -11488,7 -10929,8 +11488,7 @@@ out_undo_insn /* cleanup main prog to be interpreted */ prog->jit_requested = 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; insn->off = 0; insn->imm = env->insn_aux_data[i].call_imm; @@@ -11523,7 -10965,8 +11523,7 @@@ static int fixup_call_args(struct bpf_v return -EINVAL; } for (i = 0; i < prog->len; i++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; depth = get_callee_stack_depth(env, insn, i); if (depth < 0) @@@ -11563,7 -11006,7 +11563,7 @@@ static int fixup_bpf_calls(struct bpf_v bool isdiv = BPF_OP(insn->code) == BPF_DIV; struct bpf_insn *patchlet; struct bpf_insn chk_and_div[] = { - /* Rx div 0 -> 0 */ + /* [R,W]x div 0 -> 0 */ BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | BPF_JNE | BPF_K, insn->src_reg, 0, 2, 0), @@@ -11572,16 -11015,18 +11572,18 @@@ *insn, }; struct bpf_insn chk_and_mod[] = { - /* Rx mod 0 -> Rx */ + /* [R,W]x mod 0 -> [R,W]x */ BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | BPF_JEQ | BPF_K, insn->src_reg, - 0, 1, 0), + 0, 1 + (is64 ? 0 : 1), 0), *insn, + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + BPF_MOV32_REG(insn->dst_reg, insn->dst_reg), };
patchlet = isdiv ? chk_and_div : chk_and_mod; cnt = isdiv ? ARRAY_SIZE(chk_and_div) : - ARRAY_SIZE(chk_and_mod); + ARRAY_SIZE(chk_and_mod) - (is64 ? 2 : 0);
new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); if (!new_prog) @@@ -11986,13 -11431,6 +11988,13 @@@ static int do_check_common(struct bpf_v mark_reg_known_zero(env, regs, i); else if (regs[i].type == SCALAR_VALUE) mark_reg_unknown(env, regs, i); + else if (regs[i].type == PTR_TO_MEM_OR_NULL) { + const u32 mem_size = regs[i].mem_size; + + mark_reg_known_zero(env, regs, i); + regs[i].mem_size = mem_size; + regs[i].id = ++env->id_gen; + } } } else { /* 1st arg to a function */ @@@ -12571,7 -12009,6 +12573,7 @@@ int bpf_check(struct bpf_prog **prog, u env->strict_alignment = false;
env->allow_ptr_leaks = bpf_allow_ptr_leaks(); + env->allow_uninit_stack = bpf_allow_uninit_stack(); env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access(); env->bypass_spec_v1 = bpf_bypass_spec_v1(); env->bypass_spec_v4 = bpf_bypass_spec_v4(); @@@ -12667,10 -12104,7 +12669,10 @@@ skip_full_check goto err_release_maps; }
- if (ret == 0 && env->used_map_cnt) { + if (ret) + goto err_release_maps; + + if (env->used_map_cnt) { /* if program passed verifier, update used_maps in bpf_prog_info */ env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt, sizeof(env->used_maps[0]), @@@ -12684,29 -12118,15 +12686,29 @@@ memcpy(env->prog->aux->used_maps, env->used_maps, sizeof(env->used_maps[0]) * env->used_map_cnt); env->prog->aux->used_map_cnt = env->used_map_cnt; + } + if (env->used_btf_cnt) { + /* if program passed verifier, update used_btfs in bpf_prog_aux */ + env->prog->aux->used_btfs = kmalloc_array(env->used_btf_cnt, + sizeof(env->used_btfs[0]), + GFP_KERNEL); + if (!env->prog->aux->used_btfs) { + ret = -ENOMEM; + goto err_release_maps; + }
+ memcpy(env->prog->aux->used_btfs, env->used_btfs, + sizeof(env->used_btfs[0]) * env->used_btf_cnt); + env->prog->aux->used_btf_cnt = env->used_btf_cnt; + } + if (env->used_map_cnt || env->used_btf_cnt) { /* program is valid. Convert pseudo bpf_ld_imm64 into generic * bpf_ld_imm64 instructions */ convert_pseudo_ld_imm64(env); }
- if (ret == 0) - adjust_btf_func(env); + adjust_btf_func(env);
err_release_maps: if (!env->prog->aux->used_maps) @@@ -12714,8 -12134,6 +12716,8 @@@ * them now. Otherwise free_used_maps() will release them. */ release_maps(env); + if (!env->prog->aux->used_btfs) + release_btfs(env);
/* extension progs temporarily inherit the attach_type of their targets for verification purposes, so set it back to zero before returning diff --combined net/core/flow_dissector.c index c565c7a17091,0b4f536bc32d..2ef2224b3bff --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@@ -23,7 -23,6 +23,7 @@@ #include <linux/if_ether.h> #include <linux/mpls.h> #include <linux/tcp.h> +#include <linux/ptp_classify.h> #include <net/flow_dissector.h> #include <scsi/fc/fc_fcoe.h> #include <uapi/linux/batadv_packet.h> @@@ -237,8 -236,9 +237,8 @@@ skb_flow_dissect_set_enc_addr_type(enu void skb_flow_dissect_ct(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, - u16 *ctinfo_map, - size_t mapsize) + void *target_container, u16 *ctinfo_map, + size_t mapsize, bool post_ct) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) struct flow_dissector_key_ct *key; @@@ -250,19 -250,13 +250,19 @@@ return;
ct = nf_ct_get(skb, &ctinfo); - if (!ct) + if (!ct && !post_ct) return;
key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CT, target_container);
+ if (!ct) { + key->ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | + TCA_FLOWER_KEY_CT_FLAGS_INVALID; + return; + } + if (ctinfo < mapsize) key->ct_state = ctinfo_map[ctinfo]; #if IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) @@@ -1056,6 -1050,9 +1056,9 @@@ proto_again key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; }
+ __skb_flow_dissect_ipv4(skb, flow_dissector, + target_container, data, iph); + if (ip_is_fragment(iph)) { key_control->flags |= FLOW_DIS_IS_FRAGMENT;
@@@ -1072,9 -1069,6 +1075,6 @@@ } }
- __skb_flow_dissect_ipv4(skb, flow_dissector, - target_container, data, iph); - break; } case htons(ETH_P_IPV6): { @@@ -1257,21 -1251,6 +1257,21 @@@ &proto, &nhoff, hlen, flags); break;
+ case htons(ETH_P_1588): { + struct ptp_header *hdr, _hdr; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, + hlen, &_hdr); + if (!hdr) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + nhoff += ntohs(hdr->message_length); + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + default: fdret = FLOW_DISSECT_RET_OUT_BAD; break; diff --combined net/mptcp/options.c index bb874c5d663a,8fec3dabe109..b63574d6b812 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@@ -282,15 -282,6 +282,15 @@@ static void mptcp_parse_option(const st pr_debug("RM_ADDR: id=%d", mp_opt->rm_id); break;
+ case MPTCPOPT_MP_PRIO: + if (opsize != TCPOLEN_MPTCP_PRIO) + break; + + mp_opt->mp_prio = 1; + mp_opt->backup = *ptr++ & MPTCP_PRIO_BKUP; + pr_debug("MP_PRIO: prio=%d", mp_opt->backup); + break; + case MPTCPOPT_MP_FASTCLOSE: if (opsize != TCPOLEN_MPTCP_FASTCLOSE) break; @@@ -322,7 -313,6 +322,7 @@@ void mptcp_get_options(const struct sk_ mp_opt->port = 0; mp_opt->rm_addr = 0; mp_opt->dss = 0; + mp_opt->mp_prio = 0;
length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (const unsigned char *)(th + 1); @@@ -508,8 -498,8 +508,8 @@@ static bool mptcp_established_options_d { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); + u64 snd_data_fin_enable, ack_seq; unsigned int dss_size = 0; - u64 snd_data_fin_enable; struct mptcp_ext *mpext; unsigned int ack_size; bool ret = false; @@@ -541,13 -531,14 +541,14 @@@ return ret; }
+ ack_seq = READ_ONCE(msk->ack_seq); if (READ_ONCE(msk->use_64bit_ack)) { ack_size = TCPOLEN_MPTCP_DSS_ACK64; - opts->ext_copy.data_ack = READ_ONCE(msk->ack_seq); + opts->ext_copy.data_ack = ack_seq; opts->ext_copy.ack64 = 1; } else { ack_size = TCPOLEN_MPTCP_DSS_ACK32; - opts->ext_copy.data_ack32 = (uint32_t)READ_ONCE(msk->ack_seq); + opts->ext_copy.data_ack32 = (uint32_t)ack_seq; opts->ext_copy.ack64 = 0; } opts->ext_copy.use_ack = 1; @@@ -689,29 -680,6 +690,29 @@@ static bool mptcp_established_options_r return true; }
+static bool mptcp_established_options_mp_prio(struct sock *sk, + unsigned int *size, + unsigned int remaining, + struct mptcp_out_options *opts) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + if (!subflow->send_mp_prio) + return false; + + /* account for the trailing 'nop' option */ + if (remaining < TCPOLEN_MPTCP_PRIO_ALIGN) + return false; + + *size = TCPOLEN_MPTCP_PRIO_ALIGN; + opts->suboptions |= OPTION_MPTCP_PRIO; + opts->backup = subflow->request_bkup; + + pr_debug("prio=%d", opts->backup); + + return true; +} + bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) @@@ -754,12 -722,6 +755,12 @@@ ret = true; }
+ if (mptcp_established_options_mp_prio(sk, &opt_size, remaining, opts)) { + *size += opt_size; + remaining -= opt_size; + ret = true; + } + return ret; }
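For reference, the MP_PRIO suboption reserved above (TCPOLEN_MPTCP_PRIO_ALIGN bytes) is emitted by the option writer later in this file via mptcp_option(MPTCPOPT_MP_PRIO, TCPOLEN_MPTCP_PRIO, opts->backup, TCPOPT_NOP). My reading of that encoding together with the RFC 8684 MP_PRIO layout gives the following bytes for a backup request (hand-derived, not taken from a capture):

/* Hand-derived from mptcp_option() and RFC 8684, not from a capture: the
 * four option bytes carrying an MP_PRIO with the backup bit set.
 */
static const unsigned char mp_prio_backup[] = {
	30,		/* kind: TCPOPT_MPTCP */
	3,		/* length: TCPOLEN_MPTCP_PRIO */
	(5 << 4) | 1,	/* subtype MPTCPOPT_MP_PRIO, flags nibble with the B (backup) bit */
	1,		/* TCPOPT_NOP, padding up to TCPOLEN_MPTCP_PRIO_ALIGN */
};

On the receive side, the MPTCPOPT_MP_PRIO case added earlier in this file recovers the flag by masking that third byte with MPTCP_PRIO_BKUP.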
@@@ -867,7 -829,7 +868,7 @@@ fully_established clear_3rdack_retransmission(ssk); mptcp_pm_subflow_established(msk, subflow); } else { - mptcp_pm_fully_established(msk); + mptcp_pm_fully_established(msk, ssk, GFP_ATOMIC); } return true;
@@@ -918,8 -880,7 +919,7 @@@ static void ack_update_msk(struct mptcp msk->wnd_end = new_wnd_end;
/* this assumes mptcp_incoming_options() is invoked after tcp_ack() */ - if (after64(msk->wnd_end, READ_ONCE(msk->snd_nxt)) && - sk_stream_memory_free(ssk)) + if (after64(msk->wnd_end, READ_ONCE(msk->snd_nxt))) __mptcp_check_push(sk, ssk);
if (after64(new_snd_una, old_snd_una)) { @@@ -1025,10 -986,6 +1025,10 @@@ void mptcp_incoming_options(struct soc mptcp_pm_del_add_timer(msk, &addr); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADD); } + + if (mp_opt.port) + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_PORTADD); + mp_opt.add_addr = 0; }
@@@ -1037,12 -994,6 +1037,12 @@@ mp_opt.rm_addr = 0; }
+ if (mp_opt.mp_prio) { + mptcp_pm_mp_prio_received(sk, mp_opt.backup); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPPRIORX); + mp_opt.mp_prio = 0; + } + if (!mp_opt.dss) return;
@@@ -1217,18 -1168,6 +1217,18 @@@ mp_capable_done 0, opts->rm_id); }
+ if (OPTION_MPTCP_PRIO & opts->suboptions) { + const struct sock *ssk = (const struct sock *)tp; + struct mptcp_subflow_context *subflow; + + subflow = mptcp_subflow_ctx(ssk); + subflow->send_mp_prio = 0; + + *ptr++ = mptcp_option(MPTCPOPT_MP_PRIO, + TCPOLEN_MPTCP_PRIO, + opts->backup, TCPOPT_NOP); + } + if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) { *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, TCPOLEN_MPTCP_MPJ_SYN, diff --combined net/mptcp/protocol.c index c2a8392254dc,06da6ad31c87..a57f3eab7b6a --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@@ -45,14 -45,11 +45,14 @@@ static struct percpu_counter mptcp_sock static void __mptcp_destroy_sock(struct sock *sk); static void __mptcp_check_send_data_fin(struct sock *sk);
+DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); +static struct net_device mptcp_napi_dev; + /* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not * completed yet or has failed, return the subflow socket. * Otherwise return NULL. */ -static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) +struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) { if (!msk->subflow || READ_ONCE(msk->can_ack)) return NULL; @@@ -117,7 -114,11 +117,7 @@@ static int __mptcp_socket_create(struc list_add(&subflow->node, &msk->conn_list); sock_hold(ssock->sk); subflow->request_mptcp = 1; - - /* accept() will wait on first subflow sk_wq, and we always wakes up - * via msk->sk_socket - */ - RCU_INIT_POINTER(msk->first->sk_wq, &sk->sk_socket->wq); + mptcp_sock_graft(msk->first, sk->sk_socket);
return 0; } @@@ -363,8 -364,6 +363,6 @@@ static void mptcp_check_data_fin_ack(st
/* Look for an acknowledged DATA_FIN */ if (mptcp_pending_data_fin_ack(sk)) { - mptcp_stop_timer(sk); - WRITE_ONCE(msk->snd_data_fin_enable, 0);
switch (sk->sk_state) { @@@ -458,7 -457,18 +456,18 @@@ static bool mptcp_subflow_cleanup_rbuf( static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) { struct sock *ack_hint = READ_ONCE(msk->ack_hint); + int old_space = READ_ONCE(msk->old_wspace); struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + bool cleanup; + + /* this is a simple superset of what tcp_cleanup_rbuf() implements + * so that we don't have to acquire the ssk socket lock most of the time + * to do actually nothing + */ + cleanup = __mptcp_space(sk) - old_space >= max(0, old_space); + if (!cleanup) + return;
/* if the hinted ssk is still active, try to use it */ if (likely(ack_hint)) { @@@ -733,14 -743,10 +742,14 @@@ wake
void __mptcp_flush_join_list(struct mptcp_sock *msk) { + struct mptcp_subflow_context *subflow; + if (likely(list_empty(&msk->join_list))) return;
spin_lock_bh(&msk->join_list_lock); + list_for_each_entry(subflow, &msk->join_list, node) + mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow)); list_splice_tail_init(&msk->join_list, &msk->conn_list); spin_unlock_bh(&msk->join_list_lock); } @@@ -1040,6 -1046,13 +1049,6 @@@ out __mptcp_update_wmem(sk); sk_mem_reclaim_partial(sk); } - - if (sk_stream_is_writeable(sk)) { - /* pairs with memory barrier in mptcp_poll */ - smp_mb(); - if (test_and_clear_bit(MPTCP_NOSPACE, &msk->flags)) - sk_stream_write_space(sk); - } }
if (snd_una == READ_ONCE(msk->snd_nxt)) { @@@ -1358,7 -1371,8 +1367,7 @@@ struct subflow_send_info u64 ratio; };
-static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk, - u32 *sndbuf) +static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) { struct subflow_send_info send_info[2]; struct mptcp_subflow_context *subflow; @@@ -1369,17 -1383,24 +1378,17 @@@
sock_owned_by_me((struct sock *)msk);
- *sndbuf = 0; if (__mptcp_check_fallback(msk)) { if (!msk->first) return NULL; - *sndbuf = msk->first->sk_sndbuf; return sk_stream_memory_free(msk->first) ? msk->first : NULL; }
/* re-use last subflow, if the burst allow that */ if (msk->last_snd && msk->snd_burst > 0 && sk_stream_memory_free(msk->last_snd) && - mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) { - mptcp_for_each_subflow(msk, subflow) { - ssk = mptcp_subflow_tcp_sock(subflow); - *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf); - } + mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) return msk->last_snd; - }
/* pick the subflow with the lower wmem/wspace ratio */ for (i = 0; i < 2; ++i) { @@@ -1392,7 -1413,8 +1401,7 @@@ continue;
nr_active += !subflow->backup; - *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf); - if (!sk_stream_memory_free(subflow->tcp_sock)) + if (!sk_stream_memory_free(subflow->tcp_sock) || !tcp_sk(ssk)->snd_wnd) continue;
pace = READ_ONCE(ssk->sk_pacing_rate); @@@ -1418,10 -1440,9 +1427,10 @@@ if (send_info[0].ssk) { msk->last_snd = send_info[0].ssk; msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE, - sk_stream_wspace(msk->last_snd)); + tcp_sk(msk->last_snd)->snd_wnd); return msk->last_snd; } + return NULL; }
@@@ -1442,6 -1463,7 +1451,6 @@@ static void mptcp_push_pending(struct s }; struct mptcp_data_frag *dfrag; int len, copied = 0; - u32 sndbuf;
while ((dfrag = mptcp_send_head(sk))) { info.sent = dfrag->already_sent; @@@ -1452,7 -1474,12 +1461,7 @@@
prev_ssk = ssk; __mptcp_flush_join_list(msk); - ssk = mptcp_subflow_get_send(msk, &sndbuf); - - /* do auto tuning */ - if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && - sndbuf > READ_ONCE(sk->sk_sndbuf)) - WRITE_ONCE(sk->sk_sndbuf, sndbuf); + ssk = mptcp_subflow_get_send(msk);
/* try to keep the subflow socket lock across * consecutive xmit on the same socket @@@ -1509,9 -1536,7 +1518,9 @@@ static void __mptcp_subflow_push_pendin struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sendmsg_info info; struct mptcp_data_frag *dfrag; + struct sock *xmit_ssk; int len, copied = 0; + bool first = true;
info.flags = 0; while ((dfrag = mptcp_send_head(sk))) { @@@ -1521,17 -1546,10 +1530,17 @@@ while (len > 0) { int ret = 0;
- /* do auto tuning */ - if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && - ssk->sk_sndbuf > READ_ONCE(sk->sk_sndbuf)) - WRITE_ONCE(sk->sk_sndbuf, ssk->sk_sndbuf); + /* the caller already invoked the packet scheduler, + * check for a different subflow usage only after + * spooling the first chunk of data + */ + xmit_ssk = first ? ssk : mptcp_subflow_get_send(mptcp_sk(sk)); + if (!xmit_ssk) + goto out; + if (xmit_ssk != ssk) { + mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk)); + goto out; + }
if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) { __mptcp_update_wmem(sk); @@@ -1551,7 -1569,6 +1560,7 @@@ msk->tx_pending_data -= ret; copied += ret; len -= ret; + first = false; } WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); } @@@ -1565,21 -1582,15 +1574,24 @@@ out mptcp_set_timeout(sk, ssk); tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); + if (!mptcp_timer_pending(sk)) + mptcp_reset_timer(sk); + if (msk->snd_data_fin_enable && msk->snd_nxt + 1 == msk->write_seq) mptcp_schedule_work(sk); } }
+static void mptcp_set_nospace(struct sock *sk) +{ + /* enable autotune */ + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + + /* will be cleared on avail space */ + set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags); +} + static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk = mptcp_sk(sk); @@@ -1681,7 -1692,7 +1693,7 @@@ continue;
wait_for_memory: - set_bit(MPTCP_NOSPACE, &msk->flags); + mptcp_set_nospace(sk); mptcp_push_pending(sk, msg->msg_flags); ret = sk_stream_wait_memory(sk, &timeo); if (ret) @@@ -1868,7 -1879,7 +1880,7 @@@ static void __mptcp_splice_receive_queu skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue); }
- static bool __mptcp_move_skbs(struct mptcp_sock *msk, unsigned int rcv) + static bool __mptcp_move_skbs(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; unsigned int moved = 0; @@@ -1888,13 -1899,10 +1900,10 @@@
slowpath = lock_sock_fast(ssk); mptcp_data_lock(sk); + __mptcp_update_rmem(sk); done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); mptcp_data_unlock(sk); - if (moved && rcv) { - WRITE_ONCE(msk->rmem_pending, min(rcv, moved)); - tcp_cleanup_rbuf(ssk, 1); - WRITE_ONCE(msk->rmem_pending, 0); - } + tcp_cleanup_rbuf(ssk, moved); unlock_sock_fast(ssk, slowpath); } while (!done);
@@@ -1907,6 -1915,7 +1916,7 @@@ ret |= __mptcp_ofo_queue(msk); __mptcp_splice_receive_queue(sk); mptcp_data_unlock(sk); + mptcp_cleanup_rbuf(msk); } if (ret) mptcp_check_data_fin((struct sock *)msk); @@@ -1936,7 -1945,7 +1946,7 @@@ static int mptcp_recvmsg(struct sock *s target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
while (copied < len) { - int bytes_read, old_space; + int bytes_read;
bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied); if (unlikely(bytes_read < 0)) { @@@ -1947,14 -1956,11 +1957,11 @@@
copied += bytes_read;
- if (skb_queue_empty(&msk->receive_queue) && - __mptcp_move_skbs(msk, len - copied)) - continue; - /* be sure to advertise window change */ - old_space = READ_ONCE(msk->old_wspace); - if ((tcp_space(sk) - old_space) >= old_space) - mptcp_cleanup_rbuf(msk); + mptcp_cleanup_rbuf(msk); + + if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk)) + continue;
/* only the master socket status is relevant here. The exit * conditions mirror closely tcp_recvmsg() @@@ -1982,7 -1988,7 +1989,7 @@@ /* race breaker: the shutdown could be after the * previous receive queue check */ - if (__mptcp_move_skbs(msk, len - copied)) + if (__mptcp_move_skbs(msk)) continue; break; } @@@ -2015,7 -2021,7 +2022,7 @@@ /* .. race-breaker: ssk might have gotten new data * after last __mptcp_move_skbs() returned false. */ - if (unlikely(__mptcp_move_skbs(msk, 0))) + if (unlikely(__mptcp_move_skbs(msk))) set_bit(MPTCP_DATA_READY, &msk->flags); } else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) { /* data to read but mptcp_wait_data() cleared DATA_READY */ @@@ -2114,9 -2120,12 +2121,9 @@@ static struct sock *mptcp_subflow_get_r * so we need to use tcp_close() after detaching them from the mptcp * parent socket. */ -void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, - struct mptcp_subflow_context *subflow) +static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, + struct mptcp_subflow_context *subflow) { - bool dispose_socket = false; - struct socket *sock; - list_del(&subflow->node);
lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); @@@ -2124,8 -2133,11 +2131,8 @@@ /* if we are invoked by the msk cleanup code, the subflow is * already orphaned */ - sock = ssk->sk_socket; - if (sock) { - dispose_socket = sock != sk->sk_socket; + if (ssk->sk_socket) sock_orphan(ssk); - }
subflow->disposable = 1;
@@@ -2143,40 -2155,59 +2150,40 @@@ __sock_put(ssk); } release_sock(ssk); - if (dispose_socket) - iput(SOCK_INODE(sock));
sock_put(ssk); }
-static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) +void mptcp_close_ssk(struct sock *sk, struct sock *ssk, + struct mptcp_subflow_context *subflow) { - return 0; + if (sk->sk_state == TCP_ESTABLISHED) + mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL); + __mptcp_close_ssk(sk, ssk, subflow); }
-static void pm_work(struct mptcp_sock *msk) +static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) { - struct mptcp_pm_data *pm = &msk->pm; - - spin_lock_bh(&msk->pm.lock); - - pr_debug("msk=%p status=%x", msk, pm->status); - if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) { - pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); - mptcp_pm_nl_add_addr_received(msk); - } - if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) { - pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK); - mptcp_pm_nl_add_addr_send_ack(msk); - } - if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) { - pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED); - mptcp_pm_nl_rm_addr_received(msk); - } - if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { - pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); - mptcp_pm_nl_fully_established(msk); - } - if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) { - pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED); - mptcp_pm_nl_subflow_established(msk); - } - - spin_unlock_bh(&msk->pm.lock); + return 0; }
static void __mptcp_close_subflow(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow, *tmp;
+ might_sleep(); + list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
if (inet_sk_state_load(ssk) != TCP_CLOSE) continue;
- __mptcp_close_ssk((struct sock *)msk, ssk, subflow); + /* 'subflow_data_ready' will re-sched once rx queue is empty */ + if (!skb_queue_empty_lockless(&ssk->sk_receive_queue)) + continue; + + mptcp_close_ssk((struct sock *)msk, ssk, subflow); } }
@@@ -2248,8 -2279,11 +2255,8 @@@ static void mptcp_worker(struct work_st
mptcp_check_fastclose(msk);
- if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) - __mptcp_close_subflow(msk); - if (msk->pm.status) - pm_work(msk); + mptcp_pm_nl_work(msk);
if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) mptcp_check_for_eof(msk); @@@ -2269,12 -2303,10 +2276,13 @@@ goto unlock; }
+ if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) + __mptcp_close_subflow(msk); + if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) goto unlock;
+ __mptcp_clean_una(sk); dfrag = mptcp_rtx_head(sk); if (!dfrag) goto unlock; @@@ -2512,14 -2544,6 +2520,14 @@@ static void __mptcp_destroy_sock(struc
pr_debug("msk=%p", msk);
+ might_sleep(); + + /* dispose the ancillary tcp socket, if any */ + if (msk->subflow) { + iput(SOCK_INODE(msk->subflow)); + msk->subflow = NULL; + } + /* be sure to always acquire the join list lock, to sync vs * mptcp_finish_join(). */ @@@ -2570,10 -2594,20 +2578,10 @@@ cleanup inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32; list_for_each_entry(subflow, &mptcp_sk(sk)->conn_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - bool slow, dispose_socket; - struct socket *sock; + bool slow = lock_sock_fast(ssk);
- slow = lock_sock_fast(ssk); - sock = ssk->sk_socket; - dispose_socket = sock && sock != sk->sk_socket; sock_orphan(ssk); unlock_sock_fast(ssk, slow); - - /* for the outgoing subflows we additionally need to free - * the associated socket - */ - if (dispose_socket) - iput(SOCK_INODE(sock)); } sock_orphan(sk);
@@@ -2588,10 -2622,6 +2596,10 @@@ release_sock(sk); if (do_cancel_work) mptcp_cancel_work(sk); + + if (mptcp_sk(sk)->token) + mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL); + sock_put(sk); }
@@@ -2906,16 -2936,10 +2914,16 @@@ void __mptcp_check_push(struct sock *sk if (!mptcp_send_head(sk)) return;
- if (!sock_owned_by_user(sk)) - __mptcp_subflow_push_pending(sk, ssk); - else + if (!sock_owned_by_user(sk)) { + struct sock *xmit_ssk = mptcp_subflow_get_send(mptcp_sk(sk)); + + if (xmit_ssk == ssk) + __mptcp_subflow_push_pending(sk, ssk); + else if (xmit_ssk) + mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk)); + } else { set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags); + } }
#define MPTCP_DEFERRED_ALL (TCPF_WRITE_TIMER_DEFERRED) @@@ -2943,6 -2967,8 +2951,8 @@@ static void mptcp_release_cb(struct soc mptcp_push_pending(sk, 0); spin_lock_bh(&sk->sk_lock.slock); } + if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags)) + __mptcp_error_report(sk);
/* clear any wmem reservation and errors */ __mptcp_update_wmem(sk); @@@ -2963,20 -2989,6 +2973,20 @@@ } }
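The MPTCP_ERROR_REPORT handling just added to mptcp_release_cb() follows the usual MPTCP deferral pattern: while the msk socket is owned by user context, softirq code only sets a flag, and the actual work is drained later from the release callback. A minimal user-space sketch of that control flow, assuming toy names (fake_msk, FLAG_*) rather than the kernel API:

#include <stdatomic.h>
#include <stdio.h>

#define FLAG_PUSH_PENDING (1u << 0) /* stands in for MPTCP_PUSH_PENDING */
#define FLAG_ERROR_REPORT (1u << 1) /* stands in for MPTCP_ERROR_REPORT */

struct fake_msk {
        atomic_uint flags;      /* deferred-work bits */
        int owned_by_user;      /* models sock_owned_by_user() */
};

/* softirq side: act now if possible, otherwise leave a flag behind */
static void report_subflow_error(struct fake_msk *msk)
{
        if (!msk->owned_by_user) {
                puts("propagating subflow error immediately");
                return;
        }
        atomic_fetch_or(&msk->flags, FLAG_ERROR_REPORT);
}

/* release_sock() side: drain whatever was deferred */
static void release_cb(struct fake_msk *msk)
{
        unsigned int pending = atomic_exchange(&msk->flags, 0);

        if (pending & FLAG_PUSH_PENDING)
                puts("deferred: pushing pending data");
        if (pending & FLAG_ERROR_REPORT)
                puts("deferred: propagating subflow error");
}

int main(void)
{
        struct fake_msk msk = { .flags = 0, .owned_by_user = 1 };

        report_subflow_error(&msk); /* deferred: the socket is owned */
        msk.owned_by_user = 0;
        release_cb(&msk);           /* prints the deferred error-report line */
        return 0;
}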
+void mptcp_subflow_process_delegated(struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct sock *sk = subflow->conn; + + mptcp_data_lock(sk); + if (!sock_owned_by_user(sk)) + __mptcp_subflow_push_pending(sk, ssk); + else + set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags); + mptcp_data_unlock(sk); + mptcp_subflow_delegated_done(subflow); +} + static int mptcp_hash(struct sock *sk) { /* should never be called, @@@ -3034,12 -3046,12 +3044,12 @@@ void mptcp_finish_connect(struct sock * WRITE_ONCE(msk->can_ack, 1); WRITE_ONCE(msk->snd_una, msk->write_seq);
- mptcp_pm_new_connection(msk, 0); + mptcp_pm_new_connection(msk, ssk, 0);
mptcp_rcv_space_init(msk, ssk); }
-static void mptcp_sock_graft(struct sock *sk, struct socket *parent) +void mptcp_sock_graft(struct sock *sk, struct socket *parent) { write_lock_bh(&sk->sk_callback_lock); rcu_assign_pointer(sk->sk_wq, &parent->wq); @@@ -3063,7 -3075,7 +3073,7 @@@ bool mptcp_finish_join(struct sock *ssk return false;
if (!msk->pm.server_side) - return true; + goto out;
if (!mptcp_pm_allow_new_subflow(msk)) return false; @@@ -3090,8 -3102,6 +3100,8 @@@ if (parent_sock && !ssk->sk_socket) mptcp_sock_graft(ssk, parent_sock); subflow->map_seq = READ_ONCE(msk->ack_seq); +out: + mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC); return true; }
@@@ -3268,8 -3278,9 +3278,8 @@@ static int mptcp_stream_accept(struct s struct mptcp_sock *msk = mptcp_sk(newsock->sk); struct mptcp_subflow_context *subflow; struct sock *newsk = newsock->sk; - bool slowpath;
- slowpath = lock_sock_fast(newsk); + lock_sock(newsk);
/* PM/worker can now acquire the first subflow socket * lock without racing with listener queue cleanup, @@@ -3279,11 -3290,10 +3289,11 @@@ list_add(&subflow->node, &msk->conn_list); sock_hold(msk->first); if (mptcp_is_fully_established(newsk)) - mptcp_pm_fully_established(msk); + mptcp_pm_fully_established(msk, msk->first, GFP_KERNEL);
mptcp_copy_inaddrs(newsk, msk->first); mptcp_rcv_space_init(msk, msk->first); + mptcp_propagate_sndbuf(newsk, msk->first);
/* set ssk->sk_socket of accept()ed flows to mptcp socket. * This is needed so NOSPACE flag can be set from tcp stack. @@@ -3295,7 -3305,7 +3305,7 @@@ if (!ssk->sk_socket) mptcp_sock_graft(ssk, newsock); } - unlock_sock_fast(newsk, slowpath); + release_sock(newsk); }
if (inet_csk_listen_poll(ssock->sk)) @@@ -3319,12 -3329,12 +3329,12 @@@ static __poll_t mptcp_check_writeable(s struct sock *sk = (struct sock *)msk;
if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN)) - return 0; + return EPOLLOUT | EPOLLWRNORM;
if (sk_stream_is_writeable(sk)) return EPOLLOUT | EPOLLWRNORM;
- set_bit(MPTCP_NOSPACE, &msk->flags); + mptcp_set_nospace(sk); smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ if (sk_stream_is_writeable(sk)) return EPOLLOUT | EPOLLWRNORM; @@@ -3352,9 -3362,16 +3362,16 @@@ static __poll_t mptcp_poll(struct file mask |= mptcp_check_readable(msk); mask |= mptcp_check_writeable(msk); } + if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) + mask |= EPOLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
+ /* This barrier is coupled with smp_wmb() in tcp_reset() */ + smp_rmb(); + if (sk->sk_err) + mask |= EPOLLERR; + return mask; }
@@@ -3388,58 -3405,13 +3405,58 @@@ static struct inet_protosw mptcp_protos .flags = INET_PROTOSW_ICSK, };
+static int mptcp_napi_poll(struct napi_struct *napi, int budget) +{ + struct mptcp_delegated_action *delegated; + struct mptcp_subflow_context *subflow; + int work_done = 0; + + delegated = container_of(napi, struct mptcp_delegated_action, napi); + while ((subflow = mptcp_subflow_delegated_next(delegated)) != NULL) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + bh_lock_sock_nested(ssk); + if (!sock_owned_by_user(ssk) && + mptcp_subflow_has_delegated_action(subflow)) + mptcp_subflow_process_delegated(ssk); + /* ... elsewhere tcp_release_cb_override already processed + * the action or will do so at the next release_sock(). + * In both cases we must dequeue the subflow here - on the same + * CPU that scheduled it. + */ + bh_unlock_sock(ssk); + sock_put(ssk); + + if (++work_done == budget) + return budget; + } + + /* always provide a 0 'work_done' argument, so that napi_complete_done + * will not try accessing the NULL napi->dev ptr + */ + napi_complete_done(napi, 0); + return work_done; +} + void __init mptcp_proto_init(void) { + struct mptcp_delegated_action *delegated; + int cpu; + mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;
if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL)) panic("Failed to allocate MPTCP pcpu counter\n");
+ init_dummy_netdev(&mptcp_napi_dev); + for_each_possible_cpu(cpu) { + delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu); + INIT_LIST_HEAD(&delegated->head); + netif_tx_napi_add(&mptcp_napi_dev, &delegated->napi, mptcp_napi_poll, + NAPI_POLL_WEIGHT); + napi_enable(&delegated->napi); + } + mptcp_subflow_init(); mptcp_pm_init(); mptcp_token_init(); diff --combined net/mptcp/protocol.h index 1b6ec1773678,8d9f0ff10cb8..91827d949766 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@@ -10,7 -10,6 +10,7 @@@ #include <linux/random.h> #include <net/tcp.h> #include <net/inet_connection_sock.h> +#include <uapi/linux/mptcp.h>
#define MPTCP_SUPPORTED_VERSION 1
@@@ -25,7 -24,6 +25,7 @@@ #define OPTION_MPTCP_ADD_ADDR6 BIT(7) #define OPTION_MPTCP_RM_ADDR BIT(8) #define OPTION_MPTCP_FASTCLOSE BIT(9) +#define OPTION_MPTCP_PRIO BIT(10)
/* MPTCP option subtypes */ #define MPTCPOPT_MP_CAPABLE 0 @@@ -61,8 -59,6 +61,8 @@@ #define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 24 #define TCPOLEN_MPTCP_PORT_LEN 4 #define TCPOLEN_MPTCP_RM_ADDR_BASE 4 +#define TCPOLEN_MPTCP_PRIO 3 +#define TCPOLEN_MPTCP_PRIO_ALIGN 4 #define TCPOLEN_MPTCP_FASTCLOSE 12
/* MPTCP MP_JOIN flags */ @@@ -90,9 -86,6 +90,9 @@@ #define MPTCP_ADDR_IPVERSION_4 4 #define MPTCP_ADDR_IPVERSION_6 6
+/* MPTCP MP_PRIO flags */ +#define MPTCP_PRIO_BKUP BIT(0) + /* MPTCP socket flags */ #define MPTCP_DATA_READY 0 #define MPTCP_NOSPACE 1 @@@ -102,6 -95,7 +102,7 @@@ #define MPTCP_WORK_CLOSE_SUBFLOW 5 #define MPTCP_PUSH_PENDING 6 #define MPTCP_CLEAN_UNA 7 + #define MPTCP_ERROR_REPORT 8
static inline bool before64(__u64 seq1, __u64 seq2) { @@@ -123,7 -117,6 +124,7 @@@ struct mptcp_options_received dss : 1, add_addr : 1, rm_addr : 1, + mp_prio : 1, family : 4, echo : 1, backup : 1; @@@ -204,6 -197,10 +205,6 @@@ struct mptcp_pm_data u8 add_addr_accepted; u8 local_addr_used; u8 subflows; - u8 add_addr_signal_max; - u8 add_addr_accept_max; - u8 local_addr_max; - u8 subflows_max; u8 status; u8 rm_id; }; @@@ -237,7 -234,6 +238,6 @@@ struct mptcp_sock u64 wnd_end; unsigned long timer_ival; u32 token; - int rmem_pending; int rmem_released; unsigned long flags; bool can_ack; @@@ -289,11 -285,6 +289,11 @@@ #define mptcp_for_each_subflow(__msk, __subflow) \ list_for_each_entry(__subflow, &((__msk)->conn_list), node)
+static inline void msk_owned_by_me(const struct mptcp_sock *msk) +{ + sock_owned_by_me((const struct sock *)msk); +} + static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) { return (struct mptcp_sock *)sk; @@@ -301,7 -292,7 +301,7 @@@
static inline int __mptcp_space(const struct sock *sk) { - return tcp_space(sk) + READ_ONCE(mptcp_sk(sk)->rmem_pending); + return tcp_space(sk) + READ_ONCE(mptcp_sk(sk)->rmem_released); }
static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) @@@ -334,20 -325,13 +334,13 @@@ static inline struct mptcp_data_frag *m return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list); }
- static inline struct mptcp_data_frag *mptcp_rtx_tail(const struct sock *sk) + static inline struct mptcp_data_frag *mptcp_rtx_head(const struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk);
- if (!before64(msk->snd_nxt, READ_ONCE(msk->snd_una))) + if (msk->snd_una == READ_ONCE(msk->snd_nxt)) return NULL;
- return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list); - } - - static inline struct mptcp_data_frag *mptcp_rtx_head(const struct sock *sk) - { - struct mptcp_sock *msk = mptcp_sk(sk); - return list_first_entry_or_null(&msk->rtx_queue, struct mptcp_data_frag, list); }
@@@ -381,15 -365,6 +374,15 @@@ enum mptcp_data_avail MPTCP_SUBFLOW_OOO_DATA };
+struct mptcp_delegated_action { + struct napi_struct napi; + struct list_head head; +}; + +DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); + +#define MPTCP_DELEGATE_SEND 0 + /* MPTCP subflow context */ struct mptcp_subflow_context { struct list_head node;/* conn_list of subflows */ @@@ -414,7 -389,6 +407,7 @@@ map_valid : 1, mpc_map : 1, backup : 1, + send_mp_prio : 1, rx_eof : 1, can_ack : 1, /* only after processing the remote a key */ disposable : 1; /* ctx can be free at ulp release time */ @@@ -427,15 -401,13 +420,16 @@@ u8 local_id; u8 remote_id;
+ long delegated_status; + struct list_head delegated_node; /* link into delegated_action, protected by local BH */ + struct sock *tcp_sock; /* tcp sk backpointer */ struct sock *conn; /* parent mptcp_sock */ const struct inet_connection_sock_af_ops *icsk_af_ops; void (*tcp_data_ready)(struct sock *sk); void (*tcp_state_change)(struct sock *sk); void (*tcp_write_space)(struct sock *sk); + void (*tcp_error_report)(struct sock *sk);
struct rcu_head rcu; }; @@@ -478,61 -450,6 +472,61 @@@ static inline void mptcp_add_pending_su spin_unlock_bh(&msk->join_list_lock); }
+void mptcp_subflow_process_delegated(struct sock *ssk); + +static inline void mptcp_subflow_delegate(struct mptcp_subflow_context *subflow) +{ + struct mptcp_delegated_action *delegated; + bool schedule; + + /* The implied barrier pairs with mptcp_subflow_delegated_done(), and + * ensures the below list check sees list updates done prior to status + * bit changes + */ + if (!test_and_set_bit(MPTCP_DELEGATE_SEND, &subflow->delegated_status)) { + /* still on delegated list from previous scheduling */ + if (!list_empty(&subflow->delegated_node)) + return; + + /* the caller held the subflow bh socket lock */ + lockdep_assert_in_softirq(); + + delegated = this_cpu_ptr(&mptcp_delegated_actions); + schedule = list_empty(&delegated->head); + list_add_tail(&subflow->delegated_node, &delegated->head); + sock_hold(mptcp_subflow_tcp_sock(subflow)); + if (schedule) + napi_schedule(&delegated->napi); + } +} + +static inline struct mptcp_subflow_context * +mptcp_subflow_delegated_next(struct mptcp_delegated_action *delegated) +{ + struct mptcp_subflow_context *ret; + + if (list_empty(&delegated->head)) + return NULL; + + ret = list_first_entry(&delegated->head, struct mptcp_subflow_context, delegated_node); + list_del_init(&ret->delegated_node); + return ret; +} + +static inline bool mptcp_subflow_has_delegated_action(const struct mptcp_subflow_context *subflow) +{ + return test_bit(MPTCP_DELEGATE_SEND, &subflow->delegated_status); +} + +static inline void mptcp_subflow_delegated_done(struct mptcp_subflow_context *subflow) +{ + /* pairs with mptcp_subflow_delegate, ensures delegate_node is updated before + * touching the status bit + */ + smp_wmb(); + clear_bit(MPTCP_DELEGATE_SEND, &subflow->delegated_status); +} + int mptcp_is_enabled(struct net *net); unsigned int mptcp_get_add_addr_timeout(struct net *net); void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, @@@ -540,19 -457,14 +534,19 @@@ bool mptcp_subflow_data_available(struct sock *sk); void __init mptcp_subflow_init(void); void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how); -void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, - struct mptcp_subflow_context *subflow); +void mptcp_close_ssk(struct sock *sk, struct sock *ssk, + struct mptcp_subflow_context *subflow); void mptcp_subflow_reset(struct sock *ssk); +void mptcp_sock_graft(struct sock *sk, struct socket *parent); +struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk);
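The delegated-action helpers above guarantee that a subflow is queued for the NAPI worker at most once: the MPTCP_DELEGATE_SEND bit gates list insertion, and the consumer dequeues the node before clearing the bit in mptcp_subflow_delegated_done(). A minimal single-threaded sketch of that "schedule at most once" logic, with illustrative names and plain C11 atomics standing in for the kernel's bitops, per-CPU lists and NAPI scheduling:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_subflow {
        atomic_bool delegate_send;      /* stands in for MPTCP_DELEGATE_SEND */
        bool queued;                    /* stands in for the delegated_node linkage */
};

struct toy_queue {
        struct toy_subflow *items[16];
        int head, tail;
};

/* producer: schedule the subflow's send action at most once */
static void delegate(struct toy_queue *q, struct toy_subflow *sf)
{
        if (atomic_exchange(&sf->delegate_send, true))
                return;                 /* action already pending */
        if (sf->queued)
                return;                 /* still queued from a previous round */
        sf->queued = true;
        q->items[q->tail++ % 16] = sf;
}

/* consumer: dequeue, process, then clear the status bit (delegated_done) */
static void process_all(struct toy_queue *q)
{
        while (q->head != q->tail) {
                struct toy_subflow *sf = q->items[q->head++ % 16];

                sf->queued = false;
                printf("processing delegated send for %p\n", (void *)sf);
                atomic_store(&sf->delegate_send, false);
        }
}

int main(void)
{
        struct toy_queue q = { .head = 0, .tail = 0 };
        struct toy_subflow sf = { .queued = false };

        delegate(&q, &sf);
        delegate(&q, &sf);      /* no-op: already pending */
        process_all(&q);        /* exactly one line is printed */
        return 0;
}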
/* called with sk socket lock held */ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, const struct mptcp_addr_info *remote); int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock); +void mptcp_info2sockaddr(const struct mptcp_addr_info *info, + struct sockaddr_storage *addr, + unsigned short family);
static inline void mptcp_subflow_tcp_fallback(struct sock *sk, struct mptcp_subflow_context *ctx) @@@ -560,6 -472,7 +554,7 @@@ sk->sk_data_ready = ctx->tcp_data_ready; sk->sk_state_change = ctx->tcp_state_change; sk->sk_write_space = ctx->tcp_write_space; + sk->sk_error_report = ctx->tcp_error_report;
inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops; } @@@ -587,6 -500,7 +582,7 @@@ bool mptcp_finish_join(struct sock *sk) bool mptcp_schedule_work(struct sock *sk); void __mptcp_check_push(struct sock *sk, struct sock *ssk); void __mptcp_data_acked(struct sock *sk); + void __mptcp_error_report(struct sock *sk); void mptcp_subflow_eof(struct sock *sk); bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit); void __mptcp_flush_join_list(struct mptcp_sock *msk); @@@ -596,25 -510,6 +592,25 @@@ static inline bool mptcp_data_fin_enabl READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt); }
+static inline bool mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk) +{ + if ((sk->sk_userlocks & SOCK_SNDBUF_LOCK) || ssk->sk_sndbuf <= READ_ONCE(sk->sk_sndbuf)) + return false; + + WRITE_ONCE(sk->sk_sndbuf, ssk->sk_sndbuf); + return true; +} + +static inline void mptcp_write_space(struct sock *sk) +{ + if (sk_stream_is_writeable(sk)) { + /* pairs with memory barrier in mptcp_poll */ + smp_mb(); + if (test_and_clear_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags)) + sk_stream_write_space(sk); + } +} + void mptcp_destroy_common(struct mptcp_sock *msk);
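mptcp_propagate_sndbuf() above only ever grows the parent's send buffer, and backs off when the user pinned SO_SNDBUF. A hedged user-space model of that check, with a toy socket struct and illustrative names instead of struct sock:

#include <stdbool.h>
#include <stdio.h>

#define SNDBUF_LOCKED 0x1       /* models SOCK_SNDBUF_LOCK */

struct toy_sock {
        unsigned int userlocks;
        int sndbuf;
};

/* grow the parent's send buffer from a subflow, unless the user pinned it */
static bool propagate_sndbuf(struct toy_sock *parent, const struct toy_sock *subflow)
{
        if ((parent->userlocks & SNDBUF_LOCKED) || subflow->sndbuf <= parent->sndbuf)
                return false;

        parent->sndbuf = subflow->sndbuf;
        return true;
}

int main(void)
{
        struct toy_sock msk = { .userlocks = 0, .sndbuf = 16384 };
        struct toy_sock ssk = { .userlocks = 0, .sndbuf = 87380 };

        printf("grew=%d, msk sndbuf=%d\n", propagate_sndbuf(&msk, &ssk), msk.sndbuf);

        msk.userlocks |= SNDBUF_LOCKED;         /* simulate setsockopt(SO_SNDBUF) */
        ssk.sndbuf = 262144;
        printf("grew=%d, msk sndbuf=%d\n", propagate_sndbuf(&msk, &ssk), msk.sndbuf);
        return 0;
}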
void __init mptcp_token_init(void); @@@ -640,8 -535,8 +636,8 @@@ void mptcp_crypto_hmac_sha(u64 key1, u6
void __init mptcp_pm_init(void); void mptcp_pm_data_init(struct mptcp_sock *msk); -void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side); -void mptcp_pm_fully_established(struct mptcp_sock *msk); +void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side); +void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp); bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk); void mptcp_pm_connection_closed(struct mptcp_sock *msk); void mptcp_pm_subflow_established(struct mptcp_sock *msk, @@@ -651,12 -546,7 +647,12 @@@ void mptcp_pm_add_addr_received(struct const struct mptcp_addr_info *addr); void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, u8 rm_id); +void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup); +int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, + struct mptcp_addr_info *addr, + u8 bkup); void mptcp_pm_free_anno_list(struct mptcp_sock *msk); +bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk); struct mptcp_pm_add_entry * mptcp_pm_del_add_timer(struct mptcp_sock *msk, struct mptcp_addr_info *addr); @@@ -667,11 -557,6 +663,11 @@@ int mptcp_pm_announce_addr(struct mptcp int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id); int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 local_id);
+void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, + const struct sock *ssk, gfp_t gfp); +void mptcp_event_addr_announced(const struct mptcp_sock *msk, const struct mptcp_addr_info *info); +void mptcp_event_addr_removed(const struct mptcp_sock *msk, u8 id); + static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_SIGNAL); @@@ -719,13 -604,13 +715,13 @@@ int mptcp_pm_get_local_id(struct mptcp_
void __init mptcp_pm_nl_init(void); void mptcp_pm_nl_data_init(struct mptcp_sock *msk); -void mptcp_pm_nl_fully_established(struct mptcp_sock *msk); -void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk); -void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk); -void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk); -void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk); +void mptcp_pm_nl_work(struct mptcp_sock *msk); void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id); int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); +unsigned int mptcp_pm_get_add_addr_signal_max(struct mptcp_sock *msk); +unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk); +unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk); +unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk);
static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb) { diff --combined net/mptcp/subflow.c index ce2dea2a6e0a,8b2338dfdc80..06e233410e0e --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@@ -18,15 -18,12 +18,15 @@@ #include <net/tcp.h> #if IS_ENABLED(CONFIG_MPTCP_IPV6) #include <net/ip6_route.h> +#include <net/transp_v6.h> #endif #include <net/mptcp.h> #include <uapi/linux/mptcp.h> #include "protocol.h" #include "mib.h"
+static void mptcp_subflow_ops_undo_override(struct sock *ssk); + static void SUBFLOW_REQ_INC_STATS(struct request_sock *req, enum linux_mptcp_mib_field field) { @@@ -64,23 -61,11 +64,23 @@@ static bool mptcp_can_accept_new_subflo }
/* validate received token and create truncated hmac and nonce for SYN-ACK */ -static struct mptcp_sock *subflow_token_join_request(struct request_sock *req, - const struct sk_buff *skb) +static void subflow_req_create_thmac(struct mptcp_subflow_request_sock *subflow_req) { - struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + struct mptcp_sock *msk = subflow_req->msk; u8 hmac[SHA256_DIGEST_SIZE]; + + get_random_bytes(&subflow_req->local_nonce, sizeof(u32)); + + subflow_generate_hmac(msk->local_key, msk->remote_key, + subflow_req->local_nonce, + subflow_req->remote_nonce, hmac); + + subflow_req->thmac = get_unaligned_be64(hmac); +} + +static struct mptcp_sock *subflow_token_join_request(struct request_sock *req) +{ + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_sock *msk; int local_id;
@@@ -97,10 -82,17 +97,10 @@@ } subflow_req->local_id = local_id;
- get_random_bytes(&subflow_req->local_nonce, sizeof(u32)); - - subflow_generate_hmac(msk->local_key, msk->remote_key, - subflow_req->local_nonce, - subflow_req->remote_nonce, hmac); - - subflow_req->thmac = get_unaligned_be64(hmac); return msk; }
- static int __subflow_init_req(struct request_sock *req, const struct sock *sk_listener) + static void subflow_init_req(struct request_sock *req, const struct sock *sk_listener) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
@@@ -108,42 -100,30 +108,35 @@@ subflow_req->mp_join = 0; subflow_req->msk = NULL; mptcp_token_init_request(req); - - #ifdef CONFIG_TCP_MD5SIG - /* no MPTCP if MD5SIG is enabled on this socket or we may run out of - * TCP option space. - */ - if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) - return -EINVAL; - #endif - - return 0; }
+static bool subflow_use_different_sport(struct mptcp_sock *msk, const struct sock *sk) +{ + return inet_sk(sk)->inet_sport != inet_sk((struct sock *)msk)->inet_sport; +} + /* Init mptcp request socket. * * Returns an error code if a JOIN has failed and a TCP reset * should be sent. */ - static int subflow_init_req(struct request_sock *req, - const struct sock *sk_listener, - struct sk_buff *skb) + static int subflow_check_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) { struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_options_received mp_opt; - int ret;
pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);
- ret = __subflow_init_req(req, sk_listener); - if (ret) - return 0; + #ifdef CONFIG_TCP_MD5SIG + /* no MPTCP if MD5SIG is enabled on this socket or we may run out of + * TCP option space. + */ + if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) + return -EINVAL; + #endif
mptcp_get_options(skb, &mp_opt);
@@@ -191,30 -171,12 +184,30 @@@ again subflow_req->remote_id = mp_opt.join_id; subflow_req->token = mp_opt.token; subflow_req->remote_nonce = mp_opt.nonce; - subflow_req->msk = subflow_token_join_request(req, skb); + subflow_req->msk = subflow_token_join_request(req);
/* Can't fall back to TCP in this case. */ if (!subflow_req->msk) return -EPERM;
+ if (subflow_use_different_sport(subflow_req->msk, sk_listener)) { + pr_debug("syn inet_sport=%d %d", + ntohs(inet_sk(sk_listener)->inet_sport), + ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport)); + if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) { + sock_put((struct sock *)subflow_req->msk); + mptcp_token_destroy_request(req); + tcp_request_sock_ops.destructor(req); + subflow_req->msk = NULL; + subflow_req->mp_join = 0; + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX); + return -EPERM; + } + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTSYNRX); + } + + subflow_req_create_thmac(subflow_req); + if (unlikely(req->syncookie)) { if (mptcp_can_accept_new_subflow(subflow_req->msk)) subflow_init_req_cookie_join_save(subflow_req, skb); @@@ -236,10 -198,7 +229,7 @@@ int mptcp_subflow_init_cookie_req(struc struct mptcp_options_received mp_opt; int err;
- err = __subflow_init_req(req, sk_listener); - if (err) - return err; - + subflow_init_req(req, sk_listener); mptcp_get_options(skb, &mp_opt);
if (mp_opt.mp_capable && mp_opt.mp_join) @@@ -279,12 -238,13 +269,13 @@@ static struct dst_entry *subflow_v4_rou int err;
tcp_rsk(req)->is_mptcp = 1; + subflow_init_req(req, sk);
dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req); if (!dst) return NULL;
- err = subflow_init_req(req, sk, skb); + err = subflow_check_req(req, sk, skb); if (err == 0) return dst;
@@@ -304,12 -264,13 +295,13 @@@ static struct dst_entry *subflow_v6_rou int err;
tcp_rsk(req)->is_mptcp = 1; + subflow_init_req(req, sk);
dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req); if (!dst) return NULL;
- err = subflow_init_req(req, sk, skb); + err = subflow_check_req(req, sk, skb); if (err == 0) return dst;
@@@ -357,11 -318,6 +349,11 @@@ void mptcp_subflow_reset(struct sock *s sock_put(sk); }
+static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct sock *sk) +{ + return inet_sk(sk)->inet_dport != inet_sk((struct sock *)msk)->inet_dport; +} + static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); @@@ -379,7 -335,6 +371,7 @@@ if (subflow->conn_finished) return;
+ mptcp_propagate_sndbuf(parent, sk); subflow->rel_write_seq = 1; subflow->conn_finished = 1; subflow->ssn_offset = TCP_SKB_CB(skb)->seq; @@@ -428,13 -383,6 +420,13 @@@
subflow->mp_join = 1; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); + + if (subflow_use_different_dport(mptcp_sk(parent), sk)) { + pr_debug("synack inet_dport=%d %d", + ntohs(inet_sk(sk)->inet_dport), + ntohs(inet_sk(parent)->inet_dport)); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINPORTSYNACKRX); + } } else if (mptcp_check_fallback(sk)) { fallback: mptcp_rcv_space_init(mptcp_sk(parent), sk); @@@ -471,7 -419,6 +463,7 @@@ drop static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops; static struct inet_connection_sock_af_ops subflow_v6_specific; static struct inet_connection_sock_af_ops subflow_v6m_specific; +static struct proto tcpv6_prot_override;
static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) { @@@ -553,8 -500,6 +545,8 @@@ static void subflow_ulp_fallback(struc icsk->icsk_ulp_ops = NULL; rcu_assign_pointer(icsk->icsk_ulp_data, NULL); tcp_sk(sk)->is_mptcp = 0; + + mptcp_subflow_ops_undo_override(sk); }
static void subflow_drop_ctx(struct sock *ssk) @@@ -675,7 -620,7 +667,7 @@@ create_child * created mptcp socket */ new_msk->sk_destruct = mptcp_sock_destruct; - mptcp_pm_new_connection(mptcp_sk(new_msk), 1); + mptcp_pm_new_connection(mptcp_sk(new_msk), child, 1); mptcp_token_accept(subflow_req, mptcp_sk(new_msk)); ctx->conn = new_msk; new_msk = NULL; @@@ -700,17 -645,6 +692,17 @@@
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX); tcp_rsk(req)->drop_req = true; + + if (subflow_use_different_sport(owner, sk)) { + pr_debug("ack inet_sport=%d %d", + ntohs(inet_sk(sk)->inet_sport), + ntohs(inet_sk((struct sock *)owner)->inet_sport)); + if (!mptcp_pm_sport_in_anno_list(owner, sk)) { + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTACKRX); + goto out; + } + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTACKRX); + } } }
@@@ -739,7 -673,6 +731,7 @@@ dispose_child }
static struct inet_connection_sock_af_ops subflow_specific; +static struct proto tcp_prot_override;
enum mapping_status { MAPPING_OK, @@@ -953,22 -886,6 +945,22 @@@ static void mptcp_subflow_discard_data( subflow->map_valid = 0; }
+/* sched mptcp worker to remove the subflow if no more data is pending */ +static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk) +{ + struct sock *sk = (struct sock *)msk; + + if (likely(ssk->sk_state != TCP_CLOSE)) + return; + + if (skb_queue_empty(&ssk->sk_receive_queue) && + !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) { + sock_hold(sk); + if (!schedule_work(&msk->work)) + sock_put(sk); + } +} + static bool subflow_check_data_avail(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); @@@ -1007,11 -924,11 +999,11 @@@ }
if (status != MAPPING_OK) - return false; + goto no_data;
skb = skb_peek(&ssk->sk_receive_queue); if (WARN_ON_ONCE(!skb)) - return false; + goto no_data;
/* if msk lacks the remote key, this subflow must provide an * MP_CAPABLE-based mapping @@@ -1045,9 -962,6 +1037,9 @@@ } return true;
+no_data: + subflow_sched_work_if_closed(msk, ssk); + return false; fatal: /* fatal protocol error, close the socket */ /* This barrier is coupled with smp_rmb() in tcp_poll() */ @@@ -1118,12 -1032,49 +1110,52 @@@ static void subflow_data_ready(struct s
static void subflow_write_space(struct sock *ssk) { - /* we take action in __mptcp_clean_una() */ + struct sock *sk = mptcp_subflow_ctx(ssk)->conn; + + mptcp_propagate_sndbuf(sk, ssk); + mptcp_write_space(sk); }
+ void __mptcp_error_report(struct sock *sk) + { + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk = mptcp_sk(sk); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + int err = sock_error(ssk); + + if (!err) + continue; + + /* only propagate errors on fallen-back sockets or + * on MPC connect + */ + if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk)) + continue; + + inet_sk_state_store(sk, inet_sk_state_load(ssk)); + sk->sk_err = -err; + + /* This barrier is coupled with smp_rmb() in mptcp_poll() */ + smp_wmb(); + sk->sk_error_report(sk); + break; + } + } + + static void subflow_error_report(struct sock *ssk) + { + struct sock *sk = mptcp_subflow_ctx(ssk)->conn; + + mptcp_data_lock(sk); + if (!sock_owned_by_user(sk)) + __mptcp_error_report(sk); + else + set_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags); + mptcp_data_unlock(sk); + } + static struct inet_connection_sock_af_ops * subflow_default_af_ops(struct sock *sk) { @@@ -1154,32 -1105,22 +1186,32 @@@ void mptcpv6_handle_mapped(struct sock } #endif
-static void mptcp_info2sockaddr(const struct mptcp_addr_info *info, - struct sockaddr_storage *addr) +void mptcp_info2sockaddr(const struct mptcp_addr_info *info, + struct sockaddr_storage *addr, + unsigned short family) { memset(addr, 0, sizeof(*addr)); - addr->ss_family = info->family; + addr->ss_family = family; if (addr->ss_family == AF_INET) { struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;
- in_addr->sin_addr = info->addr; + if (info->family == AF_INET) + in_addr->sin_addr = info->addr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (ipv6_addr_v4mapped(&info->addr6)) + in_addr->sin_addr.s_addr = info->addr6.s6_addr32[3]; +#endif in_addr->sin_port = info->port; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (addr->ss_family == AF_INET6) { struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;
- in6_addr->sin6_addr = info->addr6; + if (info->family == AF_INET) + ipv6_addr_set_v4mapped(info->addr.s_addr, + &in6_addr->sin6_addr); + else + in6_addr->sin6_addr = info->addr6; in6_addr->sin6_port = info->port; } #endif @@@ -1223,11 -1164,11 +1255,11 @@@ int __mptcp_subflow_connect(struct soc subflow->remote_key = msk->remote_key; subflow->local_key = msk->local_key; subflow->token = msk->token; - mptcp_info2sockaddr(loc, &addr); + mptcp_info2sockaddr(loc, &addr, ssk->sk_family);
addrlen = sizeof(struct sockaddr_in); #if IS_ENABLED(CONFIG_MPTCP_IPV6) - if (loc->family == AF_INET6) + if (addr.ss_family == AF_INET6) addrlen = sizeof(struct sockaddr_in6); #endif ssk->sk_bound_dev_if = loc->ifindex; @@@ -1243,16 -1184,13 +1275,16 @@@ subflow->remote_id = remote_id; subflow->request_join = 1; subflow->request_bkup = !!(loc->flags & MPTCP_PM_ADDR_FLAG_BACKUP); - mptcp_info2sockaddr(remote, &addr); + mptcp_info2sockaddr(remote, &addr, ssk->sk_family);
mptcp_add_pending_subflow(msk, subflow); err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK); if (err && err != -EINPROGRESS) goto failed_unlink;
+ /* discard the subflow socket */ + mptcp_sock_graft(ssk, sk->sk_socket); + iput(SOCK_INODE(sf)); return err;
failed_unlink: @@@ -1290,25 -1228,6 +1322,25 @@@ static void mptcp_attach_cgroup(struct #endif /* CONFIG_SOCK_CGROUP_DATA */ }
+static void mptcp_subflow_ops_override(struct sock *ssk) +{ +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (ssk->sk_prot == &tcpv6_prot) + ssk->sk_prot = &tcpv6_prot_override; + else +#endif + ssk->sk_prot = &tcp_prot_override; +} + +static void mptcp_subflow_ops_undo_override(struct sock *ssk) +{ +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (ssk->sk_prot == &tcpv6_prot_override) + ssk->sk_prot = &tcpv6_prot; + else +#endif + ssk->sk_prot = &tcp_prot; +} int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) { struct mptcp_subflow_context *subflow; @@@ -1364,7 -1283,6 +1396,7 @@@ *new_sock = sf; sock_hold(sk); subflow->conn = sk; + mptcp_subflow_ops_override(sf->sk);
return 0; } @@@ -1381,7 -1299,6 +1413,7 @@@ static struct mptcp_subflow_context *su
rcu_assign_pointer(icsk->icsk_ulp_data, ctx); INIT_LIST_HEAD(&ctx->node); + INIT_LIST_HEAD(&ctx->delegated_node);
pr_debug("subflow=%p", ctx);
@@@ -1414,7 -1331,6 +1446,7 @@@ static void subflow_state_change(struc __subflow_state_change(sk);
if (subflow_simultaneous_connect(sk)) { + mptcp_propagate_sndbuf(parent, sk); mptcp_do_fallback(sk); mptcp_rcv_space_init(mptcp_sk(parent), sk); pr_fallback(mptcp_sk(parent)); @@@ -1432,8 -1348,6 +1464,8 @@@ if (mptcp_subflow_data_available(sk)) mptcp_data_ready(parent, sk);
+ subflow_sched_work_if_closed(mptcp_sk(parent), sk); + if (__mptcp_check_fallback(mptcp_sk(parent)) && !subflow->rx_eof && subflow_is_done(sk)) { subflow->rx_eof = 1; @@@ -1470,9 -1384,11 +1502,11 @@@ static int subflow_ulp_init(struct soc ctx->tcp_data_ready = sk->sk_data_ready; ctx->tcp_state_change = sk->sk_state_change; ctx->tcp_write_space = sk->sk_write_space; + ctx->tcp_error_report = sk->sk_error_report; sk->sk_data_ready = subflow_data_ready; sk->sk_write_space = subflow_write_space; sk->sk_state_change = subflow_state_change; + sk->sk_error_report = subflow_error_report; out: return err; } @@@ -1496,7 -1412,6 +1530,7 @@@ static void subflow_ulp_release(struct sock_put(sk); }
+ mptcp_subflow_ops_undo_override(ssk); if (release) kfree_rcu(ctx, rcu); } @@@ -1526,6 -1441,7 +1560,7 @@@ static void subflow_ulp_clone(const str new_ctx->tcp_data_ready = old_ctx->tcp_data_ready; new_ctx->tcp_state_change = old_ctx->tcp_state_change; new_ctx->tcp_write_space = old_ctx->tcp_write_space; + new_ctx->tcp_error_report = old_ctx->tcp_error_report; new_ctx->rel_write_seq = 1; new_ctx->tcp_sock = newsk;
@@@ -1550,16 -1466,6 +1585,16 @@@ } }
+static void tcp_release_cb_override(struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + + if (mptcp_subflow_has_delegated_action(subflow)) + mptcp_subflow_process_delegated(ssk); + + tcp_release_cb(ssk); +} + static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = { .name = "mptcp", .owner = THIS_MODULE, @@@ -1600,9 -1506,6 +1635,9 @@@ void __init mptcp_subflow_init(void subflow_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_specific.sk_rx_dst_set = subflow_finish_connect;
+ tcp_prot_override = tcp_prot; + tcp_prot_override.release_cb = tcp_release_cb_override; + #if IS_ENABLED(CONFIG_MPTCP_IPV6) subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req; @@@ -1618,9 -1521,6 +1653,9 @@@ subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len; subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced; subflow_v6m_specific.net_frag_header_len = 0; + + tcpv6_prot_override = tcpv6_prot; + tcpv6_prot_override.release_cb = tcp_release_cb_override; #endif
mptcp_diag_subflow_init(&subflow_ulp_ops); diff --combined net/sched/act_api.c index 4dd235ce9a07,ebc8f1413078..b919826939e0 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@@ -908,7 -908,7 +908,7 @@@ static const struct nla_policy tcf_acti [TCA_ACT_HW_STATS] = NLA_POLICY_BITFIELD32(TCA_ACT_HW_STATS_ANY), };
- static void tcf_idr_insert_many(struct tc_action *actions[]) + void tcf_idr_insert_many(struct tc_action *actions[]) { int i;
@@@ -928,13 -928,19 +928,13 @@@ } }
-struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, - struct nlattr *nla, struct nlattr *est, - char *name, int ovr, int bind, - bool rtnl_held, - struct netlink_ext_ack *extack) +struct tc_action_ops *tc_action_load_ops(char *name, struct nlattr *nla, + bool rtnl_held, + struct netlink_ext_ack *extack) { - struct nla_bitfield32 flags = { 0, 0 }; - u8 hw_stats = TCA_ACT_HW_STATS_ANY; - struct tc_action *a; + struct nlattr *tb[TCA_ACT_MAX + 1]; struct tc_action_ops *a_o; - struct tc_cookie *cookie = NULL; char act_name[IFNAMSIZ]; - struct nlattr *tb[TCA_ACT_MAX + 1]; struct nlattr *kind; int err;
@@@ -942,21 -948,33 +942,21 @@@ err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, tcf_action_policy, extack); if (err < 0) - goto err_out; + return ERR_PTR(err); err = -EINVAL; kind = tb[TCA_ACT_KIND]; if (!kind) { NL_SET_ERR_MSG(extack, "TC action kind must be specified"); - goto err_out; + return ERR_PTR(err); } if (nla_strscpy(act_name, kind, IFNAMSIZ) < 0) { NL_SET_ERR_MSG(extack, "TC action name too long"); - goto err_out; + return ERR_PTR(err); } - if (tb[TCA_ACT_COOKIE]) { - cookie = nla_memdup_cookie(tb); - if (!cookie) { - NL_SET_ERR_MSG(extack, "No memory to generate TC cookie"); - err = -ENOMEM; - goto err_out; - } - } - hw_stats = tcf_action_hw_stats_get(tb[TCA_ACT_HW_STATS]); - if (tb[TCA_ACT_FLAGS]) - flags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]); } else { if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) { NL_SET_ERR_MSG(extack, "TC action name too long"); - err = -EINVAL; - goto err_out; + return ERR_PTR(-EINVAL); } }
@@@ -978,56 -996,24 +978,56 @@@ * indicate this using -EAGAIN. */ if (a_o != NULL) { - err = -EAGAIN; - goto err_mod; + module_put(a_o->owner); + return ERR_PTR(-EAGAIN); } #endif NL_SET_ERR_MSG(extack, "Failed to load TC action module"); - err = -ENOENT; - goto err_free; + return ERR_PTR(-ENOENT); }
+ return a_o; +} + +struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, + struct nlattr *nla, struct nlattr *est, + char *name, int ovr, int bind, + struct tc_action_ops *a_o, bool rtnl_held, + struct netlink_ext_ack *extack) +{ + struct nla_bitfield32 flags = { 0, 0 }; + u8 hw_stats = TCA_ACT_HW_STATS_ANY; + struct nlattr *tb[TCA_ACT_MAX + 1]; + struct tc_cookie *cookie = NULL; + struct tc_action *a; + int err; + /* backward compatibility for policer */ - if (name == NULL) + if (name == NULL) { + err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, + tcf_action_policy, extack); + if (err < 0) + return ERR_PTR(err); + if (tb[TCA_ACT_COOKIE]) { + cookie = nla_memdup_cookie(tb); + if (!cookie) { + NL_SET_ERR_MSG(extack, "No memory to generate TC cookie"); + err = -ENOMEM; + goto err_out; + } + } + hw_stats = tcf_action_hw_stats_get(tb[TCA_ACT_HW_STATS]); + if (tb[TCA_ACT_FLAGS]) + flags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]); + err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind, rtnl_held, tp, flags.value, extack); - else + } else { err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held, tp, flags.value, extack); + } if (err < 0) - goto err_mod; + goto err_out;
if (!name && tb[TCA_ACT_COOKIE]) tcf_set_action_cookie(&a->act_cookie, cookie); @@@ -1044,11 -1030,14 +1044,11 @@@
return a;
-err_mod: - module_put(a_o->owner); -err_free: +err_out: if (cookie) { kfree(cookie->data); kfree(cookie); } -err_out: return ERR_PTR(err); }
@@@ -1059,7 -1048,6 +1059,7 @@@ int tcf_action_init(struct net *net, st struct tc_action *actions[], size_t *attr_size, bool rtnl_held, struct netlink_ext_ack *extack) { + struct tc_action_ops *ops[TCA_ACT_MAX_PRIO] = {}; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; size_t sz = 0; @@@ -1071,20 -1059,9 +1071,20 @@@ if (err < 0) return err;
+ for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { + struct tc_action_ops *a_o; + + a_o = tc_action_load_ops(name, tb[i], rtnl_held, extack); + if (IS_ERR(a_o)) { + err = PTR_ERR(a_o); + goto err_mod; + } + ops[i - 1] = a_o; + } + for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind, - rtnl_held, extack); + ops[i - 1], rtnl_held, extack); if (IS_ERR(act)) { err = PTR_ERR(act); goto err; @@@ -1104,11 -1081,6 +1104,11 @@@
err: tcf_action_destroy(actions, bind); +err_mod: + for (i = 0; i < TCA_ACT_MAX_PRIO; i++) { + if (ops[i]) + module_put(ops[i]->owner); + } return err; }
diff --combined net/sched/cls_api.c index a67c66a512a4,0b3900dd2354..e37556cc37ab --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@@ -3043,23 -3043,17 +3043,24 @@@ int tcf_exts_validate(struct net *net, size_t attr_size = 0;
if (exts->police && tb[exts->police]) { + struct tc_action_ops *a_o; + + a_o = tc_action_load_ops("police", tb[exts->police], rtnl_held, extack); + if (IS_ERR(a_o)) + return PTR_ERR(a_o); act = tcf_action_init_1(net, tp, tb[exts->police], rate_tlv, "police", ovr, - TCA_ACT_BIND, rtnl_held, + TCA_ACT_BIND, a_o, rtnl_held, extack); - if (IS_ERR(act)) + if (IS_ERR(act)) { + module_put(a_o->owner); return PTR_ERR(act); + }
act->type = exts->type = TCA_OLD_COMPAT; exts->actions[0] = act; exts->nr_actions = 1; + tcf_idr_insert_many(exts->actions); } else if (exts->action && tb[exts->action]) { int err;
diff --combined net/sched/cls_flower.c index caf7643e9c83,46c1b3e9f66a..2409e522c68f --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@@ -30,6 -30,11 +30,11 @@@
#include <uapi/linux/netfilter/nf_conntrack_common.h>
+ #define TCA_FLOWER_KEY_CT_FLAGS_MAX \ + ((__TCA_FLOWER_KEY_CT_FLAGS_MAX - 1) << 1) + #define TCA_FLOWER_KEY_CT_FLAGS_MASK \ + (TCA_FLOWER_KEY_CT_FLAGS_MAX - 1) + struct fl_flow_key { struct flow_dissector_key_meta meta; struct flow_dissector_key_control control; @@@ -291,11 -296,9 +296,11 @@@ static u16 fl_ct_info_to_flower_map[] [IP_CT_RELATED] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | TCA_FLOWER_KEY_CT_FLAGS_RELATED, [IP_CT_ESTABLISHED_REPLY] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | - TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED, + TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED | + TCA_FLOWER_KEY_CT_FLAGS_REPLY, [IP_CT_RELATED_REPLY] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | - TCA_FLOWER_KEY_CT_FLAGS_RELATED, + TCA_FLOWER_KEY_CT_FLAGS_RELATED | + TCA_FLOWER_KEY_CT_FLAGS_REPLY, [IP_CT_NEW] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | TCA_FLOWER_KEY_CT_FLAGS_NEW, }; @@@ -304,7 -307,6 +309,7 @@@ static int fl_classify(struct sk_buff * struct tcf_result *res) { struct cls_fl_head *head = rcu_dereference_bh(tp->root); + bool post_ct = qdisc_skb_cb(skb)->post_ct; struct fl_flow_key skb_key; struct fl_flow_mask *mask; struct cls_fl_filter *f; @@@ -321,8 -323,7 +326,8 @@@ skb_flow_dissect_tunnel_info(skb, &mask->dissector, &skb_key); skb_flow_dissect_ct(skb, &mask->dissector, &skb_key, fl_ct_info_to_flower_map, - ARRAY_SIZE(fl_ct_info_to_flower_map)); + ARRAY_SIZE(fl_ct_info_to_flower_map), + post_ct); skb_flow_dissect_hash(skb, &mask->dissector, &skb_key); skb_flow_dissect(skb, &mask->dissector, &skb_key, 0);
@@@ -690,8 -691,10 +695,10 @@@ static const struct nla_policy fl_polic [TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ENC_OPTS] = { .type = NLA_NESTED }, [TCA_FLOWER_KEY_ENC_OPTS_MASK] = { .type = NLA_NESTED }, - [TCA_FLOWER_KEY_CT_STATE] = { .type = NLA_U16 }, - [TCA_FLOWER_KEY_CT_STATE_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_CT_STATE] = + NLA_POLICY_MASK(NLA_U16, TCA_FLOWER_KEY_CT_FLAGS_MASK), + [TCA_FLOWER_KEY_CT_STATE_MASK] = + NLA_POLICY_MASK(NLA_U16, TCA_FLOWER_KEY_CT_FLAGS_MASK), [TCA_FLOWER_KEY_CT_ZONE] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_CT_ZONE_MASK] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_CT_MARK] = { .type = NLA_U32 }, @@@ -1394,12 -1397,33 +1401,33 @@@ static int fl_set_enc_opt(struct nlatt return 0; }
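The fl_validate_ct_state() hunk that follows rejects two inconsistent ct_state masks: any flag set without "trk", and "+new" combined with "+est". A standalone sketch of those two checks, using illustrative bit positions rather than the uapi TCA_FLOWER_KEY_CT_FLAGS_* encoding:

#include <stdio.h>

/* illustrative bit positions, not the uapi TCA_FLOWER_KEY_CT_FLAGS_* values */
#define CT_TRK (1 << 0)
#define CT_NEW (1 << 1)
#define CT_EST (1 << 2)

static int validate_ct_state(unsigned short state)
{
        if (state && !(state & CT_TRK))
                return -1;      /* "no trk, so no other flag can be set" */
        if ((state & CT_NEW) && (state & CT_EST))
                return -1;      /* "new and est are mutually exclusive" */
        return 0;
}

int main(void)
{
        printf("+trk+new      -> %d\n", validate_ct_state(CT_TRK | CT_NEW));
        printf("+new (no trk) -> %d\n", validate_ct_state(CT_NEW));
        printf("+trk+new+est  -> %d\n", validate_ct_state(CT_TRK | CT_NEW | CT_EST));
        return 0;
}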
+ static int fl_validate_ct_state(u16 state, struct nlattr *tb, + struct netlink_ext_ack *extack) + { + if (state && !(state & TCA_FLOWER_KEY_CT_FLAGS_TRACKED)) { + NL_SET_ERR_MSG_ATTR(extack, tb, + "no trk, so no other flag can be set"); + return -EINVAL; + } + + if (state & TCA_FLOWER_KEY_CT_FLAGS_NEW && + state & TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED) { + NL_SET_ERR_MSG_ATTR(extack, tb, + "new and est are mutually exclusive"); + return -EINVAL; + } + + return 0; + } + static int fl_set_key_ct(struct nlattr **tb, struct flow_dissector_key_ct *key, struct flow_dissector_key_ct *mask, struct netlink_ext_ack *extack) { if (tb[TCA_FLOWER_KEY_CT_STATE]) { + int err; + if (!IS_ENABLED(CONFIG_NF_CONNTRACK)) { NL_SET_ERR_MSG(extack, "Conntrack isn't enabled"); return -EOPNOTSUPP; @@@ -1407,6 -1431,13 +1435,13 @@@ fl_set_key_val(tb, &key->ct_state, TCA_FLOWER_KEY_CT_STATE, &mask->ct_state, TCA_FLOWER_KEY_CT_STATE_MASK, sizeof(key->ct_state)); + + err = fl_validate_ct_state(mask->ct_state, + tb[TCA_FLOWER_KEY_CT_STATE_MASK], + extack); + if (err) + return err; + } if (tb[TCA_FLOWER_KEY_CT_ZONE]) { if (!IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)) { diff --combined tools/testing/selftests/net/forwarding/tc_flower.sh index a554838666c4,b11d8e6b5bc1..4b58ccae3429 --- a/tools/testing/selftests/net/forwarding/tc_flower.sh +++ b/tools/testing/selftests/net/forwarding/tc_flower.sh @@@ -3,9 -3,7 +3,9 @@@
ALL_TESTS="match_dst_mac_test match_src_mac_test match_dst_ip_test \ match_src_ip_test match_ip_flags_test match_pcp_test match_vlan_test \ - match_ip_tos_test match_indev_test match_mpls_label_test \ - match_ip_tos_test match_indev_test match_ip_ttl_test" ++ match_ip_tos_test match_indev_test match_ip_ttl_test match_mpls_label_test \ + match_mpls_tc_test match_mpls_bos_test match_mpls_ttl_test \ + match_mpls_lse_test" NUM_NETIFS=2 source tc_common.sh source lib.sh @@@ -312,6 -310,42 +312,42 @@@ match_ip_tos_test( log_test "ip_tos match ($tcflags)" }
+ match_ip_ttl_test() + { + RET=0 + + tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \ + $tcflags dst_ip 192.0.2.2 ip_ttl 63 action drop + tc filter add dev $h2 ingress protocol ip pref 2 handle 102 flower \ + $tcflags dst_ip 192.0.2.2 action drop + + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \ + -t ip "ttl=63" -q + + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \ + -t ip "ttl=63,mf,frag=256" -q + + tc_check_packets "dev $h2 ingress" 102 1 + check_fail $? "Matched on the wrong filter (no check on ttl)" + + tc_check_packets "dev $h2 ingress" 101 2 + check_err $? "Did not match on correct filter (ttl=63)" + + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \ + -t ip "ttl=255" -q + + tc_check_packets "dev $h2 ingress" 101 3 + check_fail $? "Matched on a wrong filter (ttl=63)" + + tc_check_packets "dev $h2 ingress" 102 1 + check_err $? "Did not match on correct filter (no check on ttl)" + + tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower + tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower + + log_test "ip_ttl match ($tcflags)" + } + match_indev_test() { RET=0 @@@ -336,309 -370,6 +372,309 @@@ log_test "indev match ($tcflags)" }
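The MPLS tests below hand-assemble Label Stack Entries in shell via the mpls_lse() helper. As a hedged cross-check of the same 32-bit layout (label: 20 bits, tc: 3, bos: 1, ttl: 8), here is a standalone C version that prints the identical byte sequence for the maximal values used in the tests:

#include <stdint.h>
#include <stdio.h>

/* pack one MPLS LSE: label (20 bits) | tc (3) | bos (1) | ttl (8) */
static uint32_t mpls_lse32(uint32_t label, uint32_t tc, uint32_t bos, uint32_t ttl)
{
        return (label << 12) | (tc << 9) | (bos << 8) | ttl;
}

int main(void)
{
        uint32_t lse = mpls_lse32(1048575, 7, 1, 255);  /* maximal values, as in the tests */

        /* prints "ff ff ff ff", matching the shell helper's output */
        printf("%02x %02x %02x %02x\n",
               (unsigned int)(lse >> 24), (unsigned int)((lse >> 16) & 0xff),
               (unsigned int)((lse >> 8) & 0xff), (unsigned int)(lse & 0xff));
        return 0;
}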
+# Unfortunately, mausezahn can't build MPLS headers when used in L2 +# mode, so we have this function to build Label Stack Entries. +mpls_lse() +{ + local label=$1 + local tc=$2 + local bos=$3 + local ttl=$4 + + printf "%02x %02x %02x %02x" \ + $((label >> 12)) \ + $((label >> 4 & 0xff)) \ + $((((label & 0xf) << 4) + (tc << 1) + bos)) \ + $ttl +} + +match_mpls_label_test() +{ + local ethtype="88 47"; readonly ethtype + local pkt + + RET=0 + + check_tc_mpls_support $h2 || return 0 + + tc filter add dev $h2 ingress protocol mpls_uc pref 1 handle 101 \ + flower $tcflags mpls_label 0 action drop + tc filter add dev $h2 ingress protocol mpls_uc pref 2 handle 102 \ + flower $tcflags mpls_label 1048575 action drop + + pkt="$ethtype $(mpls_lse 1048575 0 1 255)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + tc_check_packets "dev $h2 ingress" 101 1 + check_fail $? "Matched on a wrong filter (1048575)" + + tc_check_packets "dev $h2 ingress" 102 1 + check_err $? "Did not match on correct filter (1048575)" + + pkt="$ethtype $(mpls_lse 0 0 1 255)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + tc_check_packets "dev $h2 ingress" 102 2 + check_fail $? "Matched on a wrong filter (0)" + + tc_check_packets "dev $h2 ingress" 101 1 + check_err $? "Did not match on correct filter (0)" + + tc filter del dev $h2 ingress protocol mpls_uc pref 2 handle 102 flower + tc filter del dev $h2 ingress protocol mpls_uc pref 1 handle 101 flower + + log_test "mpls_label match ($tcflags)" +} + +match_mpls_tc_test() +{ + local ethtype="88 47"; readonly ethtype + local pkt + + RET=0 + + check_tc_mpls_support $h2 || return 0 + + tc filter add dev $h2 ingress protocol mpls_uc pref 1 handle 101 \ + flower $tcflags mpls_tc 0 action drop + tc filter add dev $h2 ingress protocol mpls_uc pref 2 handle 102 \ + flower $tcflags mpls_tc 7 action drop + + pkt="$ethtype $(mpls_lse 0 7 1 255)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + tc_check_packets "dev $h2 ingress" 101 1 + check_fail $? "Matched on a wrong filter (7)" + + tc_check_packets "dev $h2 ingress" 102 1 + check_err $? "Did not match on correct filter (7)" + + pkt="$ethtype $(mpls_lse 0 0 1 255)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + tc_check_packets "dev $h2 ingress" 102 2 + check_fail $? "Matched on a wrong filter (0)" + + tc_check_packets "dev $h2 ingress" 101 1 + check_err $? "Did not match on correct filter (0)" + + tc filter del dev $h2 ingress protocol mpls_uc pref 2 handle 102 flower + tc filter del dev $h2 ingress protocol mpls_uc pref 1 handle 101 flower + + log_test "mpls_tc match ($tcflags)" +} + +match_mpls_bos_test() +{ + local ethtype="88 47"; readonly ethtype + local pkt + + RET=0 + + check_tc_mpls_support $h2 || return 0 + + tc filter add dev $h2 ingress protocol mpls_uc pref 1 handle 101 \ + flower $tcflags mpls_bos 0 action drop + tc filter add dev $h2 ingress protocol mpls_uc pref 2 handle 102 \ + flower $tcflags mpls_bos 1 action drop + + pkt="$ethtype $(mpls_lse 0 0 1 255)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + tc_check_packets "dev $h2 ingress" 101 1 + check_fail $? "Matched on a wrong filter (1)" + + tc_check_packets "dev $h2 ingress" 102 1 + check_err $? "Did not match on correct filter (1)" + + # Need to add a second label to properly mark the Bottom of Stack + pkt="$ethtype $(mpls_lse 0 0 0 255) $(mpls_lse 0 0 1 255)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + tc_check_packets "dev $h2 ingress" 102 2 + check_fail $? 
"Matched on a wrong filter (0)" + + tc_check_packets "dev $h2 ingress" 101 1 + check_err $? "Did not match on correct filter (0)" + + tc filter del dev $h2 ingress protocol mpls_uc pref 2 handle 102 flower + tc filter del dev $h2 ingress protocol mpls_uc pref 1 handle 101 flower + + log_test "mpls_bos match ($tcflags)" +} + +match_mpls_ttl_test() +{ + local ethtype="88 47"; readonly ethtype + local pkt + + RET=0 + + check_tc_mpls_support $h2 || return 0 + + tc filter add dev $h2 ingress protocol mpls_uc pref 1 handle 101 \ + flower $tcflags mpls_ttl 0 action drop + tc filter add dev $h2 ingress protocol mpls_uc pref 2 handle 102 \ + flower $tcflags mpls_ttl 255 action drop + + pkt="$ethtype $(mpls_lse 0 0 1 255)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + tc_check_packets "dev $h2 ingress" 101 1 + check_fail $? "Matched on a wrong filter (255)" + + tc_check_packets "dev $h2 ingress" 102 1 + check_err $? "Did not match on correct filter (255)" + + pkt="$ethtype $(mpls_lse 0 0 1 0)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + tc_check_packets "dev $h2 ingress" 102 2 + check_fail $? "Matched on a wrong filter (0)" + + tc_check_packets "dev $h2 ingress" 101 1 + check_err $? "Did not match on correct filter (0)" + + tc filter del dev $h2 ingress protocol mpls_uc pref 2 handle 102 flower + tc filter del dev $h2 ingress protocol mpls_uc pref 1 handle 101 flower + + log_test "mpls_ttl match ($tcflags)" +} + +match_mpls_lse_test() +{ + local ethtype="88 47"; readonly ethtype + local pkt + + RET=0 + + check_tc_mpls_lse_stats $h2 || return 0 + + # Match on first LSE (minimal values for each field) + tc filter add dev $h2 ingress protocol mpls_uc pref 1 handle 101 \ + flower $tcflags mpls lse depth 1 label 0 action continue + tc filter add dev $h2 ingress protocol mpls_uc pref 2 handle 102 \ + flower $tcflags mpls lse depth 1 tc 0 action continue + tc filter add dev $h2 ingress protocol mpls_uc pref 3 handle 103 \ + flower $tcflags mpls lse depth 1 bos 0 action continue + tc filter add dev $h2 ingress protocol mpls_uc pref 4 handle 104 \ + flower $tcflags mpls lse depth 1 ttl 0 action continue + + # Match on second LSE (maximal values for each field) + tc filter add dev $h2 ingress protocol mpls_uc pref 5 handle 105 \ + flower $tcflags mpls lse depth 2 label 1048575 action continue + tc filter add dev $h2 ingress protocol mpls_uc pref 6 handle 106 \ + flower $tcflags mpls lse depth 2 tc 7 action continue + tc filter add dev $h2 ingress protocol mpls_uc pref 7 handle 107 \ + flower $tcflags mpls lse depth 2 bos 1 action continue + tc filter add dev $h2 ingress protocol mpls_uc pref 8 handle 108 \ + flower $tcflags mpls lse depth 2 ttl 255 action continue + + # Match on LSE depth + tc filter add dev $h2 ingress protocol mpls_uc pref 9 handle 109 \ + flower $tcflags mpls lse depth 1 action continue + tc filter add dev $h2 ingress protocol mpls_uc pref 10 handle 110 \ + flower $tcflags mpls lse depth 2 action continue + tc filter add dev $h2 ingress protocol mpls_uc pref 11 handle 111 \ + flower $tcflags mpls lse depth 3 action continue + + # Base packet, matched by all filters (except for stack depth 3) + pkt="$ethtype $(mpls_lse 0 0 0 0) $(mpls_lse 1048575 7 1 255)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + # Make a variant of the above packet, with a non-matching value + # for each LSE field + + # Wrong label at depth 1 + pkt="$ethtype $(mpls_lse 1 0 0 0) $(mpls_lse 1048575 7 1 255)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + # Wrong TC at depth 1 + 
pkt="$ethtype $(mpls_lse 0 1 0 0) $(mpls_lse 1048575 7 1 255)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + # Wrong BOS at depth 1 (not adding a second LSE here since BOS is set + # in the first label, so anything that'd follow wouldn't be considered) + pkt="$ethtype $(mpls_lse 0 0 1 0)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + # Wrong TTL at depth 1 + pkt="$ethtype $(mpls_lse 0 0 0 1) $(mpls_lse 1048575 7 1 255)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + # Wrong label at depth 2 + pkt="$ethtype $(mpls_lse 0 0 0 0) $(mpls_lse 1048574 7 1 255)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + # Wrong TC at depth 2 + pkt="$ethtype $(mpls_lse 0 0 0 0) $(mpls_lse 1048575 6 1 255)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + # Wrong BOS at depth 2 (adding a third LSE here since BOS isn't set in + # the second label) + pkt="$ethtype $(mpls_lse 0 0 0 0) $(mpls_lse 1048575 7 0 255)" + pkt="$pkt $(mpls_lse 0 0 1 255)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + # Wrong TTL at depth 2 + pkt="$ethtype $(mpls_lse 0 0 0 0) $(mpls_lse 1048575 7 1 254)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + # Filters working at depth 1 should match all packets but one + + tc_check_packets "dev $h2 ingress" 101 8 + check_err $? "Did not match on correct filter" + + tc_check_packets "dev $h2 ingress" 102 8 + check_err $? "Did not match on correct filter" + + tc_check_packets "dev $h2 ingress" 103 8 + check_err $? "Did not match on correct filter" + + tc_check_packets "dev $h2 ingress" 104 8 + check_err $? "Did not match on correct filter" + + # Filters working at depth 2 should match all packets but two (because + # of the test packet where the label stack depth is just one) + + tc_check_packets "dev $h2 ingress" 105 7 + check_err $? "Did not match on correct filter" + + tc_check_packets "dev $h2 ingress" 106 7 + check_err $? "Did not match on correct filter" + + tc_check_packets "dev $h2 ingress" 107 7 + check_err $? "Did not match on correct filter" + + tc_check_packets "dev $h2 ingress" 108 7 + check_err $? "Did not match on correct filter" + + # Finally, verify the filters that only match on LSE depth + + tc_check_packets "dev $h2 ingress" 109 9 + check_err $? "Did not match on correct filter" + + tc_check_packets "dev $h2 ingress" 110 8 + check_err $? "Did not match on correct filter" + + tc_check_packets "dev $h2 ingress" 111 1 + check_err $? "Did not match on correct filter" + + tc filter del dev $h2 ingress protocol mpls_uc pref 11 handle 111 flower + tc filter del dev $h2 ingress protocol mpls_uc pref 10 handle 110 flower + tc filter del dev $h2 ingress protocol mpls_uc pref 9 handle 109 flower + tc filter del dev $h2 ingress protocol mpls_uc pref 8 handle 108 flower + tc filter del dev $h2 ingress protocol mpls_uc pref 7 handle 107 flower + tc filter del dev $h2 ingress protocol mpls_uc pref 6 handle 106 flower + tc filter del dev $h2 ingress protocol mpls_uc pref 5 handle 105 flower + tc filter del dev $h2 ingress protocol mpls_uc pref 4 handle 104 flower + tc filter del dev $h2 ingress protocol mpls_uc pref 3 handle 103 flower + tc filter del dev $h2 ingress protocol mpls_uc pref 2 handle 102 flower + tc filter del dev $h2 ingress protocol mpls_uc pref 1 handle 101 flower + + log_test "mpls lse match ($tcflags)" +} + setup_prepare() { h1=${NETIFS[p1]}