The following commit has been merged in the master branch:

commit d489ded1a3690d7eca8633575cba3f7dac8484c7
Merge: 86dd9868b8788a9063893a97649594af93cd5aa6 3af409ca278d4a8d50e91f9f7c4c33b175645cf3
Author: David S. Miller <davem@davemloft.net>
Date:   Tue Feb 16 17:30:20 2021 -0800
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
diff --combined Documentation/networking/ip-sysctl.rst index 581bfce86dca,1b7f8debada6..c7952ac5bd2f --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@@ -178,27 -178,6 +178,27 @@@ min_adv_mss - INTEGE The advertised MSS depends on the first hop route MTU, but will never be lower than this setting.
+fib_notify_on_flag_change - INTEGER + Whether to emit RTM_NEWROUTE notifications whenever RTM_F_OFFLOAD/ + RTM_F_TRAP/RTM_F_OFFLOAD_FAILED flags are changed. + + After installing a route to the kernel, user space receives an + acknowledgment, which means the route was installed in the kernel, + but not necessarily in hardware. + It is also possible for a route already installed in hardware to change + its action and therefore its flags. For example, a host route that is + trapping packets can be "promoted" to perform decapsulation following + the installation of an IPinIP/VXLAN tunnel. + The notifications will indicate to user-space the state of the route. + + Default: 0 (Do not emit notifications.) + + Possible values: + + - 0 - Do not emit notifications. + - 1 - Emit notifications. + - 2 - Emit notifications only for RTM_F_OFFLOAD_FAILED flag change. +
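A rough user-space sketch of consuming these notifications (an editor's illustration, not part of the patch): it subscribes to the IPv4 route multicast group over rtnetlink and prints the offload-state flags from each RTM_NEWROUTE message. The fallback #defines are assumed values for uapi headers that predate these flags.

/* Hypothetical monitor for route flag-change notifications. */
#include <stdio.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

#ifndef RTM_F_TRAP
#define RTM_F_TRAP		0x8000		/* assumed uapi value */
#endif
#ifndef RTM_F_OFFLOAD_FAILED
#define RTM_F_OFFLOAD_FAILED	0x20000000	/* assumed uapi value */
#endif

int main(void)
{
	struct sockaddr_nl sa = { .nl_family = AF_NETLINK,
				  .nl_groups = RTMGRP_IPV4_ROUTE };
	char buf[8192];
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0 || bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
		return 1;

	for (;;) {
		ssize_t len = recv(fd, buf, sizeof(buf), 0);
		struct nlmsghdr *nh = (struct nlmsghdr *)buf;

		for (; NLMSG_OK(nh, len); nh = NLMSG_NEXT(nh, len)) {
			struct rtmsg *rtm;

			if (nh->nlmsg_type != RTM_NEWROUTE)
				continue;
			rtm = NLMSG_DATA(nh);
			printf("route: offload=%d trap=%d offload_failed=%d\n",
			       !!(rtm->rtm_flags & RTM_F_OFFLOAD),
			       !!(rtm->rtm_flags & RTM_F_TRAP),
			       !!(rtm->rtm_flags & RTM_F_OFFLOAD_FAILED));
		}
	}
}

With the sysctl set to 1 (or 2 for RTM_F_OFFLOAD_FAILED changes only), routes whose offload state changes show up here without user space having to re-dump the FIB.

IP Fragmentation: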
ipfrag_high_thresh - LONG INTEGER @@@ -651,16 -630,15 +651,15 @@@ tcp_rmem - vector of 3 INTEGERs: min, d
default: initial size of receive buffer used by TCP sockets. This value overrides net.core.rmem_default used by other protocols. - Default: 87380 bytes. This value results in window of 65535 with - default setting of tcp_adv_win_scale and tcp_app_win:0 and a bit - less for default tcp_app_win. See below about these variables. + Default: 131072 bytes. + This value results in an initial window of 65535.
max: maximal size of receive buffer allowed for automatically selected receiver buffers for TCP socket. This value does not override net.core.rmem_max. Calling setsockopt() with SO_RCVBUF disables automatic tuning of that socket's receive buffer size, in which case this value is ignored. - Default: between 87380B and 6MB, depending on RAM size. + Default: between 131072 and 6MB, depending on RAM size.
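The SO_RCVBUF interaction above is easy to demonstrate from user space. A small sketch (editor's illustration, not from the patch); note the kernel typically stores double the requested value to allow space for bookkeeping overhead:

/* Sketch: this socket opts out of TCP receive-buffer autotuning. */
#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int val = 1 << 20;	/* request 1 MiB */
	socklen_t len = sizeof(val);

	if (fd < 0)
		return 1;
	/* After this call tcp_rmem no longer applies to this socket;
	 * the buffer size is pinned rather than autotuned. */
	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);
	printf("effective receive buffer: %d bytes\n", val);
	return 0;
}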
tcp_sack - BOOLEAN Enable selective acknowledgments (SACKs). @@@ -1446,25 -1424,6 +1445,25 @@@ rp_filter - INTEGE Default value is 0. Note that some distributions enable it in startup scripts.
+src_valid_mark - BOOLEAN + - 0 - The fwmark of the packet is not included in reverse path + route lookup. This allows for asymmetric routing configurations + utilizing the fwmark in only one direction, e.g., transparent + proxying. + + - 1 - The fwmark of the packet is included in reverse path route + lookup. This permits rp_filter to function when the fwmark is + used for routing traffic in both directions. + + This setting also affects the utilization of fwmark when + performing source address selection for ICMP replies, or + determining addresses stored for the IPOPT_TS_TSANDADDR and + IPOPT_RR IP options. + + The max value from conf/{all,interface}/src_valid_mark is used. + + Default value is 0. + arp_filter - BOOLEAN - 1 - Allows you to have multiple network interfaces on the same subnet, and have the ARPs for each interface be answered @@@ -1815,27 -1774,6 +1814,27 @@@ nexthop_compat_mode - BOOLEA and extraneous notifications. Default: true (backward compat mode)
+fib_notify_on_flag_change - INTEGER + Whether to emit RTM_NEWROUTE notifications whenever RTM_F_OFFLOAD/ + RTM_F_TRAP/RTM_F_OFFLOAD_FAILED flags are changed. + + After installing a route to the kernel, user space receives an + acknowledgment, which means the route was installed in the kernel, + but not necessarily in hardware. + It is also possible for a route already installed in hardware to change + its action and therefore its flags. For example, a host route that is + trapping packets can be "promoted" to perform decapsulation following + the installation of an IPinIP/VXLAN tunnel. + The notifications will indicate to user-space the state of the route. + + Default: 0 (Do not emit notifications.) + + Possible values: + + - 0 - Do not emit notifications. + - 1 - Emit notifications. + - 2 - Emit notifications only for RTM_F_OFFLOAD_FAILED flag change. + IPv6 Fragmentation:
ip6frag_high_thresh - INTEGER @@@ -1944,16 -1882,6 +1943,16 @@@ accept_ra_defrtr - BOOLEA - enabled if accept_ra is enabled. - disabled if accept_ra is disabled.
+ra_defrtr_metric - UNSIGNED INTEGER + Route metric for default route learned in Router Advertisement. This value + will be assigned as metric for the default route learned via IPv6 Router + Advertisement. Takes effect only if accept_ra_defrtr is enabled. + + Possible values: + 1 to 0xFFFFFFFF + + Default: IP6_RT_PRIO_USER i.e. 1024. + accept_ra_from_local - BOOLEAN Accept RA with source-address that is found on local machine if the RA is otherwise proper and able to be accepted. diff --combined drivers/net/ethernet/amd/xgbe/xgbe-drv.c index 99b6d5a9f1d9,395eb0b52680..4f714f874c4f --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@@ -1368,6 -1368,7 +1368,7 @@@ static void xgbe_stop(struct xgbe_prv_d return;
netif_tx_stop_all_queues(netdev); + netif_carrier_off(pdata->netdev);
xgbe_stop_timers(pdata); flush_workqueue(pdata->dev_workqueue); @@@ -2295,6 -2296,8 +2296,6 @@@ static const struct net_device_ops xgbe .ndo_setup_tc = xgbe_setup_tc, .ndo_fix_features = xgbe_fix_features, .ndo_set_features = xgbe_set_features, - .ndo_udp_tunnel_add = udp_tunnel_nic_add_port, - .ndo_udp_tunnel_del = udp_tunnel_nic_del_port, .ndo_features_check = xgbe_features_check, };
diff --combined drivers/net/ethernet/broadcom/bnxt/bnxt.c index d0f3f68faa91,1c96b7ba24f2..a680fd9c68ea --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@@ -255,9 -255,7 +255,9 @@@ static const u16 bnxt_async_events_arr[ ASYNC_EVENT_CMPL_EVENT_ID_PORT_PHY_CFG_CHANGE, ASYNC_EVENT_CMPL_EVENT_ID_RESET_NOTIFY, ASYNC_EVENT_CMPL_EVENT_ID_ERROR_RECOVERY, + ASYNC_EVENT_CMPL_EVENT_ID_DEBUG_NOTIFICATION, ASYNC_EVENT_CMPL_EVENT_ID_RING_MONITOR_MSG, + ASYNC_EVENT_CMPL_EVENT_ID_ECHO_REQUEST, };
static struct workqueue_struct *bnxt_pf_wq; @@@ -1267,7 -1265,8 +1267,7 @@@ static void bnxt_tpa_start(struct bnxt } else { tpa_info->hash_type = PKT_HASH_TYPE_NONE; tpa_info->gso_type = 0; - if (netif_msg_rx_err(bp)) - netdev_warn(bp->dev, "TPA packet without valid hash\n"); + netif_warn(bp, rx_err, bp->dev, "TPA packet without valid hash\n"); } tpa_info->flags2 = le32_to_cpu(tpa_start1->rx_tpa_start_cmp_flags2); tpa_info->metadata = le32_to_cpu(tpa_start1->rx_tpa_start_cmp_metadata); @@@ -2022,9 -2021,10 +2022,9 @@@ static int bnxt_async_event_process(str goto async_event_process_exit; set_bit(BNXT_RESET_TASK_SILENT_SP_EVENT, &bp->sp_event); break; - case ASYNC_EVENT_CMPL_EVENT_ID_RESET_NOTIFY: - if (netif_msg_hw(bp)) - netdev_warn(bp->dev, "Received RESET_NOTIFY event, data1: 0x%x, data2: 0x%x\n", - data1, data2); + case ASYNC_EVENT_CMPL_EVENT_ID_RESET_NOTIFY: { + char *fatal_str = "non-fatal"; + if (!bp->fw_health) goto async_event_process_exit;
@@@ -2036,17 -2036,14 +2036,17 @@@ if (!bp->fw_reset_max_dsecs) bp->fw_reset_max_dsecs = BNXT_DFLT_FW_RST_MAX_DSECS; if (EVENT_DATA1_RESET_NOTIFY_FATAL(data1)) { - netdev_warn(bp->dev, "Firmware fatal reset event received\n"); + fatal_str = "fatal"; set_bit(BNXT_STATE_FW_FATAL_COND, &bp->state); - } else { - netdev_warn(bp->dev, "Firmware non-fatal reset event received, max wait time %d msec\n", - bp->fw_reset_max_dsecs * 100); } + netif_warn(bp, hw, bp->dev, + "Firmware %s reset event, data1: 0x%x, data2: 0x%x, min wait %u ms, max wait %u ms\n", + fatal_str, data1, data2, + bp->fw_reset_min_dsecs * 100, + bp->fw_reset_max_dsecs * 100); set_bit(BNXT_FW_RESET_NOTIFY_SP_EVENT, &bp->sp_event); break; + } case ASYNC_EVENT_CMPL_EVENT_ID_ERROR_RECOVERY: { struct bnxt_fw_health *fw_health = bp->fw_health;
@@@ -2055,11 -2052,16 +2055,11 @@@
fw_health->enabled = EVENT_DATA1_RECOVERY_ENABLED(data1); fw_health->master = EVENT_DATA1_RECOVERY_MASTER_FUNC(data1); - if (!fw_health->enabled) + if (!fw_health->enabled) { + netif_info(bp, drv, bp->dev, + "Error recovery info: error recovery[0]\n"); break; - - if (netif_msg_drv(bp)) - netdev_info(bp->dev, "Error recovery info: error recovery[%d], master[%d], reset count[0x%x], health status: 0x%x\n", - fw_health->enabled, fw_health->master, - bnxt_fw_health_readl(bp, - BNXT_FW_RESET_CNT_REG), - bnxt_fw_health_readl(bp, - BNXT_FW_HEALTH_REG)); + } fw_health->tmr_multiplier = DIV_ROUND_UP(fw_health->polling_dsecs * HZ, bp->current_interval * 10); @@@ -2068,17 -2070,8 +2068,17 @@@ bnxt_fw_health_readl(bp, BNXT_FW_HEARTBEAT_REG); fw_health->last_fw_reset_cnt = bnxt_fw_health_readl(bp, BNXT_FW_RESET_CNT_REG); + netif_info(bp, drv, bp->dev, + "Error recovery info: error recovery[1], master[%d], reset count[%u], health status: 0x%x\n", + fw_health->master, fw_health->last_fw_reset_cnt, + bnxt_fw_health_readl(bp, BNXT_FW_HEALTH_REG)); goto async_event_process_exit; } + case ASYNC_EVENT_CMPL_EVENT_ID_DEBUG_NOTIFICATION: + netif_notice(bp, hw, bp->dev, + "Received firmware debug notification, data1: 0x%x, data2: 0x%x\n", + data1, data2); + goto async_event_process_exit; case ASYNC_EVENT_CMPL_EVENT_ID_RING_MONITOR_MSG: { struct bnxt_rx_ring_info *rxr; u16 grp_idx; @@@ -2101,20 -2094,6 +2101,20 @@@ bnxt_sched_reset(bp, rxr); goto async_event_process_exit; } + case ASYNC_EVENT_CMPL_EVENT_ID_ECHO_REQUEST: { + struct bnxt_fw_health *fw_health = bp->fw_health; + + netif_notice(bp, hw, bp->dev, + "Received firmware echo request, data1: 0x%x, data2: 0x%x\n", + data1, data2); + if (fw_health) { + fw_health->echo_req_data1 = data1; + fw_health->echo_req_data2 = data2; + set_bit(BNXT_FW_ECHO_REQUEST_SP_EVENT, &bp->sp_event); + break; + } + goto async_event_process_exit; + } default: goto async_event_process_exit; } @@@ -2415,10 -2394,6 +2415,10 @@@ static int bnxt_poll(struct napi_struc struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; int work_done = 0;
+ if (unlikely(test_bit(BNXT_STATE_FW_FATAL_COND, &bp->state))) { + napi_complete(napi); + return 0; + } while (1) { work_done += bnxt_poll_work(bp, cpr, budget - work_done);
@@@ -2493,10 -2468,6 +2493,10 @@@ static int bnxt_poll_p5(struct napi_str int work_done = 0; u32 cons;
+ if (unlikely(test_bit(BNXT_STATE_FW_FATAL_COND, &bp->state))) { + napi_complete(napi); + return 0; + } if (cpr->has_more_work) { cpr->has_more_work = 0; work_done = __bnxt_poll_cqs(bp, bnapi, budget); @@@ -2704,23 -2675,6 +2704,23 @@@ static void bnxt_free_skbs(struct bnxt bnxt_free_rx_skbs(bp); }
+static void bnxt_init_ctx_mem(struct bnxt_mem_init *mem_init, void *p, int len) +{ + u8 init_val = mem_init->init_val; + u16 offset = mem_init->offset; + u8 *p2 = p; + int i; + + if (!init_val) + return; + if (offset == BNXT_MEM_INVALID_OFFSET) { + memset(p, init_val, len); + return; + } + for (i = 0; i < len; i += mem_init->size) + *(p2 + i + offset) = init_val; +} + static void bnxt_free_ring(struct bnxt *bp, struct bnxt_ring_mem_info *rmem) { struct pci_dev *pdev = bp->pdev; @@@ -2780,9 -2734,9 +2780,9 @@@ static int bnxt_alloc_ring(struct bnxt if (!rmem->pg_arr[i]) return -ENOMEM;
- if (rmem->init_val) - memset(rmem->pg_arr[i], rmem->init_val, - rmem->page_size); + if (rmem->mem_init) + bnxt_init_ctx_mem(rmem->mem_init, rmem->pg_arr[i], + rmem->page_size); if (rmem->nr_pages > 1 || rmem->depth > 0) { if (i == rmem->nr_pages - 2 && (rmem->flags & BNXT_RMEM_RING_PTE_FLAG)) @@@ -4318,9 -4272,6 +4318,9 @@@ static void bnxt_disable_int_sync(struc { int i;
+ if (!bp->irq_tbl) + return; + atomic_inc(&bp->intr_sem);
bnxt_disable_int(bp); @@@ -4474,8 -4425,6 +4474,8 @@@ static int bnxt_hwrm_do_send_msg(struc
if (!timeout) timeout = DFLT_HWRM_CMD_TIMEOUT; + /* Limit timeout to an upper limit */ + timeout = min(timeout, HWRM_CMD_MAX_TIMEOUT); /* convert timeout to usec */ timeout *= 1000;
@@@ -6783,39 -6732,6 +6783,39 @@@ func_qcfg_exit return rc; }
+static void bnxt_init_ctx_initializer(struct bnxt_ctx_mem_info *ctx, + struct hwrm_func_backing_store_qcaps_output *resp) +{ + struct bnxt_mem_init *mem_init; + u16 init_mask; + u8 init_val; + u8 *offset; + int i; + + init_val = resp->ctx_kind_initializer; + init_mask = le16_to_cpu(resp->ctx_init_mask); + offset = &resp->qp_init_offset; + mem_init = &ctx->mem_init[BNXT_CTX_MEM_INIT_QP]; + for (i = 0; i < BNXT_CTX_MEM_INIT_MAX; i++, mem_init++, offset++) { + mem_init->init_val = init_val; + mem_init->offset = BNXT_MEM_INVALID_OFFSET; + if (!init_mask) + continue; + if (i == BNXT_CTX_MEM_INIT_STAT) + offset = &resp->stat_init_offset; + if (init_mask & (1 << i)) + mem_init->offset = *offset * 4; + else + mem_init->init_val = 0; + } + ctx->mem_init[BNXT_CTX_MEM_INIT_QP].size = ctx->qp_entry_size; + ctx->mem_init[BNXT_CTX_MEM_INIT_SRQ].size = ctx->srq_entry_size; + ctx->mem_init[BNXT_CTX_MEM_INIT_CQ].size = ctx->cq_entry_size; + ctx->mem_init[BNXT_CTX_MEM_INIT_VNIC].size = ctx->vnic_entry_size; + ctx->mem_init[BNXT_CTX_MEM_INIT_STAT].size = ctx->stat_entry_size; + ctx->mem_init[BNXT_CTX_MEM_INIT_MRAV].size = ctx->mrav_entry_size; +} + static int bnxt_hwrm_func_backing_store_qcaps(struct bnxt *bp) { struct hwrm_func_backing_store_qcaps_input req = {0}; @@@ -6870,9 -6786,7 +6870,9 @@@ le16_to_cpu(resp->mrav_num_entries_units); ctx->tim_entry_size = le16_to_cpu(resp->tim_entry_size); ctx->tim_max_entries = le32_to_cpu(resp->tim_max_entries); - ctx->ctx_kind_initializer = resp->ctx_kind_initializer; + + bnxt_init_ctx_initializer(ctx, resp); + ctx->tqm_fp_rings_count = resp->tqm_fp_rings_count; if (!ctx->tqm_fp_rings_count) ctx->tqm_fp_rings_count = bp->max_q; @@@ -6902,9 -6816,6 +6902,9 @@@ static void bnxt_hwrm_set_pg_attr(struc { u8 pg_size = 0;
+ if (!rmem->nr_pages) + return; + if (BNXT_PAGE_SHIFT == 13) pg_size = 1 << 4; else if (BNXT_PAGE_SIZE == 16) @@@ -6934,7 -6845,6 +6934,7 @@@ static int bnxt_hwrm_func_backing_store struct hwrm_func_backing_store_cfg_input req = {0}; struct bnxt_ctx_mem_info *ctx = bp->ctx; struct bnxt_ctx_pg_info *ctx_pg; + u32 req_len = sizeof(req); __le32 *num_entries; __le64 *pg_dir; u32 flags = 0; @@@ -6945,8 -6855,6 +6945,8 @@@ if (!ctx) return 0;
+ if (req_len > bp->hwrm_max_ext_req_len) + req_len = BNXT_BACKING_STORE_CFG_LEGACY_LEN; bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_BACKING_STORE_CFG, -1, -1); req.enables = cpu_to_le32(enables);
@@@ -7030,7 -6938,7 +7030,7 @@@ bnxt_hwrm_set_pg_attr(&ctx_pg->ring_mem, pg_attr, pg_dir); } req.flags = cpu_to_le32(flags); - return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + return hwrm_send_message(bp, &req, req_len, HWRM_CMD_TIMEOUT); }
static int bnxt_alloc_ctx_mem_blk(struct bnxt *bp, @@@ -7049,7 -6957,7 +7049,7 @@@
static int bnxt_alloc_ctx_pg_tbls(struct bnxt *bp, struct bnxt_ctx_pg_info *ctx_pg, u32 mem_size, - u8 depth, bool use_init_val) + u8 depth, struct bnxt_mem_init *mem_init) { struct bnxt_ring_mem_info *rmem = &ctx_pg->ring_mem; int rc; @@@ -7087,7 -6995,8 +7087,7 @@@ rmem->pg_tbl_map = ctx_pg->ctx_dma_arr[i]; rmem->depth = 1; rmem->nr_pages = MAX_CTX_PAGES; - if (use_init_val) - rmem->init_val = bp->ctx->ctx_kind_initializer; + rmem->mem_init = mem_init; if (i == (nr_tbls - 1)) { int rem = ctx_pg->nr_pages % MAX_CTX_PAGES;
@@@ -7102,7 -7011,8 +7102,7 @@@ rmem->nr_pages = DIV_ROUND_UP(mem_size, BNXT_PAGE_SIZE); if (rmem->nr_pages > 1 || depth) rmem->depth = 1; - if (use_init_val) - rmem->init_val = bp->ctx->ctx_kind_initializer; + rmem->mem_init = mem_init; rc = bnxt_alloc_ctx_mem_blk(bp, ctx_pg); } return rc; @@@ -7166,7 -7076,6 +7166,7 @@@ static int bnxt_alloc_ctx_mem(struct bn { struct bnxt_ctx_pg_info *ctx_pg; struct bnxt_ctx_mem_info *ctx; + struct bnxt_mem_init *init; u32 mem_size, ena, entries; u32 entries_sp, min; u32 num_mr, num_ah; @@@ -7194,54 -7103,39 +7194,54 @@@ ctx_pg = &ctx->qp_mem; ctx_pg->entries = ctx->qp_min_qp1_entries + ctx->qp_max_l2_entries + extra_qps; - mem_size = ctx->qp_entry_size * ctx_pg->entries; - rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, pg_lvl, true); - if (rc) - return rc; + if (ctx->qp_entry_size) { + mem_size = ctx->qp_entry_size * ctx_pg->entries; + init = &ctx->mem_init[BNXT_CTX_MEM_INIT_QP]; + rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, pg_lvl, init); + if (rc) + return rc; + }
ctx_pg = &ctx->srq_mem; ctx_pg->entries = ctx->srq_max_l2_entries + extra_srqs; - mem_size = ctx->srq_entry_size * ctx_pg->entries; - rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, pg_lvl, true); - if (rc) - return rc; + if (ctx->srq_entry_size) { + mem_size = ctx->srq_entry_size * ctx_pg->entries; + init = &ctx->mem_init[BNXT_CTX_MEM_INIT_SRQ]; + rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, pg_lvl, init); + if (rc) + return rc; + }
ctx_pg = &ctx->cq_mem; ctx_pg->entries = ctx->cq_max_l2_entries + extra_qps * 2; - mem_size = ctx->cq_entry_size * ctx_pg->entries; - rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, pg_lvl, true); - if (rc) - return rc; + if (ctx->cq_entry_size) { + mem_size = ctx->cq_entry_size * ctx_pg->entries; + init = &ctx->mem_init[BNXT_CTX_MEM_INIT_CQ]; + rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, pg_lvl, init); + if (rc) + return rc; + }
ctx_pg = &ctx->vnic_mem; ctx_pg->entries = ctx->vnic_max_vnic_entries + ctx->vnic_max_ring_table_entries; - mem_size = ctx->vnic_entry_size * ctx_pg->entries; - rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 1, true); - if (rc) - return rc; + if (ctx->vnic_entry_size) { + mem_size = ctx->vnic_entry_size * ctx_pg->entries; + init = &ctx->mem_init[BNXT_CTX_MEM_INIT_VNIC]; + rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 1, init); + if (rc) + return rc; + }
ctx_pg = &ctx->stat_mem; ctx_pg->entries = ctx->stat_max_entries; - mem_size = ctx->stat_entry_size * ctx_pg->entries; - rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 1, true); - if (rc) - return rc; + if (ctx->stat_entry_size) { + mem_size = ctx->stat_entry_size * ctx_pg->entries; + init = &ctx->mem_init[BNXT_CTX_MEM_INIT_STAT]; + rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 1, init); + if (rc) + return rc; + }
ena = 0; if (!(bp->flags & BNXT_FLAG_ROCE_CAP)) @@@ -7254,13 -7148,10 +7254,13 @@@ num_mr = 1024 * 256; num_ah = 1024 * 128; ctx_pg->entries = num_mr + num_ah; - mem_size = ctx->mrav_entry_size * ctx_pg->entries; - rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 2, true); - if (rc) - return rc; + if (ctx->mrav_entry_size) { + mem_size = ctx->mrav_entry_size * ctx_pg->entries; + init = &ctx->mem_init[BNXT_CTX_MEM_INIT_MRAV]; + rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 2, init); + if (rc) + return rc; + } ena = FUNC_BACKING_STORE_CFG_REQ_ENABLES_MRAV; if (ctx->mrav_num_entries_units) ctx_pg->entries = @@@ -7269,12 -7160,10 +7269,12 @@@
ctx_pg = &ctx->tim_mem; ctx_pg->entries = ctx->qp_mem.entries; - mem_size = ctx->tim_entry_size * ctx_pg->entries; - rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 1, false); - if (rc) - return rc; + if (ctx->tim_entry_size) { + mem_size = ctx->tim_entry_size * ctx_pg->entries; + rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 1, NULL); + if (rc) + return rc; + } ena |= FUNC_BACKING_STORE_CFG_REQ_ENABLES_TIM;
skip_rdma: @@@ -7288,13 -7177,10 +7288,13 @@@ for (i = 0; i < ctx->tqm_fp_rings_count + 1; i++) { ctx_pg = ctx->tqm_mem[i]; ctx_pg->entries = i ? entries : entries_sp; - mem_size = ctx->tqm_entry_size * ctx_pg->entries; - rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 1, false); - if (rc) - return rc; + if (ctx->tqm_entry_size) { + mem_size = ctx->tqm_entry_size * ctx_pg->entries; + rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 1, + NULL); + if (rc) + return rc; + } ena |= FUNC_BACKING_STORE_CFG_REQ_ENABLES_TQM_SP << i; } ena |= FUNC_BACKING_STORE_CFG_REQ_DFLT_ENABLES; @@@ -7552,22 -7438,9 +7552,22 @@@ static void bnxt_try_map_fw_health_reg(
sig = readl(hs + offsetof(struct hcomm_status, sig_ver)); if ((sig & HCOMM_STATUS_SIGNATURE_MASK) != HCOMM_STATUS_SIGNATURE_VAL) { - if (bp->fw_health) - bp->fw_health->status_reliable = false; - return; + if (!bp->chip_num) { + __bnxt_map_fw_health_reg(bp, BNXT_GRC_REG_BASE); + bp->chip_num = readl(bp->bar0 + + BNXT_FW_HEALTH_WIN_BASE + + BNXT_GRC_REG_CHIP_NUM); + } + if (!BNXT_CHIP_P5(bp)) { + if (bp->fw_health) + bp->fw_health->status_reliable = false; + return; + } + status_loc = BNXT_GRC_REG_STATUS_P5 | + BNXT_FW_HEALTH_REG_TYPE_BAR0; + } else { + status_loc = readl(hs + offsetof(struct hcomm_status, + fw_status_loc)); }
if (__bnxt_alloc_fw_health(bp)) { @@@ -7575,6 -7448,7 +7575,6 @@@ return; }
- status_loc = readl(hs + offsetof(struct hcomm_status, fw_status_loc)); bp->fw_health->regs[BNXT_FW_HEALTH_REG] = status_loc; reg_type = BNXT_FW_HEALTH_REG_TYPE(status_loc); if (reg_type == BNXT_FW_HEALTH_REG_TYPE_GRC) { @@@ -8729,7 -8603,7 +8729,7 @@@ msix_setup_exit
static int bnxt_init_inta(struct bnxt *bp) { - bp->irq_tbl = kcalloc(1, sizeof(struct bnxt_irq), GFP_KERNEL); + bp->irq_tbl = kzalloc(sizeof(struct bnxt_irq), GFP_KERNEL); if (!bp->irq_tbl) return -ENOMEM;
@@@ -8937,8 -8811,7 +8937,8 @@@ static void bnxt_disable_napi(struct bn { int i;
- if (!bp->bnapi) + if (!bp->bnapi || + test_and_set_bit(BNXT_STATE_NAPI_DISABLED, &bp->state)) return;
for (i = 0; i < bp->cp_nr_rings; i++) { @@@ -8955,7 -8828,6 +8955,7 @@@ static void bnxt_enable_napi(struct bnx { int i;
+ clear_bit(BNXT_STATE_NAPI_DISABLED, &bp->state); for (i = 0; i < bp->cp_nr_rings; i++) { struct bnxt_napi *bnapi = bp->bnapi[i]; struct bnxt_cp_ring_info *cpr; @@@ -8984,9 -8856,10 +8984,10 @@@ void bnxt_tx_disable(struct bnxt *bp txr->dev_state = BNXT_DEV_STATE_CLOSING; } } + /* Drop carrier first to prevent TX timeout */ + netif_carrier_off(bp->dev); /* Stop all TX queues */ netif_tx_disable(bp->dev); - netif_carrier_off(bp->dev); }
void bnxt_tx_enable(struct bnxt *bp) @@@ -9462,60 -9335,13 +9463,60 @@@ static int bnxt_hwrm_shutdown_link(stru
static int bnxt_fw_init_one(struct bnxt *bp);
+static int bnxt_fw_reset_via_optee(struct bnxt *bp) +{ +#ifdef CONFIG_TEE_BNXT_FW + int rc = tee_bnxt_fw_load(); + + if (rc) + netdev_err(bp->dev, "Failed FW reset via OP-TEE, rc=%d\n", rc); + + return rc; +#else + netdev_err(bp->dev, "OP-TEE not supported\n"); + return -ENODEV; +#endif +} + +static int bnxt_try_recover_fw(struct bnxt *bp) +{ + if (bp->fw_health && bp->fw_health->status_reliable) { + int retry = 0, rc; + u32 sts; + + mutex_lock(&bp->hwrm_cmd_lock); + do { + rc = __bnxt_hwrm_ver_get(bp, true); + sts = bnxt_fw_health_readl(bp, BNXT_FW_HEALTH_REG); + if (!sts || !BNXT_FW_IS_BOOTING(sts)) + break; + retry++; + } while (rc == -EBUSY && retry < BNXT_FW_RETRY); + mutex_unlock(&bp->hwrm_cmd_lock); + + if (!BNXT_FW_IS_HEALTHY(sts)) { + netdev_err(bp->dev, + "Firmware not responding, status: 0x%x\n", + sts); + rc = -ENODEV; + } + if (sts & FW_STATUS_REG_CRASHED_NO_MASTER) { + netdev_warn(bp->dev, "Firmware recover via OP-TEE requested\n"); + return bnxt_fw_reset_via_optee(bp); + } + return rc; + } + + return -ENODEV; +} + static int bnxt_hwrm_if_change(struct bnxt *bp, bool up) { struct hwrm_func_drv_if_change_output *resp = bp->hwrm_cmd_resp_addr; struct hwrm_func_drv_if_change_input req = {0}; bool resc_reinit = false, fw_reset = false; + int rc, retry = 0; u32 flags = 0; - int rc;
if (!(bp->fw_cap & BNXT_FW_CAP_IF_CHANGE)) return 0; @@@ -9524,25 -9350,10 +9525,25 @@@ if (up) req.flags = cpu_to_le32(FUNC_DRV_IF_CHANGE_REQ_FLAGS_UP); mutex_lock(&bp->hwrm_cmd_lock); - rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + while (retry < BNXT_FW_IF_RETRY) { + rc = _hwrm_send_message(bp, &req, sizeof(req), + HWRM_CMD_TIMEOUT); + if (rc != -EAGAIN) + break; + + msleep(50); + retry++; + } if (!rc) flags = le32_to_cpu(resp->flags); mutex_unlock(&bp->hwrm_cmd_lock); + + if (rc == -EAGAIN) + return rc; + if (rc && up) { + rc = bnxt_try_recover_fw(bp); + fw_reset = true; + } if (rc) return rc;
@@@ -9882,25 -9693,6 +9883,25 @@@ static void bnxt_preset_reg_win(struct
static int bnxt_init_dflt_ring_mode(struct bnxt *bp);
+static int bnxt_reinit_after_abort(struct bnxt *bp) +{ + int rc; + + if (test_bit(BNXT_STATE_IN_FW_RESET, &bp->state)) + return -EBUSY; + + rc = bnxt_fw_init_one(bp); + if (!rc) { + bnxt_clear_int_mode(bp); + rc = bnxt_init_int_mode(bp); + if (!rc) { + clear_bit(BNXT_STATE_ABORT_ERR, &bp->state); + set_bit(BNXT_STATE_FW_RESET_DET, &bp->state); + } + } + return rc; +} + static int __bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init) { int rc = 0; @@@ -10059,14 -9851,8 +10060,14 @@@ static int bnxt_open(struct net_device int rc;
if (test_bit(BNXT_STATE_ABORT_ERR, &bp->state)) { - netdev_err(bp->dev, "A previous firmware reset did not complete, aborting\n"); - return -ENODEV; + rc = bnxt_reinit_after_abort(bp); + if (rc) { + if (rc == -EBUSY) + netdev_err(bp->dev, "A previous firmware reset has not completed, aborting\n"); + else + netdev_err(bp->dev, "Failed to reinitialize after aborted firmware reset\n"); + return -ENODEV; + } }
rc = bnxt_hwrm_if_change(bp, true); @@@ -11003,23 -10789,11 +11004,23 @@@ static void bnxt_rx_ring_reset(struct b static void bnxt_fw_reset_close(struct bnxt *bp) { bnxt_ulp_stop(bp); - /* When firmware is fatal state, disable PCI device to prevent - * any potential bad DMAs before freeing kernel memory. + /* When firmware is in fatal state, quiesce device and disable + * bus master to prevent any potential bad DMAs before freeing + * kernel memory. */ - if (test_bit(BNXT_STATE_FW_FATAL_COND, &bp->state)) + if (test_bit(BNXT_STATE_FW_FATAL_COND, &bp->state)) { + u16 val = 0; + + pci_read_config_word(bp->pdev, PCI_SUBSYSTEM_ID, &val); + if (val == 0xffff) + bp->fw_reset_min_dsecs = 0; + bnxt_tx_disable(bp); + bnxt_disable_napi(bp); + bnxt_disable_int_sync(bp); + bnxt_free_irq(bp); + bnxt_clear_int_mode(bp); pci_disable_device(bp->pdev); + } __bnxt_close_nic(bp, true, false); bnxt_clear_int_mode(bp); bnxt_hwrm_func_drv_unrgtr(bp); @@@ -11224,17 -10998,6 +11225,17 @@@ static void bnxt_init_ethtool_link_sett link_info->req_flow_ctrl = link_info->force_pause_setting; }
+static void bnxt_fw_echo_reply(struct bnxt *bp) +{ + struct bnxt_fw_health *fw_health = bp->fw_health; + struct hwrm_func_echo_response_input req = {0}; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_ECHO_RESPONSE, -1, -1); + req.event_data1 = cpu_to_le32(fw_health->echo_req_data1); + req.event_data2 = cpu_to_le32(fw_health->echo_req_data2); + hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); +} + static void bnxt_sp_task(struct work_struct *work) { struct bnxt *bp = container_of(work, struct bnxt, sp_task); @@@ -11302,9 -11065,6 +11303,9 @@@ if (test_and_clear_bit(BNXT_RING_COAL_NOW_SP_EVENT, &bp->sp_event)) bnxt_chk_missed_irq(bp);
+ if (test_and_clear_bit(BNXT_FW_ECHO_REQUEST_SP_EVENT, &bp->sp_event)) + bnxt_fw_echo_reply(bp); + /* These functions below will clear BNXT_STATE_IN_SP_TASK. They * must be the last functions to be called before exiting. */ @@@ -11421,6 -11181,21 +11422,6 @@@ static void bnxt_init_dflt_coal(struct bp->stats_coal_ticks = BNXT_DEF_STATS_COAL_TICKS; }
-static int bnxt_fw_reset_via_optee(struct bnxt *bp) -{ -#ifdef CONFIG_TEE_BNXT_FW - int rc = tee_bnxt_fw_load(); - - if (rc) - netdev_err(bp->dev, "Failed FW reset via OP-TEE, rc=%d\n", rc); - - return rc; -#else - netdev_err(bp->dev, "OP-TEE not supported\n"); - return -ENODEV; -#endif -} - static int bnxt_fw_init_one_p1(struct bnxt *bp) { int rc; @@@ -11429,10 -11204,19 +11430,10 @@@ rc = bnxt_hwrm_ver_get(bp); bnxt_try_map_fw_health_reg(bp); if (rc) { - if (bp->fw_health && bp->fw_health->status_reliable) { - u32 sts = bnxt_fw_health_readl(bp, BNXT_FW_HEALTH_REG); - - netdev_err(bp->dev, - "Firmware not responding, status: 0x%x\n", - sts); - if (sts & FW_STATUS_REG_CRASHED_NO_MASTER) { - netdev_warn(bp->dev, "Firmware recover via OP-TEE requested\n"); - rc = bnxt_fw_reset_via_optee(bp); - if (!rc) - rc = bnxt_hwrm_ver_get(bp); - } - } + rc = bnxt_try_recover_fw(bp); + if (rc) + return rc; + rc = bnxt_hwrm_ver_get(bp); if (rc) return rc; } @@@ -11632,12 -11416,6 +11633,12 @@@ static void bnxt_reset_all(struct bnxt bp->fw_reset_timestamp = jiffies; }
+static bool bnxt_fw_reset_timeout(struct bnxt *bp) +{ + return time_after(jiffies, bp->fw_reset_timestamp + + (bp->fw_reset_max_dsecs * HZ / 10)); +} + static void bnxt_fw_reset_task(struct work_struct *work) { struct bnxt *bp = container_of(work, struct bnxt, fw_reset_task.work); @@@ -11659,7 -11437,8 +11660,7 @@@ bp->fw_reset_timestamp)); goto fw_reset_abort; } else if (n > 0) { - if (time_after(jiffies, bp->fw_reset_timestamp + - (bp->fw_reset_max_dsecs * HZ / 10))) { + if (bnxt_fw_reset_timeout(bp)) { clear_bit(BNXT_STATE_IN_FW_RESET, &bp->state); bp->fw_reset_state = 0; netdev_err(bp->dev, "Firmware reset aborted, bnxt_get_registered_vfs() returns %d\n", @@@ -11688,7 -11467,8 +11689,7 @@@
val = bnxt_fw_health_readl(bp, BNXT_FW_HEALTH_REG); if (!(val & BNXT_FW_STATUS_SHUTDOWN) && - !time_after(jiffies, bp->fw_reset_timestamp + - (bp->fw_reset_max_dsecs * HZ / 10))) { + !bnxt_fw_reset_timeout(bp)) { bnxt_queue_fw_reset_work(bp, HZ / 5); return; } @@@ -11712,20 -11492,6 +11713,20 @@@ if (test_bit(BNXT_STATE_FW_FATAL_COND, &bp->state)) { u32 val;
+ if (!bp->fw_reset_min_dsecs) { + u16 val; + + pci_read_config_word(bp->pdev, PCI_SUBSYSTEM_ID, + &val); + if (val == 0xffff) { + if (bnxt_fw_reset_timeout(bp)) { + netdev_err(bp->dev, "Firmware reset aborted, PCI config space invalid\n"); + goto fw_reset_abort; + } + bnxt_queue_fw_reset_work(bp, HZ / 1000); + return; + } + } val = bnxt_fw_health_readl(bp, BNXT_FW_RESET_INPROG_REG); if (val) @@@ -11744,7 -11510,8 +11745,7 @@@ bp->hwrm_cmd_timeout = SHORT_HWRM_CMD_TIMEOUT; rc = __bnxt_hwrm_ver_get(bp, true); if (rc) { - if (time_after(jiffies, bp->fw_reset_timestamp + - (bp->fw_reset_max_dsecs * HZ / 10))) { + if (bnxt_fw_reset_timeout(bp)) { netdev_err(bp->dev, "Firmware reset aborted\n"); goto fw_reset_abort_status; } @@@ -12325,6 -12092,8 +12326,6 @@@ static const struct net_device_ops bnxt #ifdef CONFIG_RFS_ACCEL .ndo_rx_flow_steer = bnxt_rx_flow_steer, #endif - .ndo_udp_tunnel_add = udp_tunnel_nic_add_port, - .ndo_udp_tunnel_del = udp_tunnel_nic_del_port, .ndo_bpf = bnxt_xdp, .ndo_xdp_xmit = bnxt_xdp_xmit, .ndo_bridge_getlink = bnxt_bridge_getlink, @@@ -12776,6 -12545,9 +12777,6 @@@ static int bnxt_init_one(struct pci_de dev->ethtool_ops = &bnxt_ethtool_ops; pci_set_drvdata(pdev, dev);
- if (BNXT_PF(bp)) - bnxt_vpd_read_info(bp); - rc = bnxt_alloc_hwrm_resources(bp); if (rc) goto init_err_pci_clean; @@@ -12787,9 -12559,6 +12788,9 @@@ if (rc) goto init_err_pci_clean;
+ if (BNXT_PF(bp)) + bnxt_vpd_read_info(bp); + if (BNXT_CHIP_P5(bp)) { bp->flags |= BNXT_FLAG_CHIP_P5; if (BNXT_CHIP_SR2(bp)) diff --combined drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c index 90a31b4a3020,a9bcf887d2fb..64381be935a8 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@@ -44,20 -44,21 +44,20 @@@ static int bnxt_fw_reporter_diagnose(st struct netlink_ext_ack *extack) { struct bnxt *bp = devlink_health_reporter_priv(reporter); - u32 val, health_status; + u32 val; int rc;
if (test_bit(BNXT_STATE_IN_FW_RESET, &bp->state)) return 0;
val = bnxt_fw_health_readl(bp, BNXT_FW_HEALTH_REG); - health_status = val & 0xffff;
- if (health_status < BNXT_FW_STATUS_HEALTHY) { + if (BNXT_FW_IS_BOOTING(val)) { rc = devlink_fmsg_string_pair_put(fmsg, "Description", "Not yet completed initialization"); if (rc) return rc; - } else if (health_status > BNXT_FW_STATUS_HEALTHY) { + } else if (BNXT_FW_IS_ERR(val)) { rc = devlink_fmsg_string_pair_put(fmsg, "Description", "Encountered fatal error and cannot recover"); if (rc) @@@ -471,8 -472,8 +471,8 @@@ static int bnxt_dl_info_get(struct devl if (BNXT_PF(bp) && !bnxt_hwrm_get_nvm_cfg_ver(bp, &nvm_cfg_ver)) { u32 ver = nvm_cfg_ver.vu32;
- sprintf(buf, "%X.%X.%X", (ver >> 16) & 0xF, (ver >> 8) & 0xF, - ver & 0xF); + sprintf(buf, "%d.%d.%d", (ver >> 16) & 0xf, (ver >> 8) & 0xf, + ver & 0xf); rc = bnxt_dl_info_put(bp, req, BNXT_VERSION_STORED, DEVLINK_INFO_VERSION_GENERIC_FW_PSID, buf); diff --combined drivers/net/ethernet/chelsio/cxgb4/sge.c index 550cc065649f,3334c9e2152a..256fae15e032 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@@ -1600,8 -1600,7 +1600,8 @@@ static netdev_tx_t cxgb4_eth_xmit(struc * has opened up. */ eth_txq_stop(q); - wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F; + if (chip_ver > CHELSIO_T5) + wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F; }
wr = (void *)&q->q.desc[q->q.pidx]; @@@ -1833,7 -1832,6 +1833,7 @@@ static netdev_tx_t cxgb4_vf_eth_xmit(st struct adapter *adapter; int qidx, credits, ret; size_t fw_hdr_copy_len; + unsigned int chip_ver; u64 cntrl, *end; u32 wr_mid;
@@@ -1898,7 -1896,6 +1898,7 @@@ goto out_free; }
+ chip_ver = CHELSIO_CHIP_VERSION(adapter->params.chip); wr_mid = FW_WR_LEN16_V(DIV_ROUND_UP(flits, 2)); if (unlikely(credits < ETHTXQ_STOP_THRES)) { /* After we're done injecting the Work Request for this @@@ -1910,8 -1907,7 +1910,8 @@@ * has opened up. */ eth_txq_stop(txq); - wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F; + if (chip_ver > CHELSIO_T5) + wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F; }
/* Start filling in our Work Request. Note that we do _not_ handle @@@ -1964,7 -1960,7 +1964,7 @@@ */ cpl = (void *)(lso + 1);
- if (CHELSIO_CHIP_VERSION(adapter->params.chip) <= CHELSIO_T5) + if (chip_ver <= CHELSIO_T5) cntrl = TXPKT_ETHHDR_LEN_V(eth_xtra_len); else cntrl = T6_TXPKT_ETHHDR_LEN_V(eth_xtra_len); @@@ -2846,17 -2842,22 +2846,22 @@@ int t4_mgmt_tx(struct adapter *adap, st * @skb: the packet * * Returns true if a packet can be sent as an offload WR with immediate - * data. We currently use the same limit as for Ethernet packets. + * data. + * FW_OFLD_TX_DATA_WR limits the payload to 255 bytes due to 8-bit field. + * However, FW_ULPTX_WR commands have a 256 byte immediate only + * payload limit. */ static inline int is_ofld_imm(const struct sk_buff *skb) { struct work_request_hdr *req = (struct work_request_hdr *)skb->data; unsigned long opcode = FW_WR_OP_G(ntohl(req->wr_hi));
- if (opcode == FW_CRYPTO_LOOKASIDE_WR) + if (unlikely(opcode == FW_ULPTX_WR)) + return skb->len <= MAX_IMM_ULPTX_WR_LEN; + else if (opcode == FW_CRYPTO_LOOKASIDE_WR) return skb->len <= SGE_MAX_WR_LEN; else - return skb->len <= MAX_IMM_TX_PKT_LEN; + return skb->len <= MAX_IMM_OFLD_TX_DATA_WR_LEN; }
/** @@@ -3602,25 -3603,6 +3607,25 @@@ static void t4_tx_completion_handler(st }
txq = &s->ethtxq[pi->first_qset + rspq->idx]; + + /* We've got the Hardware Consumer Index Update in the Egress Update + * message. These Egress Update messages will be our sole CIDX Updates + * we get since we don't want to chew up PCIe bandwidth for both Ingress + * Messages and Status Page writes. However, the code which manages + * reclaiming successfully DMA'ed TX Work Requests uses the CIDX value + * stored in the Status Page at the end of the TX Queue. It's easiest + * to simply copy the CIDX Update value from the Egress Update message + * to the Status Page. Also note that no Endian issues need to be + * considered here since both are Big Endian and we're just copying + * bytes consistently ... + */ + if (CHELSIO_CHIP_VERSION(adapter->params.chip) <= CHELSIO_T5) { + struct cpl_sge_egr_update *egr; + + egr = (struct cpl_sge_egr_update *)rsp; + WRITE_ONCE(txq->q.stat->cidx, egr->cidx); + } + t4_sge_eth_txq_egress_update(adapter, txq, -1); }
@@@ -4606,15 -4588,11 +4611,15 @@@ int t4_sge_alloc_eth_txq(struct adapte * write the CIDX Updates into the Status Page at the end of the * TX Queue. */ - c.autoequiqe_to_viid = htonl(FW_EQ_ETH_CMD_AUTOEQUEQE_F | + c.autoequiqe_to_viid = htonl(((chip_ver <= CHELSIO_T5) ? + FW_EQ_ETH_CMD_AUTOEQUIQE_F : + FW_EQ_ETH_CMD_AUTOEQUEQE_F) | FW_EQ_ETH_CMD_VIID_V(pi->viid));
c.fetchszm_to_iqid = - htonl(FW_EQ_ETH_CMD_HOSTFCMODE_V(HOSTFCMODE_STATUS_PAGE_X) | + htonl(FW_EQ_ETH_CMD_HOSTFCMODE_V((chip_ver <= CHELSIO_T5) ? + HOSTFCMODE_INGRESS_QUEUE_X : + HOSTFCMODE_STATUS_PAGE_X) | FW_EQ_ETH_CMD_PCIECHN_V(pi->tx_chan) | FW_EQ_ETH_CMD_FETCHRO_F | FW_EQ_ETH_CMD_IQID_V(iqid));
@@@ -4625,7 -4603,6 +4630,7 @@@ : FETCHBURSTMIN_64B_T6_X) | FW_EQ_ETH_CMD_FBMAX_V(FETCHBURSTMAX_512B_X) | FW_EQ_ETH_CMD_CIDXFTHRESH_V(CIDXFLUSHTHRESH_32_X) | + FW_EQ_ETH_CMD_CIDXFTHRESHO_V(chip_ver == CHELSIO_T5) | FW_EQ_ETH_CMD_EQSIZE_V(nentries));
c.eqaddr = cpu_to_be64(txq->q.phys_addr); diff --combined drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 19f74d4cbb4e,f1c2b3c7f7e9..492943bb9c48 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@@ -350,7 -350,7 +350,7 @@@ static u32 dpaa2_eth_run_xdp(struct dpa struct bpf_prog *xdp_prog; struct xdp_buff xdp; u32 xdp_act = XDP_PASS; - int err; + int err, offset;
rcu_read_lock();
@@@ -358,10 -358,14 +358,10 @@@ if (!xdp_prog) goto out;
- xdp.data = vaddr + dpaa2_fd_get_offset(fd); - xdp.data_end = xdp.data + dpaa2_fd_get_len(fd); - xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM; - xdp_set_data_meta_invalid(&xdp); - xdp.rxq = &ch->xdp_rxq; - - xdp.frame_sz = DPAA2_ETH_RX_BUF_RAW_SIZE - - (dpaa2_fd_get_offset(fd) - XDP_PACKET_HEADROOM); + offset = dpaa2_fd_get_offset(fd) - XDP_PACKET_HEADROOM; + xdp_init_buff(&xdp, DPAA2_ETH_RX_BUF_RAW_SIZE - offset, &ch->xdp_rxq); + xdp_prepare_buff(&xdp, vaddr + offset, XDP_PACKET_HEADROOM, + dpaa2_fd_get_len(fd), false);
xdp_act = bpf_prog_run_xdp(xdp_prog, &xdp);
@@@ -395,10 -399,20 +395,20 @@@ xdp.frame_sz = DPAA2_ETH_RX_BUF_RAW_SIZE;
err = xdp_do_redirect(priv->net_dev, &xdp, xdp_prog); - if (unlikely(err)) + if (unlikely(err)) { + addr = dma_map_page(priv->net_dev->dev.parent, + virt_to_page(vaddr), 0, + priv->rx_buf_size, DMA_BIDIRECTIONAL); + if (unlikely(dma_mapping_error(priv->net_dev->dev.parent, addr))) { + free_pages((unsigned long)vaddr, 0); + } else { + ch->buf_count++; + dpaa2_eth_xdp_release_buf(priv, ch, addr); + } ch->stats.xdp_drop++; - else + } else { ch->stats.xdp_redirect++; + } break; }
@@@ -764,11 -778,12 +774,11 @@@ static int dpaa2_eth_build_sg_fd(struc /* Prepare the HW SGT structure */ sgt_buf_size = priv->tx_data_offset + sizeof(struct dpaa2_sg_entry) * num_dma_bufs; - sgt_buf = napi_alloc_frag(sgt_buf_size + DPAA2_ETH_TX_BUF_ALIGN); + sgt_buf = napi_alloc_frag_align(sgt_buf_size, DPAA2_ETH_TX_BUF_ALIGN); if (unlikely(!sgt_buf)) { err = -ENOMEM; goto sgt_buf_alloc_failed; } - sgt_buf = PTR_ALIGN(sgt_buf, DPAA2_ETH_TX_BUF_ALIGN); memset(sgt_buf, 0, sgt_buf_size);
sgt = (struct dpaa2_sg_entry *)(sgt_buf + priv->tx_data_offset); @@@ -1257,22 -1272,6 +1267,22 @@@ static void dpaa2_eth_tx_conf(struct dp percpu_stats->tx_errors++; }
+static int dpaa2_eth_set_rx_vlan_filtering(struct dpaa2_eth_priv *priv, + bool enable) +{ + int err; + + err = dpni_enable_vlan_filter(priv->mc_io, 0, priv->mc_token, enable); + + if (err) { + netdev_err(priv->net_dev, + "dpni_enable_vlan_filter failed\n"); + return err; + } + + return 0; +} + static int dpaa2_eth_set_rx_csum(struct dpaa2_eth_priv *priv, bool enable) { int err; @@@ -1659,7 -1658,7 +1669,7 @@@ set_cgtd * CG taildrop threshold, so it won't interfere with it; we also * want frames in non-PFC enabled traffic classes to be kept in check) */ - td.enable = !tx_pause || (tx_pause && pfc); + td.enable = !tx_pause || pfc; if (priv->rx_cgtd_enabled == td.enable) return;
@@@ -1702,7 -1701,7 +1712,7 @@@ static int dpaa2_eth_link_state_update( /* When we manage the MAC/PHY using phylink there is no need * to manually update the netif_carrier. */ - if (priv->mac) + if (dpaa2_eth_is_type_phy(priv)) goto out;
/* Check link state; speed / duplex changes are not treated yet */ @@@ -1741,7 -1740,7 +1751,7 @@@ static int dpaa2_eth_open(struct net_de priv->dpbp_dev->obj_desc.id, priv->bpid); }
- if (!priv->mac) { + if (!dpaa2_eth_is_type_phy(priv)) { /* We'll only start the txqs when the link is actually ready; * make sure we don't race against the link up notification, * which may come immediately after dpni_enable(); @@@ -1763,7 -1762,7 +1773,7 @@@ goto enable_err; }
- if (priv->mac) + if (dpaa2_eth_is_type_phy(priv)) phylink_start(priv->mac->phylink);
return 0; @@@ -1837,11 -1836,11 +1847,11 @@@ static int dpaa2_eth_stop(struct net_de int dpni_enabled = 0; int retries = 10;
- if (!priv->mac) { + if (dpaa2_eth_is_type_phy(priv)) { + phylink_stop(priv->mac->phylink); + } else { netif_tx_stop_all_queues(net_dev); netif_carrier_off(net_dev); - } else { - phylink_stop(priv->mac->phylink); }
/* On dpni_disable(), the MC firmware will: @@@ -1963,43 -1962,6 +1973,43 @@@ static void dpaa2_eth_add_mc_hw_addr(co } }
+static int dpaa2_eth_rx_add_vid(struct net_device *net_dev, + __be16 vlan_proto, u16 vid) +{ + struct dpaa2_eth_priv *priv = netdev_priv(net_dev); + int err; + + err = dpni_add_vlan_id(priv->mc_io, 0, priv->mc_token, + vid, 0, 0, 0); + + if (err) { + netdev_warn(priv->net_dev, + "Could not add the vlan id %u\n", + vid); + return err; + } + + return 0; +} + +static int dpaa2_eth_rx_kill_vid(struct net_device *net_dev, + __be16 vlan_proto, u16 vid) +{ + struct dpaa2_eth_priv *priv = netdev_priv(net_dev); + int err; + + err = dpni_remove_vlan_id(priv->mc_io, 0, priv->mc_token, vid); + + if (err) { + netdev_warn(priv->net_dev, + "Could not remove the vlan id %u\n", + vid); + return err; + } + + return 0; +} + static void dpaa2_eth_set_rx_mode(struct net_device *net_dev) { struct dpaa2_eth_priv *priv = netdev_priv(net_dev); @@@ -2106,13 -2068,6 +2116,13 @@@ static int dpaa2_eth_set_features(struc bool enable; int err;
+ if (changed & NETIF_F_HW_VLAN_CTAG_FILTER) { + enable = !!(features & NETIF_F_HW_VLAN_CTAG_FILTER); + err = dpaa2_eth_set_rx_vlan_filtering(priv, enable); + if (err) + return err; + } + if (changed & NETIF_F_RXCSUM) { enable = !!(features & NETIF_F_RXCSUM); err = dpaa2_eth_set_rx_csum(priv, enable); @@@ -2170,7 -2125,7 +2180,7 @@@ static int dpaa2_eth_ioctl(struct net_d if (cmd == SIOCSHWTSTAMP) return dpaa2_eth_ts_ioctl(dev, rq, cmd);
- if (priv->mac) + if (dpaa2_eth_is_type_phy(priv)) return phylink_mii_ioctl(priv->mac->phylink, rq, cmd);
return -EOPNOTSUPP; @@@ -2562,8 -2517,6 +2572,8 @@@ static const struct net_device_ops dpaa .ndo_bpf = dpaa2_eth_xdp, .ndo_xdp_xmit = dpaa2_eth_xdp_xmit, .ndo_setup_tc = dpaa2_eth_setup_tc, + .ndo_vlan_rx_add_vid = dpaa2_eth_rx_add_vid, + .ndo_vlan_rx_kill_vid = dpaa2_eth_rx_kill_vid };
static void dpaa2_eth_cdan_cb(struct dpaa2_io_notification_ctx *ctx) @@@ -4072,9 -4025,6 +4082,9 @@@ static int dpaa2_eth_netdev_init(struc NETIF_F_LLTX | NETIF_F_HW_TC; net_dev->hw_features = net_dev->features;
+ if (priv->dpni_attrs.vlan_filter_entries) + net_dev->hw_features |= NETIF_F_HW_VLAN_CTAG_FILTER; + return 0; }
@@@ -4102,11 -4052,10 +4112,11 @@@ static int dpaa2_eth_connect_mac(struc
dpni_dev = to_fsl_mc_device(priv->net_dev->dev.parent); dpmac_dev = fsl_mc_get_endpoint(dpni_dev); - if (IS_ERR_OR_NULL(dpmac_dev) || dpmac_dev->dev.type != &fsl_mc_bus_dpmac_type) - return 0;
- if (dpaa2_mac_is_type_fixed(dpmac_dev, priv->mc_io)) + if (PTR_ERR(dpmac_dev) == -EPROBE_DEFER) + return PTR_ERR(dpmac_dev); + + if (IS_ERR(dpmac_dev) || dpmac_dev->dev.type != &fsl_mc_bus_dpmac_type) return 0;
mac = kzalloc(sizeof(struct dpaa2_mac), GFP_KERNEL); @@@ -4117,38 -4066,23 +4127,38 @@@ mac->mc_io = priv->mc_io; mac->net_dev = priv->net_dev;
- err = dpaa2_mac_connect(mac); - if (err) { - netdev_err(priv->net_dev, "Error connecting to the MAC endpoint\n"); - kfree(mac); - return err; - } + err = dpaa2_mac_open(mac); + if (err) + goto err_free_mac; priv->mac = mac;
+ if (dpaa2_eth_is_type_phy(priv)) { + err = dpaa2_mac_connect(mac); + if (err) { + netdev_err(priv->net_dev, "Error connecting to the MAC endpoint\n"); + goto err_close_mac; + } + } + return 0; + +err_close_mac: + dpaa2_mac_close(mac); + priv->mac = NULL; +err_free_mac: + kfree(mac); + return err; }
static void dpaa2_eth_disconnect_mac(struct dpaa2_eth_priv *priv) { - if (!priv->mac) + if (dpaa2_eth_is_type_phy(priv)) + dpaa2_mac_disconnect(priv->mac); + + if (!dpaa2_eth_has_mac(priv)) return;
- dpaa2_mac_disconnect(priv->mac); + dpaa2_mac_close(priv->mac); kfree(priv->mac); priv->mac = NULL; } @@@ -4177,7 -4111,7 +4187,7 @@@ static irqreturn_t dpni_irq0_handler_th dpaa2_eth_update_tx_fqids(priv);
rtnl_lock(); - if (priv->mac) + if (dpaa2_eth_has_mac(priv)) dpaa2_eth_disconnect_mac(priv); else dpaa2_eth_connect_mac(priv); diff --combined drivers/net/ethernet/ibm/ibmvnic.c index 927d5f36d308,13ae7eee7ef5..5cf7e5a367f0 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@@ -115,7 -115,7 +115,7 @@@ struct ibmvnic_stat
#define IBMVNIC_STAT_OFF(stat) (offsetof(struct ibmvnic_adapter, stats) + \ offsetof(struct ibmvnic_statistics, stat)) -#define IBMVNIC_GET_STAT(a, off) (*((u64 *)(((unsigned long)(a)) + off))) +#define IBMVNIC_GET_STAT(a, off) (*((u64 *)(((unsigned long)(a)) + (off))))
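The parentheses added around 'off' are a macro-hygiene fix. A standalone demonstration (editor's illustration, made-up values) of the failure mode they prevent once the argument is an expression containing an operator with lower precedence than '+':

#include <stdio.h>

#define GET_OLD(a, off) ((unsigned long)(a) + off)
#define GET_NEW(a, off) ((unsigned long)(a) + (off))

int main(void)
{
	/* '+' binds tighter than '<<', so the old expansion computes
	 * (100 + 1) << 3 instead of 100 + (1 << 3). */
	printf("old: %lu\n", GET_OLD(100, 1 << 3));	/* 808 */
	printf("new: %lu\n", GET_NEW(100, 1 << 3));	/* 108 */
	return 0;
}

The caller below passes a plain struct-member offset, so the change is defensive hygiene here rather than a live bug fix.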
static const struct ibmvnic_stat ibmvnic_stats[] = { {"rx_packets", IBMVNIC_STAT_OFF(rx_packets)}, @@@ -247,18 -247,51 +247,23 @@@ static void free_long_term_buff(struct if (!ltb->buff) return;
+ /* VIOS automatically unmaps the long term buffer at remote + * end for the following resets: + * FAILOVER, MOBILITY, TIMEOUT. + */ if (adapter->reset_reason != VNIC_RESET_FAILOVER && - adapter->reset_reason != VNIC_RESET_MOBILITY) + adapter->reset_reason != VNIC_RESET_MOBILITY && + adapter->reset_reason != VNIC_RESET_TIMEOUT) send_request_unmap(adapter, ltb->map_id); dma_free_coherent(dev, ltb->size, ltb->buff, ltb->addr); }
-static int reset_long_term_buff(struct ibmvnic_adapter *adapter, - struct ibmvnic_long_term_buff *ltb) +static int reset_long_term_buff(struct ibmvnic_long_term_buff *ltb) { - struct device *dev = &adapter->vdev->dev; - int rc; + if (!ltb->buff) + return -EINVAL;
memset(ltb->buff, 0, ltb->size); - - mutex_lock(&adapter->fw_lock); - adapter->fw_done_rc = 0; - - reinit_completion(&adapter->fw_done); - rc = send_request_map(adapter, ltb->addr, ltb->size, ltb->map_id); - if (rc) { - mutex_unlock(&adapter->fw_lock); - return rc; - } - - rc = ibmvnic_wait_for_completion(adapter, &adapter->fw_done, 10000); - if (rc) { - dev_info(dev, - "Reset failed, long term map request timed out or aborted\n"); - mutex_unlock(&adapter->fw_lock); - return rc; - } - - if (adapter->fw_done_rc) { - dev_info(dev, - "Reset failed, attempting to free and reallocate buffer\n"); - free_long_term_buff(adapter, ltb); - mutex_unlock(&adapter->fw_lock); - return alloc_long_term_buff(adapter, ltb, ltb->size); - } - mutex_unlock(&adapter->fw_lock); return 0; }
@@@ -480,7 -513,8 +485,7 @@@ static int reset_rx_pools(struct ibmvni rx_pool->size * rx_pool->buff_size); } else { - rc = reset_long_term_buff(adapter, - &rx_pool->long_term_buff); + rc = reset_long_term_buff(&rx_pool->long_term_buff); }
if (rc) @@@ -603,11 -637,12 +608,11 @@@ static int init_rx_pools(struct net_dev return 0; }
-static int reset_one_tx_pool(struct ibmvnic_adapter *adapter, - struct ibmvnic_tx_pool *tx_pool) +static int reset_one_tx_pool(struct ibmvnic_tx_pool *tx_pool) { int rc, i;
- rc = reset_long_term_buff(adapter, &tx_pool->long_term_buff); + rc = reset_long_term_buff(&tx_pool->long_term_buff); if (rc) return rc;
@@@ -634,10 -669,10 +639,10 @@@ static int reset_tx_pools(struct ibmvni
tx_scrqs = adapter->num_active_tx_pools; for (i = 0; i < tx_scrqs; i++) { - rc = reset_one_tx_pool(adapter, &adapter->tso_pool[i]); + rc = reset_one_tx_pool(&adapter->tso_pool[i]); if (rc) return rc; - rc = reset_one_tx_pool(adapter, &adapter->tx_pool[i]); + rc = reset_one_tx_pool(&adapter->tx_pool[i]); if (rc) return rc; } @@@ -1191,7 -1226,8 +1196,7 @@@ static int ibmvnic_open(struct net_devi rc = __ibmvnic_open(netdev);
out: - /* - * If open fails due to a pending failover, set device state and + /* If open fails due to a pending failover, set device state and * return. Device operation will be handled by reset routine. */ if (rc && adapter->failover_pending) { @@@ -1322,10 -1358,8 +1327,8 @@@ static int __ibmvnic_close(struct net_d
adapter->state = VNIC_CLOSING; rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_DN); - if (rc) - return rc; adapter->state = VNIC_CLOSED; - return 0; + return rc; }
static int ibmvnic_close(struct net_device *netdev) @@@ -1353,10 -1387,10 +1356,10 @@@
/** * build_hdr_data - creates L2/L3/L4 header data buffer - * @hdr_field - bitfield determining needed headers - * @skb - socket buffer - * @hdr_len - array of header lengths - * @tot_len - total length of data + * @hdr_field: bitfield determining needed headers + * @skb: socket buffer + * @hdr_len: array of header lengths + * @hdr_data: buffer to write the header to * * Reads hdr_field to determine which headers are needed by firmware. * Builds a buffer containing these headers. Saves individual header @@@ -1413,11 -1447,11 +1416,11 @@@ static int build_hdr_data(u8 hdr_field
/** * create_hdr_descs - create header and header extension descriptors - * @hdr_field - bitfield determining needed headers - * @data - buffer containing header data - * @len - length of data buffer - * @hdr_len - array of individual header lengths - * @scrq_arr - descriptor array + * @hdr_field: bitfield determining needed headers + * @hdr_data: buffer containing header data + * @len: length of data buffer + * @hdr_len: array of individual header lengths + * @scrq_arr: descriptor array * * Creates header and, if needed, header extension descriptors and * places them in a descriptor array, scrq_arr @@@ -1465,9 -1499,10 +1468,9 @@@ static int create_hdr_descs(u8 hdr_fiel
/** * build_hdr_descs_arr - build a header descriptor array - * @skb - socket buffer - * @num_entries - number of descriptors to be sent - * @subcrq - first TX descriptor - * @hdr_field - bit field determining which headers will be sent + * @txbuff: tx buffer + * @num_entries: number of descriptors to be sent + * @hdr_field: bit field determining which headers will be sent * * This function will build a TX descriptor array with applicable * L2/L3/L4 packet header descriptors to be sent by send_subcrq_indirect. @@@ -1670,6 -1705,9 +1673,9 @@@ static netdev_tx_t ibmvnic_xmit(struct skb_copy_from_linear_data(skb, dst, skb->len); }
+ /* post changes to long_term_buff *dst before VIOS accessing it */ + dma_wmb(); + tx_pool->consumer_index = (tx_pool->consumer_index + 1) % tx_pool->num_buffers;
@@@ -1893,7 -1931,93 +1899,7 @@@ static int ibmvnic_set_mac(struct net_d return rc; }
-/** - * do_change_param_reset returns zero if we are able to keep processing reset - * events, or non-zero if we hit a fatal error and must halt. - */ -static int do_change_param_reset(struct ibmvnic_adapter *adapter, - struct ibmvnic_rwi *rwi, - u32 reset_state) -{ - struct net_device *netdev = adapter->netdev; - int i, rc; - - netdev_dbg(adapter->netdev, "Change param resetting driver (%d)\n", - rwi->reset_reason); - - netif_carrier_off(netdev); - adapter->reset_reason = rwi->reset_reason; - - ibmvnic_cleanup(netdev); - - if (reset_state == VNIC_OPEN) { - rc = __ibmvnic_close(netdev); - if (rc) - goto out; - } - - release_resources(adapter); - release_sub_crqs(adapter, 1); - release_crq_queue(adapter); - - adapter->state = VNIC_PROBED; - - rc = init_crq_queue(adapter); - - if (rc) { - netdev_err(adapter->netdev, - "Couldn't initialize crq. rc=%d\n", rc); - return rc; - } - - rc = ibmvnic_reset_init(adapter, true); - if (rc) { - rc = IBMVNIC_INIT_FAILED; - goto out; - } - - /* If the adapter was in PROBE state prior to the reset, - * exit here. - */ - if (reset_state == VNIC_PROBED) - goto out; - - rc = ibmvnic_login(netdev); - if (rc) { - goto out; - } - - rc = init_resources(adapter); - if (rc) - goto out; - - ibmvnic_disable_irqs(adapter); - - adapter->state = VNIC_CLOSED; - - if (reset_state == VNIC_CLOSED) - return 0; - - rc = __ibmvnic_open(netdev); - if (rc) { - rc = IBMVNIC_OPEN_FAILED; - goto out; - } - - /* refresh device's multicast list */ - ibmvnic_set_multi(netdev); - - /* kick napi */ - for (i = 0; i < adapter->req_rx_queues; i++) - napi_schedule(&adapter->napi[i]); - -out: - if (rc) - adapter->state = reset_state; - return rc; -} - -/** +/* * do_reset returns zero if we are able to keep processing reset events, or * non-zero if we hit a fatal error and must halt. */ @@@ -1910,12 -2034,9 +1916,12 @@@ static int do_reset(struct ibmvnic_adap adapter->state, adapter->failover_pending, rwi->reset_reason, reset_state);
- rtnl_lock(); - /* - * Now that we have the rtnl lock, clear any pending failover. + adapter->reset_reason = rwi->reset_reason; + /* requestor of VNIC_RESET_CHANGE_PARAM already has the rtnl lock */ + if (!(adapter->reset_reason == VNIC_RESET_CHANGE_PARAM)) + rtnl_lock(); + + /* Now that we have the rtnl lock, clear any pending failover. * This will ensure ibmvnic_open() has either completed or will * block until failover is complete. */ @@@ -1923,6 -2044,7 +1929,6 @@@ adapter->failover_pending = false;
netif_carrier_off(netdev); - adapter->reset_reason = rwi->reset_reason;
old_num_rx_queues = adapter->req_rx_queues; old_num_tx_queues = adapter->req_tx_queues; @@@ -1934,37 -2056,25 +1940,37 @@@ if (reset_state == VNIC_OPEN && adapter->reset_reason != VNIC_RESET_MOBILITY && adapter->reset_reason != VNIC_RESET_FAILOVER) { - adapter->state = VNIC_CLOSING; + if (adapter->reset_reason == VNIC_RESET_CHANGE_PARAM) { + rc = __ibmvnic_close(netdev); + if (rc) + goto out; + } else { + adapter->state = VNIC_CLOSING;
- /* Release the RTNL lock before link state change and - * re-acquire after the link state change to allow - * linkwatch_event to grab the RTNL lock and run during - * a reset. - */ - rtnl_unlock(); - rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_DN); - rtnl_lock(); - if (rc) - goto out; + /* Release the RTNL lock before link state change and + * re-acquire after the link state change to allow + * linkwatch_event to grab the RTNL lock and run during + * a reset. + */ + rtnl_unlock(); + rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_DN); + rtnl_lock(); + if (rc) + goto out;
- if (adapter->state != VNIC_CLOSING) { - rc = -1; - goto out; + if (adapter->state != VNIC_CLOSING) { + rc = -1; + goto out; + } + + adapter->state = VNIC_CLOSED; } + }
- adapter->state = VNIC_CLOSED; + if (adapter->reset_reason == VNIC_RESET_CHANGE_PARAM) { + release_resources(adapter); + release_sub_crqs(adapter, 1); + release_crq_queue(adapter); }
if (adapter->reset_reason != VNIC_RESET_NON_FATAL) { @@@ -1973,9 -2083,7 +1979,9 @@@ */ adapter->state = VNIC_PROBED;
- if (adapter->reset_reason == VNIC_RESET_MOBILITY) { + if (adapter->reset_reason == VNIC_RESET_CHANGE_PARAM) { + rc = init_crq_queue(adapter); + } else if (adapter->reset_reason == VNIC_RESET_MOBILITY) { rc = ibmvnic_reenable_crq_queue(adapter); release_sub_crqs(adapter, 1); } else { @@@ -2010,14 -2118,11 +2016,14 @@@ }
rc = ibmvnic_login(netdev); - if (rc) { + if (rc) goto out; - }
- if (adapter->req_rx_queues != old_num_rx_queues || + if (adapter->reset_reason == VNIC_RESET_CHANGE_PARAM) { + rc = init_resources(adapter); + if (rc) + goto out; + } else if (adapter->req_rx_queues != old_num_rx_queues || adapter->req_tx_queues != old_num_tx_queues || adapter->req_rx_add_entries_per_subcrq != old_num_rx_slots || @@@ -2039,14 -2144,14 +2045,14 @@@ rc = reset_tx_pools(adapter); if (rc) { netdev_dbg(adapter->netdev, "reset tx pools failed (%d)\n", - rc); + rc); goto out; }
rc = reset_rx_pools(adapter); if (rc) { netdev_dbg(adapter->netdev, "reset rx pools failed (%d)\n", - rc); + rc); goto out; } } @@@ -2082,9 -2187,7 +2088,9 @@@ out /* restore the adapter state if reset failed */ if (rc) adapter->state = reset_state; - rtnl_unlock(); + /* requestor of VNIC_RESET_CHANGE_PARAM should still hold the rtnl lock */ + if (!(adapter->reset_reason == VNIC_RESET_CHANGE_PARAM)) + rtnl_unlock();
netdev_dbg(adapter->netdev, "[S:%d FOP:%d] Reset done, rc %d\n", adapter->state, adapter->failover_pending, rc); @@@ -2215,8 -2318,12 +2221,8 @@@ static void __ibmvnic_reset(struct work } spin_unlock_irqrestore(&adapter->state_lock, flags);
- if (rwi->reset_reason == VNIC_RESET_CHANGE_PARAM) { - /* CHANGE_PARAM requestor holds rtnl_lock */ - rc = do_change_param_reset(adapter, rwi, reset_state); - } else if (adapter->force_reset_recovery) { - /* - * Since we are doing a hard reset now, clear the + if (adapter->force_reset_recovery) { + /* Since we are doing a hard reset now, clear the * failover_pending flag so we don't ignore any * future MOBILITY or other resets. */ @@@ -2288,7 -2395,10 +2294,8 @@@ static int ibmvnic_reset(struct ibmvnic unsigned long flags; int ret;
- /* If failover is pending don't schedule any other reset. - spin_lock_irqsave(&adapter->rwi_lock, flags); - + /* + * If failover is pending don't schedule any other reset. * Instead let the failover complete. If there is already a * a failover reset scheduled, we will detect and drop the * duplicate reset when walking the ->rwi_list below. @@@ -2303,19 -2413,15 +2310,16 @@@
if (adapter->state == VNIC_PROBING) { netdev_warn(netdev, "Adapter reset during probe\n"); - ret = adapter->init_done_rc = EAGAIN; + adapter->init_done_rc = EAGAIN; + ret = EAGAIN; goto err; }
- spin_lock_irqsave(&adapter->rwi_lock, flags); - list_for_each(entry, &adapter->rwi_list) { tmp = list_entry(entry, struct ibmvnic_rwi, list); if (tmp->reset_reason == reason) { netdev_dbg(netdev, "Skipping matching reset, reason=%d\n", reason); - spin_unlock_irqrestore(&adapter->rwi_lock, flags); ret = EBUSY; goto err; } @@@ -2323,8 -2429,6 +2327,6 @@@
rwi = kzalloc(sizeof(*rwi), GFP_ATOMIC); if (!rwi) { - spin_unlock_irqrestore(&adapter->rwi_lock, flags); - ibmvnic_close(netdev); ret = ENOMEM; goto err; } @@@ -2337,12 -2441,17 +2339,17 @@@ } rwi->reset_reason = reason; list_add_tail(&rwi->list, &adapter->rwi_list); - spin_unlock_irqrestore(&adapter->rwi_lock, flags); netdev_dbg(adapter->netdev, "Scheduling reset (reason %d)\n", reason); schedule_work(&adapter->ibmvnic_reset);
- return 0; + ret = 0; err: + /* ibmvnic_close() below can block, so drop the lock first */ + spin_unlock_irqrestore(&adapter->rwi_lock, flags); + + if (ret == ENOMEM) + ibmvnic_close(netdev); + return -ret; }
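This rework of the error path matters because rwi_lock is a spinlock and ibmvnic_close() can sleep; sleeping with a spinlock held is a hard bug. Reduced to a sketch (types and helpers illustrative): do only non-blocking work under the lock, record the failure, and perform any blocking cleanup after the unlock.

        static int schedule_item(struct ctx *c)
        {
                unsigned long flags;
                struct item *obj;
                int ret = 0;

                spin_lock_irqsave(&c->lock, flags);
                obj = kzalloc(sizeof(*obj), GFP_ATOMIC);  /* atomic: lock held */
                if (!obj)
                        ret = ENOMEM;
                else
                        list_add_tail(&obj->list, &c->pending);
                spin_unlock_irqrestore(&c->lock, flags);  /* drop the lock first */

                if (ret == ENOMEM)
                        blocking_cleanup(c);              /* may sleep now */
                return -ret;
        }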
@@@ -2410,9 -2519,16 +2417,9 @@@ restart_poll
if (!pending_scrq(adapter, rx_scrq)) break; - /* The queue entry at the current index is peeked at above - * to determine that there is a valid descriptor awaiting - * processing. We want to be sure that the current slot - * holds a valid descriptor before reading its contents. - */ - dma_rmb(); next = ibmvnic_next_scrq(adapter, rx_scrq); - rx_buff = - (struct ibmvnic_rx_buff *)be64_to_cpu(next-> - rx_comp.correlator); + rx_buff = (struct ibmvnic_rx_buff *) + be64_to_cpu(next->rx_comp.correlator); /* do error checking */ if (next->rx_comp.rc) { netdev_dbg(netdev, "rx buffer returned with rc %x\n", @@@ -2433,6 -2549,8 +2440,8 @@@ offset = be16_to_cpu(next->rx_comp.off_frame_data); flags = next->rx_comp.flags; skb = rx_buff->skb; + /* load long_term_buff before copying to skb */ + dma_rmb(); skb_copy_to_linear_data(skb, rx_buff->data + offset, length);
@@@ -2475,6 -2593,7 +2484,6 @@@ if (napi_complete_done(napi, frames_processed)) { enable_scrq_irq(adapter, rx_scrq); if (pending_scrq(adapter, rx_scrq)) { - rmb(); if (napi_reschedule(napi)) { disable_scrq_irq(adapter, rx_scrq); goto restart_poll; @@@ -2603,9 -2722,9 +2612,9 @@@ static void ibmvnic_get_drvinfo(struct { struct ibmvnic_adapter *adapter = netdev_priv(netdev);
- strlcpy(info->driver, ibmvnic_driver_name, sizeof(info->driver)); - strlcpy(info->version, IBMVNIC_DRIVER_VERSION, sizeof(info->version)); - strlcpy(info->fw_version, adapter->fw_version, + strscpy(info->driver, ibmvnic_driver_name, sizeof(info->driver)); + strscpy(info->version, IBMVNIC_DRIVER_VERSION, sizeof(info->version)); + strscpy(info->fw_version, adapter->fw_version, sizeof(info->fw_version)); }
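strscpy() is the intended successor to strlcpy(): both guarantee NUL termination, but strlcpy() returns the full source length (forcing a read of the whole, possibly unbounded, source string), while strscpy() stops at the destination size and returns the number of characters copied, or -E2BIG on truncation. Typical use:

        char buf[16];
        ssize_t n;

        n = strscpy(buf, src, sizeof(buf));
        if (n == -E2BIG)
                pr_debug("source truncated, buf still NUL-terminated\n");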
@@@ -2717,6 -2836,7 +2726,6 @@@ static int ibmvnic_set_channels(struct channels->rx_count, channels->tx_count, adapter->req_rx_queues, adapter->req_tx_queues); return ret; - }
static void ibmvnic_get_strings(struct net_device *dev, u32 stringset, u8 *data) @@@ -2805,8 -2925,8 +2814,8 @@@ static void ibmvnic_get_ethtool_stats(s return;
for (i = 0; i < ARRAY_SIZE(ibmvnic_stats); i++) - data[i] = be64_to_cpu(IBMVNIC_GET_STAT(adapter, - ibmvnic_stats[i].offset)); + data[i] = be64_to_cpu(IBMVNIC_GET_STAT + (adapter, ibmvnic_stats[i].offset));
for (j = 0; j < adapter->req_tx_queues; j++) { data[i] = adapter->tx_stats_buffers[j].packets; @@@ -2846,7 -2966,6 +2855,7 @@@ static int ibmvnic_set_priv_flags(struc
return 0; } + static const struct ethtool_ops ibmvnic_ethtool_ops = { .get_drvinfo = ibmvnic_get_drvinfo, .get_msglevel = ibmvnic_get_msglevel, @@@ -3116,7 -3235,7 +3125,7 @@@ static int enable_scrq_irq(struct ibmvn /* H_EOI would fail with rc = H_FUNCTION when running * in XIVE mode which is expected, but not an error. */ - if (rc && (rc != H_FUNCTION)) + if (rc && rc != H_FUNCTION) dev_err(dev, "H_EOI FAILED irq 0x%llx. rc=%ld\n", val, rc); } @@@ -3147,6 -3266,13 +3156,6 @@@ restart_loop int total_bytes = 0; int num_packets = 0;
- /* The queue entry at the current index is peeked at above - * to determine that there is a valid descriptor awaiting - * processing. We want to be sure that the current slot - * holds a valid descriptor before reading its contents. - */ - dma_rmb(); - next = ibmvnic_next_scrq(adapter, scrq); for (i = 0; i < next->tx_comp.num_comps; i++) { if (next->tx_comp.rcs[i]) @@@ -3520,16 -3646,11 +3529,16 @@@ static int pending_scrq(struct ibmvnic_ struct ibmvnic_sub_crq_queue *scrq) { union sub_crq *entry = &scrq->msgs[scrq->cur]; + int rc;
- if (entry->generic.first & IBMVNIC_CRQ_CMD_RSP) - return 1; - else - return 0; + rc = !!(entry->generic.first & IBMVNIC_CRQ_CMD_RSP); + + /* Ensure that the SCRQ valid flag is loaded prior to loading the + * contents of the SCRQ descriptor + */ + dma_rmb(); + + return rc; }
static union sub_crq *ibmvnic_next_scrq(struct ibmvnic_adapter *adapter, @@@ -3548,8 -3669,8 +3557,8 @@@ } spin_unlock_irqrestore(&scrq->lock, flags);
- /* Ensure that the entire buffer descriptor has been - * loaded before reading its contents + /* Ensure that the SCRQ valid flag is loaded prior to loading the + * contents of the SCRQ descriptor */ dma_rmb();
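The relocated dma_rmb() calls, in pending_scrq()/ibmvnic_next_scrq() here and in the RX/TX completion loops earlier, all enforce one load/load ordering rule for descriptors written by the device: the CPU must observe the valid flag before any other descriptor field, otherwise it can consume contents the device has not finished writing. The generic shape (names illustrative):

        if (!(READ_ONCE(desc->flags) & DESC_VALID))  /* flag is written last by the device */
                return 0;

        dma_rmb();      /* order the flag load before the payload loads */

        len = desc->len;        /* safe: payload is at least as new as the flag */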
@@@ -3599,7 -3720,7 +3608,7 @@@ static int send_subcrq_indirect(struct int rc;
/* Make sure the hypervisor sees the complete request */ - mb(); + dma_wmb(); rc = plpar_hcall_norets(H_SEND_SUB_CRQ_INDIRECT, ua, cpu_to_be64(remote_handle), ioba, num_entries); @@@ -3619,8 -3740,8 +3628,8 @@@ static int ibmvnic_send_crq(struct ibmv int rc;
netdev_dbg(adapter->netdev, "Sending CRQ: %016lx %016lx\n", - (unsigned long int)cpu_to_be64(u64_crq[0]), - (unsigned long int)cpu_to_be64(u64_crq[1])); + (unsigned long)cpu_to_be64(u64_crq[0]), + (unsigned long)cpu_to_be64(u64_crq[1]));
if (!adapter->crq.active && crq->generic.first != IBMVNIC_CRQ_INIT_CMD) { @@@ -3629,7 -3750,7 +3638,7 @@@ }
/* Make sure the hypervisor sees the complete request */ - mb(); + dma_wmb();
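The mb() -> dma_wmb() changes in both send paths keep the ordering that actually matters, namely that the descriptor writes are visible before the hypervisor doorbell hcall, while dropping the full barrier, which also serializes loads and is noticeably more expensive on Power. Publish-then-doorbell in sketch form (names illustrative):

        desc->addr = cpu_to_be64(dma_addr);
        desc->len  = cpu_to_be32(len);

        dma_wmb();              /* descriptor visible before the doorbell */

        ring_doorbell(q);       /* here: plpar_hcall_norets(H_SEND_CRQ, ...) */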
rc = plpar_hcall_norets(H_SEND_CRQ, ua, cpu_to_be64(u64_crq[0]), @@@ -3825,15 -3946,15 +3834,15 @@@ static int send_login(struct ibmvnic_ad
for (i = 0; i < adapter->req_tx_queues; i++) { if (adapter->tx_scrq[i]) { - tx_list_p[i] = cpu_to_be64(adapter->tx_scrq[i]-> - crq_num); + tx_list_p[i] = + cpu_to_be64(adapter->tx_scrq[i]->crq_num); } }
for (i = 0; i < adapter->req_rx_queues; i++) { if (adapter->rx_scrq[i]) { - rx_list_p[i] = cpu_to_be64(adapter->rx_scrq[i]-> - crq_num); + rx_list_p[i] = + cpu_to_be64(adapter->rx_scrq[i]->crq_num); } }
@@@ -3849,7 -3970,7 +3858,7 @@@ netdev_dbg(adapter->netdev, "Login Buffer:\n"); for (i = 0; i < (adapter->login_buf_sz - 1) / 8 + 1; i++) { netdev_dbg(adapter->netdev, "%016lx\n", - ((unsigned long int *)(adapter->login_buf))[i]); + ((unsigned long *)(adapter->login_buf))[i]); }
memset(&crq, 0, sizeof(crq)); @@@ -4217,7 -4338,7 +4226,7 @@@ static void handle_query_ip_offload_rsp netdev_dbg(adapter->netdev, "Query IP Offload Buffer:\n"); for (i = 0; i < (sizeof(adapter->ip_offload_buf) - 1) / 8 + 1; i++) netdev_dbg(adapter->netdev, "%016lx\n", - ((unsigned long int *)(buf))[i]); + ((unsigned long *)(buf))[i]);
netdev_dbg(adapter->netdev, "ipv4_chksum = %d\n", buf->ipv4_chksum); netdev_dbg(adapter->netdev, "ipv6_chksum = %d\n", buf->ipv6_chksum); @@@ -4376,8 -4497,8 +4385,8 @@@ static void handle_request_cap_rsp(unio case PARTIALSUCCESS: dev_info(dev, "req=%lld, rsp=%ld in %s queue, retrying.\n", *req_value, - (long int)be64_to_cpu(crq->request_capability_rsp. - number), name); + (long)be64_to_cpu(crq->request_capability_rsp.number), + name);
if (be16_to_cpu(crq->request_capability_rsp.capability) == REQ_MTU) { @@@ -4447,7 -4568,7 +4456,7 @@@ static int handle_login_rsp(union ibmvn netdev_dbg(adapter->netdev, "Login Response Buffer:\n"); for (i = 0; i < (adapter->login_rsp_buf_sz - 1) / 8 + 1; i++) { netdev_dbg(adapter->netdev, "%016lx\n", - ((unsigned long int *)(adapter->login_rsp_buf))[i]); + ((unsigned long *)(adapter->login_rsp_buf))[i]); }
/* Sanity checks */ @@@ -4790,8 -4911,8 +4799,8 @@@ static void ibmvnic_handle_crq(union ib long rc;
netdev_dbg(netdev, "Handling CRQ: %016lx %016lx\n", - (unsigned long int)cpu_to_be64(u64_crq[0]), - (unsigned long int)cpu_to_be64(u64_crq[1])); + (unsigned long)cpu_to_be64(u64_crq[0]), + (unsigned long)cpu_to_be64(u64_crq[1])); switch (gen_crq->first) { case IBMVNIC_CRQ_INIT_RSP: switch (gen_crq->cmd) { @@@ -5265,6 -5386,8 +5274,6 @@@ static int ibmvnic_probe(struct vio_de netdev->ethtool_ops = &ibmvnic_ethtool_ops; SET_NETDEV_DEV(netdev, &dev->dev);
- spin_lock_init(&adapter->stats_lock); - INIT_WORK(&adapter->ibmvnic_reset, __ibmvnic_reset); INIT_DELAYED_WORK(&adapter->ibmvnic_delayed_reset, __ibmvnic_delayed_reset); @@@ -5346,7 -5469,18 +5355,18 @@@ static int ibmvnic_remove(struct vio_de unsigned long flags;
spin_lock_irqsave(&adapter->state_lock, flags); + + /* If ibmvnic_reset() is scheduling a reset, wait for it to + * finish. Then, set the state to REMOVING to prevent it from + * scheduling any more work and to have reset functions ignore + * any resets that have already been scheduled. Drop the lock + * after setting state, so __ibmvnic_reset(), which is called + * from the flush_work() below, can make progress. + */ + spin_lock_irqsave(&adapter->rwi_lock, flags); adapter->state = VNIC_REMOVING; + spin_unlock_irqrestore(&adapter->rwi_lock, flags); + spin_unlock_irqrestore(&adapter->state_lock, flags);
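The remove path now nests rwi_lock inside state_lock, matching the ordering rule documented in ibmvnic.h; AB-BA deadlocks are avoided only if every path takes the two locks in the same order. Sketch of the convention (note the inner lock needs no irqsave of its own, interrupts are already off):

        /* global order: state_lock first, then rwi_lock */
        spin_lock_irqsave(&adapter->state_lock, flags);
        spin_lock(&adapter->rwi_lock);          /* irqs already disabled */
        adapter->state = VNIC_REMOVING;
        spin_unlock(&adapter->rwi_lock);
        spin_unlock_irqrestore(&adapter->state_lock, flags);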
flush_work(&adapter->ibmvnic_reset); diff --combined drivers/net/ethernet/ibm/ibmvnic.h index 270d1cac86a4,72fea3b1c87d..e4dcc63b9710 --- a/drivers/net/ethernet/ibm/ibmvnic.h +++ b/drivers/net/ethernet/ibm/ibmvnic.h @@@ -31,7 -31,7 +31,7 @@@ #define IBMVNIC_BUFFS_PER_POOL 100 #define IBMVNIC_MAX_QUEUES 16 #define IBMVNIC_MAX_QUEUE_SZ 4096 - #define IBMVNIC_MAX_IND_DESCS 128 + #define IBMVNIC_MAX_IND_DESCS 16 #define IBMVNIC_IND_ARR_SZ (IBMVNIC_MAX_IND_DESCS * 32)
#define IBMVNIC_TSO_BUF_SZ 65536 @@@ -845,7 -845,6 +845,7 @@@ struct ibmvnic_crq_queue union ibmvnic_crq *msgs; int size, cur; dma_addr_t msg_token; + /* Used for serialization of msgs, cur */ spinlock_t lock; bool active; char name[32]; @@@ -877,7 -876,6 +877,7 @@@ struct ibmvnic_sub_crq_queue unsigned int irq; unsigned int pool_index; int scrq_num; + /* Used for serialization of msgs, cur */ spinlock_t lock; struct sk_buff *rx_skb_top; struct ibmvnic_adapter *adapter; @@@ -987,6 -985,7 +987,6 @@@ struct ibmvnic_adapter struct ibmvnic_statistics stats; dma_addr_t stats_token; struct completion stats_done; - spinlock_t stats_lock; int replenish_no_mem; int replenish_add_buff_success; int replenish_add_buff_failure; @@@ -1081,12 -1080,10 +1081,14 @@@
struct tasklet_struct tasklet; enum vnic_state state; + /* Used for serialization of state field */ + spinlock_t state_lock; enum ibmvnic_reset_reason reset_reason; + /* when taking both state and rwi locks, take state lock first */ + spinlock_t rwi_lock; struct list_head rwi_list; + /* Used for serialization of rwi_list */ + spinlock_t rwi_lock; struct work_struct ibmvnic_reset; struct delayed_work ibmvnic_delayed_reset; unsigned long resetting; @@@ -1100,4 -1097,9 +1102,4 @@@
struct ibmvnic_tunables desired; struct ibmvnic_tunables fallback; - - /* Used for serialization of state field. When taking both state - * and rwi locks, take state lock first. - */ - spinlock_t state_lock; }; diff --combined drivers/net/ethernet/mellanox/mlx5/core/devlink.c index aa76a6e0dae8,41474e42a819..d7d8a68ef23d --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@@ -7,8 -7,6 +7,8 @@@ #include "fw_reset.h" #include "fs_core.h" #include "eswitch.h" +#include "sf/dev/dev.h" +#include "sf/sf.h"
static int mlx5_devlink_flash_update(struct devlink *devlink, struct devlink_flash_update_params *params, @@@ -129,18 -127,12 +129,23 @@@ static int mlx5_devlink_reload_down(str struct netlink_ext_ack *extack) { struct mlx5_core_dev *dev = devlink_priv(devlink); + bool sf_dev_allocated; + + sf_dev_allocated = mlx5_sf_dev_allocated(dev); + if (sf_dev_allocated) { + /* Reload results in deleting the SF device, which further results in + * unregistering the devlink instance while holding devlink_mutex. + * Hence, do not support reload. + */ + NL_SET_ERR_MSG_MOD(extack, "reload is unsupported when SFs are allocated\n"); + return -EOPNOTSUPP; + }
+ if (mlx5_lag_is_active(dev)) { + NL_SET_ERR_MSG_MOD(extack, "reload is unsupported in Lag mode\n"); + return -EOPNOTSUPP; + } + switch (action) { case DEVLINK_RELOAD_ACTION_DRIVER_REINIT: mlx5_unload_one(dev, false); @@@ -181,91 -173,6 +186,91 @@@ static int mlx5_devlink_reload_up(struc return 0; }
+static struct mlx5_devlink_trap *mlx5_find_trap_by_id(struct mlx5_core_dev *dev, int trap_id) +{ + struct mlx5_devlink_trap *dl_trap; + + list_for_each_entry(dl_trap, &dev->priv.traps, list) + if (dl_trap->trap.id == trap_id) + return dl_trap; + + return NULL; +} + +static int mlx5_devlink_trap_init(struct devlink *devlink, const struct devlink_trap *trap, + void *trap_ctx) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + struct mlx5_devlink_trap *dl_trap; + + dl_trap = kzalloc(sizeof(*dl_trap), GFP_KERNEL); + if (!dl_trap) + return -ENOMEM; + + dl_trap->trap.id = trap->id; + dl_trap->trap.action = DEVLINK_TRAP_ACTION_DROP; + dl_trap->item = trap_ctx; + + if (mlx5_find_trap_by_id(dev, trap->id)) { + kfree(dl_trap); + mlx5_core_err(dev, "Devlink trap: Trap 0x%x already found", trap->id); + return -EEXIST; + } + + list_add_tail(&dl_trap->list, &dev->priv.traps); + return 0; +} + +static void mlx5_devlink_trap_fini(struct devlink *devlink, const struct devlink_trap *trap, + void *trap_ctx) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + struct mlx5_devlink_trap *dl_trap; + + dl_trap = mlx5_find_trap_by_id(dev, trap->id); + if (!dl_trap) { + mlx5_core_err(dev, "Devlink trap: Missing trap id 0x%x", trap->id); + return; + } + list_del(&dl_trap->list); + kfree(dl_trap); +} + +static int mlx5_devlink_trap_action_set(struct devlink *devlink, + const struct devlink_trap *trap, + enum devlink_trap_action action, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + enum devlink_trap_action action_orig; + struct mlx5_devlink_trap *dl_trap; + int err = 0; + + dl_trap = mlx5_find_trap_by_id(dev, trap->id); + if (!dl_trap) { + mlx5_core_err(dev, "Devlink trap: Set action on invalid trap id 0x%x", trap->id); + err = -EINVAL; + goto out; + } + + if (action != DEVLINK_TRAP_ACTION_DROP && action != DEVLINK_TRAP_ACTION_TRAP) { + err = -EOPNOTSUPP; + goto out; + } + + if (action == dl_trap->trap.action) + goto out; + + action_orig = dl_trap->trap.action; + dl_trap->trap.action = action; + err = mlx5_blocking_notifier_call_chain(dev, MLX5_DRIVER_EVENT_TYPE_TRAP, + &dl_trap->trap); + if (err) + dl_trap->trap.action = action_orig; +out: + return err; +} + static const struct devlink_ops mlx5_devlink_ops = { #ifdef CONFIG_MLX5_ESWITCH .eswitch_mode_set = mlx5_devlink_eswitch_mode_set, @@@ -276,12 -183,6 +281,12 @@@ .eswitch_encap_mode_get = mlx5_devlink_eswitch_encap_mode_get, .port_function_hw_addr_get = mlx5_devlink_port_function_hw_addr_get, .port_function_hw_addr_set = mlx5_devlink_port_function_hw_addr_set, +#endif +#ifdef CONFIG_MLX5_SF_MANAGER + .port_new = mlx5_devlink_sf_port_new, + .port_del = mlx5_devlink_sf_port_del, + .port_fn_state_get = mlx5_devlink_sf_port_fn_state_get, + .port_fn_state_set = mlx5_devlink_sf_port_fn_state_set, #endif .flash_update = mlx5_devlink_flash_update, .info_get = mlx5_devlink_info_get, @@@ -290,59 -191,8 +295,59 @@@ .reload_limits = BIT(DEVLINK_RELOAD_LIMIT_NO_RESET), .reload_down = mlx5_devlink_reload_down, .reload_up = mlx5_devlink_reload_up, + .trap_init = mlx5_devlink_trap_init, + .trap_fini = mlx5_devlink_trap_fini, + .trap_action_set = mlx5_devlink_trap_action_set, };
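Once registered through these ops, the traps are driven entirely by the generic devlink trap interface; no mlx5-specific uAPI is involved. An illustrative session (the device address is hypothetical):

        # devlink trap show pci/0000:06:00.0 trap dmac_filter
        # devlink trap set pci/0000:06:00.0 trap ingress_vlan_filter action trap
        # devlink trap set pci/0000:06:00.0 trap dmac_filter action drop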
+void mlx5_devlink_trap_report(struct mlx5_core_dev *dev, int trap_id, struct sk_buff *skb, + struct devlink_port *dl_port) +{ + struct devlink *devlink = priv_to_devlink(dev); + struct mlx5_devlink_trap *dl_trap; + + dl_trap = mlx5_find_trap_by_id(dev, trap_id); + if (!dl_trap) { + mlx5_core_err(dev, "Devlink trap: Report on invalid trap id 0x%x", trap_id); + return; + } + + if (dl_trap->trap.action != DEVLINK_TRAP_ACTION_TRAP) { + mlx5_core_dbg(dev, "Devlink trap: Trap id %d has action %d", trap_id, + dl_trap->trap.action); + return; + } + devlink_trap_report(devlink, skb, dl_trap->item, dl_port, NULL); +} + +int mlx5_devlink_trap_get_num_active(struct mlx5_core_dev *dev) +{ + struct mlx5_devlink_trap *dl_trap; + int count = 0; + + list_for_each_entry(dl_trap, &dev->priv.traps, list) + if (dl_trap->trap.action == DEVLINK_TRAP_ACTION_TRAP) + count++; + + return count; +} + +int mlx5_devlink_traps_get_action(struct mlx5_core_dev *dev, int trap_id, + enum devlink_trap_action *action) +{ + struct mlx5_devlink_trap *dl_trap; + + dl_trap = mlx5_find_trap_by_id(dev, trap_id); + if (!dl_trap) { + mlx5_core_err(dev, "Devlink trap: Get action on invalid trap id 0x%x", + trap_id); + return -EINVAL; + } + + *action = dl_trap->trap.action; + return 0; +} + struct devlink *mlx5_devlink_alloc(void) { return devlink_alloc(&mlx5_devlink_ops, sizeof(struct mlx5_core_dev)); @@@ -428,6 -278,10 +433,10 @@@ static int mlx5_devlink_enable_roce_val NL_SET_ERR_MSG_MOD(extack, "Device doesn't support RoCE"); return -EOPNOTSUPP; } + if (mlx5_core_is_mp_slave(dev) || mlx5_lag_is_active(dev)) { + NL_SET_ERR_MSG_MOD(extack, "Multi port slave/Lag device can't configure RoCE"); + return -EOPNOTSUPP; + }
return 0; } @@@ -513,49 -367,6 +522,49 @@@ static void mlx5_devlink_set_params_ini #endif }
+#define MLX5_TRAP_DROP(_id, _group_id) \ + DEVLINK_TRAP_GENERIC(DROP, DROP, _id, \ + DEVLINK_TRAP_GROUP_GENERIC_ID_##_group_id, \ + DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT) + +static const struct devlink_trap mlx5_traps_arr[] = { + MLX5_TRAP_DROP(INGRESS_VLAN_FILTER, L2_DROPS), + MLX5_TRAP_DROP(DMAC_FILTER, L2_DROPS), +}; + +static const struct devlink_trap_group mlx5_trap_groups_arr[] = { + DEVLINK_TRAP_GROUP_GENERIC(L2_DROPS, 0), +}; + +static int mlx5_devlink_traps_register(struct devlink *devlink) +{ + struct mlx5_core_dev *core_dev = devlink_priv(devlink); + int err; + + err = devlink_trap_groups_register(devlink, mlx5_trap_groups_arr, + ARRAY_SIZE(mlx5_trap_groups_arr)); + if (err) + return err; + + err = devlink_traps_register(devlink, mlx5_traps_arr, ARRAY_SIZE(mlx5_traps_arr), + &core_dev->priv); + if (err) + goto err_trap_group; + return 0; + +err_trap_group: + devlink_trap_groups_unregister(devlink, mlx5_trap_groups_arr, + ARRAY_SIZE(mlx5_trap_groups_arr)); + return err; +} + +static void mlx5_devlink_traps_unregister(struct devlink *devlink) +{ + devlink_traps_unregister(devlink, mlx5_traps_arr, ARRAY_SIZE(mlx5_traps_arr)); + devlink_trap_groups_unregister(devlink, mlx5_trap_groups_arr, + ARRAY_SIZE(mlx5_trap_groups_arr)); +} + int mlx5_devlink_register(struct devlink *devlink, struct device *dev) { int err; @@@ -570,16 -381,8 +579,16 @@@ goto params_reg_err; mlx5_devlink_set_params_init_values(devlink); devlink_params_publish(devlink); + + err = mlx5_devlink_traps_register(devlink); + if (err) + goto traps_reg_err; + return 0;
+traps_reg_err: + devlink_params_unregister(devlink, mlx5_devlink_params, + ARRAY_SIZE(mlx5_devlink_params)); params_reg_err: devlink_unregister(devlink); return err; @@@ -587,7 -390,6 +596,7 @@@
void mlx5_devlink_unregister(struct devlink *devlink) { + mlx5_devlink_traps_unregister(devlink); devlink_params_unregister(devlink, mlx5_devlink_params, ARRAY_SIZE(mlx5_devlink_params)); devlink_unregister(devlink); diff --combined drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index 0b503ebe59ec,24e2c0d955b9..f3f6eb081948 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@@ -12,6 -12,7 +12,7 @@@ #include <net/flow_offload.h> #include <net/netfilter/nf_flow_table.h> #include <linux/workqueue.h> + #include <linux/refcount.h> #include <linux/xarray.h>
#include "lib/fs_chains.h" @@@ -27,7 -28,6 +28,7 @@@ #define MLX5_CT_STATE_ESTABLISHED_BIT BIT(1) #define MLX5_CT_STATE_TRK_BIT BIT(2) #define MLX5_CT_STATE_NAT_BIT BIT(3) +#define MLX5_CT_STATE_REPLY_BIT BIT(4)
#define MLX5_FTE_ID_BITS (mlx5e_tc_attr_to_reg_mappings[FTEID_TO_REG].mlen * 8) #define MLX5_FTE_ID_MAX GENMASK(MLX5_FTE_ID_BITS - 1, 0) @@@ -52,11 -52,11 +53,11 @@@ struct mlx5_tc_ct_priv struct mlx5_flow_table *ct_nat; struct mlx5_flow_table *post_ct; struct mutex control_lock; /* guards parallel adds/dels */ - struct mutex shared_counter_lock; struct mapping_ctx *zone_mapping; struct mapping_ctx *labels_mapping; enum mlx5_flow_namespace_type ns_type; struct mlx5_fs_chains *chains; + spinlock_t ht_lock; /* protects ft entries */ };
struct mlx5_ct_flow { @@@ -125,6 -125,10 +126,10 @@@ struct mlx5_ct_counter bool is_shared; };
+ enum { + MLX5_CT_ENTRY_FLAG_VALID, + }; + struct mlx5_ct_entry { struct rhash_head node; struct rhash_head tuple_node; @@@ -135,6 -139,12 +140,12 @@@ struct mlx5_ct_tuple tuple; struct mlx5_ct_tuple tuple_nat; struct mlx5_ct_zone_rule zone_rules[2]; + + struct mlx5_tc_ct_priv *ct_priv; + struct work_struct work; + + refcount_t refcnt; + unsigned long flags; };
static const struct rhashtable_params cts_ht_params = { @@@ -642,7 -652,6 +653,7 @@@ mlx5_tc_ct_entry_create_mod_hdr(struct }
ct_state |= MLX5_CT_STATE_ESTABLISHED_BIT | MLX5_CT_STATE_TRK_BIT; + ct_state |= meta->ct_metadata.orig_dir ? 0 : MLX5_CT_STATE_REPLY_BIT; err = mlx5_tc_ct_entry_set_registers(ct_priv, &mod_acts, ct_state, meta->ct_metadata.mark, @@@ -711,11 -720,11 +722,11 @@@ mlx5_tc_ct_entry_add_rule(struct mlx5_t attr->outer_match_level = MLX5_MATCH_L4; attr->counter = entry->counter->counter; attr->flags |= MLX5_ESW_ATTR_FLAG_NO_IN_PORT; + if (ct_priv->ns_type == MLX5_FLOW_NAMESPACE_FDB) + attr->esw_attr->in_mdev = priv->mdev;
mlx5_tc_ct_set_tuple_match(netdev_priv(ct_priv->netdev), spec, flow_rule); - mlx5e_tc_match_to_reg_match(spec, ZONE_TO_REG, - entry->tuple.zone & MLX5_CT_ZONE_MASK, - MLX5_CT_ZONE_MASK); + mlx5e_tc_match_to_reg_match(spec, ZONE_TO_REG, entry->tuple.zone, MLX5_CT_ZONE_MASK);
zone_rule->rule = mlx5_tc_rule_insert(priv, spec, attr); if (IS_ERR(zone_rule->rule)) { @@@ -742,6 -751,87 +753,87 @@@ err_attr return err; }
+ static bool + mlx5_tc_ct_entry_valid(struct mlx5_ct_entry *entry) + { + return test_bit(MLX5_CT_ENTRY_FLAG_VALID, &entry->flags); + } + + static struct mlx5_ct_entry * + mlx5_tc_ct_entry_get(struct mlx5_tc_ct_priv *ct_priv, struct mlx5_ct_tuple *tuple) + { + struct mlx5_ct_entry *entry; + + entry = rhashtable_lookup_fast(&ct_priv->ct_tuples_ht, tuple, + tuples_ht_params); + if (entry && mlx5_tc_ct_entry_valid(entry) && + refcount_inc_not_zero(&entry->refcnt)) { + return entry; + } else if (!entry) { + entry = rhashtable_lookup_fast(&ct_priv->ct_tuples_nat_ht, + tuple, tuples_nat_ht_params); + if (entry && mlx5_tc_ct_entry_valid(entry) && + refcount_inc_not_zero(&entry->refcnt)) + return entry; + } + + return entry ? ERR_PTR(-EINVAL) : NULL; + } + + static void mlx5_tc_ct_entry_remove_from_tuples(struct mlx5_ct_entry *entry) + { + struct mlx5_tc_ct_priv *ct_priv = entry->ct_priv; + + rhashtable_remove_fast(&ct_priv->ct_tuples_nat_ht, + &entry->tuple_nat_node, + tuples_nat_ht_params); + rhashtable_remove_fast(&ct_priv->ct_tuples_ht, &entry->tuple_node, + tuples_ht_params); + } + + static void mlx5_tc_ct_entry_del(struct mlx5_ct_entry *entry) + { + struct mlx5_tc_ct_priv *ct_priv = entry->ct_priv; + + mlx5_tc_ct_entry_del_rules(ct_priv, entry); + + spin_lock_bh(&ct_priv->ht_lock); + mlx5_tc_ct_entry_remove_from_tuples(entry); + spin_unlock_bh(&ct_priv->ht_lock); + + mlx5_tc_ct_counter_put(ct_priv, entry); + kfree(entry); + } + + static void + mlx5_tc_ct_entry_put(struct mlx5_ct_entry *entry) + { + if (!refcount_dec_and_test(&entry->refcnt)) + return; + + mlx5_tc_ct_entry_del(entry); + } + + static void mlx5_tc_ct_entry_del_work(struct work_struct *work) + { + struct mlx5_ct_entry *entry = container_of(work, struct mlx5_ct_entry, work); + + mlx5_tc_ct_entry_del(entry); + } + + static void + __mlx5_tc_ct_entry_put(struct mlx5_ct_entry *entry) + { + struct mlx5e_priv *priv; + + if (!refcount_dec_and_test(&entry->refcnt)) + return; + + priv = netdev_priv(entry->ct_priv->netdev); + INIT_WORK(&entry->work, mlx5_tc_ct_entry_del_work); + queue_work(priv->wq, &entry->work); + } + static struct mlx5_ct_counter * mlx5_tc_ct_counter_create(struct mlx5_tc_ct_priv *ct_priv) { @@@ -772,6 -862,7 +864,6 @@@ mlx5_tc_ct_shared_counter_get(struct ml struct mlx5_ct_counter *shared_counter; struct mlx5_ct_entry *rev_entry; __be16 tmp_port; - int ret;
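The helpers above establish a conventional refcounted lifetime for CT entries: lookups pin an entry with refcount_inc_not_zero() under ht_lock (which fails once a delete has dropped the count to zero), the entry is then used without the lock, and whichever put releases the last reference frees it; __mlx5_tc_ct_entry_put() defers the free to a workqueue for contexts that cannot sleep. A caller skeleton (the lookup and use helpers are illustrative):

        spin_lock_bh(&ct_priv->ht_lock);
        entry = lookup(ct_priv, key);
        if (entry && !refcount_inc_not_zero(&entry->refcnt))
                entry = NULL;                   /* raced with a delete */
        spin_unlock_bh(&ct_priv->ht_lock);

        if (entry) {
                use(entry);                     /* safe without the lock */
                mlx5_tc_ct_entry_put(entry);    /* frees on last reference */
        }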
/* get the reversed tuple */ tmp_port = rev_tuple.port.src; @@@ -793,20 -884,32 +885,30 @@@ }
/* Use the same counter as the reverse direction */ - mutex_lock(&ct_priv->shared_counter_lock); - rev_entry = rhashtable_lookup_fast(&ct_priv->ct_tuples_ht, &rev_tuple, - tuples_ht_params); - if (rev_entry) { - if (refcount_inc_not_zero(&rev_entry->counter->refcount)) { - mutex_unlock(&ct_priv->shared_counter_lock); - return rev_entry->counter; - } + spin_lock_bh(&ct_priv->ht_lock); + rev_entry = mlx5_tc_ct_entry_get(ct_priv, &rev_tuple); + + if (IS_ERR(rev_entry)) { + spin_unlock_bh(&ct_priv->ht_lock); + goto create_counter; } - mutex_unlock(&ct_priv->shared_counter_lock); + + if (rev_entry && refcount_inc_not_zero(&rev_entry->counter->refcount)) { + ct_dbg("Using shared counter entry=0x%p rev=0x%p\n", entry, rev_entry); + shared_counter = rev_entry->counter; + spin_unlock_bh(&ct_priv->ht_lock); + + mlx5_tc_ct_entry_put(rev_entry); + return shared_counter; + } + + spin_unlock_bh(&ct_priv->ht_lock); + + create_counter:
shared_counter = mlx5_tc_ct_counter_create(ct_priv); - if (IS_ERR(shared_counter)) { - ret = PTR_ERR(shared_counter); - return ERR_PTR(ret); - } + if (IS_ERR(shared_counter)) + return shared_counter;
shared_counter->is_shared = true; refcount_set(&shared_counter->refcount, 1); @@@ -865,10 -968,14 +967,14 @@@ mlx5_tc_ct_block_flow_offload_add(struc if (!meta_action) return -EOPNOTSUPP;
- entry = rhashtable_lookup_fast(&ft->ct_entries_ht, &cookie, - cts_ht_params); - if (entry) - return 0; + spin_lock_bh(&ct_priv->ht_lock); + entry = rhashtable_lookup_fast(&ft->ct_entries_ht, &cookie, cts_ht_params); + if (entry && refcount_inc_not_zero(&entry->refcnt)) { + spin_unlock_bh(&ct_priv->ht_lock); + mlx5_tc_ct_entry_put(entry); + return -EEXIST; + } + spin_unlock_bh(&ct_priv->ht_lock);
entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) @@@ -877,6 -984,8 +983,8 @@@ entry->tuple.zone = ft->zone; entry->cookie = flow->cookie; entry->restore_cookie = meta_action->ct_metadata.cookie; + refcount_set(&entry->refcnt, 2); + entry->ct_priv = ct_priv;
err = mlx5_tc_ct_rule_to_tuple(&entry->tuple, flow_rule); if (err) @@@ -887,35 -996,40 +995,40 @@@ if (err) goto err_set;
- err = rhashtable_insert_fast(&ct_priv->ct_tuples_ht, - &entry->tuple_node, - tuples_ht_params); + spin_lock_bh(&ct_priv->ht_lock); + + err = rhashtable_lookup_insert_fast(&ft->ct_entries_ht, &entry->node, + cts_ht_params); + if (err) + goto err_entries; + + err = rhashtable_lookup_insert_fast(&ct_priv->ct_tuples_ht, + &entry->tuple_node, + tuples_ht_params); if (err) goto err_tuple;
if (memcmp(&entry->tuple, &entry->tuple_nat, sizeof(entry->tuple))) { - err = rhashtable_insert_fast(&ct_priv->ct_tuples_nat_ht, - &entry->tuple_nat_node, - tuples_nat_ht_params); + err = rhashtable_lookup_insert_fast(&ct_priv->ct_tuples_nat_ht, + &entry->tuple_nat_node, + tuples_nat_ht_params); if (err) goto err_tuple_nat; } + spin_unlock_bh(&ct_priv->ht_lock);
err = mlx5_tc_ct_entry_add_rules(ct_priv, flow_rule, entry, ft->zone_restore_id); if (err) goto err_rules;
- err = rhashtable_insert_fast(&ft->ct_entries_ht, &entry->node, - cts_ht_params); - if (err) - goto err_insert; + set_bit(MLX5_CT_ENTRY_FLAG_VALID, &entry->flags); + mlx5_tc_ct_entry_put(entry); /* this function reference */
return 0;
- err_insert: - mlx5_tc_ct_entry_del_rules(ct_priv, entry); err_rules: + spin_lock_bh(&ct_priv->ht_lock); if (mlx5_tc_ct_entry_has_nat(entry)) rhashtable_remove_fast(&ct_priv->ct_tuples_nat_ht, &entry->tuple_nat_node, tuples_nat_ht_params); @@@ -924,47 -1038,43 +1037,43 @@@ err_tuple_nat &entry->tuple_node, tuples_ht_params); err_tuple: + rhashtable_remove_fast(&ft->ct_entries_ht, + &entry->node, + cts_ht_params); + err_entries: + spin_unlock_bh(&ct_priv->ht_lock); err_set: kfree(entry); - netdev_warn(ct_priv->netdev, - "Failed to offload ct entry, err: %d\n", err); + if (err != -EEXIST) + netdev_warn(ct_priv->netdev, "Failed to offload ct entry, err: %d\n", err); return err; }
- static void - mlx5_tc_ct_del_ft_entry(struct mlx5_tc_ct_priv *ct_priv, - struct mlx5_ct_entry *entry) - { - mlx5_tc_ct_entry_del_rules(ct_priv, entry); - mutex_lock(&ct_priv->shared_counter_lock); - if (mlx5_tc_ct_entry_has_nat(entry)) - rhashtable_remove_fast(&ct_priv->ct_tuples_nat_ht, - &entry->tuple_nat_node, - tuples_nat_ht_params); - rhashtable_remove_fast(&ct_priv->ct_tuples_ht, &entry->tuple_node, - tuples_ht_params); - mutex_unlock(&ct_priv->shared_counter_lock); - mlx5_tc_ct_counter_put(ct_priv, entry); - - } - static int mlx5_tc_ct_block_flow_offload_del(struct mlx5_ct_ft *ft, struct flow_cls_offload *flow) { + struct mlx5_tc_ct_priv *ct_priv = ft->ct_priv; unsigned long cookie = flow->cookie; struct mlx5_ct_entry *entry;
- entry = rhashtable_lookup_fast(&ft->ct_entries_ht, &cookie, - cts_ht_params); - if (!entry) + spin_lock_bh(&ct_priv->ht_lock); + entry = rhashtable_lookup_fast(&ft->ct_entries_ht, &cookie, cts_ht_params); + if (!entry) { + spin_unlock_bh(&ct_priv->ht_lock); return -ENOENT; + }
- mlx5_tc_ct_del_ft_entry(ft->ct_priv, entry); - WARN_ON(rhashtable_remove_fast(&ft->ct_entries_ht, - &entry->node, - cts_ht_params)); - kfree(entry); + if (!mlx5_tc_ct_entry_valid(entry)) { + spin_unlock_bh(&ct_priv->ht_lock); + return -EINVAL; + } + + rhashtable_remove_fast(&ft->ct_entries_ht, &entry->node, cts_ht_params); + mlx5_tc_ct_entry_remove_from_tuples(entry); + spin_unlock_bh(&ct_priv->ht_lock); + + mlx5_tc_ct_entry_put(entry);
return 0; } @@@ -973,19 -1083,30 +1082,30 @@@ static in mlx5_tc_ct_block_flow_offload_stats(struct mlx5_ct_ft *ft, struct flow_cls_offload *f) { + struct mlx5_tc_ct_priv *ct_priv = ft->ct_priv; unsigned long cookie = f->cookie; struct mlx5_ct_entry *entry; u64 lastuse, packets, bytes;
- entry = rhashtable_lookup_fast(&ft->ct_entries_ht, &cookie, - cts_ht_params); - if (!entry) + spin_lock_bh(&ct_priv->ht_lock); + entry = rhashtable_lookup_fast(&ft->ct_entries_ht, &cookie, cts_ht_params); + if (!entry) { + spin_unlock_bh(&ct_priv->ht_lock); return -ENOENT; + } + + if (!mlx5_tc_ct_entry_valid(entry) || !refcount_inc_not_zero(&entry->refcnt)) { + spin_unlock_bh(&ct_priv->ht_lock); + return -EINVAL; + } + + spin_unlock_bh(&ct_priv->ht_lock);
mlx5_fc_query_cached(entry->counter->counter, &bytes, &packets, &lastuse); flow_stats_update(&f->stats, bytes, packets, 0, lastuse, FLOW_ACTION_HW_STATS_DELAYED);
+ mlx5_tc_ct_entry_put(entry); return 0; }
@@@ -1087,8 -1208,8 +1207,8 @@@ mlx5_tc_ct_match_add(struct mlx5_tc_ct_ struct netlink_ext_ack *extack) { struct flow_rule *rule = flow_cls_offload_flow_rule(f); + bool trk, est, untrk, unest, new, rpl, unrpl; struct flow_dissector_key_ct *mask, *key; - bool trk, est, untrk, unest, new; u32 ctstate = 0, ctstate_mask = 0; u16 ct_state_on, ct_state_off; u16 ct_state, ct_state_mask; @@@ -1114,10 -1235,9 +1234,10 @@@
if (ct_state_mask & ~(TCA_FLOWER_KEY_CT_FLAGS_TRACKED | TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED | - TCA_FLOWER_KEY_CT_FLAGS_NEW)) { + TCA_FLOWER_KEY_CT_FLAGS_NEW | + TCA_FLOWER_KEY_CT_FLAGS_REPLY)) { NL_SET_ERR_MSG_MOD(extack, - "only ct_state trk, est and new are supported for offload"); + "only ct_state trk, est, new and rpl are supported for offload"); return -EOPNOTSUPP; }
@@@ -1126,17 -1246,13 +1246,17 @@@ trk = ct_state_on & TCA_FLOWER_KEY_CT_FLAGS_TRACKED; new = ct_state_on & TCA_FLOWER_KEY_CT_FLAGS_NEW; est = ct_state_on & TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED; + rpl = ct_state_on & TCA_FLOWER_KEY_CT_FLAGS_REPLY; untrk = ct_state_off & TCA_FLOWER_KEY_CT_FLAGS_TRACKED; unest = ct_state_off & TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED; + unrpl = ct_state_off & TCA_FLOWER_KEY_CT_FLAGS_REPLY;
ctstate |= trk ? MLX5_CT_STATE_TRK_BIT : 0; ctstate |= est ? MLX5_CT_STATE_ESTABLISHED_BIT : 0; + ctstate |= rpl ? MLX5_CT_STATE_REPLY_BIT : 0; ctstate_mask |= (untrk || trk) ? MLX5_CT_STATE_TRK_BIT : 0; ctstate_mask |= (unest || est) ? MLX5_CT_STATE_ESTABLISHED_BIT : 0; + ctstate_mask |= (unrpl || rpl) ? MLX5_CT_STATE_REPLY_BIT : 0;
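In tc flower syntax the new bit is the "rpl" flag, so a rule matching only reply-direction established connections would look roughly like this (interface names hypothetical, an iproute2 build with rpl support assumed):

        # tc filter add dev eth0 ingress protocol ip flower \
                ct_state +trk+est+rpl \
                action mirred egress redirect dev eth1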
if (new) { NL_SET_ERR_MSG_MOD(extack, @@@ -1251,8 -1367,9 +1371,8 @@@ static int tc_ct_pre_ct_add_rules(struc pre_ct->flow_rule = rule;
/* add miss rule */ - memset(spec, 0, sizeof(*spec)); dest.ft = nat ? ct_priv->ct_nat : ct_priv->ct; - rule = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1); + rule = mlx5_add_flow_rules(ft, NULL, &flow_act, &dest, 1); if (IS_ERR(rule)) { err = PTR_ERR(rule); ct_dbg("Failed to add pre ct miss rule zone %d", zone); @@@ -1481,11 -1598,9 +1601,9 @@@ err_mapping static void mlx5_tc_ct_flush_ft_entry(void *ptr, void *arg) { - struct mlx5_tc_ct_priv *ct_priv = arg; struct mlx5_ct_entry *entry = ptr;
- mlx5_tc_ct_del_ft_entry(ct_priv, entry); - kfree(entry); + mlx5_tc_ct_entry_put(entry); }
static void @@@ -1763,6 -1878,7 +1881,6 @@@ __mlx5_tc_ct_flow_offload_clear(struct goto err_set_registers; }
- dealloc_mod_hdr_actions(mod_acts); pre_ct_attr->modify_hdr = mod_hdr; pre_ct_attr->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
@@@ -1962,6 -2078,7 +2080,7 @@@ mlx5_tc_ct_init(struct mlx5e_priv *priv goto err_mapping_labels; }
+ spin_lock_init(&ct_priv->ht_lock); ct_priv->ns_type = ns_type; ct_priv->chains = chains; ct_priv->netdev = priv->netdev; @@@ -1996,7 -2113,6 +2115,6 @@@
idr_init(&ct_priv->fte_ids); mutex_init(&ct_priv->control_lock); - mutex_init(&ct_priv->shared_counter_lock); rhashtable_init(&ct_priv->zone_ht, &zone_params); rhashtable_init(&ct_priv->ct_tuples_ht, &tuples_ht_params); rhashtable_init(&ct_priv->ct_tuples_nat_ht, &tuples_nat_ht_params); @@@ -2039,7 -2155,6 +2157,6 @@@ mlx5_tc_ct_clean(struct mlx5_tc_ct_pri rhashtable_destroy(&ct_priv->ct_tuples_nat_ht); rhashtable_destroy(&ct_priv->zone_ht); mutex_destroy(&ct_priv->control_lock); - mutex_destroy(&ct_priv->shared_counter_lock); idr_destroy(&ct_priv->fte_ids); kfree(ct_priv); } @@@ -2061,14 -2176,22 +2178,22 @@@ mlx5e_tc_ct_restore_flow(struct mlx5_tc if (!mlx5_tc_ct_skb_to_tuple(skb, &tuple, zone)) return false;
- entry = rhashtable_lookup_fast(&ct_priv->ct_tuples_ht, &tuple, - tuples_ht_params); - if (!entry) - entry = rhashtable_lookup_fast(&ct_priv->ct_tuples_nat_ht, - &tuple, tuples_nat_ht_params); - if (!entry) + spin_lock(&ct_priv->ht_lock); + + entry = mlx5_tc_ct_entry_get(ct_priv, &tuple); + if (!entry) { + spin_unlock(&ct_priv->ht_lock); + return false; + } + + if (IS_ERR(entry)) { + spin_unlock(&ct_priv->ht_lock); return false; + } + spin_unlock(&ct_priv->ht_lock);
tcf_ct_flow_table_restore_skb(skb, entry->restore_cookie); + __mlx5_tc_ct_entry_put(entry); + return true; } diff --combined drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h index 959bb6cd7203,ff81b69a59a9..cc0efac7b812 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h @@@ -85,7 -85,7 +85,7 @@@ mlx5e_tx_tunnel_accel(struct sk_buff *s }
mlx5e_set_eseg_swp(skb, eseg, &swp_spec); - if (skb_vlan_tag_present(skb) && ihs) + if (skb_vlan_tag_present(skb) && ihs) mlx5e_eseg_swp_offsets_add_vlan(eseg); }
@@@ -144,9 -144,9 +144,9 @@@ static inline bool mlx5e_accel_tx_is_ip { #ifdef CONFIG_MLX5_EN_IPSEC return mlx5e_ipsec_is_tx_flow(&state->ipsec); -#endif - +#else return false; +#endif }
static inline unsigned int mlx5e_accel_tx_ids_len(struct mlx5e_txqsq *sq, @@@ -173,7 -173,7 +173,7 @@@ static inline bool mlx5e_accel_tx_eseg( #endif
#if IS_ENABLED(CONFIG_GENEVE) - if (skb->encapsulation) + if (skb->encapsulation && skb->ip_summed == CHECKSUM_PARTIAL) mlx5e_tx_tunnel_accel(skb, eseg, ihs); #endif
diff --combined drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 5e9474dba4e5,8612c388db7d..abdf721bb264 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@@ -447,17 -447,6 +447,17 @@@ int mlx5e_ethtool_set_channels(struct m goto out; }
+ /* Don't allow changing the number of channels if HTB offload is active, + * because the numbering of the QoS SQs will change while per-queue + * qdiscs are attached. + */ + if (priv->htb.maj_id) { + err = -EINVAL; + netdev_err(priv->netdev, "%s: HTB offload is active, cannot change the number of channels\n", + __func__); + goto out; + } + new_channels.params = *cur_params; new_channels.params.num_channels = count;
@@@ -536,7 -525,7 +536,7 @@@ static int mlx5e_get_coalesce(struct ne #define MLX5E_MAX_COAL_FRAMES MLX5_MAX_CQ_COUNT
static void - mlx5e_set_priv_channels_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesce *coal) + mlx5e_set_priv_channels_tx_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesce *coal) { struct mlx5_core_dev *mdev = priv->mdev; int tc; @@@ -551,6 -540,17 +551,17 @@@ coal->tx_coalesce_usecs, coal->tx_max_coalesced_frames); } + } + } + + static void + mlx5e_set_priv_channels_rx_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesce *coal) + { + struct mlx5_core_dev *mdev = priv->mdev; + int i; + + for (i = 0; i < priv->channels.num; ++i) { + struct mlx5e_channel *c = priv->channels.c[i];
mlx5_core_modify_cq_moderation(mdev, &c->rq.cq.mcq, coal->rx_coalesce_usecs, @@@ -597,21 -597,9 +608,9 @@@ int mlx5e_ethtool_set_coalesce(struct m tx_moder->pkts = coal->tx_max_coalesced_frames; new_channels.params.tx_dim_enabled = !!coal->use_adaptive_tx_coalesce;
- if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { - priv->channels.params = new_channels.params; - goto out; - } - /* we are opened */ - reset_rx = !!coal->use_adaptive_rx_coalesce != priv->channels.params.rx_dim_enabled; reset_tx = !!coal->use_adaptive_tx_coalesce != priv->channels.params.tx_dim_enabled;
- if (!reset_rx && !reset_tx) { - mlx5e_set_priv_channels_coalesce(priv, coal); - priv->channels.params = new_channels.params; - goto out; - } - if (reset_rx) { u8 mode = MLX5E_GET_PFLAG(&new_channels.params, MLX5E_PFLAG_RX_CQE_BASED_MODER); @@@ -625,6 -613,20 +624,20 @@@ mlx5e_reset_tx_moderation(&new_channels.params, mode); }
+ if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { + priv->channels.params = new_channels.params; + goto out; + } + + if (!reset_rx && !reset_tx) { + if (!coal->use_adaptive_rx_coalesce) + mlx5e_set_priv_channels_rx_coalesce(priv, coal); + if (!coal->use_adaptive_tx_coalesce) + mlx5e_set_priv_channels_tx_coalesce(priv, coal); + priv->channels.params = new_channels.params; + goto out; + } + err = mlx5e_safe_switch_channels(priv, &new_channels, NULL, NULL);
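After this reordering, manual values are written straight to the CQs only for the dimensions that remain non-adaptive, and a full channel switch happens only when an adaptive flag actually toggles. From user space nothing changes; the usual knobs apply (interface and values illustrative):

        # ethtool -C eth0 adaptive-rx off rx-usecs 8 rx-frames 32
        # ethtool -C eth0 adaptive-tx on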
out: @@@ -1983,16 -1985,6 +1996,16 @@@ static int set_pflag_tx_port_ts(struct if (!MLX5_CAP_GEN(mdev, ts_cqe_to_dest_cqn)) return -EOPNOTSUPP;
+ /* Don't allow changing the PTP state if HTB offload is active, because + * the numbering of the QoS SQs will change while per-queue qdiscs are + * attached. + */ + if (priv->htb.maj_id) { + netdev_err(priv->netdev, "%s: HTB offload is active, cannot change the PTP state\n", + __func__); + return -EINVAL; + } + new_channels.params = priv->channels.params; MLX5E_SET_PFLAG(&new_channels.params, MLX5E_PFLAG_TX_PORT_TS, enable); /* No need to verify SQ stop room as
bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev) { @@@ -108,7 -107,7 +109,7 @@@ bool mlx5e_striding_rq_possible(struct if (!mlx5e_check_fragmented_striding_rq_cap(mdev)) return false;
- if (MLX5_IPSEC_DEV(mdev)) + if (mlx5_fpga_is_ipsec_device(mdev)) return false;
if (params->xdp_prog) { @@@ -213,33 -212,6 +214,33 @@@ static void mlx5e_disable_async_events( mlx5_notifier_unregister(priv->mdev, &priv->events_nb); }
+static int blocking_event(struct notifier_block *nb, unsigned long event, void *data) +{ + struct mlx5e_priv *priv = container_of(nb, struct mlx5e_priv, blocking_events_nb); + int err; + + switch (event) { + case MLX5_DRIVER_EVENT_TYPE_TRAP: + err = mlx5e_handle_trap_event(priv, data); + break; + default: + netdev_warn(priv->netdev, "Sync event: Unknown event %ld\n", event); + err = -EINVAL; + } + return err; +} + +static void mlx5e_enable_blocking_events(struct mlx5e_priv *priv) +{ + priv->blocking_events_nb.notifier_call = blocking_event; + mlx5_blocking_notifier_register(priv->mdev, &priv->blocking_events_nb); +} + +static void mlx5e_disable_blocking_events(struct mlx5e_priv *priv) +{ + mlx5_blocking_notifier_unregister(priv->mdev, &priv->blocking_events_nb); +} + static inline void mlx5e_build_umr_wqe(struct mlx5e_rq *rq, struct mlx5e_icosq *sq, struct mlx5e_umr_wqe *wqe) @@@ -371,11 -343,13 +372,11 @@@ static void mlx5e_init_frags_partition( prev->last_in_page = true; }
-static int mlx5e_init_di_list(struct mlx5e_rq *rq, - int wq_sz, int cpu) +int mlx5e_init_di_list(struct mlx5e_rq *rq, int wq_sz, int node) { int len = wq_sz << rq->wqe.info.log_num_frags;
- rq->wqe.di = kvzalloc_node(array_size(len, sizeof(*rq->wqe.di)), - GFP_KERNEL, cpu_to_node(cpu)); + rq->wqe.di = kvzalloc_node(array_size(len, sizeof(*rq->wqe.di)), GFP_KERNEL, node); if (!rq->wqe.di) return -ENOMEM;
@@@ -384,7 -358,7 +385,7 @@@ return 0; }
-static void mlx5e_free_di_list(struct mlx5e_rq *rq) +void mlx5e_free_di_list(struct mlx5e_rq *rq) { kvfree(rq->wqe.di); } @@@ -449,9 -423,6 +450,9 @@@ static int mlx5e_alloc_rq(struct mlx5e_ rq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); rq->xdpsq = &c->rq_xdpsq; rq->xsk_pool = xsk_pool; + rq->ptp_cyc2time = mlx5_is_real_time_rq(mdev) ? + mlx5_real_time_cyc2time : + mlx5_timecounter_cyc2time;
if (rq->xsk_pool) rq->stats = &c->priv->channel_stats[c->ix].xskrq; @@@ -529,7 -500,7 +530,7 @@@ goto err_rq_wq_destroy; }
- err = mlx5e_init_di_list(rq, wq_sz, c->cpu); + err = mlx5e_init_di_list(rq, wq_sz, cpu_to_node(c->cpu)); if (err) goto err_rq_frags;
@@@ -680,10 -651,11 +681,10 @@@ static void mlx5e_free_rq(struct mlx5e_ mlx5_wq_destroy(&rq->wq_ctrl); }
-static int mlx5e_create_rq(struct mlx5e_rq *rq, - struct mlx5e_rq_param *param) +int mlx5e_create_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param) { struct mlx5_core_dev *mdev = rq->mdev; - + u8 ts_format; void *in; void *rqc; void *wq; @@@ -696,9 -668,6 +697,9 @@@ if (!in) return -ENOMEM;
+ ts_format = mlx5_is_real_time_rq(mdev) ? + MLX5_RQC_TIMESTAMP_FORMAT_REAL_TIME : + MLX5_RQC_TIMESTAMP_FORMAT_FREE_RUNNING; rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); wq = MLX5_ADDR_OF(rqc, rqc, wq);
@@@ -706,7 -675,6 +707,7 @@@
MLX5_SET(rqc, rqc, cqn, rq->cq.mcq.cqn); MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); + MLX5_SET(rqc, rqc, ts_format, ts_format); MLX5_SET(wq, wq, log_wq_pg_sz, rq->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(wq, wq, dbr_addr, rq->wq_ctrl.db.dma); @@@ -807,7 -775,7 +808,7 @@@ static int mlx5e_modify_rq_vsd(struct m return err; }
-static void mlx5e_destroy_rq(struct mlx5e_rq *rq) +void mlx5e_destroy_rq(struct mlx5e_rq *rq) { mlx5_core_destroy_rq(rq->mdev, rq->rqn); } @@@ -947,7 -915,7 +948,7 @@@ void mlx5e_activate_rq(struct mlx5e_rq void mlx5e_deactivate_rq(struct mlx5e_rq *rq) { clear_bit(MLX5E_RQ_STATE_ENABLED, &rq->state); - synchronize_rcu(); /* Sync with NAPI to prevent mlx5e_post_rx_wqes. */ + synchronize_net(); /* Sync with NAPI to prevent mlx5e_post_rx_wqes. */ }
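synchronize_net() is a drop-in replacement for synchronize_rcu() in netdev paths: when RTNL is held it uses synchronize_rcu_expedited(), so the several quiesce points channel teardown walks through (this one and the SQ/ICOSQ/XDPSQ ones below) complete much faster. The deactivation pattern itself is unchanged:

        clear_bit(MLX5E_RQ_STATE_ENABLED, &rq->state);
        synchronize_net();      /* all NAPI pollers now see the cleared bit */
        /* past this point no poller touches the queue; teardown is safe */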
void mlx5e_close_rq(struct mlx5e_rq *rq) @@@ -1176,6 -1144,7 +1177,6 @@@ static int mlx5e_alloc_txqsq(struct mlx sq->uar_map = mdev->mlx5e_res.bfreg.map; sq->min_inline_mode = params->tx_min_inline_mode; sq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); - sq->stats = &c->priv->channel_stats[c->ix].sq[tc]; INIT_WORK(&sq->recover_work, mlx5e_tx_err_cqe_work); if (!MLX5_CAP_ETH(mdev, wqe_vlan_insert)) set_bit(MLX5E_SQ_STATE_VLAN_NEED_L2_INLINE, &sq->state); @@@ -1186,9 -1155,6 +1187,9 @@@ if (param->is_mpw) set_bit(MLX5E_SQ_STATE_MPWQE, &sq->state); sq->stop_room = param->stop_room; + sq->ptp_cyc2time = mlx5_is_real_time_sq(mdev) ? + mlx5_real_time_cyc2time : + mlx5_timecounter_cyc2time;
param->wq.db_numa_node = cpu_to_node(c->cpu); err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, wq, &sq->wq_ctrl); @@@ -1222,7 -1188,6 +1223,7 @@@ static int mlx5e_create_sq(struct mlx5_ struct mlx5e_create_sq_param *csp, u32 *sqn) { + u8 ts_format; void *in; void *sqc; void *wq; @@@ -1235,9 -1200,6 +1236,9 @@@ if (!in) return -ENOMEM;
+ ts_format = mlx5_is_real_time_sq(mdev) ? + MLX5_SQC_TIMESTAMP_FORMAT_REAL_TIME : + MLX5_SQC_TIMESTAMP_FORMAT_FREE_RUNNING; sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); wq = MLX5_ADDR_OF(sqc, sqc, wq);
@@@ -1246,8 -1208,6 +1247,8 @@@ MLX5_SET(sqc, sqc, tis_num_0, csp->tisn); MLX5_SET(sqc, sqc, cqn, csp->cqn); MLX5_SET(sqc, sqc, ts_cqe_to_dest_cqn, csp->ts_cqe_to_dest_cqn); + MLX5_SET(sqc, sqc, ts_format, ts_format); +
if (MLX5_CAP_ETH(mdev, wqe_inline_mode) == MLX5_CAP_INLINE_MODE_VPORT_CONTEXT) MLX5_SET(sqc, sqc, min_wqe_inline_mode, csp->min_inline_mode); @@@ -1274,7 -1234,6 +1275,7 @@@ int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn, struct mlx5e_modify_sq_param *p) { + u64 bitmask = 0; void *in; void *sqc; int inlen; @@@ -1290,14 -1249,9 +1291,14 @@@ MLX5_SET(modify_sq_in, in, sq_state, p->curr_state); MLX5_SET(sqc, sqc, state, p->next_state); if (p->rl_update && p->next_state == MLX5_SQC_STATE_RDY) { - MLX5_SET64(modify_sq_in, in, modify_bitmask, 1); - MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, p->rl_index); + bitmask |= 1; + MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, p->rl_index); + } + if (p->qos_update && p->next_state == MLX5_SQC_STATE_RDY) { + bitmask |= 1 << 2; + MLX5_SET(sqc, sqc, qos_queue_group_id, p->qos_queue_group_id); } + MLX5_SET64(modify_sq_in, in, modify_bitmask, bitmask);
err = mlx5_core_modify_sq(mdev, sqn, in);
@@@ -1314,7 -1268,6 +1315,7 @@@ static void mlx5e_destroy_sq(struct mlx int mlx5e_create_sq_rdy(struct mlx5_core_dev *mdev, struct mlx5e_sq_param *param, struct mlx5e_create_sq_param *csp, + u16 qos_queue_group_id, u32 *sqn) { struct mlx5e_modify_sq_param msp = {0}; @@@ -1326,10 -1279,6 +1327,10 @@@
msp.curr_state = MLX5_SQC_STATE_RST; msp.next_state = MLX5_SQC_STATE_RDY; + if (qos_queue_group_id) { + msp.qos_update = true; + msp.qos_queue_group_id = qos_queue_group_id; + } err = mlx5e_modify_sq(mdev, *sqn, &msp); if (err) mlx5e_destroy_sq(mdev, *sqn); @@@ -1340,9 -1289,13 +1341,9 @@@ static int mlx5e_set_sq_maxrate(struct net_device *dev, struct mlx5e_txqsq *sq, u32 rate);
-static int mlx5e_open_txqsq(struct mlx5e_channel *c, - u32 tisn, - int txq_ix, - struct mlx5e_params *params, - struct mlx5e_sq_param *param, - struct mlx5e_txqsq *sq, - int tc) +int mlx5e_open_txqsq(struct mlx5e_channel *c, u32 tisn, int txq_ix, + struct mlx5e_params *params, struct mlx5e_sq_param *param, + struct mlx5e_txqsq *sq, int tc, u16 qos_queue_group_id, u16 qos_qid) { struct mlx5e_create_sq_param csp = {}; u32 tx_rate; @@@ -1352,17 -1305,12 +1353,17 @@@ if (err) return err;
+ if (qos_queue_group_id) + sq->stats = c->priv->htb.qos_sq_stats[qos_qid]; + else + sq->stats = &c->priv->channel_stats[c->ix].sq[tc]; + csp.tisn = tisn; csp.tis_lst_sz = 1; csp.cqn = sq->cq.mcq.cqn; csp.wq_ctrl = &sq->wq_ctrl; csp.min_inline_mode = sq->min_inline_mode; - err = mlx5e_create_sq_rdy(c->mdev, param, &csp, &sq->sqn); + err = mlx5e_create_sq_rdy(c->mdev, param, &csp, qos_queue_group_id, &sq->sqn); if (err) goto err_free_txqsq;
@@@ -1401,7 -1349,7 +1402,7 @@@ void mlx5e_deactivate_txqsq(struct mlx5 struct mlx5_wq_cyc *wq = &sq->wq;
clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); - synchronize_rcu(); /* Sync with NAPI to prevent netif_tx_wake_queue. */ + synchronize_net(); /* Sync with NAPI to prevent netif_tx_wake_queue. */
mlx5e_tx_disable_queue(sq->txq);
@@@ -1419,7 -1367,7 +1420,7 @@@ } }
-static void mlx5e_close_txqsq(struct mlx5e_txqsq *sq) +void mlx5e_close_txqsq(struct mlx5e_txqsq *sq) { struct mlx5_core_dev *mdev = sq->mdev; struct mlx5_rate_limit rl = {0}; @@@ -1456,7 -1404,7 +1457,7 @@@ int mlx5e_open_icosq(struct mlx5e_chann csp.cqn = sq->cq.mcq.cqn; csp.wq_ctrl = &sq->wq_ctrl; csp.min_inline_mode = params->tx_min_inline_mode; - err = mlx5e_create_sq_rdy(c->mdev, param, &csp, &sq->sqn); + err = mlx5e_create_sq_rdy(c->mdev, param, &csp, 0, &sq->sqn); if (err) goto err_free_icosq;
@@@ -1476,7 -1424,7 +1477,7 @@@ void mlx5e_activate_icosq(struct mlx5e_ void mlx5e_deactivate_icosq(struct mlx5e_icosq *icosq) { clear_bit(MLX5E_SQ_STATE_ENABLED, &icosq->state); - synchronize_rcu(); /* Sync with NAPI. */ + synchronize_net(); /* Sync with NAPI. */ }
void mlx5e_close_icosq(struct mlx5e_icosq *sq) @@@ -1505,7 -1453,7 +1506,7 @@@ int mlx5e_open_xdpsq(struct mlx5e_chann csp.wq_ctrl = &sq->wq_ctrl; csp.min_inline_mode = sq->min_inline_mode; set_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); - err = mlx5e_create_sq_rdy(c->mdev, param, &csp, &sq->sqn); + err = mlx5e_create_sq_rdy(c->mdev, param, &csp, 0, &sq->sqn); if (err) goto err_free_xdpsq;
@@@ -1555,7 -1503,7 +1556,7 @@@ void mlx5e_close_xdpsq(struct mlx5e_xdp struct mlx5e_channel *c = sq->channel;
clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); - synchronize_rcu(); /* Sync with NAPI. */ + synchronize_net(); /* Sync with NAPI. */
mlx5e_destroy_sq(c->mdev, sq->sqn); mlx5e_free_xdpsq_descs(sq); @@@ -1756,7 -1704,7 +1757,7 @@@ static int mlx5e_open_sqs(struct mlx5e_ int txq_ix = c->ix + tc * params->num_channels;
err = mlx5e_open_txqsq(c, c->priv->tisn[c->lag_port][tc], txq_ix, - params, &cparam->txq_sq, &c->sq[tc], tc); + params, &cparam->txq_sq, &c->sq[tc], tc, 0, 0); if (err) goto err_close_sqs; } @@@ -1879,12 -1827,12 +1880,12 @@@ static int mlx5e_open_queues(struct mlx
mlx5e_build_create_cq_param(&ccp, c);
- err = mlx5e_open_cq(c->priv, icocq_moder, &cparam->icosq.cqp, &ccp, + err = mlx5e_open_cq(c->priv, icocq_moder, &cparam->async_icosq.cqp, &ccp, &c->async_icosq.cq); if (err) return err;
- err = mlx5e_open_cq(c->priv, icocq_moder, &cparam->async_icosq.cqp, &ccp, + err = mlx5e_open_cq(c->priv, icocq_moder, &cparam->icosq.cqp, &ccp, &c->icosq.cq); if (err) goto err_close_async_icosq_cq; @@@ -1908,11 -1856,13 +1909,11 @@@ if (err) goto err_close_rx_cq;
- napi_enable(&c->napi); - spin_lock_init(&c->async_icosq_lock);
err = mlx5e_open_icosq(c, params, &cparam->async_icosq, &c->async_icosq); if (err) - goto err_disable_napi; + goto err_close_xdpsq_cq;
err = mlx5e_open_icosq(c, params, &cparam->icosq, &c->icosq); if (err) @@@ -1955,7 -1905,9 +1956,7 @@@ err_close_icosq err_close_async_icosq: mlx5e_close_icosq(&c->async_icosq);
-err_disable_napi: - napi_disable(&c->napi); - +err_close_xdpsq_cq: if (c->xdp) mlx5e_close_cq(&c->rq_xdpsq.cq);
@@@ -1986,6 -1938,7 +1987,6 @@@ static void mlx5e_close_queues(struct m mlx5e_close_sqs(c); mlx5e_close_icosq(&c->icosq); mlx5e_close_icosq(&c->async_icosq); - napi_disable(&c->napi); if (c->xdp) mlx5e_close_cq(&c->rq_xdpsq.cq); mlx5e_close_cq(&c->rq.cq); @@@ -2070,8 -2023,6 +2071,8 @@@ static void mlx5e_activate_channel(stru { int tc;
+ napi_enable(&c->napi); + for (tc = 0; tc < c->num_tc; tc++) mlx5e_activate_txqsq(&c->sq[tc]); mlx5e_activate_icosq(&c->icosq); @@@ -2094,9 -2045,6 +2095,9 @@@ static void mlx5e_deactivate_channel(st mlx5e_deactivate_icosq(&c->icosq); for (tc = 0; tc < c->num_tc; tc++) mlx5e_deactivate_txqsq(&c->sq[tc]); + mlx5e_qos_deactivate_queues(c); + + napi_disable(&c->napi); }
static void mlx5e_close_channel(struct mlx5e_channel *c) @@@ -2104,7 -2052,6 +2105,7 @@@ if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state)) mlx5e_close_xsk(c); mlx5e_close_queues(c); + mlx5e_qos_close_queues(c); netif_napi_del(&c->napi);
kvfree(c); @@@ -2122,8 -2069,10 +2123,13 @@@ static void mlx5e_build_rq_frags_info(s u32 buf_size = 0; int i;
++<<<<<<< HEAD + if (MLX5_IPSEC_DEV(mdev)) ++======= + #ifdef CONFIG_MLX5_EN_IPSEC + if (mlx5_fpga_is_ipsec_device(mdev)) ++>>>>>>> 3af409ca278d4a8d50e91f9f7c4c33b175645cf3 byte_count += MLX5E_METADATA_ETHER_LEN; -#endif
if (mlx5e_rx_is_linear_skb(params, xsk)) { int frag_stride; @@@ -2252,8 -2201,9 +2258,8 @@@ void mlx5e_build_sq_param_common(struc param->wq.buf_numa_node = dev_to_node(mlx5_core_dma_dev(priv->mdev)); }
-static void mlx5e_build_sq_param(struct mlx5e_priv *priv, - struct mlx5e_params *params, - struct mlx5e_sq_param *param) +void mlx5e_build_sq_param(struct mlx5e_priv *priv, struct mlx5e_params *params, + struct mlx5e_sq_param *param) { void *sqc = param->sqc; void *wq = MLX5_ADDR_OF(sqc, sqc, wq); @@@ -2432,18 -2382,10 +2438,18 @@@ int mlx5e_open_channels(struct mlx5e_pr goto err_close_channels; }
+ err = mlx5e_qos_open_queues(priv, chs); + if (err) + goto err_close_ptp; + mlx5e_health_channels_update(priv); kvfree(cparam); return 0;
+err_close_ptp: + if (chs->port_ptp) + mlx5e_port_ptp_close(chs->port_ptp); + err_close_channels: for (i--; i >= 0; i--) mlx5e_close_channel(chs->c[i]); @@@ -2976,31 -2918,11 +2982,31 @@@ static void mlx5e_netdev_set_tcs(struc netdev_set_tc_queue(netdev, tc, nch, 0); }
+int mlx5e_update_tx_netdev_queues(struct mlx5e_priv *priv) +{ + int qos_queues, nch, ntc, num_txqs, err; + + qos_queues = mlx5e_qos_cur_leaf_nodes(priv); + + nch = priv->channels.params.num_channels; + ntc = priv->channels.params.num_tc; + num_txqs = nch * ntc + qos_queues; + if (MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_TX_PORT_TS)) + num_txqs += ntc; + + mlx5e_dbg(DRV, priv, "Setting num_txqs %d\n", num_txqs); + err = netif_set_real_num_tx_queues(priv->netdev, num_txqs); + if (err) + netdev_warn(priv->netdev, "netif_set_real_num_tx_queues failed, %d\n", err); + + return err; +} + static int mlx5e_update_netdev_queues(struct mlx5e_priv *priv) { struct net_device *netdev = priv->netdev; - int num_txqs, num_rxqs, nch, ntc; int old_num_txqs, old_ntc; + int num_rxqs, nch, ntc; int err;
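A quick worked example of the count this helper computes: with, say, 8 channels, 2 TCs, 3 active HTB leaf queues and the TX_PORT_TS pflag set, num_txqs = 8 * 2 + 3 + 2 = 21 real TX queues.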
old_num_txqs = netdev->real_num_tx_queues; @@@ -3008,13 -2930,18 +3014,13 @@@
nch = priv->channels.params.num_channels; ntc = priv->channels.params.num_tc; - num_txqs = nch * ntc; - if (MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_TX_PORT_TS)) - num_txqs += ntc; num_rxqs = nch * priv->profile->rq_groups;
mlx5e_netdev_set_tcs(netdev, nch, ntc);
- err = netif_set_real_num_tx_queues(netdev, num_txqs); - if (err) { - netdev_warn(netdev, "netif_set_real_num_tx_queues failed, %d\n", err); + err = mlx5e_update_tx_netdev_queues(priv); + if (err) goto err_tcs; - } err = netif_set_real_num_rx_queues(netdev, num_rxqs); if (err) { netdev_warn(netdev, "netif_set_real_num_rx_queues failed, %d\n", err); @@@ -3118,7 -3045,6 +3124,7 @@@ void mlx5e_activate_priv_channels(struc mlx5e_update_num_tc_x_num_ch(priv); mlx5e_build_txq_maps(priv); mlx5e_activate_channels(&priv->channels); + mlx5e_qos_activate_queues(priv); mlx5e_xdp_tx_enable(priv); netif_tx_start_all_queues(priv->netdev);
@@@ -3261,7 -3187,6 +3267,7 @@@ int mlx5e_open_locked(struct net_devic
priv->profile->update_rx(priv); mlx5e_activate_priv_channels(priv); + mlx5e_apply_traps(priv, true); if (priv->profile->update_carrier) priv->profile->update_carrier(priv);
@@@ -3297,7 -3222,6 +3303,7 @@@ int mlx5e_close_locked(struct net_devic if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) return 0;
+ mlx5e_apply_traps(priv, false); clear_bit(MLX5E_STATE_OPENED, &priv->state);
netif_carrier_off(priv->netdev); @@@ -3687,14 -3611,6 +3693,14 @@@ static int mlx5e_setup_tc_mqprio(struc
mutex_lock(&priv->state_lock);
+ /* MQPRIO is another toplevel qdisc that can't be attached + * simultaneously with the offloaded HTB. + */ + if (WARN_ON(priv->htb.maj_id)) { + err = -EINVAL; + goto out; + } + new_channels.params = priv->channels.params; new_channels.params.num_tc = tc ? tc : 1;
@@@ -3720,55 -3636,12 +3726,55 @@@ out return err; }
+static int mlx5e_setup_tc_htb(struct mlx5e_priv *priv, struct tc_htb_qopt_offload *htb) +{ + int res; + + switch (htb->command) { + case TC_HTB_CREATE: + return mlx5e_htb_root_add(priv, htb->parent_classid, htb->classid, + htb->extack); + case TC_HTB_DESTROY: + return mlx5e_htb_root_del(priv); + case TC_HTB_LEAF_ALLOC_QUEUE: + res = mlx5e_htb_leaf_alloc_queue(priv, htb->classid, htb->parent_classid, + htb->rate, htb->ceil, htb->extack); + if (res < 0) + return res; + htb->qid = res; + return 0; + case TC_HTB_LEAF_TO_INNER: + return mlx5e_htb_leaf_to_inner(priv, htb->parent_classid, htb->classid, + htb->rate, htb->ceil, htb->extack); + case TC_HTB_LEAF_DEL: + return mlx5e_htb_leaf_del(priv, htb->classid, &htb->moved_qid, &htb->qid, + htb->extack); + case TC_HTB_LEAF_DEL_LAST: + case TC_HTB_LEAF_DEL_LAST_FORCE: + return mlx5e_htb_leaf_del_last(priv, htb->classid, + htb->command == TC_HTB_LEAF_DEL_LAST_FORCE, + htb->extack); + case TC_HTB_NODE_MODIFY: + return mlx5e_htb_node_modify(priv, htb->classid, htb->rate, htb->ceil, + htb->extack); + case TC_HTB_LEAF_QUERY_QUEUE: + res = mlx5e_get_txq_by_classid(priv, htb->classid); + if (res < 0) + return res; + htb->qid = res; + return 0; + default: + return -EOPNOTSUPP; + } +} + static LIST_HEAD(mlx5e_block_cb_list);
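Each TC_HTB_* command above maps onto an operation of the offloaded qdisc; a minimal user-space sequence exercising CREATE, LEAF_ALLOC_QUEUE and NODE_MODIFY would be (interface and rates illustrative):

        # tc qdisc replace dev eth0 root handle 1: htb offload
        # tc class add dev eth0 parent 1: classid 1:1 htb rate 100mbit ceil 200mbit
        # tc class change dev eth0 parent 1: classid 1:1 htb rate 150mbit ceil 200mbit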
static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data) { struct mlx5e_priv *priv = netdev_priv(dev); + int err;
switch (type) { case TC_SETUP_BLOCK: { @@@ -3782,11 -3655,6 +3788,11 @@@ } case TC_SETUP_QDISC_MQPRIO: return mlx5e_setup_tc_mqprio(priv, type_data); + case TC_SETUP_QDISC_HTB: + mutex_lock(&priv->state_lock); + err = mlx5e_setup_tc_htb(priv, type_data); + mutex_unlock(&priv->state_lock); + return err; default: return -EOPNOTSUPP; } @@@ -3902,7 -3770,7 +3908,7 @@@ static int set_feature_lro(struct net_d mutex_lock(&priv->state_lock);
if (enable && priv->xsk.refcnt) { - netdev_warn(netdev, "LRO is incompatible with AF_XDP (%hu XSKs are active)\n", + netdev_warn(netdev, "LRO is incompatible with AF_XDP (%u XSKs are active)\n", priv->xsk.refcnt); err = -EINVAL; goto out; @@@ -3956,25 -3824,20 +3962,25 @@@ static int set_feature_cvlan_filter(str return 0; }
-#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) -static int set_feature_tc_num_filters(struct net_device *netdev, bool enable) +static int set_feature_hw_tc(struct net_device *netdev, bool enable) { struct mlx5e_priv *priv = netdev_priv(netdev);
+#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) if (!enable && mlx5e_tc_num_filters(priv, MLX5_TC_FLAG(NIC_OFFLOAD))) { netdev_err(netdev, "Active offloaded tc filters, can't turn hw_tc_offload off\n"); return -EINVAL; } +#endif + + if (!enable && priv->htb.maj_id) { + netdev_err(netdev, "Active HTB offload, can't turn hw_tc_offload off\n"); + return -EINVAL; + }
return 0; } -#endif
static int set_feature_rx_all(struct net_device *netdev, bool enable) { @@@ -4072,7 -3935,9 +4078,7 @@@ int mlx5e_set_features(struct net_devic err |= MLX5E_HANDLE_FEATURE(NETIF_F_LRO, set_feature_lro); err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_VLAN_CTAG_FILTER, set_feature_cvlan_filter); -#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) - err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_TC, set_feature_tc_num_filters); -#endif + err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_TC, set_feature_hw_tc); err |= MLX5E_HANDLE_FEATURE(NETIF_F_RXALL, set_feature_rx_all); err |= MLX5E_HANDLE_FEATURE(NETIF_F_RXFCS, set_feature_rx_fcs); err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_VLAN_CTAG_RX, set_feature_rx_vlan); @@@ -4105,7 -3970,6 +4111,7 @@@ static netdev_features_t mlx5e_fix_feat if (!params->vlan_strip_disable) netdev_warn(netdev, "Dropping C-tag vlan stripping offload due to S-tag vlan\n"); } + if (!MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ)) { if (features & NETIF_F_LRO) { netdev_warn(netdev, "Disabling LRO, not supported in legacy RQ\n"); @@@ -4153,7 -4017,7 +4159,7 @@@ static bool mlx5e_xsk_validate_mtu(stru max_mtu_page = mlx5e_xdp_max_mtu(new_params, &xsk); max_mtu = min(max_mtu_frame, max_mtu_page);
- netdev_err(netdev, "MTU %d is too big for an XSK running on channel %hu. Try MTU <= %d\n", + netdev_err(netdev, "MTU %d is too big for an XSK running on channel %u. Try MTU <= %d\n", new_params->sw_mtu, ix, max_mtu); return false; } @@@ -4530,8 -4394,10 +4536,8 @@@ netdev_features_t mlx5e_features_check( features = vlan_features_check(skb, features); features = vxlan_features_check(skb, features);
-#ifdef CONFIG_MLX5_EN_IPSEC if (mlx5e_ipsec_feature_check(skb, netdev, features)) return features; -#endif
/* Validate if the tunneled packet is being offloaded by HW */ if (skb->encapsulation && @@@ -4590,8 -4456,9 +4596,9 @@@ static int mlx5e_xdp_allowed(struct mlx return -EINVAL; }
- if (MLX5_IPSEC_DEV(priv->mdev)) { - netdev_warn(netdev, "can't set XDP with IPSec offload\n"); + if (mlx5_fpga_is_ipsec_device(priv->mdev)) { + netdev_warn(netdev, + "XDP is not available on Innova cards with IPsec support\n"); return -EINVAL; }
@@@ -4774,6 -4641,8 +4781,6 @@@ const struct net_device_ops mlx5e_netde .ndo_change_mtu = mlx5e_change_nic_mtu, .ndo_do_ioctl = mlx5e_ioctl, .ndo_set_tx_maxrate = mlx5e_set_tx_maxrate, - .ndo_udp_tunnel_add = udp_tunnel_nic_add_port, - .ndo_udp_tunnel_del = udp_tunnel_nic_del_port, .ndo_features_check = mlx5e_features_check, .ndo_tx_timeout = mlx5e_tx_timeout, .ndo_bpf = mlx5e_xdp, @@@ -4941,15 -4810,15 +4948,15 @@@ void mlx5e_build_rss_params(struct mlx5 tirc_default_config[tt].rx_hash_fields; }
-void mlx5e_build_nic_params(struct mlx5e_priv *priv, - struct mlx5e_xsk *xsk, - struct mlx5e_rss_params *rss_params, - struct mlx5e_params *params, - u16 mtu) +void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, u16 mtu) { + struct mlx5e_rss_params *rss_params = &priv->rss_params; + struct mlx5e_params *params = &priv->channels.params; struct mlx5_core_dev *mdev = priv->mdev; u8 rx_cq_period_mode;
+ priv->max_nch = mlx5e_calc_max_nch(priv, priv->profile); + params->sw_mtu = mtu; params->hard_mtu = MLX5E_ETH_HARD_MTU; params->num_channels = min_t(unsigned int, MLX5E_MAX_NUM_CHANNELS / 2, @@@ -5007,11 -4876,6 +5014,11 @@@
 /* AF_XDP */
 params->xsk = xsk;
+
+	/* Do not update netdev->features directly here; on
+	 * mlx5e_attach_netdev() we will call mlx5e_update_features().
+	 * To change netdev->features, modify mlx5e_fix_features() instead.
+	 */
 }
static void mlx5e_set_netdev_dev_addr(struct net_device *netdev) @@@ -5113,6 -4977,8 +5120,6 @@@ static void mlx5e_build_nic_netdev(stru netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_FILTER; netdev->hw_features |= NETIF_F_HW_VLAN_STAG_TX;
- mlx5e_vxlan_set_netdev_info(priv); - if (mlx5e_tunnel_any_tx_proto_supported(mdev)) { netdev->hw_enc_features |= NETIF_F_HW_CSUM; netdev->hw_enc_features |= NETIF_F_TSO; @@@ -5162,12 -5028,18 +5169,12 @@@ netdev->hw_features |= NETIF_F_RXFCS;
netdev->features = netdev->hw_features; - if (!priv->channels.params.lro_en) - netdev->features &= ~NETIF_F_LRO;
+ /* Defaults */ if (fcs_enabled) netdev->features &= ~NETIF_F_RXALL; - - if (!priv->channels.params.scatter_fcs_en) - netdev->features &= ~NETIF_F_RXFCS; - - /* prefere CQE compression over rxhash */ - if (MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_RX_CQE_COMPRESS)) - netdev->features &= ~NETIF_F_RXHASH; + netdev->features &= ~NETIF_F_LRO; + netdev->features &= ~NETIF_F_RXFCS;
#define FT_CAP(f) MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_receive.f) if (FT_CAP(flow_modify_en) && @@@ -5181,8 -5053,6 +5188,8 @@@ netdev->hw_features |= NETIF_F_NTUPLE; #endif } + if (mlx5_qos_is_supported(mdev)) + netdev->features |= NETIF_F_HW_TC;
netdev->features |= NETIF_F_HIGHDMA; netdev->features |= NETIF_F_HW_VLAN_STAG_FILTER; @@@ -5233,28 -5103,33 +5240,28 @@@ void mlx5e_destroy_q_counters(struct ml }
static int mlx5e_nic_init(struct mlx5_core_dev *mdev, - struct net_device *netdev, - const struct mlx5e_profile *profile, - void *ppriv) + struct net_device *netdev) { struct mlx5e_priv *priv = netdev_priv(netdev); - struct mlx5e_rss_params *rss = &priv->rss_params; int err;
- err = mlx5e_netdev_init(netdev, priv, mdev, profile, ppriv); - if (err) - return err; - - mlx5e_build_nic_params(priv, &priv->xsk, rss, &priv->channels.params, - netdev->mtu); + mlx5e_build_nic_params(priv, &priv->xsk, netdev->mtu); + mlx5e_vxlan_set_netdev_info(priv);
mlx5e_timestamp_init(priv);
err = mlx5e_ipsec_init(priv); if (err) mlx5_core_err(mdev, "IPSec initialization failed, %d\n", err); + err = mlx5e_tls_init(priv); if (err) mlx5_core_err(mdev, "TLS initialization failed, %d\n", err); - mlx5e_build_nic_netdev(netdev); + err = mlx5e_devlink_port_register(priv); if (err) mlx5_core_err(mdev, "mlx5e_devlink_port_register failed, %d\n", err); + mlx5e_health_create_reporters(priv);
return 0; @@@ -5266,6 -5141,7 +5273,6 @@@ static void mlx5e_nic_cleanup(struct ml mlx5e_devlink_port_unregister(priv); mlx5e_tls_cleanup(priv); mlx5e_ipsec_cleanup(priv); - mlx5e_netdev_cleanup(priv->netdev, priv); }
static int mlx5e_init_nic_rx(struct mlx5e_priv *priv) @@@ -5394,7 -5270,6 +5401,7 @@@ static void mlx5e_nic_enable(struct mlx mlx5_lag_add(mdev, netdev);
mlx5e_enable_async_events(priv); + mlx5e_enable_blocking_events(priv); if (mlx5e_monitor_counter_supported(priv)) mlx5e_monitor_counter_init(priv);
@@@ -5432,12 -5307,6 +5439,12 @@@ static void mlx5e_nic_disable(struct ml if (mlx5e_monitor_counter_supported(priv)) mlx5e_monitor_counter_cleanup(priv);
+ mlx5e_disable_blocking_events(priv); + if (priv->en_trap) { + mlx5e_deactivate_trap(priv); + mlx5e_close_trap(priv->en_trap); + priv->en_trap = NULL; + } mlx5e_disable_async_events(priv); mlx5_lag_remove(mdev); mlx5_vxlan_reset_to_default(mdev->vxlan); @@@ -5468,23 -5337,27 +5475,23 @@@ static const struct mlx5e_profile mlx5e };
/* mlx5e generic netdev management API (move to en_common.c) */ - -/* mlx5e_netdev_init/cleanup must be called from profile->init/cleanup callbacks */ -int mlx5e_netdev_init(struct net_device *netdev, - struct mlx5e_priv *priv, - struct mlx5_core_dev *mdev, - const struct mlx5e_profile *profile, - void *ppriv) +int mlx5e_priv_init(struct mlx5e_priv *priv, + struct net_device *netdev, + struct mlx5_core_dev *mdev) { + memset(priv, 0, sizeof(*priv)); + /* priv init */ priv->mdev = mdev; priv->netdev = netdev; - priv->profile = profile; - priv->ppriv = ppriv; priv->msglevel = MLX5E_MSG_LEVEL; - priv->max_nch = netdev->num_rx_queues / max_t(u8, profile->rq_groups, 1); priv->max_opened_tc = 1;
if (!alloc_cpumask_var(&priv->scratchpad.cpumask, GFP_KERNEL)) return -ENOMEM;
mutex_init(&priv->state_lock); + hash_init(priv->htb.qos_tc2node); INIT_WORK(&priv->update_carrier_work, mlx5e_update_carrier_work); INIT_WORK(&priv->set_rx_mode_work, mlx5e_set_rx_mode_work); INIT_WORK(&priv->tx_timeout_work, mlx5e_tx_timeout_work); @@@ -5494,6 -5367,9 +5501,6 @@@ if (!priv->wq) goto err_free_cpumask;
- /* netdev init */ - netif_carrier_off(netdev); - return 0;
err_free_cpumask: @@@ -5502,39 -5378,38 +5509,39 @@@ return -ENOMEM; }
-void mlx5e_netdev_cleanup(struct net_device *netdev, struct mlx5e_priv *priv) +void mlx5e_priv_cleanup(struct mlx5e_priv *priv) { + int i; + destroy_workqueue(priv->wq); free_cpumask_var(priv->scratchpad.cpumask); + + for (i = 0; i < priv->htb.max_qos_sqs; i++) + kfree(priv->htb.qos_sq_stats[i]); + kvfree(priv->htb.qos_sq_stats); }
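mlx5e_priv_cleanup() above frees each per-queue stats struct and then the pointer array itself. A minimal userspace analogue of that two-level free, with calloc()/free() standing in for the kernel allocators:

#include <stdlib.h>

struct sq_stats { unsigned long packets, bytes; };

int main(void)
{
	int i, max_qos_sqs = 4;
	struct sq_stats **stats = calloc(max_qos_sqs, sizeof(*stats));

	if (!stats)
		return 1;
	for (i = 0; i < max_qos_sqs; i++)
		stats[i] = calloc(1, sizeof(**stats)); /* entries may stay NULL */

	/* teardown mirrors mlx5e_priv_cleanup(): entries first, then array */
	for (i = 0; i < max_qos_sqs; i++)
		free(stats[i]); /* free(NULL) is a no-op, like kfree(NULL) */
	free(stats);
	return 0;
}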
-struct net_device *mlx5e_create_netdev(struct mlx5_core_dev *mdev, - const struct mlx5e_profile *profile, - int nch, - void *ppriv) +struct net_device * +mlx5e_create_netdev(struct mlx5_core_dev *mdev, unsigned int txqs, unsigned int rxqs) { struct net_device *netdev; - unsigned int ptp_txqs = 0; int err;
- if (MLX5_CAP_GEN(mdev, ts_cqe_to_dest_cqn)) - ptp_txqs = profile->max_tc; - - netdev = alloc_etherdev_mqs(sizeof(struct mlx5e_priv), - nch * profile->max_tc + ptp_txqs, - nch * profile->rq_groups); + netdev = alloc_etherdev_mqs(sizeof(struct mlx5e_priv), txqs, rxqs); if (!netdev) { mlx5_core_err(mdev, "alloc_etherdev_mqs() failed\n"); return NULL; }
- err = profile->init(mdev, netdev, profile, ppriv); + err = mlx5e_priv_init(netdev_priv(netdev), netdev, mdev); if (err) { - mlx5_core_err(mdev, "failed to init mlx5e profile %d\n", err); + mlx5_core_err(mdev, "mlx5e_priv_init failed, err=%d\n", err); goto err_free_netdev; }
+ netif_carrier_off(netdev); + dev_net_set(netdev, mlx5_core_net(mdev)); + return netdev;
err_free_netdev: @@@ -5543,23 -5418,14 +5550,23 @@@ return NULL; }
+static void mlx5e_update_features(struct net_device *netdev) +{ + if (netdev->reg_state != NETREG_REGISTERED) + return; /* features will be updated on netdev registration */ + + rtnl_lock(); + netdev_update_features(netdev); + rtnl_unlock(); +} + int mlx5e_attach_netdev(struct mlx5e_priv *priv) { const bool take_rtnl = priv->netdev->reg_state == NETREG_REGISTERED; - const struct mlx5e_profile *profile; + const struct mlx5e_profile *profile = priv->profile; int max_nch; int err;
- profile = priv->profile; clear_bit(MLX5E_STATE_DESTROYING, &priv->state);
/* max number of channels may have changed */ @@@ -5599,8 -5465,6 +5606,8 @@@ if (profile->enable) profile->enable(priv);
+ mlx5e_update_features(priv->netdev); + return 0;
err_cleanup_tx: @@@ -5627,76 -5491,13 +5634,76 @@@ void mlx5e_detach_netdev(struct mlx5e_p cancel_work_sync(&priv->update_stats_work); }
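The next hunk adds mlx5e_netdev_change_profile(), which tears down the current profile and rolls back to it if the new one fails to attach. A toy model of that replace-with-rollback pattern, with all names invented:

#include <stdio.h>

struct profile {
	const char *name;
	int (*init)(void);
	void (*cleanup)(void);
};

static int ok_init(void) { return 0; }
static int bad_init(void) { return -1; }
static void noop_cleanup(void) { }

static struct profile nic = { "nic", ok_init, noop_cleanup };
static struct profile rep = { "rep", bad_init, noop_cleanup };
static struct profile *active = &nic;

static int attach(struct profile *p)
{
	int err = p->init();

	if (!err)
		active = p;
	return err;
}

static int change_profile(struct profile *new_prof)
{
	struct profile *orig = active;

	active->cleanup(); /* old profile must go before the new one attaches */
	if (!attach(new_prof))
		return 0;
	/* new profile failed to come up: roll back to the original */
	if (attach(orig))
		printf("rollback failed too\n");
	return -1;
}

int main(void)
{
	change_profile(&rep);
	printf("active profile: %s\n", active->name); /* still "nic" */
	return 0;
}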
+static int +mlx5e_netdev_attach_profile(struct mlx5e_priv *priv, + const struct mlx5e_profile *new_profile, void *new_ppriv) +{ + struct net_device *netdev = priv->netdev; + struct mlx5_core_dev *mdev = priv->mdev; + int err; + + err = mlx5e_priv_init(priv, netdev, mdev); + if (err) { + mlx5_core_err(mdev, "mlx5e_priv_init failed, err=%d\n", err); + return err; + } + netif_carrier_off(netdev); + priv->profile = new_profile; + priv->ppriv = new_ppriv; + err = new_profile->init(priv->mdev, priv->netdev); + if (err) + return err; + err = mlx5e_attach_netdev(priv); + if (err) + new_profile->cleanup(priv); + return err; +} + +int mlx5e_netdev_change_profile(struct mlx5e_priv *priv, + const struct mlx5e_profile *new_profile, void *new_ppriv) +{ + unsigned int new_max_nch = mlx5e_calc_max_nch(priv, new_profile); + const struct mlx5e_profile *orig_profile = priv->profile; + void *orig_ppriv = priv->ppriv; + int err, rollback_err; + + /* sanity */ + if (new_max_nch != priv->max_nch) { + netdev_warn(priv->netdev, + "%s: Replacing profile with different max channels\n", + __func__); + return -EINVAL; + } + + /* cleanup old profile */ + mlx5e_detach_netdev(priv); + priv->profile->cleanup(priv); + mlx5e_priv_cleanup(priv); + + err = mlx5e_netdev_attach_profile(priv, new_profile, new_ppriv); + if (err) { /* roll back to original profile */ + netdev_warn(priv->netdev, "%s: new profile init failed, %d\n", + __func__, err); + goto rollback; + } + + return 0; + +rollback: + rollback_err = mlx5e_netdev_attach_profile(priv, orig_profile, orig_ppriv); + if (rollback_err) { + netdev_err(priv->netdev, + "%s: failed to rollback to orig profile, %d\n", + __func__, rollback_err); + } + return err; +} + void mlx5e_destroy_netdev(struct mlx5e_priv *priv) { - const struct mlx5e_profile *profile = priv->profile; struct net_device *netdev = priv->netdev;
- if (profile->cleanup) - profile->cleanup(priv); + mlx5e_priv_cleanup(priv); free_netdev(netdev); }
@@@ -5742,48 -5543,28 +5749,48 @@@ static int mlx5e_probe(struct auxiliary const struct auxiliary_device_id *id) { struct mlx5_adev *edev = container_of(adev, struct mlx5_adev, adev); + const struct mlx5e_profile *profile = &mlx5e_nic_profile; struct mlx5_core_dev *mdev = edev->mdev; struct net_device *netdev; pm_message_t state = {}; - void *priv; + unsigned int txqs, rxqs, ptp_txqs = 0; + struct mlx5e_priv *priv; + int qos_sqs = 0; int err; int nch;
+ if (MLX5_CAP_GEN(mdev, ts_cqe_to_dest_cqn)) + ptp_txqs = profile->max_tc; + + if (mlx5_qos_is_supported(mdev)) + qos_sqs = mlx5e_qos_max_leaf_nodes(mdev); + nch = mlx5e_get_max_num_channels(mdev); - netdev = mlx5e_create_netdev(mdev, &mlx5e_nic_profile, nch, NULL); + txqs = nch * profile->max_tc + ptp_txqs + qos_sqs; + rxqs = nch * profile->rq_groups; + netdev = mlx5e_create_netdev(mdev, txqs, rxqs); if (!netdev) { mlx5_core_err(mdev, "mlx5e_create_netdev failed\n"); return -ENOMEM; }
- dev_net_set(netdev, mlx5_core_net(mdev)); + mlx5e_build_nic_netdev(netdev); + priv = netdev_priv(netdev); dev_set_drvdata(&adev->dev, priv);
+ priv->profile = profile; + priv->ppriv = NULL; + err = profile->init(mdev, netdev); + if (err) { + mlx5_core_err(mdev, "mlx5e_nic_profile init failed, %d\n", err); + goto err_destroy_netdev; + } + err = mlx5e_resume(adev); if (err) { mlx5_core_err(mdev, "mlx5e_resume failed, %d\n", err); - goto err_destroy_netdev; + goto err_profile_cleanup; }
err = register_netdev(netdev); @@@ -5799,8 -5580,6 +5806,8 @@@
err_resume: mlx5e_suspend(adev, state); +err_profile_cleanup: + profile->cleanup(priv); err_destroy_netdev: mlx5e_destroy_netdev(priv); return err; @@@ -5814,7 -5593,6 +5821,7 @@@ static void mlx5e_remove(struct auxilia mlx5e_dcbnl_delete_app(priv); unregister_netdev(priv->netdev); mlx5e_suspend(adev, state); + priv->profile->cleanup(priv); mlx5e_destroy_netdev(priv); }
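mlx5e_probe() above now unwinds through ordered labels, so each failure releases exactly what was set up before the failing step. A compilable sketch of the same goto-ladder shape; the steps are placeholders, not the driver's calls:

#include <stdio.h>

static int step(const char *what, int fail)
{
	printf("%s%s\n", what, fail ? " -> failed" : "");
	return fail ? -1 : 0;
}

static int probe(void)
{
	int err;

	err = step("create netdev", 0);
	if (err)
		return err;
	err = step("profile init", 0);
	if (err)
		goto err_destroy_netdev;
	err = step("resume / attach", 1); /* simulate a failure here */
	if (err)
		goto err_profile_cleanup;
	err = step("register netdev", 0);
	if (err)
		goto err_suspend;
	return 0;

err_suspend:
	step("suspend / detach", 0);
err_profile_cleanup:
	step("profile cleanup", 0);
err_destroy_netdev:
	step("destroy netdev", 0);
	return err;
}

int main(void)
{
	return probe() ? 1 : 0;
}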
diff --combined drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index fac96ea819a1,4864deed9dc9..1b6ad94ebb10 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@@ -47,11 -47,11 +47,11 @@@ #include "fpga/ipsec.h" #include "en_accel/ipsec_rxtx.h" #include "en_accel/tls_rxtx.h" -#include "lib/clock.h" #include "en/xdp.h" #include "en/xsk/rx.h" #include "en/health.h" #include "en/params.h" +#include "devlink.h"
static struct sk_buff * mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, @@@ -212,6 -212,11 +212,6 @@@ static inline u32 mlx5e_decompress_cqes return mlx5e_decompress_cqes_cont(rq, wq, 1, budget_rem) - 1; }
-static inline bool mlx5e_page_is_reserved(struct page *page) -{ - return page_is_pfmemalloc(page) || page_to_nid(page) != numa_mem_id(); -} - static inline bool mlx5e_rx_cache_put(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info) { @@@ -224,7 -229,7 +224,7 @@@ return false; }
- if (unlikely(mlx5e_page_is_reserved(dma_info->page))) { + if (!dev_page_is_reusable(dma_info->page)) { stats->cache_waive++; return false; } @@@ -1061,8 -1066,9 +1061,8 @@@ static inline void mlx5e_build_rx_skb(s }
if (unlikely(mlx5e_rx_hw_stamp(rq->tstamp))) - skb_hwtstamps(skb)->hwtstamp = - mlx5_timecounter_cyc2time(rq->clock, get_cqe_ts(cqe)); - + skb_hwtstamps(skb)->hwtstamp = mlx5e_cqe_ts_to_ns(rq->ptp_cyc2time, + rq->clock, get_cqe_ts(cqe)); skb_record_rx_queue(skb, rq->ix);
if (likely(netdev->features & NETIF_F_RXHASH)) @@@ -1120,8 -1126,12 +1120,8 @@@ struct sk_buff *mlx5e_build_linear_skb( static void mlx5e_fill_xdp_buff(struct mlx5e_rq *rq, void *va, u16 headroom, u32 len, struct xdp_buff *xdp) { - xdp->data_hard_start = va; - xdp->data = va + headroom; - xdp_set_data_meta_invalid(xdp); - xdp->data_end = xdp->data + len; - xdp->rxq = &rq->xdp_rxq; - xdp->frame_sz = rq->buff.frame0_sz; + xdp_init_buff(xdp, rq->buff.frame0_sz, &rq->xdp_rxq); + xdp_prepare_buff(xdp, va, headroom, len, false); }
static struct sk_buff * @@@ -1665,8 -1675,9 +1665,8 @@@ static inline void mlx5i_complete_rx_cq }
if (unlikely(mlx5e_rx_hw_stamp(tstamp))) - skb_hwtstamps(skb)->hwtstamp = - mlx5_timecounter_cyc2time(rq->clock, get_cqe_ts(cqe)); - + skb_hwtstamps(skb)->hwtstamp = mlx5e_cqe_ts_to_ns(rq->ptp_cyc2time, + rq->clock, get_cqe_ts(cqe)); skb_record_rx_queue(skb, rq->ix);
if (likely(netdev->features & NETIF_F_RXHASH)) @@@ -1783,10 -1794,12 +1783,10 @@@ int mlx5e_rq_set_handlers(struct mlx5e_ rq->dealloc_wqe = mlx5e_dealloc_rx_mpwqe;
rq->handle_rx_cqe = priv->profile->rx_handlers->handle_rx_cqe_mpwqe; - if (MLX5_IPSEC_DEV(mdev)) { - netdev_err(netdev, "MPWQE RQ with IPSec offload not supported\n"); -#ifdef CONFIG_MLX5_EN_IPSEC + if (mlx5_fpga_is_ipsec_device(mdev)) { + netdev_err(netdev, "MPWQE RQ with Innova IPSec offload not supported\n"); return -EINVAL; } -#endif if (!rq->handle_rx_cqe) { netdev_err(netdev, "RX handler of MPWQE RQ is not set\n"); return -EINVAL; @@@ -1816,48 -1829,3 +1816,48 @@@
return 0; } + +static void mlx5e_trap_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) +{ + struct mlx5e_priv *priv = netdev_priv(rq->netdev); + struct mlx5_wq_cyc *wq = &rq->wqe.wq; + struct mlx5e_wqe_frag_info *wi; + struct sk_buff *skb; + u32 cqe_bcnt; + u16 trap_id; + u16 ci; + + trap_id = get_cqe_flow_tag(cqe); + ci = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter)); + wi = get_frag(rq, ci); + cqe_bcnt = be32_to_cpu(cqe->byte_cnt); + + if (unlikely(MLX5E_RX_ERR_CQE(cqe))) { + rq->stats->wqe_err++; + goto free_wqe; + } + + skb = mlx5e_skb_from_cqe_nonlinear(rq, cqe, wi, cqe_bcnt); + if (!skb) + goto free_wqe; + + mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb); + skb_push(skb, ETH_HLEN); + + mlx5_devlink_trap_report(rq->mdev, trap_id, skb, &priv->dl_port); + dev_kfree_skb_any(skb); + +free_wqe: + mlx5e_free_rx_wqe(rq, wi, false); + mlx5_wq_cyc_pop(wq); +} + +void mlx5e_rq_set_trap_handlers(struct mlx5e_rq *rq, struct mlx5e_params *params) +{ + rq->wqe.skb_from_cqe = mlx5e_rx_is_linear_skb(params, NULL) ? + mlx5e_skb_from_cqe_linear : + mlx5e_skb_from_cqe_nonlinear; + rq->post_wqes = mlx5e_post_rx_wqes; + rq->dealloc_wqe = mlx5e_dealloc_rx_wqe; + rq->handle_rx_cqe = mlx5e_trap_handle_rx_cqe; +} diff --combined drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 9f126054d371,717fbaa6ce73..0da69b98f38f --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@@ -63,8 -63,6 +63,8 @@@ #include "en/mapping.h" #include "en/tc_ct.h" #include "en/mod_hdr.h" +#include "en/tc_priv.h" +#include "en/tc_tun_encap.h" #include "lib/devcom.h" #include "lib/geneve.h" #include "lib/fs_chains.h" @@@ -73,6 -71,90 +73,6 @@@
#define nic_chains(priv) ((priv)->fs.tc.chains) #define MLX5_MH_ACT_SZ MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto) -#define MLX5E_TC_FLOW_BASE (MLX5E_TC_FLAG_LAST_EXPORTED_BIT + 1) - -enum { - MLX5E_TC_FLOW_FLAG_INGRESS = MLX5E_TC_FLAG_INGRESS_BIT, - MLX5E_TC_FLOW_FLAG_EGRESS = MLX5E_TC_FLAG_EGRESS_BIT, - MLX5E_TC_FLOW_FLAG_ESWITCH = MLX5E_TC_FLAG_ESW_OFFLOAD_BIT, - MLX5E_TC_FLOW_FLAG_FT = MLX5E_TC_FLAG_FT_OFFLOAD_BIT, - MLX5E_TC_FLOW_FLAG_NIC = MLX5E_TC_FLAG_NIC_OFFLOAD_BIT, - MLX5E_TC_FLOW_FLAG_OFFLOADED = MLX5E_TC_FLOW_BASE, - MLX5E_TC_FLOW_FLAG_HAIRPIN = MLX5E_TC_FLOW_BASE + 1, - MLX5E_TC_FLOW_FLAG_HAIRPIN_RSS = MLX5E_TC_FLOW_BASE + 2, - MLX5E_TC_FLOW_FLAG_SLOW = MLX5E_TC_FLOW_BASE + 3, - MLX5E_TC_FLOW_FLAG_DUP = MLX5E_TC_FLOW_BASE + 4, - MLX5E_TC_FLOW_FLAG_NOT_READY = MLX5E_TC_FLOW_BASE + 5, - MLX5E_TC_FLOW_FLAG_DELETED = MLX5E_TC_FLOW_BASE + 6, - MLX5E_TC_FLOW_FLAG_CT = MLX5E_TC_FLOW_BASE + 7, - MLX5E_TC_FLOW_FLAG_L3_TO_L2_DECAP = MLX5E_TC_FLOW_BASE + 8, -}; - -#define MLX5E_TC_MAX_SPLITS 1 - -/* Helper struct for accessing a struct containing list_head array. - * Containing struct - * |- Helper array - * [0] Helper item 0 - * |- list_head item 0 - * |- index (0) - * [1] Helper item 1 - * |- list_head item 1 - * |- index (1) - * To access the containing struct from one of the list_head items: - * 1. Get the helper item from the list_head item using - * helper item = - * container_of(list_head item, helper struct type, list_head field) - * 2. Get the contining struct from the helper item and its index in the array: - * containing struct = - * container_of(helper item, containing struct type, helper field[index]) - */ -struct encap_flow_item { - struct mlx5e_encap_entry *e; /* attached encap instance */ - struct list_head list; - int index; -}; - -struct mlx5e_tc_flow { - struct rhash_head node; - struct mlx5e_priv *priv; - u64 cookie; - unsigned long flags; - struct mlx5_flow_handle *rule[MLX5E_TC_MAX_SPLITS + 1]; - - /* flows sharing the same reformat object - currently mpls decap */ - struct list_head l3_to_l2_reformat; - struct mlx5e_decap_entry *decap_reformat; - - /* Flow can be associated with multiple encap IDs. - * The number of encaps is bounded by the number of supported - * destinations. - */ - struct encap_flow_item encaps[MLX5_MAX_FLOW_FWD_VPORTS]; - struct mlx5e_tc_flow *peer_flow; - struct mlx5e_mod_hdr_handle *mh; /* attached mod header instance */ - struct mlx5e_hairpin_entry *hpe; /* attached hairpin instance */ - struct list_head hairpin; /* flows sharing the same hairpin */ - struct list_head peer; /* flows with peer flow */ - struct list_head unready; /* flows not ready to be offloaded (e.g due to missing route) */ - struct net_device *orig_dev; /* netdev adding flow first */ - int tmp_efi_index; - struct list_head tmp_list; /* temporary flow list used by neigh update */ - refcount_t refcnt; - struct rcu_head rcu_head; - struct completion init_done; - int tunnel_id; /* the mapped tunnel id of this flow */ - struct mlx5_flow_attr *attr; -}; - -struct mlx5e_tc_flow_parse_attr { - const struct ip_tunnel_info *tun_info[MLX5_MAX_FLOW_FWD_VPORTS]; - struct net_device *filter_dev; - struct mlx5_flow_spec spec; - struct mlx5e_tc_mod_hdr_acts mod_hdr_acts; - int mirred_ifindex[MLX5_MAX_FLOW_FWD_VPORTS]; - struct ethhdr eth; -};
#define MLX5E_TC_TABLE_NUM_GROUPS 4 #define MLX5E_TC_TABLE_MAX_GROUP_SIZE BIT(18) @@@ -83,15 -165,10 +83,15 @@@ struct mlx5e_tc_attr_to_reg_mapping mlx .moffset = 0, .mlen = 2, }, + [VPORT_TO_REG] = { + .mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_C_0, + .moffset = 2, + .mlen = 2, + }, [TUNNEL_TO_REG] = { .mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_C_1, .moffset = 1, - .mlen = 3, + .mlen = ((ESW_TUN_OPTS_BITS + ESW_TUN_ID_BITS) / 8), .soffset = MLX5_BYTE_OFF(fte_match_param, misc_parameters_2.metadata_reg_c_1), }, @@@ -113,14 -190,6 +113,14 @@@ [NIC_ZONE_RESTORE_TO_REG] = nic_zone_restore_to_reg_ct, };
+/* To avoid a false lock dependency warning, set the tc_ht lock
+ * class different than the lock class of the ht being used: when
+ * deleting the last flow from a group and then deleting the group,
+ * we get into del_sw_flow_group(), which calls rhashtable_destroy()
+ * on fg->ftes_hash; that takes ht->mutex, but a different ht->mutex
+ * than the one here.
+ */
+static struct lock_class_key tc_ht_lock_key;
+
 static void mlx5e_put_flow_tunnel_id(struct mlx5e_tc_flow *flow);
void @@@ -170,11 -239,11 +170,11 @@@ mlx5e_tc_match_to_reg_get_match(struct }
int -mlx5e_tc_match_to_reg_set(struct mlx5_core_dev *mdev, - struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts, - enum mlx5_flow_namespace_type ns, - enum mlx5e_tc_attr_to_reg type, - u32 data) +mlx5e_tc_match_to_reg_set_and_get_id(struct mlx5_core_dev *mdev, + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts, + enum mlx5_flow_namespace_type ns, + enum mlx5e_tc_attr_to_reg type, + u32 data) { int moffset = mlx5e_tc_attr_to_reg_mappings[type].moffset; int mfield = mlx5e_tc_attr_to_reg_mappings[type].mfield; @@@ -198,10 -267,9 +198,10 @@@ MLX5_SET(set_action_in, modact, offset, moffset * 8); MLX5_SET(set_action_in, modact, length, mlen * 8); MLX5_SET(set_action_in, modact, data, data); + err = mod_hdr_acts->num_actions; mod_hdr_acts->num_actions++;
- return 0; + return err; }
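mlx5e_tc_match_to_reg_set_and_get_id() above returns the index of the modify-header action it appended, so a caller can later rewrite that action in place (as mlx5e_tc_match_to_reg_mod_hdr_change(), added further below, does). A toy model of that append-then-rewrite contract, with invented types:

#include <stdio.h>

#define MAX_ACTIONS 8

struct mod_hdr_acts {
	unsigned int actions[MAX_ACTIONS];
	int num_actions;
};

/* append an action and return its id (index) */
static int set_and_get_id(struct mod_hdr_acts *acts, unsigned int data)
{
	if (acts->num_actions == MAX_ACTIONS)
		return -1;
	acts->actions[acts->num_actions] = data;
	return acts->num_actions++;
}

/* rewrite an existing action in place instead of appending a new one */
static void change(struct mod_hdr_acts *acts, int act_id, unsigned int data)
{
	acts->actions[act_id] = data;
}

int main(void)
{
	struct mod_hdr_acts acts = { { 0 }, 0 };
	int id = set_and_get_id(&acts, 0xaaaa);

	change(&acts, id, 0xbbbb);
	printf("action %d = %#x\n", id, acts.actions[id]);
	return 0;
}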
static struct mlx5_tc_ct_priv * @@@ -250,41 -318,6 +250,41 @@@ mlx5_tc_rule_delete(struct mlx5e_priv * mlx5e_del_offloaded_nic_rule(priv, rule, attr); }
+int +mlx5e_tc_match_to_reg_set(struct mlx5_core_dev *mdev, + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts, + enum mlx5_flow_namespace_type ns, + enum mlx5e_tc_attr_to_reg type, + u32 data) +{ + int ret = mlx5e_tc_match_to_reg_set_and_get_id(mdev, mod_hdr_acts, ns, type, data); + + return ret < 0 ? ret : 0; +} + +void mlx5e_tc_match_to_reg_mod_hdr_change(struct mlx5_core_dev *mdev, + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts, + enum mlx5e_tc_attr_to_reg type, + int act_id, u32 data) +{ + int moffset = mlx5e_tc_attr_to_reg_mappings[type].moffset; + int mfield = mlx5e_tc_attr_to_reg_mappings[type].mfield; + int mlen = mlx5e_tc_attr_to_reg_mappings[type].mlen; + char *modact; + + modact = mod_hdr_acts->actions + (act_id * MLX5_MH_ACT_SZ); + + /* Firmware has 5bit length field and 0 means 32bits */ + if (mlen == 4) + mlen = 0; + + MLX5_SET(set_action_in, modact, action_type, MLX5_ACTION_TYPE_SET); + MLX5_SET(set_action_in, modact, field, mfield); + MLX5_SET(set_action_in, modact, offset, moffset * 8); + MLX5_SET(set_action_in, modact, length, mlen * 8); + MLX5_SET(set_action_in, modact, data, data); +} + struct mlx5e_hairpin { struct mlx5_hairpin *pair;
@@@ -322,14 -355,15 +322,14 @@@ struct mlx5e_hairpin_entry static void mlx5e_tc_del_flow(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow);
-static struct mlx5e_tc_flow *mlx5e_flow_get(struct mlx5e_tc_flow *flow) +struct mlx5e_tc_flow *mlx5e_flow_get(struct mlx5e_tc_flow *flow) { if (!flow || !refcount_inc_not_zero(&flow->refcnt)) return ERR_PTR(-EINVAL); return flow; }
-static void mlx5e_flow_put(struct mlx5e_priv *priv, - struct mlx5e_tc_flow *flow) +void mlx5e_flow_put(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow) { if (refcount_dec_and_test(&flow->refcnt)) { mlx5e_tc_del_flow(priv, flow); @@@ -337,6 -371,48 +337,6 @@@ } }
-static void __flow_flag_set(struct mlx5e_tc_flow *flow, unsigned long flag) -{ - /* Complete all memory stores before setting bit. */ - smp_mb__before_atomic(); - set_bit(flag, &flow->flags); -} - -#define flow_flag_set(flow, flag) __flow_flag_set(flow, MLX5E_TC_FLOW_FLAG_##flag) - -static bool __flow_flag_test_and_set(struct mlx5e_tc_flow *flow, - unsigned long flag) -{ - /* test_and_set_bit() provides all necessary barriers */ - return test_and_set_bit(flag, &flow->flags); -} - -#define flow_flag_test_and_set(flow, flag) \ - __flow_flag_test_and_set(flow, \ - MLX5E_TC_FLOW_FLAG_##flag) - -static void __flow_flag_clear(struct mlx5e_tc_flow *flow, unsigned long flag) -{ - /* Complete all memory stores before clearing bit. */ - smp_mb__before_atomic(); - clear_bit(flag, &flow->flags); -} - -#define flow_flag_clear(flow, flag) __flow_flag_clear(flow, \ - MLX5E_TC_FLOW_FLAG_##flag) - -static bool __flow_flag_test(struct mlx5e_tc_flow *flow, unsigned long flag) -{ - bool ret = test_bit(flag, &flow->flags); - - /* Read fields of flow structure only after checking flags. */ - smp_mb__after_atomic(); - return ret; -} - -#define flow_flag_test(flow, flag) __flow_flag_test(flow, \ - MLX5E_TC_FLOW_FLAG_##flag) - bool mlx5e_is_eswitch_flow(struct mlx5e_tc_flow *flow) { return flow_flag_test(flow, ESWITCH); @@@ -347,7 -423,7 +347,7 @@@ static bool mlx5e_is_ft_flow(struct mlx return flow_flag_test(flow, FT); }
-static bool mlx5e_is_offloaded_flow(struct mlx5e_tc_flow *flow) +bool mlx5e_is_offloaded_flow(struct mlx5e_tc_flow *flow) { return flow_flag_test(flow, OFFLOADED); } @@@ -1062,7 -1138,23 +1062,7 @@@ static void mlx5e_tc_del_nic_flow(struc kfree(flow->attr); }
-static void mlx5e_detach_encap(struct mlx5e_priv *priv, - struct mlx5e_tc_flow *flow, int out_index); - -static int mlx5e_attach_encap(struct mlx5e_priv *priv, - struct mlx5e_tc_flow *flow, - struct net_device *mirred_dev, - int out_index, - struct netlink_ext_ack *extack, - struct net_device **encap_dev, - bool *encap_valid); -static int mlx5e_attach_decap(struct mlx5e_priv *priv, - struct mlx5e_tc_flow *flow, - struct netlink_ext_ack *extack); -static void mlx5e_detach_decap(struct mlx5e_priv *priv, - struct mlx5e_tc_flow *flow); - -static struct mlx5_flow_handle * +struct mlx5_flow_handle * mlx5e_tc_offload_fdb_rules(struct mlx5_eswitch *esw, struct mlx5e_tc_flow *flow, struct mlx5_flow_spec *spec, @@@ -1097,9 -1189,10 +1097,9 @@@ return rule; }
-static void -mlx5e_tc_unoffload_fdb_rules(struct mlx5_eswitch *esw, - struct mlx5e_tc_flow *flow, - struct mlx5_flow_attr *attr) +void mlx5e_tc_unoffload_fdb_rules(struct mlx5_eswitch *esw, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_attr *attr) { flow_flag_clear(flow, OFFLOADED);
@@@ -1118,7 -1211,7 +1118,7 @@@ offload_rule_0 mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], attr); }
-static struct mlx5_flow_handle * +struct mlx5_flow_handle * mlx5e_tc_offload_to_slow_path(struct mlx5_eswitch *esw, struct mlx5e_tc_flow *flow, struct mlx5_flow_spec *spec) @@@ -1144,8 -1237,9 +1144,8 @@@ return rule; }
-static void -mlx5e_tc_unoffload_from_slow_path(struct mlx5_eswitch *esw, - struct mlx5e_tc_flow *flow) +void mlx5e_tc_unoffload_from_slow_path(struct mlx5_eswitch *esw, + struct mlx5e_tc_flow *flow) { struct mlx5_flow_attr *slow_attr;
@@@ -1213,63 -1307,6 +1213,63 @@@ static void remove_unready_flow(struct mutex_unlock(&uplink_priv->unready_flows_lock); }
+static bool same_hw_devs(struct mlx5e_priv *priv, struct mlx5e_priv *peer_priv); + +bool mlx5e_tc_is_vf_tunnel(struct net_device *out_dev, struct net_device *route_dev) +{ + struct mlx5_core_dev *out_mdev, *route_mdev; + struct mlx5e_priv *out_priv, *route_priv; + + out_priv = netdev_priv(out_dev); + out_mdev = out_priv->mdev; + route_priv = netdev_priv(route_dev); + route_mdev = route_priv->mdev; + + if (out_mdev->coredev_type != MLX5_COREDEV_PF || + route_mdev->coredev_type != MLX5_COREDEV_VF) + return false; + + return same_hw_devs(out_priv, route_priv); +} + +int mlx5e_tc_query_route_vport(struct net_device *out_dev, struct net_device *route_dev, u16 *vport) +{ + struct mlx5e_priv *out_priv, *route_priv; + struct mlx5_core_dev *route_mdev; + struct mlx5_eswitch *esw; + u16 vhca_id; + int err; + + out_priv = netdev_priv(out_dev); + esw = out_priv->mdev->priv.eswitch; + route_priv = netdev_priv(route_dev); + route_mdev = route_priv->mdev; + + vhca_id = MLX5_CAP_GEN(route_mdev, vhca_id); + err = mlx5_eswitch_vhca_id_to_vport(esw, vhca_id, vport); + return err; +} + +int mlx5e_tc_add_flow_mod_hdr(struct mlx5e_priv *priv, + struct mlx5e_tc_flow_parse_attr *parse_attr, + struct mlx5e_tc_flow *flow) +{ + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts = &parse_attr->mod_hdr_acts; + struct mlx5_modify_hdr *mod_hdr; + + mod_hdr = mlx5_modify_header_alloc(priv->mdev, + get_flow_name_space(flow), + mod_hdr_acts->num_actions, + mod_hdr_acts->actions); + if (IS_ERR(mod_hdr)) + return PTR_ERR(mod_hdr); + + WARN_ON(flow->attr->modify_hdr); + flow->attr->modify_hdr = mod_hdr; + + return 0; +} + static int mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow, @@@ -1279,15 -1316,21 +1279,15 @@@ struct net_device *out_dev, *encap_dev = NULL; struct mlx5e_tc_flow_parse_attr *parse_attr; struct mlx5_flow_attr *attr = flow->attr; + bool vf_tun = false, encap_valid = true; struct mlx5_esw_flow_attr *esw_attr; struct mlx5_fc *counter = NULL; struct mlx5e_rep_priv *rpriv; struct mlx5e_priv *out_priv; - bool encap_valid = true; u32 max_prio, max_chain; int err = 0; int out_index;
- if (!mlx5_chains_prios_supported(esw_chains(esw)) && attr->prio != 1) { - NL_SET_ERR_MSG_MOD(extack, - "E-switch priorities unsupported, upgrade FW"); - return -EOPNOTSUPP; - } - /* We check chain range only for tc flows. * For ft flows, we checked attr->chain was originally 0 and set it to * FDB_FT_CHAIN which is outside tc range. @@@ -1297,28 -1340,20 +1297,28 @@@ if (!mlx5e_is_ft_flow(flow) && attr->chain > max_chain) { NL_SET_ERR_MSG_MOD(extack, "Requested chain is out of supported range"); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto err_out; }
max_prio = mlx5_chains_get_prio_range(esw_chains(esw)); if (attr->prio > max_prio) { NL_SET_ERR_MSG_MOD(extack, "Requested priority is out of supported range"); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto err_out; + } + + if (flow_flag_test(flow, TUN_RX)) { + err = mlx5e_attach_decap_route(priv, flow); + if (err) + goto err_out; }
if (flow_flag_test(flow, L3_TO_L2_DECAP)) { err = mlx5e_attach_decap(priv, flow, extack); if (err) - return err; + goto err_out; }
parse_attr = attr->parse_attr; @@@ -1336,11 -1371,8 +1336,11 @@@ err = mlx5e_attach_encap(priv, flow, out_dev, out_index, extack, &encap_dev, &encap_valid); if (err) - return err; + goto err_out;
+ if (esw_attr->dests[out_index].flags & + MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE) + vf_tun = true; out_priv = netdev_priv(encap_dev); rpriv = out_priv->ppriv; esw_attr->dests[out_index].rep = rpriv->rep; @@@ -1349,27 -1381,20 +1349,27 @@@
err = mlx5_eswitch_add_vlan_action(esw, attr); if (err) - return err; + goto err_out;
if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR && !(attr->ct_attr.ct_action & TCA_CT_ACT_CLEAR)) { - err = mlx5e_attach_mod_hdr(priv, flow, parse_attr); - dealloc_mod_hdr_actions(&parse_attr->mod_hdr_acts); - if (err) - return err; + if (vf_tun) { + err = mlx5e_tc_add_flow_mod_hdr(priv, parse_attr, flow); + if (err) + goto err_out; + } else { + err = mlx5e_attach_mod_hdr(priv, flow, parse_attr); + if (err) + goto err_out; + } }
if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { counter = mlx5_fc_create(esw_attr->counter_dev, true); - if (IS_ERR(counter)) - return PTR_ERR(counter); + if (IS_ERR(counter)) { + err = PTR_ERR(counter); + goto err_out; + }
attr->counter = counter; } @@@ -1383,17 -1408,12 +1383,17 @@@ else flow->rule[0] = mlx5e_tc_offload_fdb_rules(esw, flow, &parse_attr->spec, attr);
- if (IS_ERR(flow->rule[0])) - return PTR_ERR(flow->rule[0]); - else - flow_flag_set(flow, OFFLOADED); + if (IS_ERR(flow->rule[0])) { + err = PTR_ERR(flow->rule[0]); + goto err_out; + } + flow_flag_set(flow, OFFLOADED);
return 0; + +err_out: + flow_flag_set(flow, FAILED); + return err; }
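The hunk above converts mlx5e_tc_add_fdb_flow()'s early returns into a single err_out label that marks the flow FAILED on any failure. A minimal model of that funnel; the flags and plumbing are invented for illustration:

#include <stdio.h>

#define FLOW_OFFLOADED (1u << 0)
#define FLOW_FAILED    (1u << 1)

struct flow { unsigned int flags; };

static int add_flow(struct flow *flow, int fail_at_step)
{
	int err = 0;

	if (fail_at_step == 1) { err = -1; goto err_out; }
	if (fail_at_step == 2) { err = -2; goto err_out; }

	flow->flags |= FLOW_OFFLOADED;
	return 0;

err_out:
	flow->flags |= FLOW_FAILED; /* every failure path marks the flow once */
	return err;
}

int main(void)
{
	struct flow f = { 0 };

	add_flow(&f, 2);
	printf("flags = %#x\n", f.flags); /* FLOW_FAILED */
	return 0;
}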
static bool mlx5_flow_has_geneve_opt(struct mlx5e_tc_flow *flow) @@@ -1414,11 -1434,8 +1414,11 @@@ static void mlx5e_tc_del_fdb_flow(struc { struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; struct mlx5_flow_attr *attr = flow->attr; + struct mlx5_esw_flow_attr *esw_attr; + bool vf_tun = false; int out_index;
+ esw_attr = attr->esw_attr; mlx5e_put_flow_tunnel_id(flow);
if (flow_flag_test(flow, NOT_READY)) @@@ -1436,33 -1453,20 +1436,33 @@@
mlx5_eswitch_del_vlan_action(esw, attr);
- for (out_index = 0; out_index < MLX5_MAX_FLOW_FWD_VPORTS; out_index++) - if (attr->esw_attr->dests[out_index].flags & MLX5_ESW_DEST_ENCAP) { + if (flow->decap_route) + mlx5e_detach_decap_route(priv, flow); + + for (out_index = 0; out_index < MLX5_MAX_FLOW_FWD_VPORTS; out_index++) { + if (esw_attr->dests[out_index].flags & + MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE) + vf_tun = true; + if (esw_attr->dests[out_index].flags & MLX5_ESW_DEST_ENCAP) { mlx5e_detach_encap(priv, flow, out_index); kfree(attr->parse_attr->tun_info[out_index]); } - kvfree(attr->parse_attr); + }
mlx5_tc_ct_match_del(get_ct_priv(priv), &flow->attr->ct_attr);
- if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) - mlx5e_detach_mod_hdr(priv, flow); + if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) { + dealloc_mod_hdr_actions(&attr->parse_attr->mod_hdr_acts); + if (vf_tun && attr->modify_hdr) + mlx5_modify_header_dealloc(priv->mdev, attr->modify_hdr); + else + mlx5e_detach_mod_hdr(priv, flow); + } + kvfree(attr->parse_attr); + kvfree(attr->esw_attr->rx_tun_attr);
if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) - mlx5_fc_destroy(attr->esw_attr->counter_dev, attr->counter); + mlx5_fc_destroy(esw_attr->counter_dev, attr->counter);
if (flow_flag_test(flow, L3_TO_L2_DECAP)) mlx5e_detach_decap(priv, flow); @@@ -1470,13 -1474,141 +1470,13 @@@ kfree(flow->attr); }
-void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv, - struct mlx5e_encap_entry *e, - struct list_head *flow_list) -{ - struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - struct mlx5_esw_flow_attr *esw_attr; - struct mlx5_flow_handle *rule; - struct mlx5_flow_attr *attr; - struct mlx5_flow_spec *spec; - struct mlx5e_tc_flow *flow; - int err; - - e->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev, - e->reformat_type, - e->encap_size, e->encap_header, - MLX5_FLOW_NAMESPACE_FDB); - if (IS_ERR(e->pkt_reformat)) { - mlx5_core_warn(priv->mdev, "Failed to offload cached encapsulation header, %lu\n", - PTR_ERR(e->pkt_reformat)); - return; - } - e->flags |= MLX5_ENCAP_ENTRY_VALID; - mlx5e_rep_queue_neigh_stats_work(priv); - - list_for_each_entry(flow, flow_list, tmp_list) { - bool all_flow_encaps_valid = true; - int i; - - if (!mlx5e_is_offloaded_flow(flow)) - continue; - attr = flow->attr; - esw_attr = attr->esw_attr; - spec = &attr->parse_attr->spec; - - esw_attr->dests[flow->tmp_efi_index].pkt_reformat = e->pkt_reformat; - esw_attr->dests[flow->tmp_efi_index].flags |= MLX5_ESW_DEST_ENCAP_VALID; - /* Flow can be associated with multiple encap entries. - * Before offloading the flow verify that all of them have - * a valid neighbour. - */ - for (i = 0; i < MLX5_MAX_FLOW_FWD_VPORTS; i++) { - if (!(esw_attr->dests[i].flags & MLX5_ESW_DEST_ENCAP)) - continue; - if (!(esw_attr->dests[i].flags & MLX5_ESW_DEST_ENCAP_VALID)) { - all_flow_encaps_valid = false; - break; - } - } - /* Do not offload flows with unresolved neighbors */ - if (!all_flow_encaps_valid) - continue; - /* update from slow path rule to encap rule */ - rule = mlx5e_tc_offload_fdb_rules(esw, flow, spec, attr); - if (IS_ERR(rule)) { - err = PTR_ERR(rule); - mlx5_core_warn(priv->mdev, "Failed to update cached encapsulation flow, %d\n", - err); - continue; - } - - mlx5e_tc_unoffload_from_slow_path(esw, flow); - flow->rule[0] = rule; - /* was unset when slow path rule removed */ - flow_flag_set(flow, OFFLOADED); - } -} - -void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv, - struct mlx5e_encap_entry *e, - struct list_head *flow_list) -{ - struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - struct mlx5_esw_flow_attr *esw_attr; - struct mlx5_flow_handle *rule; - struct mlx5_flow_attr *attr; - struct mlx5_flow_spec *spec; - struct mlx5e_tc_flow *flow; - int err; - - list_for_each_entry(flow, flow_list, tmp_list) { - if (!mlx5e_is_offloaded_flow(flow)) - continue; - attr = flow->attr; - esw_attr = attr->esw_attr; - spec = &attr->parse_attr->spec; - - /* update from encap rule to slow path rule */ - rule = mlx5e_tc_offload_to_slow_path(esw, flow, spec); - /* mark the flow's encap dest as non-valid */ - esw_attr->dests[flow->tmp_efi_index].flags &= ~MLX5_ESW_DEST_ENCAP_VALID; - - if (IS_ERR(rule)) { - err = PTR_ERR(rule); - mlx5_core_warn(priv->mdev, "Failed to update slow path (encap) flow, %d\n", - err); - continue; - } - - mlx5e_tc_unoffload_fdb_rules(esw, flow, attr); - flow->rule[0] = rule; - /* was unset when fast path rule removed */ - flow_flag_set(flow, OFFLOADED); - } - - /* we know that the encap is valid */ - e->flags &= ~MLX5_ENCAP_ENTRY_VALID; - mlx5_packet_reformat_dealloc(priv->mdev, e->pkt_reformat); -} - -static struct mlx5_fc *mlx5e_tc_get_counter(struct mlx5e_tc_flow *flow) +struct mlx5_fc *mlx5e_tc_get_counter(struct mlx5e_tc_flow *flow) { return flow->attr->counter; }
-/* Takes reference to all flows attached to encap and adds the flows to - * flow_list using 'tmp_list' list_head in mlx5e_tc_flow. - */ -void mlx5e_take_all_encap_flows(struct mlx5e_encap_entry *e, struct list_head *flow_list) -{ - struct encap_flow_item *efi; - struct mlx5e_tc_flow *flow; - - list_for_each_entry(efi, &e->flows, list) { - flow = container_of(efi, struct mlx5e_tc_flow, encaps[efi->index]); - if (IS_ERR(mlx5e_flow_get(flow))) - continue; - wait_for_completion(&flow->init_done); - - flow->tmp_efi_index = efi->index; - list_add(&flow->tmp_list, flow_list); - } -} - /* Iterate over tmp_list of flows attached to flow_list head. */ -void mlx5e_put_encap_flow_list(struct mlx5e_priv *priv, struct list_head *flow_list) +void mlx5e_put_flow_list(struct mlx5e_priv *priv, struct list_head *flow_list) { struct mlx5e_tc_flow *flow, *tmp;
@@@ -1484,6 -1616,222 +1484,6 @@@ mlx5e_flow_put(priv, flow); }
-static struct mlx5e_encap_entry * -mlx5e_get_next_valid_encap(struct mlx5e_neigh_hash_entry *nhe, - struct mlx5e_encap_entry *e) -{ - struct mlx5e_encap_entry *next = NULL; - -retry: - rcu_read_lock(); - - /* find encap with non-zero reference counter value */ - for (next = e ? - list_next_or_null_rcu(&nhe->encap_list, - &e->encap_list, - struct mlx5e_encap_entry, - encap_list) : - list_first_or_null_rcu(&nhe->encap_list, - struct mlx5e_encap_entry, - encap_list); - next; - next = list_next_or_null_rcu(&nhe->encap_list, - &next->encap_list, - struct mlx5e_encap_entry, - encap_list)) - if (mlx5e_encap_take(next)) - break; - - rcu_read_unlock(); - - /* release starting encap */ - if (e) - mlx5e_encap_put(netdev_priv(e->out_dev), e); - if (!next) - return next; - - /* wait for encap to be fully initialized */ - wait_for_completion(&next->res_ready); - /* continue searching if encap entry is not in valid state after completion */ - if (!(next->flags & MLX5_ENCAP_ENTRY_VALID)) { - e = next; - goto retry; - } - - return next; -} - -void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe) -{ - struct mlx5e_neigh *m_neigh = &nhe->m_neigh; - struct mlx5e_encap_entry *e = NULL; - struct mlx5e_tc_flow *flow; - struct mlx5_fc *counter; - struct neigh_table *tbl; - bool neigh_used = false; - struct neighbour *n; - u64 lastuse; - - if (m_neigh->family == AF_INET) - tbl = &arp_tbl; -#if IS_ENABLED(CONFIG_IPV6) - else if (m_neigh->family == AF_INET6) - tbl = ipv6_stub->nd_tbl; -#endif - else - return; - - /* mlx5e_get_next_valid_encap() releases previous encap before returning - * next one. - */ - while ((e = mlx5e_get_next_valid_encap(nhe, e)) != NULL) { - struct mlx5e_priv *priv = netdev_priv(e->out_dev); - struct encap_flow_item *efi, *tmp; - struct mlx5_eswitch *esw; - LIST_HEAD(flow_list); - - esw = priv->mdev->priv.eswitch; - mutex_lock(&esw->offloads.encap_tbl_lock); - list_for_each_entry_safe(efi, tmp, &e->flows, list) { - flow = container_of(efi, struct mlx5e_tc_flow, - encaps[efi->index]); - if (IS_ERR(mlx5e_flow_get(flow))) - continue; - list_add(&flow->tmp_list, &flow_list); - - if (mlx5e_is_offloaded_flow(flow)) { - counter = mlx5e_tc_get_counter(flow); - lastuse = mlx5_fc_query_lastuse(counter); - if (time_after((unsigned long)lastuse, nhe->reported_lastuse)) { - neigh_used = true; - break; - } - } - } - mutex_unlock(&esw->offloads.encap_tbl_lock); - - mlx5e_put_encap_flow_list(priv, &flow_list); - if (neigh_used) { - /* release current encap before breaking the loop */ - mlx5e_encap_put(priv, e); - break; - } - } - - trace_mlx5e_tc_update_neigh_used_value(nhe, neigh_used); - - if (neigh_used) { - nhe->reported_lastuse = jiffies; - - /* find the relevant neigh according to the cached device and - * dst ip pair - */ - n = neigh_lookup(tbl, &m_neigh->dst_ip, m_neigh->dev); - if (!n) - return; - - neigh_event_send(n, NULL); - neigh_release(n); - } -} - -static void mlx5e_encap_dealloc(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e) -{ - WARN_ON(!list_empty(&e->flows)); - - if (e->compl_result > 0) { - mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e); - - if (e->flags & MLX5_ENCAP_ENTRY_VALID) - mlx5_packet_reformat_dealloc(priv->mdev, e->pkt_reformat); - } - - kfree(e->tun_info); - kfree(e->encap_header); - kfree_rcu(e, rcu); -} - -static void mlx5e_decap_dealloc(struct mlx5e_priv *priv, - struct mlx5e_decap_entry *d) -{ - WARN_ON(!list_empty(&d->flows)); - - if (!d->compl_result) - mlx5_packet_reformat_dealloc(priv->mdev, d->pkt_reformat); - - kfree_rcu(d, rcu); 
-} - -void mlx5e_encap_put(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e) -{ - struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - - if (!refcount_dec_and_mutex_lock(&e->refcnt, &esw->offloads.encap_tbl_lock)) - return; - hash_del_rcu(&e->encap_hlist); - mutex_unlock(&esw->offloads.encap_tbl_lock); - - mlx5e_encap_dealloc(priv, e); -} - -static void mlx5e_decap_put(struct mlx5e_priv *priv, struct mlx5e_decap_entry *d) -{ - struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - - if (!refcount_dec_and_mutex_lock(&d->refcnt, &esw->offloads.decap_tbl_lock)) - return; - hash_del_rcu(&d->hlist); - mutex_unlock(&esw->offloads.decap_tbl_lock); - - mlx5e_decap_dealloc(priv, d); -} - -static void mlx5e_detach_encap(struct mlx5e_priv *priv, - struct mlx5e_tc_flow *flow, int out_index) -{ - struct mlx5e_encap_entry *e = flow->encaps[out_index].e; - struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - - /* flow wasn't fully initialized */ - if (!e) - return; - - mutex_lock(&esw->offloads.encap_tbl_lock); - list_del(&flow->encaps[out_index].list); - flow->encaps[out_index].e = NULL; - if (!refcount_dec_and_test(&e->refcnt)) { - mutex_unlock(&esw->offloads.encap_tbl_lock); - return; - } - hash_del_rcu(&e->encap_hlist); - mutex_unlock(&esw->offloads.encap_tbl_lock); - - mlx5e_encap_dealloc(priv, e); -} - -static void mlx5e_detach_decap(struct mlx5e_priv *priv, - struct mlx5e_tc_flow *flow) -{ - struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - struct mlx5e_decap_entry *d = flow->decap_reformat; - - if (!d) - return; - - mutex_lock(&esw->offloads.decap_tbl_lock); - list_del(&flow->l3_to_l2_reformat); - flow->decap_reformat = NULL; - - if (!refcount_dec_and_test(&d->refcnt)) { - mutex_unlock(&esw->offloads.decap_tbl_lock); - return; - } - hash_del_rcu(&d->hlist); - mutex_unlock(&esw->offloads.decap_tbl_lock); - - mlx5e_decap_dealloc(priv, d); -} - static void __mlx5e_tc_del_fdb_peer_flow(struct mlx5e_tc_flow *flow) { struct mlx5_eswitch *esw = flow->priv->mdev->priv.eswitch; @@@ -1741,29 -2089,6 +1741,29 @@@ void mlx5e_tc_set_ethertype(struct mlx5 } }
+u8 mlx5e_tc_get_ip_version(struct mlx5_flow_spec *spec, bool outer) +{ + void *headers_v; + u16 ethertype; + u8 ip_version; + + if (outer) + headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers); + else + headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, inner_headers); + + ip_version = MLX5_GET(fte_match_set_lyr_2_4, headers_v, ip_version); + /* Return ip_version converted from ethertype anyway */ + if (!ip_version) { + ethertype = MLX5_GET(fte_match_set_lyr_2_4, headers_v, ethertype); + if (ethertype == ETH_P_IP || ethertype == ETH_P_ARP) + ip_version = 4; + else if (ethertype == ETH_P_IPV6) + ip_version = 6; + } + return ip_version; +} + static int parse_tunnel_attr(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow, struct mlx5_flow_spec *spec, @@@ -1772,7 -2097,6 +1772,7 @@@ u8 *match_level, bool *match_inner) { + struct mlx5e_tc_tunnel *tunnel = mlx5e_get_tc_tun(filter_dev); struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; struct netlink_ext_ack *extack = f->common.extack; bool needs_mapping, sets_mapping; @@@ -1810,31 -2134,6 +1810,31 @@@ */ if (!netif_is_bareudp(filter_dev)) flow->attr->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP; + err = mlx5e_tc_set_attr_rx_tun(flow, spec); + if (err) + return err; + } else if (tunnel && tunnel->tunnel_type == MLX5E_TC_TUNNEL_TYPE_VXLAN) { + struct mlx5_flow_spec *tmp_spec; + + tmp_spec = kvzalloc(sizeof(*tmp_spec), GFP_KERNEL); + if (!tmp_spec) { + NL_SET_ERR_MSG_MOD(extack, "Failed to allocate memory for vxlan tmp spec"); + netdev_warn(priv->netdev, "Failed to allocate memory for vxlan tmp spec"); + return -ENOMEM; + } + memcpy(tmp_spec, spec, sizeof(*tmp_spec)); + + err = mlx5e_tc_tun_parse(filter_dev, priv, tmp_spec, f, match_level); + if (err) { + kvfree(tmp_spec); + NL_SET_ERR_MSG_MOD(extack, "Failed to parse tunnel attributes"); + netdev_warn(priv->netdev, "Failed to parse tunnel attributes"); + return err; + } + err = mlx5e_tc_set_attr_rx_tun(flow, tmp_spec); + kvfree(tmp_spec); + if (err) + return err; }
if (!needs_mapping && !sets_mapping) @@@ -3283,6 -3582,35 +3283,6 @@@ static int parse_tc_nic_actions(struct return 0; }
-struct encap_key { - const struct ip_tunnel_key *ip_tun_key; - struct mlx5e_tc_tunnel *tc_tunnel; -}; - -static inline int cmp_encap_info(struct encap_key *a, - struct encap_key *b) -{ - return memcmp(a->ip_tun_key, b->ip_tun_key, sizeof(*a->ip_tun_key)) || - a->tc_tunnel->tunnel_type != b->tc_tunnel->tunnel_type; -} - -static inline int cmp_decap_info(struct mlx5e_decap_key *a, - struct mlx5e_decap_key *b) -{ - return memcmp(&a->key, &b->key, sizeof(b->key)); -} - -static inline int hash_encap_info(struct encap_key *key) -{ - return jhash(key->ip_tun_key, sizeof(*key->ip_tun_key), - key->tc_tunnel->tunnel_type); -} - -static inline int hash_decap_info(struct mlx5e_decap_key *key) -{ - return jhash(&key->key, sizeof(key->key), 0); -} - static bool is_merged_eswitch_vfs(struct mlx5e_priv *priv, struct net_device *peer_netdev) { @@@ -3296,6 -3624,277 +3296,6 @@@ same_hw_devs(priv, peer_priv)); }
-bool mlx5e_encap_take(struct mlx5e_encap_entry *e) -{ - return refcount_inc_not_zero(&e->refcnt); -} - -static bool mlx5e_decap_take(struct mlx5e_decap_entry *e) -{ - return refcount_inc_not_zero(&e->refcnt); -} - -static struct mlx5e_encap_entry * -mlx5e_encap_get(struct mlx5e_priv *priv, struct encap_key *key, - uintptr_t hash_key) -{ - struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - struct mlx5e_encap_entry *e; - struct encap_key e_key; - - hash_for_each_possible_rcu(esw->offloads.encap_tbl, e, - encap_hlist, hash_key) { - e_key.ip_tun_key = &e->tun_info->key; - e_key.tc_tunnel = e->tunnel; - if (!cmp_encap_info(&e_key, key) && - mlx5e_encap_take(e)) - return e; - } - - return NULL; -} - -static struct mlx5e_decap_entry * -mlx5e_decap_get(struct mlx5e_priv *priv, struct mlx5e_decap_key *key, - uintptr_t hash_key) -{ - struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - struct mlx5e_decap_key r_key; - struct mlx5e_decap_entry *e; - - hash_for_each_possible_rcu(esw->offloads.decap_tbl, e, - hlist, hash_key) { - r_key = e->key; - if (!cmp_decap_info(&r_key, key) && - mlx5e_decap_take(e)) - return e; - } - return NULL; -} - -static struct ip_tunnel_info *dup_tun_info(const struct ip_tunnel_info *tun_info) -{ - size_t tun_size = sizeof(*tun_info) + tun_info->options_len; - - return kmemdup(tun_info, tun_size, GFP_KERNEL); -} - -static bool is_duplicated_encap_entry(struct mlx5e_priv *priv, - struct mlx5e_tc_flow *flow, - int out_index, - struct mlx5e_encap_entry *e, - struct netlink_ext_ack *extack) -{ - int i; - - for (i = 0; i < out_index; i++) { - if (flow->encaps[i].e != e) - continue; - NL_SET_ERR_MSG_MOD(extack, "can't duplicate encap action"); - netdev_err(priv->netdev, "can't duplicate encap action\n"); - return true; - } - - return false; -} - -static int mlx5e_attach_encap(struct mlx5e_priv *priv, - struct mlx5e_tc_flow *flow, - struct net_device *mirred_dev, - int out_index, - struct netlink_ext_ack *extack, - struct net_device **encap_dev, - bool *encap_valid) -{ - struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - struct mlx5e_tc_flow_parse_attr *parse_attr; - struct mlx5_flow_attr *attr = flow->attr; - const struct ip_tunnel_info *tun_info; - struct encap_key key; - struct mlx5e_encap_entry *e; - unsigned short family; - uintptr_t hash_key; - int err = 0; - - parse_attr = attr->parse_attr; - tun_info = parse_attr->tun_info[out_index]; - family = ip_tunnel_info_af(tun_info); - key.ip_tun_key = &tun_info->key; - key.tc_tunnel = mlx5e_get_tc_tun(mirred_dev); - if (!key.tc_tunnel) { - NL_SET_ERR_MSG_MOD(extack, "Unsupported tunnel"); - return -EOPNOTSUPP; - } - - hash_key = hash_encap_info(&key); - - mutex_lock(&esw->offloads.encap_tbl_lock); - e = mlx5e_encap_get(priv, &key, hash_key); - - /* must verify if encap is valid or not */ - if (e) { - /* Check that entry was not already attached to this flow */ - if (is_duplicated_encap_entry(priv, flow, out_index, e, extack)) { - err = -EOPNOTSUPP; - goto out_err; - } - - mutex_unlock(&esw->offloads.encap_tbl_lock); - wait_for_completion(&e->res_ready); - - /* Protect against concurrent neigh update. 
*/ - mutex_lock(&esw->offloads.encap_tbl_lock); - if (e->compl_result < 0) { - err = -EREMOTEIO; - goto out_err; - } - goto attach_flow; - } - - e = kzalloc(sizeof(*e), GFP_KERNEL); - if (!e) { - err = -ENOMEM; - goto out_err; - } - - refcount_set(&e->refcnt, 1); - init_completion(&e->res_ready); - - tun_info = dup_tun_info(tun_info); - if (!tun_info) { - err = -ENOMEM; - goto out_err_init; - } - e->tun_info = tun_info; - err = mlx5e_tc_tun_init_encap_attr(mirred_dev, priv, e, extack); - if (err) - goto out_err_init; - - INIT_LIST_HEAD(&e->flows); - hash_add_rcu(esw->offloads.encap_tbl, &e->encap_hlist, hash_key); - mutex_unlock(&esw->offloads.encap_tbl_lock); - - if (family == AF_INET) - err = mlx5e_tc_tun_create_header_ipv4(priv, mirred_dev, e); - else if (family == AF_INET6) - err = mlx5e_tc_tun_create_header_ipv6(priv, mirred_dev, e); - - /* Protect against concurrent neigh update. */ - mutex_lock(&esw->offloads.encap_tbl_lock); - complete_all(&e->res_ready); - if (err) { - e->compl_result = err; - goto out_err; - } - e->compl_result = 1; - -attach_flow: - flow->encaps[out_index].e = e; - list_add(&flow->encaps[out_index].list, &e->flows); - flow->encaps[out_index].index = out_index; - *encap_dev = e->out_dev; - if (e->flags & MLX5_ENCAP_ENTRY_VALID) { - attr->esw_attr->dests[out_index].pkt_reformat = e->pkt_reformat; - attr->esw_attr->dests[out_index].flags |= MLX5_ESW_DEST_ENCAP_VALID; - *encap_valid = true; - } else { - *encap_valid = false; - } - mutex_unlock(&esw->offloads.encap_tbl_lock); - - return err; - -out_err: - mutex_unlock(&esw->offloads.encap_tbl_lock); - if (e) - mlx5e_encap_put(priv, e); - return err; - -out_err_init: - mutex_unlock(&esw->offloads.encap_tbl_lock); - kfree(tun_info); - kfree(e); - return err; -} - -static int mlx5e_attach_decap(struct mlx5e_priv *priv, - struct mlx5e_tc_flow *flow, - struct netlink_ext_ack *extack) -{ - struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - struct mlx5_esw_flow_attr *attr = flow->attr->esw_attr; - struct mlx5e_tc_flow_parse_attr *parse_attr; - struct mlx5e_decap_entry *d; - struct mlx5e_decap_key key; - uintptr_t hash_key; - int err = 0; - - parse_attr = flow->attr->parse_attr; - if (sizeof(parse_attr->eth) > MLX5_CAP_ESW(priv->mdev, max_encap_header_size)) { - NL_SET_ERR_MSG_MOD(extack, - "encap header larger than max supported"); - return -EOPNOTSUPP; - } - - key.key = parse_attr->eth; - hash_key = hash_decap_info(&key); - mutex_lock(&esw->offloads.decap_tbl_lock); - d = mlx5e_decap_get(priv, &key, hash_key); - if (d) { - mutex_unlock(&esw->offloads.decap_tbl_lock); - wait_for_completion(&d->res_ready); - mutex_lock(&esw->offloads.decap_tbl_lock); - if (d->compl_result) { - err = -EREMOTEIO; - goto out_free; - } - goto found; - } - - d = kzalloc(sizeof(*d), GFP_KERNEL); - if (!d) { - err = -ENOMEM; - goto out_err; - } - - d->key = key; - refcount_set(&d->refcnt, 1); - init_completion(&d->res_ready); - INIT_LIST_HEAD(&d->flows); - hash_add_rcu(esw->offloads.decap_tbl, &d->hlist, hash_key); - mutex_unlock(&esw->offloads.decap_tbl_lock); - - d->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev, - MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2, - sizeof(parse_attr->eth), - &parse_attr->eth, - MLX5_FLOW_NAMESPACE_FDB); - if (IS_ERR(d->pkt_reformat)) { - err = PTR_ERR(d->pkt_reformat); - d->compl_result = err; - } - mutex_lock(&esw->offloads.decap_tbl_lock); - complete_all(&d->res_ready); - if (err) - goto out_free; - -found: - flow->decap_reformat = d; - attr->decap_pkt_reformat = d->pkt_reformat; - 
list_add(&flow->l3_to_l2_reformat, &d->flows); - mutex_unlock(&esw->offloads.decap_tbl_lock); - return 0; - -out_free: - mutex_unlock(&esw->offloads.decap_tbl_lock); - mlx5e_decap_put(priv, d); - return err; - -out_err: - mutex_unlock(&esw->offloads.decap_tbl_lock); - return err; -} - static int parse_tc_vlan_action(struct mlx5e_priv *priv, const struct flow_action_entry *act, struct mlx5_esw_flow_attr *attr, @@@ -3648,8 -4247,7 +3648,8 @@@ static int parse_tc_fdb_actions(struct if (encap) { parse_attr->mirred_ifindex[esw_attr->out_count] = out_dev->ifindex; - parse_attr->tun_info[esw_attr->out_count] = dup_tun_info(info); + parse_attr->tun_info[esw_attr->out_count] = + mlx5e_dup_tun_info(info); if (!parse_attr->tun_info[esw_attr->out_count]) return -ENOMEM; encap = false; @@@ -3786,9 -4384,6 +3786,9 @@@ } }
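The mlx5e_encap_get()/mlx5e_encap_take() pair removed from en_tc.c above illustrates a common kernel idiom: walk a hash bucket and take a reference only through refcount_inc_not_zero(), so that an entry whose refcount has already dropped to zero (i.e. one that is mid-teardown) is skipped rather than resurrected. A minimal sketch of the idiom, using a hypothetical entry type and table rather than the mlx5 structures:

        #include <linux/types.h>
        #include <linux/refcount.h>
        #include <linux/hashtable.h>

        struct entry {
                u32 key;
                refcount_t refcnt;
                struct hlist_node node;
        };

        static DEFINE_HASHTABLE(tbl, 8);        /* hypothetical table */

        /* Caller must hold rcu_read_lock() or a lock that excludes removal. */
        static struct entry *entry_get(u32 key)
        {
                struct entry *e;

                hash_for_each_possible_rcu(tbl, e, node, key)
                        if (e->key == key && refcount_inc_not_zero(&e->refcnt))
                                return e;       /* reference taken: entry pinned */

                return NULL;    /* absent, or found with refcount already zero */
        }

In the mlx5 code the lookup runs under encap_tbl_lock rather than rcu_read_lock(), but the not-zero test serves the same purpose: a concurrent put may already have started tearing the entry down.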
+ /* always set IP version for indirect table handling */ + attr->ip_version = mlx5e_tc_get_ip_version(&parse_attr->spec, true); + if (MLX5_CAP_GEN(esw->dev, prio_tag_required) && action & MLX5_FLOW_CONTEXT_ACTION_VLAN_POP) { /* For prio tag mode, replace vlan pop with rewrite vlan prio @@@ -4069,6 -4664,7 +4069,6 @@@ __mlx5e_add_fdb_flow(struct mlx5e_priv return flow;
err_free: - dealloc_mod_hdr_actions(&parse_attr->mod_hdr_acts); mlx5e_flow_put(priv, flow); out: return ERR_PTR(err); @@@ -4213,7 -4809,6 +4213,7 @@@ mlx5e_add_nic_flow(struct mlx5e_priv *p return 0;
err_free: + flow_flag_set(flow, FAILED); dealloc_mod_hdr_actions(&parse_attr->mod_hdr_acts); mlx5e_flow_put(priv, flow); out: @@@ -4445,7 -5040,7 +4445,7 @@@ static int apply_police_params(struct m */ if (rate) { rate = (rate * BITS_PER_BYTE) + 500000; - rate_mbps = max_t(u32, do_div(rate, 1000000), 1); + rate_mbps = max_t(u64, do_div(rate, 1000000), 1); }
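One detail worth pausing on in the apply_police_params() hunk above: do_div() is not a pure function. It divides its 64-bit first argument in place and evaluates to the 32-bit remainder, so the max_t() on that line operates on the remainder while the quotient is left behind in rate; the u32-to-u64 change only widens the comparison, it does not alter what do_div() returns. A small sketch of the semantics:

        #include <linux/types.h>
        #include <asm/div64.h>

        static u64 bps_to_mbps(u64 rate_bps)
        {
                u64 rate = rate_bps;
                u32 rem;

                rem = do_div(rate, 1000000);    /* rate now holds the quotient */
                (void)rem;                      /* do_div() yielded the remainder */
                return rate;                    /* whole megabits per second */
        }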
err = mlx5_esw_modify_vport_rate(esw, vport_num, rate_mbps); @@@ -4626,8 -5221,6 +4626,8 @@@ int mlx5e_tc_nic_init(struct mlx5e_pri if (err) return err;
+ lockdep_set_class(&tc->ht.mutex, &tc_ht_lock_key); + if (MLX5_CAP_FLOWTABLE_NIC_RX(priv->mdev, ignore_flow_level)) { attr.flags = MLX5_CHAINS_AND_PRIOS_SUPPORTED | MLX5_CHAINS_IGNORE_FLOW_LEVEL_SUPPORTED; @@@ -4735,8 -5328,7 +4735,8 @@@ int mlx5e_tc_esw_init(struct rhashtabl } uplink_priv->tunnel_mapping = mapping;
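The lockdep_set_class() added here (and again for the uplink tc_ht in a later hunk) gives this hashtable's inner mutex a lock class of its own. Every rhashtable's ht.mutex is initialized at the same mutex_init() call site inside rhashtable_init(), so lockdep would otherwise place all of them in one class and could report false dependency cycles when one table is used while another is held. The reclassification idiom in isolation (names hypothetical):

        #include <linux/mutex.h>
        #include <linux/lockdep.h>

        static struct lock_class_key my_lock_key;       /* one key per class */

        static void my_init(struct mutex *m)
        {
                mutex_init(m);
                /* From here on, lockdep tracks this mutex separately from
                 * others that were initialized at the same source location.
                 */
                lockdep_set_class(m, &my_lock_key);
        }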
- mapping = mapping_create(sz_enc_opts, ENC_OPTS_BITS_MASK, true); + /* 0xFFF is reserved for stack devices slow path table mark */ + mapping = mapping_create(sz_enc_opts, ENC_OPTS_BITS_MASK - 1, true); if (IS_ERR(mapping)) { err = PTR_ERR(mapping); goto err_enc_opts_mapping; @@@ -4747,18 -5339,8 +4747,18 @@@ if (err) goto err_ht_init;
- return err; + lockdep_set_class(&tc_ht->mutex, &tc_ht_lock_key); + + uplink_priv->encap = mlx5e_tc_tun_init(priv); + if (IS_ERR(uplink_priv->encap)) { + err = PTR_ERR(uplink_priv->encap); + goto err_register_fib_notifier; + }
+ return 0; + +err_register_fib_notifier: + rhashtable_destroy(tc_ht); err_ht_init: mapping_destroy(uplink_priv->tunnel_enc_opts_mapping); err_enc_opts_mapping: @@@ -4775,11 -5357,10 +4775,11 @@@ void mlx5e_tc_esw_cleanup(struct rhasht { struct mlx5_rep_uplink_priv *uplink_priv;
- rhashtable_free_and_destroy(tc_ht, _mlx5e_tc_del_flow, NULL); - uplink_priv = container_of(tc_ht, struct mlx5_rep_uplink_priv, tc_ht);
+ rhashtable_free_and_destroy(tc_ht, _mlx5e_tc_del_flow, NULL); + mlx5e_tc_tun_cleanup(uplink_priv->encap); + mapping_destroy(uplink_priv->tunnel_enc_opts_mapping); mapping_destroy(uplink_priv->tunnel_mapping);
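In the cleanup hunk above, the container_of() line moves ahead of rhashtable_free_and_destroy(), presumably just so uplink_priv is in hand for the new mlx5e_tc_tun_cleanup() call; container_of() is pure pointer arithmetic (member address minus offsetof), so the reorder has no behavioral effect of its own. The construct in isolation, with hypothetical types:

        #include <linux/kernel.h>

        struct inner {
                int b;
        };

        struct outer {
                int a;
                struct inner member;
        };

        static struct outer *outer_of(struct inner *p)
        {
                /* p minus offsetof(struct outer, member), with type checking */
                return container_of(p, struct outer, member);
        }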
@@@ -4879,7 -5460,7 +4879,7 @@@ bool mlx5e_tc_update_skb(struct mlx5_cq tc_skb_ext->chain = chain;
zone_restore_id = (reg_b >> REG_MAPPING_SHIFT(NIC_ZONE_RESTORE_TO_REG)) & - ZONE_RESTORE_MAX; + ESW_ZONE_ID_MASK;
if (!mlx5e_tc_ct_restore_flow(tc->ct, skb, zone_restore_id)) diff --combined drivers/net/ethernet/mellanox/mlx5/core/main.c index e4c9627485aa,ba1a4ae28097..2f2c352f301e --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@@ -73,9 -73,6 +73,9 @@@ #include "ecpf.h" #include "lib/hv_vhca.h" #include "diag/rsc_dump.h" +#include "sf/vhca_event.h" +#include "sf/dev/dev.h" +#include "sf/sf.h"
MODULE_AUTHOR("Eli Cohen eli@mellanox.com"); MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) core driver"); @@@ -85,6 -82,7 +85,6 @@@ unsigned int mlx5_core_debug_mask module_param_named(debug_mask, mlx5_core_debug_mask, uint, 0644); MODULE_PARM_DESC(debug_mask, "debug mask: 1 = dump cmd data, 2 = dump cmd exec time, 3 = both. Default=0");
-#define MLX5_DEFAULT_PROF 2 static unsigned int prof_sel = MLX5_DEFAULT_PROF; module_param_named(prof_sel, prof_sel, uint, 0444); MODULE_PARM_DESC(prof_sel, "profile selector. Valid range 0 - 2"); @@@ -569,8 -567,6 +569,8 @@@ static int handle_hca_cap(struct mlx5_c if (MLX5_CAP_GEN_MAX(dev, mkey_by_name)) MLX5_SET(cmd_hca_cap, set_hca_cap, mkey_by_name, 1);
+ mlx5_vhca_state_cap_handle(dev, set_hca_cap); + return set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE); }
@@@ -888,24 -884,6 +888,24 @@@ static int mlx5_init_once(struct mlx5_c goto err_eswitch_cleanup; }
+ err = mlx5_vhca_event_init(dev); + if (err) { + mlx5_core_err(dev, "Failed to init vhca event notifier %d\n", err); + goto err_fpga_cleanup; + } + + err = mlx5_sf_hw_table_init(dev); + if (err) { + mlx5_core_err(dev, "Failed to init SF HW table %d\n", err); + goto err_sf_hw_table_cleanup; + } + + err = mlx5_sf_table_init(dev); + if (err) { + mlx5_core_err(dev, "Failed to init SF table %d\n", err); + goto err_sf_table_cleanup; + } + dev->dm = mlx5_dm_create(dev); if (IS_ERR(dev->dm)) mlx5_core_warn(dev, "Failed to init device memory%d\n", err); @@@ -916,12 -894,6 +916,12 @@@
return 0;
+err_sf_table_cleanup: + mlx5_sf_hw_table_cleanup(dev); +err_sf_hw_table_cleanup: + mlx5_vhca_event_cleanup(dev); +err_fpga_cleanup: + mlx5_fpga_cleanup(dev); err_eswitch_cleanup: mlx5_eswitch_cleanup(dev->priv.eswitch); err_sriov_cleanup: @@@ -953,9 -925,6 +953,9 @@@ static void mlx5_cleanup_once(struct ml mlx5_hv_vhca_destroy(dev->hv_vhca); mlx5_fw_tracer_destroy(dev->tracer); mlx5_dm_cleanup(dev); + mlx5_sf_table_cleanup(dev); + mlx5_sf_hw_table_cleanup(dev); + mlx5_vhca_event_cleanup(dev); mlx5_fpga_cleanup(dev); mlx5_eswitch_cleanup(dev->priv.eswitch); mlx5_sriov_cleanup(dev); @@@ -1160,14 -1129,6 +1160,14 @@@ static int mlx5_load(struct mlx5_core_d goto err_sriov; }
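The new SF error labels above follow the kernel's usual goto-unwind ladder: each init step that can fail jumps to a label that tears down only what was set up before it, so reading the labels bottom-up replays the init sequence in reverse. A bare skeleton of the pattern (init_a()/cleanup_a() and friends are placeholders):

        static int init_all(struct my_dev *d)
        {
                int err;

                err = init_a(d);
                if (err)
                        return err;

                err = init_b(d);
                if (err)
                        goto err_a;

                err = init_c(d);
                if (err)
                        goto err_b;

                return 0;

        err_b:
                cleanup_b(d);
        err_a:
                cleanup_a(d);
                return err;
        }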
+ mlx5_vhca_event_start(dev); + + err = mlx5_sf_hw_table_create(dev); + if (err) { + mlx5_core_err(dev, "sf table create failed %d\n", err); + goto err_vhca; + } + err = mlx5_ec_init(dev); if (err) { mlx5_core_err(dev, "Failed to init embedded CPU\n"); @@@ -1180,16 -1141,11 +1180,16 @@@ goto err_sriov; }
+ mlx5_sf_dev_table_create(dev); + return 0;
err_sriov: mlx5_ec_cleanup(dev); err_ec: + mlx5_sf_hw_table_destroy(dev); +err_vhca: + mlx5_vhca_event_stop(dev); mlx5_cleanup_fs(dev); err_fs: mlx5_accel_tls_cleanup(dev); @@@ -1215,11 -1171,8 +1215,11 @@@ err_irq_table
static void mlx5_unload(struct mlx5_core_dev *dev) { + mlx5_sf_dev_table_destroy(dev); mlx5_sriov_detach(dev); mlx5_ec_cleanup(dev); + mlx5_sf_hw_table_destroy(dev); + mlx5_vhca_event_stop(dev); mlx5_cleanup_fs(dev); mlx5_accel_ipsec_cleanup(dev); mlx5_accel_tls_cleanup(dev); @@@ -1330,7 -1283,7 +1330,7 @@@ out mutex_unlock(&dev->intf_state_mutex); }
-static int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx) +int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx) { struct mlx5_priv *priv = &dev->priv; int err; @@@ -1352,8 -1305,6 +1352,8 @@@
priv->dbg_root = debugfs_create_dir(dev_name(dev->device), mlx5_debugfs_root); + INIT_LIST_HEAD(&priv->traps); + err = mlx5_health_init(dev); if (err) goto err_health_init; @@@ -1382,7 -1333,7 +1382,7 @@@ err_health_init return err; }
-static void mlx5_mdev_uninit(struct mlx5_core_dev *dev) +void mlx5_mdev_uninit(struct mlx5_core_dev *dev) { struct mlx5_priv *priv = &dev->priv;
@@@ -1445,7 -1396,8 +1445,8 @@@ static int init_one(struct pci_dev *pde dev_err(&pdev->dev, "mlx5_crdump_enable failed with error code %d\n", err);
pci_save_state(pdev); - devlink_reload_enable(devlink); + if (!mlx5_core_is_mp_slave(dev)) + devlink_reload_enable(devlink); return 0;
err_load_one: @@@ -1725,10 -1677,6 +1726,10 @@@ static int __init init(void if (err) goto err_debug;
+ err = mlx5_sf_driver_register(); + if (err) + goto err_sf; + #ifdef CONFIG_MLX5_CORE_EN err = mlx5e_init(); if (err) { @@@ -1739,8 -1687,6 +1740,8 @@@
return 0;
+err_sf: + pci_unregister_driver(&mlx5_core_driver); err_debug: mlx5_unregister_debugfs(); return err; @@@ -1751,7 -1697,6 +1752,7 @@@ static void __exit cleanup(void #ifdef CONFIG_MLX5_CORE_EN mlx5e_cleanup(); #endif + mlx5_sf_driver_unregister(); pci_unregister_driver(&mlx5_core_driver); mlx5_unregister_debugfs(); } diff --combined drivers/net/ethernet/realtek/r8169_main.c index cbc30df4e08a,e7a59dc5fe49..9ce98e3d3f9f --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@@ -28,7 -28,6 +28,7 @@@ #include <linux/bitfield.h> #include <linux/prefetch.h> #include <linux/ipv6.h> +#include <asm/unaligned.h> #include <net/ip6_checksum.h>
#include "r8169.h" @@@ -146,7 -145,6 +146,7 @@@ static const struct [RTL_GIGA_MAC_VER_50] = {"RTL8168ep/8111ep" }, [RTL_GIGA_MAC_VER_51] = {"RTL8168ep/8111ep" }, [RTL_GIGA_MAC_VER_52] = {"RTL8168fp/RTL8117", FIRMWARE_8168FP_3}, + [RTL_GIGA_MAC_VER_53] = {"RTL8168fp/RTL8117", }, [RTL_GIGA_MAC_VER_60] = {"RTL8125A" }, [RTL_GIGA_MAC_VER_61] = {"RTL8125A", FIRMWARE_8125A_3}, /* reserve 62 for CFG_METHOD_4 in the vendor driver */ @@@ -262,9 -260,6 +262,9 @@@ enum rtl8168_8101_registers #define CSIAR_BYTE_ENABLE 0x0000f000 #define CSIAR_ADDR_MASK 0x00000fff PMCH = 0x6f, +#define D3COLD_NO_PLL_DOWN BIT(7) +#define D3HOT_NO_PLL_DOWN BIT(6) +#define D3_NO_PLL_DOWN (BIT(7) | BIT(6)) EPHYAR = 0x80, #define EPHYAR_FLAG 0x80000000 #define EPHYAR_WRITE_CMD 0x80000000 @@@ -534,9 -529,6 +534,9 @@@ enum rtl_rx_desc_bit IPFail = (1 << 16), /* IP checksum failed */ UDPFail = (1 << 15), /* UDP/IP checksum failed */ TCPFail = (1 << 14), /* TCP/IP checksum failed */ + +#define RxCSFailMask (IPFail | UDPFail | TCPFail) + RxVlanTag = (1 << 16), /* VLAN tag available */ };
@@@ -592,12 -584,6 +592,12 @@@ enum rtl_flag RTL_FLAG_MAX };
+enum rtl_dash_type { + RTL_DASH_NONE, + RTL_DASH_DP, + RTL_DASH_EP, +}; + struct rtl8169_private { void __iomem *mmio_addr; /* memory map physical address */ struct pci_dev *pci_dev; @@@ -605,7 -591,6 +605,7 @@@ struct phy_device *phydev; struct napi_struct napi; enum mac_version mac_version; + enum rtl_dash_type dash_type; u32 cur_rx; /* Index into the Rx descriptor buffer of next Rx pkt. */ u32 cur_tx; /* Index into the Tx descriptor buffer of next Rx pkt. */ u32 dirty_tx; @@@ -697,7 -682,7 +697,7 @@@ static bool rtl_is_8168evl_up(struct rt { return tp->mac_version >= RTL_GIGA_MAC_VER_34 && tp->mac_version != RTL_GIGA_MAC_VER_39 && - tp->mac_version <= RTL_GIGA_MAC_VER_52; + tp->mac_version <= RTL_GIGA_MAC_VER_53; }
static bool rtl_supports_eee(struct rtl8169_private *tp) @@@ -761,77 -746,14 +761,77 @@@ static const struct rtl_cond name = \ static bool name ## _check(struct rtl8169_private *tp)
-static bool rtl_ocp_reg_failure(struct rtl8169_private *tp, u32 reg) +static void r8168fp_adjust_ocp_cmd(struct rtl8169_private *tp, u32 *cmd, int type) { - if (reg & 0xffff0001) { - if (net_ratelimit()) - netdev_err(tp->dev, "Invalid ocp reg %x!\n", reg); - return true; - } - return false; + /* based on RTL8168FP_OOBMAC_BASE in vendor driver */ + if (type == ERIAR_OOB && + (tp->mac_version == RTL_GIGA_MAC_VER_52 || + tp->mac_version == RTL_GIGA_MAC_VER_53)) + *cmd |= 0x7f0 << 18; +} + +DECLARE_RTL_COND(rtl_eriar_cond) +{ + return RTL_R32(tp, ERIAR) & ERIAR_FLAG; +} + +static void _rtl_eri_write(struct rtl8169_private *tp, int addr, u32 mask, + u32 val, int type) +{ + u32 cmd = ERIAR_WRITE_CMD | type | mask | addr; + + if (WARN(addr & 3 || !mask, "addr: 0x%x, mask: 0x%08x\n", addr, mask)) + return; + + RTL_W32(tp, ERIDR, val); + r8168fp_adjust_ocp_cmd(tp, &cmd, type); + RTL_W32(tp, ERIAR, cmd); + + rtl_loop_wait_low(tp, &rtl_eriar_cond, 100, 100); +} + +static void rtl_eri_write(struct rtl8169_private *tp, int addr, u32 mask, + u32 val) +{ + _rtl_eri_write(tp, addr, mask, val, ERIAR_EXGMAC); +} + +static u32 _rtl_eri_read(struct rtl8169_private *tp, int addr, int type) +{ + u32 cmd = ERIAR_READ_CMD | type | ERIAR_MASK_1111 | addr; + + r8168fp_adjust_ocp_cmd(tp, &cmd, type); + RTL_W32(tp, ERIAR, cmd); + + return rtl_loop_wait_high(tp, &rtl_eriar_cond, 100, 100) ? + RTL_R32(tp, ERIDR) : ~0; +} + +static u32 rtl_eri_read(struct rtl8169_private *tp, int addr) +{ + return _rtl_eri_read(tp, addr, ERIAR_EXGMAC); +} + +static void rtl_w0w1_eri(struct rtl8169_private *tp, int addr, u32 p, u32 m) +{ + u32 val = rtl_eri_read(tp, addr); + + rtl_eri_write(tp, addr, ERIAR_MASK_1111, (val & ~m) | p); +} + +static void rtl_eri_set_bits(struct rtl8169_private *tp, int addr, u32 p) +{ + rtl_w0w1_eri(tp, addr, p, 0); +} + +static void rtl_eri_clear_bits(struct rtl8169_private *tp, int addr, u32 m) +{ + rtl_w0w1_eri(tp, addr, 0, m); +} + +static bool rtl_ocp_reg_failure(u32 reg) +{ + return WARN_ONCE(reg & 0xffff0001, "Invalid ocp reg %x!\n", reg); }
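The rewritten rtl_ocp_reg_failure() above leans on two properties of WARN_ONCE(): it evaluates to its condition, so the check and the report collapse into a single return expression, and it fires at most once, which is why both the old net_ratelimit() throttling and the now-unused tp argument could go. The same shape in isolation:

        #include <linux/bug.h>
        #include <linux/types.h>

        static bool reg_invalid(u32 reg)
        {
                /* Evaluates to the condition; logs (once, with a backtrace)
                 * only for the first offending call.
                 */
                return WARN_ONCE(reg & 0xffff0001, "invalid reg %x!\n", reg);
        }

Call sites keep their natural form, e.g. "if (reg_invalid(reg)) return 0;".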
DECLARE_RTL_COND(rtl_ocp_gphy_cond) @@@ -841,7 -763,7 +841,7 @@@
static void r8168_phy_ocp_write(struct rtl8169_private *tp, u32 reg, u32 data) { - if (rtl_ocp_reg_failure(tp, reg)) + if (rtl_ocp_reg_failure(reg)) return;
RTL_W32(tp, GPHY_OCP, OCPAR_FLAG | (reg << 15) | data); @@@ -851,7 -773,7 +851,7 @@@
static int r8168_phy_ocp_read(struct rtl8169_private *tp, u32 reg) { - if (rtl_ocp_reg_failure(tp, reg)) + if (rtl_ocp_reg_failure(reg)) return 0;
RTL_W32(tp, GPHY_OCP, reg << 15); @@@ -862,7 -784,7 +862,7 @@@
static void r8168_mac_ocp_write(struct rtl8169_private *tp, u32 reg, u32 data) { - if (rtl_ocp_reg_failure(tp, reg)) + if (rtl_ocp_reg_failure(reg)) return;
RTL_W32(tp, OCPDR, OCPAR_FLAG | (reg << 15) | data); @@@ -870,7 -792,7 +870,7 @@@
static u16 r8168_mac_ocp_read(struct rtl8169_private *tp, u32 reg) { - if (rtl_ocp_reg_failure(tp, reg)) + if (rtl_ocp_reg_failure(reg)) return 0;
RTL_W32(tp, OCPDR, reg << 15); @@@ -886,25 -808,6 +886,25 @@@ static void r8168_mac_ocp_modify(struc r8168_mac_ocp_write(tp, reg, (data & ~mask) | set); }
+/* Work around a hw issue with RTL8168g PHY, the quirk disables + * PHY MCU interrupts before PHY power-down. + */ +static void rtl8168g_phy_suspend_quirk(struct rtl8169_private *tp, int value) +{ + switch (tp->mac_version) { + case RTL_GIGA_MAC_VER_40: + case RTL_GIGA_MAC_VER_41: + case RTL_GIGA_MAC_VER_49: + if (value & BMCR_RESET || !(value & BMCR_PDOWN)) + rtl_eri_set_bits(tp, 0x1a8, 0xfc000000); + else + rtl_eri_clear_bits(tp, 0x1a8, 0xfc000000); + break; + default: + break; + } +}; + static void r8168g_mdio_write(struct rtl8169_private *tp, int reg, int value) { if (reg == 0x1f) { @@@ -915,9 -818,6 +915,9 @@@ if (tp->ocp_base != OCP_STD_PHY_BASE) reg -= 0x10;
+ if (tp->ocp_base == OCP_STD_PHY_BASE && reg == MII_BMCR) + rtl8168g_phy_suspend_quirk(tp, value); + r8168_phy_ocp_write(tp, tp->ocp_base + reg * 2, value); }
@@@ -1109,6 -1009,70 +1109,6 @@@ static u16 rtl_ephy_read(struct rtl8169 RTL_R32(tp, EPHYAR) & EPHYAR_DATA_MASK : ~0; }
-static void r8168fp_adjust_ocp_cmd(struct rtl8169_private *tp, u32 *cmd, int type) -{ - /* based on RTL8168FP_OOBMAC_BASE in vendor driver */ - if (tp->mac_version == RTL_GIGA_MAC_VER_52 && type == ERIAR_OOB) - *cmd |= 0x7f0 << 18; -} - -DECLARE_RTL_COND(rtl_eriar_cond) -{ - return RTL_R32(tp, ERIAR) & ERIAR_FLAG; -} - -static void _rtl_eri_write(struct rtl8169_private *tp, int addr, u32 mask, - u32 val, int type) -{ - u32 cmd = ERIAR_WRITE_CMD | type | mask | addr; - - BUG_ON((addr & 3) || (mask == 0)); - RTL_W32(tp, ERIDR, val); - r8168fp_adjust_ocp_cmd(tp, &cmd, type); - RTL_W32(tp, ERIAR, cmd); - - rtl_loop_wait_low(tp, &rtl_eriar_cond, 100, 100); -} - -static void rtl_eri_write(struct rtl8169_private *tp, int addr, u32 mask, - u32 val) -{ - _rtl_eri_write(tp, addr, mask, val, ERIAR_EXGMAC); -} - -static u32 _rtl_eri_read(struct rtl8169_private *tp, int addr, int type) -{ - u32 cmd = ERIAR_READ_CMD | type | ERIAR_MASK_1111 | addr; - - r8168fp_adjust_ocp_cmd(tp, &cmd, type); - RTL_W32(tp, ERIAR, cmd); - - return rtl_loop_wait_high(tp, &rtl_eriar_cond, 100, 100) ? - RTL_R32(tp, ERIDR) : ~0; -} - -static u32 rtl_eri_read(struct rtl8169_private *tp, int addr) -{ - return _rtl_eri_read(tp, addr, ERIAR_EXGMAC); -} - -static void rtl_w0w1_eri(struct rtl8169_private *tp, int addr, u32 p, u32 m) -{ - u32 val = rtl_eri_read(tp, addr); - - rtl_eri_write(tp, addr, ERIAR_MASK_1111, (val & ~m) | p); -} - -static void rtl_eri_set_bits(struct rtl8169_private *tp, int addr, u32 p) -{ - rtl_w0w1_eri(tp, addr, p, 0); -} - -static void rtl_eri_clear_bits(struct rtl8169_private *tp, int addr, u32 m) -{ - rtl_w0w1_eri(tp, addr, 0, m); -} - static u32 r8168dp_ocp_read(struct rtl8169_private *tp, u16 reg) { RTL_W32(tp, OCPAR, 0x0fu << 12 | (reg & 0x0fff)); @@@ -1194,10 -1158,19 +1194,10 @@@ static void rtl8168ep_driver_start(stru
static void rtl8168_driver_start(struct rtl8169_private *tp) { - switch (tp->mac_version) { - case RTL_GIGA_MAC_VER_27: - case RTL_GIGA_MAC_VER_28: - case RTL_GIGA_MAC_VER_31: + if (tp->dash_type == RTL_DASH_DP) rtl8168dp_driver_start(tp); - break; - case RTL_GIGA_MAC_VER_49 ... RTL_GIGA_MAC_VER_52: + else rtl8168ep_driver_start(tp); - break; - default: - BUG(); - break; - } }
static void rtl8168dp_driver_stop(struct rtl8169_private *tp) @@@ -1216,52 -1189,44 +1216,52 @@@ static void rtl8168ep_driver_stop(struc
static void rtl8168_driver_stop(struct rtl8169_private *tp) { - switch (tp->mac_version) { - case RTL_GIGA_MAC_VER_27: - case RTL_GIGA_MAC_VER_28: - case RTL_GIGA_MAC_VER_31: + if (tp->dash_type == RTL_DASH_DP) rtl8168dp_driver_stop(tp); - break; - case RTL_GIGA_MAC_VER_49 ... RTL_GIGA_MAC_VER_52: + else rtl8168ep_driver_stop(tp); - break; - default: - BUG(); - break; - } }
static bool r8168dp_check_dash(struct rtl8169_private *tp) { u16 reg = rtl8168_get_ocp_reg(tp);
- return !!(r8168dp_ocp_read(tp, reg) & 0x00008000); + return r8168dp_ocp_read(tp, reg) & BIT(15); }
static bool r8168ep_check_dash(struct rtl8169_private *tp) { - return r8168ep_ocp_read(tp, 0x128) & 0x00000001; + return r8168ep_ocp_read(tp, 0x128) & BIT(0); }
-static bool r8168_check_dash(struct rtl8169_private *tp) +static enum rtl_dash_type rtl_check_dash(struct rtl8169_private *tp) { switch (tp->mac_version) { case RTL_GIGA_MAC_VER_27: case RTL_GIGA_MAC_VER_28: case RTL_GIGA_MAC_VER_31: - return r8168dp_check_dash(tp); - case RTL_GIGA_MAC_VER_49 ... RTL_GIGA_MAC_VER_52: - return r8168ep_check_dash(tp); + return r8168dp_check_dash(tp) ? RTL_DASH_DP : RTL_DASH_NONE; + case RTL_GIGA_MAC_VER_49 ... RTL_GIGA_MAC_VER_53: + return r8168ep_check_dash(tp) ? RTL_DASH_EP : RTL_DASH_NONE; default: - return false; + return RTL_DASH_NONE; + } +} + +static void rtl_set_d3_pll_down(struct rtl8169_private *tp, bool enable) +{ + switch (tp->mac_version) { + case RTL_GIGA_MAC_VER_25 ... RTL_GIGA_MAC_VER_26: + case RTL_GIGA_MAC_VER_29 ... RTL_GIGA_MAC_VER_30: + case RTL_GIGA_MAC_VER_32 ... RTL_GIGA_MAC_VER_37: + case RTL_GIGA_MAC_VER_39 ... RTL_GIGA_MAC_VER_63: + if (enable) + RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) & ~D3_NO_PLL_DOWN); + else + RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) | D3_NO_PLL_DOWN); + break; + default: + break; } }
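rtl_set_d3_pll_down() above is a plain read-modify-write on the PMCH register: read the byte, clear or set the D3_NO_PLL_DOWN bits, write it back. Factored into a helper it would look like the sketch below; rtl_w8_mod() is invented for illustration, while RTL_R8()/RTL_W8() are the driver's real accessors:

        static void rtl_w8_mod(struct rtl8169_private *tp, int reg,
                               u8 set, u8 clear)
        {
                u8 val = RTL_R8(tp, reg);

                val &= ~clear;          /* drop the bits to be cleared... */
                val |= set;             /* ...then raise the bits to be set */
                RTL_W8(tp, reg, val);
        }

        /* e.g. rtl_w8_mod(tp, PMCH, 0, D3_NO_PLL_DOWN) permits PLL-down in D3 */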
@@@ -1431,7 -1396,6 +1431,7 @@@ static void __rtl8169_set_wol(struct rt rtl_lock_config_regs(tp);
device_set_wakeup_enable(tp_to_dev(tp), wolopts); + rtl_set_d3_pll_down(tp, !wolopts); tp->dev->wol_enabled = wolopts ? 1 : 0; }
@@@ -1966,7 -1930,6 +1966,7 @@@ static enum mac_version rtl8169_get_mac { 0x7c8, 0x608, RTL_GIGA_MAC_VER_61 },
/* RTL8117 */ + { 0x7cf, 0x54b, RTL_GIGA_MAC_VER_53 }, { 0x7cf, 0x54a, RTL_GIGA_MAC_VER_52 },
/* 8168EP family. */ @@@ -1999,11 -1962,7 +1999,11 @@@ { 0x7c8, 0x280, RTL_GIGA_MAC_VER_26 },
/* 8168DP family. */ - { 0x7cf, 0x288, RTL_GIGA_MAC_VER_27 }, + /* It seems this early RTL8168dp version never made it to + * the wild. Let's see whether somebody complains, if not + * we'll remove support for this chip version completely. + * { 0x7cf, 0x288, RTL_GIGA_MAC_VER_27 }, + */ { 0x7cf, 0x28a, RTL_GIGA_MAC_VER_28 }, { 0x7cf, 0x28b, RTL_GIGA_MAC_VER_31 },
@@@ -2037,12 -1996,9 +2037,12 @@@ { 0x7c8, 0x348, RTL_GIGA_MAC_VER_09 }, { 0x7c8, 0x248, RTL_GIGA_MAC_VER_09 }, { 0x7c8, 0x340, RTL_GIGA_MAC_VER_16 }, - /* FIXME: where did these entries come from ? -- FR */ - { 0xfc8, 0x388, RTL_GIGA_MAC_VER_13 }, - { 0xfc8, 0x308, RTL_GIGA_MAC_VER_13 }, + /* FIXME: where did these entries come from ? -- FR + * Not even r8101 vendor driver knows these id's, + * so let's disable detection for now. -- HK + * { 0xfc8, 0x388, RTL_GIGA_MAC_VER_13 }, + * { 0xfc8, 0x308, RTL_GIGA_MAC_VER_13 }, + */
/* 8110 family. */ { 0xfc8, 0x980, RTL_GIGA_MAC_VER_06 }, @@@ -2125,12 -2081,18 +2125,12 @@@ static void rtl8125b_config_eee_mac(str r8168_mac_ocp_modify(tp, 0xe040, 0, BIT(1) | BIT(0)); }
-static void rtl_rar_exgmac_set(struct rtl8169_private *tp, u8 *addr) +static void rtl_rar_exgmac_set(struct rtl8169_private *tp, const u8 *addr) { - const u16 w[] = { - addr[0] | (addr[1] << 8), - addr[2] | (addr[3] << 8), - addr[4] | (addr[5] << 8) - }; - - rtl_eri_write(tp, 0xe0, ERIAR_MASK_1111, w[0] | (w[1] << 16)); - rtl_eri_write(tp, 0xe4, ERIAR_MASK_1111, w[2]); - rtl_eri_write(tp, 0xf0, ERIAR_MASK_1111, w[0] << 16); - rtl_eri_write(tp, 0xf4, ERIAR_MASK_1111, w[1] | (w[2] << 16)); + rtl_eri_write(tp, 0xe0, ERIAR_MASK_1111, get_unaligned_le32(addr)); + rtl_eri_write(tp, 0xe4, ERIAR_MASK_1111, get_unaligned_le16(addr + 4)); + rtl_eri_write(tp, 0xf0, ERIAR_MASK_1111, get_unaligned_le16(addr) << 16); + rtl_eri_write(tp, 0xf4, ERIAR_MASK_1111, get_unaligned_le32(addr + 2)); }
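The rewritten rtl_rar_exgmac_set() above swaps hand-rolled byte shifting for get_unaligned_le16()/get_unaligned_le32() from <asm/unaligned.h> (newly included at the top of this file's diff). These assemble a little-endian value from a byte buffer regardless of the buffer's alignment or the host's endianness. For example, over a 6-byte MAC address buffer:

        #include <linux/types.h>
        #include <linux/printk.h>
        #include <asm/unaligned.h>

        static void show_mac_words(const u8 *addr)      /* addr: 6-byte MAC */
        {
                u32 lo = get_unaligned_le32(addr);      /* addr[0] | addr[1] << 8 | ... */
                u16 hi = get_unaligned_le16(addr + 4);  /* addr[4] | addr[5] << 8 */

                pr_info("mac words: %08x %04x\n", lo, hi);
        }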
u16 rtl8168h_2_get_adc_bias_ioffset(struct rtl8169_private *tp) @@@ -2180,14 -2142,14 +2180,14 @@@ static void rtl8169_init_phy(struct rtl genphy_soft_reset(tp->phydev); }
-static void rtl_rar_set(struct rtl8169_private *tp, u8 *addr) +static void rtl_rar_set(struct rtl8169_private *tp, const u8 *addr) { rtl_unlock_config_regs(tp);
- RTL_W32(tp, MAC4, addr[4] | addr[5] << 8); + RTL_W32(tp, MAC4, get_unaligned_le16(addr + 4)); rtl_pci_commit(tp);
- RTL_W32(tp, MAC0, addr[0] | addr[1] << 8 | addr[2] << 16 | addr[3] << 24); + RTL_W32(tp, MAC0, get_unaligned_le32(addr)); rtl_pci_commit(tp);
if (tp->mac_version == RTL_GIGA_MAC_VER_34) @@@ -2210,16 -2172,28 +2210,16 @@@ static int rtl_set_mac_address(struct n return 0; }
-static void rtl_wol_suspend_quirk(struct rtl8169_private *tp) +static void rtl_wol_enable_rx(struct rtl8169_private *tp) { - switch (tp->mac_version) { - case RTL_GIGA_MAC_VER_25: - case RTL_GIGA_MAC_VER_26: - case RTL_GIGA_MAC_VER_29: - case RTL_GIGA_MAC_VER_30: - case RTL_GIGA_MAC_VER_32: - case RTL_GIGA_MAC_VER_33: - case RTL_GIGA_MAC_VER_34: - case RTL_GIGA_MAC_VER_37 ... RTL_GIGA_MAC_VER_63: + if (tp->mac_version >= RTL_GIGA_MAC_VER_25) RTL_W32(tp, RxConfig, RTL_R32(tp, RxConfig) | AcceptBroadcast | AcceptMulticast | AcceptMyPhys); - break; - default: - break; - } }
-static void rtl_pll_power_down(struct rtl8169_private *tp) +static void rtl_prepare_power_down(struct rtl8169_private *tp) { - if (r8168_check_dash(tp)) + if (tp->dash_type != RTL_DASH_NONE) return;
if (tp->mac_version == RTL_GIGA_MAC_VER_32 || @@@ -2228,10 -2202,68 +2228,35 @@@
if (device_may_wakeup(tp_to_dev(tp))) { phy_speed_down(tp->phydev, false); - rtl_wol_suspend_quirk(tp); - return; + rtl_wol_enable_rx(tp); } + + switch (tp->mac_version) { + case RTL_GIGA_MAC_VER_25 ... RTL_GIGA_MAC_VER_26: + case RTL_GIGA_MAC_VER_29 ... RTL_GIGA_MAC_VER_30: + case RTL_GIGA_MAC_VER_32 ... RTL_GIGA_MAC_VER_33: + case RTL_GIGA_MAC_VER_37: + case RTL_GIGA_MAC_VER_39: + case RTL_GIGA_MAC_VER_43: + case RTL_GIGA_MAC_VER_44: + case RTL_GIGA_MAC_VER_45: + case RTL_GIGA_MAC_VER_46: + case RTL_GIGA_MAC_VER_47: + case RTL_GIGA_MAC_VER_48: + case RTL_GIGA_MAC_VER_50 ... RTL_GIGA_MAC_VER_63: + RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) & ~0x80); + break; + case RTL_GIGA_MAC_VER_40: + case RTL_GIGA_MAC_VER_41: + case RTL_GIGA_MAC_VER_49: + rtl_eri_clear_bits(tp, 0x1a8, 0xfc000000); + RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) & ~0x80); + break; + default: + break; + } }
-static void rtl_pll_power_up(struct rtl8169_private *tp) -{ - switch (tp->mac_version) { - case RTL_GIGA_MAC_VER_25 ... RTL_GIGA_MAC_VER_26: - case RTL_GIGA_MAC_VER_29 ... RTL_GIGA_MAC_VER_30: - case RTL_GIGA_MAC_VER_32 ... RTL_GIGA_MAC_VER_33: - case RTL_GIGA_MAC_VER_37: - case RTL_GIGA_MAC_VER_39: - case RTL_GIGA_MAC_VER_43: - RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) | 0x80); - break; - case RTL_GIGA_MAC_VER_44: - case RTL_GIGA_MAC_VER_45: - case RTL_GIGA_MAC_VER_46: - case RTL_GIGA_MAC_VER_47: - case RTL_GIGA_MAC_VER_48: - case RTL_GIGA_MAC_VER_50 ... RTL_GIGA_MAC_VER_63: - RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) | 0xc0); - break; - case RTL_GIGA_MAC_VER_40: - case RTL_GIGA_MAC_VER_41: - case RTL_GIGA_MAC_VER_49: - RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) | 0xc0); - rtl_eri_set_bits(tp, 0x1a8, 0xfc000000); - break; - default: - break; - } - - phy_resume(tp->phydev); -} - static void rtl_init_rxcfg(struct rtl8169_private *tp) { switch (tp->mac_version) { @@@ -2244,7 -2276,7 +2269,7 @@@ case RTL_GIGA_MAC_VER_38: RTL_W32(tp, RxConfig, RX128_INT_EN | RX_MULTI_EN | RX_DMA_BURST); break; - case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_52: + case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_53: RTL_W32(tp, RxConfig, RX128_INT_EN | RX_MULTI_EN | RX_DMA_BURST | RX_EARLY_OFF); break; case RTL_GIGA_MAC_VER_60 ... RTL_GIGA_MAC_VER_63: @@@ -2310,14 -2342,13 +2335,14 @@@ static void r8168b_1_hw_jumbo_disable(s static void rtl_jumbo_config(struct rtl8169_private *tp) { bool jumbo = tp->dev->mtu > ETH_DATA_LEN; + int readrq = 4096;
rtl_unlock_config_regs(tp); switch (tp->mac_version) { case RTL_GIGA_MAC_VER_12: case RTL_GIGA_MAC_VER_17: if (jumbo) { - pcie_set_readrq(tp->pci_dev, 512); + readrq = 512; r8168b_1_hw_jumbo_enable(tp); } else { r8168b_1_hw_jumbo_disable(tp); @@@ -2325,7 -2356,7 +2350,7 @@@ break; case RTL_GIGA_MAC_VER_18 ... RTL_GIGA_MAC_VER_26: if (jumbo) { - pcie_set_readrq(tp->pci_dev, 512); + readrq = 512; r8168c_hw_jumbo_enable(tp); } else { r8168c_hw_jumbo_disable(tp); @@@ -2338,18 -2369,20 +2363,18 @@@ r8168dp_hw_jumbo_disable(tp); break; case RTL_GIGA_MAC_VER_31 ... RTL_GIGA_MAC_VER_33: - if (jumbo) { - pcie_set_readrq(tp->pci_dev, 512); + if (jumbo) r8168e_hw_jumbo_enable(tp); - } else { + else r8168e_hw_jumbo_disable(tp); - } break; default: break; } rtl_lock_config_regs(tp);
- if (!jumbo && pci_is_pcie(tp->pci_dev) && tp->supports_gmii) - pcie_set_readrq(tp->pci_dev, 4096); + if (pci_is_pcie(tp->pci_dev) && tp->supports_gmii) + pcie_set_readrq(tp->pci_dev, readrq); }
DECLARE_RTL_COND(rtl_chipcmd_cond) @@@ -2418,7 -2451,7 +2443,7 @@@ DECLARE_RTL_COND(rtl_rxtx_empty_cond_2 static void rtl_wait_txrx_fifo_empty(struct rtl8169_private *tp) { switch (tp->mac_version) { - case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_52: + case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_53: rtl_loop_wait_high(tp, &rtl_txcfg_empty_cond, 100, 42); rtl_loop_wait_high(tp, &rtl_rxtx_empty_cond, 100, 42); break; @@@ -3677,7 -3710,6 +3702,7 @@@ static void rtl_hw_config(struct rtl816 [RTL_GIGA_MAC_VER_50] = rtl_hw_start_8168ep_2, [RTL_GIGA_MAC_VER_51] = rtl_hw_start_8168ep_3, [RTL_GIGA_MAC_VER_52] = rtl_hw_start_8117, + [RTL_GIGA_MAC_VER_53] = rtl_hw_start_8117, [RTL_GIGA_MAC_VER_60] = rtl_hw_start_8125a_1, [RTL_GIGA_MAC_VER_61] = rtl_hw_start_8125a_2, [RTL_GIGA_MAC_VER_63] = rtl_hw_start_8125b, @@@ -4437,9 -4469,10 +4462,9 @@@ static inline int rtl8169_fragmented_fr
static inline void rtl8169_rx_csum(struct sk_buff *skb, u32 opts1) { - u32 status = opts1 & RxProtoMask; + u32 status = opts1 & (RxProtoMask | RxCSFailMask);
- if (((status == RxProtoTCP) && !(opts1 & TCPFail)) || - ((status == RxProtoUDP) && !(opts1 & UDPFail))) + if (status == RxProtoTCP || status == RxProtoUDP) skb->ip_summed = CHECKSUM_UNNECESSARY; else skb_checksum_none_assert(skb); @@@ -4553,10 -4586,8 +4578,10 @@@ static irqreturn_t rtl8169_interrupt(in rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_PENDING); }
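The rtl8169_rx_csum() change above folds the checksum-failure bits into the comparison mask: once status = opts1 & (RxProtoMask | RxCSFailMask), equality with RxProtoTCP or RxProtoUDP can only hold when every failure bit is zero, so a single compare replaces the old protocol-plus-fail-bit pairs. The trick in isolation, with hypothetical bit assignments:

        #include <linux/types.h>

        #define PROTO_MASK      0x3     /* hypothetical 2-bit protocol field */
        #define PROTO_TCP       0x1
        #define PROTO_UDP       0x2
        #define CSUM_FAIL       0x4     /* hypothetical checksum-failed flag */

        static bool csum_ok(u32 opts)
        {
                u32 status = opts & (PROTO_MASK | CSUM_FAIL);

                /* true only if the protocol matches AND CSUM_FAIL is clear */
                return status == PROTO_TCP || status == PROTO_UDP;
        }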
- rtl_irq_disable(tp); - napi_schedule(&tp->napi); + if (napi_schedule_prep(&tp->napi)) { + rtl_irq_disable(tp); + __napi_schedule(&tp->napi); + } out: rtl_ack_events(tp, status);
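The interrupt-handler hunk above adopts the canonical NAPI scheduling sequence: napi_schedule_prep() atomically claims the NAPI instance, and only on success does the driver mask device interrupts and call __napi_schedule(); if the poller is already queued (or NAPI is disabled), nothing is touched. The generic shape, with my_priv and my_hw_irq_disable() as stand-ins:

        #include <linux/interrupt.h>
        #include <linux/netdevice.h>

        struct my_priv {                        /* hypothetical driver state */
                struct napi_struct napi;
        };

        static void my_hw_irq_disable(struct my_priv *p)
        {
                /* mask interrupt sources in the device */
        }

        static irqreturn_t my_isr(int irq, void *dev_id)
        {
                struct my_priv *priv = dev_id;

                if (napi_schedule_prep(&priv->napi)) {  /* claim NAPI */
                        my_hw_irq_disable(priv);        /* safe to mask now */
                        __napi_schedule(&priv->napi);   /* queue the poll */
                }
                return IRQ_HANDLED;
        }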
@@@ -4588,10 -4619,10 +4613,10 @@@ static int rtl8169_poll(struct napi_str struct net_device *dev = tp->dev; int work_done;
- work_done = rtl_rx(dev, tp, budget); - rtl_tx(dev, tp, budget);
+ work_done = rtl_rx(dev, tp, budget); + if (work_done < budget && napi_complete_done(napi, work_done)) rtl_irq_enable(tp);
@@@ -4648,12 -4679,12 +4673,12 @@@ static void rtl8169_down(struct rtl8169
rtl8169_cleanup(tp, true);
- rtl_pll_power_down(tp); + rtl_prepare_power_down(tp); }
static void rtl8169_up(struct rtl8169_private *tp) { - rtl_pll_power_up(tp); + phy_resume(tp->phydev); rtl8169_init_phy(tp); napi_enable(&tp->napi); set_bit(RTL_FLAG_TASK_ENABLED, tp->wk.flags); @@@ -4808,12 -4839,9 +4833,12 @@@ static void rtl8169_net_suspend(struct
#ifdef CONFIG_PM
-static int rtl8169_net_resume(struct rtl8169_private *tp) +static int rtl8169_runtime_resume(struct device *dev) { + struct rtl8169_private *tp = dev_get_drvdata(dev); + rtl_rar_set(tp, tp->dev->dev_addr); + __rtl8169_set_wol(tp, tp->saved_wolopts);
if (tp->TxDescArray) rtl8169_up(tp); @@@ -4847,7 -4875,7 +4872,7 @@@ static int __maybe_unused rtl8169_resum if (tp->mac_version == RTL_GIGA_MAC_VER_37) rtl_init_rxcfg(tp);
- return rtl8169_net_resume(tp); + return rtl8169_runtime_resume(device); }
static int rtl8169_runtime_suspend(struct device *device) @@@ -4867,6 -4895,15 +4892,6 @@@ return 0; }
-static int rtl8169_runtime_resume(struct device *device) -{ - struct rtl8169_private *tp = dev_get_drvdata(device); - - __rtl8169_set_wol(tp, tp->saved_wolopts); - - return rtl8169_net_resume(tp); -} - static int rtl8169_runtime_idle(struct device *device) { struct rtl8169_private *tp = dev_get_drvdata(device); @@@ -4914,10 -4951,12 +4939,10 @@@ static void rtl_shutdown(struct pci_de rtl_rar_set(tp, tp->dev->perm_addr);
if (system_state == SYSTEM_POWER_OFF) { - if (tp->saved_wolopts) { - rtl_wol_suspend_quirk(tp); + if (tp->saved_wolopts) rtl_wol_shutdown_quirk(tp); - }
- pci_wake_from_d3(pdev, true); + pci_wake_from_d3(pdev, tp->saved_wolopts); pci_set_power_state(pdev, PCI_D3hot); } } @@@ -4931,7 -4970,7 +4956,7 @@@ static void rtl_remove_one(struct pci_d
unregister_netdev(tp->dev);
- if (r8168_check_dash(tp)) + if (tp->dash_type != RTL_DASH_NONE) rtl8168_driver_stop(tp);
rtl_release_firmware(tp); @@@ -4999,12 -5038,16 +5024,12 @@@ static void rtl_read_mac_address(struc { /* Get MAC address */ if (rtl_is_8168evl_up(tp) && tp->mac_version != RTL_GIGA_MAC_VER_34) { - u32 value = rtl_eri_read(tp, 0xe0); - - mac_addr[0] = (value >> 0) & 0xff; - mac_addr[1] = (value >> 8) & 0xff; - mac_addr[2] = (value >> 16) & 0xff; - mac_addr[3] = (value >> 24) & 0xff; + u32 value;
+ value = rtl_eri_read(tp, 0xe0); + put_unaligned_le32(value, mac_addr); value = rtl_eri_read(tp, 0xe4); - mac_addr[4] = (value >> 0) & 0xff; - mac_addr[5] = (value >> 8) & 0xff; + put_unaligned_le16(value, mac_addr + 4); } else if (rtl_is_8125(tp)) { rtl_read_mac_from_reg(tp, mac_addr, MAC0_BKP); } @@@ -5056,7 -5099,7 +5081,7 @@@ static int r8169_mdio_register(struct r new_bus->name = "r8169"; new_bus->priv = tp; new_bus->parent = &pdev->dev; - new_bus->irq[0] = PHY_IGNORE_INTERRUPT; + new_bus->irq[0] = PHY_MAC_INTERRUPT; snprintf(new_bus->id, MII_BUS_ID_SIZE, "r8169-%x", pci_dev_id(pdev));
new_bus->read = r8169_mdio_read_reg; @@@ -5119,7 -5162,7 +5144,7 @@@ static void rtl_hw_init_8125(struct rtl static void rtl_hw_initialize(struct rtl8169_private *tp) { switch (tp->mac_version) { - case RTL_GIGA_MAC_VER_49 ... RTL_GIGA_MAC_VER_52: + case RTL_GIGA_MAC_VER_49 ... RTL_GIGA_MAC_VER_53: rtl8168ep_stop_cmac(tp); fallthrough; case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_48: @@@ -5285,14 -5328,12 +5310,14 @@@ static int rtl_init_one(struct pci_dev /* Identify chip attached to board */ chipset = rtl8169_get_mac_version(xid, tp->supports_gmii); if (chipset == RTL_GIGA_MAC_NONE) { - dev_err(&pdev->dev, "unknown chip XID %03x\n", xid); + dev_err(&pdev->dev, "unknown chip XID %03x, contact r8169 maintainers (see MAINTAINERS file)\n", xid); return -ENODEV; }
tp->mac_version = chipset;
+ tp->dash_type = rtl_check_dash(tp); + tp->cp_cmd = RTL_R16(tp, CPlusCmd) & CPCMD_MASK;
if (sizeof(dma_addr_t) > 4 && tp->mac_version >= RTL_GIGA_MAC_VER_18 && @@@ -5362,8 -5403,6 +5387,8 @@@ /* configure chip for default features */ rtl8169_set_features(dev, dev->features);
+ rtl_set_d3_pll_down(tp, true); + jumbo_max = rtl_jumbo_max(tp); if (jumbo_max) dev->max_mtu = jumbo_max; @@@ -5384,6 -5423,9 +5409,6 @@@ if (rc) return rc;
- /* chip gets powered up in rtl_open() */ - rtl_pll_power_down(tp); - rc = register_netdev(dev); if (rc) return rc; @@@ -5397,7 -5439,7 +5422,7 @@@ jumbo_max, tp->mac_version <= RTL_GIGA_MAC_VER_06 ? "ok" : "ko");
- if (r8168_check_dash(tp)) { + if (tp->dash_type != RTL_DASH_NONE) { netdev_info(dev, "DASH enabled\n"); rtl8168_driver_start(tp); } diff --combined drivers/net/ethernet/xilinx/xilinx_axienet_main.c index e8febfb1924d,b4a0bfce5b76..3a8775e0ca55 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@@ -1469,13 -1469,6 +1469,13 @@@ axienet_ethtools_set_link_ksettings(str return phylink_ethtool_ksettings_set(lp->phylink, cmd); }
+static int axienet_ethtools_nway_reset(struct net_device *dev) +{ + struct axienet_local *lp = netdev_priv(dev); + + return phylink_ethtool_nway_reset(lp->phylink); +} + static const struct ethtool_ops axienet_ethtool_ops = { .supported_coalesce_params = ETHTOOL_COALESCE_MAX_FRAMES, .get_drvinfo = axienet_ethtools_get_drvinfo, @@@ -1490,7 -1483,6 +1490,7 @@@ .set_coalesce = axienet_ethtools_set_coalesce, .get_link_ksettings = axienet_ethtools_get_link_ksettings, .set_link_ksettings = axienet_ethtools_set_link_ksettings, + .nway_reset = axienet_ethtools_nway_reset, };
static void axienet_validate(struct phylink_config *config, @@@ -1502,22 -1494,13 +1502,22 @@@ __ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
/* Only support the mode we are configured for */ - if (state->interface != PHY_INTERFACE_MODE_NA && - state->interface != lp->phy_mode) { - netdev_warn(ndev, "Cannot use PHY mode %s, supported: %s\n", - phy_modes(state->interface), - phy_modes(lp->phy_mode)); - bitmap_zero(supported, __ETHTOOL_LINK_MODE_MASK_NBITS); - return; + switch (state->interface) { + case PHY_INTERFACE_MODE_NA: + break; + case PHY_INTERFACE_MODE_1000BASEX: + case PHY_INTERFACE_MODE_SGMII: + if (lp->switch_x_sgmii) + break; + fallthrough; + default: + if (state->interface != lp->phy_mode) { + netdev_warn(ndev, "Cannot use PHY mode %s, supported: %s\n", + phy_modes(state->interface), + phy_modes(lp->phy_mode)); + bitmap_zero(supported, __ETHTOOL_LINK_MODE_MASK_NBITS); + return; + } }
phylink_set(mask, Autoneg); @@@ -1577,33 -1560,6 +1577,33 @@@ static void axienet_mac_an_restart(stru phylink_mii_c22_pcs_an_restart(lp->pcs_phy); }
+static int axienet_mac_prepare(struct phylink_config *config, unsigned int mode, + phy_interface_t iface) +{ + struct net_device *ndev = to_net_dev(config->dev); + struct axienet_local *lp = netdev_priv(ndev); + int ret; + + switch (iface) { + case PHY_INTERFACE_MODE_SGMII: + case PHY_INTERFACE_MODE_1000BASEX: + if (!lp->switch_x_sgmii) + return 0; + + ret = mdiobus_write(lp->pcs_phy->bus, + lp->pcs_phy->addr, + XLNX_MII_STD_SELECT_REG, + iface == PHY_INTERFACE_MODE_SGMII ? + XLNX_MII_STD_SELECT_SGMII : 0); + if (ret < 0) + netdev_warn(ndev, "Failed to switch PHY interface: %d\n", + ret); + return ret; + default: + return 0; + } +} + static void axienet_mac_config(struct phylink_config *config, unsigned int mode, const struct phylink_link_state *state) { @@@ -1681,7 -1637,6 +1681,7 @@@ static const struct phylink_mac_ops axi .validate = axienet_validate, .mac_pcs_get_state = axienet_mac_pcs_get_state, .mac_an_restart = axienet_mac_an_restart, + .mac_prepare = axienet_mac_prepare, .mac_config = axienet_mac_config, .mac_link_down = axienet_mac_link_down, .mac_link_up = axienet_mac_link_up, @@@ -1862,6 -1817,18 +1862,18 @@@ static int axienet_probe(struct platfor lp->options = XAE_OPTION_DEFAULTS; lp->rx_bd_num = RX_BD_NUM_DEFAULT; lp->tx_bd_num = TX_BD_NUM_DEFAULT; + + lp->clk = devm_clk_get_optional(&pdev->dev, NULL); + if (IS_ERR(lp->clk)) { + ret = PTR_ERR(lp->clk); + goto free_netdev; + } + ret = clk_prepare_enable(lp->clk); + if (ret) { + dev_err(&pdev->dev, "Unable to enable clock: %d\n", ret); + goto free_netdev; + } + /* Map device registers */ ethres = platform_get_resource(pdev, IORESOURCE_MEM, 0); lp->regs = devm_ioremap_resource(&pdev->dev, ethres); @@@ -1921,9 -1888,6 +1933,9 @@@ */ of_property_read_u32(pdev->dev.of_node, "xlnx,rxmem", &lp->rxmem);
+ lp->switch_x_sgmii = of_property_read_bool(pdev->dev.of_node, + "xlnx,switch-x-sgmii"); + /* Start with the proprietary, and broken phy_type */ ret = of_property_read_u32(pdev->dev.of_node, "xlnx,phy-type", &value); if (!ret) { @@@ -1953,12 -1917,6 +1965,12 @@@ if (ret) goto free_netdev; } + if (lp->switch_x_sgmii && lp->phy_mode != PHY_INTERFACE_MODE_SGMII && + lp->phy_mode != PHY_INTERFACE_MODE_1000BASEX) { + dev_err(&pdev->dev, "xlnx,switch-x-sgmii only supported with SGMII or 1000BaseX\n"); + ret = -EINVAL; + goto free_netdev; + }
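Earlier in this probe hunk the clock handling moves up and switches to devm_clk_get_optional(), which returns NULL rather than an error when the DT node specifies no clock, and a NULL clk is accepted as a no-op by clk_prepare_enable(); that is what lets the old devm_clk_get()-plus-warning fallback disappear. The idiom in a compact form:

        #include <linux/clk.h>
        #include <linux/device.h>
        #include <linux/err.h>

        static int my_enable_clk(struct device *dev, struct clk **out)
        {
                struct clk *clk;
                int ret;

                clk = devm_clk_get_optional(dev, NULL); /* NULL if none in DT */
                if (IS_ERR(clk))
                        return PTR_ERR(clk);    /* real failure, e.g. -EPROBE_DEFER */

                ret = clk_prepare_enable(clk);  /* a NULL clk is a harmless no-op */
                if (ret)
                        return ret;

                *out = clk;
                return 0;
        }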
/* Find the DMA node, map the DMA registers, and decode the DMA IRQs */ np = of_parse_phandle(pdev->dev.of_node, "axistream-connected", 0); @@@ -2046,20 -2004,6 +2058,6 @@@
lp->phy_node = of_parse_phandle(pdev->dev.of_node, "phy-handle", 0); if (lp->phy_node) { - lp->clk = devm_clk_get(&pdev->dev, NULL); - if (IS_ERR(lp->clk)) { - dev_warn(&pdev->dev, "Failed to get clock: %ld\n", - PTR_ERR(lp->clk)); - lp->clk = NULL; - } else { - ret = clk_prepare_enable(lp->clk); - if (ret) { - dev_err(&pdev->dev, "Unable to enable clock: %d\n", - ret); - goto free_netdev; - } - } - ret = axienet_mdio_setup(lp); if (ret) dev_warn(&pdev->dev, diff --combined drivers/net/ipa/ipa_main.c index c10e7340b031,eb1c8396bcdd..97c1b55405cb --- a/drivers/net/ipa/ipa_main.c +++ b/drivers/net/ipa/ipa_main.c @@@ -15,6 -15,7 +15,6 @@@ #include <linux/of.h> #include <linux/of_device.h> #include <linux/of_address.h> -#include <linux/remoteproc.h> #include <linux/qcom_scm.h> #include <linux/soc/qcom/mdt_loader.h>
@@@ -580,10 -581,10 +580,10 @@@ ipa_resource_config(struct ipa *ipa, co return -EINVAL;
for (i = 0; i < data->resource_src_count; i++) - ipa_resource_config_src(ipa, data->resource_src); + ipa_resource_config_src(ipa, &data->resource_src[i]);
for (i = 0; i < data->resource_dst_count; i++) - ipa_resource_config_dst(ipa, data->resource_dst); + ipa_resource_config_dst(ipa, &data->resource_dst[i]);
return 0; } @@@ -728,6 -729,19 +728,6 @@@ static const struct of_device_id ipa_ma }; MODULE_DEVICE_TABLE(of, ipa_match);
-static phandle of_property_read_phandle(const struct device_node *np, - const char *name) -{ - struct property *prop; - int len = 0; - - prop = of_find_property(np, name, &len); - if (!prop || len != sizeof(__be32)) - return 0; - - return be32_to_cpup(prop->value); -} - /* Check things that can be validated at build time. This just * groups these things BUILD_BUG_ON() calls don't clutter the rest * of the code. @@@ -793,8 -807,10 +793,8 @@@ static int ipa_probe(struct platform_de struct device *dev = &pdev->dev; const struct ipa_data *data; struct ipa_clock *clock; - struct rproc *rproc; bool modem_init; struct ipa *ipa; - phandle ph; int ret;
ipa_validate_build(); @@@ -813,12 -829,25 +813,12 @@@ if (!qcom_scm_is_available()) return -EPROBE_DEFER;
- /* We rely on remoteproc to tell us about modem state changes */ - ph = of_property_read_phandle(dev->of_node, "modem-remoteproc"); - if (!ph) { - dev_err(dev, "DT missing \"modem-remoteproc\" property\n"); - return -EINVAL; - } - - rproc = rproc_get_by_phandle(ph); - if (!rproc) - return -EPROBE_DEFER; - /* The clock and interconnects might not be ready when we're * probed, so might return -EPROBE_DEFER. */ clock = ipa_clock_init(dev, data->clock_data); - if (IS_ERR(clock)) { - ret = PTR_ERR(clock); - goto err_rproc_put; - } + if (IS_ERR(clock)) + return PTR_ERR(clock);
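The surviving probe code above shows both standard deferral styles: an explicit return of -EPROBE_DEFER when a dependency (here qcom_scm) is not up yet, and transparent propagation when a resource getter such as ipa_clock_init() hands that error back. A compact illustration, with some_provider_ready() as a stand-in:

        #include <linux/clk.h>
        #include <linux/errno.h>
        #include <linux/platform_device.h>

        static bool some_provider_ready(void)
        {
                return true;                    /* hypothetical readiness test */
        }

        static int my_probe(struct platform_device *pdev)
        {
                struct clk *clk;

                if (!some_provider_ready())
                        return -EPROBE_DEFER;   /* driver core retries later */

                clk = devm_clk_get(&pdev->dev, NULL);
                if (IS_ERR(clk))                /* may itself be -EPROBE_DEFER */
                        return PTR_ERR(clk);    /* propagate unchanged */

                return 0;
        }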
/* No more EPROBE_DEFER. Allocate and initialize the IPA structure */ ipa = kzalloc(sizeof(*ipa), GFP_KERNEL); @@@ -829,9 -858,9 +829,9 @@@
ipa->pdev = pdev; dev_set_drvdata(dev, ipa); - ipa->modem_rproc = rproc; ipa->clock = clock; ipa->version = data->version; + init_completion(&ipa->completion);
ret = ipa_reg_init(ipa); if (ret) @@@ -906,6 -935,8 +906,6 @@@ err_kfree_ipa kfree(ipa); err_clock_exit: ipa_clock_exit(clock); -err_rproc_put: - rproc_put(rproc);
return ret; } @@@ -913,6 -944,7 +913,6 @@@ static int ipa_remove(struct platform_device *pdev) { struct ipa *ipa = dev_get_drvdata(&pdev->dev); - struct rproc *rproc = ipa->modem_rproc; struct ipa_clock *clock = ipa->clock; int ret;
@@@ -938,6 -970,7 +938,6 @@@ ipa_reg_exit(ipa); kfree(ipa); ipa_clock_exit(clock); - rproc_put(rproc);
return 0; } diff --combined drivers/net/phy/phy_device.c index d6ac3ed38197,71169e7d6177..ce495473cd5d --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@@ -300,50 -300,22 +300,22 @@@ static int mdio_bus_phy_resume(struct d
phydev->suspended_by_mdio_bus = 0;
- ret = phy_resume(phydev); + ret = phy_init_hw(phydev); if (ret < 0) return ret;
- no_resume: - if (phydev->attached_dev && phydev->adjust_link) - phy_start_machine(phydev); - - return 0; - } - - static int mdio_bus_phy_restore(struct device *dev) - { - struct phy_device *phydev = to_phy_device(dev); - struct net_device *netdev = phydev->attached_dev; - int ret; - - if (!netdev) - return 0; - - ret = phy_init_hw(phydev); + ret = phy_resume(phydev); if (ret < 0) return ret; - + no_resume: if (phydev->attached_dev && phydev->adjust_link) phy_start_machine(phydev);
return 0; }
- static const struct dev_pm_ops mdio_bus_phy_pm_ops = { - .suspend = mdio_bus_phy_suspend, - .resume = mdio_bus_phy_resume, - .freeze = mdio_bus_phy_suspend, - .thaw = mdio_bus_phy_resume, - .restore = mdio_bus_phy_restore, - }; - - #define MDIO_BUS_PHY_PM_OPS (&mdio_bus_phy_pm_ops) - - #else - - #define MDIO_BUS_PHY_PM_OPS NULL - + static SIMPLE_DEV_PM_OPS(mdio_bus_phy_pm_ops, mdio_bus_phy_suspend, + mdio_bus_phy_resume); #endif /* CONFIG_PM */
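The phylib hunk above replaces an open-coded dev_pm_ops, with freeze/thaw/restore spelled out, by SIMPLE_DEV_PM_OPS(), which wires a single suspend/resume pair into all of the system-sleep slots; combining it with .pm = pm_ptr(&mdio_bus_phy_pm_ops) (a few hunks below) lets the reference compile to NULL when CONFIG_PM is off. Shape of the idiom, with stub callbacks:

        #include <linux/pm.h>

        static int __maybe_unused my_suspend(struct device *dev)
        {
                return 0;       /* quiesce the device */
        }

        static int __maybe_unused my_resume(struct device *dev)
        {
                return 0;       /* bring the device back */
        }

        static SIMPLE_DEV_PM_OPS(my_pm_ops, my_suspend, my_resume);

        /* in the driver/device type:
         *      .pm = pm_ptr(&my_pm_ops),       // NULL when !CONFIG_PM
         */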
/** @@@ -554,7 -526,7 +526,7 @@@ static const struct device_type mdio_bu .name = "PHY", .groups = phy_dev_groups, .release = phy_device_release, - .pm = MDIO_BUS_PHY_PM_OPS, + .pm = pm_ptr(&mdio_bus_phy_pm_ops), };
static int phy_request_driver_module(struct phy_device *dev, u32 phy_id) @@@ -606,7 -578,6 +578,7 @@@ struct phy_device *phy_device_create(st dev->pause = 0; dev->asym_pause = 0; dev->link = 0; + dev->port = PORT_TP; dev->interface = PHY_INTERFACE_MODE_GMII;
dev->autoneg = AUTONEG_ENABLE; @@@ -1144,10 -1115,19 +1116,19 @@@ int phy_init_hw(struct phy_device *phyd if (ret < 0) return ret;
- if (phydev->drv->config_init) + if (phydev->drv->config_init) { ret = phydev->drv->config_init(phydev); + if (ret < 0) + return ret; + }
- return ret; + if (phydev->drv->config_intr) { + ret = phydev->drv->config_intr(phydev); + if (ret < 0) + return ret; + } + + return 0; } EXPORT_SYMBOL(phy_init_hw);
@@@ -1167,8 -1147,8 +1148,8 @@@ char *phy_attached_info_irq(struct phy_ case PHY_POLL: irq_str = "POLL"; break; - case PHY_IGNORE_INTERRUPT: - irq_str = "IGNORE"; + case PHY_MAC_INTERRUPT: + irq_str = "MAC"; break; default: snprintf(irq_num, sizeof(irq_num), "%d", phydev->irq); @@@ -1377,8 -1357,6 +1358,8 @@@ int phy_attach_direct(struct net_devic
if (phydev->sfp_bus_attached) dev->sfp_bus = phydev->sfp_bus; + else if (dev->sfp_bus) + phydev->is_on_sfp_module = true; }
/* Some Ethernet drivers try to connect to a PHY device before @@@ -1406,14 -1384,6 +1387,14 @@@
phydev->state = PHY_READY;
+ /* Port is set to PORT_TP by default and the actual PHY driver will set + * it to different value depending on the PHY configuration. If we have + * the generic PHY driver we can't figure it out, thus set the old + * legacy PORT_MII value. + */ + if (using_genphy) + phydev->port = PORT_MII; + /* Initial carrier state is off as the phy is about to be * (re)initialized. */ @@@ -1751,7 -1721,7 +1732,7 @@@ int __phy_resume(struct phy_device *phy struct phy_driver *phydrv = phydev->drv; int ret;
- WARN_ON(!mutex_is_locked(&phydev->lock)); + lockdep_assert_held(&phydev->lock);
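Swapping WARN_ON(!mutex_is_locked(...)) for lockdep_assert_held() above is a strict improvement: mutex_is_locked() only says that somebody holds the mutex, while lockdep_assert_held() checks that the current task does, and it compiles to nothing on kernels without lockdep. Usage sketch:

        #include <linux/mutex.h>
        #include <linux/lockdep.h>

        static DEFINE_MUTEX(cfg_lock);

        /* Caller must hold cfg_lock; verified only when lockdep is enabled. */
        static void cfg_update_locked(int val)
        {
                lockdep_assert_held(&cfg_lock);
                (void)val;      /* ... store val into state under cfg_lock ... */
        }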
if (!phydrv || !phydrv->resume) return 0; diff --combined include/net/act_api.h index 761c0e331915,57be7c5d273b..2bf3092ae7ec --- a/include/net/act_api.h +++ b/include/net/act_api.h @@@ -166,6 -166,7 +166,7 @@@ int tcf_idr_create_from_flags(struct tc struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, u32 flags); + void tcf_idr_insert_many(struct tc_action *actions[]); void tcf_idr_cleanup(struct tc_action_net *tn, u32 index); int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, struct tc_action **a, int bind); @@@ -186,13 -187,10 +187,13 @@@ int tcf_action_init(struct net *net, st struct nlattr *est, char *name, int ovr, int bind, struct tc_action *actions[], size_t *attr_size, bool rtnl_held, struct netlink_ext_ack *extack); +struct tc_action_ops *tc_action_load_ops(char *name, struct nlattr *nla, + bool rtnl_held, + struct netlink_ext_ack *extack); struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, - bool rtnl_held, + struct tc_action_ops *ops, bool rtnl_held, struct netlink_ext_ack *extack); int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, int ref, bool terse); diff --combined include/uapi/linux/pkt_cls.h index afe6836e44b1,88f4bf0047e7..7ea59cfe1fa7 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@@ -591,8 -591,8 +591,9 @@@ enum TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED = 1 << 1, /* Part of an existing connection. */ TCA_FLOWER_KEY_CT_FLAGS_RELATED = 1 << 2, /* Related to an established connection. */ TCA_FLOWER_KEY_CT_FLAGS_TRACKED = 1 << 3, /* Conntrack has occurred. */ - + TCA_FLOWER_KEY_CT_FLAGS_INVALID = 1 << 4, /* Conntrack is invalid. */ + TCA_FLOWER_KEY_CT_FLAGS_REPLY = 1 << 5, /* Packet is in the reply direction. */ + __TCA_FLOWER_KEY_CT_FLAGS_MAX, };
enum { diff --combined kernel/bpf/verifier.c index 36d1e7339ede,20babdd06278..1dda9d81f12c --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@@ -228,12 -228,6 +228,12 @@@ static void bpf_map_key_store(struct bp (poisoned ? BPF_MAP_KEY_POISON : 0ULL); }
+static bool bpf_pseudo_call(const struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == BPF_PSEUDO_CALL; +} + struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; @@@ -1079,51 -1073,6 +1079,51 @@@ static void mark_reg_known_zero(struct __mark_reg_known_zero(regs + regno); }
+static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) +{ + switch (reg->type) { + case PTR_TO_MAP_VALUE_OR_NULL: { + const struct bpf_map *map = reg->map_ptr; + + if (map->inner_map_meta) { + reg->type = CONST_PTR_TO_MAP; + reg->map_ptr = map->inner_map_meta; + } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { + reg->type = PTR_TO_XDP_SOCK; + } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || + map->map_type == BPF_MAP_TYPE_SOCKHASH) { + reg->type = PTR_TO_SOCKET; + } else { + reg->type = PTR_TO_MAP_VALUE; + } + break; + } + case PTR_TO_SOCKET_OR_NULL: + reg->type = PTR_TO_SOCKET; + break; + case PTR_TO_SOCK_COMMON_OR_NULL: + reg->type = PTR_TO_SOCK_COMMON; + break; + case PTR_TO_TCP_SOCK_OR_NULL: + reg->type = PTR_TO_TCP_SOCK; + break; + case PTR_TO_BTF_ID_OR_NULL: + reg->type = PTR_TO_BTF_ID; + break; + case PTR_TO_MEM_OR_NULL: + reg->type = PTR_TO_MEM; + break; + case PTR_TO_RDONLY_BUF_OR_NULL: + reg->type = PTR_TO_RDONLY_BUF; + break; + case PTR_TO_RDWR_BUF_OR_NULL: + reg->type = PTR_TO_RDWR_BUF; + break; + default: + WARN_ON("unknown nullable register type"); + } +} + static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) { return type_is_pkt_pointer(reg->type); @@@ -1537,7 -1486,9 +1537,7 @@@ static int check_subprogs(struct bpf_ve
/* determine subprog starts. The end is one before the next starts */ for (i = 0; i < insn_cnt; i++) { - if (insn[i].code != (BPF_JMP | BPF_CALL)) - continue; - if (insn[i].src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn + i)) continue; if (!env->bpf_capable) { verbose(env, @@@ -2320,14 -2271,12 +2320,14 @@@ static void save_register_state(struct state->stack[spi].slot_type[i] = STACK_SPILL; }
-/* check_stack_read/write functions track spill/fill of registers, +/* check_stack_{read,write}_fixed_off functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ -static int check_stack_write(struct bpf_verifier_env *env, - struct bpf_func_state *state, /* func where register points to */ - int off, int size, int value_regno, int insn_idx) +static int check_stack_write_fixed_off(struct bpf_verifier_env *env, + /* stack frame we're writing to */ + struct bpf_func_state *state, + int off, int size, int value_regno, + int insn_idx) { struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; @@@ -2453,175 -2402,9 +2453,175 @@@ return 0; }
-static int check_stack_read(struct bpf_verifier_env *env, - struct bpf_func_state *reg_state /* func where register points to */, - int off, int size, int value_regno) +/* Write the stack: 'stack[ptr_regno + off] = value_regno'. 'ptr_regno' is + * known to contain a variable offset. + * This function checks whether the write is permitted and conservatively + * tracks the effects of the write, considering that each stack slot in the + * dynamic range is potentially written to. + * + * 'off' includes 'regno->off'. + * 'value_regno' can be -1, meaning that an unknown value is being written to + * the stack. + * + * Spilled pointers in range are not marked as written because we don't know + * what's going to be actually written. This means that read propagation for + * future reads cannot be terminated by this write. + * + * For privileged programs, uninitialized stack slots are considered + * initialized by this write (even though we don't know exactly what offsets + * are going to be written to). The idea is that we don't want the verifier to + * reject future reads that access slots written to through variable offsets. + */ +static int check_stack_write_var_off(struct bpf_verifier_env *env, + /* func where register points to */ + struct bpf_func_state *state, + int ptr_regno, int off, int size, + int value_regno, int insn_idx) +{ + struct bpf_func_state *cur; /* state of the current function */ + int min_off, max_off; + int i, err; + struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL; + bool writing_zero = false; + /* set if the fact that we're writing a zero is used to let any + * stack slots remain STACK_ZERO + */ + bool zero_used = false; + + cur = env->cur_state->frame[env->cur_state->curframe]; + ptr_reg = &cur->regs[ptr_regno]; + min_off = ptr_reg->smin_value + off; + max_off = ptr_reg->smax_value + off + size; + if (value_regno >= 0) + value_reg = &cur->regs[value_regno]; + if (value_reg && register_is_null(value_reg)) + writing_zero = true; + + err = realloc_func_state(state, round_up(-min_off, BPF_REG_SIZE), + state->acquired_refs, true); + if (err) + return err; + + + /* Variable offset writes destroy any spilled pointers in range. */ + for (i = min_off; i < max_off; i++) { + u8 new_type, *stype; + int slot, spi; + + slot = -i - 1; + spi = slot / BPF_REG_SIZE; + stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; + + if (!env->allow_ptr_leaks + && *stype != NOT_INIT + && *stype != SCALAR_VALUE) { + /* Reject the write if there's are spilled pointers in + * range. If we didn't reject here, the ptr status + * would be erased below (even though not all slots are + * actually overwritten), possibly opening the door to + * leaks. + */ + verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d", + insn_idx, i); + return -EINVAL; + } + + /* Erase all spilled pointers. */ + state->stack[spi].spilled_ptr.type = NOT_INIT; + + /* Update the slot type. */ + new_type = STACK_MISC; + if (writing_zero && *stype == STACK_ZERO) { + new_type = STACK_ZERO; + zero_used = true; + } + /* If the slot is STACK_INVALID, we check whether it's OK to + * pretend that it will be initialized by this write. The slot + * might not actually be written to, and so if we mark it as + * initialized future reads might leak uninitialized memory. + * For privileged programs, we will accept such reads to slots + * that may or may not be written because, if we're reject + * them, the error would be too confusing. 
+ */ + if (*stype == STACK_INVALID && !env->allow_uninit_stack) { + verbose(env, "uninit stack in range of var-offset write prohibited for !root; insn %d, off: %d", + insn_idx, i); + return -EINVAL; + } + *stype = new_type; + } + if (zero_used) { + /* backtracking doesn't work for STACK_ZERO yet. */ + err = mark_chain_precision(env, value_regno); + if (err) + return err; + } + return 0; +} + +/* When register 'dst_regno' is assigned some values from stack[min_off, + * max_off), we set the register's type according to the types of the + * respective stack slots. If all the stack values are known to be zeros, then + * so is the destination reg. Otherwise, the register is considered to be + * SCALAR. This function does not deal with register filling; the caller must + * ensure that all spilled registers in the stack range have been marked as + * read. + */ +static void mark_reg_stack_read(struct bpf_verifier_env *env, + /* func where src register points to */ + struct bpf_func_state *ptr_state, + int min_off, int max_off, int dst_regno) +{ + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + int i, slot, spi; + u8 *stype; + int zeros = 0; + + for (i = min_off; i < max_off; i++) { + slot = -i - 1; + spi = slot / BPF_REG_SIZE; + stype = ptr_state->stack[spi].slot_type; + if (stype[slot % BPF_REG_SIZE] != STACK_ZERO) + break; + zeros++; + } + if (zeros == max_off - min_off) { + /* any access_size read into register is zero extended, + * so the whole register == const_zero + */ + __mark_reg_const_zero(&state->regs[dst_regno]); + /* backtracking doesn't support STACK_ZERO yet, + * so mark it precise here, so that later + * backtracking can stop here. + * Backtracking may not need this if this register + * doesn't participate in pointer adjustment. + * Forward propagation of precise flag is not + * necessary either. This mark is only to stop + * backtracking. Any register that contributed + * to const 0 was marked precise before spill. + */ + state->regs[dst_regno].precise = true; + } else { + /* have read misc data from the stack */ + mark_reg_unknown(env, state->regs, dst_regno); + } + state->regs[dst_regno].live |= REG_LIVE_WRITTEN; +} + +/* Read the stack at 'off' and put the results into the register indicated by + * 'dst_regno'. It handles reg filling if the addressed stack slot is a + * spilled reg. + * + * 'dst_regno' can be -1, meaning that the read value is not going to a + * register. + * + * The access is assumed to be within the current stack bounds. + */ +static int check_stack_read_fixed_off(struct bpf_verifier_env *env, + /* func where src register points to */ + struct bpf_func_state *reg_state, + int off, int size, int dst_regno) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; @@@ -2629,6 -2412,11 +2629,6 @@@ struct bpf_reg_state *reg; u8 *stype;
- if (reg_state->allocated_stack <= slot) { - verbose(env, "invalid read from stack off %d+0 size %d\n", - off, size); - return -EACCES; - } stype = reg_state->stack[spi].slot_type; reg = &reg_state->stack[spi].spilled_ptr;
@@@ -2639,9 -2427,9 +2639,9 @@@ verbose(env, "invalid size of register fill\n"); return -EACCES; } - if (value_regno >= 0) { - mark_reg_unknown(env, state->regs, value_regno); - state->regs[value_regno].live |= REG_LIVE_WRITTEN; + if (dst_regno >= 0) { + mark_reg_unknown(env, state->regs, dst_regno); + state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); return 0; @@@ -2653,16 -2441,16 +2653,16 @@@ } }
- if (value_regno >= 0) { + if (dst_regno >= 0) { /* restore register state from stack */ - state->regs[value_regno] = *reg; + state->regs[dst_regno] = *reg; /* mark reg as written since spilled pointer state likely * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions */ - state->regs[value_regno].live |= REG_LIVE_WRITTEN; + state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } else if (__is_pointer_value(env->allow_ptr_leaks, reg)) { - /* If value_regno==-1, the caller is asking us whether + /* If dst_regno==-1, the caller is asking us whether * it is acceptable to use this value as a SCALAR_VALUE * (e.g. for XADD). * We must not allow unprivileged callers to do that @@@ -2674,167 -2462,70 +2674,167 @@@ } mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); } else { - int zeros = 0; + u8 type;
for (i = 0; i < size; i++) { - if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC) + type = stype[(slot - i) % BPF_REG_SIZE]; + if (type == STACK_MISC) continue; - if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) { - zeros++; + if (type == STACK_ZERO) continue; - } verbose(env, "invalid read from stack off %d+%d size %d\n", off, i, size); return -EACCES; } mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); - if (value_regno >= 0) { - if (zeros == size) { - /* any size read into register is zero extended, - * so the whole register == const_zero - */ - __mark_reg_const_zero(&state->regs[value_regno]); - /* backtracking doesn't support STACK_ZERO yet, - * so mark it precise here, so that later - * backtracking can stop here. - * Backtracking may not need this if this register - * doesn't participate in pointer adjustment. - * Forward propagation of precise flag is not - * necessary either. This mark is only to stop - * backtracking. Any register that contributed - * to const 0 was marked precise before spill. - */ - state->regs[value_regno].precise = true; - } else { - /* have read misc data from the stack */ - mark_reg_unknown(env, state->regs, value_regno); - } - state->regs[value_regno].live |= REG_LIVE_WRITTEN; - } + if (dst_regno >= 0) + mark_reg_stack_read(env, reg_state, off, off + size, dst_regno); } return 0; }
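For orientation, a minimal BPF-C sketch of the property encoded by mark_reg_stack_read() and the fixed-off read path above (illustrative only, not part of this commit; the section and function names are made up): a read covering only STACK_ZERO bytes is tracked as a known-zero scalar, while touching any STACK_MISC byte yields an unknown SCALAR_VALUE.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("tp/syscalls/sys_enter_getpid")
int stack_zero_read(void *ctx)
{
	__u64 v;

	__builtin_memset(&v, 0, sizeof(v));	/* slots become STACK_ZERO */

	if (v == 0)		/* read is a const-zero scalar: branch known */
		return 0;
	return 1;		/* effectively dead code to the verifier */
}

char LICENSE[] SEC("license") = "GPL";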
-static int check_stack_access(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, - int off, int size) +enum stack_access_src { + ACCESS_DIRECT = 1, /* the access is performed by an instruction */ + ACCESS_HELPER = 2, /* the access is performed by a helper */ +}; + +static int check_stack_range_initialized(struct bpf_verifier_env *env, + int regno, int off, int access_size, + bool zero_size_allowed, + enum stack_access_src type, + struct bpf_call_arg_meta *meta); + +static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) +{ + return cur_regs(env) + regno; +} + +/* Read the stack at 'ptr_regno + off' and put the result into the register + * 'dst_regno'. + * 'off' includes the pointer register's fixed offset(i.e. 'ptr_regno.off'), + * but not its variable offset. + * 'size' is assumed to be <= reg size and the access is assumed to be aligned. + * + * As opposed to check_stack_read_fixed_off, this function doesn't deal with + * filling registers (i.e. reads of spilled register cannot be detected when + * the offset is not fixed). We conservatively mark 'dst_regno' as containing + * SCALAR_VALUE. That's why we assert that the 'ptr_regno' has a variable + * offset; for a fixed offset check_stack_read_fixed_off should be used + * instead. + */ +static int check_stack_read_var_off(struct bpf_verifier_env *env, + int ptr_regno, int off, int size, int dst_regno) { - /* Stack accesses must be at a fixed offset, so that we - * can determine what type of data were returned. See - * check_stack_read(). + /* The state of the source register. */ + struct bpf_reg_state *reg = reg_state(env, ptr_regno); + struct bpf_func_state *ptr_state = func(env, reg); + int err; + int min_off, max_off; + + /* Note that we pass a NULL meta, so raw access will not be permitted. */ - if (!tnum_is_const(reg->var_off)) { + err = check_stack_range_initialized(env, ptr_regno, off, size, + false, ACCESS_DIRECT, NULL); + if (err) + return err; + + min_off = reg->smin_value + off; + max_off = reg->smax_value + off; + mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno); + return 0; +} + +/* check_stack_read dispatches to check_stack_read_fixed_off or + * check_stack_read_var_off. + * + * The caller must ensure that the offset falls within the allocated stack + * bounds. + * + * 'dst_regno' is a register which will receive the value from the stack. It + * can be -1, meaning that the read value is not going to a register. + */ +static int check_stack_read(struct bpf_verifier_env *env, + int ptr_regno, int off, int size, + int dst_regno) +{ + struct bpf_reg_state *reg = reg_state(env, ptr_regno); + struct bpf_func_state *state = func(env, reg); + int err; + /* Some accesses are only permitted with a static offset. */ + bool var_off = !tnum_is_const(reg->var_off); + + /* The offset is required to be static when reads don't go to a + * register, in order to not leak pointers (see + * check_stack_read_fixed_off). + */ + if (dst_regno < 0 && var_off) { char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable stack access var_off=%s off=%d size=%d\n", + verbose(env, "variable offset stack pointer cannot be passed into helper function; var_off=%s off=%d size=%d\n", tn_buf, off, size); return -EACCES; } + /* Variable offset is prohibited for unprivileged mode for simplicity + * since it requires corresponding support in Spectre masking for stack + * ALU. See also retrieve_ptr_limit(). + */ + if (!env->bypass_spec_v1 && var_off) { + char tn_buf[48];
- if (off >= 0 || off < -MAX_BPF_STACK) { - verbose(env, "invalid stack off=%d size=%d\n", off, size); + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n", + ptr_regno, tn_buf); return -EACCES; }
- return 0; + if (!var_off) { + off += reg->var_off.value; + err = check_stack_read_fixed_off(env, state, off, size, + dst_regno); + } else { + /* Variable offset stack reads need more conservative handling + * than fixed offset ones. Note that dst_regno >= 0 on this + * branch. + */ + err = check_stack_read_var_off(env, ptr_regno, off, size, + dst_regno); + } + return err; +} + + +/* check_stack_write dispatches to check_stack_write_fixed_off or + * check_stack_write_var_off. + * + * 'ptr_regno' is the register used as a pointer into the stack. + * 'off' includes 'ptr_regno->off', but not its variable offset (if any). + * 'value_regno' is the register whose value we're writing to the stack. It can + * be -1, meaning that we're not writing from a register. + * + * The caller must ensure that the offset falls within the maximum stack size. + */ +static int check_stack_write(struct bpf_verifier_env *env, + int ptr_regno, int off, int size, + int value_regno, int insn_idx) +{ + struct bpf_reg_state *reg = reg_state(env, ptr_regno); + struct bpf_func_state *state = func(env, reg); + int err; + + if (tnum_is_const(reg->var_off)) { + off += reg->var_off.value; + err = check_stack_write_fixed_off(env, state, off, size, + value_regno, insn_idx); + } else { + /* Variable offset stack writes need more conservative handling + * than fixed offset ones. + */ + err = check_stack_write_var_off(env, state, + ptr_regno, off, size, + value_regno, insn_idx); + } + return err; }
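As a hedged illustration of the two write paths dispatched above (a sketch; the attach point and names are arbitrary), a store whose stack offset is a run-time value takes the new _var_off handler, while a constant offset keeps the historical path:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("tp/syscalls/sys_enter_write")
int var_off_store(void *ctx)
{
	char buf[16] = {};
	__u32 idx = bpf_get_prandom_u32() & 0xf;	/* unknown scalar, 0..15 */

	buf[idx] = 1;	/* check_stack_write_var_off(): variable offset */
	buf[0]   = 2;	/* check_stack_write_fixed_off(): constant offset */
	return 0;
}

For unprivileged loaders the variable-offset store is still refused; only privileged (bypass_spec_v1) programs reach the _var_off handlers.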
static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, @@@ -3167,6 -2858,11 +3167,6 @@@ static int check_sock_access(struct bpf return -EACCES; }
-static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) -{ - return cur_regs(env) + regno; -} - static bool is_pointer_value(struct bpf_verifier_env *env, int regno) { return __is_pointer_value(env->allow_ptr_leaks, reg_state(env, regno)); @@@ -3285,8 -2981,8 +3285,8 @@@ static int check_ptr_alignment(struct b break; case PTR_TO_STACK: pointer_desc = "stack "; - /* The stack spill tracking logic in check_stack_write() - * and check_stack_read() relies on stack accesses being + /* The stack spill tracking logic in check_stack_write_fixed_off() + * and check_stack_read_fixed_off() relies on stack accesses being * aligned. */ strict = true; @@@ -3378,7 -3074,9 +3378,7 @@@ process_func continue_func: subprog_end = subprog[idx + 1].start; for (; i < subprog_end; i++) { - if (insn[i].code != (BPF_JMP | BPF_CALL)) - continue; - if (insn[i].src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn + i)) continue; /* remember insn and function to return to */ ret_insn[frame] = i + 1; @@@ -3702,91 -3400,6 +3702,91 @@@ static int check_ptr_to_map_access(stru return 0; }
+/* Check that the stack access at the given offset is within bounds. The + * maximum valid offset is -1. + * + * The minimum valid offset is -MAX_BPF_STACK for writes, and + * -state->allocated_stack for reads. + */ +static int check_stack_slot_within_bounds(int off, + struct bpf_func_state *state, + enum bpf_access_type t) +{ + int min_valid_off; + + if (t == BPF_WRITE) + min_valid_off = -MAX_BPF_STACK; + else + min_valid_off = -state->allocated_stack; + + if (off < min_valid_off || off > -1) + return -EACCES; + return 0; +} + +/* Check that the stack access at 'regno + off' falls within the maximum stack + * bounds. + * + * 'off' includes `regno->offset`, but not its dynamic part (if any). + */ +static int check_stack_access_within_bounds( + struct bpf_verifier_env *env, + int regno, int off, int access_size, + enum stack_access_src src, enum bpf_access_type type) +{ + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *reg = regs + regno; + struct bpf_func_state *state = func(env, reg); + int min_off, max_off; + int err; + char *err_extra; + + if (src == ACCESS_HELPER) + /* We don't know if helpers are reading or writing (or both). */ + err_extra = " indirect access to"; + else if (type == BPF_READ) + err_extra = " read from"; + else + err_extra = " write to"; + + if (tnum_is_const(reg->var_off)) { + min_off = reg->var_off.value + off; + if (access_size > 0) + max_off = min_off + access_size - 1; + else + max_off = min_off; + } else { + if (reg->smax_value >= BPF_MAX_VAR_OFF || + reg->smin_value <= -BPF_MAX_VAR_OFF) { + verbose(env, "invalid unbounded variable-offset%s stack R%d\n", + err_extra, regno); + return -EACCES; + } + min_off = reg->smin_value + off; + if (access_size > 0) + max_off = reg->smax_value + off + access_size - 1; + else + max_off = min_off; + } + + err = check_stack_slot_within_bounds(min_off, state, type); + if (!err) + err = check_stack_slot_within_bounds(max_off, state, type); + + if (err) { + if (tnum_is_const(reg->var_off)) { + verbose(env, "invalid%s stack R%d off=%d size=%d\n", + err_extra, regno, off, access_size); + } else { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "invalid variable-offset%s stack R%d var_off=%s size=%d\n", + err_extra, regno, tn_buf, access_size); + } + } + return err; +}
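A worked instance of the window computed above, as a self-contained sketch (the numbers are purely illustrative):

#include <stdio.h>

#define MAX_BPF_STACK 512

/* A 4-byte access at reg + off, where the register's scalar part ranges
 * over [smin, smax] = [-64, -16] and off = -8.
 */
int main(void)
{
	int min_off = -64 + -8;			/* lowest byte touched: -72  */
	int max_off = -16 + -8 + 4 - 1;		/* highest byte touched: -21 */

	printf("window [%d, %d] ok=%d\n", min_off, max_off,
	       min_off >= -MAX_BPF_STACK && max_off <= -1);
	return 0;
}

For reads the lower bound is -state->allocated_stack rather than -MAX_BPF_STACK, so only stack that has actually been allocated can be read back.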
/* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory @@@ -3902,8 -3515,8 +3902,8 @@@ static int check_mem_access(struct bpf_ }
} else if (reg->type == PTR_TO_STACK) { - off += reg->var_off.value; - err = check_stack_access(env, reg, off, size); + /* Basic bounds checks. */ + err = check_stack_access_within_bounds(env, regno, off, size, ACCESS_DIRECT, t); if (err) return err;
@@@ -3912,12 -3525,12 +3912,12 @@@ if (err) return err;
- if (t == BPF_WRITE) - err = check_stack_write(env, state, off, size, - value_regno, insn_idx); - else - err = check_stack_read(env, state, off, size, + if (t == BPF_READ) + err = check_stack_read(env, regno, off, size, value_regno); + else + err = check_stack_write(env, regno, off, size, + value_regno, insn_idx); } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose(env, "cannot write into packet\n"); @@@ -3993,30 -3606,13 +3993,30 @@@ return err; }
-static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) +static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) { + int load_reg; int err;
- if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || - insn->imm != 0) { - verbose(env, "BPF_XADD uses reserved fields\n"); + switch (insn->imm) { + case BPF_ADD: + case BPF_ADD | BPF_FETCH: + case BPF_AND: + case BPF_AND | BPF_FETCH: + case BPF_OR: + case BPF_OR | BPF_FETCH: + case BPF_XOR: + case BPF_XOR | BPF_FETCH: + case BPF_XCHG: + case BPF_CMPXCHG: + break; + default: + verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm); + return -EINVAL; + } + + if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) { + verbose(env, "invalid atomic operand size\n"); return -EINVAL; }
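In raw instruction form, one encoding accepted by this switch looks like the following sketch (it assumes the BPF_ATOMIC_OP() convenience macro from this same series is available):

/* atomically: old = *(u64 *)(r1 + 0); *(u64 *)(r1 + 0) += r2; r2 = old */
struct bpf_insn insn = BPF_ATOMIC_OP(BPF_DW, BPF_ADD | BPF_FETCH,
				     BPF_REG_1,	/* dst: memory address base */
				     BPF_REG_2,	/* src: operand, gets old value */
				     0);	/* off */

The operation selector lives in insn->imm, which is exactly what the switch above validates.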
@@@ -4030,13 -3626,6 +4030,13 @@@ if (err) return err;
+ if (insn->imm == BPF_CMPXCHG) { + /* Check comparison of R0 with memory location */ + err = check_reg_arg(env, BPF_REG_0, SRC_OP); + if (err) + return err; + } + if (is_pointer_value(env, insn->src_reg)) { verbose(env, "R%d leaks addr into mem\n", insn->src_reg); return -EACCES; @@@ -4046,91 -3635,66 +4046,91 @@@ is_pkt_reg(env, insn->dst_reg) || is_flow_key_reg(env, insn->dst_reg) || is_sk_reg(env, insn->dst_reg)) { - verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", + verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str[reg_state(env, insn->dst_reg)->type]); return -EACCES; }
- /* check whether atomic_add can read the memory */ + if (insn->imm & BPF_FETCH) { + if (insn->imm == BPF_CMPXCHG) + load_reg = BPF_REG_0; + else + load_reg = insn->src_reg; + + /* check and record load of old value */ + err = check_reg_arg(env, load_reg, DST_OP); + if (err) + return err; + } else { + /* This instruction accesses a memory location but doesn't + * actually load it into a register. + */ + load_reg = -1; + } + + /* check whether we can read the memory */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, -1, true); + BPF_SIZE(insn->code), BPF_READ, load_reg, true); if (err) return err;
- /* check whether atomic_add can write into the same memory */ - return check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_WRITE, -1, true); -} - -static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno, - int off, int access_size, - bool zero_size_allowed) -{ - struct bpf_reg_state *reg = reg_state(env, regno); - - if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || - access_size < 0 || (access_size == 0 && !zero_size_allowed)) { - if (tnum_is_const(reg->var_off)) { - verbose(env, "invalid stack type R%d off=%d access_size=%d\n", - regno, off, access_size); - } else { - char tn_buf[48]; + /* check whether we can write into the same memory */ + err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, -1, true); + if (err) + return err;
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid stack type R%d var_off=%s access_size=%d\n", - regno, tn_buf, access_size); - } - return -EACCES; - } return 0; }
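To make the register bookkeeping above concrete, a hedged BPF-C sketch of the three operand shapes (assuming a clang/LLVM new enough to emit BPF_ATOMIC, e.g. -target bpf -mcpu=v3; names are arbitrary):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

static __u64 counter;	/* backed by a .bss map value */

SEC("tp/syscalls/sys_enter_read")
int atomic_ops(void *ctx)
{
	__u64 old;

	/* plain BPF_ADD: no register is written back (load_reg == -1) */
	__sync_fetch_and_add(&counter, 1);

	/* BPF_ADD | BPF_FETCH: the old value lands in the src register */
	old = __sync_fetch_and_add(&counter, 1);

	/* BPF_CMPXCHG: R0 carries the expected value in and the observed
	 * value out, which is why check_atomic() first reads and then
	 * re-marks R0 */
	old = __sync_val_compare_and_swap(&counter, 0, 1);

	return old == 0;
}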
-/* when register 'regno' is passed into function that will read 'access_size' - * bytes from that pointer, make sure that it's within stack boundary - * and all elements of stack are initialized. - * Unlike most pointer bounds-checking functions, this one doesn't take an - * 'off' argument, so it has to add in reg->off itself. +/* When register 'regno' is used to read the stack (either directly or through + * a helper function) make sure that it's within stack boundary and, depending + * on the access type, that all elements of the stack are initialized. + * + * 'off' includes 'regno->off', but not its dynamic part (if any). + * + * All registers that have been spilled on the stack in the slots within the + * read offsets are marked as read. */ -static int check_stack_boundary(struct bpf_verifier_env *env, int regno, - int access_size, bool zero_size_allowed, - struct bpf_call_arg_meta *meta) +static int check_stack_range_initialized( + struct bpf_verifier_env *env, int regno, int off, + int access_size, bool zero_size_allowed, + enum stack_access_src type, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = func(env, reg); int err, min_off, max_off, i, j, slot, spi; + char *err_extra = type == ACCESS_HELPER ? " indirect" : ""; + enum bpf_access_type bounds_check_type; + /* Some accesses can write anything into the stack, others are + * read-only. + */ + bool clobber = false; + + if (access_size == 0 && !zero_size_allowed) { + verbose(env, "invalid zero-sized read\n"); + return -EACCES; + } + + if (type == ACCESS_HELPER) { + /* The bounds checks for writes are more permissive than for + * reads. However, if raw_mode is not set, we'll do extra + * checks below. + */ + bounds_check_type = BPF_WRITE; + clobber = true; + } else { + bounds_check_type = BPF_READ; + } + err = check_stack_access_within_bounds(env, regno, off, access_size, + type, bounds_check_type); + if (err) + return err; +
if (tnum_is_const(reg->var_off)) { - min_off = max_off = reg->var_off.value + reg->off; - err = __check_stack_boundary(env, regno, min_off, access_size, - zero_size_allowed); - if (err) - return err; + min_off = max_off = reg->var_off.value + off; } else { /* Variable offset is prohibited for unprivileged mode for * simplicity since it requires corresponding support in @@@ -4141,8 -3705,8 +4141,8 @@@ char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "R%d indirect variable offset stack access prohibited for !root, var_off=%s\n", - regno, tn_buf); + verbose(env, "R%d%s variable offset stack access prohibited for !root, var_off=%s\n", + regno, err_extra, tn_buf); return -EACCES; } /* Only initialized buffer on stack is allowed to be accessed @@@ -4154,8 -3718,28 +4154,8 @@@ if (meta && meta->raw_mode) meta = NULL;
- if (reg->smax_value >= BPF_MAX_VAR_OFF || - reg->smax_value <= -BPF_MAX_VAR_OFF) { - verbose(env, "R%d unbounded indirect variable offset stack access\n", - regno); - return -EACCES; - } - min_off = reg->smin_value + reg->off; - max_off = reg->smax_value + reg->off; - err = __check_stack_boundary(env, regno, min_off, access_size, - zero_size_allowed); - if (err) { - verbose(env, "R%d min value is outside of stack bound\n", - regno); - return err; - } - err = __check_stack_boundary(env, regno, max_off, access_size, - zero_size_allowed); - if (err) { - verbose(env, "R%d max value is outside of stack bound\n", - regno); - return err; - } + min_off = reg->smin_value + off; + max_off = reg->smax_value + off; }
if (meta && meta->raw_mode) { @@@ -4175,10 -3759,8 +4175,10 @@@ if (*stype == STACK_MISC) goto mark; if (*stype == STACK_ZERO) { - /* helper can write anything into the stack */ - *stype = STACK_MISC; + if (clobber) { + /* helper can write anything into the stack */ + *stype = STACK_MISC; + } goto mark; }
@@@ -4189,24 -3771,22 +4189,24 @@@ if (state->stack[spi].slot_type[0] == STACK_SPILL && (state->stack[spi].spilled_ptr.type == SCALAR_VALUE || env->allow_ptr_leaks)) { - __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); - for (j = 0; j < BPF_REG_SIZE; j++) - state->stack[spi].slot_type[j] = STACK_MISC; + if (clobber) { + __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); + for (j = 0; j < BPF_REG_SIZE; j++) + state->stack[spi].slot_type[j] = STACK_MISC; + } goto mark; }
err: if (tnum_is_const(reg->var_off)) { - verbose(env, "invalid indirect read from stack off %d+%d size %d\n", - min_off, i - min_off, access_size); + verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n", + err_extra, regno, min_off, i - min_off, access_size); } else { char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid indirect read from stack var_off %s+%d size %d\n", - tn_buf, i - min_off, access_size); + verbose(env, "invalid%s read from stack R%d var_off %s+%d size %d\n", + err_extra, regno, tn_buf, i - min_off, access_size); } return -EACCES; mark: @@@ -4255,10 -3835,8 +4255,10 @@@ static int check_helper_mem_access(stru "rdwr", &env->prog->aux->max_rdwr_access); case PTR_TO_STACK: - return check_stack_boundary(env, regno, access_size, - zero_size_allowed, meta); + return check_stack_range_initialized( + env, + regno, reg->off, access_size, + zero_size_allowed, ACCESS_HELPER, meta); default: /* scalar_value or invalid ptr */ /* Allow zero-byte read from NULL, regardless of pointer type */ if (zero_size_allowed && access_size == 0 && @@@ -4272,29 -3850,6 +4272,29 @@@ } }
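A hedged example of the ACCESS_HELPER path that now funnels into check_stack_range_initialized() (a sketch; the map and attach point are made up):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 128);
	__type(key, char[16]);
	__type(value, __u64);
} counts SEC(".maps");

SEC("tp/sched/sched_switch")
int helper_stack_arg(void *ctx)
{
	char comm[16];	/* deliberately left uninitialized */

	/* raw_mode argument: the helper only writes, so uninitialized
	 * bytes are accepted and 'clobber' marks them STACK_MISC */
	bpf_get_current_comm(comm, sizeof(comm));

	/* read argument: all 16 bytes must be initialized by now, which
	 * the slot walk above verifies */
	return bpf_map_lookup_elem(&counts, comm) != NULL;
}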
+int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + u32 regno, u32 mem_size) +{ + if (register_is_null(reg)) + return 0; + + if (reg_type_may_be_null(reg->type)) { + /* Assuming that the register contains a value check if the memory + * access is safe. Temporarily save and restore the register's state as + * the conversion shouldn't be visible to a caller. + */ + const struct bpf_reg_state saved_reg = *reg; + int rv; + + mark_ptr_not_null_reg(reg); + rv = check_helper_mem_access(env, regno, mem_size, true, NULL); + *reg = saved_reg; + return rv; + } + + return check_helper_mem_access(env, regno, mem_size, true, NULL); +} + /* Implementation details: * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL * Two bpf_map_lookups (even with the same key) will have different reg->id. @@@ -4766,7 -4321,7 +4766,7 @@@ skip_type_check err = mark_chain_precision(env, regno); } else if (arg_type_is_alloc_size(arg_type)) { if (!tnum_is_const(reg->var_off)) { - verbose(env, "R%d unbounded size, use 'var &= const' or 'if (var < const)'\n", + verbose(env, "R%d is not a known constant'\n", regno); return -EACCES; } @@@ -5279,9 -4834,8 +5279,9 @@@ static int check_func_call(struct bpf_v subprog); clear_caller_saved_regs(env, caller->regs);
- /* All global functions return SCALAR_VALUE */ + /* All global functions return a 64-bit SCALAR_VALUE */ mark_reg_unknown(env, caller->regs, BPF_REG_0); + caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
/* continue with next insn after call */ return 0; @@@ -5946,41 -5500,6 +5946,41 @@@ do_sim return !ret ? -EFAULT : 0; }
+/* check that stack access falls within stack limits and that 'reg' doesn't + * have a variable offset. + * + * Variable offset is prohibited for unprivileged mode for simplicity since it + * requires corresponding support in Spectre masking for stack ALU. See also + * retrieve_ptr_limit(). + * + * + * 'off' includes 'reg->off'. + */ +static int check_stack_access_for_ptr_arithmetic( + struct bpf_verifier_env *env, + int regno, + const struct bpf_reg_state *reg, + int off) +{ + if (!tnum_is_const(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "R%d variable stack access prohibited for !root, var_off=%s off=%d\n", + regno, tn_buf, off); + return -EACCES; + } + + if (off >= 0 || off < -MAX_BPF_STACK) { + verbose(env, "R%d stack pointer arithmetic goes out of range, " + "prohibited for !root; off=%d\n", regno, off); + return -EACCES; + } + + return 0; +} + + /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. * Caller should also handle BPF_MOV case separately. * If we return -EACCES, caller may want to try again treating pointer as a @@@ -6224,9 -5743,10 +6224,9 @@@ static int adjust_ptr_min_max_vals(stru "prohibited for !root\n", dst); return -EACCES; } else if (dst_reg->type == PTR_TO_STACK && - check_stack_access(env, dst_reg, dst_reg->off + - dst_reg->var_off.value, 1)) { - verbose(env, "R%d stack pointer arithmetic goes out of range, " - "prohibited for !root\n", dst); + check_stack_access_for_ptr_arithmetic( + env, dst, dst_reg, dst_reg->off + + dst_reg->var_off.value)) { return -EACCES; } } @@@ -6705,7 -6225,7 +6705,7 @@@ static void scalar32_min_max_rsh(struc * 3) the signed bounds cross zero, so they tell us nothing * about the result * If the value in dst_reg is known nonnegative, then again the - * unsigned bounts capture the signed bounds. + * unsigned bounds capture the signed bounds. * Thus, in all cases it suffices to blow away our signed bounds * and rely on inferring new ones from the unsigned bounds and * var_off of the result. @@@ -6736,7 -6256,7 +6736,7 @@@ static void scalar_min_max_rsh(struct b * 3) the signed bounds cross zero, so they tell us nothing * about the result * If the value in dst_reg is known nonnegative, then again the - * unsigned bounts capture the signed bounds. + * unsigned bounds capture the signed bounds. * Thus, in all cases it suffices to blow away our signed bounds * and rely on inferring new ones from the unsigned bounds and * var_off of the result. 
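A hedged sketch of what check_stack_access_for_ptr_arithmetic() above rejects for unprivileged loaders: the failure is reported at the pointer ALU instruction itself, before any load or store through the result is even considered.

/* Pseudo-assembly, illustrative only:
 *
 *	r1 = r10	// fp, fixed offset
 *	r1 += -8	// constant offset: accepted
 *	r3 = r10
 *	r3 += r2	// r2 unknown scalar: for !root this ALU insn already
 *			// fails "variable stack access prohibited for !root"
 */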
@@@ -7806,19 -7326,43 +7806,19 @@@ static void mark_ptr_or_null_reg(struc } if (is_null) { reg->type = SCALAR_VALUE; - } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - const struct bpf_map *map = reg->map_ptr; - - if (map->inner_map_meta) { - reg->type = CONST_PTR_TO_MAP; - reg->map_ptr = map->inner_map_meta; - } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { - reg->type = PTR_TO_XDP_SOCK; - } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || - map->map_type == BPF_MAP_TYPE_SOCKHASH) { - reg->type = PTR_TO_SOCKET; - } else { - reg->type = PTR_TO_MAP_VALUE; - } - } else if (reg->type == PTR_TO_SOCKET_OR_NULL) { - reg->type = PTR_TO_SOCKET; - } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) { - reg->type = PTR_TO_SOCK_COMMON; - } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { - reg->type = PTR_TO_TCP_SOCK; - } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) { - reg->type = PTR_TO_BTF_ID; - } else if (reg->type == PTR_TO_MEM_OR_NULL) { - reg->type = PTR_TO_MEM; - } else if (reg->type == PTR_TO_RDONLY_BUF_OR_NULL) { - reg->type = PTR_TO_RDONLY_BUF; - } else if (reg->type == PTR_TO_RDWR_BUF_OR_NULL) { - reg->type = PTR_TO_RDWR_BUF; - } - if (is_null) { /* We don't need id and ref_obj_id from this point * onwards anymore, thus we should better reset it, * so that state pruning has chances to take effect. */ reg->id = 0; reg->ref_obj_id = 0; - } else if (!reg_may_point_to_spin_lock(reg)) { + + return; + } + + mark_ptr_not_null_reg(reg); + + if (!reg_may_point_to_spin_lock(reg)) { /* For not-NULL ptr, reg->ref_obj_id will be reset * in release_reg_references(). * @@@ -8401,9 -7945,6 +8401,9 @@@ static int check_return_code(struct bpf env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME) range = tnum_range(1, 1); + if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND || + env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND) + range = tnum_range(0, 3); break; case BPF_PROG_TYPE_CGROUP_SKB: if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) { @@@ -9989,19 -9530,14 +9989,19 @@@ static int do_check(struct bpf_verifier } else if (class == BPF_STX) { enum bpf_reg_type *prev_dst_type, dst_reg_type;
- if (BPF_MODE(insn->code) == BPF_XADD) { - err = check_xadd(env, env->insn_idx, insn); + if (BPF_MODE(insn->code) == BPF_ATOMIC) { + err = check_atomic(env, env->insn_idx, insn); if (err) return err; env->insn_idx++; continue; }
+ if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) { + verbose(env, "BPF_STX uses reserved fields\n"); + return -EINVAL; + } + /* check src1 operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) @@@ -10173,36 -9709,6 +10173,36 @@@ process_bpf_exit return 0; }
+static int find_btf_percpu_datasec(struct btf *btf) +{ + const struct btf_type *t; + const char *tname; + int i, n; + + /* + * Both vmlinux and module each have their own ".data..percpu" + * DATASECs in BTF. So for module's case, we need to skip vmlinux BTF + * types to look at only module's own BTF types. + */ + n = btf_nr_types(btf); + if (btf_is_module(btf)) + i = btf_nr_types(btf_vmlinux); + else + i = 1; + + for(; i < n; i++) { + t = btf_type_by_id(btf, i); + if (BTF_INFO_KIND(t->info) != BTF_KIND_DATASEC) + continue; + + tname = btf_name_by_offset(btf, t->name_off); + if (!strcmp(tname, ".data..percpu")) + return i; + } + + return -ENOENT; +} + /* replace pseudo btf_id with kernel symbol address */ static int check_pseudo_btf_id(struct bpf_verifier_env *env, struct bpf_insn *insn, @@@ -10210,57 -9716,48 +10210,57 @@@ { const struct btf_var_secinfo *vsi; const struct btf_type *datasec; + struct btf_mod_pair *btf_mod; const struct btf_type *t; const char *sym_name; bool percpu = false; u32 type, id = insn->imm; + struct btf *btf; s32 datasec_id; u64 addr; - int i; - - if (!btf_vmlinux) { - verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); - return -EINVAL; - } + int i, btf_fd, err;
- if (insn[1].imm != 0) { - verbose(env, "reserved field (insn[1].imm) is used in pseudo_btf_id ldimm64 insn.\n"); - return -EINVAL; + btf_fd = insn[1].imm; + if (btf_fd) { + btf = btf_get_by_fd(btf_fd); + if (IS_ERR(btf)) { + verbose(env, "invalid module BTF object FD specified.\n"); + return -EINVAL; + } + } else { + if (!btf_vmlinux) { + verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); + return -EINVAL; + } + btf = btf_vmlinux; + btf_get(btf); }
- t = btf_type_by_id(btf_vmlinux, id); + t = btf_type_by_id(btf, id); if (!t) { verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id); - return -ENOENT; + err = -ENOENT; + goto err_put; }
if (!btf_type_is_var(t)) { - verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", - id); - return -EINVAL; + verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", id); + err = -EINVAL; + goto err_put; }
- sym_name = btf_name_by_offset(btf_vmlinux, t->name_off); + sym_name = btf_name_by_offset(btf, t->name_off); addr = kallsyms_lookup_name(sym_name); if (!addr) { verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n", sym_name); - return -ENOENT; + err = -ENOENT; + goto err_put; }
- datasec_id = btf_find_by_name_kind(btf_vmlinux, ".data..percpu", - BTF_KIND_DATASEC); + datasec_id = find_btf_percpu_datasec(btf); if (datasec_id > 0) { - datasec = btf_type_by_id(btf_vmlinux, datasec_id); + datasec = btf_type_by_id(btf, datasec_id); for_each_vsi(i, datasec, vsi) { if (vsi->type == id) { percpu = true; @@@ -10273,10 -9770,10 +10273,10 @@@ insn[1].imm = addr >> 32;
type = t->type; - t = btf_type_skip_modifiers(btf_vmlinux, type, NULL); + t = btf_type_skip_modifiers(btf, type, NULL); if (percpu) { aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID; - aux->btf_var.btf = btf_vmlinux; + aux->btf_var.btf = btf; aux->btf_var.btf_id = type; } else if (!btf_type_is_struct(t)) { const struct btf_type *ret; @@@ -10284,54 -9781,21 +10284,54 @@@ u32 tsize;
/* resolve the type size of ksym. */ - ret = btf_resolve_size(btf_vmlinux, t, &tsize); + ret = btf_resolve_size(btf, t, &tsize); if (IS_ERR(ret)) { - tname = btf_name_by_offset(btf_vmlinux, t->name_off); + tname = btf_name_by_offset(btf, t->name_off); verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n", tname, PTR_ERR(ret)); - return -EINVAL; + err = -EINVAL; + goto err_put; } aux->btf_var.reg_type = PTR_TO_MEM; aux->btf_var.mem_size = tsize; } else { aux->btf_var.reg_type = PTR_TO_BTF_ID; - aux->btf_var.btf = btf_vmlinux; + aux->btf_var.btf = btf; aux->btf_var.btf_id = type; } + + /* check whether we recorded this BTF (and maybe module) already */ + for (i = 0; i < env->used_btf_cnt; i++) { + if (env->used_btfs[i].btf == btf) { + btf_put(btf); + return 0; + } + } + + if (env->used_btf_cnt >= MAX_USED_BTFS) { + err = -E2BIG; + goto err_put; + } + + btf_mod = &env->used_btfs[env->used_btf_cnt]; + btf_mod->btf = btf; + btf_mod->module = NULL; + + /* if we reference variables from kernel module, bump its refcount */ + if (btf_is_module(btf)) { + btf_mod->module = btf_try_get_module(btf); + if (!btf_mod->module) { + err = -ENXIO; + goto err_put; + } + } + + env->used_btf_cnt++; + return 0; +err_put: + btf_put(btf); + return err; }
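For context, a hedged user-side sketch of what generates such a pseudo-btf_id ldimm64 (the symbol name is hypothetical; for a typed ksym in a module, libbpf places the module BTF object FD in insn[1].imm, while 0 keeps the vmlinux BTF behaviour):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

extern __u32 my_kernel_var __ksym;	/* hypothetical kernel/module variable */

SEC("fentry/do_nanosleep")
int BPF_PROG(read_ksym)
{
	/* compiles to a two-insn ldimm64 with src_reg == BPF_PSEUDO_BTF_ID;
	 * the verifier resolves the address via kallsyms and types the
	 * result from (vmlinux or module) BTF, as done above */
	return my_kernel_var != 0;
}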
static int check_map_prealloc(struct bpf_map *map) @@@ -10433,22 -9897,15 +10433,22 @@@ static int check_map_prog_compatibility case BPF_MAP_TYPE_HASH: case BPF_MAP_TYPE_LRU_HASH: case BPF_MAP_TYPE_ARRAY: + case BPF_MAP_TYPE_PERCPU_HASH: + case BPF_MAP_TYPE_PERCPU_ARRAY: + case BPF_MAP_TYPE_LRU_PERCPU_HASH: + case BPF_MAP_TYPE_ARRAY_OF_MAPS: + case BPF_MAP_TYPE_HASH_OF_MAPS: if (!is_preallocated_map(map)) { verbose(env, - "Sleepable programs can only use preallocated hash maps\n"); + "Sleepable programs can only use preallocated maps\n"); return -EINVAL; } break; + case BPF_MAP_TYPE_RINGBUF: + break; default: verbose(env, - "Sleepable programs can only use array and hash maps\n"); + "Sleepable programs can only use array, hash, and ringbuf maps\n"); return -EINVAL; }
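A hedged sketch of what the relaxed check enables (names are arbitrary; assumes libbpf's ".s" section suffix convention for sleepable programs):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct {
	__uint(type, BPF_MAP_TYPE_RINGBUF);
	__uint(max_entries, 4096);
} rb SEC(".maps");

SEC("fentry.s/do_nanosleep")	/* ".s" = sleepable */
int BPF_PROG(sleepable_rb)
{
	__u64 *e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);

	if (!e)
		return 0;
	*e = bpf_ktime_get_ns();
	bpf_ringbuf_submit(e, 0);
	return 0;
}

The per-CPU and map-in-map flavors added to the switch still have to be preallocated, as the is_preallocated_map() check above enforces.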
@@@ -10485,6 -9942,13 +10485,6 @@@ static int resolve_pseudo_ldimm64(struc return -EINVAL; }
- if (BPF_CLASS(insn->code) == BPF_STX && - ((BPF_MODE(insn->code) != BPF_MEM && - BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) { - verbose(env, "BPF_STX uses reserved fields\n"); - return -EINVAL; - } - if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { struct bpf_insn_aux_data *aux; struct bpf_map *map; @@@ -10628,13 -10092,6 +10628,13 @@@ static void release_maps(struct bpf_ver env->used_map_cnt); }
+/* drop refcnt of BTF objects used by the rejected program */ +static void release_btfs(struct bpf_verifier_env *env) +{ + __bpf_free_used_btfs(env->prog->aux, env->used_btfs, + env->used_btf_cnt); +} + /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) { @@@ -11006,7 -10463,6 +11006,7 @@@ static int opt_subreg_zext_lo32_rnd_hi3 for (i = 0; i < len; i++) { int adj_idx = i + delta; struct bpf_insn insn; + u8 load_reg;
insn = insns[adj_idx]; if (!aux[adj_idx].zext_dst) { @@@ -11049,27 -10505,9 +11049,27 @@@ if (!bpf_jit_needs_zext()) continue;
+ /* zext_dst means that we want to zero-extend whatever register + * the insn defines, which is dst_reg most of the time, with + * the notable exception of BPF_STX + BPF_ATOMIC + BPF_FETCH. + */ + if (BPF_CLASS(insn.code) == BPF_STX && + BPF_MODE(insn.code) == BPF_ATOMIC) { + /* BPF_STX + BPF_ATOMIC insns without BPF_FETCH do not + * define any registers, therefore zext_dst cannot be + * set. + */ + if (WARN_ON(!(insn.imm & BPF_FETCH))) + return -EINVAL; + load_reg = insn.imm == BPF_CMPXCHG ? BPF_REG_0 + : insn.src_reg; + } else { + load_reg = insn.dst_reg; + } + zext_patch[0] = insn; - zext_patch[1].dst_reg = insn.dst_reg; - zext_patch[1].src_reg = insn.dst_reg; + zext_patch[1].dst_reg = load_reg; + zext_patch[1].src_reg = load_reg; patch = zext_patch; patch_len = 2; apply_patch_buffer: @@@ -11285,7 -10723,8 +11285,7 @@@ static int jit_subprogs(struct bpf_veri return 0;
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; /* Upon error here we cannot fall back to interpreter but * need a hard reject of the program. Thus -EFAULT is @@@ -11326,7 -10765,7 +11326,7 @@@ /* BPF_PROG_RUN doesn't call subprogs directly, * hence main prog stats include the runtime of subprogs. * subprogs don't have IDs and not reachable via prog_get_next_id - * func[i]->aux->stats will never be accessed and stays NULL + * func[i]->stats will never be accessed and stays NULL */ func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER); if (!func[i]) @@@ -11414,7 -10853,8 +11414,7 @@@ for (i = 0; i < env->subprog_cnt; i++) { insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; subprog = insn->off; insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) - @@@ -11459,7 -10899,8 +11459,7 @@@ * later look the same as if they were interpreted only. */ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; insn->off = env->insn_aux_data[i].call_imm; subprog = find_subprog(env, i + insn->off + 1); @@@ -11488,7 -10929,8 +11488,7 @@@ out_undo_insn /* cleanup main prog to be interpreted */ prog->jit_requested = 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; insn->off = 0; insn->imm = env->insn_aux_data[i].call_imm; @@@ -11523,7 -10965,8 +11523,7 @@@ static int fixup_call_args(struct bpf_v return -EINVAL; } for (i = 0; i < prog->len; i++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; depth = get_callee_stack_depth(env, insn, i); if (depth < 0) @@@ -11563,7 -11006,7 +11563,7 @@@ static int fixup_bpf_calls(struct bpf_v bool isdiv = BPF_OP(insn->code) == BPF_DIV; struct bpf_insn *patchlet; struct bpf_insn chk_and_div[] = { - /* Rx div 0 -> 0 */ + /* [R,W]x div 0 -> 0 */ BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | BPF_JNE | BPF_K, insn->src_reg, 0, 2, 0), @@@ -11572,16 -11015,18 +11572,18 @@@ *insn, }; struct bpf_insn chk_and_mod[] = { - /* Rx mod 0 -> Rx */ + /* [R,W]x mod 0 -> [R,W]x */ BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | BPF_JEQ | BPF_K, insn->src_reg, - 0, 1, 0), + 0, 1 + (is64 ? 0 : 1), 0), *insn, + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + BPF_MOV32_REG(insn->dst_reg, insn->dst_reg), };
patchlet = isdiv ? chk_and_div : chk_and_mod; cnt = isdiv ? ARRAY_SIZE(chk_and_div) : - ARRAY_SIZE(chk_and_mod); + ARRAY_SIZE(chk_and_mod) - (is64 ? 2 : 0);
new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); if (!new_prog) @@@ -11986,13 -11431,6 +11988,13 @@@ static int do_check_common(struct bpf_v mark_reg_known_zero(env, regs, i); else if (regs[i].type == SCALAR_VALUE) mark_reg_unknown(env, regs, i); + else if (regs[i].type == PTR_TO_MEM_OR_NULL) { + const u32 mem_size = regs[i].mem_size; + + mark_reg_known_zero(env, regs, i); + regs[i].mem_size = mem_size; + regs[i].id = ++env->id_gen; + } } } else { /* 1st arg to a function */ @@@ -12571,7 -12009,6 +12573,7 @@@ int bpf_check(struct bpf_prog **prog, u env->strict_alignment = false;
env->allow_ptr_leaks = bpf_allow_ptr_leaks(); + env->allow_uninit_stack = bpf_allow_uninit_stack(); env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access(); env->bypass_spec_v1 = bpf_bypass_spec_v1(); env->bypass_spec_v4 = bpf_bypass_spec_v4(); @@@ -12667,10 -12104,7 +12669,10 @@@ skip_full_check goto err_release_maps; }
- if (ret == 0 && env->used_map_cnt) { + if (ret) + goto err_release_maps; + + if (env->used_map_cnt) { /* if program passed verifier, update used_maps in bpf_prog_info */ env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt, sizeof(env->used_maps[0]), @@@ -12684,29 -12118,15 +12686,29 @@@ memcpy(env->prog->aux->used_maps, env->used_maps, sizeof(env->used_maps[0]) * env->used_map_cnt); env->prog->aux->used_map_cnt = env->used_map_cnt; + } + if (env->used_btf_cnt) { + /* if program passed verifier, update used_btfs in bpf_prog_aux */ + env->prog->aux->used_btfs = kmalloc_array(env->used_btf_cnt, + sizeof(env->used_btfs[0]), + GFP_KERNEL); + if (!env->prog->aux->used_btfs) { + ret = -ENOMEM; + goto err_release_maps; + }
+ memcpy(env->prog->aux->used_btfs, env->used_btfs, + sizeof(env->used_btfs[0]) * env->used_btf_cnt); + env->prog->aux->used_btf_cnt = env->used_btf_cnt; + } + if (env->used_map_cnt || env->used_btf_cnt) { /* program is valid. Convert pseudo bpf_ld_imm64 into generic * bpf_ld_imm64 instructions */ convert_pseudo_ld_imm64(env); }
- if (ret == 0) - adjust_btf_func(env); + adjust_btf_func(env);
err_release_maps: if (!env->prog->aux->used_maps) @@@ -12714,8 -12134,6 +12716,8 @@@ * them now. Otherwise free_used_maps() will release them. */ release_maps(env); + if (!env->prog->aux->used_btfs) + release_btfs(env);
/* extension progs temporarily inherit the attach_type of their targets for verification purposes, so set it back to zero before returning diff --combined net/core/flow_dissector.c index c565c7a17091,0b4f536bc32d..2ef2224b3bff --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@@ -23,7 -23,6 +23,7 @@@ #include <linux/if_ether.h> #include <linux/mpls.h> #include <linux/tcp.h> +#include <linux/ptp_classify.h> #include <net/flow_dissector.h> #include <scsi/fc/fc_fcoe.h> #include <uapi/linux/batadv_packet.h> @@@ -237,8 -236,9 +237,8 @@@ skb_flow_dissect_set_enc_addr_type(enu void skb_flow_dissect_ct(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, - u16 *ctinfo_map, - size_t mapsize) + void *target_container, u16 *ctinfo_map, + size_t mapsize, bool post_ct) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) struct flow_dissector_key_ct *key; @@@ -250,19 -250,13 +250,19 @@@ return;
ct = nf_ct_get(skb, &ctinfo); - if (!ct) + if (!ct && !post_ct) return;
key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CT, target_container);
+ if (!ct) { + key->ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | + TCA_FLOWER_KEY_CT_FLAGS_INVALID; + return; + } + if (ctinfo < mapsize) key->ct_state = ctinfo_map[ctinfo]; #if IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) @@@ -1056,6 -1050,9 +1056,9 @@@ proto_again key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; }
+ __skb_flow_dissect_ipv4(skb, flow_dissector, + target_container, data, iph); + if (ip_is_fragment(iph)) { key_control->flags |= FLOW_DIS_IS_FRAGMENT;
@@@ -1072,9 -1069,6 +1075,6 @@@ } }
- __skb_flow_dissect_ipv4(skb, flow_dissector, - target_container, data, iph); - break; } case htons(ETH_P_IPV6): { @@@ -1257,21 -1251,6 +1257,21 @@@ &proto, &nhoff, hlen, flags); break;
+ case htons(ETH_P_1588): { + struct ptp_header *hdr, _hdr; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, + hlen, &_hdr); + if (!hdr) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + nhoff += ntohs(hdr->message_length); + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + default: fdret = FLOW_DISSECT_RET_OUT_BAD; break; diff --combined net/mptcp/options.c index bb874c5d663a,8fec3dabe109..b63574d6b812 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@@ -282,15 -282,6 +282,15 @@@ static void mptcp_parse_option(const st pr_debug("RM_ADDR: id=%d", mp_opt->rm_id); break;
+ case MPTCPOPT_MP_PRIO: + if (opsize != TCPOLEN_MPTCP_PRIO) + break; + + mp_opt->mp_prio = 1; + mp_opt->backup = *ptr++ & MPTCP_PRIO_BKUP; + pr_debug("MP_PRIO: prio=%d", mp_opt->backup); + break; + case MPTCPOPT_MP_FASTCLOSE: if (opsize != TCPOLEN_MPTCP_FASTCLOSE) break; @@@ -322,7 -313,6 +322,7 @@@ void mptcp_get_options(const struct sk_ mp_opt->port = 0; mp_opt->rm_addr = 0; mp_opt->dss = 0; + mp_opt->mp_prio = 0;
length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (const unsigned char *)(th + 1); @@@ -508,8 -498,8 +508,8 @@@ static bool mptcp_established_options_d { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); + u64 snd_data_fin_enable, ack_seq; unsigned int dss_size = 0; - u64 snd_data_fin_enable; struct mptcp_ext *mpext; unsigned int ack_size; bool ret = false; @@@ -541,13 -531,14 +541,14 @@@ return ret; }
+ ack_seq = READ_ONCE(msk->ack_seq); if (READ_ONCE(msk->use_64bit_ack)) { ack_size = TCPOLEN_MPTCP_DSS_ACK64; - opts->ext_copy.data_ack = READ_ONCE(msk->ack_seq); + opts->ext_copy.data_ack = ack_seq; opts->ext_copy.ack64 = 1; } else { ack_size = TCPOLEN_MPTCP_DSS_ACK32; - opts->ext_copy.data_ack32 = (uint32_t)READ_ONCE(msk->ack_seq); + opts->ext_copy.data_ack32 = (uint32_t)ack_seq; opts->ext_copy.ack64 = 0; } opts->ext_copy.use_ack = 1; @@@ -689,29 -680,6 +690,29 @@@ static bool mptcp_established_options_r return true; }
+static bool mptcp_established_options_mp_prio(struct sock *sk, + unsigned int *size, + unsigned int remaining, + struct mptcp_out_options *opts) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + if (!subflow->send_mp_prio) + return false; + + /* account for the trailing 'nop' option */ + if (remaining < TCPOLEN_MPTCP_PRIO_ALIGN) + return false; + + *size = TCPOLEN_MPTCP_PRIO_ALIGN; + opts->suboptions |= OPTION_MPTCP_PRIO; + opts->backup = subflow->request_bkup; + + pr_debug("prio=%d", opts->backup); + + return true; +} + bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) @@@ -754,12 -722,6 +755,12 @@@ ret = true; }
+ if (mptcp_established_options_mp_prio(sk, &opt_size, remaining, opts)) { + *size += opt_size; + remaining -= opt_size; + ret = true; + } + return ret; }
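For reference, the MP_PRIO layout being sized here (a sketch after RFC 8684, section 3.3.8; subtype 5): the option proper is TCPOLEN_MPTCP_PRIO = 3 bytes, and the "trailing nop" accounted for above pads it to the 4-byte TCPOLEN_MPTCP_PRIO_ALIGN.

                      1                   2                   3
  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 +---------------+---------------+-------+-----+-+---------------+
 |   Kind (30)   |  Length (3)   |Subtp=5|(rsv)|B|  TCPOPT_NOP   |
 +---------------+---------------+-------+-----+-+---------------+

B is the backup flag carried in opts->backup; mptcp_write_options() below packs all four bytes into a single word via mptcp_option(MPTCPOPT_MP_PRIO, TCPOLEN_MPTCP_PRIO, opts->backup, TCPOPT_NOP).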
@@@ -867,7 -829,7 +868,7 @@@ fully_established clear_3rdack_retransmission(ssk); mptcp_pm_subflow_established(msk, subflow); } else { - mptcp_pm_fully_established(msk); + mptcp_pm_fully_established(msk, ssk, GFP_ATOMIC); } return true;
@@@ -918,8 -880,7 +919,7 @@@ static void ack_update_msk(struct mptcp msk->wnd_end = new_wnd_end;
/* this assumes mptcp_incoming_options() is invoked after tcp_ack() */ - if (after64(msk->wnd_end, READ_ONCE(msk->snd_nxt)) && - sk_stream_memory_free(ssk)) + if (after64(msk->wnd_end, READ_ONCE(msk->snd_nxt))) __mptcp_check_push(sk, ssk);
if (after64(new_snd_una, old_snd_una)) { @@@ -1025,10 -986,6 +1025,10 @@@ void mptcp_incoming_options(struct soc mptcp_pm_del_add_timer(msk, &addr); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADD); } + + if (mp_opt.port) + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_PORTADD); + mp_opt.add_addr = 0; }
@@@ -1037,12 -994,6 +1037,12 @@@ mp_opt.rm_addr = 0; }
+ if (mp_opt.mp_prio) { + mptcp_pm_mp_prio_received(sk, mp_opt.backup); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPPRIORX); + mp_opt.mp_prio = 0; + } + if (!mp_opt.dss) return;
@@@ -1217,18 -1168,6 +1217,18 @@@ mp_capable_done 0, opts->rm_id); }
+ if (OPTION_MPTCP_PRIO & opts->suboptions) { + const struct sock *ssk = (const struct sock *)tp; + struct mptcp_subflow_context *subflow; + + subflow = mptcp_subflow_ctx(ssk); + subflow->send_mp_prio = 0; + + *ptr++ = mptcp_option(MPTCPOPT_MP_PRIO, + TCPOLEN_MPTCP_PRIO, + opts->backup, TCPOPT_NOP); + } + if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) { *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, TCPOLEN_MPTCP_MPJ_SYN, diff --combined net/mptcp/protocol.c index c2a8392254dc,06da6ad31c87..a57f3eab7b6a --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@@ -45,14 -45,11 +45,14 @@@ static struct percpu_counter mptcp_sock static void __mptcp_destroy_sock(struct sock *sk); static void __mptcp_check_send_data_fin(struct sock *sk);
+DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); +static struct net_device mptcp_napi_dev; + /* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not * completed yet or has failed, return the subflow socket. * Otherwise return NULL. */ -static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) +struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) { if (!msk->subflow || READ_ONCE(msk->can_ack)) return NULL; @@@ -117,7 -114,11 +117,7 @@@ static int __mptcp_socket_create(struc list_add(&subflow->node, &msk->conn_list); sock_hold(ssock->sk); subflow->request_mptcp = 1; - - /* accept() will wait on first subflow sk_wq, and we always wakes up - * via msk->sk_socket - */ - RCU_INIT_POINTER(msk->first->sk_wq, &sk->sk_socket->wq); + mptcp_sock_graft(msk->first, sk->sk_socket);
return 0; } @@@ -363,8 -364,6 +363,6 @@@ static void mptcp_check_data_fin_ack(st
/* Look for an acknowledged DATA_FIN */ if (mptcp_pending_data_fin_ack(sk)) { - mptcp_stop_timer(sk); - WRITE_ONCE(msk->snd_data_fin_enable, 0);
switch (sk->sk_state) { @@@ -458,7 -457,18 +456,18 @@@ static bool mptcp_subflow_cleanup_rbuf( static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) { struct sock *ack_hint = READ_ONCE(msk->ack_hint); + int old_space = READ_ONCE(msk->old_wspace); struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + bool cleanup; + + /* this is a simple superset of what tcp_cleanup_rbuf() implements + * so that we don't have to acquire the ssk socket lock most of the time + * to do actually nothing + */ + cleanup = __mptcp_space(sk) - old_space >= max(0, old_space); + if (!cleanup) + return;
/* if the hinted ssk is still active, try to use it */ if (likely(ack_hint)) { @@@ -733,14 -743,10 +742,14 @@@ wake
void __mptcp_flush_join_list(struct mptcp_sock *msk) { + struct mptcp_subflow_context *subflow; + if (likely(list_empty(&msk->join_list))) return;
spin_lock_bh(&msk->join_list_lock); + list_for_each_entry(subflow, &msk->join_list, node) + mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow)); list_splice_tail_init(&msk->join_list, &msk->conn_list); spin_unlock_bh(&msk->join_list_lock); } @@@ -1040,6 -1046,13 +1049,6 @@@ out __mptcp_update_wmem(sk); sk_mem_reclaim_partial(sk); } - - if (sk_stream_is_writeable(sk)) { - /* pairs with memory barrier in mptcp_poll */ - smp_mb(); - if (test_and_clear_bit(MPTCP_NOSPACE, &msk->flags)) - sk_stream_write_space(sk); - } }
if (snd_una == READ_ONCE(msk->snd_nxt)) { @@@ -1358,7 -1371,8 +1367,7 @@@ struct subflow_send_info u64 ratio; };
-static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk, - u32 *sndbuf) +static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) { struct subflow_send_info send_info[2]; struct mptcp_subflow_context *subflow; @@@ -1369,17 -1383,24 +1378,17 @@@
sock_owned_by_me((struct sock *)msk);
- *sndbuf = 0; if (__mptcp_check_fallback(msk)) { if (!msk->first) return NULL; - *sndbuf = msk->first->sk_sndbuf; return sk_stream_memory_free(msk->first) ? msk->first : NULL; }
/* re-use last subflow, if the burst allow that */ if (msk->last_snd && msk->snd_burst > 0 && sk_stream_memory_free(msk->last_snd) && - mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) { - mptcp_for_each_subflow(msk, subflow) { - ssk = mptcp_subflow_tcp_sock(subflow); - *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf); - } + mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) return msk->last_snd; - }
/* pick the subflow with the lower wmem/wspace ratio */ for (i = 0; i < 2; ++i) { @@@ -1392,7 -1413,8 +1401,7 @@@ continue;
nr_active += !subflow->backup; - *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf); - if (!sk_stream_memory_free(subflow->tcp_sock)) + if (!sk_stream_memory_free(subflow->tcp_sock) || !tcp_sk(ssk)->snd_wnd) continue;
pace = READ_ONCE(ssk->sk_pacing_rate); @@@ -1418,10 -1440,9 +1427,10 @@@ if (send_info[0].ssk) { msk->last_snd = send_info[0].ssk; msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE, - sk_stream_wspace(msk->last_snd)); + tcp_sk(msk->last_snd)->snd_wnd); return msk->last_snd; } + return NULL; }
@@@ -1442,6 -1463,7 +1451,6 @@@ static void mptcp_push_pending(struct s }; struct mptcp_data_frag *dfrag; int len, copied = 0; - u32 sndbuf;
while ((dfrag = mptcp_send_head(sk))) { info.sent = dfrag->already_sent; @@@ -1452,7 -1474,12 +1461,7 @@@
prev_ssk = ssk; __mptcp_flush_join_list(msk); - ssk = mptcp_subflow_get_send(msk, &sndbuf); - - /* do auto tuning */ - if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && - sndbuf > READ_ONCE(sk->sk_sndbuf)) - WRITE_ONCE(sk->sk_sndbuf, sndbuf); + ssk = mptcp_subflow_get_send(msk);
/* try to keep the subflow socket lock across * consecutive xmit on the same socket @@@ -1509,9 -1536,7 +1518,9 @@@ static void __mptcp_subflow_push_pendin struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sendmsg_info info; struct mptcp_data_frag *dfrag; + struct sock *xmit_ssk; int len, copied = 0; + bool first = true;
info.flags = 0; while ((dfrag = mptcp_send_head(sk))) { @@@ -1521,17 -1546,10 +1530,17 @@@ while (len > 0) { int ret = 0;
- /* do auto tuning */ - if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && - ssk->sk_sndbuf > READ_ONCE(sk->sk_sndbuf)) - WRITE_ONCE(sk->sk_sndbuf, ssk->sk_sndbuf); + /* the caller already invoked the packet scheduler, + * check for a different subflow usage only after + * spooling the first chunk of data + */ + xmit_ssk = first ? ssk : mptcp_subflow_get_send(mptcp_sk(sk)); + if (!xmit_ssk) + goto out; + if (xmit_ssk != ssk) { + mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk)); + goto out; + }
if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) { __mptcp_update_wmem(sk); @@@ -1551,7 -1569,6 +1560,7 @@@ msk->tx_pending_data -= ret; copied += ret; len -= ret; + first = false; } WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); } @@@ -1565,21 -1582,15 +1574,24 @@@ out mptcp_set_timeout(sk, ssk); tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); + if (!mptcp_timer_pending(sk)) + mptcp_reset_timer(sk); + if (msk->snd_data_fin_enable && msk->snd_nxt + 1 == msk->write_seq) mptcp_schedule_work(sk); } }
+static void mptcp_set_nospace(struct sock *sk) +{ + /* enable autotune */ + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + + /* will be cleared on avail space */ + set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags); +} + static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk = mptcp_sk(sk); @@@ -1681,7 -1692,7 +1693,7 @@@ continue;
wait_for_memory: - set_bit(MPTCP_NOSPACE, &msk->flags); + mptcp_set_nospace(sk); mptcp_push_pending(sk, msg->msg_flags); ret = sk_stream_wait_memory(sk, &timeo); if (ret) @@@ -1868,7 -1879,7 +1880,7 @@@ static void __mptcp_splice_receive_queu skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue); }
- static bool __mptcp_move_skbs(struct mptcp_sock *msk, unsigned int rcv) + static bool __mptcp_move_skbs(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; unsigned int moved = 0; @@@ -1888,13 -1899,10 +1900,10 @@@
slowpath = lock_sock_fast(ssk); mptcp_data_lock(sk); + __mptcp_update_rmem(sk); done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); mptcp_data_unlock(sk); - if (moved && rcv) { - WRITE_ONCE(msk->rmem_pending, min(rcv, moved)); - tcp_cleanup_rbuf(ssk, 1); - WRITE_ONCE(msk->rmem_pending, 0); - } + tcp_cleanup_rbuf(ssk, moved); unlock_sock_fast(ssk, slowpath); } while (!done);
@@@ -1907,6 -1915,7 +1916,7 @@@ ret |= __mptcp_ofo_queue(msk); __mptcp_splice_receive_queue(sk); mptcp_data_unlock(sk); + mptcp_cleanup_rbuf(msk); } if (ret) mptcp_check_data_fin((struct sock *)msk); @@@ -1936,7 -1945,7 +1946,7 @@@ static int mptcp_recvmsg(struct sock *s target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
while (copied < len) { - int bytes_read, old_space; + int bytes_read;
bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied); if (unlikely(bytes_read < 0)) { @@@ -1947,14 -1956,11 +1957,11 @@@
copied += bytes_read;
- if (skb_queue_empty(&msk->receive_queue) && - __mptcp_move_skbs(msk, len - copied)) - continue; - /* be sure to advertise window change */ - old_space = READ_ONCE(msk->old_wspace); - if ((tcp_space(sk) - old_space) >= old_space) - mptcp_cleanup_rbuf(msk); + mptcp_cleanup_rbuf(msk); + + if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk)) + continue;
/* only the master socket status is relevant here. The exit * conditions mirror closely tcp_recvmsg() @@@ -1982,7 -1988,7 +1989,7 @@@ /* race breaker: the shutdown could be after the * previous receive queue check */ - if (__mptcp_move_skbs(msk, len - copied)) + if (__mptcp_move_skbs(msk)) continue; break; } @@@ -2015,7 -2021,7 +2022,7 @@@ /* .. race-breaker: ssk might have gotten new data * after last __mptcp_move_skbs() returned false. */ - if (unlikely(__mptcp_move_skbs(msk, 0))) + if (unlikely(__mptcp_move_skbs(msk))) set_bit(MPTCP_DATA_READY, &msk->flags); } else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) { /* data to read but mptcp_wait_data() cleared DATA_READY */ @@@ -2114,9 -2120,12 +2121,9 @@@ static struct sock *mptcp_subflow_get_r * so we need to use tcp_close() after detaching them from the mptcp * parent socket. */ -void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, - struct mptcp_subflow_context *subflow) +static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, + struct mptcp_subflow_context *subflow) { - bool dispose_socket = false; - struct socket *sock; - list_del(&subflow->node);
lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); @@@ -2124,8 -2133,11 +2131,8 @@@ /* if we are invoked by the msk cleanup code, the subflow is * already orphaned */ - sock = ssk->sk_socket; - if (sock) { - dispose_socket = sock != sk->sk_socket; + if (ssk->sk_socket) sock_orphan(ssk); - }
subflow->disposable = 1;
@@@ -2143,40 -2155,59 +2150,40 @@@ __sock_put(ssk); } release_sock(ssk); - if (dispose_socket) - iput(SOCK_INODE(sock));
sock_put(ssk); }
-static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) +void mptcp_close_ssk(struct sock *sk, struct sock *ssk, + struct mptcp_subflow_context *subflow) { - return 0; + if (sk->sk_state == TCP_ESTABLISHED) + mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL); + __mptcp_close_ssk(sk, ssk, subflow); }
-static void pm_work(struct mptcp_sock *msk) +static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) { - struct mptcp_pm_data *pm = &msk->pm; - - spin_lock_bh(&msk->pm.lock); - - pr_debug("msk=%p status=%x", msk, pm->status); - if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) { - pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); - mptcp_pm_nl_add_addr_received(msk); - } - if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) { - pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK); - mptcp_pm_nl_add_addr_send_ack(msk); - } - if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) { - pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED); - mptcp_pm_nl_rm_addr_received(msk); - } - if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { - pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); - mptcp_pm_nl_fully_established(msk); - } - if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) { - pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED); - mptcp_pm_nl_subflow_established(msk); - } - - spin_unlock_bh(&msk->pm.lock); + return 0; }
static void __mptcp_close_subflow(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow, *tmp;
+ might_sleep(); + list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
if (inet_sk_state_load(ssk) != TCP_CLOSE) continue;
- __mptcp_close_ssk((struct sock *)msk, ssk, subflow); + /* 'subflow_data_ready' will re-sched once rx queue is empty */ + if (!skb_queue_empty_lockless(&ssk->sk_receive_queue)) + continue; + + mptcp_close_ssk((struct sock *)msk, ssk, subflow); } }
@@@ -2248,8 -2279,11 +2255,8 @@@ static void mptcp_worker(struct work_st
mptcp_check_fastclose(msk);
- if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) - __mptcp_close_subflow(msk); - if (msk->pm.status) - pm_work(msk); + mptcp_pm_nl_work(msk);
if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) mptcp_check_for_eof(msk); @@@ -2269,12 -2303,10 +2276,13 @@@ goto unlock; }
+ if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) + __mptcp_close_subflow(msk); + if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) goto unlock;
+ __mptcp_clean_una(sk); dfrag = mptcp_rtx_head(sk); if (!dfrag) goto unlock; @@@ -2512,14 -2544,6 +2520,14 @@@ static void __mptcp_destroy_sock(struc
pr_debug("msk=%p", msk);
+ might_sleep(); + + /* dispose the ancillary tcp socket, if any */ + if (msk->subflow) { + iput(SOCK_INODE(msk->subflow)); + msk->subflow = NULL; + } + /* be sure to always acquire the join list lock, to sync vs * mptcp_finish_join(). */ @@@ -2570,10 -2594,20 +2578,10 @@@ cleanup inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32; list_for_each_entry(subflow, &mptcp_sk(sk)->conn_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - bool slow, dispose_socket; - struct socket *sock; + bool slow = lock_sock_fast(ssk);
- slow = lock_sock_fast(ssk); - sock = ssk->sk_socket; - dispose_socket = sock && sock != sk->sk_socket; sock_orphan(ssk); unlock_sock_fast(ssk, slow); - - /* for the outgoing subflows we additionally need to free - * the associated socket - */ - if (dispose_socket) - iput(SOCK_INODE(sock)); } sock_orphan(sk);
@@@ -2588,10 -2622,6 +2596,10 @@@ release_sock(sk); if (do_cancel_work) mptcp_cancel_work(sk); + + if (mptcp_sk(sk)->token) + mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL); + sock_put(sk); }
@@@ -2906,16 -2936,10 +2914,16 @@@ void __mptcp_check_push(struct sock *sk if (!mptcp_send_head(sk)) return;
- if (!sock_owned_by_user(sk)) - __mptcp_subflow_push_pending(sk, ssk); - else + if (!sock_owned_by_user(sk)) { + struct sock *xmit_ssk = mptcp_subflow_get_send(mptcp_sk(sk)); + + if (xmit_ssk == ssk) + __mptcp_subflow_push_pending(sk, ssk); + else if (xmit_ssk) + mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk)); + } else { set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags); + } }
#define MPTCP_DEFERRED_ALL (TCPF_WRITE_TIMER_DEFERRED) @@@ -2943,6 -2967,8 +2951,8 @@@ static void mptcp_release_cb(struct soc mptcp_push_pending(sk, 0); spin_lock_bh(&sk->sk_lock.slock); } + if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags)) + __mptcp_error_report(sk);
/* clear any wmem reservation and errors */ __mptcp_update_wmem(sk); @@@ -2963,20 -2989,6 +2973,20 @@@ } }
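Editorial note: the MPTCP_ERROR_REPORT handling added above follows the same convention as the other release_cb flags: a context that cannot take the msk socket lock records the pending work in msk->flags, and the lock owner replays it when releasing the lock. A rough single-threaded C11 sketch of the record-then-replay shape (assumed names, not the kernel implementation; the kernel does a per-bit test_and_clear under its own locking rather than one exchange):

	#include <stdatomic.h>
	#include <stdio.h>

	#define F_PUSH_PENDING	(1u << 0)
	#define F_ERROR_REPORT	(1u << 1)

	static atomic_uint flags;

	/* called where the lock cannot be taken: just record the work */
	static void defer(unsigned int bit)
	{
		atomic_fetch_or(&flags, bit);
	}

	/* called by the lock owner on release: replay what piled up */
	static void release_cb(void)
	{
		unsigned int f = atomic_exchange(&flags, 0);

		if (f & F_PUSH_PENDING)
			printf("push pending data\n");
		if (f & F_ERROR_REPORT)
			printf("propagate subflow error\n");
	}

	int main(void)
	{
		defer(F_ERROR_REPORT);
		release_cb();
		return 0;
	}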
+void mptcp_subflow_process_delegated(struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct sock *sk = subflow->conn; + + mptcp_data_lock(sk); + if (!sock_owned_by_user(sk)) + __mptcp_subflow_push_pending(sk, ssk); + else + set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags); + mptcp_data_unlock(sk); + mptcp_subflow_delegated_done(subflow); +} + static int mptcp_hash(struct sock *sk) { /* should never be called, @@@ -3034,12 -3046,12 +3044,12 @@@ void mptcp_finish_connect(struct sock * WRITE_ONCE(msk->can_ack, 1); WRITE_ONCE(msk->snd_una, msk->write_seq);
- mptcp_pm_new_connection(msk, 0); + mptcp_pm_new_connection(msk, ssk, 0);
mptcp_rcv_space_init(msk, ssk); }
-static void mptcp_sock_graft(struct sock *sk, struct socket *parent) +void mptcp_sock_graft(struct sock *sk, struct socket *parent) { write_lock_bh(&sk->sk_callback_lock); rcu_assign_pointer(sk->sk_wq, &parent->wq); @@@ -3063,7 -3075,7 +3073,7 @@@ bool mptcp_finish_join(struct sock *ssk return false;
if (!msk->pm.server_side) - return true; + goto out;
if (!mptcp_pm_allow_new_subflow(msk)) return false; @@@ -3090,8 -3102,6 +3100,8 @@@ if (parent_sock && !ssk->sk_socket) mptcp_sock_graft(ssk, parent_sock); subflow->map_seq = READ_ONCE(msk->ack_seq); +out: + mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC); return true; }
@@@ -3268,8 -3278,9 +3278,8 @@@ static int mptcp_stream_accept(struct s struct mptcp_sock *msk = mptcp_sk(newsock->sk); struct mptcp_subflow_context *subflow; struct sock *newsk = newsock->sk; - bool slowpath;
- slowpath = lock_sock_fast(newsk); + lock_sock(newsk);
/* PM/worker can now acquire the first subflow socket * lock without racing with listener queue cleanup, @@@ -3279,11 -3290,10 +3289,11 @@@ list_add(&subflow->node, &msk->conn_list); sock_hold(msk->first); if (mptcp_is_fully_established(newsk)) - mptcp_pm_fully_established(msk); + mptcp_pm_fully_established(msk, msk->first, GFP_KERNEL);
mptcp_copy_inaddrs(newsk, msk->first); mptcp_rcv_space_init(msk, msk->first); + mptcp_propagate_sndbuf(newsk, msk->first);
/* set ssk->sk_socket of accept()ed flows to mptcp socket. * This is needed so NOSPACE flag can be set from tcp stack. @@@ -3295,7 -3305,7 +3305,7 @@@ if (!ssk->sk_socket) mptcp_sock_graft(ssk, newsock); } - unlock_sock_fast(newsk, slowpath); + release_sock(newsk); }
if (inet_csk_listen_poll(ssock->sk)) @@@ -3319,12 -3329,12 +3329,12 @@@ static __poll_t mptcp_check_writeable(s struct sock *sk = (struct sock *)msk;
if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN)) - return 0; + return EPOLLOUT | EPOLLWRNORM;
if (sk_stream_is_writeable(sk)) return EPOLLOUT | EPOLLWRNORM;
- set_bit(MPTCP_NOSPACE, &msk->flags); + mptcp_set_nospace(sk); smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ if (sk_stream_is_writeable(sk)) return EPOLLOUT | EPOLLWRNORM; @@@ -3352,9 -3362,16 +3362,16 @@@ static __poll_t mptcp_poll(struct file mask |= mptcp_check_readable(msk); mask |= mptcp_check_writeable(msk); } + if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) + mask |= EPOLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
+ /* This barrier is coupled with smp_wmb() in tcp_reset() */ + smp_rmb(); + if (sk->sk_err) + mask |= EPOLLERR; + return mask; }
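Editorial note: the smp_rmb() added to mptcp_poll() pairs with an smp_wmb() on the write side, where tcp_reset() publishes sk_err before the state that poll observes. In C11 terms this is a release/acquire pairing; a small standalone sketch of the same guarantee, not the kernel code:

	#include <stdatomic.h>
	#include <stdio.h>

	static int sk_err;		/* plain data, published below */
	static atomic_int err_ready;	/* flag the reader polls */

	static void writer(void)
	{
		sk_err = 111;	/* ECONNREFUSED, for example */
		/* release store: everything written above is visible to an
		 * acquire load that sees err_ready != 0; the smp_wmb()/
		 * smp_rmb() pair in the patch plays the same role
		 */
		atomic_store_explicit(&err_ready, 1, memory_order_release);
	}

	static void reader(void)
	{
		if (atomic_load_explicit(&err_ready, memory_order_acquire))
			printf("err=%d\n", sk_err);	/* sees 111 */
	}

	int main(void)
	{
		writer();
		reader();
		return 0;
	}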
@@@ -3388,58 -3405,13 +3405,58 @@@ static struct inet_protosw mptcp_protos .flags = INET_PROTOSW_ICSK, };
+static int mptcp_napi_poll(struct napi_struct *napi, int budget) +{ + struct mptcp_delegated_action *delegated; + struct mptcp_subflow_context *subflow; + int work_done = 0; + + delegated = container_of(napi, struct mptcp_delegated_action, napi); + while ((subflow = mptcp_subflow_delegated_next(delegated)) != NULL) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + bh_lock_sock_nested(ssk); + if (!sock_owned_by_user(ssk) && + mptcp_subflow_has_delegated_action(subflow)) + mptcp_subflow_process_delegated(ssk); + /* ... elsewhere tcp_release_cb_override already processed + * the action, or will do so at the next release_sock(). + * In both cases we must dequeue the subflow here - on the same + * CPU that scheduled it. + */ + bh_unlock_sock(ssk); + sock_put(ssk); + + if (++work_done == budget) + return budget; + } + + /* always provide a 0 'work_done' argument, so that napi_complete_done + * will not try accessing the NULL napi->dev ptr + */ + napi_complete_done(napi, 0); + return work_done; +} + void __init mptcp_proto_init(void) { + struct mptcp_delegated_action *delegated; + int cpu; + mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;
if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL)) panic("Failed to allocate MPTCP pcpu counter\n");
+ init_dummy_netdev(&mptcp_napi_dev); + for_each_possible_cpu(cpu) { + delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu); + INIT_LIST_HEAD(&delegated->head); + netif_tx_napi_add(&mptcp_napi_dev, &delegated->napi, mptcp_napi_poll, + NAPI_POLL_WEIGHT); + napi_enable(&delegated->napi); + } + mptcp_subflow_init(); mptcp_pm_init(); mptcp_token_init(); diff --combined net/mptcp/protocol.h index 1b6ec1773678,8d9f0ff10cb8..91827d949766 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@@ -10,7 -10,6 +10,7 @@@ #include <linux/random.h> #include <net/tcp.h> #include <net/inet_connection_sock.h> +#include <uapi/linux/mptcp.h>
#define MPTCP_SUPPORTED_VERSION 1
@@@ -25,7 -24,6 +25,7 @@@ #define OPTION_MPTCP_ADD_ADDR6 BIT(7) #define OPTION_MPTCP_RM_ADDR BIT(8) #define OPTION_MPTCP_FASTCLOSE BIT(9) +#define OPTION_MPTCP_PRIO BIT(10)
/* MPTCP option subtypes */ #define MPTCPOPT_MP_CAPABLE 0 @@@ -61,8 -59,6 +61,8 @@@ #define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 24 #define TCPOLEN_MPTCP_PORT_LEN 4 #define TCPOLEN_MPTCP_RM_ADDR_BASE 4 +#define TCPOLEN_MPTCP_PRIO 3 +#define TCPOLEN_MPTCP_PRIO_ALIGN 4 #define TCPOLEN_MPTCP_FASTCLOSE 12
/* MPTCP MP_JOIN flags */ @@@ -90,9 -86,6 +90,9 @@@ #define MPTCP_ADDR_IPVERSION_4 4 #define MPTCP_ADDR_IPVERSION_6 6
+/* MPTCP MP_PRIO flags */ +#define MPTCP_PRIO_BKUP BIT(0) + /* MPTCP socket flags */ #define MPTCP_DATA_READY 0 #define MPTCP_NOSPACE 1 @@@ -102,6 -95,7 +102,7 @@@ #define MPTCP_WORK_CLOSE_SUBFLOW 5 #define MPTCP_PUSH_PENDING 6 #define MPTCP_CLEAN_UNA 7 + #define MPTCP_ERROR_REPORT 8
static inline bool before64(__u64 seq1, __u64 seq2) { @@@ -123,7 -117,6 +124,7 @@@ struct mptcp_options_received dss : 1, add_addr : 1, rm_addr : 1, + mp_prio : 1, family : 4, echo : 1, backup : 1; @@@ -204,6 -197,10 +205,6 @@@ struct mptcp_pm_data u8 add_addr_accepted; u8 local_addr_used; u8 subflows; - u8 add_addr_signal_max; - u8 add_addr_accept_max; - u8 local_addr_max; - u8 subflows_max; u8 status; u8 rm_id; }; @@@ -237,7 -234,6 +238,6 @@@ struct mptcp_sock u64 wnd_end; unsigned long timer_ival; u32 token; - int rmem_pending; int rmem_released; unsigned long flags; bool can_ack; @@@ -289,11 -285,6 +289,11 @@@ #define mptcp_for_each_subflow(__msk, __subflow) \ list_for_each_entry(__subflow, &((__msk)->conn_list), node)
+static inline void msk_owned_by_me(const struct mptcp_sock *msk) +{ + sock_owned_by_me((const struct sock *)msk); +} + static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) { return (struct mptcp_sock *)sk; @@@ -301,7 -292,7 +301,7 @@@
static inline int __mptcp_space(const struct sock *sk) { - return tcp_space(sk) + READ_ONCE(mptcp_sk(sk)->rmem_pending); + return tcp_space(sk) + READ_ONCE(mptcp_sk(sk)->rmem_released); }
static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) @@@ -334,20 -325,13 +334,13 @@@ static inline struct mptcp_data_frag *m return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list); }
- static inline struct mptcp_data_frag *mptcp_rtx_tail(const struct sock *sk) + static inline struct mptcp_data_frag *mptcp_rtx_head(const struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk);
- if (!before64(msk->snd_nxt, READ_ONCE(msk->snd_una))) + if (msk->snd_una == READ_ONCE(msk->snd_nxt)) return NULL;
- return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list); - } - - static inline struct mptcp_data_frag *mptcp_rtx_head(const struct sock *sk) - { - struct mptcp_sock *msk = mptcp_sk(sk); - return list_first_entry_or_null(&msk->rtx_queue, struct mptcp_data_frag, list); }
@@@ -381,15 -365,6 +374,15 @@@ enum mptcp_data_avail MPTCP_SUBFLOW_OOO_DATA };
+struct mptcp_delegated_action { + struct napi_struct napi; + struct list_head head; +}; + +DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); + +#define MPTCP_DELEGATE_SEND 0 + /* MPTCP subflow context */ struct mptcp_subflow_context { struct list_head node;/* conn_list of subflows */ @@@ -414,7 -389,6 +407,7 @@@ map_valid : 1, mpc_map : 1, backup : 1, + send_mp_prio : 1, rx_eof : 1, can_ack : 1, /* only after processing the remote a key */ disposable : 1; /* ctx can be free at ulp release time */ @@@ -427,15 -401,13 +420,16 @@@ u8 local_id; u8 remote_id;
+ long delegated_status; + struct list_head delegated_node; /* link into delegated_action, protected by local BH */ + struct sock *tcp_sock; /* tcp sk backpointer */ struct sock *conn; /* parent mptcp_sock */ const struct inet_connection_sock_af_ops *icsk_af_ops; void (*tcp_data_ready)(struct sock *sk); void (*tcp_state_change)(struct sock *sk); void (*tcp_write_space)(struct sock *sk); + void (*tcp_error_report)(struct sock *sk);
struct rcu_head rcu; }; @@@ -478,61 -450,6 +472,61 @@@ static inline void mptcp_add_pending_su spin_unlock_bh(&msk->join_list_lock); }
+void mptcp_subflow_process_delegated(struct sock *ssk); + +static inline void mptcp_subflow_delegate(struct mptcp_subflow_context *subflow) +{ + struct mptcp_delegated_action *delegated; + bool schedule; + + /* The implied barrier pairs with mptcp_subflow_delegated_done(), and + * ensures the below list check sees list updates done prior to status + * bit changes + */ + if (!test_and_set_bit(MPTCP_DELEGATE_SEND, &subflow->delegated_status)) { + /* still on delegated list from previous scheduling */ + if (!list_empty(&subflow->delegated_node)) + return; + + /* the caller held the subflow bh socket lock */ + lockdep_assert_in_softirq(); + + delegated = this_cpu_ptr(&mptcp_delegated_actions); + schedule = list_empty(&delegated->head); + list_add_tail(&subflow->delegated_node, &delegated->head); + sock_hold(mptcp_subflow_tcp_sock(subflow)); + if (schedule) + napi_schedule(&delegated->napi); + } +} + +static inline struct mptcp_subflow_context * +mptcp_subflow_delegated_next(struct mptcp_delegated_action *delegated) +{ + struct mptcp_subflow_context *ret; + + if (list_empty(&delegated->head)) + return NULL; + + ret = list_first_entry(&delegated->head, struct mptcp_subflow_context, delegated_node); + list_del_init(&ret->delegated_node); + return ret; +} + +static inline bool mptcp_subflow_has_delegated_action(const struct mptcp_subflow_context *subflow) +{ + return test_bit(MPTCP_DELEGATE_SEND, &subflow->delegated_status); +} + +static inline void mptcp_subflow_delegated_done(struct mptcp_subflow_context *subflow) +{ + /* pairs with mptcp_subflow_delegate, ensures delegate_node is updated before + * touching the status bit + */ + smp_wmb(); + clear_bit(MPTCP_DELEGATE_SEND, &subflow->delegated_status); +} + int mptcp_is_enabled(struct net *net); unsigned int mptcp_get_add_addr_timeout(struct net *net); void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, @@@ -540,19 -457,14 +534,19 @@@ bool mptcp_subflow_data_available(struct sock *sk); void __init mptcp_subflow_init(void); void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how); -void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, - struct mptcp_subflow_context *subflow); +void mptcp_close_ssk(struct sock *sk, struct sock *ssk, + struct mptcp_subflow_context *subflow); void mptcp_subflow_reset(struct sock *ssk); +void mptcp_sock_graft(struct sock *sk, struct socket *parent); +struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk);
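Editorial note: the delegate/next/done helpers above implement a schedule-once queue: the MPTCP_DELEGATE_SEND bit guards against double-enqueue, the per-CPU list carries the pending subflows, and NAPI supplies the execution context. A stripped-down, single-threaded C sketch of the schedule-once discipline only (illustrative names; the real code relies on atomic test_and_set_bit plus the barriers documented in the comments above, which a single-threaded sketch cannot show):

	#include <stdbool.h>
	#include <stdio.h>

	struct item {
		const char *name;
		bool scheduled;		/* stands in for MPTCP_DELEGATE_SEND */
		struct item *next;
	};

	static struct item *head;	/* stands in for the per-CPU list */

	/* enqueue at most once until the action has been processed */
	static void delegate(struct item *it)
	{
		if (it->scheduled)
			return;		/* already queued, nothing to do */
		it->scheduled = true;
		it->next = head;
		head = it;
		/* the first insertion would napi_schedule() here */
	}

	static void process_all(void)
	{
		while (head) {
			struct item *it = head;

			head = it->next;
			printf("processing %s\n", it->name);
			it->scheduled = false;	/* delegated_done() */
		}
	}

	int main(void)
	{
		struct item a = { .name = "subflow-a" };

		delegate(&a);
		delegate(&a);		/* ignored: still queued */
		process_all();		/* prints exactly once */
		return 0;
	}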
/* called with sk socket lock held */ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, const struct mptcp_addr_info *remote); int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock); +void mptcp_info2sockaddr(const struct mptcp_addr_info *info, + struct sockaddr_storage *addr, + unsigned short family);
static inline void mptcp_subflow_tcp_fallback(struct sock *sk, struct mptcp_subflow_context *ctx) @@@ -560,6 -472,7 +554,7 @@@ sk->sk_data_ready = ctx->tcp_data_ready; sk->sk_state_change = ctx->tcp_state_change; sk->sk_write_space = ctx->tcp_write_space; + sk->sk_error_report = ctx->tcp_error_report;
inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops; } @@@ -587,6 -500,7 +582,7 @@@ bool mptcp_finish_join(struct sock *sk) bool mptcp_schedule_work(struct sock *sk); void __mptcp_check_push(struct sock *sk, struct sock *ssk); void __mptcp_data_acked(struct sock *sk); + void __mptcp_error_report(struct sock *sk); void mptcp_subflow_eof(struct sock *sk); bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit); void __mptcp_flush_join_list(struct mptcp_sock *msk); @@@ -596,25 -510,6 +592,25 @@@ static inline bool mptcp_data_fin_enabl READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt); }
+static inline bool mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk) +{ + if ((sk->sk_userlocks & SOCK_SNDBUF_LOCK) || ssk->sk_sndbuf <= READ_ONCE(sk->sk_sndbuf)) + return false; + + WRITE_ONCE(sk->sk_sndbuf, ssk->sk_sndbuf); + return true; +} + +static inline void mptcp_write_space(struct sock *sk) +{ + if (sk_stream_is_writeable(sk)) { + /* pairs with memory barrier in mptcp_poll */ + smp_mb(); + if (test_and_clear_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags)) + sk_stream_write_space(sk); + } +} + void mptcp_destroy_common(struct mptcp_sock *msk);
void __init mptcp_token_init(void); @@@ -640,8 -535,8 +636,8 @@@ void mptcp_crypto_hmac_sha(u64 key1, u6
void __init mptcp_pm_init(void); void mptcp_pm_data_init(struct mptcp_sock *msk); -void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side); -void mptcp_pm_fully_established(struct mptcp_sock *msk); +void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side); +void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp); bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk); void mptcp_pm_connection_closed(struct mptcp_sock *msk); void mptcp_pm_subflow_established(struct mptcp_sock *msk, @@@ -651,12 -546,7 +647,12 @@@ void mptcp_pm_add_addr_received(struct const struct mptcp_addr_info *addr); void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, u8 rm_id); +void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup); +int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, + struct mptcp_addr_info *addr, + u8 bkup); void mptcp_pm_free_anno_list(struct mptcp_sock *msk); +bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk); struct mptcp_pm_add_entry * mptcp_pm_del_add_timer(struct mptcp_sock *msk, struct mptcp_addr_info *addr); @@@ -667,11 -557,6 +663,11 @@@ int mptcp_pm_announce_addr(struct mptcp int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id); int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 local_id);
+void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, + const struct sock *ssk, gfp_t gfp); +void mptcp_event_addr_announced(const struct mptcp_sock *msk, const struct mptcp_addr_info *info); +void mptcp_event_addr_removed(const struct mptcp_sock *msk, u8 id); + static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_SIGNAL); @@@ -719,13 -604,13 +715,13 @@@ int mptcp_pm_get_local_id(struct mptcp_
void __init mptcp_pm_nl_init(void); void mptcp_pm_nl_data_init(struct mptcp_sock *msk); -void mptcp_pm_nl_fully_established(struct mptcp_sock *msk); -void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk); -void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk); -void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk); -void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk); +void mptcp_pm_nl_work(struct mptcp_sock *msk); void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id); int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); +unsigned int mptcp_pm_get_add_addr_signal_max(struct mptcp_sock *msk); +unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk); +unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk); +unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk);
static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb) { diff --combined net/mptcp/subflow.c index ce2dea2a6e0a,8b2338dfdc80..06e233410e0e --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@@ -18,15 -18,12 +18,15 @@@ #include <net/tcp.h> #if IS_ENABLED(CONFIG_MPTCP_IPV6) #include <net/ip6_route.h> +#include <net/transp_v6.h> #endif #include <net/mptcp.h> #include <uapi/linux/mptcp.h> #include "protocol.h" #include "mib.h"
+static void mptcp_subflow_ops_undo_override(struct sock *ssk); + static void SUBFLOW_REQ_INC_STATS(struct request_sock *req, enum linux_mptcp_mib_field field) { @@@ -64,23 -61,11 +64,23 @@@ static bool mptcp_can_accept_new_subflo }
/* validate received token and create truncated hmac and nonce for SYN-ACK */ -static struct mptcp_sock *subflow_token_join_request(struct request_sock *req, - const struct sk_buff *skb) +static void subflow_req_create_thmac(struct mptcp_subflow_request_sock *subflow_req) { - struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + struct mptcp_sock *msk = subflow_req->msk; u8 hmac[SHA256_DIGEST_SIZE]; + + get_random_bytes(&subflow_req->local_nonce, sizeof(u32)); + + subflow_generate_hmac(msk->local_key, msk->remote_key, + subflow_req->local_nonce, + subflow_req->remote_nonce, hmac); + + subflow_req->thmac = get_unaligned_be64(hmac); +} + +static struct mptcp_sock *subflow_token_join_request(struct request_sock *req) +{ + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_sock *msk; int local_id;
@@@ -97,10 -82,17 +97,10 @@@ } subflow_req->local_id = local_id;
- get_random_bytes(&subflow_req->local_nonce, sizeof(u32)); - - subflow_generate_hmac(msk->local_key, msk->remote_key, - subflow_req->local_nonce, - subflow_req->remote_nonce, hmac); - - subflow_req->thmac = get_unaligned_be64(hmac); return msk; }
- static int __subflow_init_req(struct request_sock *req, const struct sock *sk_listener) + static void subflow_init_req(struct request_sock *req, const struct sock *sk_listener) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
@@@ -108,42 -100,30 +108,35 @@@ subflow_req->mp_join = 0; subflow_req->msk = NULL; mptcp_token_init_request(req); - - #ifdef CONFIG_TCP_MD5SIG - /* no MPTCP if MD5SIG is enabled on this socket or we may run out of - * TCP option space. - */ - if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) - return -EINVAL; - #endif - - return 0; }
+static bool subflow_use_different_sport(struct mptcp_sock *msk, const struct sock *sk) +{ + return inet_sk(sk)->inet_sport != inet_sk((struct sock *)msk)->inet_sport; +} + /* Init mptcp request socket. * * Returns an error code if a JOIN has failed and a TCP reset * should be sent. */ - static int subflow_init_req(struct request_sock *req, - const struct sock *sk_listener, - struct sk_buff *skb) + static int subflow_check_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) { struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_options_received mp_opt; - int ret;
pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);
- ret = __subflow_init_req(req, sk_listener); - if (ret) - return 0; + #ifdef CONFIG_TCP_MD5SIG + /* no MPTCP if MD5SIG is enabled on this socket or we may run out of + * TCP option space. + */ + if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) + return -EINVAL; + #endif
mptcp_get_options(skb, &mp_opt);
@@@ -191,30 -171,12 +184,30 @@@ again subflow_req->remote_id = mp_opt.join_id; subflow_req->token = mp_opt.token; subflow_req->remote_nonce = mp_opt.nonce; - subflow_req->msk = subflow_token_join_request(req, skb); + subflow_req->msk = subflow_token_join_request(req);
/* Can't fall back to TCP in this case. */ if (!subflow_req->msk) return -EPERM;
+ if (subflow_use_different_sport(subflow_req->msk, sk_listener)) { + pr_debug("syn inet_sport=%d %d", + ntohs(inet_sk(sk_listener)->inet_sport), + ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport)); + if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) { + sock_put((struct sock *)subflow_req->msk); + mptcp_token_destroy_request(req); + tcp_request_sock_ops.destructor(req); + subflow_req->msk = NULL; + subflow_req->mp_join = 0; + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX); + return -EPERM; + } + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTSYNRX); + } + + subflow_req_create_thmac(subflow_req); + if (unlikely(req->syncookie)) { if (mptcp_can_accept_new_subflow(subflow_req->msk)) subflow_init_req_cookie_join_save(subflow_req, skb); @@@ -236,10 -198,7 +229,7 @@@ int mptcp_subflow_init_cookie_req(struc struct mptcp_options_received mp_opt; int err;
- err = __subflow_init_req(req, sk_listener); - if (err) - return err; - + subflow_init_req(req, sk_listener); mptcp_get_options(skb, &mp_opt);
if (mp_opt.mp_capable && mp_opt.mp_join) @@@ -279,12 -238,13 +269,13 @@@ static struct dst_entry *subflow_v4_rou int err;
tcp_rsk(req)->is_mptcp = 1; + subflow_init_req(req, sk);
dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req); if (!dst) return NULL;
- err = subflow_init_req(req, sk, skb); + err = subflow_check_req(req, sk, skb); if (err == 0) return dst;
@@@ -304,12 -264,13 +295,13 @@@ static struct dst_entry *subflow_v6_rou int err;
tcp_rsk(req)->is_mptcp = 1; + subflow_init_req(req, sk);
dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req); if (!dst) return NULL;
- err = subflow_init_req(req, sk, skb); + err = subflow_check_req(req, sk, skb); if (err == 0) return dst;
@@@ -357,11 -318,6 +349,11 @@@ void mptcp_subflow_reset(struct sock *s sock_put(sk); }
+static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct sock *sk) +{ + return inet_sk(sk)->inet_dport != inet_sk((struct sock *)msk)->inet_dport; +} + static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); @@@ -379,7 -335,6 +371,7 @@@ if (subflow->conn_finished) return;
+ mptcp_propagate_sndbuf(parent, sk); subflow->rel_write_seq = 1; subflow->conn_finished = 1; subflow->ssn_offset = TCP_SKB_CB(skb)->seq; @@@ -428,13 -383,6 +420,13 @@@
subflow->mp_join = 1; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); + + if (subflow_use_different_dport(mptcp_sk(parent), sk)) { + pr_debug("synack inet_dport=%d %d", + ntohs(inet_sk(sk)->inet_dport), + ntohs(inet_sk(parent)->inet_dport)); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINPORTSYNACKRX); + } } else if (mptcp_check_fallback(sk)) { fallback: mptcp_rcv_space_init(mptcp_sk(parent), sk); @@@ -471,7 -419,6 +463,7 @@@ drop static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops; static struct inet_connection_sock_af_ops subflow_v6_specific; static struct inet_connection_sock_af_ops subflow_v6m_specific; +static struct proto tcpv6_prot_override;
static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) { @@@ -553,8 -500,6 +545,8 @@@ static void subflow_ulp_fallback(struc icsk->icsk_ulp_ops = NULL; rcu_assign_pointer(icsk->icsk_ulp_data, NULL); tcp_sk(sk)->is_mptcp = 0; + + mptcp_subflow_ops_undo_override(sk); }
static void subflow_drop_ctx(struct sock *ssk) @@@ -675,7 -620,7 +667,7 @@@ create_child * created mptcp socket */ new_msk->sk_destruct = mptcp_sock_destruct; - mptcp_pm_new_connection(mptcp_sk(new_msk), 1); + mptcp_pm_new_connection(mptcp_sk(new_msk), child, 1); mptcp_token_accept(subflow_req, mptcp_sk(new_msk)); ctx->conn = new_msk; new_msk = NULL; @@@ -700,17 -645,6 +692,17 @@@
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX); tcp_rsk(req)->drop_req = true; + + if (subflow_use_different_sport(owner, sk)) { + pr_debug("ack inet_sport=%d %d", + ntohs(inet_sk(sk)->inet_sport), + ntohs(inet_sk((struct sock *)owner)->inet_sport)); + if (!mptcp_pm_sport_in_anno_list(owner, sk)) { + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTACKRX); + goto out; + } + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTACKRX); + } } }
@@@ -739,7 -673,6 +731,7 @@@ dispose_child }
static struct inet_connection_sock_af_ops subflow_specific; +static struct proto tcp_prot_override;
enum mapping_status { MAPPING_OK, @@@ -953,22 -886,6 +945,22 @@@ static void mptcp_subflow_discard_data( subflow->map_valid = 0; }
+/* sched mptcp worker to remove the subflow if no more data is pending */ +static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk) +{ + struct sock *sk = (struct sock *)msk; + + if (likely(ssk->sk_state != TCP_CLOSE)) + return; + + if (skb_queue_empty(&ssk->sk_receive_queue) && + !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) { + sock_hold(sk); + if (!schedule_work(&msk->work)) + sock_put(sk); + } +} + static bool subflow_check_data_avail(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); @@@ -1007,11 -924,11 +999,11 @@@ }
if (status != MAPPING_OK) - return false; + goto no_data;
skb = skb_peek(&ssk->sk_receive_queue); if (WARN_ON_ONCE(!skb)) - return false; + goto no_data;
/* if msk lacks the remote key, this subflow must provide an * MP_CAPABLE-based mapping @@@ -1045,9 -962,6 +1037,9 @@@ } return true;
+no_data: + subflow_sched_work_if_closed(msk, ssk); + return false; fatal: /* fatal protocol error, close the socket */ /* This barrier is coupled with smp_rmb() in tcp_poll() */ @@@ -1118,12 -1032,49 +1110,52 @@@ static void subflow_data_ready(struct s
static void subflow_write_space(struct sock *ssk) { - /* we take action in __mptcp_clean_una() */ + struct sock *sk = mptcp_subflow_ctx(ssk)->conn; + + mptcp_propagate_sndbuf(sk, ssk); + mptcp_write_space(sk); }
+ void __mptcp_error_report(struct sock *sk) + { + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk = mptcp_sk(sk); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + int err = sock_error(ssk); + + if (!err) + continue; + + /* only propagate errors on fallen-back sockets or + * on MPC connect + */ + if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk)) + continue; + + inet_sk_state_store(sk, inet_sk_state_load(ssk)); + sk->sk_err = -err; + + /* This barrier is coupled with smp_rmb() in mptcp_poll() */ + smp_wmb(); + sk->sk_error_report(sk); + break; + } + } + + static void subflow_error_report(struct sock *ssk) + { + struct sock *sk = mptcp_subflow_ctx(ssk)->conn; + + mptcp_data_lock(sk); + if (!sock_owned_by_user(sk)) + __mptcp_error_report(sk); + else + set_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags); + mptcp_data_unlock(sk); + } + static struct inet_connection_sock_af_ops * subflow_default_af_ops(struct sock *sk) { @@@ -1154,32 -1105,22 +1186,32 @@@ void mptcpv6_handle_mapped(struct sock } #endif
-static void mptcp_info2sockaddr(const struct mptcp_addr_info *info, - struct sockaddr_storage *addr) +void mptcp_info2sockaddr(const struct mptcp_addr_info *info, + struct sockaddr_storage *addr, + unsigned short family) { memset(addr, 0, sizeof(*addr)); - addr->ss_family = info->family; + addr->ss_family = family; if (addr->ss_family == AF_INET) { struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;
- in_addr->sin_addr = info->addr; + if (info->family == AF_INET) + in_addr->sin_addr = info->addr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (ipv6_addr_v4mapped(&info->addr6)) + in_addr->sin_addr.s_addr = info->addr6.s6_addr32[3]; +#endif in_addr->sin_port = info->port; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (addr->ss_family == AF_INET6) { struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;
- in6_addr->sin6_addr = info->addr6; + if (info->family == AF_INET) + ipv6_addr_set_v4mapped(info->addr.s_addr, + &in6_addr->sin6_addr); + else + in6_addr->sin6_addr = info->addr6; in6_addr->sin6_port = info->port; } #endif @@@ -1223,11 -1164,11 +1255,11 @@@ int __mptcp_subflow_connect(struct soc subflow->remote_key = msk->remote_key; subflow->local_key = msk->local_key; subflow->token = msk->token; - mptcp_info2sockaddr(loc, &addr); + mptcp_info2sockaddr(loc, &addr, ssk->sk_family);
addrlen = sizeof(struct sockaddr_in); #if IS_ENABLED(CONFIG_MPTCP_IPV6) - if (loc->family == AF_INET6) + if (addr.ss_family == AF_INET6) addrlen = sizeof(struct sockaddr_in6); #endif ssk->sk_bound_dev_if = loc->ifindex; @@@ -1243,16 -1184,13 +1275,16 @@@ subflow->remote_id = remote_id; subflow->request_join = 1; subflow->request_bkup = !!(loc->flags & MPTCP_PM_ADDR_FLAG_BACKUP); - mptcp_info2sockaddr(remote, &addr); + mptcp_info2sockaddr(remote, &addr, ssk->sk_family);
mptcp_add_pending_subflow(msk, subflow); err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK); if (err && err != -EINPROGRESS) goto failed_unlink;
+ /* discard the subflow socket */ + mptcp_sock_graft(ssk, sk->sk_socket); + iput(SOCK_INODE(sf)); return err;
failed_unlink: @@@ -1290,25 -1228,6 +1322,25 @@@ static void mptcp_attach_cgroup(struct #endif /* CONFIG_SOCK_CGROUP_DATA */ }
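Editorial note: the reworked mptcp_info2sockaddr() above keys the sockaddr family off the subflow socket rather than the stored address, converting between plain IPv4 and v4-mapped IPv6 forms as needed. A small userspace C sketch of the v4-to-v4-mapped direction, using only the standard sockets API (the manual byte fill is the userspace analogue of ipv6_addr_set_v4mapped()):

	#include <arpa/inet.h>
	#include <netinet/in.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		struct in_addr v4;
		struct sockaddr_in6 sin6;
		char buf[INET6_ADDRSTRLEN];

		inet_pton(AF_INET, "192.0.2.1", &v4);

		memset(&sin6, 0, sizeof(sin6));
		sin6.sin6_family = AF_INET6;
		/* build ::ffff:192.0.2.1 by hand: ten zero bytes, then
		 * 0xff 0xff, then the IPv4 address
		 */
		sin6.sin6_addr.s6_addr[10] = 0xff;
		sin6.sin6_addr.s6_addr[11] = 0xff;
		memcpy(&sin6.sin6_addr.s6_addr[12], &v4, 4);

		inet_ntop(AF_INET6, &sin6.sin6_addr, buf, sizeof(buf));
		printf("%s\n", buf);	/* ::ffff:192.0.2.1 */
		return 0;
	}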
+static void mptcp_subflow_ops_override(struct sock *ssk) +{ +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (ssk->sk_prot == &tcpv6_prot) + ssk->sk_prot = &tcpv6_prot_override; + else +#endif + ssk->sk_prot = &tcp_prot_override; +} + +static void mptcp_subflow_ops_undo_override(struct sock *ssk) +{ +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (ssk->sk_prot == &tcpv6_prot_override) + ssk->sk_prot = &tcpv6_prot; + else +#endif + ssk->sk_prot = &tcp_prot; +} int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) { struct mptcp_subflow_context *subflow; @@@ -1364,7 -1283,6 +1396,7 @@@ *new_sock = sf; sock_hold(sk); subflow->conn = sk; + mptcp_subflow_ops_override(sf->sk);
return 0; } @@@ -1381,7 -1299,6 +1413,7 @@@ static struct mptcp_subflow_context *su
rcu_assign_pointer(icsk->icsk_ulp_data, ctx); INIT_LIST_HEAD(&ctx->node); + INIT_LIST_HEAD(&ctx->delegated_node);
pr_debug("subflow=%p", ctx);
@@@ -1414,7 -1331,6 +1446,7 @@@ static void subflow_state_change(struc __subflow_state_change(sk);
if (subflow_simultaneous_connect(sk)) { + mptcp_propagate_sndbuf(parent, sk); mptcp_do_fallback(sk); mptcp_rcv_space_init(mptcp_sk(parent), sk); pr_fallback(mptcp_sk(parent)); @@@ -1432,8 -1348,6 +1464,8 @@@ if (mptcp_subflow_data_available(sk)) mptcp_data_ready(parent, sk);
+ subflow_sched_work_if_closed(mptcp_sk(parent), sk); + if (__mptcp_check_fallback(mptcp_sk(parent)) && !subflow->rx_eof && subflow_is_done(sk)) { subflow->rx_eof = 1; @@@ -1470,9 -1384,11 +1502,11 @@@ static int subflow_ulp_init(struct soc ctx->tcp_data_ready = sk->sk_data_ready; ctx->tcp_state_change = sk->sk_state_change; ctx->tcp_write_space = sk->sk_write_space; + ctx->tcp_error_report = sk->sk_error_report; sk->sk_data_ready = subflow_data_ready; sk->sk_write_space = subflow_write_space; sk->sk_state_change = subflow_state_change; + sk->sk_error_report = subflow_error_report; out: return err; } @@@ -1496,7 -1412,6 +1530,7 @@@ static void subflow_ulp_release(struct sock_put(sk); }
+ mptcp_subflow_ops_undo_override(ssk); if (release) kfree_rcu(ctx, rcu); } @@@ -1526,6 -1441,7 +1560,7 @@@ static void subflow_ulp_clone(const str new_ctx->tcp_data_ready = old_ctx->tcp_data_ready; new_ctx->tcp_state_change = old_ctx->tcp_state_change; new_ctx->tcp_write_space = old_ctx->tcp_write_space; + new_ctx->tcp_error_report = old_ctx->tcp_error_report; new_ctx->rel_write_seq = 1; new_ctx->tcp_sock = newsk;
@@@ -1550,16 -1466,6 +1585,16 @@@ } }
+static void tcp_release_cb_override(struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + + if (mptcp_subflow_has_delegated_action(subflow)) + mptcp_subflow_process_delegated(ssk); + + tcp_release_cb(ssk); +} + static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = { .name = "mptcp", .owner = THIS_MODULE, @@@ -1600,9 -1506,6 +1635,9 @@@ void __init mptcp_subflow_init(void subflow_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_specific.sk_rx_dst_set = subflow_finish_connect;
+ tcp_prot_override = tcp_prot; + tcp_prot_override.release_cb = tcp_release_cb_override; + #if IS_ENABLED(CONFIG_MPTCP_IPV6) subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req; @@@ -1618,9 -1521,6 +1653,9 @@@ subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len; subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced; subflow_v6m_specific.net_frag_header_len = 0; + + tcpv6_prot_override = tcpv6_prot; + tcpv6_prot_override.release_cb = tcp_release_cb_override; #endif
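Editorial note: tcp_prot_override and tcpv6_prot_override above are whole-struct copies of the stock proto tables with only release_cb swapped, so MPTCP subflows can intercept socket-lock release without perturbing plain TCP sockets. The generic shape of that idiom, as a hedged C sketch (toy struct and names, not the kernel's struct proto):

	#include <stdio.h>

	struct proto_ops {
		void (*release_cb)(void);
		/* ... many more callbacks in the real struct ... */
	};

	static void tcp_release_cb(void)
	{
		printf("stock release work\n");
	}

	static struct proto_ops tcp_ops = { .release_cb = tcp_release_cb };
	static struct proto_ops tcp_ops_override;

	static void release_cb_override(void)
	{
		printf("run delegated actions first\n");
		tcp_ops.release_cb();	/* then chain to the original */
	}

	int main(void)
	{
		tcp_ops_override = tcp_ops;	/* copy the whole table */
		tcp_ops_override.release_cb = release_cb_override;

		tcp_ops_override.release_cb();
		return 0;
	}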
mptcp_diag_subflow_init(&subflow_ulp_ops); diff --combined net/sched/act_api.c index 4dd235ce9a07,ebc8f1413078..b919826939e0 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@@ -908,7 -908,7 +908,7 @@@ static const struct nla_policy tcf_acti [TCA_ACT_HW_STATS] = NLA_POLICY_BITFIELD32(TCA_ACT_HW_STATS_ANY), };
- static void tcf_idr_insert_many(struct tc_action *actions[]) + void tcf_idr_insert_many(struct tc_action *actions[]) { int i;
@@@ -928,13 -928,19 +928,13 @@@ } }
-struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, - struct nlattr *nla, struct nlattr *est, - char *name, int ovr, int bind, - bool rtnl_held, - struct netlink_ext_ack *extack) +struct tc_action_ops *tc_action_load_ops(char *name, struct nlattr *nla, + bool rtnl_held, + struct netlink_ext_ack *extack) { - struct nla_bitfield32 flags = { 0, 0 }; - u8 hw_stats = TCA_ACT_HW_STATS_ANY; - struct tc_action *a; + struct nlattr *tb[TCA_ACT_MAX + 1]; struct tc_action_ops *a_o; - struct tc_cookie *cookie = NULL; char act_name[IFNAMSIZ]; - struct nlattr *tb[TCA_ACT_MAX + 1]; struct nlattr *kind; int err;
@@@ -942,21 -948,33 +942,21 @@@ err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, tcf_action_policy, extack); if (err < 0) - goto err_out; + return ERR_PTR(err); err = -EINVAL; kind = tb[TCA_ACT_KIND]; if (!kind) { NL_SET_ERR_MSG(extack, "TC action kind must be specified"); - goto err_out; + return ERR_PTR(err); } if (nla_strscpy(act_name, kind, IFNAMSIZ) < 0) { NL_SET_ERR_MSG(extack, "TC action name too long"); - goto err_out; + return ERR_PTR(err); } - if (tb[TCA_ACT_COOKIE]) { - cookie = nla_memdup_cookie(tb); - if (!cookie) { - NL_SET_ERR_MSG(extack, "No memory to generate TC cookie"); - err = -ENOMEM; - goto err_out; - } - } - hw_stats = tcf_action_hw_stats_get(tb[TCA_ACT_HW_STATS]); - if (tb[TCA_ACT_FLAGS]) - flags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]); } else { if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) { NL_SET_ERR_MSG(extack, "TC action name too long"); - err = -EINVAL; - goto err_out; + return ERR_PTR(-EINVAL); } }
@@@ -978,56 -996,24 +978,56 @@@ * indicate this using -EAGAIN. */ if (a_o != NULL) { - err = -EAGAIN; - goto err_mod; + module_put(a_o->owner); + return ERR_PTR(-EAGAIN); } #endif NL_SET_ERR_MSG(extack, "Failed to load TC action module"); - err = -ENOENT; - goto err_free; + return ERR_PTR(-ENOENT); }
+ return a_o; +} + +struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, + struct nlattr *nla, struct nlattr *est, + char *name, int ovr, int bind, + struct tc_action_ops *a_o, bool rtnl_held, + struct netlink_ext_ack *extack) +{ + struct nla_bitfield32 flags = { 0, 0 }; + u8 hw_stats = TCA_ACT_HW_STATS_ANY; + struct nlattr *tb[TCA_ACT_MAX + 1]; + struct tc_cookie *cookie = NULL; + struct tc_action *a; + int err; + /* backward compatibility for policer */ - if (name == NULL) + if (name == NULL) { + err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, + tcf_action_policy, extack); + if (err < 0) + return ERR_PTR(err); + if (tb[TCA_ACT_COOKIE]) { + cookie = nla_memdup_cookie(tb); + if (!cookie) { + NL_SET_ERR_MSG(extack, "No memory to generate TC cookie"); + err = -ENOMEM; + goto err_out; + } + } + hw_stats = tcf_action_hw_stats_get(tb[TCA_ACT_HW_STATS]); + if (tb[TCA_ACT_FLAGS]) + flags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]); + err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind, rtnl_held, tp, flags.value, extack); - else + } else { err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held, tp, flags.value, extack); + } if (err < 0) - goto err_mod; + goto err_out;
if (!name && tb[TCA_ACT_COOKIE]) tcf_set_action_cookie(&a->act_cookie, cookie); @@@ -1044,11 -1030,14 +1044,11 @@@
return a;
-err_mod: - module_put(a_o->owner); -err_free: +err_out: if (cookie) { kfree(cookie->data); kfree(cookie); } -err_out: return ERR_PTR(err); }
@@@ -1059,7 -1048,6 +1059,7 @@@ int tcf_action_init(struct net *net, st struct tc_action *actions[], size_t *attr_size, bool rtnl_held, struct netlink_ext_ack *extack) { + struct tc_action_ops *ops[TCA_ACT_MAX_PRIO] = {}; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; size_t sz = 0; @@@ -1071,20 -1059,9 +1071,20 @@@ if (err < 0) return err;
+ for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { + struct tc_action_ops *a_o; + + a_o = tc_action_load_ops(name, tb[i], rtnl_held, extack); + if (IS_ERR(a_o)) { + err = PTR_ERR(a_o); + goto err_mod; + } + ops[i - 1] = a_o; + } + for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind, - rtnl_held, extack); + ops[i - 1], rtnl_held, extack); if (IS_ERR(act)) { err = PTR_ERR(act); goto err; @@@ -1104,11 -1081,6 +1104,11 @@@
err: tcf_action_destroy(actions, bind); +err_mod: + for (i = 0; i < TCA_ACT_MAX_PRIO; i++) { + if (ops[i]) + module_put(ops[i]->owner); + } return err; }
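Editorial note: tcf_action_init() now splits module lookup from action initialization: every tc_action_ops is loaded up front via tc_action_load_ops() (which may drop RTNL to request a module), then the actions are initialized, and the err_mod path releases every module reference taken. A plain-C sketch of that acquire-all-then-rollback shape, with hypothetical names and the resource lifetime deliberately simplified (here references are always dropped at the end; in the kernel each initialized action keeps its own module reference):

	#include <stdlib.h>

	#define MAX_PRIO 4

	struct ops { int dummy; };		/* module-backed ops stand-in */

	static struct ops *load_ops(void)	/* tc_action_load_ops() stand-in */
	{
		return malloc(sizeof(struct ops));	/* NULL acts as ERR_PTR() */
	}

	static void put_ops(struct ops *o)	/* module_put() stand-in */
	{
		free(o);
	}

	static int init_all(void)
	{
		struct ops *ops[MAX_PRIO] = { 0 };
		int i, err = 0;

		/* phase 1: take every reference before touching state */
		for (i = 0; i < MAX_PRIO; i++) {
			ops[i] = load_ops();
			if (!ops[i]) {
				err = -1;
				goto err_mod;
			}
		}
		/* phase 2: ... initialize each action with ops[i] ... */
	err_mod:
		for (i = 0; i < MAX_PRIO; i++)
			if (ops[i])
				put_ops(ops[i]);	/* release what was taken */
		return err;
	}

	int main(void)
	{
		return init_all() ? 1 : 0;
	}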
diff --combined net/sched/cls_api.c index a67c66a512a4,0b3900dd2354..e37556cc37ab --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@@ -3043,23 -3043,17 +3043,24 @@@ int tcf_exts_validate(struct net *net, size_t attr_size = 0;
if (exts->police && tb[exts->police]) { + struct tc_action_ops *a_o; + + a_o = tc_action_load_ops("police", tb[exts->police], rtnl_held, extack); + if (IS_ERR(a_o)) + return PTR_ERR(a_o); act = tcf_action_init_1(net, tp, tb[exts->police], rate_tlv, "police", ovr, - TCA_ACT_BIND, rtnl_held, + TCA_ACT_BIND, a_o, rtnl_held, extack); - if (IS_ERR(act)) + if (IS_ERR(act)) { + module_put(a_o->owner); return PTR_ERR(act); + }
act->type = exts->type = TCA_OLD_COMPAT; exts->actions[0] = act; exts->nr_actions = 1; + tcf_idr_insert_many(exts->actions); } else if (exts->action && tb[exts->action]) { int err;
diff --combined net/sched/cls_flower.c index caf7643e9c83,46c1b3e9f66a..2409e522c68f --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@@ -30,6 -30,11 +30,11 @@@
#include <uapi/linux/netfilter/nf_conntrack_common.h>
+ #define TCA_FLOWER_KEY_CT_FLAGS_MAX \ + ((__TCA_FLOWER_KEY_CT_FLAGS_MAX - 1) << 1) + #define TCA_FLOWER_KEY_CT_FLAGS_MASK \ + (TCA_FLOWER_KEY_CT_FLAGS_MAX - 1) + struct fl_flow_key { struct flow_dissector_key_meta meta; struct flow_dissector_key_control control; @@@ -291,11 -296,9 +296,11 @@@ static u16 fl_ct_info_to_flower_map[] [IP_CT_RELATED] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | TCA_FLOWER_KEY_CT_FLAGS_RELATED, [IP_CT_ESTABLISHED_REPLY] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | - TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED, + TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED | + TCA_FLOWER_KEY_CT_FLAGS_REPLY, [IP_CT_RELATED_REPLY] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | - TCA_FLOWER_KEY_CT_FLAGS_RELATED, + TCA_FLOWER_KEY_CT_FLAGS_RELATED | + TCA_FLOWER_KEY_CT_FLAGS_REPLY, [IP_CT_NEW] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | TCA_FLOWER_KEY_CT_FLAGS_NEW, }; @@@ -304,7 -307,6 +309,7 @@@ static int fl_classify(struct sk_buff * struct tcf_result *res) { struct cls_fl_head *head = rcu_dereference_bh(tp->root); + bool post_ct = qdisc_skb_cb(skb)->post_ct; struct fl_flow_key skb_key; struct fl_flow_mask *mask; struct cls_fl_filter *f; @@@ -321,8 -323,7 +326,8 @@@ skb_flow_dissect_tunnel_info(skb, &mask->dissector, &skb_key); skb_flow_dissect_ct(skb, &mask->dissector, &skb_key, fl_ct_info_to_flower_map, - ARRAY_SIZE(fl_ct_info_to_flower_map)); + ARRAY_SIZE(fl_ct_info_to_flower_map), + post_ct); skb_flow_dissect_hash(skb, &mask->dissector, &skb_key); skb_flow_dissect(skb, &mask->dissector, &skb_key, 0);
@@@ -690,8 -691,10 +695,10 @@@ static const struct nla_policy fl_polic [TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ENC_OPTS] = { .type = NLA_NESTED }, [TCA_FLOWER_KEY_ENC_OPTS_MASK] = { .type = NLA_NESTED }, - [TCA_FLOWER_KEY_CT_STATE] = { .type = NLA_U16 }, - [TCA_FLOWER_KEY_CT_STATE_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_CT_STATE] = + NLA_POLICY_MASK(NLA_U16, TCA_FLOWER_KEY_CT_FLAGS_MASK), + [TCA_FLOWER_KEY_CT_STATE_MASK] = + NLA_POLICY_MASK(NLA_U16, TCA_FLOWER_KEY_CT_FLAGS_MASK), [TCA_FLOWER_KEY_CT_ZONE] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_CT_ZONE_MASK] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_CT_MARK] = { .type = NLA_U32 }, @@@ -1394,12 -1397,33 +1401,33 @@@ static int fl_set_enc_opt(struct nlatt return 0; }
+ static int fl_validate_ct_state(u16 state, struct nlattr *tb, + struct netlink_ext_ack *extack) + { + if (state && !(state & TCA_FLOWER_KEY_CT_FLAGS_TRACKED)) { + NL_SET_ERR_MSG_ATTR(extack, tb, + "no trk, so no other flag can be set"); + return -EINVAL; + } + + if (state & TCA_FLOWER_KEY_CT_FLAGS_NEW && + state & TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED) { + NL_SET_ERR_MSG_ATTR(extack, tb, + "new and est are mutually exclusive"); + return -EINVAL; + } + + return 0; + } + static int fl_set_key_ct(struct nlattr **tb, struct flow_dissector_key_ct *key, struct flow_dissector_key_ct *mask, struct netlink_ext_ack *extack) { if (tb[TCA_FLOWER_KEY_CT_STATE]) { + int err; + if (!IS_ENABLED(CONFIG_NF_CONNTRACK)) { NL_SET_ERR_MSG(extack, "Conntrack isn't enabled"); return -EOPNOTSUPP; @@@ -1407,6 -1431,13 +1435,13 @@@ fl_set_key_val(tb, &key->ct_state, TCA_FLOWER_KEY_CT_STATE, &mask->ct_state, TCA_FLOWER_KEY_CT_STATE_MASK, sizeof(key->ct_state)); + + err = fl_validate_ct_state(mask->ct_state, + tb[TCA_FLOWER_KEY_CT_STATE_MASK], + extack); + if (err) + return err; + } if (tb[TCA_FLOWER_KEY_CT_ZONE]) { if (!IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)) { diff --combined tools/testing/selftests/net/forwarding/tc_flower.sh index a554838666c4,b11d8e6b5bc1..4b58ccae3429 --- a/tools/testing/selftests/net/forwarding/tc_flower.sh +++ b/tools/testing/selftests/net/forwarding/tc_flower.sh @@@ -3,9 -3,7 +3,9 @@@
ALL_TESTS="match_dst_mac_test match_src_mac_test match_dst_ip_test \ match_src_ip_test match_ip_flags_test match_pcp_test match_vlan_test \ - match_ip_tos_test match_indev_test match_mpls_label_test \ - match_ip_tos_test match_indev_test match_ip_ttl_test" ++ match_ip_tos_test match_indev_test match_ip_ttl_test match_mpls_label_test \ + match_mpls_tc_test match_mpls_bos_test match_mpls_ttl_test \ + match_mpls_lse_test" NUM_NETIFS=2 source tc_common.sh source lib.sh @@@ -312,6 -310,42 +312,42 @@@ match_ip_tos_test( log_test "ip_tos match ($tcflags)" }
+ match_ip_ttl_test() + { + RET=0 + + tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \ + $tcflags dst_ip 192.0.2.2 ip_ttl 63 action drop + tc filter add dev $h2 ingress protocol ip pref 2 handle 102 flower \ + $tcflags dst_ip 192.0.2.2 action drop + + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \ + -t ip "ttl=63" -q + + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \ + -t ip "ttl=63,mf,frag=256" -q + + tc_check_packets "dev $h2 ingress" 102 1 + check_fail $? "Matched on the wrong filter (no check on ttl)" + + tc_check_packets "dev $h2 ingress" 101 2 + check_err $? "Did not match on correct filter (ttl=63)" + + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \ + -t ip "ttl=255" -q + + tc_check_packets "dev $h2 ingress" 101 3 + check_fail $? "Matched on a wrong filter (ttl=63)" + + tc_check_packets "dev $h2 ingress" 102 1 + check_err $? "Did not match on correct filter (no check on ttl)" + + tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower + tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower + + log_test "ip_ttl match ($tcflags)" + } + match_indev_test() { RET=0 @@@ -336,309 -370,6 +372,309 @@@ log_test "indev match ($tcflags)" }
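Editorial note: the mpls_lse() shell helper added in the next hunk packs a 20-bit label, 3-bit TC, 1-bit bottom-of-stack flag and 8-bit TTL into the 4-byte Label Stack Entry defined by RFC 3032. The same packing in C, as a cross-check of the shell arithmetic:

	#include <stdint.h>
	#include <stdio.h>

	/* RFC 3032 LSE layout: label(20) | tc(3) | bos(1) | ttl(8) */
	static uint32_t mpls_lse(uint32_t label, uint32_t tc, uint32_t bos,
				 uint32_t ttl)
	{
		return (label << 12) | (tc << 9) | (bos << 8) | ttl;
	}

	int main(void)
	{
		uint32_t lse = mpls_lse(1048575, 7, 1, 255);

		/* prints "ff ff ff ff", matching the shell helper's
		 * output for "mpls_lse 1048575 7 1 255"
		 */
		printf("%02x %02x %02x %02x\n",
		       lse >> 24, (lse >> 16) & 0xff,
		       (lse >> 8) & 0xff, lse & 0xff);
		return 0;
	}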
+# Unfortunately, mausezahn can't build MPLS headers when used in L2 +# mode, so we have this function to build Label Stack Entries. +mpls_lse() +{ + local label=$1 + local tc=$2 + local bos=$3 + local ttl=$4 + + printf "%02x %02x %02x %02x" \ + $((label >> 12)) \ + $((label >> 4 & 0xff)) \ + $((((label & 0xf) << 4) + (tc << 1) + bos)) \ + $ttl +} + +match_mpls_label_test() +{ + local ethtype="88 47"; readonly ethtype + local pkt + + RET=0 + + check_tc_mpls_support $h2 || return 0 + + tc filter add dev $h2 ingress protocol mpls_uc pref 1 handle 101 \ + flower $tcflags mpls_label 0 action drop + tc filter add dev $h2 ingress protocol mpls_uc pref 2 handle 102 \ + flower $tcflags mpls_label 1048575 action drop + + pkt="$ethtype $(mpls_lse 1048575 0 1 255)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + tc_check_packets "dev $h2 ingress" 101 1 + check_fail $? "Matched on a wrong filter (1048575)" + + tc_check_packets "dev $h2 ingress" 102 1 + check_err $? "Did not match on correct filter (1048575)" + + pkt="$ethtype $(mpls_lse 0 0 1 255)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + tc_check_packets "dev $h2 ingress" 102 2 + check_fail $? "Matched on a wrong filter (0)" + + tc_check_packets "dev $h2 ingress" 101 1 + check_err $? "Did not match on correct filter (0)" + + tc filter del dev $h2 ingress protocol mpls_uc pref 2 handle 102 flower + tc filter del dev $h2 ingress protocol mpls_uc pref 1 handle 101 flower + + log_test "mpls_label match ($tcflags)" +} + +match_mpls_tc_test() +{ + local ethtype="88 47"; readonly ethtype + local pkt + + RET=0 + + check_tc_mpls_support $h2 || return 0 + + tc filter add dev $h2 ingress protocol mpls_uc pref 1 handle 101 \ + flower $tcflags mpls_tc 0 action drop + tc filter add dev $h2 ingress protocol mpls_uc pref 2 handle 102 \ + flower $tcflags mpls_tc 7 action drop + + pkt="$ethtype $(mpls_lse 0 7 1 255)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + tc_check_packets "dev $h2 ingress" 101 1 + check_fail $? "Matched on a wrong filter (7)" + + tc_check_packets "dev $h2 ingress" 102 1 + check_err $? "Did not match on correct filter (7)" + + pkt="$ethtype $(mpls_lse 0 0 1 255)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + tc_check_packets "dev $h2 ingress" 102 2 + check_fail $? "Matched on a wrong filter (0)" + + tc_check_packets "dev $h2 ingress" 101 1 + check_err $? "Did not match on correct filter (0)" + + tc filter del dev $h2 ingress protocol mpls_uc pref 2 handle 102 flower + tc filter del dev $h2 ingress protocol mpls_uc pref 1 handle 101 flower + + log_test "mpls_tc match ($tcflags)" +} + +match_mpls_bos_test() +{ + local ethtype="88 47"; readonly ethtype + local pkt + + RET=0 + + check_tc_mpls_support $h2 || return 0 + + tc filter add dev $h2 ingress protocol mpls_uc pref 1 handle 101 \ + flower $tcflags mpls_bos 0 action drop + tc filter add dev $h2 ingress protocol mpls_uc pref 2 handle 102 \ + flower $tcflags mpls_bos 1 action drop + + pkt="$ethtype $(mpls_lse 0 0 1 255)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + tc_check_packets "dev $h2 ingress" 101 1 + check_fail $? "Matched on a wrong filter (1)" + + tc_check_packets "dev $h2 ingress" 102 1 + check_err $? "Did not match on correct filter (1)" + + # Need to add a second label to properly mark the Bottom of Stack + pkt="$ethtype $(mpls_lse 0 0 0 255) $(mpls_lse 0 0 1 255)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + tc_check_packets "dev $h2 ingress" 102 2 + check_fail $? 
"Matched on a wrong filter (0)" + + tc_check_packets "dev $h2 ingress" 101 1 + check_err $? "Did not match on correct filter (0)" + + tc filter del dev $h2 ingress protocol mpls_uc pref 2 handle 102 flower + tc filter del dev $h2 ingress protocol mpls_uc pref 1 handle 101 flower + + log_test "mpls_bos match ($tcflags)" +} + +match_mpls_ttl_test() +{ + local ethtype="88 47"; readonly ethtype + local pkt + + RET=0 + + check_tc_mpls_support $h2 || return 0 + + tc filter add dev $h2 ingress protocol mpls_uc pref 1 handle 101 \ + flower $tcflags mpls_ttl 0 action drop + tc filter add dev $h2 ingress protocol mpls_uc pref 2 handle 102 \ + flower $tcflags mpls_ttl 255 action drop + + pkt="$ethtype $(mpls_lse 0 0 1 255)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + tc_check_packets "dev $h2 ingress" 101 1 + check_fail $? "Matched on a wrong filter (255)" + + tc_check_packets "dev $h2 ingress" 102 1 + check_err $? "Did not match on correct filter (255)" + + pkt="$ethtype $(mpls_lse 0 0 1 0)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + tc_check_packets "dev $h2 ingress" 102 2 + check_fail $? "Matched on a wrong filter (0)" + + tc_check_packets "dev $h2 ingress" 101 1 + check_err $? "Did not match on correct filter (0)" + + tc filter del dev $h2 ingress protocol mpls_uc pref 2 handle 102 flower + tc filter del dev $h2 ingress protocol mpls_uc pref 1 handle 101 flower + + log_test "mpls_ttl match ($tcflags)" +} + +match_mpls_lse_test() +{ + local ethtype="88 47"; readonly ethtype + local pkt + + RET=0 + + check_tc_mpls_lse_stats $h2 || return 0 + + # Match on first LSE (minimal values for each field) + tc filter add dev $h2 ingress protocol mpls_uc pref 1 handle 101 \ + flower $tcflags mpls lse depth 1 label 0 action continue + tc filter add dev $h2 ingress protocol mpls_uc pref 2 handle 102 \ + flower $tcflags mpls lse depth 1 tc 0 action continue + tc filter add dev $h2 ingress protocol mpls_uc pref 3 handle 103 \ + flower $tcflags mpls lse depth 1 bos 0 action continue + tc filter add dev $h2 ingress protocol mpls_uc pref 4 handle 104 \ + flower $tcflags mpls lse depth 1 ttl 0 action continue + + # Match on second LSE (maximal values for each field) + tc filter add dev $h2 ingress protocol mpls_uc pref 5 handle 105 \ + flower $tcflags mpls lse depth 2 label 1048575 action continue + tc filter add dev $h2 ingress protocol mpls_uc pref 6 handle 106 \ + flower $tcflags mpls lse depth 2 tc 7 action continue + tc filter add dev $h2 ingress protocol mpls_uc pref 7 handle 107 \ + flower $tcflags mpls lse depth 2 bos 1 action continue + tc filter add dev $h2 ingress protocol mpls_uc pref 8 handle 108 \ + flower $tcflags mpls lse depth 2 ttl 255 action continue + + # Match on LSE depth + tc filter add dev $h2 ingress protocol mpls_uc pref 9 handle 109 \ + flower $tcflags mpls lse depth 1 action continue + tc filter add dev $h2 ingress protocol mpls_uc pref 10 handle 110 \ + flower $tcflags mpls lse depth 2 action continue + tc filter add dev $h2 ingress protocol mpls_uc pref 11 handle 111 \ + flower $tcflags mpls lse depth 3 action continue + + # Base packet, matched by all filters (except for stack depth 3) + pkt="$ethtype $(mpls_lse 0 0 0 0) $(mpls_lse 1048575 7 1 255)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + # Make a variant of the above packet, with a non-matching value + # for each LSE field + + # Wrong label at depth 1 + pkt="$ethtype $(mpls_lse 1 0 0 0) $(mpls_lse 1048575 7 1 255)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + # Wrong TC at depth 1 + 
pkt="$ethtype $(mpls_lse 0 1 0 0) $(mpls_lse 1048575 7 1 255)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + # Wrong BOS at depth 1 (not adding a second LSE here since BOS is set + # in the first label, so anything that'd follow wouldn't be considered) + pkt="$ethtype $(mpls_lse 0 0 1 0)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + # Wrong TTL at depth 1 + pkt="$ethtype $(mpls_lse 0 0 0 1) $(mpls_lse 1048575 7 1 255)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + # Wrong label at depth 2 + pkt="$ethtype $(mpls_lse 0 0 0 0) $(mpls_lse 1048574 7 1 255)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + # Wrong TC at depth 2 + pkt="$ethtype $(mpls_lse 0 0 0 0) $(mpls_lse 1048575 6 1 255)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + # Wrong BOS at depth 2 (adding a third LSE here since BOS isn't set in + # the second label) + pkt="$ethtype $(mpls_lse 0 0 0 0) $(mpls_lse 1048575 7 0 255)" + pkt="$pkt $(mpls_lse 0 0 1 255)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + # Wrong TTL at depth 2 + pkt="$ethtype $(mpls_lse 0 0 0 0) $(mpls_lse 1048575 7 1 254)" + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac "$pkt" -q + + # Filters working at depth 1 should match all packets but one + + tc_check_packets "dev $h2 ingress" 101 8 + check_err $? "Did not match on correct filter" + + tc_check_packets "dev $h2 ingress" 102 8 + check_err $? "Did not match on correct filter" + + tc_check_packets "dev $h2 ingress" 103 8 + check_err $? "Did not match on correct filter" + + tc_check_packets "dev $h2 ingress" 104 8 + check_err $? "Did not match on correct filter" + + # Filters working at depth 2 should match all packets but two (because + # of the test packet where the label stack depth is just one) + + tc_check_packets "dev $h2 ingress" 105 7 + check_err $? "Did not match on correct filter" + + tc_check_packets "dev $h2 ingress" 106 7 + check_err $? "Did not match on correct filter" + + tc_check_packets "dev $h2 ingress" 107 7 + check_err $? "Did not match on correct filter" + + tc_check_packets "dev $h2 ingress" 108 7 + check_err $? "Did not match on correct filter" + + # Finally, verify the filters that only match on LSE depth + + tc_check_packets "dev $h2 ingress" 109 9 + check_err $? "Did not match on correct filter" + + tc_check_packets "dev $h2 ingress" 110 8 + check_err $? "Did not match on correct filter" + + tc_check_packets "dev $h2 ingress" 111 1 + check_err $? "Did not match on correct filter" + + tc filter del dev $h2 ingress protocol mpls_uc pref 11 handle 111 flower + tc filter del dev $h2 ingress protocol mpls_uc pref 10 handle 110 flower + tc filter del dev $h2 ingress protocol mpls_uc pref 9 handle 109 flower + tc filter del dev $h2 ingress protocol mpls_uc pref 8 handle 108 flower + tc filter del dev $h2 ingress protocol mpls_uc pref 7 handle 107 flower + tc filter del dev $h2 ingress protocol mpls_uc pref 6 handle 106 flower + tc filter del dev $h2 ingress protocol mpls_uc pref 5 handle 105 flower + tc filter del dev $h2 ingress protocol mpls_uc pref 4 handle 104 flower + tc filter del dev $h2 ingress protocol mpls_uc pref 3 handle 103 flower + tc filter del dev $h2 ingress protocol mpls_uc pref 2 handle 102 flower + tc filter del dev $h2 ingress protocol mpls_uc pref 1 handle 101 flower + + log_test "mpls lse match ($tcflags)" +} + setup_prepare() { h1=${NETIFS[p1]}