The following commit has been merged in the master branch: commit 11c51ca1135ad53dc84366758f9b04f5ecf041a5 Merge: b63c757d26dccc6a0f62ece8a6ccd6613c09b895 8f36e00065436412a02d1f50ad77375bdb506300 Author: Stephen Rothwell sfr@canb.auug.org.au Date: Thu Dec 21 09:58:32 2017 +1100
Merge remote-tracking branch 'net-next/master'
diff --combined MAINTAINERS index 020f6313a30d,129c591e0f34..a904a6493b51 --- a/MAINTAINERS +++ b/MAINTAINERS @@@ -321,7 -321,7 +321,7 @@@ F: drivers/acpi/apei
ACPI COMPONENT ARCHITECTURE (ACPICA) M: Robert Moore robert.moore@intel.com -M: Lv Zheng lv.zheng@intel.com +M: Erik Schmauss erik.schmauss@intel.com M: "Rafael J. Wysocki" rafael.j.wysocki@intel.com L: linux-acpi@vger.kernel.org L: devel@acpica.org @@@ -1583,7 -1583,6 +1583,7 @@@ F: arch/arm/boot/dts/kirkwood F: arch/arm/configs/mvebu_*_defconfig F: arch/arm/mach-mvebu/ F: arch/arm64/boot/dts/marvell/armada* +F: drivers/cpufreq/armada-37xx-cpufreq.c F: drivers/cpufreq/mvebu-cpufreq.c F: drivers/irqchip/irq-armada-370-xp.c F: drivers/irqchip/irq-mvebu-* @@@ -2502,8 -2501,6 +2502,8 @@@ L: linux-arm-kernel@lists.infradead.or S: Maintained F: Documentation/devicetree/bindings/arm/axentia.txt F: arch/arm/boot/dts/at91-linea.dtsi +F: arch/arm/boot/dts/at91-natte.dtsi +F: arch/arm/boot/dts/at91-nattis-2-natte-2.dts F: arch/arm/boot/dts/at91-tse850-3.dts
AXENTIA ASOC DRIVERS @@@ -2692,7 -2689,6 +2692,6 @@@ F: drivers/mtd/devices/block2mtd.
BLUETOOTH DRIVERS M: Marcel Holtmann marcel@holtmann.org - M: Gustavo Padovan gustavo@padovan.org M: Johan Hedberg johan.hedberg@gmail.com L: linux-bluetooth@vger.kernel.org W: http://www.bluez.org/ @@@ -2703,7 -2699,6 +2702,6 @@@ F: drivers/bluetooth
BLUETOOTH SUBSYSTEM M: Marcel Holtmann marcel@holtmann.org - M: Gustavo Padovan gustavo@padovan.org M: Johan Hedberg johan.hedberg@gmail.com L: linux-bluetooth@vger.kernel.org W: http://www.bluez.org/ @@@ -2728,12 -2723,16 +2726,16 @@@ M: Alexei Starovoitov <ast@kernel.org M: Daniel Borkmann daniel@iogearbox.net L: netdev@vger.kernel.org L: linux-kernel@vger.kernel.org + T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git + T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git S: Supported F: arch/x86/net/bpf_jit* F: Documentation/networking/filter.txt F: Documentation/bpf/ F: include/linux/bpf* F: include/linux/filter.h + F: include/trace/events/bpf.h + F: include/trace/events/xdp.h F: include/uapi/linux/bpf* F: include/uapi/linux/filter.h F: kernel/bpf/ @@@ -2837,8 -2836,6 +2839,8 @@@ S: Maintaine F: arch/arm/mach-bcm/*brcmstb* F: arch/arm/boot/dts/bcm7*.dts* F: drivers/bus/brcmstb_gisb.c +F: arch/arm/mm/cache-b15-rac.c +F: arch/arm/include/asm/hardware/cache-b15-rac.h N: brcmstb
BROADCOM BMIPS CPUFREQ DRIVER @@@ -8686,15 -8683,6 +8688,15 @@@ T: git git://linuxtv.org/media_tree.gi S: Maintained F: drivers/media/dvb-frontends/stv6111*
+MEDIA DRIVERS FOR NVIDIA TEGRA - VDE +M: Dmitry Osipenko digetx@gmail.com +L: linux-media@vger.kernel.org +L: linux-tegra@vger.kernel.org +T: git git://linuxtv.org/media_tree.git +S: Maintained +F: Documentation/devicetree/bindings/media/nvidia,tegra-vde.txt +F: drivers/staging/media/tegra-vde/ + MEDIA INPUT INFRASTRUCTURE (V4L/DVB) M: Mauro Carvalho Chehab mchehab@s-opensource.com M: Mauro Carvalho Chehab mchehab@kernel.org @@@ -8738,6 -8726,13 +8740,13 @@@ L: netdev@vger.kernel.or S: Maintained F: drivers/net/ethernet/mediatek/
+ MEDIATEK SWITCH DRIVER + M: Sean Wang sean.wang@mediatek.com + L: netdev@vger.kernel.org + S: Maintained + F: drivers/net/dsa/mt7530.* + F: net/dsa/tag_mtk.c + MEDIATEK JPEG DRIVER M: Rick Chang rick.chang@mediatek.com M: Bin Liu bin.liu@mediatek.com @@@ -9614,6 -9609,11 +9623,11 @@@ NETWORKING [WIRELESS L: linux-wireless@vger.kernel.org Q: http://patchwork.kernel.org/project/linux-wireless/list/
+ NETDEVSIM + M: Jakub Kicinski jakub.kicinski@netronome.com + S: Maintained + F: drivers/net/netdevsim/* + NETXEN (1/10) GbE SUPPORT M: Manish Chopra manish.chopra@cavium.com M: Rahul Verma rahul.verma@cavium.com @@@ -10566,12 -10566,8 +10580,12 @@@ T: git git://git.kernel.org/pub/scm/lin S: Supported F: Documentation/devicetree/bindings/pci/ F: Documentation/PCI/ +F: drivers/acpi/pci* F: drivers/pci/ +F: include/asm-generic/pci* F: include/linux/pci* +F: include/uapi/linux/pci* +F: lib/pci* F: arch/x86/pci/ F: arch/x86/kernel/quirks.c
@@@ -10910,7 -10906,6 +10924,7 @@@ F: include/linux/pm. F: include/linux/pm_* F: include/linux/powercap.h F: drivers/powercap/ +F: kernel/configs/nopm.config
POWER STATE COORDINATION INTERFACE (PSCI) M: Mark Rutland mark.rutland@arm.com @@@ -12358,14 -12353,6 +12372,14 @@@ T: git git://linuxtv.org/anttip/media_t S: Maintained F: drivers/media/tuners/si2157*
+SI2165 MEDIA DRIVER +M: Matthias Schwarzott zzam@gentoo.org +L: linux-media@vger.kernel.org +W: https://linuxtv.org +Q: http://patchwork.linuxtv.org/project/linux-media/list/ +S: Maintained +F: drivers/media/dvb-frontends/si2165* + SI2168 MEDIA DRIVER M: Antti Palosaari crope@iki.fi L: linux-media@vger.kernel.org @@@ -12896,6 -12883,12 +12910,6 @@@ S: Odd Fixe F: Documentation/devicetree/bindings/staging/iio/ F: drivers/staging/iio/
-STAGING - LIRC (LINUX INFRARED REMOTE CONTROL) DRIVERS -M: Jarod Wilson jarod@wilsonet.com -W: http://www.lirc.org/ -S: Odd Fixes -F: drivers/staging/media/lirc/ - STAGING - LUSTRE PARALLEL FILESYSTEM M: Oleg Drokin oleg.drokin@intel.com M: Andreas Dilger andreas.dilger@intel.com @@@ -13277,15 -13270,6 +13291,15 @@@ T: git git://linuxtv.org/anttip/media_t S: Maintained F: drivers/media/tuners/tda18218*
+TDA18250 MEDIA DRIVER +M: Olli Salonen olli.salonen@iki.fi +L: linux-media@vger.kernel.org +W: https://linuxtv.org +Q: http://patchwork.linuxtv.org/project/linux-media/list/ +T: git git://linuxtv.org/media_tree.git +S: Maintained +F: drivers/media/tuners/tda18250* + TDA18271 MEDIA DRIVER M: Michael Krufky mkrufky@linuxtv.org L: linux-media@vger.kernel.org @@@ -13523,7 -13507,6 +13537,7 @@@ M: Mika Westerberg <mika.westerberg@lin M: Yehezkel Bernat yehezkel.bernat@intel.com T: git git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git S: Maintained +F: Documentation/admin-guide/thunderbolt.rst F: drivers/thunderbolt/ F: include/linux/thunderbolt.h
diff --combined arch/arm/boot/dts/imx25.dtsi index c43cf704b768,fcaff1c66bcb..9445f8e1473c --- a/arch/arm/boot/dts/imx25.dtsi +++ b/arch/arm/boot/dts/imx25.dtsi @@@ -122,7 -122,7 +122,7 @@@ };
can1: can@43f88000 { - compatible = "fsl,imx25-flexcan", "fsl,p1010-flexcan"; + compatible = "fsl,imx25-flexcan"; reg = <0x43f88000 0x4000>; interrupts = <43>; clocks = <&clks 75>, <&clks 75>; @@@ -131,7 -131,7 +131,7 @@@ };
can2: can@43f8c000 { - compatible = "fsl,imx25-flexcan", "fsl,p1010-flexcan"; + compatible = "fsl,imx25-flexcan"; reg = <0x43f8c000 0x4000>; interrupts = <44>; clocks = <&clks 76>, <&clks 76>; @@@ -628,13 -628,11 +628,13 @@@ usbphy0: usb-phy@0 { reg = <0>; compatible = "usb-nop-xceiv"; + #phy-cells = <0>; };
usbphy1: usb-phy@1 { reg = <1>; compatible = "usb-nop-xceiv"; + #phy-cells = <0>; }; }; }; diff --combined arch/arm/boot/dts/imx35.dtsi index f049c692c6b0,1f0e2203b576..e08c0c193767 --- a/arch/arm/boot/dts/imx35.dtsi +++ b/arch/arm/boot/dts/imx35.dtsi @@@ -303,7 -303,7 +303,7 @@@ };
can1: can@53fe4000 { - compatible = "fsl,imx35-flexcan", "fsl,p1010-flexcan"; + compatible = "fsl,imx35-flexcan"; reg = <0x53fe4000 0x1000>; clocks = <&clks 33>, <&clks 33>; clock-names = "ipg", "per"; @@@ -312,7 -312,7 +312,7 @@@ };
can2: can@53fe8000 { - compatible = "fsl,imx35-flexcan", "fsl,p1010-flexcan"; + compatible = "fsl,imx35-flexcan"; reg = <0x53fe8000 0x1000>; clocks = <&clks 34>, <&clks 34>; clock-names = "ipg", "per"; @@@ -402,13 -402,11 +402,13 @@@ usbphy0: usb-phy@0 { reg = <0>; compatible = "usb-nop-xceiv"; + #phy-cells = <0>; };
usbphy1: usb-phy@1 { reg = <1>; compatible = "usb-nop-xceiv"; + #phy-cells = <0>; }; }; }; diff --combined arch/arm/boot/dts/imx53.dtsi index fb6cdd629ee7,85071ff8c639..d55b0755a36e --- a/arch/arm/boot/dts/imx53.dtsi +++ b/arch/arm/boot/dts/imx53.dtsi @@@ -303,7 -303,6 +303,7 @@@ compatible = "usb-nop-xceiv"; clocks = <&clks IMX5_CLK_USB_PHY1_GATE>; clock-names = "main_clk"; + #phy-cells = <0>; status = "okay"; };
@@@ -311,7 -310,6 +311,7 @@@ compatible = "usb-nop-xceiv"; clocks = <&clks IMX5_CLK_USB_PHY2_GATE>; clock-names = "main_clk"; + #phy-cells = <0>; status = "okay"; };
@@@ -538,7 -536,7 +538,7 @@@ };
can1: can@53fc8000 { - compatible = "fsl,imx53-flexcan", "fsl,p1010-flexcan"; + compatible = "fsl,imx53-flexcan"; reg = <0x53fc8000 0x4000>; interrupts = <82>; clocks = <&clks IMX5_CLK_CAN1_IPG_GATE>, @@@ -548,7 -546,7 +548,7 @@@ };
can2: can@53fcc000 { - compatible = "fsl,imx53-flexcan", "fsl,p1010-flexcan"; + compatible = "fsl,imx53-flexcan"; reg = <0x53fcc000 0x4000>; interrupts = <83>; clocks = <&clks IMX5_CLK_CAN2_IPG_GATE>, diff --combined arch/arm/boot/dts/ls1021a-twr.dts index f7946f40d35d,7202d9c504be..3adf79372057 --- a/arch/arm/boot/dts/ls1021a-twr.dts +++ b/arch/arm/boot/dts/ls1021a-twr.dts @@@ -228,10 -228,6 +228,10 @@@ }; };
+&esdhc { + status = "okay"; +}; + &sai1 { status = "okay"; }; @@@ -247,3 -243,19 +247,19 @@@ &uart1 { status = "okay"; }; + + &can0 { + status = "okay"; + }; + + &can1 { + status = "okay"; + }; + + &can2 { + status = "disabled"; + }; + + &can3 { + status = "disabled"; + }; diff --combined arch/arm/boot/dts/ls1021a.dtsi index 64249726b3cb,7789031898b0..a121c9130271 --- a/arch/arm/boot/dts/ls1021a.dtsi +++ b/arch/arm/boot/dts/ls1021a.dtsi @@@ -154,22 -154,8 +154,22 @@@ big-endian; };
+ qspi: quadspi@1550000 { + compatible = "fsl,ls1021a-qspi"; + #address-cells = <1>; + #size-cells = <0>; + reg = <0x0 0x1550000 0x0 0x10000>, + <0x0 0x40000000 0x0 0x40000000>; + reg-names = "QuadSPI", "QuadSPI-memory"; + interrupts = <GIC_SPI 131 IRQ_TYPE_LEVEL_HIGH>; + clock-names = "qspi_en", "qspi"; + clocks = <&clockgen 4 1>, <&clockgen 4 1>; + big-endian; + status = "disabled"; + }; + esdhc: esdhc@1560000 { - compatible = "fsl,esdhc"; + compatible = "fsl,ls1021a-esdhc", "fsl,esdhc"; reg = <0x0 0x1560000 0x0 0x10000>; interrupts = <GIC_SPI 94 IRQ_TYPE_LEVEL_HIGH>; clock-frequency = <0>; @@@ -744,5 -730,41 +744,41 @@@ <0000 0 0 3 &gic GIC_SPI 191 IRQ_TYPE_LEVEL_HIGH>, <0000 0 0 4 &gic GIC_SPI 193 IRQ_TYPE_LEVEL_HIGH>; }; + + can0: can@2a70000 { + compatible = "fsl,ls1021ar2-flexcan"; + reg = <0x0 0x2a70000 0x0 0x1000>; + interrupts = <GIC_SPI 126 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clockgen 4 1>, <&clockgen 4 1>; + clock-names = "ipg", "per"; + big-endian; + }; + + can1: can@2a80000 { + compatible = "fsl,ls1021ar2-flexcan"; + reg = <0x0 0x2a80000 0x0 0x1000>; + interrupts = <GIC_SPI 127 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clockgen 4 1>, <&clockgen 4 1>; + clock-names = "ipg", "per"; + big-endian; + }; + + can2: can@2a90000 { + compatible = "fsl,ls1021ar2-flexcan"; + reg = <0x0 0x2a90000 0x0 0x1000>; + interrupts = <GIC_SPI 128 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clockgen 4 1>, <&clockgen 4 1>; + clock-names = "ipg", "per"; + big-endian; + }; + + can3: can@2aa0000 { + compatible = "fsl,ls1021ar2-flexcan"; + reg = <0x0 0x2aa0000 0x0 0x1000>; + interrupts = <GIC_SPI 129 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clockgen 4 1>, <&clockgen 4 1>; + clock-names = "ipg", "per"; + big-endian; + }; }; }; diff --combined arch/powerpc/net/bpf_jit_comp64.c index d183b4801bdb,d5a5bc43cf8f..6771c63b2bec --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@@ -763,8 -763,7 +763,8 @@@ emit_clear func = (u8 *) __bpf_call_base + imm;
/* Save skb pointer if we need to re-cache skb data */ - if (bpf_helper_changes_pkt_data(func)) + if ((ctx->seen & SEEN_SKB) && + bpf_helper_changes_pkt_data(func)) PPC_BPF_STL(3, 1, bpf_jit_stack_local(ctx));
bpf_jit_emit_func_call(image, ctx, (u64)func); @@@ -773,8 -772,7 +773,8 @@@ PPC_MR(b2p[BPF_REG_0], 3);
/* refresh skb cache */ - if (bpf_helper_changes_pkt_data(func)) { + if ((ctx->seen & SEEN_SKB) && + bpf_helper_changes_pkt_data(func)) { /* reload skb pointer to r3 */ PPC_BPF_LL(3, 1, bpf_jit_stack_local(ctx)); bpf_jit_emit_skb_loads(image, ctx); @@@ -995,7 -993,7 +995,7 @@@ struct bpf_prog *bpf_int_jit_compile(st struct bpf_prog *tmp_fp; bool bpf_blinded = false;
- if (!bpf_jit_enable) + if (!fp->jit_requested) return org_fp;
tmp_fp = bpf_jit_blind_constants(org_fp); diff --combined arch/s390/net/bpf_jit_comp.c index 9557d8b516df,f4baa8c514d3..1dfadbd126f3 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@@ -55,7 -55,8 +55,7 @@@ struct bpf_jit #define SEEN_LITERAL 8 /* code uses literals */ #define SEEN_FUNC 16 /* calls C functions */ #define SEEN_TAIL_CALL 32 /* code uses tail calls */ -#define SEEN_SKB_CHANGE 64 /* code changes skb data */ -#define SEEN_REG_AX 128 /* code uses constant blinding */ +#define SEEN_REG_AX 64 /* code uses constant blinding */ #define SEEN_STACK (SEEN_FUNC | SEEN_MEM | SEEN_SKB)
/* @@@ -447,12 -448,12 +447,12 @@@ static void bpf_jit_prologue(struct bpf EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0, REG_15, 152); } - if (jit->seen & SEEN_SKB) + if (jit->seen & SEEN_SKB) { emit_load_skb_data_hlen(jit); - if (jit->seen & SEEN_SKB_CHANGE) /* stg %b1,ST_OFF_SKBP(%r0,%r15) */ EMIT6_DISP_LH(0xe3000000, 0x0024, BPF_REG_1, REG_0, REG_15, STK_OFF_SKBP); + } }
/* @@@ -982,8 -983,8 +982,8 @@@ static noinline int bpf_jit_insn(struc EMIT2(0x0d00, REG_14, REG_W1); /* lgr %b0,%r2: load return value into %b0 */ EMIT4(0xb9040000, BPF_REG_0, REG_2); - if (bpf_helper_changes_pkt_data((void *)func)) { - jit->seen |= SEEN_SKB_CHANGE; + if ((jit->seen & SEEN_SKB) && + bpf_helper_changes_pkt_data((void *)func)) { /* lg %b1,ST_OFF_SKBP(%r15) */ EMIT6_DISP_LH(0xe3000000, 0x0004, BPF_REG_1, REG_0, REG_15, STK_OFF_SKBP); @@@ -1299,7 -1300,7 +1299,7 @@@ struct bpf_prog *bpf_int_jit_compile(st struct bpf_jit jit; int pass;
- if (!bpf_jit_enable) + if (!fp->jit_requested) return orig_fp;
tmp = bpf_jit_blind_constants(fp); diff --combined arch/sparc/net/bpf_jit_comp_64.c index ff5f9cb3039a,a2f1b5e774a7..22aff21fa44d --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@@ -1245,16 -1245,14 +1245,16 @@@ static int build_insn(const struct bpf_ u8 *func = ((u8 *)__bpf_call_base) + imm;
ctx->saw_call = true; + if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func)) + emit_reg_move(bpf2sparc[BPF_REG_1], L7, ctx);
emit_call((u32 *)func, ctx); emit_nop(ctx);
emit_reg_move(O0, bpf2sparc[BPF_REG_0], ctx);
- if (bpf_helper_changes_pkt_data(func) && ctx->saw_ld_abs_ind) - load_skb_regs(ctx, bpf2sparc[BPF_REG_6]); + if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func)) + load_skb_regs(ctx, L7); break; }
@@@ -1519,7 -1517,7 +1519,7 @@@ struct bpf_prog *bpf_int_jit_compile(st u8 *image_ptr; int pass;
- if (!bpf_jit_enable) + if (!prog->jit_requested) return orig_prog;
tmp = bpf_jit_blind_constants(prog); diff --combined drivers/net/ethernet/mellanox/mlx5/core/en_main.c index d9d8227f195f,0f5c012de52e..3aa1c90e7c86 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@@ -71,6 -71,11 +71,6 @@@ struct mlx5e_channel_param struct mlx5e_cq_param icosq_cq; };
-static int mlx5e_get_node(struct mlx5e_priv *priv, int ix) -{ - return pci_irq_get_node(priv->mdev->pdev, MLX5_EQ_VEC_COMP_BASE + ix); -} - static bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev) { return MLX5_CAP_GEN(mdev, striding_rq) && @@@ -78,8 -83,8 +78,8 @@@ MLX5_CAP_ETH(mdev, reg_umr_sq); }
-void mlx5e_set_rq_type_params(struct mlx5_core_dev *mdev, - struct mlx5e_params *params, u8 rq_type) +void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, u8 rq_type) { params->rq_wq_type = rq_type; params->lro_wqe_sz = MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ; @@@ -88,8 -93,10 +88,8 @@@ params->log_rq_size = is_kdump_kernel() ? MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW : MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE_MPW; - params->mpwqe_log_stride_sz = - MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS) ? - MLX5_MPWRQ_CQE_CMPRS_LOG_STRIDE_SZ(mdev) : - MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(mdev); + params->mpwqe_log_stride_sz = MLX5E_MPWQE_STRIDE_SZ(mdev, + MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS)); params->mpwqe_log_num_strides = MLX5_MPWRQ_LOG_WQE_SZ - params->mpwqe_log_stride_sz; break; @@@ -113,14 -120,13 +113,14 @@@ MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS)); }
-static void mlx5e_set_rq_params(struct mlx5_core_dev *mdev, struct mlx5e_params *params) +static void mlx5e_set_rq_params(struct mlx5_core_dev *mdev, + struct mlx5e_params *params) { u8 rq_type = mlx5e_check_fragmented_striding_rq_cap(mdev) && !params->xdp_prog && !MLX5_IPSEC_DEV(mdev) ? MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ : MLX5_WQ_TYPE_LINKED_LIST; - mlx5e_set_rq_type_params(mdev, params, rq_type); + mlx5e_init_rq_type_params(mdev, params, rq_type); }
static void mlx5e_update_carrier(struct mlx5e_priv *priv) @@@ -438,16 -444,17 +438,16 @@@ static int mlx5e_rq_alloc_mpwqe_info(st int wq_sz = mlx5_wq_ll_get_size(&rq->wq); int mtt_sz = mlx5e_get_wqe_mtt_sz(); int mtt_alloc = mtt_sz + MLX5_UMR_ALIGN - 1; - int node = mlx5e_get_node(c->priv, c->ix); int i;
rq->mpwqe.info = kzalloc_node(wq_sz * sizeof(*rq->mpwqe.info), - GFP_KERNEL, node); + GFP_KERNEL, cpu_to_node(c->cpu)); if (!rq->mpwqe.info) goto err_out;
/* We allocate more than mtt_sz as we will align the pointer */ - rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz, - GFP_KERNEL, node); + rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz, GFP_KERNEL, + cpu_to_node(c->cpu)); if (unlikely(!rq->mpwqe.mtt_no_align)) goto err_free_wqe_info;
@@@ -555,7 -562,7 +555,7 @@@ static int mlx5e_alloc_rq(struct mlx5e_ int err; int i;
- rqp->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); + rqp->wq.db_numa_node = cpu_to_node(c->cpu);
err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->wq, &rq->wq_ctrl); @@@ -622,7 -629,8 +622,7 @@@ default: /* MLX5_WQ_TYPE_LINKED_LIST */ rq->wqe.frag_info = kzalloc_node(wq_sz * sizeof(*rq->wqe.frag_info), - GFP_KERNEL, - mlx5e_get_node(c->priv, c->ix)); + GFP_KERNEL, cpu_to_node(c->cpu)); if (!rq->wqe.frag_info) { err = -ENOMEM; goto err_rq_wq_destroy; @@@ -992,13 -1000,13 +992,13 @@@ static int mlx5e_alloc_xdpsq(struct mlx sq->uar_map = mdev->mlx5e_res.bfreg.map; sq->min_inline_mode = params->tx_min_inline_mode;
- param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); + param->wq.db_numa_node = cpu_to_node(c->cpu); err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); if (err) return err; sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
- err = mlx5e_alloc_xdpsq_db(sq, mlx5e_get_node(c->priv, c->ix)); + err = mlx5e_alloc_xdpsq_db(sq, cpu_to_node(c->cpu)); if (err) goto err_sq_wq_destroy;
@@@ -1045,13 -1053,13 +1045,13 @@@ static int mlx5e_alloc_icosq(struct mlx sq->channel = c; sq->uar_map = mdev->mlx5e_res.bfreg.map;
- param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); + param->wq.db_numa_node = cpu_to_node(c->cpu); err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); if (err) return err; sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
- err = mlx5e_alloc_icosq_db(sq, mlx5e_get_node(c->priv, c->ix)); + err = mlx5e_alloc_icosq_db(sq, cpu_to_node(c->cpu)); if (err) goto err_sq_wq_destroy;
@@@ -1118,13 -1126,13 +1118,13 @@@ static int mlx5e_alloc_txqsq(struct mlx if (MLX5_IPSEC_DEV(c->priv->mdev)) set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state);
- param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); + param->wq.db_numa_node = cpu_to_node(c->cpu); err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); if (err) return err; sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
- err = mlx5e_alloc_txqsq_db(sq, mlx5e_get_node(c->priv, c->ix)); + err = mlx5e_alloc_txqsq_db(sq, cpu_to_node(c->cpu)); if (err) goto err_sq_wq_destroy;
@@@ -1496,8 -1504,8 +1496,8 @@@ static int mlx5e_alloc_cq(struct mlx5e_ struct mlx5_core_dev *mdev = c->priv->mdev; int err;
- param->wq.buf_numa_node = mlx5e_get_node(c->priv, c->ix); - param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); + param->wq.buf_numa_node = cpu_to_node(c->cpu); + param->wq.db_numa_node = cpu_to_node(c->cpu); param->eq_ix = c->ix;
err = mlx5e_alloc_cq_common(mdev, param, cq); @@@ -1596,11 -1604,6 +1596,11 @@@ static void mlx5e_close_cq(struct mlx5e mlx5e_free_cq(cq); }
+static int mlx5e_get_cpu(struct mlx5e_priv *priv, int ix) +{ + return cpumask_first(priv->mdev->priv.irq_info[ix].mask); +} + static int mlx5e_open_tx_cqs(struct mlx5e_channel *c, struct mlx5e_params *params, struct mlx5e_channel_param *cparam) @@@ -1749,13 -1752,12 +1749,13 @@@ static int mlx5e_open_channel(struct ml { struct mlx5e_cq_moder icocq_moder = {0, 0}; struct net_device *netdev = priv->netdev; + int cpu = mlx5e_get_cpu(priv, ix); struct mlx5e_channel *c; unsigned int irq; int err; int eqn;
- c = kzalloc_node(sizeof(*c), GFP_KERNEL, mlx5e_get_node(priv, ix)); + c = kzalloc_node(sizeof(*c), GFP_KERNEL, cpu_to_node(cpu)); if (!c) return -ENOMEM;
@@@ -1763,7 -1765,6 +1763,7 @@@ c->mdev = priv->mdev; c->tstamp = &priv->tstamp; c->ix = ix; + c->cpu = cpu; c->pdev = &priv->mdev->pdev->dev; c->netdev = priv->netdev; c->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key); @@@ -1852,7 -1853,8 +1852,7 @@@ static void mlx5e_activate_channel(stru for (tc = 0; tc < c->num_tc; tc++) mlx5e_activate_txqsq(&c->sq[tc]); mlx5e_activate_rq(&c->rq); - netif_set_xps_queue(c->netdev, - mlx5_get_vector_affinity(c->priv->mdev, c->ix), c->ix); + netif_set_xps_queue(c->netdev, get_cpu_mask(c->cpu), c->ix); }
static void mlx5e_deactivate_channel(struct mlx5e_channel *c) @@@ -3677,7 -3679,6 +3677,7 @@@ static netdev_features_t mlx5e_tunnel_f struct sk_buff *skb, netdev_features_t features) { + unsigned int offset = 0; struct udphdr *udph; u8 proto; u16 port; @@@ -3687,7 -3688,7 +3687,7 @@@ proto = ip_hdr(skb)->protocol; break; case htons(ETH_P_IPV6): - proto = ipv6_hdr(skb)->nexthdr; + proto = ipv6_find_hdr(skb, &offset, -1, NULL, NULL); break; default: goto out; @@@ -4307,9 -4308,6 +4307,6 @@@ static void mlx5e_nic_cleanup(struct ml { mlx5e_ipsec_cleanup(priv); mlx5e_vxlan_cleanup(priv); - - if (priv->channels.params.xdp_prog) - bpf_prog_put(priv->channels.params.xdp_prog); }
static int mlx5e_init_nic_rx(struct mlx5e_priv *priv) diff --combined drivers/net/ethernet/netronome/nfp/bpf/main.c index 13190aa09faf,4f6553f01178..348471fae6a2 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@@ -34,10 -34,12 +34,12 @@@ #include <net/pkt_cls.h>
#include "../nfpcore/nfp_cpp.h" + #include "../nfpcore/nfp_nffw.h" #include "../nfp_app.h" #include "../nfp_main.h" #include "../nfp_net.h" #include "../nfp_port.h" + #include "fw.h" #include "main.h"
static bool nfp_net_ebpf_capable(struct nfp_net *nn) @@@ -82,43 -84,11 +84,11 @@@ static const char *nfp_bpf_extra_cap(st return nfp_net_ebpf_capable(nn) ? "BPF" : ""; }
- static int - nfp_bpf_vnic_alloc(struct nfp_app *app, struct nfp_net *nn, unsigned int id) - { - int err; - - nn->app_priv = kzalloc(sizeof(struct nfp_bpf_vnic), GFP_KERNEL); - if (!nn->app_priv) - return -ENOMEM; - - err = nfp_app_nic_vnic_alloc(app, nn, id); - if (err) - goto err_free_priv; - - return 0; - err_free_priv: - kfree(nn->app_priv); - return err; - } - - static void nfp_bpf_vnic_free(struct nfp_app *app, struct nfp_net *nn) - { - struct nfp_bpf_vnic *bv = nn->app_priv; - - if (nn->dp.bpf_offload_xdp) - nfp_bpf_xdp_offload(app, nn, NULL); - WARN_ON(bv->tc_prog); - kfree(bv); - } - static int nfp_bpf_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv) { struct tc_cls_bpf_offload *cls_bpf = type_data; struct nfp_net *nn = cb_priv; - struct bpf_prog *oldprog; - struct nfp_bpf_vnic *bv; - int err;
if (type != TC_SETUP_CLSBPF || !tc_can_offload(nn->dp.netdev) || @@@ -126,6 -96,8 +96,8 @@@ cls_bpf->common.protocol != htons(ETH_P_ALL) || cls_bpf->common.chain_index) return -EOPNOTSUPP; + if (nn->dp.bpf_offload_xdp) + return -EBUSY;
/* Only support TC direct action */ if (!cls_bpf->exts_integrated || @@@ -134,25 -106,16 +106,10 @@@ return -EOPNOTSUPP; }
- switch (cls_bpf->command) { - case TC_CLSBPF_REPLACE: - return nfp_net_bpf_offload(nn, cls_bpf->prog, true); - case TC_CLSBPF_ADD: - return nfp_net_bpf_offload(nn, cls_bpf->prog, false); - case TC_CLSBPF_DESTROY: - return nfp_net_bpf_offload(nn, NULL, true); - default: + if (cls_bpf->command != TC_CLSBPF_OFFLOAD) return -EOPNOTSUPP; - } + - bv = nn->app_priv; - oldprog = cls_bpf->oldprog; - - /* Don't remove if oldprog doesn't match driver's state */ - if (bv->tc_prog != oldprog) { - oldprog = NULL; - if (!cls_bpf->prog) - return 0; - } - - err = nfp_net_bpf_offload(nn, cls_bpf->prog, oldprog); - if (err) - return err; - - bv->tc_prog = cls_bpf->prog; - return 0; ++ return nfp_net_bpf_offload(nn, cls_bpf->prog, cls_bpf->oldprog); }
static int nfp_bpf_setup_tc_block(struct net_device *netdev, @@@ -194,14 -157,126 +151,126 @@@ static bool nfp_bpf_tc_busy(struct nfp_ return nn->dp.ctrl & NFP_NET_CFG_CTRL_BPF; }
+ static int + nfp_bpf_parse_cap_adjust_head(struct nfp_app_bpf *bpf, void __iomem *value, + u32 length) + { + struct nfp_bpf_cap_tlv_adjust_head __iomem *cap = value; + struct nfp_cpp *cpp = bpf->app->pf->cpp; + + if (length < sizeof(*cap)) { + nfp_err(cpp, "truncated adjust_head TLV: %d\n", length); + return -EINVAL; + } + + bpf->adjust_head.flags = readl(&cap->flags); + bpf->adjust_head.off_min = readl(&cap->off_min); + bpf->adjust_head.off_max = readl(&cap->off_max); + bpf->adjust_head.guaranteed_sub = readl(&cap->guaranteed_sub); + bpf->adjust_head.guaranteed_add = readl(&cap->guaranteed_add); + + if (bpf->adjust_head.off_min > bpf->adjust_head.off_max) { + nfp_err(cpp, "invalid adjust_head TLV: min > max\n"); + return -EINVAL; + } + if (!FIELD_FIT(UR_REG_IMM_MAX, bpf->adjust_head.off_min) || + !FIELD_FIT(UR_REG_IMM_MAX, bpf->adjust_head.off_max)) { + nfp_warn(cpp, "disabling adjust_head - driver expects min/max to fit in as immediates\n"); + memset(&bpf->adjust_head, 0, sizeof(bpf->adjust_head)); + return 0; + } + + return 0; + } + + static int nfp_bpf_parse_capabilities(struct nfp_app *app) + { + struct nfp_cpp *cpp = app->pf->cpp; + struct nfp_cpp_area *area; + u8 __iomem *mem, *start; + + mem = nfp_rtsym_map(app->pf->rtbl, "_abi_bpf_capabilities", "bpf.cap", + 8, &area); + if (IS_ERR(mem)) + return PTR_ERR(mem) == -ENOENT ? 
0 : PTR_ERR(mem); + + start = mem; + while (mem - start + 8 < nfp_cpp_area_size(area)) { + u8 __iomem *value; + u32 type, length; + + type = readl(mem); + length = readl(mem + 4); + value = mem + 8; + + mem += 8 + length; + if (mem - start > nfp_cpp_area_size(area)) + goto err_release_free; + + switch (type) { + case NFP_BPF_CAP_TYPE_ADJUST_HEAD: + if (nfp_bpf_parse_cap_adjust_head(app->priv, value, + length)) + goto err_release_free; + break; + default: + nfp_dbg(cpp, "unknown BPF capability: %d\n", type); + break; + } + } + if (mem - start != nfp_cpp_area_size(area)) { + nfp_err(cpp, "BPF capabilities left after parsing, parsed:%zd total length:%zu\n", + mem - start, nfp_cpp_area_size(area)); + goto err_release_free; + } + + nfp_cpp_area_release_free(area); + + return 0; + + err_release_free: + nfp_err(cpp, "invalid BPF capabilities at offset:%zd\n", mem - start); + nfp_cpp_area_release_free(area); + return -EINVAL; + } + + static int nfp_bpf_init(struct nfp_app *app) + { + struct nfp_app_bpf *bpf; + int err; + + bpf = kzalloc(sizeof(*bpf), GFP_KERNEL); + if (!bpf) + return -ENOMEM; + bpf->app = app; + app->priv = bpf; + + err = nfp_bpf_parse_capabilities(app); + if (err) + goto err_free_bpf; + + return 0; + + err_free_bpf: + kfree(bpf); + return err; + } + + static void nfp_bpf_clean(struct nfp_app *app) + { + kfree(app->priv); + } + const struct nfp_app_type app_bpf = { .id = NFP_APP_BPF_NIC, .name = "ebpf",
+ .init = nfp_bpf_init, + .clean = nfp_bpf_clean, + .extra_cap = nfp_bpf_extra_cap,
- .vnic_alloc = nfp_bpf_vnic_alloc, - .vnic_free = nfp_bpf_vnic_free, + .vnic_alloc = nfp_app_nic_vnic_alloc,
.setup_tc = nfp_bpf_setup_tc, .tc_busy = nfp_bpf_tc_busy, diff --combined drivers/net/ethernet/netronome/nfp/bpf/main.h index 57b6043177a3,f49669bf6b44..aae1be9ed056 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@@ -1,5 -1,5 +1,5 @@@ /* - * Copyright (C) 2016 Netronome Systems, Inc. + * Copyright (C) 2016-2017 Netronome Systems, Inc. * * This software is dual licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this @@@ -78,6 -78,29 +78,29 @@@ enum pkt_vec #define NFP_BPF_ABI_FLAGS reg_imm(0) #define NFP_BPF_ABI_FLAG_MARK 1
+ /** + * struct nfp_app_bpf - bpf app priv structure + * @app: backpointer to the app + * + * @adjust_head: adjust head capability + * @flags: extra flags for adjust head + * @off_min: minimal packet offset within buffer required + * @off_max: maximum packet offset within buffer required + * @guaranteed_sub: amount of negative adjustment guaranteed possible + * @guaranteed_add: amount of positive adjustment guaranteed possible + */ + struct nfp_app_bpf { + struct nfp_app *app; + + struct nfp_bpf_cap_adjust_head { + u32 flags; + int off_min; + int off_max; + int guaranteed_sub; + int guaranteed_add; + } adjust_head; + }; + struct nfp_prog; struct nfp_insn_meta; typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *); @@@ -89,23 -112,39 +112,39 @@@ #define nfp_meta_next(meta) list_next_entry(meta, l) #define nfp_meta_prev(meta) list_prev_entry(meta, l)
+ #define FLAG_INSN_IS_JUMP_DST BIT(0) + /** * struct nfp_insn_meta - BPF instruction wrapper * @insn: BPF instruction * @ptr: pointer type for memory operations + * @ldst_gather_len: memcpy length gathered from load/store sequence + * @paired_st: the paired store insn at the head of the sequence + * @arg2: arg2 for call instructions * @ptr_not_const: pointer is not always constant + * @jmp_dst: destination info for jump instructions * @off: index of first generated machine instruction (in nfp_prog.prog) * @n: eBPF instruction number + * @flags: eBPF instruction extra optimization flags * @skip: skip this instruction (optimized out) * @double_cb: callback for second part of the instruction * @l: link on nfp_prog->insns list */ struct nfp_insn_meta { struct bpf_insn insn; - struct bpf_reg_state ptr; - bool ptr_not_const; + union { + struct { + struct bpf_reg_state ptr; + struct bpf_insn *paired_st; + s16 ldst_gather_len; + bool ptr_not_const; + }; + struct nfp_insn_meta *jmp_dst; + struct bpf_reg_state arg2; + }; unsigned int off; unsigned short n; + unsigned short flags; bool skip; instr_cb_t double_cb;
@@@ -134,23 -173,38 +173,38 @@@ static inline u8 mbpf_mode(const struc return BPF_MODE(meta->insn.code); }
+ static inline bool is_mbpf_load(const struct nfp_insn_meta *meta) + { + return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_LDX | BPF_MEM); + } + + static inline bool is_mbpf_store(const struct nfp_insn_meta *meta) + { + return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_STX | BPF_MEM); + } + /** * struct nfp_prog - nfp BPF program + * @bpf: backpointer to the bpf app priv structure * @prog: machine code * @prog_len: number of valid instructions in @prog array * @__prog_alloc_len: alloc size of @prog array * @verifier_meta: temporary storage for verifier's insn meta * @type: BPF program type * @start_off: address of the first instruction in the memory + * @last_bpf_off: address of the last instruction translated from BPF * @tgt_out: jump target for normal exit * @tgt_abort: jump target for abort (e.g. access outside of packet buffer) * @tgt_done: jump target to get the next packet * @n_translated: number of successfully translated instructions (for errors) * @error: error code if something went wrong * @stack_depth: max stack depth from the verifier + * @adjust_head_location: if program has single adjust head call - the insn no. * @insns: list of BPF instruction wrappers (struct nfp_insn_meta) */ struct nfp_prog { + struct nfp_app_bpf *bpf; + u64 *prog; unsigned int prog_len; unsigned int __prog_alloc_len; @@@ -160,6 -214,7 +214,7 @@@ enum bpf_prog_type type;
unsigned int start_off; + unsigned int last_bpf_off; unsigned int tgt_out; unsigned int tgt_abort; unsigned int tgt_done; @@@ -168,18 -223,11 +223,19 @@@ int error;
unsigned int stack_depth; + unsigned int adjust_head_location;
struct list_head insns; };
+/** + * struct nfp_bpf_vnic - per-vNIC BPF priv structure + * @tc_prog: currently loaded cls_bpf program + */ +struct nfp_bpf_vnic { + struct bpf_prog *tc_prog; +}; + int nfp_bpf_jit(struct nfp_prog *prog);
extern const struct bpf_ext_analyzer_ops nfp_bpf_analyzer_ops; @@@ -197,4 -245,7 +253,7 @@@ int nfp_bpf_translate(struct nfp_app *a struct bpf_prog *prog); int nfp_bpf_destroy(struct nfp_app *app, struct nfp_net *nn, struct bpf_prog *prog); + struct nfp_insn_meta * + nfp_bpf_goto_meta(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, + unsigned int insn_idx, unsigned int n_insns); #endif diff --combined drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index f1063dc00398,c52a9963c19d..beb9f5d070e1 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@@ -482,7 -482,7 +482,7 @@@ static void stmmac_get_rx_hwtstamp(stru desc = np;
/* Check if timestamp is available */ - if (priv->hw->desc->get_rx_timestamp_status(desc, priv->adv_ts)) { + if (priv->hw->desc->get_rx_timestamp_status(p, np, priv->adv_ts)) { ns = priv->hw->desc->get_timestamp(desc, priv->adv_ts); netdev_dbg(priv->dev, "get valid RX hw timestamp %llu\n", ns); shhwtstamp = skb_hwtstamps(skb); @@@ -1997,22 -1997,60 +1997,60 @@@ static void stmmac_set_dma_operation_mo static void stmmac_dma_interrupt(struct stmmac_priv *priv) { u32 tx_channel_count = priv->plat->tx_queues_to_use; - int status; + u32 rx_channel_count = priv->plat->rx_queues_to_use; + u32 channels_to_check = tx_channel_count > rx_channel_count ? + tx_channel_count : rx_channel_count; u32 chan; + bool poll_scheduled = false; + int status[channels_to_check]; + + /* Each DMA channel can be used for rx and tx simultaneously, yet + * napi_struct is embedded in struct stmmac_rx_queue rather than in a + * stmmac_channel struct. + * Because of this, stmmac_poll currently checks (and possibly wakes) + * all tx queues rather than just a single tx queue. + */ + for (chan = 0; chan < channels_to_check; chan++) + status[chan] = priv->hw->dma->dma_interrupt(priv->ioaddr, + &priv->xstats, + chan);
- for (chan = 0; chan < tx_channel_count; chan++) { - struct stmmac_rx_queue *rx_q = &priv->rx_queue[chan]; + for (chan = 0; chan < rx_channel_count; chan++) { + if (likely(status[chan] & handle_rx)) { + struct stmmac_rx_queue *rx_q = &priv->rx_queue[chan];
- status = priv->hw->dma->dma_interrupt(priv->ioaddr, - &priv->xstats, chan); - if (likely((status & handle_rx)) || (status & handle_tx)) { if (likely(napi_schedule_prep(&rx_q->napi))) { stmmac_disable_dma_irq(priv, chan); __napi_schedule(&rx_q->napi); + poll_scheduled = true; + } + } + } + + /* If we scheduled poll, we already know that tx queues will be checked. + * If we didn't schedule poll, see if any DMA channel (used by tx) has a + * completed transmission, if so, call stmmac_poll (once). + */ + if (!poll_scheduled) { + for (chan = 0; chan < tx_channel_count; chan++) { + if (status[chan] & handle_tx) { + /* It doesn't matter what rx queue we choose + * here. We use 0 since it always exists. + */ + struct stmmac_rx_queue *rx_q = + &priv->rx_queue[0]; + + if (likely(napi_schedule_prep(&rx_q->napi))) { + stmmac_disable_dma_irq(priv, chan); + __napi_schedule(&rx_q->napi); + } + break; } } + }
- if (unlikely(status & tx_hard_error_bump_tc)) { + for (chan = 0; chan < tx_channel_count; chan++) { + if (unlikely(status[chan] & tx_hard_error_bump_tc)) { /* Try to bump up the dma threshold on this failure */ if (unlikely(priv->xstats.threshold != SF_DMA_MODE) && (tc <= 256)) { @@@ -2029,7 -2067,7 +2067,7 @@@ chan); priv->xstats.threshold = tc; } - } else if (unlikely(status == tx_hard_error)) { + } else if (unlikely(status[chan] == tx_hard_error)) { stmmac_tx_err(priv, chan); } } @@@ -2533,7 -2571,7 +2571,7 @@@ static int stmmac_hw_setup(struct net_d }
if (priv->hw->pcs && priv->hw->mac->pcs_ctrl_ane) - priv->hw->mac->pcs_ctrl_ane(priv->hw, 1, priv->hw->ps, 0); + priv->hw->mac->pcs_ctrl_ane(priv->ioaddr, 1, priv->hw->ps, 0);
/* set TX and RX rings length */ stmmac_set_rings_length(priv); diff --combined drivers/net/netdevsim/bpf.c index 000000000000,7da814686ad9..afaf980bbbe7 mode 000000,100644..100644 --- a/drivers/net/netdevsim/bpf.c +++ b/drivers/net/netdevsim/bpf.c @@@ -1,0 -1,370 +1,364 @@@ + /* + * Copyright (C) 2017 Netronome Systems, Inc. + * + * This software is licensed under the GNU General License Version 2, + * June 1991 as shown in the file COPYING in the top-level directory of this + * source tree. + * + * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" + * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, + * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE + * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME + * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + */ + + #include <linux/bpf.h> + #include <linux/bpf_verifier.h> + #include <linux/debugfs.h> + #include <linux/kernel.h> + #include <linux/rtnetlink.h> + #include <net/pkt_cls.h> + + #include "netdevsim.h" + + struct nsim_bpf_bound_prog { + struct netdevsim *ns; + struct bpf_prog *prog; + struct dentry *ddir; + const char *state; + bool is_loaded; + struct list_head l; + }; + + static int nsim_debugfs_bpf_string_read(struct seq_file *file, void *data) + { + const char **str = file->private; + + if (*str) + seq_printf(file, "%s\n", *str); + + return 0; + } + + static int nsim_debugfs_bpf_string_open(struct inode *inode, struct file *f) + { + return single_open(f, nsim_debugfs_bpf_string_read, inode->i_private); + } + + static const struct file_operations nsim_bpf_string_fops = { + .owner = THIS_MODULE, + .open = nsim_debugfs_bpf_string_open, + .release = single_release, + .read = seq_read, + .llseek = seq_lseek + }; + + static int + nsim_bpf_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn) + { + struct 
nsim_bpf_bound_prog *state; + + state = env->prog->aux->offload->dev_priv; + if (state->ns->bpf_bind_verifier_delay && !insn_idx) + msleep(state->ns->bpf_bind_verifier_delay); + + return 0; + } + + static const struct bpf_ext_analyzer_ops nsim_bpf_analyzer_ops = { + .insn_hook = nsim_bpf_verify_insn, + }; + + static bool nsim_xdp_offload_active(struct netdevsim *ns) + { + return ns->xdp_prog_mode == XDP_ATTACHED_HW; + } + + static void nsim_prog_set_loaded(struct bpf_prog *prog, bool loaded) + { + struct nsim_bpf_bound_prog *state; + + if (!prog || !prog->aux->offload) + return; + + state = prog->aux->offload->dev_priv; + state->is_loaded = loaded; + } + + static int + nsim_bpf_offload(struct netdevsim *ns, struct bpf_prog *prog, bool oldprog) + { + nsim_prog_set_loaded(ns->bpf_offloaded, false); + + WARN(!!ns->bpf_offloaded != oldprog, + "bad offload state, expected offload %sto be active", + oldprog ? "" : "not "); + ns->bpf_offloaded = prog; + ns->bpf_offloaded_id = prog ? prog->aux->id : 0; + nsim_prog_set_loaded(prog, true); + + return 0; + } + + int nsim_bpf_setup_tc_block_cb(enum tc_setup_type type, + void *type_data, void *cb_priv) + { + struct tc_cls_bpf_offload *cls_bpf = type_data; + struct bpf_prog *prog = cls_bpf->prog; + struct netdevsim *ns = cb_priv; + + if (type != TC_SETUP_CLSBPF || + !tc_can_offload(ns->netdev) || + cls_bpf->common.protocol != htons(ETH_P_ALL) || + cls_bpf->common.chain_index) + return -EOPNOTSUPP; + + if (nsim_xdp_offload_active(ns)) + return -EBUSY; + + if (!ns->bpf_tc_accept) + return -EOPNOTSUPP; + /* Note: progs without skip_sw will probably not be dev bound */ + if (prog && !prog->aux->offload && !ns->bpf_tc_non_bound_accept) + return -EOPNOTSUPP; + - switch (cls_bpf->command) { - case TC_CLSBPF_REPLACE: - return nsim_bpf_offload(ns, prog, true); - case TC_CLSBPF_ADD: - return nsim_bpf_offload(ns, prog, false); - case TC_CLSBPF_DESTROY: - return nsim_bpf_offload(ns, NULL, true); - default: ++ if (cls_bpf->command != 
TC_CLSBPF_OFFLOAD) + return -EOPNOTSUPP; - } ++ ++ return nsim_bpf_offload(ns, prog, cls_bpf->oldprog); + } + + int nsim_bpf_disable_tc(struct netdevsim *ns) + { + if (ns->bpf_offloaded && !nsim_xdp_offload_active(ns)) + return -EBUSY; + return 0; + } + + static int nsim_xdp_offload_prog(struct netdevsim *ns, struct netdev_bpf *bpf) + { + if (!nsim_xdp_offload_active(ns) && !bpf->prog) + return 0; + if (!nsim_xdp_offload_active(ns) && bpf->prog && ns->bpf_offloaded) { + NSIM_EA(bpf->extack, "TC program is already loaded"); + return -EBUSY; + } + + return nsim_bpf_offload(ns, bpf->prog, nsim_xdp_offload_active(ns)); + } + + static int nsim_xdp_set_prog(struct netdevsim *ns, struct netdev_bpf *bpf) + { + int err; + + if (ns->xdp_prog && (bpf->flags ^ ns->xdp_flags) & XDP_FLAGS_MODES) { + NSIM_EA(bpf->extack, "program loaded with different flags"); + return -EBUSY; + } + + if (bpf->command == XDP_SETUP_PROG && !ns->bpf_xdpdrv_accept) { + NSIM_EA(bpf->extack, "driver XDP disabled in DebugFS"); + return -EOPNOTSUPP; + } + if (bpf->command == XDP_SETUP_PROG_HW && !ns->bpf_xdpoffload_accept) { + NSIM_EA(bpf->extack, "XDP offload disabled in DebugFS"); + return -EOPNOTSUPP; + } + + if (bpf->command == XDP_SETUP_PROG_HW) { + err = nsim_xdp_offload_prog(ns, bpf); + if (err) + return err; + } + + if (ns->xdp_prog) + bpf_prog_put(ns->xdp_prog); + + ns->xdp_prog = bpf->prog; + ns->xdp_flags = bpf->flags; + + if (!bpf->prog) + ns->xdp_prog_mode = XDP_ATTACHED_NONE; + else if (bpf->command == XDP_SETUP_PROG) + ns->xdp_prog_mode = XDP_ATTACHED_DRV; + else + ns->xdp_prog_mode = XDP_ATTACHED_HW; + + return 0; + } + + static int nsim_bpf_create_prog(struct netdevsim *ns, struct bpf_prog *prog) + { + struct nsim_bpf_bound_prog *state; + char name[16]; + + state = kzalloc(sizeof(*state), GFP_KERNEL); + if (!state) + return -ENOMEM; + + state->ns = ns; + state->prog = prog; + state->state = "verify"; + + /* Program id is not populated yet when we create the state. 
*/ + sprintf(name, "%u", ns->prog_id_gen++); + state->ddir = debugfs_create_dir(name, ns->ddir_bpf_bound_progs); + if (IS_ERR_OR_NULL(state->ddir)) { + kfree(state); + return -ENOMEM; + } + + debugfs_create_u32("id", 0400, state->ddir, &prog->aux->id); + debugfs_create_file("state", 0400, state->ddir, + &state->state, &nsim_bpf_string_fops); + debugfs_create_bool("loaded", 0400, state->ddir, &state->is_loaded); + + list_add_tail(&state->l, &ns->bpf_bound_progs); + + prog->aux->offload->dev_priv = state; + + return 0; + } + + static void nsim_bpf_destroy_prog(struct bpf_prog *prog) + { + struct nsim_bpf_bound_prog *state; + + state = prog->aux->offload->dev_priv; + WARN(state->is_loaded, + "offload state destroyed while program still bound"); + debugfs_remove_recursive(state->ddir); + list_del(&state->l); + kfree(state); + } + + static int nsim_setup_prog_checks(struct netdevsim *ns, struct netdev_bpf *bpf) + { + if (bpf->prog && bpf->prog->aux->offload) { + NSIM_EA(bpf->extack, "attempt to load offloaded prog to drv"); + return -EINVAL; + } + if (ns->netdev->mtu > NSIM_XDP_MAX_MTU) { + NSIM_EA(bpf->extack, "MTU too large w/ XDP enabled"); + return -EINVAL; + } + if (nsim_xdp_offload_active(ns)) { + NSIM_EA(bpf->extack, "xdp offload active, can't load drv prog"); + return -EBUSY; + } + return 0; + } + + static int + nsim_setup_prog_hw_checks(struct netdevsim *ns, struct netdev_bpf *bpf) + { + struct nsim_bpf_bound_prog *state; + + if (!bpf->prog) + return 0; + + if (!bpf->prog->aux->offload) { + NSIM_EA(bpf->extack, "xdpoffload of non-bound program"); + return -EINVAL; + } + if (bpf->prog->aux->offload->netdev != ns->netdev) { + NSIM_EA(bpf->extack, "program bound to different dev"); + return -EINVAL; + } + + state = bpf->prog->aux->offload->dev_priv; + if (WARN_ON(strcmp(state->state, "xlated"))) { + NSIM_EA(bpf->extack, "offloading program in bad state"); + return -EINVAL; + } + return 0; + } + + int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf) + { + 
struct netdevsim *ns = netdev_priv(dev); + struct nsim_bpf_bound_prog *state; + int err; + + ASSERT_RTNL(); + + switch (bpf->command) { + case BPF_OFFLOAD_VERIFIER_PREP: + if (!ns->bpf_bind_accept) + return -EOPNOTSUPP; + + err = nsim_bpf_create_prog(ns, bpf->verifier.prog); + if (err) + return err; + + bpf->verifier.ops = &nsim_bpf_analyzer_ops; + return 0; + case BPF_OFFLOAD_TRANSLATE: + state = bpf->offload.prog->aux->offload->dev_priv; + + state->state = "xlated"; + return 0; + case BPF_OFFLOAD_DESTROY: + nsim_bpf_destroy_prog(bpf->offload.prog); + return 0; + case XDP_QUERY_PROG: + bpf->prog_attached = ns->xdp_prog_mode; + bpf->prog_id = ns->xdp_prog ? ns->xdp_prog->aux->id : 0; + bpf->prog_flags = ns->xdp_prog ? ns->xdp_flags : 0; + return 0; + case XDP_SETUP_PROG: + err = nsim_setup_prog_checks(ns, bpf); + if (err) + return err; + + return nsim_xdp_set_prog(ns, bpf); + case XDP_SETUP_PROG_HW: + err = nsim_setup_prog_hw_checks(ns, bpf); + if (err) + return err; + + return nsim_xdp_set_prog(ns, bpf); + default: + return -EINVAL; + } + } + + int nsim_bpf_init(struct netdevsim *ns) + { + INIT_LIST_HEAD(&ns->bpf_bound_progs); + + debugfs_create_u32("bpf_offloaded_id", 0400, ns->ddir, + &ns->bpf_offloaded_id); + + ns->bpf_bind_accept = true; + debugfs_create_bool("bpf_bind_accept", 0600, ns->ddir, + &ns->bpf_bind_accept); + debugfs_create_u32("bpf_bind_verifier_delay", 0600, ns->ddir, + &ns->bpf_bind_verifier_delay); + ns->ddir_bpf_bound_progs = + debugfs_create_dir("bpf_bound_progs", ns->ddir); + if (IS_ERR_OR_NULL(ns->ddir_bpf_bound_progs)) + return -ENOMEM; + + ns->bpf_tc_accept = true; + debugfs_create_bool("bpf_tc_accept", 0600, ns->ddir, + &ns->bpf_tc_accept); + debugfs_create_bool("bpf_tc_non_bound_accept", 0600, ns->ddir, + &ns->bpf_tc_non_bound_accept); + ns->bpf_xdpdrv_accept = true; + debugfs_create_bool("bpf_xdpdrv_accept", 0600, ns->ddir, + &ns->bpf_xdpdrv_accept); + ns->bpf_xdpoffload_accept = true; + debugfs_create_bool("bpf_xdpoffload_accept", 
0600, ns->ddir, + &ns->bpf_xdpoffload_accept); + + return 0; + } + + void nsim_bpf_uninit(struct netdevsim *ns) + { + WARN_ON(!list_empty(&ns->bpf_bound_progs)); + WARN_ON(ns->xdp_prog); + WARN_ON(ns->bpf_offloaded); + } diff --combined drivers/net/phy/marvell.c index 82104edca393,2fc026dc170a..80c120a9f2f3 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@@ -879,8 -879,6 +879,8 @@@ static int m88e1510_config_init(struct
/* SGMII-to-Copper mode initialization */ if (phydev->interface == PHY_INTERFACE_MODE_SGMII) { + u32 pause; + /* Select page 18 */ err = marvell_set_page(phydev, 18); if (err < 0) @@@ -904,16 -902,6 +904,16 @@@ err = marvell_set_page(phydev, MII_MARVELL_COPPER_PAGE); if (err < 0) return err; + + /* There appears to be a bug in the 88e1512 when used in + * SGMII to copper mode, where the AN advertisment register + * clears the pause bits each time a negotiation occurs. + * This means we can never be truely sure what was advertised, + * so disable Pause support. + */ + pause = SUPPORTED_Pause | SUPPORTED_Asym_Pause; + phydev->supported &= ~pause; + phydev->advertising &= ~pause; }
return m88e1121_config_init(phydev); @@@ -1974,7 -1962,6 +1974,6 @@@ static struct phy_driver marvell_driver .probe = marvell_probe, .config_init = &marvell_config_init, .config_aneg = &m88e1101_config_aneg, - .read_status = &genphy_read_status, .ack_interrupt = &marvell_ack_interrupt, .config_intr = &marvell_config_intr, .resume = &genphy_resume, @@@ -1992,7 -1979,6 +1991,6 @@@ .probe = marvell_probe, .config_init = &m88e1111_config_init, .config_aneg = &marvell_config_aneg, - .read_status = &genphy_read_status, .ack_interrupt = &marvell_ack_interrupt, .config_intr = &marvell_config_intr, .resume = &genphy_resume, @@@ -2028,7 -2014,6 +2026,6 @@@ .probe = marvell_probe, .config_init = &m88e1118_config_init, .config_aneg = &m88e1118_config_aneg, - .read_status = &genphy_read_status, .ack_interrupt = &marvell_ack_interrupt, .config_intr = &marvell_config_intr, .resume = &genphy_resume, @@@ -2085,8 -2070,7 +2082,7 @@@ .flags = PHY_HAS_INTERRUPT, .probe = marvell_probe, .config_init = &m88e1145_config_init, - .config_aneg = &marvell_config_aneg, + .config_aneg = &m88e1101_config_aneg, - .read_status = &genphy_read_status, .ack_interrupt = &marvell_ack_interrupt, .config_intr = &marvell_config_intr, .resume = &genphy_resume, @@@ -2104,7 -2088,6 +2100,6 @@@ .probe = marvell_probe, .config_init = &m88e1149_config_init, .config_aneg = &m88e1118_config_aneg, - .read_status = &genphy_read_status, .ack_interrupt = &marvell_ack_interrupt, .config_intr = &marvell_config_intr, .resume = &genphy_resume, @@@ -2122,7 -2105,6 +2117,6 @@@ .probe = marvell_probe, .config_init = &m88e1111_config_init, .config_aneg = &marvell_config_aneg, - .read_status = &genphy_read_status, .ack_interrupt = &marvell_ack_interrupt, .config_intr = &marvell_config_intr, .resume = &genphy_resume, @@@ -2139,8 -2121,6 +2133,6 @@@ .flags = PHY_HAS_INTERRUPT, .probe = marvell_probe, .config_init = &m88e1116r_config_init, - .config_aneg = &genphy_config_aneg, - .read_status = &genphy_read_status, 
.ack_interrupt = &marvell_ack_interrupt, .config_intr = &marvell_config_intr, .resume = &genphy_resume, @@@ -2216,7 -2196,6 +2208,6 @@@ .features = PHY_BASIC_FEATURES, .flags = PHY_HAS_INTERRUPT, .probe = marvell_probe, - .config_aneg = &genphy_config_aneg, .config_init = &m88e3016_config_init, .aneg_done = &marvell_aneg_done, .read_status = &marvell_read_status, diff --combined drivers/net/tun.c index 2ffe5dba7e09,e367d6310353..164fef1d1cf3 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@@ -195,6 -195,11 +195,11 @@@ struct tun_flow_entry
#define TUN_NUM_FLOW_ENTRIES 1024
+ struct tun_steering_prog { + struct rcu_head rcu; + struct bpf_prog *prog; + }; + /* Since the socket were moved to tun_file, to preserve the behavior of persist * device, socket filter, sndbuf and vnet header size were restore when the * file were attached to a persist device. @@@ -232,6 -237,7 +237,7 @@@ struct tun_struct u32 rx_batched; struct tun_pcpu_stats __percpu *pcpu_stats; struct bpf_prog __rcu *xdp_prog; + struct tun_steering_prog __rcu *steering_prog; };
static int tun_napi_receive(struct napi_struct *napi, int budget) @@@ -537,15 -543,12 +543,12 @@@ static inline void tun_flow_save_rps_rx * different rxq no. here. If we could not get rxhash, then we would * hope the rxq no. may help here. */ - static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) { - struct tun_struct *tun = netdev_priv(dev); struct tun_flow_entry *e; u32 txq = 0; u32 numqueues = 0;
- rcu_read_lock(); numqueues = READ_ONCE(tun->numqueues);
txq = __skb_get_hash_symmetric(skb); @@@ -563,10 -566,37 +566,37 @@@ txq -= numqueues; }
- rcu_read_unlock(); return txq; }
+ static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb) + { + struct tun_steering_prog *prog; + u16 ret = 0; + + prog = rcu_dereference(tun->steering_prog); + if (prog) + ret = bpf_prog_run_clear_cb(prog->prog, skb); + + return ret % tun->numqueues; + } + + static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv, select_queue_fallback_t fallback) + { + struct tun_struct *tun = netdev_priv(dev); + u16 ret; + + rcu_read_lock(); + if (rcu_dereference(tun->steering_prog)) + ret = tun_ebpf_select_queue(tun, skb); + else + ret = tun_automq_select_queue(tun, skb); + rcu_read_unlock(); + + return ret; + } + static inline bool tun_not_capable(struct tun_struct *tun) { const struct cred *cred = current_cred(); @@@ -673,7 -703,6 +703,6 @@@ static void tun_detach(struct tun_file static void tun_detach_all(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); - struct bpf_prog *xdp_prog = rtnl_dereference(tun->xdp_prog); struct tun_file *tfile, *tmp; int i, n = tun->numqueues;
@@@ -708,9 -737,6 +737,6 @@@ } BUG_ON(tun->numdisabled != 0);
- if (xdp_prog) - bpf_prog_put(xdp_prog); - if (tun->flags & IFF_PERSIST) module_put(THIS_MODULE); } @@@ -937,23 -963,10 +963,10 @@@ static int tun_net_close(struct net_dev }
/* Net device start xmit */ - static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) + static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb) { - struct tun_struct *tun = netdev_priv(dev); - int txq = skb->queue_mapping; - struct tun_file *tfile; - u32 numqueues = 0; - - rcu_read_lock(); - tfile = rcu_dereference(tun->tfiles[txq]); - numqueues = READ_ONCE(tun->numqueues); - - /* Drop packet if interface is not attached */ - if (txq >= numqueues) - goto drop; - #ifdef CONFIG_RPS - if (numqueues == 1 && static_key_false(&rps_needed)) { + if (tun->numqueues == 1 && static_key_false(&rps_needed)) { /* Select queue was not called for the skbuff, so we extract the * RPS hash and save it into the flow_table here. */ @@@ -969,6 -982,24 +982,24 @@@ } } #endif + } + + /* Net device start xmit */ + static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) + { + struct tun_struct *tun = netdev_priv(dev); + int txq = skb->queue_mapping; + struct tun_file *tfile; + + rcu_read_lock(); + tfile = rcu_dereference(tun->tfiles[txq]); + + /* Drop packet if interface is not attached */ + if (txq >= tun->numqueues) + goto drop; + + if (!rcu_dereference(tun->steering_prog)) + tun_automq_xmit(tun, skb);
tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
@@@ -1248,12 -1279,12 +1279,12 @@@ static void tun_net_init(struct net_dev /* Character device part */
/* Poll */ -static unsigned int tun_chr_poll(struct file *file, poll_table *wait) +static __poll_t tun_chr_poll(struct file *file, poll_table *wait) { struct tun_file *tfile = file->private_data; struct tun_struct *tun = tun_get(tfile); struct sock *sk; - unsigned int mask = 0; + __poll_t mask = 0;
if (!tun) return POLLERR; @@@ -1551,7 -1582,7 +1582,7 @@@ static ssize_t tun_get_user(struct tun_ int copylen; bool zerocopy = false; int err; - u32 rxhash; + u32 rxhash = 0; int skb_xdp = 1; bool frags = tun_napi_frags_enabled(tun);
@@@ -1739,7 -1770,10 +1770,10 @@@ rcu_read_unlock(); }
- rxhash = __skb_get_hash_symmetric(skb); + rcu_read_lock(); + if (!rcu_dereference(tun->steering_prog)) + rxhash = __skb_get_hash_symmetric(skb); + rcu_read_unlock();
if (frags) { /* Exercise flow dissector code path. */ @@@ -1783,7 -1817,9 +1817,9 @@@ u64_stats_update_end(&stats->syncp); put_cpu_ptr(stats);
- tun_flow_update(tun, rxhash, tfile); + if (rxhash) + tun_flow_update(tun, rxhash, tfile); + return total_len; }
@@@ -1991,6 -2027,39 +2027,39 @@@ static ssize_t tun_chr_read_iter(struc return ret; }
+ static void tun_steering_prog_free(struct rcu_head *rcu) + { + struct tun_steering_prog *prog = container_of(rcu, + struct tun_steering_prog, rcu); + + bpf_prog_destroy(prog->prog); + kfree(prog); + } + + static int __tun_set_steering_ebpf(struct tun_struct *tun, + struct bpf_prog *prog) + { + struct tun_steering_prog *old, *new = NULL; + + if (prog) { + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return -ENOMEM; + new->prog = prog; + } + + spin_lock_bh(&tun->lock); + old = rcu_dereference_protected(tun->steering_prog, + lockdep_is_held(&tun->lock)); + rcu_assign_pointer(tun->steering_prog, new); + spin_unlock_bh(&tun->lock); + + if (old) + call_rcu(&old->rcu, tun_steering_prog_free); + + return 0; + } + static void tun_free_netdev(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); @@@ -1999,6 -2068,7 +2068,7 @@@ free_percpu(tun->pcpu_stats); tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); + __tun_set_steering_ebpf(tun, NULL); }
static void tun_setup(struct net_device *dev) @@@ -2287,6 -2357,7 +2357,7 @@@ static int tun_set_iff(struct net *net tun->filter_attached = false; tun->sndbuf = tfile->socket.sk->sk_sndbuf; tun->rx_batched = 0; + RCU_INIT_POINTER(tun->steering_prog, NULL);
tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats); if (!tun->pcpu_stats) { @@@ -2479,6 -2550,25 +2550,25 @@@ unlock return ret; }
+ static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data) + { + struct bpf_prog *prog; + int fd; + + if (copy_from_user(&fd, data, sizeof(fd))) + return -EFAULT; + + if (fd == -1) { + prog = NULL; + } else { + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); + if (IS_ERR(prog)) + return PTR_ERR(prog); + } + + return __tun_set_steering_ebpf(tun, prog); + } + static long __tun_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg, int ifreq_len) { @@@ -2755,6 -2845,10 +2845,10 @@@ ret = 0; break;
+ case TUNSETSTEERINGEBPF: + ret = tun_set_steering_ebpf(tun, argp); + break; + default: ret = -EINVAL; break; diff --combined drivers/net/vxlan.c index 31f4b7911ef8,48a0dc238f73..82090ae7ced1 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@@ -2155,13 -2155,6 +2155,13 @@@ static void vxlan_xmit_one(struct sk_bu }
ndst = &rt->dst; + if (skb_dst(skb)) { + int mtu = dst_mtu(ndst) - VXLAN_HEADROOM; + + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, + skb, mtu); + } + tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); err = vxlan_build_skb(skb, ndst, sizeof(struct iphdr), @@@ -2197,13 -2190,6 +2197,13 @@@ goto out_unlock; }
+ if (skb_dst(skb)) { + int mtu = dst_mtu(ndst) - VXLAN6_HEADROOM; + + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, + skb, mtu); + } + tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip6_dst_hoplimit(ndst); skb_scrub_packet(skb, xnet); @@@ -3117,11 -3103,6 +3117,11 @@@ static void vxlan_config_apply(struct n
max_mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM); + if (max_mtu < ETH_MIN_MTU) + max_mtu = ETH_MIN_MTU; + + if (!changelink && !conf->mtu) + dev->mtu = max_mtu; }
if (dev->mtu > max_mtu) @@@ -3711,18 -3692,16 +3711,16 @@@ static __net_init int vxlan_init_net(st return 0; }
- static void __net_exit vxlan_exit_net(struct net *net) + static void vxlan_destroy_tunnels(struct net *net, struct list_head *head) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_dev *vxlan, *next; struct net_device *dev, *aux; unsigned int h; - LIST_HEAD(list);
- rtnl_lock(); for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &vxlan_link_ops) - unregister_netdevice_queue(dev, &list); + unregister_netdevice_queue(dev, head);
list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) { /* If vxlan->dev is in the same netns, it has already been added @@@ -3730,20 -3709,30 +3728,30 @@@ */ if (!net_eq(dev_net(vxlan->dev), net)) { gro_cells_destroy(&vxlan->gro_cells); - unregister_netdevice_queue(vxlan->dev, &list); + unregister_netdevice_queue(vxlan->dev, head); } }
- unregister_netdevice_many(&list); - rtnl_unlock(); - for (h = 0; h < PORT_HASH_SIZE; ++h) WARN_ON_ONCE(!hlist_empty(&vn->sock_list[h])); }
+ static void __net_exit vxlan_exit_batch_net(struct list_head *net_list) + { + struct net *net; + LIST_HEAD(list); + + rtnl_lock(); + list_for_each_entry(net, net_list, exit_list) + vxlan_destroy_tunnels(net, &list); + + unregister_netdevice_many(&list); + rtnl_unlock(); + } + static struct pernet_operations vxlan_net_ops = { .init = vxlan_init_net, - .exit = vxlan_exit_net, + .exit_batch = vxlan_exit_batch_net, .id = &vxlan_net_id, .size = sizeof(struct vxlan_net), }; diff --combined drivers/s390/net/qeth_core_main.c index 3614df68830f,a007f6249166..bdc28330800e --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@@ -564,7 -564,7 +564,7 @@@ static struct qeth_reply *qeth_alloc_re
reply = kzalloc(sizeof(struct qeth_reply), GFP_ATOMIC); if (reply) { - atomic_set(&reply->refcnt, 1); + refcount_set(&reply->refcnt, 1); atomic_set(&reply->received, 0); reply->card = card; } @@@ -573,14 -573,12 +573,12 @@@
static void qeth_get_reply(struct qeth_reply *reply) { - WARN_ON(atomic_read(&reply->refcnt) <= 0); - atomic_inc(&reply->refcnt); + refcount_inc(&reply->refcnt); }
static void qeth_put_reply(struct qeth_reply *reply) { - WARN_ON(atomic_read(&reply->refcnt) <= 0); - if (atomic_dec_and_test(&reply->refcnt)) + if (refcount_dec_and_test(&reply->refcnt)) kfree(reply); }
@@@ -4218,9 -4216,8 +4216,8 @@@ static int qeth_setadpparms_change_maca cmd = (struct qeth_ipa_cmd *) data; if (!card->options.layer2 || !(card->info.mac_bits & QETH_LAYER2_MAC_READ)) { - memcpy(card->dev->dev_addr, - &cmd->data.setadapterparms.data.change_addr.addr, - OSA_ADDR_LEN); + ether_addr_copy(card->dev->dev_addr, + cmd->data.setadapterparms.data.change_addr.addr); card->info.mac_bits |= QETH_LAYER2_MAC_READ; } qeth_default_setadapterparms_cb(card, reply, (unsigned long) cmd); @@@ -4242,9 -4239,9 +4239,9 @@@ int qeth_setadpparms_change_macaddr(str return -ENOMEM; cmd = (struct qeth_ipa_cmd *)(iob->data+IPA_PDU_HEADER_SIZE); cmd->data.setadapterparms.data.change_addr.cmd = CHANGE_ADDR_READ_MAC; - cmd->data.setadapterparms.data.change_addr.addr_size = OSA_ADDR_LEN; - memcpy(&cmd->data.setadapterparms.data.change_addr.addr, - card->dev->dev_addr, OSA_ADDR_LEN); + cmd->data.setadapterparms.data.change_addr.addr_size = ETH_ALEN; + ether_addr_copy(cmd->data.setadapterparms.data.change_addr.addr, + card->dev->dev_addr); rc = qeth_send_ipa_cmd(card, iob, qeth_setadpparms_change_macaddr_cb, NULL); return rc; @@@ -5386,13 -5383,6 +5383,13 @@@ out } EXPORT_SYMBOL_GPL(qeth_poll);
+static int qeth_setassparms_inspect_rc(struct qeth_ipa_cmd *cmd) +{ + if (!cmd->hdr.return_code) + cmd->hdr.return_code = cmd->data.setassparms.hdr.return_code; + return cmd->hdr.return_code; +} + int qeth_setassparms_cb(struct qeth_card *card, struct qeth_reply *reply, unsigned long data) { @@@ -6249,7 -6239,7 +6246,7 @@@ static int qeth_ipa_checksum_run_cmd_cb (struct qeth_checksum_cmd *)reply->param;
QETH_CARD_TEXT(card, 4, "chkdoccb"); - if (cmd->hdr.return_code) + if (qeth_setassparms_inspect_rc(cmd)) return 0;
memset(chksum_cb, 0, sizeof(*chksum_cb)); diff --combined fs/btrfs/disk-io.c index c3325b9e2538,5da18ebc9222..dd363532b8a9 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@@ -30,6 -30,7 +30,7 @@@ #include <linux/ratelimit.h> #include <linux/uuid.h> #include <linux/semaphore.h> + #include <linux/bpf.h> #include <asm/unaligned.h> #include "ctree.h" #include "disk-io.h" @@@ -220,7 -221,7 +221,7 @@@ void btrfs_set_buffer_lockdep_class(u6 * extents on the btree inode are pretty simple, there's one extent * that covers the entire device */ -static struct extent_map *btree_get_extent(struct btrfs_inode *inode, +struct extent_map *btree_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, int create) { @@@ -285,7 -286,7 +286,7 @@@ static int csum_tree_block(struct btrfs int verify) { u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); - char *result = NULL; + char result[BTRFS_CSUM_SIZE]; unsigned long len; unsigned long cur_len; unsigned long offset = BTRFS_CSUM_SIZE; @@@ -294,6 -295,7 +295,6 @@@ unsigned long map_len; int err; u32 crc = ~(u32)0; - unsigned long inline_result;
len = buf->len - offset; while (len > 0) { @@@ -307,7 -309,13 +308,7 @@@ len -= cur_len; offset += cur_len; } - if (csum_size > sizeof(inline_result)) { - result = kzalloc(csum_size, GFP_NOFS); - if (!result) - return -ENOMEM; - } else { - result = (char *)&inline_result; - } + memset(result, 0, BTRFS_CSUM_SIZE);
btrfs_csum_final(crc, result);
@@@ -322,12 -330,15 +323,12 @@@ "%s checksum verify failed on %llu wanted %X found %X level %d", fs_info->sb->s_id, buf->start, val, found, btrfs_header_level(buf)); - if (result != (char *)&inline_result) - kfree(result); return -EUCLEAN; } } else { write_extent_buffer(buf, result, 0, csum_size); } - if (result != (char *)&inline_result) - kfree(result); + return 0; }
@@@ -381,7 -392,7 +382,7 @@@ static int verify_parent_transid(struc clear_extent_buffer_uptodate(eb); out: unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, - &cached_state, GFP_NOFS); + &cached_state); if (need_lock) btrfs_tree_read_unlock_blocking(eb); return ret; @@@ -445,7 -456,7 +446,7 @@@ static int btree_read_extent_buffer_pag io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; while (1) { ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE, - btree_get_extent, mirror_num); + mirror_num); if (!ret) { if (!verify_parent_transid(io_tree, eb, parent_transid, 0)) @@@ -1002,7 -1013,7 +1003,7 @@@ void readahead_tree_block(struct btrfs_ if (IS_ERR(buf)) return; read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, - buf, WAIT_NONE, btree_get_extent, 0); + buf, WAIT_NONE, 0); free_extent_buffer(buf); }
@@@ -1021,7 -1032,7 +1022,7 @@@ int reada_tree_block_flagged(struct btr set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
ret = read_extent_buffer_pages(io_tree, buf, WAIT_PAGE_LOCK, - btree_get_extent, mirror_num); + mirror_num); if (ret) { free_extent_buffer(buf); return ret; @@@ -1158,7 -1169,6 +1159,7 @@@ static void __setup_root(struct btrfs_r spin_lock_init(&root->accounting_lock); spin_lock_init(&root->log_extents_lock[0]); spin_lock_init(&root->log_extents_lock[1]); + spin_lock_init(&root->qgroup_meta_rsv_lock); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); mutex_init(&root->ordered_extent_mutex); @@@ -1175,6 -1185,7 +1176,6 @@@ atomic_set(&root->orphan_inodes, 0); refcount_set(&root->refs, 1); atomic_set(&root->will_be_snapshotted, 0); - atomic64_set(&root->qgroup_meta_rsv, 0); root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; @@@ -1233,7 -1244,7 +1234,7 @@@ struct btrfs_root *btrfs_create_tree(st struct btrfs_root *root; struct btrfs_key key; int ret = 0; - uuid_le uuid; + uuid_le uuid = { 0 };
root = btrfs_alloc_root(fs_info, GFP_KERNEL); if (!root) @@@ -1274,8 -1285,7 +1275,8 @@@ btrfs_set_root_used(&root->root_item, leaf->len); btrfs_set_root_last_snapshot(&root->root_item, 0); btrfs_set_root_dirid(&root->root_item, 0); - uuid_le_gen(&uuid); + if (is_fstree(objectid)) + uuid_le_gen(&uuid); memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE); root->root_item.drop_level = 0;
@@@ -3114,6 -3124,7 +3115,7 @@@ recovery_tree_root goto fail_block_groups; goto retry_root_backup; } + BPF_ALLOW_ERROR_INJECTION(open_ctree);
static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) { @@@ -3348,7 -3359,7 +3350,7 @@@ static void write_dev_flush(struct btrf bio->bi_private = &device->flush_wait;
btrfsic_submit_bio(bio); - device->flush_bio_sent = 1; + set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); }
/* @@@ -3358,10 -3369,10 +3360,10 @@@ static blk_status_t wait_dev_flush(stru { struct bio *bio = device->flush_bio;
- if (!device->flush_bio_sent) + if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)) return BLK_STS_OK;
- device->flush_bio_sent = 0; + clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); wait_for_completion_io(&device->flush_wait);
return bio->bi_status; @@@ -3385,16 -3396,14 +3387,16 @@@ static int barrier_all_devices(struct b int errors_wait = 0; blk_status_t ret;
+ lockdep_assert_held(&info->fs_devices->device_list_mutex); /* send down all the barriers */ head = &info->fs_devices->devices; - list_for_each_entry_rcu(dev, head, dev_list) { - if (dev->missing) + list_for_each_entry(dev, head, dev_list) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) continue; if (!dev->bdev) continue; - if (!dev->in_fs_metadata || !dev->writeable) + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue;
write_dev_flush(dev); @@@ -3402,15 -3411,14 +3404,15 @@@ }
/* wait for all the barriers */ - list_for_each_entry_rcu(dev, head, dev_list) { - if (dev->missing) + list_for_each_entry(dev, head, dev_list) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) continue; if (!dev->bdev) { errors_wait++; continue; } - if (!dev->in_fs_metadata || !dev->writeable) + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue;
ret = wait_dev_flush(dev); @@@ -3502,13 -3510,12 +3504,13 @@@ int write_all_supers(struct btrfs_fs_in } }
- list_for_each_entry_rcu(dev, head, dev_list) { + list_for_each_entry(dev, head, dev_list) { if (!dev->bdev) { total_errors++; continue; } - if (!dev->in_fs_metadata || !dev->writeable) + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue;
btrfs_set_stack_device_generation(dev_item, 0); @@@ -3544,11 -3551,10 +3546,11 @@@ }
total_errors = 0; - list_for_each_entry_rcu(dev, head, dev_list) { + list_for_each_entry(dev, head, dev_list) { if (!dev->bdev) continue; - if (!dev->in_fs_metadata || !dev->writeable) + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue;
ret = wait_dev_supers(dev, max_mirrors); diff --combined fs/btrfs/free-space-cache.c index 9e8c1f046e02,fb1382893bfc..9088b0b0d10f --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@@ -22,6 -22,7 +22,7 @@@ #include <linux/slab.h> #include <linux/math64.h> #include <linux/ratelimit.h> + #include <linux/bpf.h> #include "ctree.h" #include "free-space-cache.h" #include "transaction.h" @@@ -332,6 -333,7 +333,7 @@@ static int io_ctl_init(struct btrfs_io_
return 0; } + BPF_ALLOW_ERROR_INJECTION(io_ctl_init);
static void io_ctl_free(struct btrfs_io_ctl *io_ctl) { @@@ -993,7 -995,8 +995,7 @@@ update_cache_item(struct btrfs_trans_ha ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, - GFP_NOFS); + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL); goto fail; } leaf = path->nodes[0]; @@@ -1007,7 -1010,7 +1009,7 @@@ clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, - NULL, GFP_NOFS); + NULL); btrfs_release_path(path); goto fail; } @@@ -1104,7 -1107,8 +1106,7 @@@ static int flush_dirty_cache(struct ino ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, - GFP_NOFS); + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL);
return ret; } @@@ -1125,7 -1129,8 +1127,7 @@@ cleanup_write_cache_enospc(struct inod { io_ctl_drop_pages(io_ctl); unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, cached_state, - GFP_NOFS); + i_size_read(inode) - 1, cached_state); }
static int __btrfs_wait_cache_io(struct btrfs_root *root, @@@ -1319,7 -1324,7 +1321,7 @@@ static int __btrfs_write_out_cache(stru io_ctl_drop_pages(io_ctl);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, &cached_state, GFP_NOFS); + i_size_read(inode) - 1, &cached_state);
/* * at this point the pages are under IO and we're happy, @@@ -3545,7 -3550,7 +3547,7 @@@ int btrfs_write_out_ino_cache(struct bt if (ret) { if (release_metadata) btrfs_delalloc_release_metadata(BTRFS_I(inode), - inode->i_size); + inode->i_size, true); #ifdef DEBUG btrfs_err(fs_info, "failed to write free ino cache for root %llu", diff --combined include/linux/bpf.h index b63a592ad29d,da54ef644fcd..0dcd1d7c9825 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@@ -200,6 -200,9 +200,9 @@@ struct bpf_prog_aux u32 max_ctx_offset; u32 stack_depth; u32 id; + u32 func_cnt; + struct bpf_prog **func; + void *jit_data; /* JIT specific data. arch dependent */ struct latch_tree_node ksym_tnode; struct list_head ksym_lnode; const struct bpf_prog_ops *ops; @@@ -285,6 -288,9 +288,9 @@@ int bpf_prog_array_copy_to_user(struct
void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, struct bpf_prog *old_prog); + int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, + __u32 __user *prog_ids, u32 request_cnt, + __u32 __user *prog_cnt); int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, @@@ -399,6 -405,7 +405,7 @@@ static inline void bpf_long_memcpy(voi
/* verify correctness of eBPF program */ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); + void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth);
/* Map specifics */ struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); @@@ -419,8 -426,6 +426,8 @@@ static inline int bpf_map_attr_numa_nod attr->numa_node : NUMA_NO_NODE; }
+struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type); + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@@ -508,12 -513,6 +515,12 @@@ static inline int cpu_map_enqueue(struc { return 0; } + +static inline struct bpf_prog *bpf_prog_get_type_path(const char *name, + enum bpf_prog_type type) +{ + return ERR_PTR(-EOPNOTSUPP); +} #endif /* CONFIG_BPF_SYSCALL */
static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, @@@ -522,8 -521,6 +529,8 @@@ return bpf_prog_get_type_dev(ufd, type, false); }
+bool bpf_prog_get_ok(struct bpf_prog *, enum bpf_prog_type *, bool); + int bpf_prog_offload_compile(struct bpf_prog *prog); void bpf_prog_offload_destroy(struct bpf_prog *prog);
@@@ -586,4 -583,15 +593,15 @@@ extern const struct bpf_func_proto bpf_ void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) + #ifdef CONFIG_BPF_KPROBE_OVERRIDE + #define BPF_ALLOW_ERROR_INJECTION(fname) \ + static unsigned long __used \ + __attribute__((__section__("_kprobe_error_inject_list"))) \ + _eil_addr_##fname = (unsigned long)fname; + #else + #define BPF_ALLOW_ERROR_INJECTION(fname) + #endif + #endif + #endif /* _LINUX_BPF_H */ diff --combined include/linux/module.h index e6249795f9e2,548fa09fa806..0fd65481c045 --- a/include/linux/module.h +++ b/include/linux/module.h @@@ -475,6 -475,11 +475,11 @@@ struct module ctor_fn_t *ctors; unsigned int num_ctors; #endif + + #ifdef CONFIG_BPF_KPROBE_OVERRIDE + unsigned int num_kprobe_ei_funcs; + unsigned long *kprobe_ei_funcs; + #endif } ____cacheline_aligned __randomize_layout; #ifndef MODULE_ARCH_INIT #define MODULE_ARCH_INIT {} @@@ -606,9 -611,6 +611,9 @@@ int ref_module(struct module *a, struc __mod ? __mod->name : "kernel"; \ })
+/* Dereference module function descriptor */ +void *dereference_module_function_descriptor(struct module *mod, void *ptr); + /* For kallsyms to ask for address resolution. namebuf should be at * least KSYM_NAME_LEN long: a pointer to namebuf is returned if * found, otherwise NULL. */ @@@ -763,13 -765,6 +768,13 @@@ static inline bool is_module_sig_enforc return false; }
+/* Dereference module function descriptor */ +static inline +void *dereference_module_function_descriptor(struct module *mod, void *ptr) +{ + return ptr; +} + #endif /* CONFIG_MODULES */
#ifdef CONFIG_SYSFS diff --combined include/linux/pci.h index 95807535d175,0314e0716c30..66cca1c6f742 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@@ -48,17 -48,17 +48,17 @@@ * In the interest of not exposing interfaces to user-space unnecessarily, * the following kernel-only defines are being added here. */ -#define PCI_DEVID(bus, devfn) ((((u16)(bus)) << 8) | (devfn)) +#define PCI_DEVID(bus, devfn) ((((u16)(bus)) << 8) | (devfn)) /* return bus from PCI devid = ((u16)bus_number) << 8) | devfn */ #define PCI_BUS_NUM(x) (((x) >> 8) & 0xff)
/* pci_slot represents a physical slot */ struct pci_slot { - struct pci_bus *bus; /* The bus this slot is on */ - struct list_head list; /* node in list of slots on this bus */ - struct hotplug_slot *hotplug; /* Hotplug info (migrate over time) */ - unsigned char number; /* PCI_SLOT(pci_dev->devfn) */ - struct kobject kobj; + struct pci_bus *bus; /* Bus this slot is on */ + struct list_head list; /* Node in list of slots */ + struct hotplug_slot *hotplug; /* Hotplug info (move here) */ + unsigned char number; /* PCI_SLOT(pci_dev->devfn) */ + struct kobject kobj; };
static inline const char *pci_slot_name(const struct pci_slot *slot) @@@ -72,7 -72,9 +72,7 @@@ enum pci_mmap_state pci_mmap_mem };
-/* - * For PCI devices, the region numbers are assigned this way: - */ +/* For PCI devices, the region numbers are assigned this way: */ enum { /* #0-5: standard PCI resources */ PCI_STD_RESOURCES, @@@ -81,23 -83,23 +81,23 @@@ /* #6: expansion ROM resource */ PCI_ROM_RESOURCE,
- /* device specific resources */ + /* Device-specific resources */ #ifdef CONFIG_PCI_IOV PCI_IOV_RESOURCES, PCI_IOV_RESOURCE_END = PCI_IOV_RESOURCES + PCI_SRIOV_NUM_BARS - 1, #endif
- /* resources assigned to buses behind the bridge */ + /* Resources assigned to buses behind the bridge */ #define PCI_BRIDGE_RESOURCE_NUM 4
PCI_BRIDGE_RESOURCES, PCI_BRIDGE_RESOURCE_END = PCI_BRIDGE_RESOURCES + PCI_BRIDGE_RESOURCE_NUM - 1,
- /* total resources associated with a PCI device */ + /* Total resources associated with a PCI device */ PCI_NUM_RESOURCES,
- /* preserve this for compatibility */ + /* Preserve this for compatibility */ DEVICE_COUNT_RESOURCE = PCI_NUM_RESOURCES, };
@@@ -150,10 -152,9 +150,10 @@@ static inline const char *pci_power_nam #define PCI_PM_D3COLD_WAIT 100 #define PCI_PM_BUS_WAIT 50
-/** The pci_channel state describes connectivity between the CPU and - * the pci device. If some PCI bus between here and the pci device - * has crashed or locked up, this info is reflected here. +/** + * The pci_channel state describes connectivity between the CPU and + * the PCI device. If some PCI bus between here and the PCI device + * has crashed or locked up, this info is reflected here. */ typedef unsigned int __bitwise pci_channel_state_t;
@@@ -183,7 -184,9 +183,7 @@@ enum pcie_reset_state
typedef unsigned short __bitwise pci_dev_flags_t; enum pci_dev_flags { - /* INTX_DISABLE in PCI_COMMAND register disables MSI - * generation too. - */ + /* INTX_DISABLE in PCI_COMMAND register disables MSI too */ PCI_DEV_FLAGS_MSI_INTX_DISABLE_BUG = (__force pci_dev_flags_t) (1 << 0), /* Device configuration is irrevocably lost if disabled into D3 */ PCI_DEV_FLAGS_NO_D3 = (__force pci_dev_flags_t) (1 << 1), @@@ -199,7 -202,7 +199,7 @@@ PCI_DEV_FLAGS_NO_PM_RESET = (__force pci_dev_flags_t) (1 << 7), /* Get VPD from function 0 VPD */ PCI_DEV_FLAGS_VPD_REF_F0 = (__force pci_dev_flags_t) (1 << 8), - /* a non-root bridge where translation occurs, stop alias search here */ + /* A non-root bridge where translation occurs, stop alias search here */ PCI_DEV_FLAGS_BRIDGE_XLATE_ROOT = (__force pci_dev_flags_t) (1 << 9), /* Do not use FLR even if device advertises PCI_AF_CAP */ PCI_DEV_FLAGS_NO_FLR_RESET = (__force pci_dev_flags_t) (1 << 10), @@@ -219,17 -222,17 +219,17 @@@ enum pci_bus_flags PCI_BUS_FLAGS_NO_AERSID = (__force pci_bus_flags_t) 4, };
-/* These values come from the PCI Express Spec */ +/* Values from Link Status register, PCIe r3.1, sec 7.8.8 */ enum pcie_link_width { PCIE_LNK_WIDTH_RESRV = 0x00, PCIE_LNK_X1 = 0x01, PCIE_LNK_X2 = 0x02, PCIE_LNK_X4 = 0x04, PCIE_LNK_X8 = 0x08, - PCIE_LNK_X12 = 0x0C, + PCIE_LNK_X12 = 0x0c, PCIE_LNK_X16 = 0x10, PCIE_LNK_X32 = 0x20, - PCIE_LNK_WIDTH_UNKNOWN = 0xFF, + PCIE_LNK_WIDTH_UNKNOWN = 0xff, };
/* Based on the PCI Hotplug Spec, but some values are made up by us */ @@@ -260,15 -263,15 +260,15 @@@ enum pci_bus_speed };
struct pci_cap_saved_data { - u16 cap_nr; - bool cap_extended; - unsigned int size; - u32 data[0]; + u16 cap_nr; + bool cap_extended; + unsigned int size; + u32 data[0]; };
struct pci_cap_saved_state { - struct hlist_node next; - struct pci_cap_saved_data cap; + struct hlist_node next; + struct pci_cap_saved_data cap; };
struct irq_affinity; @@@ -277,17 -280,19 +277,17 @@@ struct pci_vpd struct pci_sriov; struct pci_ats;
-/* - * The pci_dev structure is used to describe PCI devices. - */ +/* The pci_dev structure describes PCI devices */ struct pci_dev { - struct list_head bus_list; /* node in per-bus list */ - struct pci_bus *bus; /* bus this device is on */ - struct pci_bus *subordinate; /* bus this device bridges to */ + struct list_head bus_list; /* Node in per-bus list */ + struct pci_bus *bus; /* Bus this device is on */ + struct pci_bus *subordinate; /* Bus this device bridges to */
- void *sysdata; /* hook for sys-specific extension */ - struct proc_dir_entry *procent; /* device entry in /proc/bus/pci */ + void *sysdata; /* Hook for sys-specific extension */ + struct proc_dir_entry *procent; /* Device entry in /proc/bus/pci */ struct pci_slot *slot; /* Physical slot this device is in */
- unsigned int devfn; /* encoded device & function index */ + unsigned int devfn; /* Encoded device & function index */ unsigned short vendor; unsigned short device; unsigned short subsystem_vendor; @@@ -302,12 -307,12 +302,12 @@@ u8 msi_cap; /* MSI capability offset */ u8 msix_cap; /* MSI-X capability offset */ u8 pcie_mpss:3; /* PCIe Max Payload Size Supported */ - u8 rom_base_reg; /* which config register controls the ROM */ - u8 pin; /* which interrupt pin this device uses */ - u16 pcie_flags_reg; /* cached PCIe Capabilities Register */ - unsigned long *dma_alias_mask;/* mask of enabled devfn aliases */ + u8 rom_base_reg; /* Config register controlling ROM */ + u8 pin; /* Interrupt pin this device uses */ + u16 pcie_flags_reg; /* Cached PCIe Capabilities Register */ + unsigned long *dma_alias_mask;/* Mask of enabled devfn aliases */
- struct pci_driver *driver; /* which driver has allocated this device */ + struct pci_driver *driver; /* Driver bound to this device */ u64 dma_mask; /* Mask of the bits of bus address this device implements. Normally this is 0xffffffff. You only need to change @@@ -316,9 -321,9 +316,9 @@@
struct device_dma_parameters dma_parms;
- pci_power_t current_state; /* Current operating state. In ACPI-speak, - this is D0-D3, D0 being fully functional, - and D3 being off. */ + pci_power_t current_state; /* Current operating state. In ACPI, + this is D0-D3, D0 being fully + functional, and D3 being off. */ u8 pm_cap; /* PM capability offset */ unsigned int pme_support:5; /* Bitmask of states from which PME# can be generated */ @@@ -329,10 -334,10 +329,10 @@@ unsigned int no_d3cold:1; /* D3cold is forbidden */ unsigned int bridge_d3:1; /* Allow D3 for bridge */ unsigned int d3cold_allowed:1; /* D3cold is allowed by user */ - unsigned int mmio_always_on:1; /* disallow turning off io/mem - decoding during bar sizing */ + unsigned int mmio_always_on:1; /* Disallow turning off io/mem + decoding during BAR sizing */ unsigned int wakeup_prepared:1; - unsigned int runtime_d3cold:1; /* whether go through runtime + unsigned int runtime_d3cold:1; /* Whether go through runtime D3cold, not set for devices powered on/off by the corresponding bridge */ @@@ -345,14 -350,12 +345,14 @@@
#ifdef CONFIG_PCIEASPM struct pcie_link_state *link_state; /* ASPM link state */ + unsigned int ltr_path:1; /* Latency Tolerance Reporting + supported from root to here */ #endif
- pci_channel_state_t error_state; /* current connectivity state */ - struct device dev; /* Generic device interface */ + pci_channel_state_t error_state; /* Current connectivity state */ + struct device dev; /* Generic device interface */
- int cfg_size; /* Size of configuration space */ + int cfg_size; /* Size of config space */
/* * Instead of touching interrupt line and base address registers @@@ -361,47 -364,47 +361,47 @@@ unsigned int irq; struct resource resource[DEVICE_COUNT_RESOURCE]; /* I/O and memory regions + expansion ROMs */
- bool match_driver; /* Skip attaching driver */ - /* These fields are used by common fixups */ - unsigned int transparent:1; /* Subtractive decode PCI bridge */ - unsigned int multifunction:1;/* Part of multi-function device */ - /* keep track of device state */ + bool match_driver; /* Skip attaching driver */ + + unsigned int transparent:1; /* Subtractive decode bridge */ + unsigned int multifunction:1; /* Multi-function device */ + unsigned int is_added:1; - unsigned int is_busmaster:1; /* device is busmaster */ - unsigned int no_msi:1; /* device may not use msi */ - unsigned int no_64bit_msi:1; /* device may only use 32-bit MSIs */ - unsigned int block_cfg_access:1; /* config space access is blocked */ - unsigned int broken_parity_status:1; /* Device generates false positive parity */ - unsigned int irq_reroute_variant:2; /* device needs IRQ rerouting variant */ + unsigned int is_busmaster:1; /* Is busmaster */ + unsigned int no_msi:1; /* May not use MSI */ + unsigned int no_64bit_msi:1; /* May only use 32-bit MSIs */ + unsigned int block_cfg_access:1; /* Config space access blocked */ + unsigned int broken_parity_status:1; /* Generates false positive parity */ + unsigned int irq_reroute_variant:2; /* Needs IRQ rerouting variant */ unsigned int msi_enabled:1; unsigned int msix_enabled:1; - unsigned int ari_enabled:1; /* ARI forwarding */ - unsigned int ats_enabled:1; /* Address Translation Service */ + unsigned int ari_enabled:1; /* ARI forwarding */ + unsigned int ats_enabled:1; /* Address Translation Svc */ unsigned int pasid_enabled:1; /* Process Address Space ID */ unsigned int pri_enabled:1; /* Page Request Interface */ unsigned int is_managed:1; - unsigned int needs_freset:1; /* Dev requires fundamental reset */ + unsigned int needs_freset:1; /* Requires fundamental reset */ unsigned int state_saved:1; unsigned int is_physfn:1; unsigned int is_virtfn:1; unsigned int reset_fn:1; - unsigned int is_hotplug_bridge:1; - unsigned int is_thunderbolt:1; /* 
Thunderbolt controller */ - unsigned int __aer_firmware_first_valid:1; + unsigned int is_hotplug_bridge:1; + unsigned int is_thunderbolt:1; /* Thunderbolt controller */ + unsigned int __aer_firmware_first_valid:1; unsigned int __aer_firmware_first:1; - unsigned int broken_intx_masking:1; /* INTx masking can't be used */ - unsigned int io_window_1k:1; /* Intel P2P bridge 1K I/O windows */ + unsigned int broken_intx_masking:1; /* INTx masking can't be used */ + unsigned int io_window_1k:1; /* Intel bridge 1K I/O windows */ unsigned int irq_managed:1; unsigned int has_secondary_link:1; - unsigned int non_compliant_bars:1; /* broken BARs; ignore them */ - unsigned int is_probed:1; /* device probing in progress */ + unsigned int non_compliant_bars:1; /* Broken BARs; ignore them */ + unsigned int is_probed:1; /* Device probing in progress */ pci_dev_flags_t dev_flags; atomic_t enable_cnt; /* pci_enable_device has been called */
- u32 saved_config_space[16]; /* config space saved at suspend time */ + u32 saved_config_space[16]; /* Config space saved at suspend time */ struct hlist_head saved_cap_space; - struct bin_attribute *rom_attr; /* attribute descriptor for sysfs ROM entry */ - int rom_attr_enabled; /* has display of the rom attribute been enabled? */ + struct bin_attribute *rom_attr; /* Attribute descriptor for sysfs ROM entry */ + int rom_attr_enabled; /* Display of ROM attribute enabled? */ struct bin_attribute *res_attr[DEVICE_COUNT_RESOURCE]; /* sysfs file for resources */ struct bin_attribute *res_attr_wc[DEVICE_COUNT_RESOURCE]; /* sysfs file for WC mapping of resources */
@@@ -416,12 -419,12 +416,12 @@@ struct pci_vpd *vpd; #ifdef CONFIG_PCI_ATS union { - struct pci_sriov *sriov; /* SR-IOV capability related */ - struct pci_dev *physfn; /* the PF this VF is associated with */ + struct pci_sriov *sriov; /* PF: SR-IOV info */ + struct pci_dev *physfn; /* VF: related PF */ }; u16 ats_cap; /* ATS Capability offset */ u8 ats_stu; /* ATS Smallest Translation Unit */ - atomic_t ats_ref_cnt; /* number of VFs with ATS enabled */ + atomic_t ats_ref_cnt; /* Number of VFs with ATS enabled */ #endif #ifdef CONFIG_PCI_PRI u32 pri_reqs_alloc; /* Number of PRI requests allocated */ @@@ -429,11 -432,11 +429,11 @@@ #ifdef CONFIG_PCI_PASID u16 pasid_features; #endif - phys_addr_t rom; /* Physical address of ROM if it's not from the BAR */ - size_t romlen; /* Length of ROM if it's not from the BAR */ - char *driver_override; /* Driver name to force a match */ + phys_addr_t rom; /* Physical address if not from BAR */ + size_t romlen; /* Length if not from BAR */ + char *driver_override; /* Driver name to force a match */
- unsigned long priv_flags; /* Private flags for the pci driver */ + unsigned long priv_flags; /* Private flags for the PCI driver */ };
static inline struct pci_dev *pci_physfn(struct pci_dev *dev) @@@ -456,26 -459,26 +456,26 @@@ static inline int pci_channel_offline(s }
struct pci_host_bridge { - struct device dev; - struct pci_bus *bus; /* root bus */ - struct pci_ops *ops; - void *sysdata; - int busnr; + struct device dev; + struct pci_bus *bus; /* Root bus */ + struct pci_ops *ops; + void *sysdata; + int busnr; struct list_head windows; /* resource_entry */ - u8 (*swizzle_irq)(struct pci_dev *, u8 *); /* platform IRQ swizzler */ + u8 (*swizzle_irq)(struct pci_dev *, u8 *); /* Platform IRQ swizzler */ int (*map_irq)(const struct pci_dev *, u8, u8); void (*release_fn)(struct pci_host_bridge *); - void *release_data; + void *release_data; struct msi_controller *msi; - unsigned int ignore_reset_delay:1; /* for entire hierarchy */ - unsigned int no_ext_tags:1; /* no Extended Tags */ + unsigned int ignore_reset_delay:1; /* For entire hierarchy */ + unsigned int no_ext_tags:1; /* No Extended Tags */ /* Resource alignment requirements */ resource_size_t (*align_resource)(struct pci_dev *dev, const struct resource *res, resource_size_t start, resource_size_t size, resource_size_t align); - unsigned long private[0] ____cacheline_aligned; + unsigned long private[0] ____cacheline_aligned; };
#define to_pci_host_bridge(n) container_of(n, struct pci_host_bridge, dev) @@@ -497,8 -500,8 +497,8 @@@ void pci_free_host_bridge(struct pci_ho struct pci_host_bridge *pci_find_host_bridge(struct pci_bus *bus);
void pci_set_host_bridge_release(struct pci_host_bridge *bridge, - void (*release_fn)(struct pci_host_bridge *), - void *release_data); + void (*release_fn)(struct pci_host_bridge *), + void *release_data);
int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge);
@@@ -518,32 -521,32 +518,32 @@@ #define PCI_SUBTRACTIVE_DECODE 0x1
struct pci_bus_resource { - struct list_head list; - struct resource *res; - unsigned int flags; + struct list_head list; + struct resource *res; + unsigned int flags; };
#define PCI_REGION_FLAG_MASK 0x0fU /* These bits of resource flags tell us the PCI region flags */
struct pci_bus { - struct list_head node; /* node in list of buses */ - struct pci_bus *parent; /* parent bus this bridge is on */ - struct list_head children; /* list of child buses */ - struct list_head devices; /* list of devices on this bus */ - struct pci_dev *self; /* bridge device as seen by parent */ - struct list_head slots; /* list of slots on this bus; + struct list_head node; /* Node in list of buses */ + struct pci_bus *parent; /* Parent bus this bridge is on */ + struct list_head children; /* List of child buses */ + struct list_head devices; /* List of devices on this bus */ + struct pci_dev *self; /* Bridge device as seen by parent */ + struct list_head slots; /* List of slots on this bus; protected by pci_slot_mutex */ struct resource *resource[PCI_BRIDGE_RESOURCE_NUM]; - struct list_head resources; /* address space routed to this bus */ - struct resource busn_res; /* bus numbers routed to this bus */ + struct list_head resources; /* Address space routed to this bus */ + struct resource busn_res; /* Bus numbers routed to this bus */
- struct pci_ops *ops; /* configuration access functions */ + struct pci_ops *ops; /* Configuration access functions */ struct msi_controller *msi; /* MSI controller */ - void *sysdata; /* hook for sys-specific extension */ - struct proc_dir_entry *procdir; /* directory entry in /proc/bus/pci */ + void *sysdata; /* Hook for sys-specific extension */ + struct proc_dir_entry *procdir; /* Directory entry in /proc/bus/pci */
- unsigned char number; /* bus number */ - unsigned char primary; /* number of primary bridge */ + unsigned char number; /* Bus number */ + unsigned char primary; /* Number of primary bridge */ unsigned char max_bus_speed; /* enum pci_bus_speed */ unsigned char cur_bus_speed; /* enum pci_bus_speed */ #ifdef CONFIG_PCI_DOMAINS_GENERIC @@@ -552,12 -555,12 +552,12 @@@
char name[48];
- unsigned short bridge_ctl; /* manage NO_ISA/FBB/et al behaviors */ - pci_bus_flags_t bus_flags; /* inherited by child buses */ + unsigned short bridge_ctl; /* Manage NO_ISA/FBB/et al behaviors */ + pci_bus_flags_t bus_flags; /* Inherited by child buses */ struct device *bridge; struct device dev; - struct bin_attribute *legacy_io; /* legacy I/O for this bus */ - struct bin_attribute *legacy_mem; /* legacy mem */ + struct bin_attribute *legacy_io; /* Legacy I/O for this bus */ + struct bin_attribute *legacy_mem; /* Legacy mem */ unsigned int is_added:1; };
@@@ -614,7 -617,9 +614,7 @@@ static inline bool pci_dev_msi_enabled( static inline bool pci_dev_msi_enabled(struct pci_dev *pci_dev) { return false; } #endif
-/* - * Error values that may be returned by PCI functions. - */ +/* Error values that may be returned by PCI functions */ #define PCIBIOS_SUCCESSFUL 0x00 #define PCIBIOS_FUNC_NOT_SUPPORTED 0x81 #define PCIBIOS_BAD_VENDOR_ID 0x83 @@@ -623,7 -628,9 +623,7 @@@ #define PCIBIOS_SET_FAILED 0x88 #define PCIBIOS_BUFFER_TOO_SMALL 0x89
-/* - * Translate above to generic errno for passing back through non-PCI code. - */ +/* Translate above to generic errno for passing back through non-PCI code */ static inline int pcibios_err_to_errno(int err) { if (err <= PCIBIOS_SUCCESSFUL) @@@ -673,13 -680,13 +673,13 @@@ typedef u32 pci_bus_addr_t #endif
struct pci_bus_region { - pci_bus_addr_t start; - pci_bus_addr_t end; + pci_bus_addr_t start; + pci_bus_addr_t end; };
struct pci_dynids { - spinlock_t lock; /* protects list, index */ - struct list_head list; /* for IDs added at runtime */ + spinlock_t lock; /* Protects list, index */ + struct list_head list; /* For IDs added at runtime */ };
@@@ -693,13 -700,13 +693,13 @@@ typedef unsigned int __bitwise pci_ers_result_t;
enum pci_ers_result { - /* no result/none/not supported in device driver */ + /* No result/none/not supported in device driver */ PCI_ERS_RESULT_NONE = (__force pci_ers_result_t) 1,
/* Device driver can recover without slot reset */ PCI_ERS_RESULT_CAN_RECOVER = (__force pci_ers_result_t) 2,
- /* Device driver wants slot to be reset. */ + /* Device driver wants slot to be reset */ PCI_ERS_RESULT_NEED_RESET = (__force pci_ers_result_t) 3,
/* Device has completely failed, is unrecoverable */ @@@ -735,27 -742,27 +735,27 @@@ struct pci_error_handlers
struct module; struct pci_driver { - struct list_head node; - const char *name; - const struct pci_device_id *id_table; /* must be non-NULL for probe to be called */ - int (*probe) (struct pci_dev *dev, const struct pci_device_id *id); /* New device inserted */ - void (*remove) (struct pci_dev *dev); /* Device removed (NULL if not a hot-plug capable driver) */ - int (*suspend) (struct pci_dev *dev, pm_message_t state); /* Device suspended */ - int (*suspend_late) (struct pci_dev *dev, pm_message_t state); - int (*resume_early) (struct pci_dev *dev); - int (*resume) (struct pci_dev *dev); /* Device woken up */ + struct list_head node; + const char *name; + const struct pci_device_id *id_table; /* Must be non-NULL for probe to be called */ + int (*probe)(struct pci_dev *dev, const struct pci_device_id *id); /* New device inserted */ + void (*remove)(struct pci_dev *dev); /* Device removed (NULL if not a hot-plug capable driver) */ + int (*suspend)(struct pci_dev *dev, pm_message_t state); /* Device suspended */ + int (*suspend_late)(struct pci_dev *dev, pm_message_t state); + int (*resume_early)(struct pci_dev *dev); + int (*resume) (struct pci_dev *dev); /* Device woken up */ void (*shutdown) (struct pci_dev *dev); - int (*sriov_configure) (struct pci_dev *dev, int num_vfs); /* PF pdev */ + int (*sriov_configure) (struct pci_dev *dev, int num_vfs); /* On PF */ const struct pci_error_handlers *err_handler; const struct attribute_group **groups; struct device_driver driver; - struct pci_dynids dynids; + struct pci_dynids dynids; };
#define to_pci_driver(drv) container_of(drv, struct pci_driver, driver)
/** - * PCI_DEVICE - macro used to describe a specific pci device + * PCI_DEVICE - macro used to describe a specific PCI device * @vend: the 16 bit PCI Vendor ID * @dev: the 16 bit PCI Device ID * @@@ -768,7 -775,7 +768,7 @@@ .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID
/** - * PCI_DEVICE_SUB - macro used to describe a specific pci device with subsystem + * PCI_DEVICE_SUB - macro used to describe a specific PCI device with subsystem * @vend: the 16 bit PCI Vendor ID * @dev: the 16 bit PCI Device ID * @subvend: the 16 bit PCI Subvendor ID @@@ -782,7 -789,7 +782,7 @@@ .subvendor = (subvend), .subdevice = (subdev)
/** - * PCI_DEVICE_CLASS - macro used to describe a specific pci device class + * PCI_DEVICE_CLASS - macro used to describe a specific PCI device class * @dev_class: the class, subclass, prog-if triple for this device * @dev_class_mask: the class mask for this device * @@@ -796,7 -803,7 +796,7 @@@ .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID
/** - * PCI_VDEVICE - macro used to describe a specific pci device in short form + * PCI_VDEVICE - macro used to describe a specific PCI device in short form * @vend: the vendor name * @dev: the 16 bit PCI Device ID * @@@ -805,21 -812,22 +805,21 @@@ * to PCI_ANY_ID. The macro allows the next field to follow as the device * private data. */ - #define PCI_VDEVICE(vend, dev) \ .vendor = PCI_VENDOR_ID_##vend, .device = (dev), \ .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID, 0, 0
enum { - PCI_REASSIGN_ALL_RSRC = 0x00000001, /* ignore firmware setup */ - PCI_REASSIGN_ALL_BUS = 0x00000002, /* reassign all bus numbers */ - PCI_PROBE_ONLY = 0x00000004, /* use existing setup */ - PCI_CAN_SKIP_ISA_ALIGN = 0x00000008, /* don't do ISA alignment */ - PCI_ENABLE_PROC_DOMAINS = 0x00000010, /* enable domains in /proc */ + PCI_REASSIGN_ALL_RSRC = 0x00000001, /* Ignore firmware setup */ + PCI_REASSIGN_ALL_BUS = 0x00000002, /* Reassign all bus numbers */ + PCI_PROBE_ONLY = 0x00000004, /* Use existing setup */ + PCI_CAN_SKIP_ISA_ALIGN = 0x00000008, /* Don't do ISA alignment */ + PCI_ENABLE_PROC_DOMAINS = 0x00000010, /* Enable domains in /proc */ PCI_COMPAT_DOMAIN_0 = 0x00000020, /* ... except domain 0 */ - PCI_SCAN_ALL_PCIE_DEVS = 0x00000040, /* scan all, not just dev 0 */ + PCI_SCAN_ALL_PCIE_DEVS = 0x00000040, /* Scan all, not just dev 0 */ };
-/* these external functions are only available when PCI support is enabled */ +/* These external functions are only available when PCI support is enabled */ #ifdef CONFIG_PCI
extern unsigned int pci_flags; @@@ -832,11 -840,11 +832,11 @@@ static inline int pci_has_flag(int flag void pcie_bus_configure_settings(struct pci_bus *bus);
enum pcie_bus_config_types { - PCIE_BUS_TUNE_OFF, /* don't touch MPS at all */ - PCIE_BUS_DEFAULT, /* ensure MPS matches upstream bridge */ - PCIE_BUS_SAFE, /* use largest MPS boot-time devices support */ - PCIE_BUS_PERFORMANCE, /* use MPS and MRRS for best performance */ - PCIE_BUS_PEER2PEER, /* set MPS = 128 for all devices */ + PCIE_BUS_TUNE_OFF, /* Don't touch MPS at all */ + PCIE_BUS_DEFAULT, /* Ensure MPS matches upstream bridge */ + PCIE_BUS_SAFE, /* Use largest MPS boot-time devices support */ + PCIE_BUS_PERFORMANCE, /* Use MPS and MRRS for best performance */ + PCIE_BUS_PEER2PEER, /* Set MPS = 128 for all devices */ };
extern enum pcie_bus_config_types pcie_bus_config; @@@ -845,7 -853,7 +845,7 @@@ extern struct bus_type pci_bus_type
/* Do NOT directly access these two variables, unless you are arch-specific PCI * code, or PCI core code. */ -extern struct list_head pci_root_buses; /* list of all known PCI buses */ +extern struct list_head pci_root_buses; /* List of all known PCI buses */ /* Some device drivers need know if PCI is initiated */ int no_pci_devices(void);
@@@ -883,8 -891,8 +883,8 @@@ int pci_bus_insert_busn_res(struct pci_ int pci_bus_update_busn_res_end(struct pci_bus *b, int busmax); void pci_bus_release_busn_res(struct pci_bus *b); struct pci_bus *pci_scan_root_bus(struct device *parent, int bus, - struct pci_ops *ops, void *sysdata, - struct list_head *resources); + struct pci_ops *ops, void *sysdata, + struct list_head *resources); int pci_scan_root_bus_bridge(struct pci_host_bridge *bridge); struct pci_bus *pci_add_new_bus(struct pci_bus *parent, struct pci_dev *dev, int busnr); @@@ -941,10 -949,10 +941,10 @@@ int pci_find_next_ht_capability(struct struct pci_bus *pci_find_next_bus(const struct pci_bus *from);
struct pci_dev *pci_get_device(unsigned int vendor, unsigned int device, - struct pci_dev *from); + struct pci_dev *from); struct pci_dev *pci_get_subsys(unsigned int vendor, unsigned int device, - unsigned int ss_vendor, unsigned int ss_device, - struct pci_dev *from); + unsigned int ss_vendor, unsigned int ss_device, + struct pci_dev *from); struct pci_dev *pci_get_slot(struct pci_bus *bus, unsigned int devfn); struct pci_dev *pci_get_domain_bus_and_slot(int domain, unsigned int bus, unsigned int devfn); @@@ -1020,7 -1028,7 +1020,7 @@@ static inline int pcie_capability_clear return pcie_capability_clear_and_set_dword(dev, pos, clear, 0); }
-/* user-space driven config access */ +/* User-space driven config access */ int pci_user_read_config_byte(struct pci_dev *dev, int where, u8 *val); int pci_user_read_config_word(struct pci_dev *dev, int where, u16 *val); int pci_user_read_config_dword(struct pci_dev *dev, int where, u32 *val); @@@ -1064,6 -1072,7 +1064,7 @@@ int pci_set_pcie_reset_state(struct pci int pci_set_cacheline_size(struct pci_dev *dev); #define HAVE_PCI_SET_MWI int __must_check pci_set_mwi(struct pci_dev *dev); + int __must_check pcim_set_mwi(struct pci_dev *dev); int pci_try_set_mwi(struct pci_dev *dev); void pci_clear_mwi(struct pci_dev *dev); void pci_intx(struct pci_dev *dev, int enable); @@@ -1162,7 -1171,7 +1163,7 @@@ unsigned int pci_rescan_bus(struct pci_ void pci_lock_rescan_remove(void); void pci_unlock_rescan_remove(void);
-/* Vital product data routines */ +/* Vital Product Data routines */ ssize_t pci_read_vpd(struct pci_dev *dev, loff_t pos, size_t count, void *buf); ssize_t pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, const void *buf); int pci_set_vpd_size(struct pci_dev *dev, size_t len); @@@ -1247,7 -1256,9 +1248,7 @@@ static inline pci_bus_addr_t pci_bus_ad int __must_check __pci_register_driver(struct pci_driver *, struct module *, const char *mod_name);
-/* - * pci_register_driver must be a macro so that KBUILD_MODNAME can be expanded - */ +/* pci_register_driver() must be a macro so KBUILD_MODNAME can be expanded */ #define pci_register_driver(driver) \ __pci_register_driver(driver, THIS_MODULE, KBUILD_MODNAME)
@@@ -1262,7 -1273,8 +1263,7 @@@ void pci_unregister_driver(struct pci_d * use this macro once, and calling it replaces module_init() and module_exit() */ #define module_pci_driver(__pci_driver) \ - module_driver(__pci_driver, pci_register_driver, \ - pci_unregister_driver) + module_driver(__pci_driver, pci_register_driver, pci_unregister_driver)
/** * builtin_pci_driver() - Helper macro for registering a PCI driver @@@ -1301,10 -1313,10 +1302,10 @@@ resource_size_t pcibios_iov_resource_al int pci_set_vga_state(struct pci_dev *pdev, bool decode, unsigned int command_bits, u32 flags);
-#define PCI_IRQ_LEGACY (1 << 0) /* allow legacy interrupts */ -#define PCI_IRQ_MSI (1 << 1) /* allow MSI interrupts */ -#define PCI_IRQ_MSIX (1 << 2) /* allow MSI-X interrupts */ -#define PCI_IRQ_AFFINITY (1 << 3) /* auto-assign affinity */ +#define PCI_IRQ_LEGACY (1 << 0) /* Allow legacy interrupts */ +#define PCI_IRQ_MSI (1 << 1) /* Allow MSI interrupts */ +#define PCI_IRQ_MSIX (1 << 2) /* Allow MSI-X interrupts */ +#define PCI_IRQ_AFFINITY (1 << 3) /* Auto-assign affinity */ #define PCI_IRQ_ALL_TYPES \ (PCI_IRQ_LEGACY | PCI_IRQ_MSI | PCI_IRQ_MSIX)
@@@ -1323,8 -1335,8 +1324,8 @@@ #define pci_pool_free(pool, vaddr, addr) dma_pool_free(pool, vaddr, addr)
struct msix_entry { - u32 vector; /* kernel uses to write allocated vector */ - u16 entry; /* driver uses to specify entry, OS writes */ + u32 vector; /* Kernel uses to write allocated vector */ + u16 entry; /* Driver uses to specify entry, OS writes */ };
#ifdef CONFIG_PCI_MSI @@@ -1364,10 -1376,10 +1365,10 @@@ static inline int pci_msi_enabled(void static inline int pci_enable_msi(struct pci_dev *dev) { return -ENOSYS; } static inline int pci_enable_msix_range(struct pci_dev *dev, - struct msix_entry *entries, int minvec, int maxvec) + struct msix_entry *entries, int minvec, int maxvec) { return -ENOSYS; } static inline int pci_enable_msix_exact(struct pci_dev *dev, - struct msix_entry *entries, int nvec) + struct msix_entry *entries, int nvec) { return -ENOSYS; }
static inline int @@@ -1532,9 -1544,9 +1533,9 @@@ static inline int acpi_pci_bus_find_dom int pci_bus_find_domain_nr(struct pci_bus *bus, struct device *parent); #endif
-/* some architectures require additional setup to direct VGA traffic */ +/* Some architectures require additional setup to direct VGA traffic */ typedef int (*arch_set_vga_state_t)(struct pci_dev *pdev, bool decode, - unsigned int command_bits, u32 flags); + unsigned int command_bits, u32 flags); void pci_register_set_vga_state(arch_set_vga_state_t func);
static inline int @@@ -1573,9 -1585,10 +1574,9 @@@ static inline void pci_clear_flags(int static inline int pci_has_flag(int flag) { return 0; }
/* - * If the system does not have PCI, clearly these return errors. Define - * these as simple inline functions to avoid hair in drivers. + * If the system does not have PCI, clearly these return errors. Define + * these as simple inline functions to avoid hair in drivers. */ - #define _PCI_NOP(o, s, t) \ static inline int pci_##o##_config_##s(struct pci_dev *dev, \ int where, t val) \ @@@ -1714,10 -1727,8 +1715,10 @@@ int pci_iobar_pfn(struct pci_dev *pdev #define pci_root_bus_fwnode(bus) NULL #endif
-/* these helpers provide future and backwards compatibility - * for accessing popular PCI BAR info */ +/* + * These helpers provide future and backwards compatibility + * for accessing popular PCI BAR info + */ #define pci_resource_start(dev, bar) ((dev)->resource[(bar)].start) #define pci_resource_end(dev, bar) ((dev)->resource[(bar)].end) #define pci_resource_flags(dev, bar) ((dev)->resource[(bar)].flags) @@@ -1729,8 -1740,7 +1730,8 @@@ (pci_resource_end((dev), (bar)) - \ pci_resource_start((dev), (bar)) + 1))
-/* Similar to the helpers above, these manipulate per-pci_dev +/* + * Similar to the helpers above, these manipulate per-pci_dev * driver-specific data. They are really just a wrapper around * the generic device structure functions of these calls. */ @@@ -1744,14 -1754,16 +1745,14 @@@ static inline void pci_set_drvdata(stru dev_set_drvdata(&pdev->dev, data); }
-/* If you want to know what to call your pci_dev, ask this function. - * Again, it's a wrapper around the generic device. - */ static inline const char *pci_name(const struct pci_dev *pdev) { return dev_name(&pdev->dev); }
-/* Some archs don't want to expose struct resource to userland as-is +/* + * Some archs don't want to expose struct resource to userland as-is * in sysfs and /proc */ #ifdef HAVE_ARCH_PCI_RESOURCE_TO_USER @@@ -1770,16 -1782,16 +1771,16 @@@ static inline void pci_resource_to_user
/* - * The world is not perfect and supplies us with broken PCI devices. - * For at least a part of these bugs we need a work-around, so both - * generic (drivers/pci/quirks.c) and per-architecture code can define - * fixup hooks to be called for particular buggy devices. + * The world is not perfect and supplies us with broken PCI devices. + * For at least a part of these bugs we need a work-around, so both + * generic (drivers/pci/quirks.c) and per-architecture code can define + * fixup hooks to be called for particular buggy devices. */
struct pci_fixup { - u16 vendor; /* You can use PCI_ANY_ID here of course */ - u16 device; /* You can use PCI_ANY_ID here of course */ - u32 class; /* You can use PCI_ANY_ID here too */ + u16 vendor; /* Or PCI_ANY_ID */ + u16 device; /* Or PCI_ANY_ID */ + u32 class; /* Or PCI_ANY_ID */ unsigned int class_shift; /* should be 0, 8, 16 */ void (*hook)(struct pci_dev *dev); }; @@@ -1821,19 -1833,23 +1822,19 @@@ enum pci_fixup_pass #define DECLARE_PCI_FIXUP_CLASS_RESUME(vendor, device, class, \ class_shift, hook) \ DECLARE_PCI_FIXUP_SECTION(.pci_fixup_resume, \ - resume##hook, vendor, device, class, \ - class_shift, hook) + resume##hook, vendor, device, class, class_shift, hook) #define DECLARE_PCI_FIXUP_CLASS_RESUME_EARLY(vendor, device, class, \ class_shift, hook) \ DECLARE_PCI_FIXUP_SECTION(.pci_fixup_resume_early, \ - resume_early##hook, vendor, device, \ - class, class_shift, hook) + resume_early##hook, vendor, device, class, class_shift, hook) #define DECLARE_PCI_FIXUP_CLASS_SUSPEND(vendor, device, class, \ class_shift, hook) \ DECLARE_PCI_FIXUP_SECTION(.pci_fixup_suspend, \ - suspend##hook, vendor, device, class, \ - class_shift, hook) + suspend##hook, vendor, device, class, class_shift, hook) #define DECLARE_PCI_FIXUP_CLASS_SUSPEND_LATE(vendor, device, class, \ class_shift, hook) \ DECLARE_PCI_FIXUP_SECTION(.pci_fixup_suspend_late, \ - suspend_late##hook, vendor, device, \ - class, class_shift, hook) + suspend_late##hook, vendor, device, class, class_shift, hook)
#define DECLARE_PCI_FIXUP_EARLY(vendor, device, hook) \ DECLARE_PCI_FIXUP_SECTION(.pci_fixup_early, \ @@@ -1849,16 -1865,20 +1850,16 @@@ hook, vendor, device, PCI_ANY_ID, 0, hook) #define DECLARE_PCI_FIXUP_RESUME(vendor, device, hook) \ DECLARE_PCI_FIXUP_SECTION(.pci_fixup_resume, \ - resume##hook, vendor, device, \ - PCI_ANY_ID, 0, hook) + resume##hook, vendor, device, PCI_ANY_ID, 0, hook) #define DECLARE_PCI_FIXUP_RESUME_EARLY(vendor, device, hook) \ DECLARE_PCI_FIXUP_SECTION(.pci_fixup_resume_early, \ - resume_early##hook, vendor, device, \ - PCI_ANY_ID, 0, hook) + resume_early##hook, vendor, device, PCI_ANY_ID, 0, hook) #define DECLARE_PCI_FIXUP_SUSPEND(vendor, device, hook) \ DECLARE_PCI_FIXUP_SECTION(.pci_fixup_suspend, \ - suspend##hook, vendor, device, \ - PCI_ANY_ID, 0, hook) + suspend##hook, vendor, device, PCI_ANY_ID, 0, hook) #define DECLARE_PCI_FIXUP_SUSPEND_LATE(vendor, device, hook) \ DECLARE_PCI_FIXUP_SECTION(.pci_fixup_suspend_late, \ - suspend_late##hook, vendor, device, \ - PCI_ANY_ID, 0, hook) + suspend_late##hook, vendor, device, PCI_ANY_ID, 0, hook)
#ifdef CONFIG_PCI_QUIRKS void pci_fixup_device(enum pci_fixup_pass pass, struct pci_dev *dev); @@@ -1945,7 -1965,6 +1946,7 @@@ int pci_vfs_assigned(struct pci_dev *de int pci_sriov_set_totalvfs(struct pci_dev *dev, u16 numvfs); int pci_sriov_get_totalvfs(struct pci_dev *dev); resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno); +void pci_vf_drivers_autoprobe(struct pci_dev *dev, bool probe); #else static inline int pci_iov_virtfn_bus(struct pci_dev *dev, int id) { @@@ -1973,7 -1992,6 +1974,7 @@@ static inline int pci_sriov_get_totalvf { return 0; } static inline resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno) { return 0; } +static inline void pci_vf_drivers_autoprobe(struct pci_dev *dev, bool probe) { } #endif
#if defined(CONFIG_HOTPLUG_PCI) || defined(CONFIG_HOTPLUG_PCI_MODULE) @@@ -2095,7 -2113,7 +2096,7 @@@ static inline u16 pci_vpd_lrdt_size(con */ static inline u16 pci_vpd_lrdt_tag(const u8 *lrdt) { - return (u16)(lrdt[0] & PCI_VPD_LRDT_TIN_MASK); + return (u16)(lrdt[0] & PCI_VPD_LRDT_TIN_MASK); }
/** @@@ -2180,7 -2198,7 +2181,7 @@@ static inline struct device_node *pci_b return bus ? bus->dev.of_node : NULL; }
-#else /* CONFIG_OF */ +#else /* CONFIG_OF */ static inline void pci_set_of_node(struct pci_dev *dev) { } static inline void pci_release_of_node(struct pci_dev *dev) { } static inline void pci_set_bus_of_node(struct pci_bus *bus) { } @@@ -2189,7 -2207,7 +2190,7 @@@ static inline struct device_node pci_device_to_OF_node(const struct pci_dev *pdev) { return NULL; } static inline struct irq_domain * pci_host_bridge_of_msi_domain(struct pci_bus *bus) { return NULL; } -#endif /* CONFIG_OF */ +#endif /* CONFIG_OF */
#ifdef CONFIG_ACPI struct irq_domain *pci_host_bridge_acpi_msi_domain(struct pci_bus *bus); @@@ -2214,7 -2232,7 +2215,7 @@@ int pci_for_each_dma_alias(struct pci_d int (*fn)(struct pci_dev *pdev, u16 alias, void *data), void *data);
-/* helper functions for operation of device flag */ +/* Helper functions for operation of device flag */ static inline void pci_set_dev_assigned(struct pci_dev *pdev) { pdev->dev_flags |= PCI_DEV_FLAGS_ASSIGNED; @@@ -2261,7 -2279,7 +2262,7 @@@ static inline bool pci_is_thunderbolt_a return false; }
-/* provide the legacy pci_dma_* API */ +/* Provide the legacy pci_dma_* API */ #include <linux/pci-dma-compat.h>
#endif /* LINUX_PCI_H */ diff --combined include/linux/skbuff.h index a87e43d16f44,b8e0da6c27d6..ac89a93b7c83 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@@ -1211,6 -1211,11 +1211,11 @@@ static inline bool skb_flow_dissect_flo data, proto, nhoff, hlen, flags); }
+ void + skb_flow_dissect_tunnel_info(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container); + static inline __u32 skb_get_hash(struct sk_buff *skb) { if (!skb->l4_hash && !skb->sw_hash) @@@ -3241,7 -3246,7 +3246,7 @@@ struct sk_buff *__skb_recv_datagram(str int *peeked, int *off, int *err); struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, int *err); -unsigned int datagram_poll(struct file *file, struct socket *sock, +__poll_t datagram_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int skb_copy_datagram_iter(const struct sk_buff *from, int offset, struct iov_iter *to, int size); diff --combined include/net/inet_connection_sock.h index ec72cdb5bc39,8e1bf9ae4a5e..6692d67e9245 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@@ -77,6 -77,7 +77,7 @@@ struct inet_connection_sock_af_ops * @icsk_af_ops Operations which are AF_INET{4,6} specific * @icsk_ulp_ops Pluggable ULP control hook * @icsk_ulp_data ULP private data + * @icsk_listen_portaddr_node hash to the portaddr listener hashtable * @icsk_ca_state: Congestion control state * @icsk_retransmits: Number of unrecovered [RTO] timeouts * @icsk_pending: Scheduled timer event @@@ -101,6 -102,7 +102,7 @@@ struct inet_connection_sock const struct inet_connection_sock_af_ops *icsk_af_ops; const struct tcp_ulp_ops *icsk_ulp_ops; void *icsk_ulp_data; + struct hlist_node icsk_listen_portaddr_node; unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); __u8 icsk_ca_state:6, icsk_ca_setsockopt:1, @@@ -305,7 -307,7 +307,7 @@@ void inet_csk_prepare_forced_close(stru /* * LISTEN is a special case for poll.. */ -static inline unsigned int inet_csk_listen_poll(const struct sock *sk) +static inline __poll_t inet_csk_listen_poll(const struct sock *sk) { return !reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue) ? 
(POLLIN | POLLRDNORM) : 0; diff --combined include/net/sctp/sctp.h index 608d123ef25f,20c0c1be2ca7..f7ae6b0a21d0 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@@ -107,7 -107,7 +107,7 @@@ int sctp_backlog_rcv(struct sock *sk, s int sctp_inet_listen(struct socket *sock, int backlog); void sctp_write_space(struct sock *sk); void sctp_data_ready(struct sock *sk); -unsigned int sctp_poll(struct file *file, struct socket *sock, +__poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait); void sctp_sock_rfree(struct sk_buff *skb); void sctp_copy_sock(struct sock *newsk, struct sock *sk, @@@ -116,7 -116,7 +116,7 @@@ extern struct percpu_counter sctp_socke int sctp_asconf_mgmt(struct sctp_sock *, struct sctp_sockaddr_entry *); struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int, int *);
- int sctp_transport_walk_start(struct rhashtable_iter *iter); + void sctp_transport_walk_start(struct rhashtable_iter *iter); void sctp_transport_walk_stop(struct rhashtable_iter *iter); struct sctp_transport *sctp_transport_get_next(struct net *net, struct rhashtable_iter *iter); @@@ -444,13 -444,13 +444,13 @@@ static inline int sctp_frag_point(cons int frag = pmtu;
frag -= sp->pf->af->net_header_len; - frag -= sizeof(struct sctphdr) + sizeof(struct sctp_data_chunk); + frag -= sizeof(struct sctphdr) + sctp_datachk_len(&asoc->stream);
if (asoc->user_frag) frag = min_t(int, frag, asoc->user_frag);
frag = SCTP_TRUNC4(min_t(int, frag, SCTP_MAX_CHUNK_LEN - - sizeof(struct sctp_data_chunk))); + sctp_datachk_len(&asoc->stream)));
return frag; } diff --combined include/net/sock.h index f90685441143,6c1db823f8b9..ae68e1be0c1d --- a/include/net/sock.h +++ b/include/net/sock.h @@@ -1262,6 -1262,7 +1262,7 @@@ proto_memory_pressure(struct proto *pro /* Called with local bh disabled */ void sock_prot_inuse_add(struct net *net, struct proto *prot, int inc); int sock_prot_inuse_get(struct net *net, struct proto *proto); + int sock_inuse_get(struct net *net); #else static inline void sock_prot_inuse_add(struct net *net, struct proto *prot, int inc) @@@ -1578,7 -1579,7 +1579,7 @@@ int sock_no_connect(struct socket *, st int sock_no_socketpair(struct socket *, struct socket *); int sock_no_accept(struct socket *, struct socket *, int, bool); int sock_no_getname(struct socket *, struct sockaddr *, int *, int); -unsigned int sock_no_poll(struct file *, struct socket *, +__poll_t sock_no_poll(struct file *, struct socket *, struct poll_table_struct *); int sock_no_ioctl(struct socket *, unsigned int, unsigned long); int sock_no_listen(struct socket *, int); @@@ -2332,31 -2333,6 +2333,6 @@@ static inline bool sk_listener(const st return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV); }
- /** - * sk_state_load - read sk->sk_state for lockless contexts - * @sk: socket pointer - * - * Paired with sk_state_store(). Used in places we do not hold socket lock : - * tcp_diag_get_info(), tcp_get_info(), tcp_poll(), get_tcp4_sock() ... - */ - static inline int sk_state_load(const struct sock *sk) - { - return smp_load_acquire(&sk->sk_state); - } - - /** - * sk_state_store - update sk->sk_state - * @sk: socket pointer - * @newstate: new state - * - * Paired with sk_state_load(). Should be used in contexts where - * state change might impact lockless readers. - */ - static inline void sk_state_store(struct sock *sk, int newstate) - { - smp_store_release(&sk->sk_state, newstate); - } - void sock_enable_timestamp(struct sock *sk, int flag); int sock_get_timestamp(struct sock *, struct timeval __user *); int sock_get_timestampns(struct sock *, struct timespec __user *); @@@ -2407,4 -2383,15 +2383,15 @@@ static inline int sk_get_rmem0(const st return *proto->sysctl_rmem; }
+ /* Default TCP Small queue budget is ~1 ms of data (1sec >> 10) + * Some wifi drivers need to tweak it to get more chunks. + * They can use this helper from their ndo_start_xmit() + */ + static inline void sk_pacing_shift_update(struct sock *sk, int val) + { + if (!sk || !sk_fullsock(sk) || sk->sk_pacing_shift == val) + return; + sk->sk_pacing_shift = val; + } + #endif /* _SOCK_H */ diff --combined include/net/tcp.h index 50b21a49d870,6939e69d3c37..26c2793846a1 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@@ -387,7 -387,7 +387,7 @@@ bool tcp_peer_is_proven(struct request_ void tcp_close(struct sock *sk, long timeout); void tcp_init_sock(struct sock *sk); void tcp_init_transfer(struct sock *sk, int bpf_op); -unsigned int tcp_poll(struct file *file, struct socket *sock, +__poll_t tcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); @@@ -1507,8 -1507,7 +1507,7 @@@ int tcp_md5_hash_key(struct tcp_md5sig_
/* From tcp_fastopen.c */ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, - struct tcp_fastopen_cookie *cookie, int *syn_loss, - unsigned long *last_syn_loss); + struct tcp_fastopen_cookie *cookie); void tcp_fastopen_cache_set(struct sock *sk, u16 mss, struct tcp_fastopen_cookie *cookie, bool syn_lost, u16 try_exp); @@@ -1546,7 -1545,7 +1545,7 @@@ extern unsigned int sysctl_tcp_fastopen void tcp_fastopen_active_disable(struct sock *sk); bool tcp_fastopen_active_should_disable(struct sock *sk); void tcp_fastopen_active_disable_ofo_check(struct sock *sk); - void tcp_fastopen_active_timeout_reset(void); + void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired);
/* Latencies incurred by various limits for a sender. They are * chronograph-like stats that are mutually exclusive. @@@ -2011,10 -2010,12 +2010,12 @@@ static inline int tcp_call_bpf(struct s struct bpf_sock_ops_kern sock_ops; int ret;
- if (sk_fullsock(sk)) + memset(&sock_ops, 0, sizeof(sock_ops)); + if (sk_fullsock(sk)) { + sock_ops.is_fullsock = 1; sock_owned_by_me(sk); + }
- memset(&sock_ops, 0, sizeof(sock_ops)); sock_ops.sk = sk; sock_ops.op = op;
diff --combined include/net/xfrm.h index ae35991b5877,1ec0c4760646..059213a4096e --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@@ -968,7 -968,7 +968,7 @@@ static inline bool xfrm_sec_ctx_match(s
/* A struct encoding bundle of transformations to apply to some set of flow. * - * dst->child points to the next element of bundle. + * xdst->child points to the next element of bundle. * dst->xfrm points to an instanse of transformer. * * Due to unfortunate limitations of current routing cache, which we @@@ -984,6 -984,8 +984,8 @@@ struct xfrm_dst struct rt6_info rt6; } u; struct dst_entry *route; + struct dst_entry *child; + struct dst_entry *path; struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; int num_pols, num_xfrms; u32 xfrm_genid; @@@ -994,7 -996,35 +996,35 @@@ u32 path_cookie; };
+ static inline struct dst_entry *xfrm_dst_path(const struct dst_entry *dst) + { #ifdef CONFIG_XFRM + if (dst->xfrm) { + const struct xfrm_dst *xdst = (const struct xfrm_dst *) dst; + + return xdst->path; + } + #endif + return (struct dst_entry *) dst; + } + + static inline struct dst_entry *xfrm_dst_child(const struct dst_entry *dst) + { + #ifdef CONFIG_XFRM + if (dst->xfrm) { + struct xfrm_dst *xdst = (struct xfrm_dst *) dst; + return xdst->child; + } + #endif + return NULL; + } + + #ifdef CONFIG_XFRM + static inline void xfrm_dst_set_child(struct xfrm_dst *xdst, struct dst_entry *child) + { + xdst->child = child; + } + static inline void xfrm_dst_destroy(struct xfrm_dst *xdst) { xfrm_pols_put(xdst->pols, xdst->num_pols); @@@ -1570,9 -1600,6 +1600,9 @@@ int xfrm_init_state(struct xfrm_state * int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb); int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int xfrm_input_resume(struct sk_buff *skb, int nexthdr); +int xfrm_trans_queue(struct sk_buff *skb, + int (*finish)(struct net *, struct sock *, + struct sk_buff *)); int xfrm_output_resume(struct sk_buff *skb, int err); int xfrm_output(struct sock *sk, struct sk_buff *skb); int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb); @@@ -1869,12 -1896,14 +1899,14 @@@ bool xfrm_dev_offload_ok(struct sk_buf static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) { struct xfrm_state *x = dst->xfrm; + struct xfrm_dst *xdst;
if (!x || !x->type_offload) return false;
- if (x->xso.offload_handle && (x->xso.dev == dst->path->dev) && - !dst->child->xfrm) + xdst = (struct xfrm_dst *) dst; + if (x->xso.offload_handle && (x->xso.dev == xfrm_dst_path(dst)->dev) && + !xdst->child->xfrm) return true;
return false; diff --combined kernel/bpf/syscall.c index 5cb783fc8224,e2e1c78ce1dc..da932743e116 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@@ -1057,7 -1057,7 +1057,7 @@@ struct bpf_prog *bpf_prog_inc_not_zero( } EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
-static bool bpf_prog_get_ok(struct bpf_prog *prog, +bool bpf_prog_get_ok(struct bpf_prog *prog, enum bpf_prog_type *attach_type, bool attach_drv) { /* not an attachment, just a refcount inc, always allow */ @@@ -1194,7 -1194,8 +1194,8 @@@ static int bpf_prog_load(union bpf_att goto free_used_maps;
/* eBPF program is ready to be JITed */ - prog = bpf_prog_select_runtime(prog, &err); + if (!prog->bpf_func) + prog = bpf_prog_select_runtime(prog, &err); if (err < 0) goto free_used_maps;
diff --combined kernel/bpf/verifier.c index e39b01317b6f,48b2901cf483..6d8e432453b8 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@@ -20,6 -20,8 +20,8 @@@ #include <linux/file.h> #include <linux/vmalloc.h> #include <linux/stringify.h> + #include <linux/bsearch.h> + #include <linux/sort.h>
#include "disasm.h"
@@@ -216,23 -218,48 +218,48 @@@ static const char * const reg_type_str[ [PTR_TO_PACKET_END] = "pkt_end", };
+ static void print_liveness(struct bpf_verifier_env *env, + enum bpf_reg_liveness live) + { + if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN)) + verbose(env, "_"); + if (live & REG_LIVE_READ) + verbose(env, "r"); + if (live & REG_LIVE_WRITTEN) + verbose(env, "w"); + } + + static struct bpf_func_state *func(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg) + { + struct bpf_verifier_state *cur = env->cur_state; + + return cur->frame[reg->frameno]; + } + static void print_verifier_state(struct bpf_verifier_env *env, - struct bpf_verifier_state *state) + const struct bpf_func_state *state) { - struct bpf_reg_state *reg; + const struct bpf_reg_state *reg; enum bpf_reg_type t; int i;
+ if (state->frameno) + verbose(env, " frame%d:", state->frameno); for (i = 0; i < MAX_BPF_REG; i++) { reg = &state->regs[i]; t = reg->type; if (t == NOT_INIT) continue; - verbose(env, " R%d=%s", i, reg_type_str[t]); + verbose(env, " R%d", i); + print_liveness(env, reg->live); + verbose(env, "=%s", reg_type_str[t]); if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && tnum_is_const(reg->var_off)) { /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); + if (t == PTR_TO_STACK) + verbose(env, ",call_%d", func(env, reg)->callsite); } else { verbose(env, "(id=%d", reg->id); if (t != SCALAR_VALUE) @@@ -277,16 -304,21 +304,21 @@@ } } for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] == STACK_SPILL) - verbose(env, " fp%d=%s", - -MAX_BPF_STACK + i * BPF_REG_SIZE, + if (state->stack[i].slot_type[0] == STACK_SPILL) { + verbose(env, " fp%d", + (-i - 1) * BPF_REG_SIZE); + print_liveness(env, state->stack[i].spilled_ptr.live); + verbose(env, "=%s", reg_type_str[state->stack[i].spilled_ptr.type]); + } + if (state->stack[i].slot_type[0] == STACK_ZERO) + verbose(env, " fp%d=0", (-i - 1) * BPF_REG_SIZE); } verbose(env, "\n"); }
- static int copy_stack_state(struct bpf_verifier_state *dst, - const struct bpf_verifier_state *src) + static int copy_stack_state(struct bpf_func_state *dst, + const struct bpf_func_state *src) { if (!src->stack) return 0; @@@ -302,13 -334,13 +334,13 @@@
/* do_check() starts with zero-sized stack in struct bpf_verifier_state to * make it consume minimal amount of memory. check_stack_write() access from - * the program calls into realloc_verifier_state() to grow the stack size. + * the program calls into realloc_func_state() to grow the stack size. * Note there is a non-zero 'parent' pointer inside bpf_verifier_state * which this function copies over. It points to previous bpf_verifier_state * which is never reallocated */ - static int realloc_verifier_state(struct bpf_verifier_state *state, int size, - bool copy_old) + static int realloc_func_state(struct bpf_func_state *state, int size, + bool copy_old) { u32 old_size = state->allocated_stack; struct bpf_stack_state *new_stack; @@@ -341,10 -373,21 +373,21 @@@ return 0; }
+ static void free_func_state(struct bpf_func_state *state) + { + kfree(state->stack); + kfree(state); + } + static void free_verifier_state(struct bpf_verifier_state *state, bool free_self) { - kfree(state->stack); + int i; + + for (i = 0; i <= state->curframe; i++) { + free_func_state(state->frame[i]); + state->frame[i] = NULL; + } if (free_self) kfree(state); } @@@ -352,18 -395,46 +395,46 @@@ /* copy verifier state from src to dst growing dst stack space * when necessary to accommodate larger src stack */ - static int copy_verifier_state(struct bpf_verifier_state *dst, - const struct bpf_verifier_state *src) + static int copy_func_state(struct bpf_func_state *dst, + const struct bpf_func_state *src) { int err;
- err = realloc_verifier_state(dst, src->allocated_stack, false); + err = realloc_func_state(dst, src->allocated_stack, false); if (err) return err; - memcpy(dst, src, offsetof(struct bpf_verifier_state, allocated_stack)); + memcpy(dst, src, offsetof(struct bpf_func_state, allocated_stack)); return copy_stack_state(dst, src); }
+ static int copy_verifier_state(struct bpf_verifier_state *dst_state, + const struct bpf_verifier_state *src) + { + struct bpf_func_state *dst; + int i, err; + + /* if dst has more stack frames then src frame, free them */ + for (i = src->curframe + 1; i <= dst_state->curframe; i++) { + free_func_state(dst_state->frame[i]); + dst_state->frame[i] = NULL; + } + dst_state->curframe = src->curframe; + dst_state->parent = src->parent; + for (i = 0; i <= src->curframe; i++) { + dst = dst_state->frame[i]; + if (!dst) { + dst = kzalloc(sizeof(*dst), GFP_KERNEL); + if (!dst) + return -ENOMEM; + dst_state->frame[i] = dst; + } + err = copy_func_state(dst, src->frame[i]); + if (err) + return err; + } + return 0; + } + static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, int *insn_idx) { @@@ -425,6 -496,10 +496,10 @@@ err static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; + #define CALLEE_SAVED_REGS 5 + static const int callee_saved[CALLEE_SAVED_REGS] = { + BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9 + };
static void __mark_reg_not_init(struct bpf_reg_state *reg);
@@@ -449,6 -524,13 +524,13 @@@ static void __mark_reg_known_zero(struc __mark_reg_known(reg, 0); }
+ static void __mark_reg_const_zero(struct bpf_reg_state *reg) + { + __mark_reg_known(reg, 0); + reg->off = 0; + reg->type = SCALAR_VALUE; + } + static void mark_reg_known_zero(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno) { @@@ -560,6 -642,7 +642,7 @@@ static void __mark_reg_unknown(struct b reg->id = 0; reg->off = 0; reg->var_off = tnum_unknown; + reg->frameno = 0; __mark_reg_unbounded(reg); }
@@@ -568,8 -651,8 +651,8 @@@ static void mark_reg_unknown(struct bpf { if (WARN_ON(regno >= MAX_BPF_REG)) { verbose(env, "mark_reg_unknown(regs, %u)\n", regno); - /* Something bad happened, let's kill all regs */ - for (regno = 0; regno < MAX_BPF_REG; regno++) + /* Something bad happened, let's kill all regs except FP */ + for (regno = 0; regno < BPF_REG_FP; regno++) __mark_reg_not_init(regs + regno); return; } @@@ -587,8 -670,8 +670,8 @@@ static void mark_reg_not_init(struct bp { if (WARN_ON(regno >= MAX_BPF_REG)) { verbose(env, "mark_reg_not_init(regs, %u)\n", regno); - /* Something bad happened, let's kill all regs */ - for (regno = 0; regno < MAX_BPF_REG; regno++) + /* Something bad happened, let's kill all regs except FP */ + for (regno = 0; regno < BPF_REG_FP; regno++) __mark_reg_not_init(regs + regno); return; } @@@ -596,8 -679,9 +679,9 @@@ }
static void init_reg_state(struct bpf_verifier_env *env, - struct bpf_reg_state *regs) + struct bpf_func_state *state) { + struct bpf_reg_state *regs = state->regs; int i;
for (i = 0; i < MAX_BPF_REG; i++) { @@@ -608,41 -692,217 +692,217 @@@ /* frame pointer */ regs[BPF_REG_FP].type = PTR_TO_STACK; mark_reg_known_zero(env, regs, BPF_REG_FP); + regs[BPF_REG_FP].frameno = state->frameno;
/* 1st arg to a function */ regs[BPF_REG_1].type = PTR_TO_CTX; mark_reg_known_zero(env, regs, BPF_REG_1); }
+ #define BPF_MAIN_FUNC (-1) + static void init_func_state(struct bpf_verifier_env *env, + struct bpf_func_state *state, + int callsite, int frameno, int subprogno) + { + state->callsite = callsite; + state->frameno = frameno; + state->subprogno = subprogno; + init_reg_state(env, state); + } + enum reg_arg_type { SRC_OP, /* register is used as source operand */ DST_OP, /* register is used as destination operand */ DST_OP_NO_MARK /* same as above, check only, don't mark */ };
- static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) + static int cmp_subprogs(const void *a, const void *b) + { + return *(int *)a - *(int *)b; + } + + static int find_subprog(struct bpf_verifier_env *env, int off) + { + u32 *p; + + p = bsearch(&off, env->subprog_starts, env->subprog_cnt, + sizeof(env->subprog_starts[0]), cmp_subprogs); + if (!p) + return -ENOENT; + return p - env->subprog_starts; + + } + + static int add_subprog(struct bpf_verifier_env *env, int off) + { + int insn_cnt = env->prog->len; + int ret; + + if (off >= insn_cnt || off < 0) { + verbose(env, "call to invalid destination\n"); + return -EINVAL; + } + ret = find_subprog(env, off); + if (ret >= 0) + return 0; + if (env->subprog_cnt >= BPF_MAX_SUBPROGS) { + verbose(env, "too many subprograms\n"); + return -E2BIG; + } + env->subprog_starts[env->subprog_cnt++] = off; + sort(env->subprog_starts, env->subprog_cnt, + sizeof(env->subprog_starts[0]), cmp_subprogs, NULL); + return 0; + } + + static int check_subprogs(struct bpf_verifier_env *env) + { + int i, ret, subprog_start, subprog_end, off, cur_subprog = 0; + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + + /* determine subprog starts. 
The end is one before the next starts */ + for (i = 0; i < insn_cnt; i++) { + if (insn[i].code != (BPF_JMP | BPF_CALL)) + continue; + if (insn[i].src_reg != BPF_PSEUDO_CALL) + continue; + if (!env->allow_ptr_leaks) { + verbose(env, "function calls to other bpf functions are allowed for root only\n"); + return -EPERM; + } + if (bpf_prog_is_dev_bound(env->prog->aux)) { + verbose(env, "funcation calls in offloaded programs are not supported yet\n"); + return -EINVAL; + } + ret = add_subprog(env, i + insn[i].imm + 1); + if (ret < 0) + return ret; + } + + if (env->log.level > 1) + for (i = 0; i < env->subprog_cnt; i++) + verbose(env, "func#%d @%d\n", i, env->subprog_starts[i]); + + /* now check that all jumps are within the same subprog */ + subprog_start = 0; + if (env->subprog_cnt == cur_subprog) + subprog_end = insn_cnt; + else + subprog_end = env->subprog_starts[cur_subprog++]; + for (i = 0; i < insn_cnt; i++) { + u8 code = insn[i].code; + + if (BPF_CLASS(code) != BPF_JMP) + goto next; + if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) + goto next; + off = i + insn[i].off + 1; + if (off < subprog_start || off >= subprog_end) { + verbose(env, "jump out of range from insn %d to %d\n", i, off); + return -EINVAL; + } + next: + if (i == subprog_end - 1) { + /* to avoid fall-through from one subprog into another + * the last insn of the subprog should be either exit + * or unconditional jump back + */ + if (code != (BPF_JMP | BPF_EXIT) && + code != (BPF_JMP | BPF_JA)) { + verbose(env, "last insn is not an exit or jmp\n"); + return -EINVAL; + } + subprog_start = subprog_end; + if (env->subprog_cnt == cur_subprog) + subprog_end = insn_cnt; + else + subprog_end = env->subprog_starts[cur_subprog++]; + } + } + return 0; + } + + struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env, + const struct bpf_verifier_state *state, + struct bpf_verifier_state *parent, + u32 regno) + { + struct bpf_verifier_state *tmp = NULL; + + /* 'parent' could be a state of 
caller and + * 'state' could be a state of callee. In such case + * parent->curframe < state->curframe + * and it's ok for r1 - r5 registers + * + * 'parent' could be a callee's state after it bpf_exit-ed. + * In such case parent->curframe > state->curframe + * and it's ok for r0 only + */ + if (parent->curframe == state->curframe || + (parent->curframe < state->curframe && + regno >= BPF_REG_1 && regno <= BPF_REG_5) || + (parent->curframe > state->curframe && + regno == BPF_REG_0)) + return parent; + + if (parent->curframe > state->curframe && + regno >= BPF_REG_6) { + /* for callee saved regs we have to skip the whole chain + * of states that belong to callee and mark as LIVE_READ + * the registers before the call + */ + tmp = parent; + while (tmp && tmp->curframe != state->curframe) { + tmp = tmp->parent; + } + if (!tmp) + goto bug; + parent = tmp; + } else { + goto bug; + } + return parent; + bug: + verbose(env, "verifier bug regno %d tmp %p\n", regno, tmp); + verbose(env, "regno %d parent frame %d current frame %d\n", + regno, parent->curframe, state->curframe); + return 0; + } + + static int mark_reg_read(struct bpf_verifier_env *env, + const struct bpf_verifier_state *state, + struct bpf_verifier_state *parent, + u32 regno) { - struct bpf_verifier_state *parent = state->parent; + bool writes = parent == state->parent; /* Observe write marks */
if (regno == BPF_REG_FP) /* We don't need to worry about FP liveness because it's read-only */ - return; + return 0;
while (parent) { /* if read wasn't screened by an earlier write ... */ - if (state->regs[regno].live & REG_LIVE_WRITTEN) + if (writes && state->frame[state->curframe]->regs[regno].live & REG_LIVE_WRITTEN) break; + parent = skip_callee(env, state, parent, regno); + if (!parent) + return -EFAULT; /* ... then we depend on parent's value */ - parent->regs[regno].live |= REG_LIVE_READ; + parent->frame[parent->curframe]->regs[regno].live |= REG_LIVE_READ; state = parent; parent = state->parent; + writes = true; } + return 0; }
static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, enum reg_arg_type t) { - struct bpf_reg_state *regs = env->cur_state->regs; + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *regs = state->regs;
if (regno >= MAX_BPF_REG) { verbose(env, "R%d is invalid\n", regno); @@@ -655,7 -915,7 +915,7 @@@ verbose(env, "R%d !read_ok\n", regno); return -EACCES; } - mark_reg_read(env->cur_state, regno); + return mark_reg_read(env, vstate, vstate->parent, regno); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { @@@ -686,17 -946,25 +946,25 @@@ static bool is_spillable_regtype(enum b } }
+ /* Does this register contain a constant zero? */ + static bool register_is_null(struct bpf_reg_state *reg) + { + return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); + } + /* check_stack_read/write functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ static int check_stack_write(struct bpf_verifier_env *env, - struct bpf_verifier_state *state, int off, - int size, int value_regno) + struct bpf_func_state *state, /* func where register points to */ + int off, int size, int value_regno) { + struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; + enum bpf_reg_type type;
- err = realloc_verifier_state(state, round_up(slot + 1, BPF_REG_SIZE), - true); + err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE), + true); if (err) return err; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, @@@ -709,8 -977,9 +977,9 @@@ return -EACCES; }
+ cur = env->cur_state->frame[env->cur_state->curframe]; if (value_regno >= 0 && - is_spillable_regtype(state->regs[value_regno].type)) { + is_spillable_regtype((type = cur->regs[value_regno].type))) {
/* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { @@@ -718,51 -987,116 +987,116 @@@ return -EACCES; }
+ if (state != cur && type == PTR_TO_STACK) { + verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); + return -EINVAL; + } + /* save register state */ - state->stack[spi].spilled_ptr = state->regs[value_regno]; + state->stack[spi].spilled_ptr = cur->regs[value_regno]; state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
for (i = 0; i < BPF_REG_SIZE; i++) state->stack[spi].slot_type[i] = STACK_SPILL; } else { + u8 type = STACK_MISC; + /* regular write of data into stack */ state->stack[spi].spilled_ptr = (struct bpf_reg_state) {};
+ /* only mark the slot as written if all 8 bytes were written + * otherwise read propagation may incorrectly stop too soon + * when stack slots are partially written. + * This heuristic means that read propagation will be + * conservative, since it will add reg_live_read marks + * to stack slots all the way to first state when programs + * writes+reads less than 8 bytes + */ + if (size == BPF_REG_SIZE) + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; + + /* when we zero initialize stack slots mark them as such */ + if (value_regno >= 0 && + register_is_null(&cur->regs[value_regno])) + type = STACK_ZERO; + for (i = 0; i < size; i++) state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = - STACK_MISC; + type; } return 0; }
- static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slot) + /* registers of every function are unique and mark_reg_read() propagates + * the liveness in the following cases: + * - from callee into caller for R1 - R5 that were used as arguments + * - from caller into callee for R0 that used as result of the call + * - from caller to the same caller skipping states of the callee for R6 - R9, + * since R6 - R9 are callee saved by implicit function prologue and + * caller's R6 != callee's R6, so when we propagate liveness up to + * parent states we need to skip callee states for R6 - R9. + * + * stack slot marking is different, since stacks of caller and callee are + * accessible in both (since caller can pass a pointer to caller's stack to + * callee which can pass it to another function), hence mark_stack_slot_read() + * has to propagate the stack liveness to all parent states at given frame number. + * Consider code: + * f1() { + * ptr = fp - 8; + * *ptr = ctx; + * call f2 { + * .. = *ptr; + * } + * .. = *ptr; + * } + * First *ptr is reading from f1's stack and mark_stack_slot_read() has + * to mark liveness at the f1's frame and not f2's frame. + * Second *ptr is also reading from f1's stack and mark_stack_slot_read() has + * to propagate liveness to f2 states at f1's frame level and further into + * f1 states at f1's frame level until write into that stack slot + */ + static void mark_stack_slot_read(struct bpf_verifier_env *env, + const struct bpf_verifier_state *state, + struct bpf_verifier_state *parent, + int slot, int frameno) { - struct bpf_verifier_state *parent = state->parent; + bool writes = parent == state->parent; /* Observe write marks */
while (parent) { + if (parent->frame[frameno]->allocated_stack <= slot * BPF_REG_SIZE) + /* since LIVE_WRITTEN mark is only done for full 8-byte + * write the read marks are conservative and parent + * state may not even have the stack allocated. In such case + * end the propagation, since the loop reached beginning + * of the function + */ + break; /* if read wasn't screened by an earlier write ... */ - if (state->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) + if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) break; /* ... then we depend on parent's value */ - parent->stack[slot].spilled_ptr.live |= REG_LIVE_READ; + parent->frame[frameno]->stack[slot].spilled_ptr.live |= REG_LIVE_READ; state = parent; parent = state->parent; + writes = true; } }
static int check_stack_read(struct bpf_verifier_env *env, - struct bpf_verifier_state *state, int off, int size, - int value_regno) + struct bpf_func_state *reg_state /* func where register points to */, + int off, int size, int value_regno) { + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; u8 *stype;
- if (state->allocated_stack <= slot) { + if (reg_state->allocated_stack <= slot) { verbose(env, "invalid read from stack off %d+0 size %d\n", off, size); return -EACCES; } - stype = state->stack[spi].slot_type; + stype = reg_state->stack[spi].slot_type;
if (stype[0] == STACK_SPILL) { if (size != BPF_REG_SIZE) { @@@ -778,21 -1112,44 +1112,44 @@@
if (value_regno >= 0) { /* restore register state from stack */ - state->regs[value_regno] = state->stack[spi].spilled_ptr; - mark_stack_slot_read(state, spi); + state->regs[value_regno] = reg_state->stack[spi].spilled_ptr; + /* mark reg as written since spilled pointer state likely + * has its liveness marks cleared by is_state_visited() + * which resets stack/reg liveness for state transitions + */ + state->regs[value_regno].live |= REG_LIVE_WRITTEN; } + mark_stack_slot_read(env, vstate, vstate->parent, spi, + reg_state->frameno); return 0; } else { + int zeros = 0; + for (i = 0; i < size; i++) { - if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) { - verbose(env, "invalid read from stack off %d+%d size %d\n", - off, i, size); - return -EACCES; + if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC) + continue; + if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) { + zeros++; + continue; } + verbose(env, "invalid read from stack off %d+%d size %d\n", + off, i, size); + return -EACCES; + } + mark_stack_slot_read(env, vstate, vstate->parent, spi, + reg_state->frameno); + if (value_regno >= 0) { + if (zeros == size) { + /* any size read into register is zero extended, + * so the whole register == const_zero + */ + __mark_reg_const_zero(&state->regs[value_regno]); + } else { + /* have read misc data from the stack */ + mark_reg_unknown(env, state->regs, value_regno); + } + state->regs[value_regno].live |= REG_LIVE_WRITTEN; } - if (value_regno >= 0) - /* have read misc data from the stack */ - mark_reg_unknown(env, state->regs, value_regno); return 0; } } @@@ -817,7 -1174,8 +1174,8 @@@ static int __check_map_access(struct bp static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) { - struct bpf_verifier_state *state = env->cur_state; + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regno]; int err;
@@@ -1067,6 -1425,54 +1425,54 @@@ static int check_ptr_alignment(struct b strict); }
+ static int update_stack_depth(struct bpf_verifier_env *env, + const struct bpf_func_state *func, + int off) + { + u16 stack = env->subprog_stack_depth[func->subprogno], total = 0; + struct bpf_verifier_state *cur = env->cur_state; + int i; + + if (stack >= -off) + return 0; + + /* update known max for given subprogram */ + env->subprog_stack_depth[func->subprogno] = -off; + + /* compute the total for current call chain */ + for (i = 0; i <= cur->curframe; i++) { + u32 depth = env->subprog_stack_depth[cur->frame[i]->subprogno]; + + /* round up to 32-bytes, since this is granularity + * of interpreter stack sizes + */ + depth = round_up(depth, 32); + total += depth; + } + + if (total > MAX_BPF_STACK) { + verbose(env, "combined stack size of %d calls is %d. Too large\n", + cur->curframe, total); + return -EACCES; + } + return 0; + } + + static int get_callee_stack_depth(struct bpf_verifier_env *env, + const struct bpf_insn *insn, int idx) + { + int start = idx + insn->imm + 1, subprog; + + subprog = find_subprog(env, start); + if (subprog < 0) { + WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", + start); + return -EFAULT; + } + subprog++; + return env->subprog_stack_depth[subprog]; + } + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@@ -1077,9 -1483,9 +1483,9 @@@ static int check_mem_access(struct bpf_ int bpf_size, enum bpf_access_type t, int value_regno) { - struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = regs + regno; + struct bpf_func_state *state; int size, err = 0;
size = bpf_size_to_bytes(bpf_size); @@@ -1168,8 -1574,10 +1574,10 @@@ return -EACCES; }
- if (env->prog->aux->stack_depth < -off) - env->prog->aux->stack_depth = -off; + state = func(env, reg); + err = update_stack_depth(env, state, off); + if (err) + return err;
if (t == BPF_WRITE) err = check_stack_write(env, state, off, size, @@@ -1243,12 -1651,6 +1651,6 @@@ static int check_xadd(struct bpf_verifi BPF_SIZE(insn->code), BPF_WRITE, -1); }
- /* Does this register contain a constant zero? */ - static bool register_is_null(struct bpf_reg_state reg) - { - return reg.type == SCALAR_VALUE && tnum_equals_const(reg.var_off, 0); - } - /* when register 'regno' is passed into function that will read 'access_size' * bytes from that pointer, make sure that it's within stack boundary * and all elements of stack are initialized. @@@ -1259,31 -1661,31 +1661,31 @@@ static int check_stack_boundary(struct int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct bpf_verifier_state *state = env->cur_state; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *reg = cur_regs(env) + regno; + struct bpf_func_state *state = func(env, reg); int off, i, slot, spi;
- if (regs[regno].type != PTR_TO_STACK) { + if (reg->type != PTR_TO_STACK) { /* Allow zero-byte read from NULL, regardless of pointer type */ if (zero_size_allowed && access_size == 0 && - register_is_null(regs[regno])) + register_is_null(reg)) return 0;
verbose(env, "R%d type=%s expected=%s\n", regno, - reg_type_str[regs[regno].type], + reg_type_str[reg->type], reg_type_str[PTR_TO_STACK]); return -EACCES; }
/* Only allow fixed-offset stack reads */ - if (!tnum_is_const(regs[regno].var_off)) { + if (!tnum_is_const(reg->var_off)) { char tn_buf[48];
- tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off); + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, "invalid variable stack read R%d var_off=%s\n", regno, tn_buf); } - off = regs[regno].off + regs[regno].var_off.value; + off = reg->off + reg->var_off.value; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || access_size < 0 || (access_size == 0 && !zero_size_allowed)) { verbose(env, "invalid stack type R%d off=%d access_size=%d\n", @@@ -1291,9 -1693,6 +1693,6 @@@ return -EACCES; }
- if (env->prog->aux->stack_depth < -off) - env->prog->aux->stack_depth = -off; - if (meta && meta->raw_mode) { meta->access_size = access_size; meta->regno = regno; @@@ -1301,17 -1700,32 +1700,32 @@@ }
for (i = 0; i < access_size; i++) { + u8 *stype; + slot = -(off + i) - 1; spi = slot / BPF_REG_SIZE; - if (state->allocated_stack <= slot || - state->stack[spi].slot_type[slot % BPF_REG_SIZE] != - STACK_MISC) { - verbose(env, "invalid indirect read from stack off %d+%d size %d\n", - off, i, access_size); - return -EACCES; + if (state->allocated_stack <= slot) + goto err; + stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; + if (*stype == STACK_MISC) + goto mark; + if (*stype == STACK_ZERO) { + /* helper can write anything into the stack */ + *stype = STACK_MISC; + goto mark; } + err: + verbose(env, "invalid indirect read from stack off %d+%d size %d\n", + off, i, access_size); + return -EACCES; + mark: + /* reading any byte out of 8-byte 'spill_slot' will cause + * the whole slot to be marked as 'read' + */ + mark_stack_slot_read(env, env->cur_state, env->cur_state->parent, + spi, state->frameno); } - return 0; + return update_stack_depth(env, state, off); }
static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, @@@ -1391,7 -1805,7 +1805,7 @@@ static int check_func_arg(struct bpf_ve * passed in as argument, it's a SCALAR_VALUE type. Final test * happens during stack boundary checking. */ - if (register_is_null(*reg) && + if (register_is_null(reg) && arg_type == ARG_PTR_TO_MEM_OR_NULL) /* final test in check_stack_boundary() */; else if (!type_is_pkt_pointer(type) && @@@ -1564,6 -1978,10 +1978,10 @@@ static int check_map_func_compatibility case BPF_FUNC_tail_call: if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) goto error; + if (env->subprog_cnt) { + verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n"); + return -EINVAL; + } break; case BPF_FUNC_perf_event_read: case BPF_FUNC_perf_event_output: @@@ -1625,9 -2043,9 +2043,9 @@@ static int check_raw_mode(const struct /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] * are now invalid, so turn them into unknown SCALAR_VALUE. */ - static void clear_all_pkt_pointers(struct bpf_verifier_env *env) + static void __clear_all_pkt_pointers(struct bpf_verifier_env *env, + struct bpf_func_state *state) { - struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *regs = state->regs, *reg; int i;
@@@ -1644,7 -2062,121 +2062,121 @@@ } }
- static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) + static void clear_all_pkt_pointers(struct bpf_verifier_env *env) + { + struct bpf_verifier_state *vstate = env->cur_state; + int i; + + for (i = 0; i <= vstate->curframe; i++) + __clear_all_pkt_pointers(env, vstate->frame[i]); + } + + static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx) + { + struct bpf_verifier_state *state = env->cur_state; + struct bpf_func_state *caller, *callee; + int i, subprog, target_insn; + + if (state->curframe >= MAX_CALL_FRAMES) { + verbose(env, "the call stack of %d frames is too deep\n", + state->curframe); + return -E2BIG; + } + + target_insn = *insn_idx + insn->imm; + subprog = find_subprog(env, target_insn + 1); + if (subprog < 0) { + verbose(env, "verifier bug. No program starts at insn %d\n", + target_insn + 1); + return -EFAULT; + } + + caller = state->frame[state->curframe]; + if (state->frame[state->curframe + 1]) { + verbose(env, "verifier bug. Frame %d already allocated\n", + state->curframe + 1); + return -EFAULT; + } + + callee = kzalloc(sizeof(*callee), GFP_KERNEL); + if (!callee) + return -ENOMEM; + state->frame[state->curframe + 1] = callee; + + /* callee cannot access r0, r6 - r9 for reading and has to write + * into its own stack before reading from it. 
+ * callee can read/write into caller's stack + */ + init_func_state(env, callee, + /* remember the callsite, it will be used by bpf_exit */ + *insn_idx /* callsite */, + state->curframe + 1 /* frameno within this callchain */, + subprog + 1 /* subprog number within this prog */); + + /* copy r1 - r5 args that callee can access */ + for (i = BPF_REG_1; i <= BPF_REG_5; i++) + callee->regs[i] = caller->regs[i]; + + /* after the call regsiters r0 - r5 were scratched */ + for (i = 0; i < CALLER_SAVED_REGS; i++) { + mark_reg_not_init(env, caller->regs, caller_saved[i]); + check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); + } + + /* only increment it after check_reg_arg() finished */ + state->curframe++; + + /* and go analyze first insn of the callee */ + *insn_idx = target_insn; + + if (env->log.level) { + verbose(env, "caller:\n"); + print_verifier_state(env, caller); + verbose(env, "callee:\n"); + print_verifier_state(env, callee); + } + return 0; + } + + static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) + { + struct bpf_verifier_state *state = env->cur_state; + struct bpf_func_state *caller, *callee; + struct bpf_reg_state *r0; + + callee = state->frame[state->curframe]; + r0 = &callee->regs[BPF_REG_0]; + if (r0->type == PTR_TO_STACK) { + /* technically it's ok to return caller's stack pointer + * (or caller's caller's pointer) back to the caller, + * since these pointers are valid. 
Only current stack + * pointer will be invalid as soon as function exits, + * but let's be conservative + */ + verbose(env, "cannot return stack pointer to the caller\n"); + return -EINVAL; + } + + state->curframe--; + caller = state->frame[state->curframe]; + /* return to the caller whatever r0 had in the callee */ + caller->regs[BPF_REG_0] = *r0; + + *insn_idx = callee->callsite + 1; + if (env->log.level) { + verbose(env, "returning from callee:\n"); + print_verifier_state(env, callee); + verbose(env, "to caller at %d:\n", *insn_idx); + print_verifier_state(env, caller); + } + /* clear everything in the callee */ + free_func_state(callee); + state->frame[state->curframe + 1] = NULL; + return 0; + } + + static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { const struct bpf_func_proto *fn = NULL; struct bpf_reg_state *regs; @@@ -1674,13 -2206,7 +2206,13 @@@ return -EINVAL; }
+ /* With LD_ABS/IND some JITs save/restore skb from r1. */ changes_data = bpf_helper_changes_pkt_data(fn->func); + if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) { + verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n", + func_id_name(func_id), func_id); + return -EINVAL; + }
memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; @@@ -1810,7 -2336,9 +2342,9 @@@ static int adjust_ptr_min_max_vals(stru const struct bpf_reg_state *ptr_reg, const struct bpf_reg_state *off_reg) { - struct bpf_reg_state *regs = cur_regs(env), *dst_reg; + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *regs = state->regs, *dst_reg; bool known = tnum_is_const(off_reg->var_off); s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value, smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; @@@ -1822,13 -2350,13 +2356,13 @@@ dst_reg = &regs[dst];
if (WARN_ON_ONCE(known && (smin_val != smax_val))) { - print_verifier_state(env, env->cur_state); + print_verifier_state(env, state); verbose(env, "verifier internal error: known but bad sbounds\n"); return -EINVAL; } if (WARN_ON_ONCE(known && (umin_val != umax_val))) { - print_verifier_state(env, env->cur_state); + print_verifier_state(env, state); verbose(env, "verifier internal error: known but bad ubounds\n"); return -EINVAL; @@@ -2230,7 -2758,9 +2764,9 @@@ static int adjust_scalar_min_max_vals(s static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg; + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg; struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; u8 opcode = BPF_OP(insn->code); int rc; @@@ -2304,12 -2834,12 +2840,12 @@@
/* Got here implies adding two SCALAR_VALUEs */ if (WARN_ON_ONCE(ptr_reg)) { - print_verifier_state(env, env->cur_state); + print_verifier_state(env, state); verbose(env, "verifier internal error: unexpected ptr_reg\n"); return -EINVAL; } if (WARN_ON(!src_reg)) { - print_verifier_state(env, env->cur_state); + print_verifier_state(env, state); verbose(env, "verifier internal error: no src_reg\n"); return -EINVAL; } @@@ -2463,14 -2993,15 +2999,15 @@@ static int check_alu_op(struct bpf_veri return 0; }
- static void find_good_pkt_pointers(struct bpf_verifier_state *state, + static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, struct bpf_reg_state *dst_reg, enum bpf_reg_type type, bool range_right_open) { + struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs, *reg; u16 new_range; - int i; + int i, j;
if (dst_reg->off < 0 || (dst_reg->off == 0 && range_right_open)) @@@ -2540,12 -3071,15 +3077,15 @@@ /* keep the maximum range already checked */ regs[i].range = max(regs[i].range, new_range);
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_SPILL) - continue; - reg = &state->stack[i].spilled_ptr; - if (reg->type == type && reg->id == dst_reg->id) - reg->range = max(reg->range, new_range); + for (j = 0; j <= vstate->curframe; j++) { + state = vstate->frame[j]; + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) + continue; + reg = &state->stack[i].spilled_ptr; + if (reg->type == type && reg->id == dst_reg->id) + reg->range = max(reg->range, new_range); + } } }
@@@ -2783,20 -3317,24 +3323,24 @@@ static void mark_map_reg(struct bpf_reg /* The logic is similar to find_good_pkt_pointers(), both could eventually * be folded together at some point. */ - static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, + static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno, bool is_null) { + struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs; u32 id = regs[regno].id; - int i; + int i, j;
for (i = 0; i < MAX_BPF_REG; i++) mark_map_reg(regs, i, id, is_null);
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_SPILL) - continue; - mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); + for (j = 0; j <= vstate->curframe; j++) { + state = vstate->frame[j]; + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) + continue; + mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); + } } }
@@@ -2896,8 -3434,10 +3440,10 @@@ static bool try_match_pkt_pointers(cons static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { - struct bpf_verifier_state *other_branch, *this_branch = env->cur_state; - struct bpf_reg_state *regs = this_branch->regs, *dst_reg; + struct bpf_verifier_state *this_branch = env->cur_state; + struct bpf_verifier_state *other_branch; + struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; + struct bpf_reg_state *dst_reg, *other_branch_regs; u8 opcode = BPF_OP(insn->code); int err;
@@@ -2940,8 -3480,9 +3486,9 @@@ if (BPF_SRC(insn->code) == BPF_K && (opcode == BPF_JEQ || opcode == BPF_JNE) && dst_reg->type == SCALAR_VALUE && - tnum_equals_const(dst_reg->var_off, insn->imm)) { - if (opcode == BPF_JEQ) { + tnum_is_const(dst_reg->var_off)) { + if ((opcode == BPF_JEQ && dst_reg->var_off.value == insn->imm) || + (opcode == BPF_JNE && dst_reg->var_off.value != insn->imm)) { /* if (imm == imm) goto pc+off; * only follow the goto, ignore fall-through */ @@@ -2959,6 -3500,7 +3506,7 @@@ other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx); if (!other_branch) return -EFAULT; + other_branch_regs = other_branch->frame[other_branch->curframe]->regs;
/* detect if we are comparing against a constant value so we can adjust * our min/max values for our dst register. @@@ -2971,22 -3513,22 +3519,22 @@@ if (dst_reg->type == SCALAR_VALUE && regs[insn->src_reg].type == SCALAR_VALUE) { if (tnum_is_const(regs[insn->src_reg].var_off)) - reg_set_min_max(&other_branch->regs[insn->dst_reg], + reg_set_min_max(&other_branch_regs[insn->dst_reg], dst_reg, regs[insn->src_reg].var_off.value, opcode); else if (tnum_is_const(dst_reg->var_off)) - reg_set_min_max_inv(&other_branch->regs[insn->src_reg], + reg_set_min_max_inv(&other_branch_regs[insn->src_reg], &regs[insn->src_reg], dst_reg->var_off.value, opcode); else if (opcode == BPF_JEQ || opcode == BPF_JNE) /* Comparing for equality, we can combine knowledge */ - reg_combine_min_max(&other_branch->regs[insn->src_reg], - &other_branch->regs[insn->dst_reg], + reg_combine_min_max(&other_branch_regs[insn->src_reg], + &other_branch_regs[insn->dst_reg], &regs[insn->src_reg], &regs[insn->dst_reg], opcode); } } else if (dst_reg->type == SCALAR_VALUE) { - reg_set_min_max(&other_branch->regs[insn->dst_reg], + reg_set_min_max(&other_branch_regs[insn->dst_reg], dst_reg, insn->imm, opcode); }
@@@ -3007,7 -3549,7 +3555,7 @@@ return -EACCES; } if (env->log.level) - print_verifier_state(env, this_branch); + print_verifier_state(env, this_branch->frame[this_branch->curframe]); return 0; }
@@@ -3092,6 -3634,18 +3640,18 @@@ static int check_ld_abs(struct bpf_veri return -EINVAL; }
+ if (env->subprog_cnt) { + /* when program has LD_ABS insn JITs and interpreter assume + * that r1 == ctx == skb which is not the case for callees + * that can have arbitrary arguments. It's problematic + * for main prog as well since JITs would need to analyze + * all functions in order to make proper register save/restore + * decisions in the main prog. Hence disallow LD_ABS with calls + */ + verbose(env, "BPF_LD_[ABS|IND] instructions cannot be mixed with bpf-to-bpf calls\n"); + return -EINVAL; + } + if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || BPF_SIZE(insn->code) == BPF_DW || (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { @@@ -3268,6 -3822,10 +3828,10 @@@ static int check_cfg(struct bpf_verifie int ret = 0; int i, t;
+ ret = check_subprogs(env); + if (ret < 0) + return ret; + insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) return -ENOMEM; @@@ -3300,6 -3858,14 +3864,14 @@@ peek_stack goto err_free; if (t + 1 < insn_cnt) env->explored_states[t + 1] = STATE_LIST_MARK; + if (insns[t].src_reg == BPF_PSEUDO_CALL) { + env->explored_states[t] = STATE_LIST_MARK; + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); + if (ret == 1) + goto peek_stack; + else if (ret < 0) + goto err_free; + } } else if (opcode == BPF_JA) { if (BPF_SRC(insns[t].code) != BPF_K) { ret = -EINVAL; @@@ -3418,11 -3984,21 +3990,21 @@@ static bool check_ids(u32 old_id, u32 c static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, struct idpair *idmap) { + bool equal; + if (!(rold->live & REG_LIVE_READ)) /* explored state didn't use this */ return true;
- if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, live)) == 0) + equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, frameno)) == 0; + + if (rold->type == PTR_TO_STACK) + /* two stack pointers are equal only if they're pointing to + * the same stack frame, since fp-8 in foo != fp-8 in bar + */ + return equal && rold->frameno == rcur->frameno; + + if (equal) return true;
if (rold->type == NOT_INIT) @@@ -3495,7 -4071,6 +4077,6 @@@ tnum_in(rold->var_off, rcur->var_off); case PTR_TO_CTX: case CONST_PTR_TO_MAP: - case PTR_TO_STACK: case PTR_TO_PACKET_END: /* Only valid matches are exact, which memcmp() above * would have accepted @@@ -3510,8 -4085,8 +4091,8 @@@ return false; }
- static bool stacksafe(struct bpf_verifier_state *old, - struct bpf_verifier_state *cur, + static bool stacksafe(struct bpf_func_state *old, + struct bpf_func_state *cur, struct idpair *idmap) { int i, spi; @@@ -3529,8 -4104,19 +4110,19 @@@ for (i = 0; i < old->allocated_stack; i++) { spi = i / BPF_REG_SIZE;
+ if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) + /* explored state didn't use this */ + return true; + if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) continue; + /* if old state was safe with misc data in the stack + * it will be safe with zero-initialized stack. + * The opposite is not true + */ + if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC && + cur->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_ZERO) + continue; if (old->stack[spi].slot_type[i % BPF_REG_SIZE] != cur->stack[spi].slot_type[i % BPF_REG_SIZE]) /* Ex: old explored (safe) state has STACK_SPILL in @@@ -3587,9 -4173,8 +4179,8 @@@ * whereas register type in current state is meaningful, it means that * the current state will reach 'bpf_exit' instruction safely */ - static bool states_equal(struct bpf_verifier_env *env, - struct bpf_verifier_state *old, - struct bpf_verifier_state *cur) + static bool func_states_equal(struct bpf_func_state *old, + struct bpf_func_state *cur) { struct idpair *idmap; bool ret = false; @@@ -3613,71 -4198,72 +4204,72 @@@ out_free return ret; }
+ static bool states_equal(struct bpf_verifier_env *env, + struct bpf_verifier_state *old, + struct bpf_verifier_state *cur) + { + int i; + + if (old->curframe != cur->curframe) + return false; + + /* for states to be equal callsites have to be the same + * and all frame states need to be equivalent + */ + for (i = 0; i <= old->curframe; i++) { + if (old->frame[i]->callsite != cur->frame[i]->callsite) + return false; + if (!func_states_equal(old->frame[i], cur->frame[i])) + return false; + } + return true; + } + /* A write screens off any subsequent reads; but write marks come from the - * straight-line code between a state and its parent. When we arrive at a - * jump target (in the first iteration of the propagate_liveness() loop), - * we didn't arrive by the straight-line code, so read marks in state must - * propagate to parent regardless of state's write marks. + * straight-line code between a state and its parent. When we arrive at an + * equivalent state (jump target or such) we didn't arrive by the straight-line + * code, so read marks in the state must propagate to the parent regardless + * of the state's write marks. That's what 'parent == state->parent' comparison + * in mark_reg_read() and mark_stack_slot_read() is for. */ - static bool do_propagate_liveness(const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent) + static int propagate_liveness(struct bpf_verifier_env *env, + const struct bpf_verifier_state *vstate, + struct bpf_verifier_state *vparent) { - bool writes = parent == state->parent; /* Observe write marks */ - bool touched = false; /* any changes made? */ - int i; + int i, frame, err = 0; + struct bpf_func_state *state, *parent;
- if (!parent) - return touched; + if (vparent->curframe != vstate->curframe) { + WARN(1, "propagate_live: parent frame %d current frame %d\n", + vparent->curframe, vstate->curframe); + return -EFAULT; + } /* Propagate read liveness of registers... */ BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); /* We don't need to worry about FP liveness because it's read-only */ for (i = 0; i < BPF_REG_FP; i++) { - if (parent->regs[i].live & REG_LIVE_READ) + if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ) continue; - if (writes && (state->regs[i].live & REG_LIVE_WRITTEN)) - continue; - if (state->regs[i].live & REG_LIVE_READ) { - parent->regs[i].live |= REG_LIVE_READ; - touched = true; + if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) { + err = mark_reg_read(env, vstate, vparent, i); + if (err) + return err; } } + /* ... and stack slots */ - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && - i < parent->allocated_stack / BPF_REG_SIZE; i++) { - if (parent->stack[i].slot_type[0] != STACK_SPILL) - continue; - if (state->stack[i].slot_type[0] != STACK_SPILL) - continue; - if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) - continue; - if (writes && - (state->stack[i].spilled_ptr.live & REG_LIVE_WRITTEN)) - continue; - if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) { - parent->stack[i].spilled_ptr.live |= REG_LIVE_READ; - touched = true; + for (frame = 0; frame <= vstate->curframe; frame++) { + state = vstate->frame[frame]; + parent = vparent->frame[frame]; + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && + i < parent->allocated_stack / BPF_REG_SIZE; i++) { + if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) + continue; + if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) + mark_stack_slot_read(env, vstate, vparent, i, frame); } } - return touched; - } - - /* "parent" is "a state from which we reach the current state", but initially - * it is not the state->parent (i.e. 
"the state whose straight-line code leads - * to the current state"), instead it is the state that happened to arrive at - * a (prunable) equivalent of the current state. See comment above - * do_propagate_liveness() for consequences of this. - * This function is just a more efficient way of calling mark_reg_read() or - * mark_stack_slot_read() on each reg in "parent" that is read in "state", - * though it requires that parent != state->parent in the call arguments. - */ - static void propagate_liveness(const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent) - { - while (do_propagate_liveness(state, parent)) { - /* Something changed, so we need to feed those changes onward */ - state = parent; - parent = state->parent; - } + return err; }
static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) @@@ -3685,7 -4271,7 +4277,7 @@@ struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; struct bpf_verifier_state *cur = env->cur_state; - int i, err; + int i, j, err;
sl = env->explored_states[insn_idx]; if (!sl) @@@ -3706,7 -4292,9 +4298,9 @@@ * they'll be immediately forgotten as we're pruning * this state and will pop a new one. */ - propagate_liveness(&sl->state, cur); + err = propagate_liveness(env, &sl->state, cur); + if (err) + return err; return 1; } sl = sl->next; @@@ -3714,9 -4302,10 +4308,10 @@@
/* there were no equivalent states, remember current one. * technically the current state is not proven to be safe yet, - * but it will either reach bpf_exit (which means it's safe) or - * it will be rejected. Since there are no loops, we won't be - * seeing this 'insn_idx' instruction again on the way to bpf_exit + * but it will either reach outer most bpf_exit (which means it's safe) + * or it will be rejected. Since there are no loops, we won't be + * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx) + * again on the way to bpf_exit */ new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); if (!new_sl) @@@ -3740,10 -4329,15 +4335,15 @@@ * explored_states can get read marks.) */ for (i = 0; i < BPF_REG_FP; i++) - cur->regs[i].live = REG_LIVE_NONE; - for (i = 0; i < cur->allocated_stack / BPF_REG_SIZE; i++) - if (cur->stack[i].slot_type[0] == STACK_SPILL) - cur->stack[i].spilled_ptr.live = REG_LIVE_NONE; + cur->frame[cur->curframe]->regs[i].live = REG_LIVE_NONE; + + /* all stack frames are accessible from callee, clear them all */ + for (j = 0; j <= cur->curframe; j++) { + struct bpf_func_state *frame = cur->frame[j]; + + for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) + frame->stack[i].spilled_ptr.live = REG_LIVE_NONE; + } return 0; }
@@@ -3761,7 -4355,7 +4361,7 @@@ static int do_check(struct bpf_verifier struct bpf_verifier_state *state; struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; - int insn_cnt = env->prog->len; + int insn_cnt = env->prog->len, i; int insn_idx, prev_insn_idx = 0; int insn_processed = 0; bool do_print_state = false; @@@ -3769,9 -4363,18 +4369,18 @@@ state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); if (!state) return -ENOMEM; - env->cur_state = state; - init_reg_state(env, state->regs); + state->curframe = 0; state->parent = NULL; + state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); + if (!state->frame[0]) { + kfree(state); + return -ENOMEM; + } + env->cur_state = state; + init_func_state(env, state->frame[0], + BPF_MAIN_FUNC /* callsite */, + 0 /* frameno */, + 0 /* subprogno, zero == main subprog */); insn_idx = 0; for (;;) { struct bpf_insn *insn; @@@ -3818,7 -4421,7 +4427,7 @@@ else verbose(env, "\nfrom %d to %d:", prev_insn_idx, insn_idx); - print_verifier_state(env, state); + print_verifier_state(env, state->frame[state->curframe]); do_print_state = false; }
@@@ -3951,13 -4554,17 +4560,17 @@@ if (opcode == BPF_CALL) { if (BPF_SRC(insn->code) != BPF_K || insn->off != 0 || - insn->src_reg != BPF_REG_0 || + (insn->src_reg != BPF_REG_0 && + insn->src_reg != BPF_PSEUDO_CALL) || insn->dst_reg != BPF_REG_0) { verbose(env, "BPF_CALL uses reserved fields\n"); return -EINVAL; }
- err = check_call(env, insn->imm, insn_idx); + if (insn->src_reg == BPF_PSEUDO_CALL) + err = check_func_call(env, insn, &insn_idx); + else + err = check_helper_call(env, insn->imm, insn_idx); if (err) return err;
@@@ -3982,6 -4589,16 +4595,16 @@@ return -EINVAL; }
+ if (state->curframe) { + /* exit from nested function */ + prev_insn_idx = insn_idx; + err = prepare_func_exit(env, &insn_idx); + if (err) + return err; + do_print_state = true; + continue; + } + /* eBPF calling convetion is such that R0 is used * to return the value from eBPF program. * Make sure that it's readable at this time @@@ -4042,8 -4659,16 +4665,16 @@@ process_bpf_exit insn_idx++; }
- verbose(env, "processed %d insns, stack depth %d\n", insn_processed, - env->prog->aux->stack_depth); + verbose(env, "processed %d insns, stack depth ", insn_processed); + for (i = 0; i < env->subprog_cnt + 1; i++) { + u32 depth = env->subprog_stack_depth[i]; + + verbose(env, "%d", depth); + if (i + 1 < env->subprog_cnt + 1) + verbose(env, "+"); + } + verbose(env, "\n"); + env->prog->aux->stack_depth = env->subprog_stack_depth[0]; return 0; }
@@@ -4229,6 -4854,19 +4860,19 @@@ static int adjust_insn_aux_data(struct return 0; }
+ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len) + { + int i; + + if (len == 1) + return; + for (i = 0; i < env->subprog_cnt; i++) { + if (env->subprog_starts[i] < off) + continue; + env->subprog_starts[i] += len - 1; + } + } + static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, const struct bpf_insn *patch, u32 len) { @@@ -4239,6 -4877,7 +4883,7 @@@ return NULL; if (adjust_insn_aux_data(env, new_prog->len, off, len)) return NULL; + adjust_subprog_starts(env, off, len); return new_prog; }
@@@ -4373,6 -5012,150 +5018,150 @@@ static int convert_ctx_accesses(struct return 0; }
+ static int jit_subprogs(struct bpf_verifier_env *env) + { + struct bpf_prog *prog = env->prog, **func, *tmp; + int i, j, subprog_start, subprog_end = 0, len, subprog; + struct bpf_insn *insn = prog->insnsi; + void *old_bpf_func; + int err = -ENOMEM; + + if (env->subprog_cnt == 0) + return 0; + + for (i = 0; i < prog->len; i++, insn++) { + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + subprog = find_subprog(env, i + insn->imm + 1); + if (subprog < 0) { + WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", + i + insn->imm + 1); + return -EFAULT; + } + /* temporarily remember subprog id inside insn instead of + * aux_data, since next loop will split up all insns into funcs + */ + insn->off = subprog + 1; + /* remember original imm in case JIT fails and fallback + * to interpreter will be needed + */ + env->insn_aux_data[i].call_imm = insn->imm; + /* point imm to __bpf_call_base+1 from JITs point of view */ + insn->imm = 1; + } + + func = kzalloc(sizeof(prog) * (env->subprog_cnt + 1), GFP_KERNEL); + if (!func) + return -ENOMEM; + + for (i = 0; i <= env->subprog_cnt; i++) { + subprog_start = subprog_end; + if (env->subprog_cnt == i) + subprog_end = prog->len; + else + subprog_end = env->subprog_starts[i]; + + len = subprog_end - subprog_start; + func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER); + if (!func[i]) + goto out_free; + memcpy(func[i]->insnsi, &prog->insnsi[subprog_start], + len * sizeof(struct bpf_insn)); + func[i]->len = len; + func[i]->is_func = 1; + /* Use bpf_prog_F_tag to indicate functions in stack traces. 
+ * Long term would need debug info to populate names + */ + func[i]->aux->name[0] = 'F'; + func[i]->aux->stack_depth = env->subprog_stack_depth[i]; + func[i]->jit_requested = 1; + func[i] = bpf_int_jit_compile(func[i]); + if (!func[i]->jited) { + err = -ENOTSUPP; + goto out_free; + } + cond_resched(); + } + /* at this point all bpf functions were successfully JITed + * now populate all bpf_calls with correct addresses and + * run last pass of JIT + */ + for (i = 0; i <= env->subprog_cnt; i++) { + insn = func[i]->insnsi; + for (j = 0; j < func[i]->len; j++, insn++) { + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + subprog = insn->off; + insn->off = 0; + insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) + func[subprog]->bpf_func - + __bpf_call_base; + } + } + for (i = 0; i <= env->subprog_cnt; i++) { + old_bpf_func = func[i]->bpf_func; + tmp = bpf_int_jit_compile(func[i]); + if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) { + verbose(env, "JIT doesn't support bpf-to-bpf calls\n"); + err = -EFAULT; + goto out_free; + } + cond_resched(); + } + + /* finally lock prog and jit images for all functions and + * populate kallsysm + */ + for (i = 0; i <= env->subprog_cnt; i++) { + bpf_prog_lock_ro(func[i]); + bpf_prog_kallsyms_add(func[i]); + } + prog->jited = 1; + prog->bpf_func = func[0]->bpf_func; + prog->aux->func = func; + prog->aux->func_cnt = env->subprog_cnt + 1; + return 0; + out_free: + for (i = 0; i <= env->subprog_cnt; i++) + if (func[i]) + bpf_jit_free(func[i]); + kfree(func); + /* cleanup main prog to be interpreted */ + prog->jit_requested = 0; + for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + insn->off = 0; + insn->imm = env->insn_aux_data[i].call_imm; + } + return err; + } + + static int fixup_call_args(struct bpf_verifier_env *env) + { + struct bpf_prog *prog = env->prog; + struct bpf_insn *insn = 
prog->insnsi; + int i, depth; + + if (env->prog->jit_requested) + if (jit_subprogs(env) == 0) + return 0; + + for (i = 0; i < prog->len; i++, insn++) { + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + depth = get_callee_stack_depth(env, insn, i); + if (depth < 0) + return depth; + bpf_patch_call_args(insn, depth); + } + return 0; + } + /* fixup insn->imm field of bpf_call instructions * and inline eligible helpers as explicit sequence of BPF instructions * @@@ -4392,11 -5175,15 +5181,15 @@@ static int fixup_bpf_calls(struct bpf_v for (i = 0; i < insn_cnt; i++, insn++) { if (insn->code != (BPF_JMP | BPF_CALL)) continue; + if (insn->src_reg == BPF_PSEUDO_CALL) + continue;
if (insn->imm == BPF_FUNC_get_route_realm) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) bpf_user_rnd_init_once(); + if (insn->imm == BPF_FUNC_override_return) + prog->kprobe_override = 1; if (insn->imm == BPF_FUNC_tail_call) { /* If we tail call into other programs, we * cannot make any assumptions since they can @@@ -4419,7 -5206,7 +5212,7 @@@ /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup * handlers are currently limited to 64 bit only. */ - if (ebpf_jit_enabled() && BITS_PER_LONG == 64 && + if (prog->jit_requested && BITS_PER_LONG == 64 && insn->imm == BPF_FUNC_map_lookup_elem) { map_ptr = env->insn_aux_data[i + delta].map_ptr; if (map_ptr == BPF_MAP_PTR_POISON || @@@ -4571,12 -5358,12 +5364,12 @@@ int bpf_check(struct bpf_prog **prog, u if (!env->explored_states) goto skip_full_check;
+ env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); + ret = check_cfg(env); if (ret < 0) goto skip_full_check;
- env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); - ret = do_check(env); if (env->cur_state) { free_verifier_state(env->cur_state, true); @@@ -4597,6 -5384,9 +5390,9 @@@ skip_full_check if (ret == 0) ret = fixup_bpf_calls(env);
+ if (ret == 0) + ret = fixup_call_args(env); + if (log->level && bpf_verifier_log_full(log)) ret = -ENOSPC; if (log->level && !log->ubuf) { diff --combined kernel/events/core.c index 4dd0e1ea876d,878d86c513d6..812f03e99ef3 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@@ -4511,11 -4511,11 +4511,11 @@@ perf_read(struct file *file, char __use return ret; }
-static unsigned int perf_poll(struct file *file, poll_table *wait) +static __poll_t perf_poll(struct file *file, poll_table *wait) { struct perf_event *event = file->private_data; struct ring_buffer *rb; - unsigned int events = POLLHUP; + __poll_t events = POLLHUP;
poll_wait(file, &event->waitq, wait);
@@@ -4723,6 -4723,9 +4723,9 @@@ static long _perf_ioctl(struct perf_eve rcu_read_unlock(); return 0; } + + case PERF_EVENT_IOC_QUERY_BPF: + return perf_event_query_prog_array(event, (void __user *)arg); default: return -ENOTTY; } @@@ -8080,6 -8083,13 +8083,13 @@@ static int perf_event_set_bpf_prog(stru return -EINVAL; }
+ /* Kprobe override only works for kprobes, not uprobes. */ + if (prog->kprobe_override && + !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) { + bpf_prog_put(prog); + return -EINVAL; + } + if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event);
diff --combined kernel/module.c index 8042b8fcbf14,bd695bfdc5c4..83075a104710 --- a/kernel/module.c +++ b/kernel/module.c @@@ -3118,7 -3118,11 +3118,11 @@@ static int find_module_sections(struct sizeof(*mod->ftrace_callsites), &mod->num_ftrace_callsites); #endif - + #ifdef CONFIG_BPF_KPROBE_OVERRIDE + mod->kprobe_ei_funcs = section_objs(info, "_kprobe_error_inject_list", + sizeof(*mod->kprobe_ei_funcs), + &mod->num_kprobe_ei_funcs); + #endif mod->extable = section_objs(info, "__ex_table", sizeof(*mod->extable), &mod->num_exentries);
@@@ -3938,12 -3942,6 +3942,12 @@@ static const char *get_ksymbol(struct m return symname(kallsyms, best); }
+void * __weak dereference_module_function_descriptor(struct module *mod, + void *ptr) +{ + return ptr; +} + /* For kallsyms to ask for address resolution. NULL means not found. Careful * not to lock to avoid deadlock on oopses, simply disable preemption. */ const char *module_address_lookup(unsigned long addr, diff --combined net/atm/common.c index 8f12f1c6fa14,5763fd241dc3..6523f38c4957 --- a/net/atm/common.c +++ b/net/atm/common.c @@@ -14,7 -14,7 +14,7 @@@ #include <linux/capability.h> #include <linux/mm.h> #include <linux/sched/signal.h> - #include <linux/time.h> /* struct timeval */ + #include <linux/time64.h> /* 64-bit time for seconds */ #include <linux/skbuff.h> #include <linux/bitops.h> #include <linux/init.h> @@@ -648,11 -648,11 +648,11 @@@ out return error; }
-unsigned int vcc_poll(struct file *file, struct socket *sock, poll_table *wait) +__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct atm_vcc *vcc; - unsigned int mask; + __poll_t mask;
sock_poll_wait(file, sk_sleep(sk), wait); mask = 0; diff --combined net/batman-adv/icmp_socket.c index a98e0a986cef,f2ef75b7fa73..70951174fa17 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@@ -1,3 -1,4 +1,4 @@@ + // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner @@@ -26,6 -27,7 +27,7 @@@ #include <linux/export.h> #include <linux/fcntl.h> #include <linux/fs.h> + #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/kernel.h> #include <linux/list.h> @@@ -55,6 -57,9 +57,9 @@@ static void batadv_socket_add_packet(st struct batadv_icmp_header *icmph, size_t icmp_len);
+ /** + * batadv_socket_init() - Initialize soft interface independent socket data + */ void batadv_socket_init(void) { memset(batadv_socket_client_hash, 0, sizeof(batadv_socket_client_hash)); @@@ -292,7 -297,7 +297,7 @@@ out return len; }
-static unsigned int batadv_socket_poll(struct file *file, poll_table *wait) +static __poll_t batadv_socket_poll(struct file *file, poll_table *wait) { struct batadv_socket_client *socket_client = file->private_data;
@@@ -314,6 -319,12 +319,12 @@@ static const struct file_operations bat .llseek = no_llseek, };
+ /** + * batadv_socket_setup() - Create debugfs "socket" file + * @bat_priv: the bat priv with all the soft interface information + * + * Return: 0 on success or negative error number in case of failure + */ int batadv_socket_setup(struct batadv_priv *bat_priv) { struct dentry *d; @@@ -333,7 -344,7 +344,7 @@@ err }
/** - * batadv_socket_add_packet - schedule an icmp packet to be sent to + * batadv_socket_add_packet() - schedule an icmp packet to be sent to * userspace on an icmp socket. * @socket_client: the socket this packet belongs to * @icmph: pointer to the header of the icmp packet @@@ -390,7 -401,7 +401,7 @@@ static void batadv_socket_add_packet(st }
/** - * batadv_socket_receive_packet - schedule an icmp packet to be received + * batadv_socket_receive_packet() - schedule an icmp packet to be received * locally and sent to userspace. * @icmph: pointer to the header of the icmp packet * @icmp_len: total length of the icmp packet diff --combined net/batman-adv/log.c index 76451460c98d,da004980ab8b..9be74a44e99d --- a/net/batman-adv/log.c +++ b/net/batman-adv/log.c @@@ -1,3 -1,4 +1,4 @@@ + // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: * * Marek Lindner @@@ -24,6 -25,7 +25,7 @@@ #include <linux/export.h> #include <linux/fcntl.h> #include <linux/fs.h> + #include <linux/gfp.h> #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/module.h> @@@ -86,6 -88,13 +88,13 @@@ static int batadv_fdebug_log(struct bat return 0; }
+ /** + * batadv_debug_log() - Add debug log entry + * @bat_priv: the bat priv with all the soft interface information + * @fmt: format string + * + * Return: 0 on success or negative error number in case of failure + */ int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...) { va_list args; @@@ -176,7 -185,7 +185,7 @@@ static ssize_t batadv_log_read(struct f return error; }
-static unsigned int batadv_log_poll(struct file *file, poll_table *wait) +static __poll_t batadv_log_poll(struct file *file, poll_table *wait) { struct batadv_priv *bat_priv = file->private_data; struct batadv_priv_debug_log *debug_log = bat_priv->debug_log; @@@ -197,6 -206,12 +206,12 @@@ static const struct file_operations bat .llseek = no_llseek, };
+ /** + * batadv_debug_log_setup() - Initialize debug log + * @bat_priv: the bat priv with all the soft interface information + * + * Return: 0 on success or negative error number in case of failure + */ int batadv_debug_log_setup(struct batadv_priv *bat_priv) { struct dentry *d; @@@ -222,6 -237,10 +237,10 @@@ err return -ENOMEM; }
+ /** + * batadv_debug_log_cleanup() - Destroy debug log + * @bat_priv: the bat priv with all the soft interface information + */ void batadv_debug_log_cleanup(struct batadv_priv *bat_priv) { kfree(bat_priv->debug_log); diff --combined net/bluetooth/af_bluetooth.c index 671b907ba678,f044202346c6..f897681780db --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@@ -421,7 -421,7 +421,7 @@@ out } EXPORT_SYMBOL(bt_sock_stream_recvmsg);
-static inline unsigned int bt_accept_poll(struct sock *parent) +static inline __poll_t bt_accept_poll(struct sock *parent) { struct bt_sock *s, *n; struct sock *sk; @@@ -437,11 -437,11 +437,11 @@@ return 0; }
-unsigned int bt_sock_poll(struct file *file, struct socket *sock, +__poll_t bt_sock_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; - unsigned int mask = 0; + __poll_t mask = 0;
BT_DBG("sock %p, sk %p", sock, sk);
@@@ -766,43 -766,39 +766,39 @@@ static int __init bt_init(void return err;
err = sock_register(&bt_sock_family_ops); - if (err < 0) { - bt_sysfs_cleanup(); - return err; - } + if (err) + goto cleanup_sysfs;
BT_INFO("HCI device and connection manager initialized");
err = hci_sock_init(); - if (err < 0) - goto error; + if (err) + goto unregister_socket;
err = l2cap_init(); - if (err < 0) - goto sock_err; + if (err) + goto cleanup_socket;
err = sco_init(); - if (err < 0) { - l2cap_exit(); - goto sock_err; - } + if (err) + goto cleanup_cap;
err = mgmt_init(); - if (err < 0) { - sco_exit(); - l2cap_exit(); - goto sock_err; - } + if (err) + goto cleanup_sco;
return 0;
- sock_err: + cleanup_sco: + sco_exit(); + cleanup_cap: + l2cap_exit(); + cleanup_socket: hci_sock_cleanup(); - - error: + unregister_socket: sock_unregister(PF_BLUETOOTH); + cleanup_sysfs: bt_sysfs_cleanup(); - return err; }
diff --combined net/core/dev.c index 01ee854454a8,c7db39926769..59ead3910ab7 --- a/net/core/dev.c +++ b/net/core/dev.c @@@ -1542,6 -1542,23 +1542,23 @@@ void dev_disable_lro(struct net_device } EXPORT_SYMBOL(dev_disable_lro);
+ /** + * dev_disable_gro_hw - disable HW Generic Receive Offload on a device + * @dev: device + * + * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be + * called under RTNL. This is needed if Generic XDP is installed on + * the device. + */ + static void dev_disable_gro_hw(struct net_device *dev) + { + dev->wanted_features &= ~NETIF_F_GRO_HW; + netdev_update_features(dev); + + if (unlikely(dev->features & NETIF_F_GRO_HW)) + netdev_WARN(dev, "failed to disable GRO_HW!\n"); + } + static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, struct net_device *dev) { @@@ -2803,7 -2820,7 +2820,7 @@@ struct sk_buff *__skb_gso_segment(struc
segs = skb_mac_gso_segment(skb, features);
- if (unlikely(skb_needs_check(skb, tx_path))) + if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) skb_warn_bad_offload(skb);
return segs; @@@ -3162,6 -3179,21 +3179,21 @@@ static inline int __dev_xmit_skb(struc int rc;
qdisc_calculate_pkt_len(skb, q); + + if (q->flags & TCQ_F_NOLOCK) { + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { + __qdisc_drop(skb, &to_free); + rc = NET_XMIT_DROP; + } else { + rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; + __qdisc_run(q); + } + + if (unlikely(to_free)) + kfree_skb_list(to_free); + return rc; + } + /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. @@@ -3192,9 -3224,9 +3224,9 @@@ contended = false; } __qdisc_run(q); - } else - qdisc_run_end(q); + }
+ qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; @@@ -3204,6 -3236,7 +3236,7 @@@ contended = false; } __qdisc_run(q); + qdisc_run_end(q); } } spin_unlock(root_lock); @@@ -3904,7 -3937,7 +3937,7 @@@ static u32 netif_receive_generic_xdp(st hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) goto do_drop; - if (troom > 0 && __skb_linearize(skb)) + if (skb_linearize(skb)) goto do_drop; }
@@@ -4143,19 -4176,22 +4176,22 @@@ static __latent_entropy void net_tx_act
while (head) { struct Qdisc *q = head; - spinlock_t *root_lock; + spinlock_t *root_lock = NULL;
head = head->next_sched;
- root_lock = qdisc_lock(q); - spin_lock(root_lock); + if (!(q->flags & TCQ_F_NOLOCK)) { + root_lock = qdisc_lock(q); + spin_lock(root_lock); + } /* We need to make sure head->next_sched is read * before clearing __QDISC_STATE_SCHED */ smp_mb__before_atomic(); clear_bit(__QDISC_STATE_SCHED, &q->state); qdisc_run(q); - spin_unlock(root_lock); + if (root_lock) + spin_unlock(root_lock); } } } @@@ -4545,6 -4581,7 +4581,7 @@@ static int generic_xdp_install(struct n } else if (new && !old) { static_key_slow_inc(&generic_xdp_needed); dev_disable_lro(dev); + dev_disable_gro_hw(dev); } break;
@@@ -7073,17 -7110,21 +7110,21 @@@ int dev_change_proto_down(struct net_de } EXPORT_SYMBOL(dev_change_proto_down);
- u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op, u32 *prog_id) + void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, + struct netdev_bpf *xdp) { - struct netdev_bpf xdp; - - memset(&xdp, 0, sizeof(xdp)); - xdp.command = XDP_QUERY_PROG; + memset(xdp, 0, sizeof(*xdp)); + xdp->command = XDP_QUERY_PROG;
/* Query must always succeed. */ - WARN_ON(bpf_op(dev, &xdp) < 0); - if (prog_id) - *prog_id = xdp.prog_id; + WARN_ON(bpf_op(dev, xdp) < 0); + } + + static u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op) + { + struct netdev_bpf xdp; + + __dev_xdp_query(dev, bpf_op, &xdp);
return xdp.prog_attached; } @@@ -7106,6 -7147,27 +7147,27 @@@ static int dev_xdp_install(struct net_d return bpf_op(dev, &xdp); }
+ static void dev_xdp_uninstall(struct net_device *dev) + { + struct netdev_bpf xdp; + bpf_op_t ndo_bpf; + + /* Remove generic XDP */ + WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL)); + + /* Remove from the driver */ + ndo_bpf = dev->netdev_ops->ndo_bpf; + if (!ndo_bpf) + return; + + __dev_xdp_query(dev, ndo_bpf, &xdp); + if (xdp.prog_attached == XDP_ATTACHED_NONE) + return; + + /* Program removal should always succeed */ + WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL)); + } + /** * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device @@@ -7134,10 -7196,10 +7196,10 @@@ int dev_change_xdp_fd(struct net_devic bpf_chk = generic_xdp_install;
if (fd >= 0) { - if (bpf_chk && __dev_xdp_attached(dev, bpf_chk, NULL)) + if (bpf_chk && __dev_xdp_attached(dev, bpf_chk)) return -EEXIST; if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && - __dev_xdp_attached(dev, bpf_op, NULL)) + __dev_xdp_attached(dev, bpf_op)) return -EBUSY;
prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, @@@ -7236,6 -7298,7 +7298,7 @@@ static void rollback_registered_many(st /* Shutdown queueing discipline. */ dev_shutdown(dev);
+ dev_xdp_uninstall(dev);
/* Notify protocols, that we are about to destroy * this device. They should clean all the things. @@@ -7379,6 -7442,18 +7442,18 @@@ static netdev_features_t netdev_fix_fea features &= ~dev->gso_partial_features; }
+ if (!(features & NETIF_F_RXCSUM)) { + /* NETIF_F_GRO_HW implies doing RXCSUM since every packet + * successfully merged by hardware must also have the + * checksum verified by hardware. If the user does not + * want to enable RXCSUM, logically, we should disable GRO_HW. + */ + if (features & NETIF_F_GRO_HW) { + netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n"); + features &= ~NETIF_F_GRO_HW; + } + } + return features; }
@@@ -8195,7 -8270,6 +8270,6 @@@ EXPORT_SYMBOL(alloc_netdev_mqs) void free_netdev(struct net_device *dev) { struct napi_struct *p, *n; - struct bpf_prog *prog;
might_sleep(); netif_free_tx_queues(dev); @@@ -8214,12 -8288,6 +8288,6 @@@ free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL;
- prog = rcu_dereference_protected(dev->xdp_prog, 1); - if (prog) { - bpf_prog_put(prog); - static_key_slow_dec(&generic_xdp_needed); - } - /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { netdev_freemem(dev); diff --combined net/core/sock.c index 1211159718ad,72d14b221784..420c380bc61d --- a/net/core/sock.c +++ b/net/core/sock.c @@@ -145,6 -145,8 +145,8 @@@ static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list);
+ static void sock_inuse_add(struct net *net, int val); + /** * sk_ns_capable - General socket capability test * @sk: Socket to use a capability on or through @@@ -1531,8 -1533,11 +1533,11 @@@ struct sock *sk_alloc(struct net *net, sk->sk_kern_sock = kern; sock_lock_init(sk); sk->sk_net_refcnt = kern ? 0 : 1; - if (likely(sk->sk_net_refcnt)) + if (likely(sk->sk_net_refcnt)) { get_net(net); + sock_inuse_add(net, 1); + } + sock_net_set(sk, net); refcount_set(&sk->sk_wmem_alloc, 1);
@@@ -1595,6 -1600,9 +1600,9 @@@ void sk_destruct(struct sock *sk
static void __sk_free(struct sock *sk) { + if (likely(sk->sk_net_refcnt)) + sock_inuse_add(sock_net(sk), -1); + if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt)) sock_diag_broadcast_destroy(sk); else @@@ -1716,6 -1724,8 +1724,8 @@@ struct sock *sk_clone_lock(const struc newsk->sk_priority = 0; newsk->sk_incoming_cpu = raw_smp_processor_id(); atomic64_set(&newsk->sk_cookie, 0); + if (likely(newsk->sk_net_refcnt)) + sock_inuse_add(sock_net(newsk), 1);
/* * Before updating sk_refcnt, we must commit prior changes to memory @@@ -2496,7 -2506,7 +2506,7 @@@ int sock_no_getname(struct socket *sock } EXPORT_SYMBOL(sock_no_getname);
-unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt) +__poll_t sock_no_poll(struct file *file, struct socket *sock, poll_table *pt) { return 0; } @@@ -3045,7 -3055,7 +3055,7 @@@ static DECLARE_BITMAP(proto_inuse_idx,
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) { - __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val); + __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val); } EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
@@@ -3055,21 -3065,50 +3065,50 @@@ int sock_prot_inuse_get(struct net *net int res = 0;
for_each_possible_cpu(cpu) - res += per_cpu_ptr(net->core.inuse, cpu)->val[idx]; + res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
return res >= 0 ? res : 0; } EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
+ static void sock_inuse_add(struct net *net, int val) + { + this_cpu_add(*net->core.sock_inuse, val); + } + + int sock_inuse_get(struct net *net) + { + int cpu, res = 0; + + for_each_possible_cpu(cpu) + res += *per_cpu_ptr(net->core.sock_inuse, cpu); + + return res; + } + + EXPORT_SYMBOL_GPL(sock_inuse_get); + static int __net_init sock_inuse_init_net(struct net *net) { - net->core.inuse = alloc_percpu(struct prot_inuse); - return net->core.inuse ? 0 : -ENOMEM; + net->core.prot_inuse = alloc_percpu(struct prot_inuse); + if (net->core.prot_inuse == NULL) + return -ENOMEM; + + net->core.sock_inuse = alloc_percpu(int); + if (net->core.sock_inuse == NULL) + goto out; + + return 0; + + out: + free_percpu(net->core.prot_inuse); + return -ENOMEM; }
static void __net_exit sock_inuse_exit_net(struct net *net) { - free_percpu(net->core.inuse); + free_percpu(net->core.prot_inuse); + free_percpu(net->core.sock_inuse); }
static struct pernet_operations net_inuse_ops = { @@@ -3112,6 -3151,10 +3151,10 @@@ static inline void assign_proto_idx(str static inline void release_proto_idx(struct proto *prot) { } + + static void sock_inuse_add(struct net *net, int val) + { + } #endif
static void req_prot_cleanup(struct request_sock_ops *rsk_prot) diff --combined net/dccp/proto.c index 8b8db3d481bd,7a75a1d3568b..ada84f62b6bd --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@@ -110,7 -110,7 +110,7 @@@ void dccp_set_state(struct sock *sk, co /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. */ - sk->sk_state = state; + inet_sk_set_state(sk, state); }
EXPORT_SYMBOL_GPL(dccp_set_state); @@@ -318,10 -318,10 +318,10 @@@ EXPORT_SYMBOL_GPL(dccp_disconnect) * take care of normal races (between the test and the event) and we don't * go look at any of the socket buffers directly. */ -unsigned int dccp_poll(struct file *file, struct socket *sock, +__poll_t dccp_poll(struct file *file, struct socket *sock, poll_table *wait) { - unsigned int mask; + __poll_t mask; struct sock *sk = sock->sk;
sock_poll_wait(file, sk_sleep(sk), wait); diff --combined net/ipv4/ip_gre.c index 45ffd3d045d2,90c912307814..78365094f56c --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@@ -114,7 -114,8 +114,8 @@@ MODULE_PARM_DESC(log_ecn_error, "Log pa static struct rtnl_link_ops ipgre_link_ops __read_mostly; static int ipgre_tunnel_init(struct net_device *dev); static void erspan_build_header(struct sk_buff *skb, - __be32 id, u32 index, bool truncate); + __be32 id, u32 index, + bool truncate, bool is_ipv4);
static unsigned int ipgre_net_id __read_mostly; static unsigned int gre_tap_net_id __read_mostly; @@@ -255,34 -256,43 +256,43 @@@ static int erspan_rcv(struct sk_buff *s { struct net *net = dev_net(skb->dev); struct metadata_dst *tun_dst = NULL; + struct erspan_base_hdr *ershdr; + struct erspan_metadata *pkt_md; struct ip_tunnel_net *itn; struct ip_tunnel *tunnel; - struct erspanhdr *ershdr; const struct iphdr *iph; - __be32 index; + int ver; int len;
itn = net_generic(net, erspan_net_id); len = gre_hdr_len + sizeof(*ershdr);
+ /* Check based hdr len */ if (unlikely(!pskb_may_pull(skb, len))) return PACKET_REJECT;
iph = ip_hdr(skb); - ershdr = (struct erspanhdr *)(skb->data + gre_hdr_len); + ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); + ver = (ntohs(ershdr->ver_vlan) & VER_MASK) >> VER_OFFSET;
/* The original GRE header does not have key field, * Use ERSPAN 10-bit session ID as key. */ tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK); - index = ershdr->md.index; tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags | TUNNEL_KEY, iph->saddr, iph->daddr, tpi->key);
if (tunnel) { + len = gre_hdr_len + erspan_hdr_len(ver); + if (unlikely(!pskb_may_pull(skb, len))) + return PACKET_REJECT; + + ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); + pkt_md = (struct erspan_metadata *)(ershdr + 1); + if (__iptunnel_pull_header(skb, - gre_hdr_len + sizeof(*ershdr), + len, htons(ETH_P_TEB), false, false) < 0) goto drop; @@@ -303,15 -313,32 +313,32 @@@ return PACKET_REJECT;
md = ip_tunnel_info_opts(&tun_dst->u.tun_info); - if (!md) + if (!md) { + dst_release((struct dst_entry *)tun_dst); return PACKET_REJECT; + } + + memcpy(md, pkt_md, sizeof(*md)); + md->version = ver;
- md->index = index; info = &tun_dst->u.tun_info; info->key.tun_flags |= TUNNEL_ERSPAN_OPT; info->options_len = sizeof(*md); } else { - tunnel->index = ntohl(index); + tunnel->erspan_ver = ver; + if (ver == 1) { + tunnel->index = ntohl(pkt_md->u.index); + } else { + u16 md2_flags; + u16 dir, hwid; + + md2_flags = ntohs(pkt_md->u.md2.flags); + dir = (md2_flags & DIR_MASK) >> DIR_OFFSET; + hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET; + tunnel->dir = dir; + tunnel->hwid = hwid; + } + }
skb_reset_mac_header(skb); @@@ -405,14 -432,17 +432,17 @@@ static int gre_rcv(struct sk_buff *skb if (hdr_len < 0) goto drop;
- if (unlikely(tpi.proto == htons(ETH_P_ERSPAN))) { + if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) || + tpi.proto == htons(ETH_P_ERSPAN2))) { if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) return 0; + goto out; }
if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) return 0;
+ out: icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: kfree_skb(skb); @@@ -560,6 -590,7 +590,7 @@@ static void erspan_fb_xmit(struct sk_bu bool truncate = false; struct flowi4 fl; int tunnel_hlen; + int version; __be16 df;
tun_info = skb_tunnel_info(skb); @@@ -568,9 -599,13 +599,13 @@@ goto err_free_skb;
key = &tun_info->key; + md = ip_tunnel_info_opts(tun_info); + if (!md) + goto err_free_rt;
/* ERSPAN has fixed 8 byte GRE header */ - tunnel_hlen = 8 + sizeof(struct erspanhdr); + version = md->version; + tunnel_hlen = 8 + erspan_hdr_len(version);
rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen); if (!rt) @@@ -584,12 -619,23 +619,23 @@@ truncate = true; }
- md = ip_tunnel_info_opts(tun_info); - if (!md) - goto err_free_rt; + if (version == 1) { + erspan_build_header(skb, tunnel_id_to_key32(key->tun_id), + ntohl(md->u.index), truncate, true); + } else if (version == 2) { + u16 md2_flags; + u8 direction; + u16 hwid;
- erspan_build_header(skb, tunnel_id_to_key32(key->tun_id), - ntohl(md->index), truncate); + md2_flags = ntohs(md->u.md2.flags); + direction = (md2_flags & DIR_MASK) >> DIR_OFFSET; + hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET; + + erspan_build_header_v2(skb, tunnel_id_to_key32(key->tun_id), + direction, hwid, truncate, true); + } else { + goto err_free_rt; + }
gre_build_header(skb, 8, TUNNEL_SEQ, htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++)); @@@ -668,52 -714,6 +714,6 @@@ free_skb return NETDEV_TX_OK; }
- static inline u8 tos_to_cos(u8 tos) - { - u8 dscp, cos; - - dscp = tos >> 2; - cos = dscp >> 3; - return cos; - } - - static void erspan_build_header(struct sk_buff *skb, - __be32 id, u32 index, bool truncate) - { - struct iphdr *iphdr = ip_hdr(skb); - struct ethhdr *eth = eth_hdr(skb); - enum erspan_encap_type enc_type; - struct erspanhdr *ershdr; - struct qtag_prefix { - __be16 eth_type; - __be16 tci; - } *qp; - u16 vlan_tci = 0; - - enc_type = ERSPAN_ENCAP_NOVLAN; - - /* If mirrored packet has vlan tag, extract tci and - * perserve vlan header in the mirrored frame. - */ - if (eth->h_proto == htons(ETH_P_8021Q)) { - qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN); - vlan_tci = ntohs(qp->tci); - enc_type = ERSPAN_ENCAP_INFRAME; - } - - skb_push(skb, sizeof(*ershdr)); - ershdr = (struct erspanhdr *)skb->data; - memset(ershdr, 0, sizeof(*ershdr)); - - ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) | - (ERSPAN_VERSION << VER_OFFSET)); - ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) | - ((tos_to_cos(iphdr->tos) << COS_OFFSET) & COS_MASK) | - (enc_type << EN_OFFSET & EN_MASK) | - ((truncate << T_OFFSET) & T_MASK)); - ershdr->md.index = htonl(index & INDEX_MASK); - } - static netdev_tx_t erspan_xmit(struct sk_buff *skb, struct net_device *dev) { @@@ -737,7 -737,14 +737,14 @@@ }
/* Push ERSPAN header */ - erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, truncate); + if (tunnel->erspan_ver == 1) + erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, + truncate, true); + else + erspan_build_header_v2(skb, tunnel->parms.o_key, + tunnel->dir, tunnel->hwid, + truncate, true); + tunnel->parms.o_flags &= ~TUNNEL_KEY; __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN)); return NETDEV_TX_OK; @@@ -1209,13 -1216,32 +1216,32 @@@ static int ipgre_netlink_parms(struct n if (data[IFLA_GRE_FWMARK]) *fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);
- if (data[IFLA_GRE_ERSPAN_INDEX]) { - t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); + if (data[IFLA_GRE_ERSPAN_VER]) { + t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);
- if (t->index & ~INDEX_MASK) + if (t->erspan_ver != 1 && t->erspan_ver != 2) return -EINVAL; }
+ if (t->erspan_ver == 1) { + if (data[IFLA_GRE_ERSPAN_INDEX]) { + t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); + if (t->index & ~INDEX_MASK) + return -EINVAL; + } + } else if (t->erspan_ver == 2) { + if (data[IFLA_GRE_ERSPAN_DIR]) { + t->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]); + if (t->dir & ~(DIR_MASK >> DIR_OFFSET)) + return -EINVAL; + } + if (data[IFLA_GRE_ERSPAN_HWID]) { + t->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]); + if (t->hwid & ~(HWID_MASK >> HWID_OFFSET)) + return -EINVAL; + } + } + return 0; }
@@@ -1282,7 -1308,7 +1308,7 @@@ static int erspan_tunnel_init(struct ne tunnel->tun_hlen = 8; tunnel->parms.iph.protocol = IPPROTO_GRE; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + - sizeof(struct erspanhdr); + erspan_hdr_len(tunnel->erspan_ver); t_hlen = tunnel->hlen + sizeof(struct iphdr);
dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4; @@@ -1310,7 -1336,6 +1336,7 @@@ static const struct net_device_ops ersp static void ipgre_tap_setup(struct net_device *dev) { ether_setup(dev); + dev->max_mtu = 0; dev->netdev_ops = &gre_tap_netdev_ops; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; @@@ -1413,6 -1438,12 +1439,12 @@@ static size_t ipgre_get_size(const stru nla_total_size(4) + /* IFLA_GRE_ERSPAN_INDEX */ nla_total_size(4) + + /* IFLA_GRE_ERSPAN_VER */ + nla_total_size(1) + + /* IFLA_GRE_ERSPAN_DIR */ + nla_total_size(1) + + /* IFLA_GRE_ERSPAN_HWID */ + nla_total_size(2) + 0; }
@@@ -1455,9 -1486,18 +1487,18 @@@ static int ipgre_fill_info(struct sk_bu goto nla_put_failure; }
- if (t->index) + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver)) + goto nla_put_failure; + + if (t->erspan_ver == 1) { if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index)) goto nla_put_failure; + } else if (t->erspan_ver == 2) { + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir)) + goto nla_put_failure; + if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid)) + goto nla_put_failure; + }
return 0;
@@@ -1493,6 -1533,9 +1534,9 @@@ static const struct nla_policy ipgre_po [IFLA_GRE_IGNORE_DF] = { .type = NLA_U8 }, [IFLA_GRE_FWMARK] = { .type = NLA_U32 }, [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 }, + [IFLA_GRE_ERSPAN_VER] = { .type = NLA_U8 }, + [IFLA_GRE_ERSPAN_DIR] = { .type = NLA_U8 }, + [IFLA_GRE_ERSPAN_HWID] = { .type = NLA_U16 }, };
static struct rtnl_link_ops ipgre_link_ops __read_mostly = { diff --combined net/ipv4/tcp.c index c4a7ee7f6721,67d39b79c801..ca042cdf8496 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@@ -283,8 -283,6 +283,6 @@@ #include <asm/ioctls.h> #include <net/busy_poll.h>
- #include <trace/events/tcp.h> - struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count);
@@@ -493,9 -491,9 +491,9 @@@ static void tcp_tx_timestamp(struct soc * take care of normal races (between the test and the event) and we don't * go look at any of the socket buffers directly. */ -unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) +__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) { - unsigned int mask; + __poll_t mask; struct sock *sk = sock->sk; const struct tcp_sock *tp = tcp_sk(sk); int state; @@@ -504,7 -502,7 +502,7 @@@
sock_poll_wait(file, sk_sleep(sk), wait);
- state = sk_state_load(sk); + state = inet_sk_state_load(sk); if (state == TCP_LISTEN) return inet_csk_listen_poll(sk);
@@@ -2040,8 -2038,6 +2038,6 @@@ void tcp_set_state(struct sock *sk, in { int oldstate = sk->sk_state;
- trace_tcp_set_state(sk, oldstate, state); - switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) @@@ -2065,7 -2061,7 +2061,7 @@@ /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. */ - sk_state_store(sk, state); + inet_sk_state_store(sk, state);
#ifdef STATE_TRACE SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]); @@@ -2920,7 -2916,7 +2916,7 @@@ void tcp_get_info(struct sock *sk, stru if (sk->sk_type != SOCK_STREAM) return;
- info->tcpi_state = sk_state_load(sk); + info->tcpi_state = inet_sk_state_load(sk);
/* Report meaningful fields for all TCP states, including listeners */ rate = READ_ONCE(sk->sk_pacing_rate); @@@ -3578,6 -3574,9 +3574,9 @@@ void __init tcp_init(void percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); inet_hashinfo_init(&tcp_hashinfo); + inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash", + thash_entries, 21, /* one slot per 2 MB*/ + 0, 64 * 1024); tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, diff --combined net/ipv4/udp.c index ef45adfc0edb,e9c0d1e1772e..0942a5f43ea5 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@@ -357,18 -357,12 +357,12 @@@ fail } EXPORT_SYMBOL(udp_lib_get_port);
- static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr, - unsigned int port) - { - return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port; - } - int udp_v4_get_port(struct sock *sk, unsigned short snum) { unsigned int hash2_nulladdr = - udp4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum); + ipv4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum); unsigned int hash2_partial = - udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0); + ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
/* precompute partial secondary hash */ udp_sk(sk)->udp_portaddr_hash = hash2_partial; @@@ -445,7 -439,7 +439,7 @@@ static struct sock *udp4_lib_lookup2(st struct sk_buff *skb) { struct sock *sk, *result; - int score, badness, matches = 0, reuseport = 0; + int score, badness; u32 hash = 0;
result = NULL; @@@ -454,23 -448,16 +448,16 @@@ score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif, exact_dif); if (score > badness) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { hash = udp_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); if (result) return result; - matches = 1; } badness = score; result = sk; - } else if (score == badness && reuseport) { - matches++; - if (reciprocal_scale(hash, matches) == 0) - result = sk; - hash = next_pseudo_random32(hash); } } return result; @@@ -488,11 -475,11 +475,11 @@@ struct sock *__udp4_lib_lookup(struct n unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; bool exact_dif = udp_lib_exact_dif_match(net, skb); - int score, badness, matches = 0, reuseport = 0; + int score, badness; u32 hash = 0;
if (hslot->count > 10) { - hash2 = udp4_portaddr_hash(net, daddr, hnum); + hash2 = ipv4_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; hslot2 = &udptable->hash2[slot2]; if (hslot->count < hslot2->count) @@@ -503,7 -490,7 +490,7 @@@ exact_dif, hslot2, skb); if (!result) { unsigned int old_slot2 = slot2; - hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum); + hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); slot2 = hash2 & udptable->mask; /* avoid searching the same slot again. */ if (unlikely(slot2 == old_slot2)) @@@ -526,23 -513,16 +513,16 @@@ begin score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif, exact_dif); if (score > badness) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { hash = udp_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); if (result) return result; - matches = 1; } result = sk; badness = score; - } else if (score == badness && reuseport) { - matches++; - if (reciprocal_scale(hash, matches) == 0) - result = sk; - hash = next_pseudo_random32(hash); } } return result; @@@ -1775,7 -1755,7 +1755,7 @@@ EXPORT_SYMBOL(udp_lib_rehash)
static void udp_v4_rehash(struct sock *sk) { - u16 new_hash = udp4_portaddr_hash(sock_net(sk), + u16 new_hash = ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, inet_sk(sk)->inet_num); udp_lib_rehash(sk, new_hash); @@@ -1966,9 -1946,9 +1946,9 @@@ static int __udp4_lib_mcast_deliver(str struct sk_buff *nskb;
if (use_hash2) { - hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & + hash2_any = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & udptable->mask; - hash2 = udp4_portaddr_hash(net, daddr, hnum) & udptable->mask; + hash2 = ipv4_portaddr_hash(net, daddr, hnum) & udptable->mask; start_lookup: hslot = &udptable->hash2[hash2]; offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); @@@ -2200,7 -2180,7 +2180,7 @@@ static struct sock *__udp4_lib_demux_lo int dif, int sdif) { unsigned short hnum = ntohs(loc_port); - unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum); + unsigned int hash2 = ipv4_portaddr_hash(net, loc_addr, hnum); unsigned int slot2 = hash2 & udp_table.mask; struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); @@@ -2502,9 -2482,9 +2482,9 @@@ int compat_udp_getsockopt(struct sock * * but then block when reading it. Add special case code * to work around these arguably broken applications. */ -unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) +__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait) { - unsigned int mask = datagram_poll(file, sock, wait); + __poll_t mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk;
if (!skb_queue_empty(&udp_sk(sk)->reader_queue)) diff --combined net/ipv6/ip6_gre.c index 416c8913f132,8451d00b210b..97f148f15429 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@@ -55,6 -55,8 +55,8 @@@ #include <net/ip6_route.h> #include <net/ip6_tunnel.h> #include <net/gre.h> + #include <net/erspan.h> + #include <net/dst_metadata.h>
static bool log_ecn_error = true; @@@ -68,11 -70,13 +70,13 @@@ static unsigned int ip6gre_net_id __rea struct ip6gre_net { struct ip6_tnl __rcu *tunnels[4][IP6_GRE_HASH_SIZE];
+ struct ip6_tnl __rcu *collect_md_tun; struct net_device *fb_tunnel_dev; };
static struct rtnl_link_ops ip6gre_link_ops __read_mostly; static struct rtnl_link_ops ip6gre_tap_ops __read_mostly; + static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly; static int ip6gre_tunnel_init(struct net_device *dev); static void ip6gre_tunnel_setup(struct net_device *dev); static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t); @@@ -121,7 -125,8 +125,8 @@@ static struct ip6_tnl *ip6gre_tunnel_lo unsigned int h1 = HASH_KEY(key); struct ip6_tnl *t, *cand = NULL; struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); - int dev_type = (gre_proto == htons(ETH_P_TEB)) ? + int dev_type = (gre_proto == htons(ETH_P_TEB) || + gre_proto == htons(ETH_P_ERSPAN)) ? ARPHRD_ETHER : ARPHRD_IP6GRE; int score, cand_score = 4;
@@@ -226,6 -231,10 +231,10 @@@ if (cand) return cand;
+ t = rcu_dereference(ign->collect_md_tun); + if (t && t->dev->flags & IFF_UP) + return t; + dev = ign->fb_tunnel_dev; if (dev->flags & IFF_UP) return netdev_priv(dev); @@@ -261,6 -270,9 +270,9 @@@ static void ip6gre_tunnel_link(struct i { struct ip6_tnl __rcu **tp = ip6gre_bucket(ign, t);
+ if (t->parms.collect_md) + rcu_assign_pointer(ign->collect_md_tun, t); + rcu_assign_pointer(t->next, rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } @@@ -270,6 -282,9 +282,9 @@@ static void ip6gre_tunnel_unlink(struc struct ip6_tnl __rcu **tp; struct ip6_tnl *iter;
+ if (t->parms.collect_md) + rcu_assign_pointer(ign->collect_md_tun, NULL); + for (tp = ip6gre_bucket(ign, t); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { @@@ -460,7 -475,111 +475,111 @@@ static int ip6gre_rcv(struct sk_buff *s &ipv6h->saddr, &ipv6h->daddr, tpi->key, tpi->proto); if (tunnel) { - ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error); + if (tunnel->parms.collect_md) { + struct metadata_dst *tun_dst; + __be64 tun_id; + __be16 flags; + + flags = tpi->flags; + tun_id = key32_to_tunnel_id(tpi->key); + + tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id, 0); + if (!tun_dst) + return PACKET_REJECT; + + ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); + } else { + ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error); + } + + return PACKET_RCVD; + } + + return PACKET_REJECT; + } + + static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len, + struct tnl_ptk_info *tpi) + { + struct erspan_base_hdr *ershdr; + struct erspan_metadata *pkt_md; + const struct ipv6hdr *ipv6h; + struct ip6_tnl *tunnel; + u8 ver; + + if (unlikely(!pskb_may_pull(skb, sizeof(*ershdr)))) + return PACKET_REJECT; + + ipv6h = ipv6_hdr(skb); + ershdr = (struct erspan_base_hdr *)skb->data; + ver = (ntohs(ershdr->ver_vlan) & VER_MASK) >> VER_OFFSET; + tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK); + + tunnel = ip6gre_tunnel_lookup(skb->dev, + &ipv6h->saddr, &ipv6h->daddr, tpi->key, + tpi->proto); + if (tunnel) { + int len = erspan_hdr_len(ver); + + if (unlikely(!pskb_may_pull(skb, len))) + return PACKET_REJECT; + + ershdr = (struct erspan_base_hdr *)skb->data; + pkt_md = (struct erspan_metadata *)(ershdr + 1); + + if (__iptunnel_pull_header(skb, len, + htons(ETH_P_TEB), + false, false) < 0) + return PACKET_REJECT; + + if (tunnel->parms.collect_md) { + struct metadata_dst *tun_dst; + struct ip_tunnel_info *info; + struct erspan_metadata *md; + __be64 tun_id; + __be16 flags; + + tpi->flags |= TUNNEL_KEY; + flags = tpi->flags; + tun_id = 
key32_to_tunnel_id(tpi->key); + + tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id, + sizeof(*md)); + if (!tun_dst) + return PACKET_REJECT; + + info = &tun_dst->u.tun_info; + md = ip_tunnel_info_opts(info); + if (!md) { + dst_release((struct dst_entry *)tun_dst); + return PACKET_REJECT; + } + + memcpy(md, pkt_md, sizeof(*md)); + md->version = ver; + info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + info->options_len = sizeof(*md); + + ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); + + } else { + tunnel->parms.erspan_ver = ver; + + if (ver == 1) { + tunnel->parms.index = ntohl(pkt_md->u.index); + } else { + u16 md2_flags; + u16 dir, hwid; + + md2_flags = ntohs(pkt_md->u.md2.flags); + dir = (md2_flags & DIR_MASK) >> DIR_OFFSET; + hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET; + tunnel->parms.dir = dir; + tunnel->parms.hwid = hwid; + } + + ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error); + }
return PACKET_RCVD; } @@@ -481,9 -600,17 +600,17 @@@ static int gre_rcv(struct sk_buff *skb if (iptunnel_pull_header(skb, hdr_len, tpi.proto, false)) goto drop;
+ if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) || + tpi.proto == htons(ETH_P_ERSPAN2))) { + if (ip6erspan_rcv(skb, hdr_len, &tpi) == PACKET_RCVD) + return 0; + goto out; + } + if (ip6gre_rcv(skb, &tpi) == PACKET_RCVD) return 0;
+ out: icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); drop: kfree_skb(skb); @@@ -496,6 -623,78 +623,78 @@@ static int gre_handle_offloads(struct s csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); }
+ static void prepare_ip6gre_xmit_ipv4(struct sk_buff *skb, + struct net_device *dev, + struct flowi6 *fl6, __u8 *dsfield, + int *encap_limit) + { + const struct iphdr *iph = ip_hdr(skb); + struct ip6_tnl *t = netdev_priv(dev); + + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + *encap_limit = t->parms.encap_limit; + + memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6)); + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + *dsfield = ipv4_get_dsfield(iph); + else + *dsfield = ip6_tclass(t->parms.flowinfo); + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) + fl6->flowi6_mark = skb->mark; + else + fl6->flowi6_mark = t->parms.fwmark; + + fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); + } + + static int prepare_ip6gre_xmit_ipv6(struct sk_buff *skb, + struct net_device *dev, + struct flowi6 *fl6, __u8 *dsfield, + int *encap_limit) + { + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct ip6_tnl *t = netdev_priv(dev); + __u16 offset; + + offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); + /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */ + + if (offset > 0) { + struct ipv6_tlv_tnl_enc_lim *tel; + + tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; + if (tel->encap_limit == 0) { + icmpv6_send(skb, ICMPV6_PARAMPROB, + ICMPV6_HDR_FIELD, offset + 2); + return -1; + } + *encap_limit = tel->encap_limit - 1; + } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) { + *encap_limit = t->parms.encap_limit; + } + + memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6)); + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + *dsfield = ipv6_get_dsfield(ipv6h); + else + *dsfield = ip6_tclass(t->parms.flowinfo); + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) + fl6->flowlabel |= ip6_flowlabel(ipv6h); + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) + fl6->flowi6_mark = skb->mark; + else + fl6->flowi6_mark = t->parms.fwmark; + + fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); + + return 0; + } + static netdev_tx_t __gre6_xmit(struct 
sk_buff *skb, struct net_device *dev, __u8 dsfield, struct flowi6 *fl6, int encap_limit, @@@ -517,8 -716,38 +716,38 @@@
/* Push GRE header. */ protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto; - gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, - protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno)); + + if (tunnel->parms.collect_md) { + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; + __be16 flags; + + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || + !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET6)) + return -EINVAL; + + key = &tun_info->key; + memset(fl6, 0, sizeof(*fl6)); + fl6->flowi6_proto = IPPROTO_GRE; + fl6->daddr = key->u.ipv6.dst; + fl6->flowlabel = key->label; + fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); + + dsfield = key->tos; + flags = key->tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); + tunnel->tun_hlen = gre_calc_hlen(flags); + + gre_build_header(skb, tunnel->tun_hlen, + flags, protocol, + tunnel_id_to_key32(tun_info->key.tun_id), 0); + + } else { + gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, + protocol, tunnel->parms.o_key, + htonl(tunnel->o_seqno)); + }
return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu, NEXTHDR_GRE); @@@ -527,30 -756,17 +756,17 @@@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - const struct iphdr *iph = ip_hdr(skb); int encap_limit = -1; struct flowi6 fl6; - __u8 dsfield; + __u8 dsfield = 0; __u32 mtu; int err;
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - encap_limit = t->parms.encap_limit; - - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - dsfield = ipv4_get_dsfield(iph); - else - dsfield = ip6_tclass(t->parms.flowinfo); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) - fl6.flowi6_mark = skb->mark; - else - fl6.flowi6_mark = t->parms.fwmark; - - fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); + if (!t->parms.collect_md) + prepare_ip6gre_xmit_ipv4(skb, dev, &fl6, + &dsfield, &encap_limit);
err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)); if (err) @@@ -574,46 -790,17 +790,17 @@@ static inline int ip6gre_xmit_ipv6(stru struct ip6_tnl *t = netdev_priv(dev); struct ipv6hdr *ipv6h = ipv6_hdr(skb); int encap_limit = -1; - __u16 offset; struct flowi6 fl6; - __u8 dsfield; + __u8 dsfield = 0; __u32 mtu; int err;
if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr)) return -1;
- offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); - /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */ - ipv6h = ipv6_hdr(skb); - - if (offset > 0) { - struct ipv6_tlv_tnl_enc_lim *tel; - tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; - if (tel->encap_limit == 0) { - icmpv6_send(skb, ICMPV6_PARAMPROB, - ICMPV6_HDR_FIELD, offset + 2); - return -1; - } - encap_limit = tel->encap_limit - 1; - } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - encap_limit = t->parms.encap_limit; - - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - dsfield = ipv6_get_dsfield(ipv6h); - else - dsfield = ip6_tclass(t->parms.flowinfo); - - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) - fl6.flowlabel |= ip6_flowlabel(ipv6h); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) - fl6.flowi6_mark = skb->mark; - else - fl6.flowi6_mark = t->parms.fwmark; - - fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); + if (!t->parms.collect_md && + prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit)) + return -1;
if (gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM))) return -1; @@@ -660,7 -847,8 +847,8 @@@ static int ip6gre_xmit_other(struct sk_ if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit;
- memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + if (!t->parms.collect_md) + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)); if (err) @@@ -705,6 -893,141 +893,141 @@@ tx_err return NETDEV_TX_OK; }
+ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, + struct net_device *dev) + { + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct ip6_tnl *t = netdev_priv(dev); + struct dst_entry *dst = skb_dst(skb); + struct net_device_stats *stats; + bool truncate = false; + int encap_limit = -1; + __u8 dsfield = false; + struct flowi6 fl6; + int err = -EINVAL; + __u32 mtu; + + if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr)) + goto tx_err; + + if (gre_handle_offloads(skb, false)) + goto tx_err; + + if (skb->len > dev->mtu + dev->hard_header_len) { + pskb_trim(skb, dev->mtu + dev->hard_header_len); + truncate = true; + } + + t->parms.o_flags &= ~TUNNEL_KEY; + IPCB(skb)->flags = 0; + + /* For collect_md mode, derive fl6 from the tunnel key, + * for native mode, call prepare_ip6gre_xmit_{ipv4,ipv6}. + */ + if (t->parms.collect_md) { + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; + struct erspan_metadata *md; + + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || + !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET6)) + return -EINVAL; + + key = &tun_info->key; + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_GRE; + fl6.daddr = key->u.ipv6.dst; + fl6.flowlabel = key->label; + fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); + + dsfield = key->tos; + md = ip_tunnel_info_opts(tun_info); + if (!md) + goto tx_err; + + if (md->version == 1) { + erspan_build_header(skb, + tunnel_id_to_key32(key->tun_id), + ntohl(md->u.index), truncate, + false); + } else if (md->version == 2) { + u16 md2_flags; + u16 dir, hwid; + + md2_flags = ntohs(md->u.md2.flags); + dir = (md2_flags & DIR_MASK) >> DIR_OFFSET; + hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET; + + erspan_build_header_v2(skb, + tunnel_id_to_key32(key->tun_id), + dir, hwid, truncate, + false); + } + } else { + switch (skb->protocol) { + case htons(ETH_P_IP): + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + 
prepare_ip6gre_xmit_ipv4(skb, dev, &fl6, + &dsfield, &encap_limit); + break; + case htons(ETH_P_IPV6): + if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr)) + goto tx_err; + if (prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, + &dsfield, &encap_limit)) + goto tx_err; + break; + default: + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + break; + } + + if (t->parms.erspan_ver == 1) + erspan_build_header(skb, t->parms.o_key, + t->parms.index, + truncate, false); + else + erspan_build_header_v2(skb, t->parms.o_key, + t->parms.dir, + t->parms.hwid, + truncate, false); + fl6.daddr = t->parms.raddr; + } + + /* Push GRE header. */ + gre_build_header(skb, 8, TUNNEL_SEQ, + htons(ETH_P_ERSPAN), 0, htonl(t->o_seqno++)); + + /* TooBig packet may have updated dst->dev's mtu */ + if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu) + dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu); + + err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, + NEXTHDR_GRE); + if (err != 0) { + /* XXX: send ICMP error even if DF is not set. */ + if (err == -EMSGSIZE) { + if (skb->protocol == htons(ETH_P_IP)) + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_FRAG_NEEDED, htonl(mtu)); + else + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + } + + goto tx_err; + } + return NETDEV_TX_OK; + + tx_err: + stats = &t->dev->stats; + stats->tx_errors++; + stats->tx_dropped++; + kfree_skb(skb); + return NETDEV_TX_OK; + } + static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu) { struct net_device *dev = t->dev; @@@ -1048,6 -1371,11 +1371,11 @@@ static int ip6gre_tunnel_init_common(st if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8;
+ if (tunnel->parms.collect_md) { + dev->features |= NETIF_F_NETNS_LOCAL; + netif_keep_dst(dev); + } + return 0; }
@@@ -1062,6 -1390,9 +1390,9 @@@ static int ip6gre_tunnel_init(struct ne
tunnel = netdev_priv(dev);
+ if (tunnel->parms.collect_md) + return 0; + memcpy(dev->dev_addr, &tunnel->parms.laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &tunnel->parms.raddr, sizeof(struct in6_addr));
@@@ -1084,7 -1415,6 +1415,6 @@@ static void ip6gre_fb_tunnel_init(struc dev_hold(dev); }
- static struct inet6_protocol ip6gre_protocol __read_mostly = { .handler = gre_rcv, .err_handler = ip6gre_err, @@@ -1099,7 -1429,8 +1429,8 @@@ static void ip6gre_destroy_tunnels(stru
for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &ip6gre_link_ops || - dev->rtnl_link_ops == &ip6gre_tap_ops) + dev->rtnl_link_ops == &ip6gre_tap_ops || + dev->rtnl_link_ops == &ip6erspan_tap_ops) unregister_netdevice_queue(dev, head);
for (prio = 0; prio < 4; prio++) { @@@ -1221,6 -1552,70 +1552,70 @@@ out return ip6gre_tunnel_validate(tb, data, extack); }
+ static int ip6erspan_tap_validate(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) + { + __be16 flags = 0; + int ret, ver = 0; + + if (!data) + return 0; + + ret = ip6gre_tap_validate(tb, data, extack); + if (ret) + return ret; + + /* ERSPAN should only have GRE sequence and key flag */ + if (data[IFLA_GRE_OFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); + if (data[IFLA_GRE_IFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); + if (!data[IFLA_GRE_COLLECT_METADATA] && + flags != (GRE_SEQ | GRE_KEY)) + return -EINVAL; + + /* ERSPAN Session ID only has 10-bit. Since we reuse + * 32-bit key field as ID, check it's range. + */ + if (data[IFLA_GRE_IKEY] && + (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK)) + return -EINVAL; + + if (data[IFLA_GRE_OKEY] && + (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK)) + return -EINVAL; + + if (data[IFLA_GRE_ERSPAN_VER]) { + ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); + if (ver != 1 && ver != 2) + return -EINVAL; + } + + if (ver == 1) { + if (data[IFLA_GRE_ERSPAN_INDEX]) { + u32 index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); + + if (index & ~INDEX_MASK) + return -EINVAL; + } + } else if (ver == 2) { + if (data[IFLA_GRE_ERSPAN_DIR]) { + u16 dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]); + + if (dir & ~(DIR_MASK >> DIR_OFFSET)) + return -EINVAL; + } + + if (data[IFLA_GRE_ERSPAN_HWID]) { + u16 hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]); + + if (hwid & ~(HWID_MASK >> HWID_OFFSET)) + return -EINVAL; + } + } + + return 0; + }
static void ip6gre_netlink_parms(struct nlattr *data[], struct __ip6_tnl_parm *parms) @@@ -1267,6 -1662,22 +1662,22 @@@
if (data[IFLA_GRE_FWMARK]) parms->fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]); + + if (data[IFLA_GRE_COLLECT_METADATA]) + parms->collect_md = true; + + if (data[IFLA_GRE_ERSPAN_VER]) + parms->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); + + if (parms->erspan_ver == 1) { + if (data[IFLA_GRE_ERSPAN_INDEX]) + parms->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); + } else if (parms->erspan_ver == 2) { + if (data[IFLA_GRE_ERSPAN_DIR]) + parms->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]); + if (data[IFLA_GRE_ERSPAN_HWID]) + parms->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]); + } }
static int ip6gre_tap_init(struct net_device *dev) @@@ -1303,12 -1714,64 +1714,65 @@@ static const struct net_device_ops ip6g NETIF_F_HIGHDMA | \ NETIF_F_HW_CSUM)
+ static int ip6erspan_tap_init(struct net_device *dev) + { + struct ip6_tnl *tunnel; + int t_hlen; + int ret; + + tunnel = netdev_priv(dev); + + tunnel->dev = dev; + tunnel->net = dev_net(dev); + strcpy(tunnel->parms.name, dev->name); + + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); + if (ret) { + free_percpu(dev->tstats); + dev->tstats = NULL; + return ret; + } + + tunnel->tun_hlen = 8; + tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + + erspan_hdr_len(tunnel->parms.erspan_ver); + t_hlen = tunnel->hlen + sizeof(struct ipv6hdr); + + dev->hard_header_len = LL_MAX_HEADER + t_hlen; + dev->mtu = ETH_DATA_LEN - t_hlen; + if (dev->type == ARPHRD_ETHER) + dev->mtu -= ETH_HLEN; + if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + dev->mtu -= 8; + + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + tunnel = netdev_priv(dev); + ip6gre_tnl_link_config(tunnel, 1); + + return 0; + } + + static const struct net_device_ops ip6erspan_netdev_ops = { + .ndo_init = ip6erspan_tap_init, + .ndo_uninit = ip6gre_tunnel_uninit, + .ndo_start_xmit = ip6erspan_tunnel_xmit, + .ndo_set_mac_address = eth_mac_addr, + .ndo_validate_addr = eth_validate_addr, + .ndo_change_mtu = ip6_tnl_change_mtu, + .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_iflink = ip6_tnl_get_iflink, + }; + static void ip6gre_tap_setup(struct net_device *dev) {
ether_setup(dev);
+ dev->max_mtu = 0; dev->netdev_ops = &ip6gre_tap_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; @@@ -1373,8 -1836,13 +1837,13 @@@ static int ip6gre_newlink(struct net *s
ip6gre_netlink_parms(data, &nt->parms);
- if (ip6gre_tunnel_find(net, &nt->parms, dev->type)) - return -EEXIST; + if (nt->parms.collect_md) { + if (rtnl_dereference(ign->collect_md_tun)) + return -EEXIST; + } else { + if (ip6gre_tunnel_find(net, &nt->parms, dev->type)) + return -EEXIST; + }
if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) eth_hw_addr_random(dev); @@@ -1493,8 -1961,12 +1962,12 @@@ static size_t ip6gre_get_size(const str nla_total_size(2) + /* IFLA_GRE_ENCAP_DPORT */ nla_total_size(2) + + /* IFLA_GRE_COLLECT_METADATA */ + nla_total_size(0) + /* IFLA_GRE_FWMARK */ nla_total_size(4) + + /* IFLA_GRE_ERSPAN_INDEX */ + nla_total_size(4) + 0; }
@@@ -1516,7 -1988,8 +1989,8 @@@ static int ip6gre_fill_info(struct sk_b nla_put_u8(skb, IFLA_GRE_ENCAP_LIMIT, p->encap_limit) || nla_put_be32(skb, IFLA_GRE_FLOWINFO, p->flowinfo) || nla_put_u32(skb, IFLA_GRE_FLAGS, p->flags) || - nla_put_u32(skb, IFLA_GRE_FWMARK, p->fwmark)) + nla_put_u32(skb, IFLA_GRE_FWMARK, p->fwmark) || + nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index)) goto nla_put_failure;
if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE, @@@ -1529,6 -2002,24 +2003,24 @@@ t->encap.flags)) goto nla_put_failure;
+ if (p->collect_md) { + if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA)) + goto nla_put_failure; + } + + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, p->erspan_ver)) + goto nla_put_failure; + + if (p->erspan_ver == 1) { + if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index)) + goto nla_put_failure; + } else if (p->erspan_ver == 2) { + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, p->dir)) + goto nla_put_failure; + if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, p->hwid)) + goto nla_put_failure; + } + return 0;
nla_put_failure: @@@ -1551,9 -2042,28 +2043,28 @@@ static const struct nla_policy ip6gre_p [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, + [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_GRE_FWMARK] = { .type = NLA_U32 }, + [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 }, + [IFLA_GRE_ERSPAN_VER] = { .type = NLA_U8 }, + [IFLA_GRE_ERSPAN_DIR] = { .type = NLA_U8 }, + [IFLA_GRE_ERSPAN_HWID] = { .type = NLA_U16 }, };
+ static void ip6erspan_tap_setup(struct net_device *dev) + { + ether_setup(dev); + + dev->netdev_ops = &ip6erspan_netdev_ops; + dev->needs_free_netdev = true; + dev->priv_destructor = ip6gre_dev_free; + + dev->features |= NETIF_F_NETNS_LOCAL; + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); + } + static struct rtnl_link_ops ip6gre_link_ops __read_mostly = { .kind = "ip6gre", .maxtype = IFLA_GRE_MAX, @@@ -1583,6 -2093,20 +2094,20 @@@ static struct rtnl_link_ops ip6gre_tap_ .get_link_net = ip6_tnl_get_link_net, };
+ static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly = { + .kind = "ip6erspan", + .maxtype = IFLA_GRE_MAX, + .policy = ip6gre_policy, + .priv_size = sizeof(struct ip6_tnl), + .setup = ip6erspan_tap_setup, + .validate = ip6erspan_tap_validate, + .newlink = ip6gre_newlink, + .changelink = ip6gre_changelink, + .get_size = ip6gre_get_size, + .fill_info = ip6gre_fill_info, + .get_link_net = ip6_tnl_get_link_net, + }; + /* * And now the modules code and kernel interface. */ @@@ -1611,9 -2135,15 +2136,15 @@@ static int __init ip6gre_init(void if (err < 0) goto tap_ops_failed;
+ err = rtnl_link_register(&ip6erspan_tap_ops); + if (err < 0) + goto erspan_link_failed; + out: return err;
+ erspan_link_failed: + rtnl_link_unregister(&ip6gre_tap_ops); tap_ops_failed: rtnl_link_unregister(&ip6gre_link_ops); rtnl_link_failed: @@@ -1627,6 -2157,7 +2158,7 @@@ static void __exit ip6gre_fini(void { rtnl_link_unregister(&ip6gre_tap_ops); rtnl_link_unregister(&ip6gre_link_ops); + rtnl_link_unregister(&ip6erspan_tap_ops); inet6_del_protocol(&ip6gre_protocol, IPPROTO_GRE); unregister_pernet_device(&ip6gre_net_ops); } @@@ -1638,4 -2169,5 +2170,5 @@@ MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)" MODULE_DESCRIPTION("GRE over IPv6 tunneling device"); MODULE_ALIAS_RTNL_LINK("ip6gre"); MODULE_ALIAS_RTNL_LINK("ip6gretap"); + MODULE_ALIAS_RTNL_LINK("ip6erspan"); MODULE_ALIAS_NETDEV("ip6gre0"); diff --combined net/ipv6/ip6_tunnel.c index 931c38f6ff4a,6ff2f21ae3fc..8a4610e84e58 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@@ -861,7 -861,7 +861,7 @@@ int ip6_tnl_rcv(struct ip6_tnl *t, stru struct metadata_dst *tun_dst, bool log_ecn_err) { - return __ip6_tnl_rcv(t, skb, tpi, NULL, ip6ip6_dscp_ecn_decapsulate, + return __ip6_tnl_rcv(t, skb, tpi, tun_dst, ip6ip6_dscp_ecn_decapsulate, log_ecn_err); } EXPORT_SYMBOL(ip6_tnl_rcv); @@@ -979,6 -979,9 +979,9 @@@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t int ret = 0; struct net *net = t->net;
+ if (t->parms.collect_md) + return 1; + if ((p->flags & IP6_TNL_F_CAP_XMIT) || ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_XMIT))) { @@@ -1123,13 -1126,8 +1126,13 @@@ route_lookup max_headroom += 8; mtu -= 8; } - if (mtu < IPV6_MIN_MTU) - mtu = IPV6_MIN_MTU; + if (skb->protocol == htons(ETH_P_IPV6)) { + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + } else if (mtu < 576) { + mtu = 576; + } + if (skb_dst(skb) && !t->parms.collect_md) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { diff --combined net/ipv6/route.c index 2bc91c349273,b3f4d19b3ca5..5c014ce747e0 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@@ -186,7 -186,7 +186,7 @@@ static void rt6_uncached_list_flush_dev
static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt) { - return dst_metrics_write_ptr(rt->dst.from); + return dst_metrics_write_ptr(&rt->from->dst); }
static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) @@@ -391,7 -391,7 +391,7 @@@ static void ip6_dst_destroy(struct dst_ { struct rt6_info *rt = (struct rt6_info *)dst; struct rt6_exception_bucket *bucket; - struct dst_entry *from = dst->from; + struct rt6_info *from = rt->from; struct inet6_dev *idev;
dst_destroy_metrics_generic(dst); @@@ -409,8 -409,8 +409,8 @@@ kfree(bucket); }
- dst->from = NULL; - dst_release(from); + rt->from = NULL; + dst_release(&from->dst); }
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, @@@ -443,9 -443,9 +443,9 @@@ static bool rt6_check_expired(const str if (rt->rt6i_flags & RTF_EXPIRES) { if (time_after(jiffies, rt->dst.expires)) return true; - } else if (rt->dst.from) { + } else if (rt->from) { return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK || - rt6_check_expired((struct rt6_info *)rt->dst.from); + rt6_check_expired(rt->from); } return false; } @@@ -502,7 -502,7 +502,7 @@@ static inline struct rt6_info *rt6_devi if (!oif && ipv6_addr_any(saddr)) goto out;
- for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) { + for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) { struct net_device *dev = sprt->dst.dev;
if (oif) { @@@ -721,7 -721,7 +721,7 @@@ static struct rt6_info *find_rr_leaf(st
match = NULL; cont = NULL; - for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) { + for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) { if (rt->rt6i_metric != metric) { cont = rt; break; @@@ -731,7 -731,7 +731,7 @@@ }
for (rt = leaf; rt && rt != rr_head; - rt = rcu_dereference(rt->dst.rt6_next)) { + rt = rcu_dereference(rt->rt6_next)) { if (rt->rt6i_metric != metric) { cont = rt; break; @@@ -743,7 -743,7 +743,7 @@@ if (match || !cont) return match;
- for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next)) + for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next)) match = find_match(rt, oif, strict, &mpri, match, do_rr);
return match; @@@ -781,7 -781,7 +781,7 @@@ static struct rt6_info *rt6_select(stru &do_rr);
if (do_rr) { - struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next); + struct rt6_info *next = rcu_dereference(rt0->rt6_next);
/* no entries matched; do round-robin */ if (!next || next->rt6i_metric != rt0->rt6i_metric) @@@ -1054,7 -1054,7 +1054,7 @@@ static struct rt6_info *ip6_rt_cache_al */
if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) - ort = (struct rt6_info *)ort->dst.from; + ort = ort->from;
rcu_read_lock(); dev = ip6_rt_get_dev_rcu(ort); @@@ -1274,7 -1274,7 +1274,7 @@@ static int rt6_insert_exception(struct
/* ort can't be a cache or pcpu route */ if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) - ort = (struct rt6_info *)ort->dst.from; + ort = ort->from; WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
spin_lock_bh(&rt6_exception_lock); @@@ -1415,8 -1415,8 +1415,8 @@@ static struct rt6_info *rt6_find_cached /* Remove the passed in cached rt from the hash table that contains it */ int rt6_remove_exception_rt(struct rt6_info *rt) { - struct rt6_info *from = (struct rt6_info *)rt->dst.from; struct rt6_exception_bucket *bucket; + struct rt6_info *from = rt->from; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; int err; @@@ -1460,8 -1460,8 +1460,8 @@@ */ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) { - struct rt6_info *from = (struct rt6_info *)rt->dst.from; struct rt6_exception_bucket *bucket; + struct rt6_info *from = rt->from; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex;
@@@ -1929,9 -1929,9 +1929,9 @@@ struct dst_entry *ip6_blackhole_route(s
static void rt6_dst_from_metrics_check(struct rt6_info *rt) { - if (rt->dst.from && - dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from)) - dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true); + if (rt->from && + dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst)) + dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true); }
static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) @@@ -1951,7 -1951,7 +1951,7 @@@ static struct dst_entry *rt6_dst_from_c { if (!__rt6_check_expired(rt) && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && - rt6_check((struct rt6_info *)(rt->dst.from), cookie)) + rt6_check(rt->from, cookie)) return &rt->dst; else return NULL; @@@ -1971,7 -1971,7 +1971,7 @@@ static struct dst_entry *ip6_dst_check( rt6_dst_from_metrics_check(rt);
if (rt->rt6i_flags & RTF_PCPU || - (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from)) + (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) return rt6_dst_from_check(rt, cookie); else return rt6_check(rt, cookie); @@@ -2336,7 -2336,6 +2336,7 @@@ struct dst_entry *icmp6_dst_alloc(struc }
rt->dst.flags |= DST_HOST; + rt->dst.input = ip6_input; rt->dst.output = ip6_output; rt->rt6i_gateway = fl6->daddr; rt->rt6i_dst.addr = fl6->daddr; @@@ -3056,11 -3055,11 +3056,11 @@@ out
static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) { - BUG_ON(from->dst.from); + BUG_ON(from->from);
rt->rt6i_flags &= ~RTF_EXPIRES; dst_hold(&from->dst); - rt->dst.from = &from->dst; + rt->from = from; dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true); }
@@@ -4597,8 -4596,6 +4597,6 @@@ static int __net_init ip6_route_net_ini GFP_KERNEL); if (!net->ipv6.ip6_null_entry) goto out_ip6_dst_entries; - net->ipv6.ip6_null_entry->dst.path = - (struct dst_entry *)net->ipv6.ip6_null_entry; net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_null_entry->dst, ip6_template_metrics, true); @@@ -4610,8 -4607,6 +4608,6 @@@ GFP_KERNEL); if (!net->ipv6.ip6_prohibit_entry) goto out_ip6_null_entry; - net->ipv6.ip6_prohibit_entry->dst.path = - (struct dst_entry *)net->ipv6.ip6_prohibit_entry; net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, ip6_template_metrics, true); @@@ -4621,8 -4616,6 +4617,6 @@@ GFP_KERNEL); if (!net->ipv6.ip6_blk_hole_entry) goto out_ip6_prohibit_entry; - net->ipv6.ip6_blk_hole_entry->dst.path = - (struct dst_entry *)net->ipv6.ip6_blk_hole_entry; net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, ip6_template_metrics, true); @@@ -4779,11 -4772,20 +4773,20 @@@ int __init ip6_route_init(void if (ret) goto fib6_rules_init;
- ret = -ENOBUFS; - if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) || - __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) || - __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, - RTNL_FLAG_DOIT_UNLOCKED)) + ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, + inet6_rtm_newroute, NULL, 0); + if (ret < 0) + goto out_register_late_subsys; + + ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE, + inet6_rtm_delroute, NULL, 0); + if (ret < 0) + goto out_register_late_subsys; + + ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, + inet6_rtm_getroute, NULL, + RTNL_FLAG_DOIT_UNLOCKED); + if (ret < 0) goto out_register_late_subsys;
ret = register_netdevice_notifier(&ip6_route_dev_notifier); @@@ -4801,6 -4803,7 +4804,7 @@@ out return ret;
out_register_late_subsys: + rtnl_unregister_all(PF_INET6); unregister_pernet_subsys(&ip6_route_net_late_ops); fib6_rules_init: fib6_rules_cleanup(); diff --combined net/sctp/socket.c index 03d9d24b38ba,aadcd4244d9b..f0ca32547a21 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@@ -201,6 -201,22 +201,22 @@@ static void sctp_for_each_tx_datachunk( cb(chunk); }
+ static void sctp_for_each_rx_skb(struct sctp_association *asoc, struct sock *sk, + void (*cb)(struct sk_buff *, struct sock *)) + + { + struct sk_buff *skb, *tmp; + + sctp_skb_for_each(skb, &asoc->ulpq.lobby, tmp) + cb(skb, sk); + + sctp_skb_for_each(skb, &asoc->ulpq.reasm, tmp) + cb(skb, sk); + + sctp_skb_for_each(skb, &asoc->ulpq.reasm_uo, tmp) + cb(skb, sk); + } + /* Verify that this is a valid address. */ static inline int sctp_verify_addr(struct sock *sk, union sctp_addr *addr, int len) @@@ -1528,7 -1544,7 +1544,7 @@@ static void sctp_close(struct sock *sk
lock_sock_nested(sk, SINGLE_DEPTH_NESTING); sk->sk_shutdown = SHUTDOWN_MASK; - sk->sk_state = SCTP_SS_CLOSING; + inet_sk_set_state(sk, SCTP_SS_CLOSING);
ep = sctp_sk(sk)->ep;
@@@ -1554,6 -1570,7 +1570,7 @@@
if (data_was_unread || !skb_queue_empty(&asoc->ulpq.lobby) || !skb_queue_empty(&asoc->ulpq.reasm) || + !skb_queue_empty(&asoc->ulpq.reasm_uo) || (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime)) { struct sctp_chunk *chunk;
@@@ -2002,7 -2019,20 +2019,20 @@@ static int sctp_sendmsg(struct sock *sk if (err < 0) goto out_free;
- wait_connect = true; + /* If stream interleave is enabled, wait_connect has to be + * done earlier than data enqueue, as it needs to make data + * or idata according to asoc->intl_enable which is set + * after connection is done. + */ + if (sctp_sk(asoc->base.sk)->strm_interleave) { + timeo = sock_sndtimeo(sk, 0); + err = sctp_wait_for_connect(asoc, &timeo); + if (err) + goto out_unlock; + } else { + wait_connect = true; + } + pr_debug("%s: we associated primitively\n", __func__); }
@@@ -2281,7 -2311,7 +2311,7 @@@ static int sctp_setsockopt_events(struc if (!event) return -ENOMEM;
- sctp_ulpq_tail_event(&asoc->ulpq, event); + asoc->stream.si->enqueue_event(&asoc->ulpq, event); } }
@@@ -3180,7 -3210,7 +3210,7 @@@ static int sctp_setsockopt_maxseg(struc if (val == 0) { val = asoc->pathmtu - sp->pf->af->net_header_len; val -= sizeof(struct sctphdr) + - sizeof(struct sctp_data_chunk); + sctp_datachk_len(&asoc->stream); } asoc->user_frag = val; asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu); @@@ -3350,7 -3380,10 +3380,10 @@@ static int sctp_setsockopt_fragment_int if (get_user(val, (int __user *)optval)) return -EFAULT;
- sctp_sk(sk)->frag_interleave = (val == 0) ? 0 : 1; + sctp_sk(sk)->frag_interleave = !!val; + + if (!sctp_sk(sk)->frag_interleave) + sctp_sk(sk)->strm_interleave = 0;
return 0; } @@@ -4023,6 -4056,40 +4056,40 @@@ out return retval; }
+ static int sctp_setsockopt_interleaving_supported(struct sock *sk, + char __user *optval, + unsigned int optlen) + { + struct sctp_sock *sp = sctp_sk(sk); + struct net *net = sock_net(sk); + struct sctp_assoc_value params; + int retval = -EINVAL; + + if (optlen < sizeof(params)) + goto out; + + optlen = sizeof(params); + if (copy_from_user(&params, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + if (params.assoc_id) + goto out; + + if (!net->sctp.intl_enable || !sp->frag_interleave) { + retval = -EPERM; + goto out; + } + + sp->strm_interleave = !!params.assoc_value; + + retval = 0; + + out: + return retval; + } + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@@ -4210,6 -4277,10 +4277,10 @@@ static int sctp_setsockopt(struct sock case SCTP_STREAM_SCHEDULER_VALUE: retval = sctp_setsockopt_scheduler_value(sk, optval, optlen); break; + case SCTP_INTERLEAVING_SUPPORTED: + retval = sctp_setsockopt_interleaving_supported(sk, optval, + optlen); + break; default: retval = -ENOPROTOOPT; break; @@@ -4586,7 -4657,7 +4657,7 @@@ static void sctp_shutdown(struct sock * if (how & SEND_SHUTDOWN && !list_empty(&ep->asocs)) { struct sctp_association *asoc;
- sk->sk_state = SCTP_SS_CLOSING; + inet_sk_set_state(sk, SCTP_SS_CLOSING); asoc = list_entry(ep->asocs.next, struct sctp_association, asocs); sctp_primitive_SHUTDOWN(net, asoc, NULL); @@@ -4680,20 -4751,11 +4751,11 @@@ int sctp_get_sctp_info(struct sock *sk EXPORT_SYMBOL_GPL(sctp_get_sctp_info);
/* use callback to avoid exporting the core structure */ - int sctp_transport_walk_start(struct rhashtable_iter *iter) + void sctp_transport_walk_start(struct rhashtable_iter *iter) { - int err; - rhltable_walk_enter(&sctp_transport_hashtable, iter);
- err = rhashtable_walk_start(iter); - if (err && err != -EAGAIN) { - rhashtable_walk_stop(iter); - rhashtable_walk_exit(iter); - return err; - } - - return 0; + rhashtable_walk_start(iter); }
void sctp_transport_walk_stop(struct rhashtable_iter *iter) @@@ -4784,12 -4846,10 +4846,10 @@@ int sctp_for_each_transport(int (*cb)(s struct net *net, int *pos, void *p) { struct rhashtable_iter hti; struct sctp_transport *tsp; - int ret; + int ret = 0;
again: - ret = sctp_transport_walk_start(&hti); - if (ret) - return ret; + sctp_transport_walk_start(&hti);
tsp = sctp_transport_get_idx(net, &hti, *pos + 1); for (; !IS_ERR_OR_NULL(tsp); tsp = sctp_transport_get_next(net, &hti)) { @@@ -6984,6 -7044,47 +7044,47 @@@ out return retval; }
+ static int sctp_getsockopt_interleaving_supported(struct sock *sk, int len, + char __user *optval, + int __user *optlen) + { + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(&params, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (asoc) { + params.assoc_value = asoc->intl_enable; + } else if (!params.assoc_id) { + struct sctp_sock *sp = sctp_sk(sk); + + params.assoc_value = sp->strm_interleave; + } else { + retval = -EINVAL; + goto out; + } + + if (put_user(len, optlen)) + goto out; + + if (copy_to_user(optval, &params, len)) + goto out; + + retval = 0; + + out: + return retval; + } + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@@ -7174,6 -7275,10 +7275,10 @@@ retval = sctp_getsockopt_scheduler_value(sk, len, optval, optlen); break; + case SCTP_INTERLEAVING_SUPPORTED: + retval = sctp_getsockopt_interleaving_supported(sk, len, optval, + optlen); + break; default: retval = -ENOPROTOOPT; break; @@@ -7408,13 -7513,13 +7513,13 @@@ static int sctp_listen_start(struct soc * sockets. * */ - sk->sk_state = SCTP_SS_LISTENING; + inet_sk_set_state(sk, SCTP_SS_LISTENING); if (!ep->base.bind_addr.port) { if (sctp_autobind(sk)) return -EAGAIN; } else { if (sctp_get_port(sk, inet_sk(sk)->inet_num)) { - sk->sk_state = SCTP_SS_CLOSED; + inet_sk_set_state(sk, SCTP_SS_CLOSED); return -EADDRINUSE; } } @@@ -7500,11 -7605,11 +7605,11 @@@ out * here, again, by modeling the current TCP/UDP code. We don't have * a good way to test with it yet. */ -unsigned int sctp_poll(struct file *file, struct socket *sock, poll_table *wait) +__poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct sctp_sock *sp = sctp_sk(sk); - unsigned int mask; + __poll_t mask;
poll_wait(file, sk_sleep(sk), wait);
@@@ -8411,11 -8516,7 +8516,7 @@@ static void sctp_sock_migrate(struct so
}
- sctp_skb_for_each(skb, &assoc->ulpq.reasm, tmp) - sctp_skb_set_owner_r_frag(skb, newsk); - - sctp_skb_for_each(skb, &assoc->ulpq.lobby, tmp) - sctp_skb_set_owner_r_frag(skb, newsk); + sctp_for_each_rx_skb(assoc, newsk, sctp_skb_set_owner_r_frag);
/* Set the type of socket to indicate that it is peeled off from the * original UDP-style socket or created with the accept() call on a @@@ -8441,10 -8542,10 +8542,10 @@@ * is called, set RCV_SHUTDOWN flag. */ if (sctp_state(assoc, CLOSED) && sctp_style(newsk, TCP)) { - newsk->sk_state = SCTP_SS_CLOSED; + inet_sk_set_state(newsk, SCTP_SS_CLOSED); newsk->sk_shutdown |= RCV_SHUTDOWN; } else { - newsk->sk_state = SCTP_SS_ESTABLISHED; + inet_sk_set_state(newsk, SCTP_SS_ESTABLISHED); }
release_sock(newsk); diff --combined net/sctp/ulpqueue.c index e36ec5dd64c6,97fae53310e0..0b427100b0d4 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@@ -60,6 -60,7 +60,7 @@@ struct sctp_ulpq *sctp_ulpq_init(struc
ulpq->asoc = asoc; skb_queue_head_init(&ulpq->reasm); + skb_queue_head_init(&ulpq->reasm_uo); skb_queue_head_init(&ulpq->lobby); ulpq->pd_mode = 0;
@@@ -83,6 -84,10 +84,10 @@@ void sctp_ulpq_flush(struct sctp_ulpq * sctp_ulpevent_free(event); }
+ while ((skb = __skb_dequeue(&ulpq->reasm_uo)) != NULL) { + event = sctp_skb2event(skb); + sctp_ulpevent_free(event); + } }
/* Dispose of a ulpqueue. */ @@@ -104,6 -109,9 +109,9 @@@ int sctp_ulpq_tail_data(struct sctp_ulp if (!event) return -ENOMEM;
+ event->ssn = ntohs(chunk->subh.data_hdr->ssn); + event->ppid = chunk->subh.data_hdr->ppid; + /* Do reassembly if needed. */ event = sctp_ulpq_reasm(ulpq, event);
@@@ -328,9 -336,10 +336,10 @@@ static void sctp_ulpq_store_reasm(struc * payload was fragmented on the way and ip had to reassemble them. * We add the rest of skb's to the first skb's fraglist. */ - static struct sctp_ulpevent *sctp_make_reassembled_event(struct net *net, - struct sk_buff_head *queue, struct sk_buff *f_frag, - struct sk_buff *l_frag) + struct sctp_ulpevent *sctp_make_reassembled_event(struct net *net, + struct sk_buff_head *queue, + struct sk_buff *f_frag, + struct sk_buff *l_frag) { struct sk_buff *pos; struct sk_buff *new = NULL; @@@ -853,7 -862,7 +862,7 @@@ static struct sctp_ulpevent *sctp_ulpq_ struct sctp_stream *stream;
/* Check if this message needs ordering. */ - if (SCTP_DATA_UNORDERED & event->msg_flags) + if (event->msg_flags & SCTP_DATA_UNORDERED) return event;
/* Note: The stream ID must be verified before this routine. */ @@@ -974,8 -983,8 +983,8 @@@ void sctp_ulpq_skip(struct sctp_ulpq *u sctp_ulpq_reap_ordered(ulpq, sid); }
- static __u16 sctp_ulpq_renege_list(struct sctp_ulpq *ulpq, - struct sk_buff_head *list, __u16 needed) + __u16 sctp_ulpq_renege_list(struct sctp_ulpq *ulpq, struct sk_buff_head *list, + __u16 needed) { __u16 freed = 0; __u32 tsn, last_tsn; @@@ -1084,21 -1093,29 +1093,21 @@@ void sctp_ulpq_partial_delivery(struct void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, gfp_t gfp) { - struct sctp_association *asoc; - __u16 needed, freed; - - asoc = ulpq->asoc; + struct sctp_association *asoc = ulpq->asoc; + __u32 freed = 0; + __u16 needed;
- if (chunk) { - needed = ntohs(chunk->chunk_hdr->length); - needed -= sizeof(struct sctp_data_chunk); - } else - needed = SCTP_DEFAULT_MAXWINDOW; - - freed = 0; + needed = ntohs(chunk->chunk_hdr->length) - + sizeof(struct sctp_data_chunk);
if (skb_queue_empty(&asoc->base.sk->sk_receive_queue)) { freed = sctp_ulpq_renege_order(ulpq, needed); - if (freed < needed) { + if (freed < needed) freed += sctp_ulpq_renege_frags(ulpq, needed - freed); - } } /* If able to free enough room, accept this chunk. */ - if (chunk && (freed >= needed)) { - int retval; - retval = sctp_ulpq_tail_data(ulpq, chunk, gfp); + if (freed >= needed) { + int retval = sctp_ulpq_tail_data(ulpq, chunk, gfp); /* * Enter partial delivery if chunk has not been * delivered; otherwise, drain the reassembly queue. @@@ -1132,7 -1149,7 +1141,7 @@@ void sctp_ulpq_abort_pd(struct sctp_ulp &sctp_sk(sk)->subscribe)) ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTED, - gfp); + 0, 0, 0, gfp); if (ev) __skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev));
diff --combined net/smc/af_smc.c index 449f62e1e270,daf8075f5a4c..b6e4e2e4fe12 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@@ -520,7 -520,7 +520,7 @@@ decline_rdma smc->use_fallback = true; if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { rc = smc_clc_send_decline(smc, reason_code); - if (rc < sizeof(struct smc_clc_msg_decline)) + if (rc < 0) goto out_err; } goto out_connected; @@@ -751,14 -751,16 +751,16 @@@ static void smc_listen_work(struct work { struct smc_sock *new_smc = container_of(work, struct smc_sock, smc_listen_work); + struct smc_clc_msg_proposal_prefix *pclc_prfx; struct socket *newclcsock = new_smc->clcsock; struct smc_sock *lsmc = new_smc->listen_smc; struct smc_clc_msg_accept_confirm cclc; int local_contact = SMC_REUSE_CONTACT; struct sock *newsmcsk = &new_smc->sk; - struct smc_clc_msg_proposal pclc; + struct smc_clc_msg_proposal *pclc; struct smc_ib_device *smcibdev; struct sockaddr_in peeraddr; + u8 buf[SMC_CLC_MAX_LEN]; struct smc_link *link; int reason_code = 0; int rc = 0, len; @@@ -775,7 -777,7 +777,7 @@@ /* do inband token exchange - *wait for and receive SMC Proposal CLC message */ - reason_code = smc_clc_wait_msg(new_smc, &pclc, sizeof(pclc), + reason_code = smc_clc_wait_msg(new_smc, &buf, sizeof(buf), SMC_CLC_PROPOSAL); if (reason_code < 0) goto out_err; @@@ -804,8 -806,11 +806,11 @@@ reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ goto decline_rdma; } - if ((pclc.outgoing_subnet != subnet) || - (pclc.prefix_len != prefix_len)) { + + pclc = (struct smc_clc_msg_proposal *)&buf; + pclc_prfx = smc_clc_proposal_get_prefix(pclc); + if (pclc_prfx->outgoing_subnet != subnet || + pclc_prfx->prefix_len != prefix_len) { reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ goto decline_rdma; } @@@ -816,7 -821,7 +821,7 @@@ /* allocate connection / link group */ mutex_lock(&smc_create_lgr_pending); local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr, - smcibdev, ibport, &pclc.lcl, 0); + smcibdev, ibport, 
&pclc->lcl, 0); if (local_contact < 0) { rc = local_contact; if (rc == -ENOMEM) @@@ -879,11 -884,9 +884,9 @@@ } /* QP confirmation over RoCE fabric */ reason_code = smc_serv_conf_first_link(new_smc); - if (reason_code < 0) { + if (reason_code < 0) /* peer is not aware of a problem */ - rc = reason_code; goto out_err_unlock; - } if (reason_code > 0) goto decline_rdma_unlock; } @@@ -916,8 -919,7 +919,7 @@@ decline_rdma smc_conn_free(&new_smc->conn); new_smc->use_fallback = true; if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { - rc = smc_clc_send_decline(new_smc, reason_code); - if (rc < sizeof(struct smc_clc_msg_decline)) + if (smc_clc_send_decline(new_smc, reason_code) < 0) goto out_err; } goto out_connected; @@@ -1107,7 -1109,7 +1109,7 @@@ out return rc; }
-static unsigned int smc_accept_poll(struct sock *parent) +static __poll_t smc_accept_poll(struct sock *parent) { struct smc_sock *isk; struct sock *sk; @@@ -1126,11 -1128,11 +1128,11 @@@ return 0; }
-static unsigned int smc_poll(struct file *file, struct socket *sock, +static __poll_t smc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; - unsigned int mask = 0; + __poll_t mask = 0; struct smc_sock *smc; int rc;
diff --combined net/smc/smc_clc.c index 511548085d16,abf7ceb6690b..8ac51583a063 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@@ -22,6 -22,54 +22,54 @@@ #include "smc_clc.h" #include "smc_ib.h"
+ /* check if received message has a correct header length and contains valid + * heading and trailing eyecatchers + */ + static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm) + { + struct smc_clc_msg_proposal_prefix *pclc_prfx; + struct smc_clc_msg_accept_confirm *clc; + struct smc_clc_msg_proposal *pclc; + struct smc_clc_msg_decline *dclc; + struct smc_clc_msg_trail *trl; + + if (memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER))) + return false; + switch (clcm->type) { + case SMC_CLC_PROPOSAL: + pclc = (struct smc_clc_msg_proposal *)clcm; + pclc_prfx = smc_clc_proposal_get_prefix(pclc); + if (ntohs(pclc->hdr.length) != + sizeof(*pclc) + ntohs(pclc->iparea_offset) + + sizeof(*pclc_prfx) + + pclc_prfx->ipv6_prefixes_cnt * + sizeof(struct smc_clc_ipv6_prefix) + + sizeof(*trl)) + return false; + trl = (struct smc_clc_msg_trail *) + ((u8 *)pclc + ntohs(pclc->hdr.length) - sizeof(*trl)); + break; + case SMC_CLC_ACCEPT: + case SMC_CLC_CONFIRM: + clc = (struct smc_clc_msg_accept_confirm *)clcm; + if (ntohs(clc->hdr.length) != sizeof(*clc)) + return false; + trl = &clc->trl; + break; + case SMC_CLC_DECLINE: + dclc = (struct smc_clc_msg_decline *)clcm; + if (ntohs(dclc->hdr.length) != sizeof(*dclc)) + return false; + trl = &dclc->trl; + break; + default: + return false; + } + if (memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER))) + return false; + return true; + } + /* Wait for data on the tcp-socket, analyze received data * Returns: * 0 if success and it was not a decline that we received. @@@ -35,7 -83,7 +83,7 @@@ int smc_clc_wait_msg(struct smc_sock *s struct smc_clc_msg_hdr *clcm = buf; struct msghdr msg = {NULL, 0}; int reason_code = 0; - struct kvec vec; + struct kvec vec = {buf, buflen}; int len, datlen; int krflags;
@@@ -43,15 -91,12 +91,15 @@@ * so we don't consume any subsequent CLC message or payload data * in the TCP byte stream */ - vec.iov_base = buf; - vec.iov_len = buflen; + /* + * Caller must make sure that buflen is no less than + * sizeof(struct smc_clc_msg_hdr) + */ krflags = MSG_PEEK | MSG_WAITALL; smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME; - len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1, - sizeof(struct smc_clc_msg_hdr), krflags); + iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1, + sizeof(struct smc_clc_msg_hdr)); + len = sock_recvmsg(smc->clcsock, &msg, krflags); if (signal_pending(current)) { reason_code = -EINTR; clc_sk->sk_err = EINTR; @@@ -75,9 -120,7 +123,7 @@@ } datlen = ntohs(clcm->length); if ((len < sizeof(struct smc_clc_msg_hdr)) || - (datlen < sizeof(struct smc_clc_msg_decline)) || - (datlen > sizeof(struct smc_clc_msg_accept_confirm)) || - memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) || + (datlen > buflen) || ((clcm->type != SMC_CLC_DECLINE) && (clcm->type != expected_type))) { smc->sk.sk_err = EPROTO; @@@ -86,12 -129,13 +132,12 @@@ }
/* receive the complete CLC message */ - vec.iov_base = buf; - vec.iov_len = buflen; memset(&msg, 0, sizeof(struct msghdr)); + iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1, buflen); krflags = MSG_WAITALL; smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME; - len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1, datlen, krflags); + len = sock_recvmsg(smc->clcsock, &msg, krflags); - if (len < datlen) { + if (len < datlen || !smc_clc_msg_hdr_valid(clcm)) { smc->sk.sk_err = EPROTO; reason_code = -EPROTO; goto out; @@@ -135,7 -179,7 +181,7 @@@ int smc_clc_send_decline(struct smc_soc smc->sk.sk_err = EPROTO; if (len < 0) smc->sk.sk_err = -len; - return len; + return sock_error(&smc->sk); }
/* send CLC PROPOSAL message across internal TCP socket */ @@@ -143,33 -187,43 +189,43 @@@ int smc_clc_send_proposal(struct smc_so struct smc_ib_device *smcibdev, u8 ibport) { + struct smc_clc_msg_proposal_prefix pclc_prfx; struct smc_clc_msg_proposal pclc; + struct smc_clc_msg_trail trl; int reason_code = 0; + struct kvec vec[3]; struct msghdr msg; - struct kvec vec; - int len, rc; + int len, plen, rc;
/* send SMC Proposal CLC message */ + plen = sizeof(pclc) + sizeof(pclc_prfx) + sizeof(trl); memset(&pclc, 0, sizeof(pclc)); memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); pclc.hdr.type = SMC_CLC_PROPOSAL; - pclc.hdr.length = htons(sizeof(pclc)); + pclc.hdr.length = htons(plen); pclc.hdr.version = SMC_CLC_V1; /* SMC version */ memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); memcpy(&pclc.lcl.gid, &smcibdev->gid[ibport - 1], SMC_GID_SIZE); memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1], ETH_ALEN); + pclc.iparea_offset = htons(0);
+ memset(&pclc_prfx, 0, sizeof(pclc_prfx)); /* determine subnet and mask from internal TCP socket */ - rc = smc_netinfo_by_tcpsk(smc->clcsock, &pclc.outgoing_subnet, - &pclc.prefix_len); + rc = smc_netinfo_by_tcpsk(smc->clcsock, &pclc_prfx.outgoing_subnet, + &pclc_prfx.prefix_len); if (rc) return SMC_CLC_DECL_CNFERR; /* configuration error */ - memcpy(pclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + pclc_prfx.ipv6_prefixes_cnt = 0; + memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); memset(&msg, 0, sizeof(msg)); - vec.iov_base = &pclc; - vec.iov_len = sizeof(pclc); + vec[0].iov_base = &pclc; + vec[0].iov_len = sizeof(pclc); + vec[1].iov_base = &pclc_prfx; + vec[1].iov_len = sizeof(pclc_prfx); + vec[2].iov_base = &trl; + vec[2].iov_len = sizeof(trl); /* due to the few bytes needed for clc-handshake this cannot block */ - len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(pclc)); + len = kernel_sendmsg(smc->clcsock, &msg, vec, 3, plen); if (len < sizeof(pclc)) { if (len >= 0) { reason_code = -ENETUNREACH; diff --combined net/socket.c index 092baa464afc,bbd2e9ceb692..60d05479b2c1 --- a/net/socket.c +++ b/net/socket.c @@@ -118,7 -118,7 +118,7 @@@ static ssize_t sock_write_iter(struct k static int sock_mmap(struct file *file, struct vm_area_struct *vma);
static int sock_close(struct inode *inode, struct file *file); -static unsigned int sock_poll(struct file *file, +static __poll_t sock_poll(struct file *file, struct poll_table_struct *wait); static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #ifdef CONFIG_COMPAT @@@ -163,12 -163,6 +163,6 @@@ static DEFINE_SPINLOCK(net_family_lock) static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;
/* - * Statistics counters of the socket lists - */ - - static DEFINE_PER_CPU(int, sockets_in_use); - - /* * Support routines. * Move socket addresses back and forth across the kernel/user * divide and look after the messy bits. @@@ -578,7 -572,6 +572,6 @@@ struct socket *sock_alloc(void inode->i_gid = current_fsgid(); inode->i_op = &sockfs_inode_ops;
- this_cpu_add(sockets_in_use, 1); return sock; } EXPORT_SYMBOL(sock_alloc); @@@ -605,7 -598,6 +598,6 @@@ void sock_release(struct socket *sock if (rcu_dereference_protected(sock->wq, 1)->fasync_list) pr_err("%s: fasync list not empty!\n", __func__);
- this_cpu_sub(sockets_in_use, 1); if (!sock->file) { iput(SOCK_INODE(sock)); return; @@@ -1095,9 -1087,9 +1087,9 @@@ out_release EXPORT_SYMBOL(sock_create_lite);
/* No kernel lock held - perfect */ -static unsigned int sock_poll(struct file *file, poll_table *wait) +static __poll_t sock_poll(struct file *file, poll_table *wait) { - unsigned int busy_flag = 0; + __poll_t busy_flag = 0; struct socket *sock;
/* @@@ -2622,17 -2614,8 +2614,8 @@@ core_initcall(sock_init); /* early init #ifdef CONFIG_PROC_FS void socket_seq_show(struct seq_file *seq) { - int cpu; - int counter = 0; - - for_each_possible_cpu(cpu) - counter += per_cpu(sockets_in_use, cpu); - - /* It can be negative, by the way. 8) */ - if (counter < 0) - counter = 0; - - seq_printf(seq, "sockets: used %d\n", counter); + seq_printf(seq, "sockets: used %d\n", + sock_inuse_get(seq->private)); } #endif /* CONFIG_PROC_FS */
diff --combined net/tipc/socket.c index fcbd6489a8b5,0cdf5c2ad881..1a31445e1a31 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@@ -710,13 -710,13 +710,13 @@@ static int tipc_getname(struct socket * * imply that the operation will succeed, merely that it should be performed * and will not block. */ -static unsigned int tipc_poll(struct file *file, struct socket *sock, +static __poll_t tipc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); struct tipc_group *grp = tsk->group; - u32 revents = 0; + __poll_t revents = 0;
sock_poll_wait(file, sk_sleep(sk), wait);
@@@ -2640,9 -2640,7 +2640,7 @@@ void tipc_sk_reinit(struct net *net rhashtable_walk_enter(&tn->sk_rht, &iter);
do { - tsk = ERR_PTR(rhashtable_walk_start(&iter)); - if (IS_ERR(tsk)) - goto walk_stop; + rhashtable_walk_start(&iter);
while ((tsk = rhashtable_walk_next(&iter)) && !IS_ERR(tsk)) { spin_lock_bh(&tsk->sk.sk_lock.slock); @@@ -2651,7 -2649,7 +2649,7 @@@ msg_set_orignode(msg, tn->own_addr); spin_unlock_bh(&tsk->sk.sk_lock.slock); } - walk_stop: + rhashtable_walk_stop(&iter); } while (tsk == ERR_PTR(-EAGAIN)); } diff --combined net/xfrm/xfrm_input.c index 3f6f6f8c9fa5,ac277b97e0d7..26b10eb7a206 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@@ -8,29 -8,15 +8,29 @@@ * */
+#include <linux/bottom_half.h> +#include <linux/interrupt.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/netdevice.h> +#include <linux/percpu.h> #include <net/dst.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/ip_tunnels.h> #include <net/ip6_tunnel.h>
+struct xfrm_trans_tasklet { + struct tasklet_struct tasklet; + struct sk_buff_head queue; +}; + +struct xfrm_trans_cb { + int (*finish)(struct net *net, struct sock *sk, struct sk_buff *skb); +}; + +#define XFRM_TRANS_SKB_CB(__skb) ((struct xfrm_trans_cb *)&((__skb)->cb[0])) + static struct kmem_cache *secpath_cachep __read_mostly;
static DEFINE_SPINLOCK(xfrm_input_afinfo_lock); @@@ -39,8 -25,6 +39,8 @@@ static struct xfrm_input_afinfo const _ static struct gro_cells gro_cells; static struct net_device xfrm_napi_dev;
+static DEFINE_PER_CPU(struct xfrm_trans_tasklet, xfrm_trans_tasklet); + int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo) { int err = 0; @@@ -223,7 -207,7 +223,7 @@@ int xfrm_input(struct sk_buff *skb, in xfrm_address_t *daddr; struct xfrm_mode *inner_mode; u32 mark = skb->mark; - unsigned int family; + unsigned int family = AF_UNSPEC; int decaps = 0; int async = 0; bool xfrm_gro = false; @@@ -232,16 -216,6 +232,16 @@@
if (encap_type < 0) { x = xfrm_input_state(skb); + + if (unlikely(x->km.state != XFRM_STATE_VALID)) { + if (x->km.state == XFRM_STATE_ACQ) + XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR); + else + XFRM_INC_STATS(net, + LINUX_MIB_XFRMINSTATEINVALID); + goto drop; + } + family = x->outer_mode->afinfo->family;
/* An encap_type of -1 indicates async resumption. */ @@@ -257,7 -231,6 +257,6 @@@
if (xo && (xo->flags & CRYPTO_DONE)) { crypto_done = true; - x = xfrm_input_state(skb); family = XFRM_SPI_SKB_CB(skb)->family;
if (!(xo->status & CRYPTO_SUCCESS)) { @@@ -493,41 -466,9 +492,41 @@@ int xfrm_input_resume(struct sk_buff *s } EXPORT_SYMBOL(xfrm_input_resume);
+static void xfrm_trans_reinject(unsigned long data) +{ + struct xfrm_trans_tasklet *trans = (void *)data; + struct sk_buff_head queue; + struct sk_buff *skb; + + __skb_queue_head_init(&queue); + skb_queue_splice_init(&trans->queue, &queue); + + while ((skb = __skb_dequeue(&queue))) + XFRM_TRANS_SKB_CB(skb)->finish(dev_net(skb->dev), NULL, skb); +} + +int xfrm_trans_queue(struct sk_buff *skb, + int (*finish)(struct net *, struct sock *, + struct sk_buff *)) +{ + struct xfrm_trans_tasklet *trans; + + trans = this_cpu_ptr(&xfrm_trans_tasklet); + + if (skb_queue_len(&trans->queue) >= netdev_max_backlog) + return -ENOBUFS; + + XFRM_TRANS_SKB_CB(skb)->finish = finish; + skb_queue_tail(&trans->queue, skb); + tasklet_schedule(&trans->tasklet); + return 0; +} +EXPORT_SYMBOL(xfrm_trans_queue); + void __init xfrm_input_init(void) { int err; + int i;
init_dummy_netdev(&xfrm_napi_dev); err = gro_cells_init(&gro_cells, &xfrm_napi_dev); @@@ -538,13 -479,4 +537,13 @@@ sizeof(struct sec_path), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + + for_each_possible_cpu(i) { + struct xfrm_trans_tasklet *trans; + + trans = &per_cpu(xfrm_trans_tasklet, i); + __skb_queue_head_init(&trans->queue); + tasklet_init(&trans->tasklet, xfrm_trans_reinject, + (unsigned long)trans); + } } diff --combined net/xfrm/xfrm_policy.c index 70aa5cb0c659,e3a5aca9cdda..d8a8129b9232 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@@ -54,7 -54,7 +54,7 @@@ static struct xfrm_policy_afinfo const static struct kmem_cache *xfrm_dst_cache __read_mostly; static __read_mostly seqcount_t xfrm_policy_hash_generation;
- static void xfrm_init_pmtu(struct dst_entry *dst); + static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr); static int stale_bundle(struct dst_entry *dst); static int xfrm_bundle_ok(struct xfrm_dst *xdst); static void xfrm_policy_queue_process(struct timer_list *t); @@@ -1168,15 -1168,9 +1168,15 @@@ static struct xfrm_policy *xfrm_sk_poli again: pol = rcu_dereference(sk->sk_policy[dir]); if (pol != NULL) { - bool match = xfrm_selector_match(&pol->selector, fl, family); + bool match; int err = 0;
+ if (pol->family != family) { + pol = NULL; + goto out; + } + + match = xfrm_selector_match(&pol->selector, fl, family); if (match) { if ((sk->sk_mark & pol->mark.m) != pol->mark.v) { pol = NULL; @@@ -1257,7 -1251,7 +1257,7 @@@ EXPORT_SYMBOL(xfrm_policy_delete)
int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) { - struct net *net = xp_net(pol); + struct net *net = sock_net(sk); struct xfrm_policy *old_pol;
#ifdef CONFIG_XFRM_SUB_POLICY @@@ -1544,7 -1538,9 +1544,9 @@@ static inline int xfrm_fill_dst(struct */
static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, - struct xfrm_state **xfrm, int nx, + struct xfrm_state **xfrm, + struct xfrm_dst **bundle, + int nx, const struct flowi *fl, struct dst_entry *dst) { @@@ -1552,8 -1548,8 +1554,8 @@@ unsigned long now = jiffies; struct net_device *dev; struct xfrm_mode *inner_mode; - struct dst_entry *dst_prev = NULL; - struct dst_entry *dst0 = NULL; + struct xfrm_dst *xdst_prev = NULL; + struct xfrm_dst *xdst0 = NULL; int i = 0; int err; int header_len = 0; @@@ -1579,13 -1575,14 +1581,14 @@@ goto put_states; }
- if (!dst_prev) - dst0 = dst1; + bundle[i] = xdst; + if (!xdst_prev) + xdst0 = xdst; else /* Ref count is taken during xfrm_alloc_dst() * No need to do dst_clone() on dst1 */ - dst_prev->child = dst1; + xfrm_dst_set_child(xdst_prev, &xdst->u.dst);
if (xfrm[i]->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(xfrm[i], @@@ -1622,8 -1619,7 +1625,7 @@@ dst1->input = dst_discard; dst1->output = inner_mode->afinfo->output;
- dst1->next = dst_prev; - dst_prev = dst1; + xdst_prev = xdst;
header_len += xfrm[i]->props.header_len; if (xfrm[i]->type->flags & XFRM_TYPE_NON_FRAGMENT) @@@ -1631,40 -1627,39 +1633,39 @@@ trailer_len += xfrm[i]->props.trailer_len; }
- dst_prev->child = dst; - dst0->path = dst; + xfrm_dst_set_child(xdst_prev, dst); + xdst0->path = dst;
err = -ENODEV; dev = dst->dev; if (!dev) goto free_dst;
- xfrm_init_path((struct xfrm_dst *)dst0, dst, nfheader_len); - xfrm_init_pmtu(dst_prev); - - for (dst_prev = dst0; dst_prev != dst; dst_prev = dst_prev->child) { - struct xfrm_dst *xdst = (struct xfrm_dst *)dst_prev; + xfrm_init_path(xdst0, dst, nfheader_len); + xfrm_init_pmtu(bundle, nx);
- err = xfrm_fill_dst(xdst, dev, fl); + for (xdst_prev = xdst0; xdst_prev != (struct xfrm_dst *)dst; + xdst_prev = (struct xfrm_dst *) xfrm_dst_child(&xdst_prev->u.dst)) { + err = xfrm_fill_dst(xdst_prev, dev, fl); if (err) goto free_dst;
- dst_prev->header_len = header_len; - dst_prev->trailer_len = trailer_len; - header_len -= xdst->u.dst.xfrm->props.header_len; - trailer_len -= xdst->u.dst.xfrm->props.trailer_len; + xdst_prev->u.dst.header_len = header_len; + xdst_prev->u.dst.trailer_len = trailer_len; + header_len -= xdst_prev->u.dst.xfrm->props.header_len; + trailer_len -= xdst_prev->u.dst.xfrm->props.trailer_len; }
out: - return dst0; + return &xdst0->u.dst;
put_states: for (; i < nx; i++) xfrm_state_put(xfrm[i]); free_dst: - if (dst0) - dst_release_immediate(dst0); - dst0 = ERR_PTR(err); + if (xdst0) + dst_release_immediate(&xdst0->u.dst); + xdst0 = ERR_PTR(err); goto out; }
@@@ -1806,7 -1801,7 +1807,7 @@@ static bool xfrm_xdst_can_reuse(struct for (i = 0; i < num; i++) { if (!dst || dst->xfrm != xfrm[i]) return false; - dst = dst->child; + dst = xfrm_dst_child(dst); }
return xfrm_bundle_ok(xdst); @@@ -1819,6 -1814,7 +1820,7 @@@ xfrm_resolve_and_create_bundle(struct x { struct net *net = xp_net(pols[0]); struct xfrm_state *xfrm[XFRM_MAX_DEPTH]; + struct xfrm_dst *bundle[XFRM_MAX_DEPTH]; struct xfrm_dst *xdst, *old; struct dst_entry *dst; int err; @@@ -1839,7 -1835,6 +1841,7 @@@ sizeof(struct xfrm_policy *) * num_pols) == 0 && xfrm_xdst_can_reuse(xdst, xfrm, err)) { dst_hold(&xdst->u.dst); + xfrm_pols_put(pols, num_pols); while (err > 0) xfrm_state_put(xfrm[--err]); return xdst; @@@ -1847,7 -1842,7 +1849,7 @@@
old = xdst;
- dst = xfrm_bundle_create(pols[0], xfrm, err, fl, dst_orig); + dst = xfrm_bundle_create(pols[0], xfrm, bundle, err, fl, dst_orig); if (IS_ERR(dst)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR); return ERR_CAST(dst); @@@ -1887,8 -1882,8 +1889,8 @@@ static void xfrm_policy_queue_process(s xfrm_decode_session(skb, &fl, dst->ops->family); spin_unlock(&pq->hold_queue.lock);
- dst_hold(dst->path); - dst = xfrm_lookup(net, dst->path, &fl, sk, 0); + dst_hold(xfrm_dst_path(dst)); + dst = xfrm_lookup(net, xfrm_dst_path(dst), &fl, sk, 0); if (IS_ERR(dst)) goto purge_queue;
@@@ -1917,8 -1912,8 +1919,8 @@@ skb = __skb_dequeue(&list);
xfrm_decode_session(skb, &fl, skb_dst(skb)->ops->family); - dst_hold(skb_dst(skb)->path); - dst = xfrm_lookup(net, skb_dst(skb)->path, &fl, skb->sk, 0); + dst_hold(xfrm_dst_path(skb_dst(skb))); + dst = xfrm_lookup(net, xfrm_dst_path(skb_dst(skb)), &fl, skb->sk, 0); if (IS_ERR(dst)) { kfree_skb(skb); continue; @@@ -2019,8 -2014,8 +2021,8 @@@ static struct xfrm_dst *xfrm_create_dum dst1->output = xdst_queue_output;
dst_hold(dst); - dst1->child = dst; - dst1->path = dst; + xfrm_dst_set_child(xdst, dst); + xdst->path = dst;
xfrm_init_path((struct xfrm_dst *)dst1, dst, 0);
@@@ -2583,7 -2578,7 +2585,7 @@@ static int stale_bundle(struct dst_entr
void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { - while ((dst = dst->child) && dst->xfrm && dst->dev == dev) { + while ((dst = xfrm_dst_child(dst)) && dst->xfrm && dst->dev == dev) { dst->dev = dev_net(dev)->loopback_dev; dev_hold(dst->dev); dev_put(dev); @@@ -2607,13 -2602,15 +2609,15 @@@ static struct dst_entry *xfrm_negative_ return dst; }
- static void xfrm_init_pmtu(struct dst_entry *dst) + static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr) { - do { - struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + while (nr--) { + struct xfrm_dst *xdst = bundle[nr]; u32 pmtu, route_mtu_cached; + struct dst_entry *dst;
- pmtu = dst_mtu(dst->child); + dst = &xdst->u.dst; + pmtu = dst_mtu(xfrm_dst_child(dst)); xdst->child_mtu_cached = pmtu;
pmtu = xfrm_state_mtu(dst->xfrm, pmtu); @@@ -2625,7 -2622,7 +2629,7 @@@ pmtu = route_mtu_cached;
dst_metric_set(dst, RTAX_MTU, pmtu); - } while ((dst = dst->next)); + } }
/* Check that the bundle accepts the flow and its components are @@@ -2634,19 -2631,20 +2638,20 @@@
static int xfrm_bundle_ok(struct xfrm_dst *first) { + struct xfrm_dst *bundle[XFRM_MAX_DEPTH]; struct dst_entry *dst = &first->u.dst; - struct xfrm_dst *last; + struct xfrm_dst *xdst; + int start_from, nr; u32 mtu;
- if (!dst_check(dst->path, ((struct xfrm_dst *)dst)->path_cookie) || + if (!dst_check(xfrm_dst_path(dst), ((struct xfrm_dst *)dst)->path_cookie) || (dst->dev && !netif_running(dst->dev))) return 0;
if (dst->flags & DST_XFRM_QUEUE) return 1;
- last = NULL; - + start_from = nr = 0; do { struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
@@@ -2658,9 -2656,11 +2663,11 @@@ xdst->policy_genid != atomic_read(&xdst->pols[0]->genid)) return 0;
- mtu = dst_mtu(dst->child); + bundle[nr++] = xdst; + + mtu = dst_mtu(xfrm_dst_child(dst)); if (xdst->child_mtu_cached != mtu) { - last = xdst; + start_from = nr; xdst->child_mtu_cached = mtu; }
@@@ -2668,30 -2668,30 +2675,30 @@@ return 0; mtu = dst_mtu(xdst->route); if (xdst->route_mtu_cached != mtu) { - last = xdst; + start_from = nr; xdst->route_mtu_cached = mtu; }
- dst = dst->child; + dst = xfrm_dst_child(dst); } while (dst->xfrm);
- if (likely(!last)) + if (likely(!start_from)) return 1;
- mtu = last->child_mtu_cached; - for (;;) { - dst = &last->u.dst; + xdst = bundle[start_from - 1]; + mtu = xdst->child_mtu_cached; + while (start_from--) { + dst = &xdst->u.dst;
mtu = xfrm_state_mtu(dst->xfrm, mtu); - if (mtu > last->route_mtu_cached) - mtu = last->route_mtu_cached; + if (mtu > xdst->route_mtu_cached) + mtu = xdst->route_mtu_cached; dst_metric_set(dst, RTAX_MTU, mtu); - - if (last == first) + if (!start_from) break;
- last = (struct xfrm_dst *)last->u.dst.next; - last->child_mtu_cached = mtu; + xdst = bundle[start_from - 1]; + xdst->child_mtu_cached = mtu; }
return 1; @@@ -2699,22 -2699,20 +2706,20 @@@
static unsigned int xfrm_default_advmss(const struct dst_entry *dst) { - return dst_metric_advmss(dst->path); + return dst_metric_advmss(xfrm_dst_path(dst)); }
static unsigned int xfrm_mtu(const struct dst_entry *dst) { unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
- return mtu ? : dst_mtu(dst->path); + return mtu ? : dst_mtu(xfrm_dst_path(dst)); }
static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst, const void *daddr) { - const struct dst_entry *path = dst->path; - - for (; dst != path; dst = dst->child) { + while (dst->xfrm) { const struct xfrm_state *xfrm = dst->xfrm;
if (xfrm->props.mode == XFRM_MODE_TRANSPORT) @@@ -2723,6 -2721,8 +2728,8 @@@ daddr = xfrm->coaddr; else if (!(xfrm->type->flags & XFRM_TYPE_LOCAL_COADDR)) daddr = &xfrm->id.daddr; + + dst = xfrm_dst_child(dst); } return daddr; } @@@ -2731,7 -2731,7 +2738,7 @@@ static struct neighbour *xfrm_neigh_loo struct sk_buff *skb, const void *daddr) { - const struct dst_entry *path = dst->path; + const struct dst_entry *path = xfrm_dst_path(dst);
if (!skb) daddr = xfrm_get_dst_nexthop(dst, daddr); @@@ -2740,7 -2740,7 +2747,7 @@@
static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr) { - const struct dst_entry *path = dst->path; + const struct dst_entry *path = xfrm_dst_path(dst);
daddr = xfrm_get_dst_nexthop(dst, daddr); path->ops->confirm_neigh(path, daddr); diff --combined net/xfrm/xfrm_state.c index 500b3391f474,1b7856be3eeb..cc4c519cad76 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@@ -1343,7 -1343,6 +1343,7 @@@ static struct xfrm_state *xfrm_state_cl
if (orig->aead) { x->aead = xfrm_algo_aead_clone(orig->aead); + x->geniv = orig->geniv; if (!x->aead) goto error; } @@@ -2049,6 -2048,13 +2049,13 @@@ int xfrm_user_policy(struct sock *sk, i struct xfrm_mgr *km; struct xfrm_policy *pol = NULL;
+ if (!optval && !optlen) { + xfrm_sk_policy_insert(sk, XFRM_POLICY_IN, NULL); + xfrm_sk_policy_insert(sk, XFRM_POLICY_OUT, NULL); + __sk_dst_reset(sk); + return 0; + } + if (optlen <= 0 || optlen > PAGE_SIZE) return -EMSGSIZE;
diff --combined tools/testing/selftests/bpf/Makefile index 05fc4e2e7b3a,a1fcb0c31d02..f1fdb36269f2 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@@ -11,16 -11,18 +11,18 @@@ ifneq ($(wildcard $(GENHDR)), endif
CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) -I../../../include -LDLIBS += -lcap -lelf +LDLIBS += -lcap -lelf -lrt
TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ test_align test_verifier_log test_dev_cgroup
TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \ test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \ - sockmap_verdict_prog.o dev_cgroup.o + sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \ + test_l4lb_noinline.o test_xdp_noinline.o
- TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh + TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh \ + test_offload.py
include ../lib.mk
@@@ -49,8 -51,13 +51,13 @@@ els CPU ?= generic endif
+ CLANG_FLAGS = -I. -I./include/uapi -I../../../include/uapi \ + -Wno-compare-distinct-pointer-types + + $(OUTPUT)/test_l4lb_noinline.o: CLANG_FLAGS += -fno-inline + $(OUTPUT)/test_xdp_noinline.o: CLANG_FLAGS += -fno-inline + %.o: %.c - $(CLANG) -I. -I./include/uapi -I../../../include/uapi \ - -Wno-compare-distinct-pointer-types \ + $(CLANG) $(CLANG_FLAGS) \ -O2 -target bpf -emit-llvm -c $< -o - | \ $(LLC) -march=bpf -mcpu=$(CPU) -filetype=obj -o $@ diff --combined tools/testing/selftests/bpf/test_progs.c index 6761be18a91f,6472ca98690e..09087ab12293 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@@ -21,8 -21,10 +21,10 @@@ typedef __u16 __sum16 #include <linux/ipv6.h> #include <linux/tcp.h> #include <linux/filter.h> + #include <linux/perf_event.h> #include <linux/unistd.h>
+ #include <sys/ioctl.h> #include <sys/wait.h> #include <sys/resource.h> #include <sys/types.h> @@@ -167,10 -169,9 +169,9 @@@ out #define NUM_ITER 100000 #define VIP_NUM 5
- static void test_l4lb(void) + static void test_l4lb(const char *file) { unsigned int nr_cpus = bpf_num_possible_cpus(); - const char *file = "./test_l4lb.o"; struct vip key = {.protocol = 6}; struct vip_meta { __u32 flags; @@@ -247,6 -248,95 +248,95 @@@ out bpf_object__close(obj); }
+ static void test_l4lb_all(void) + { + const char *file1 = "./test_l4lb.o"; + const char *file2 = "./test_l4lb_noinline.o"; + + test_l4lb(file1); + test_l4lb(file2); + } + + static void test_xdp_noinline(void) + { + const char *file = "./test_xdp_noinline.o"; + unsigned int nr_cpus = bpf_num_possible_cpus(); + struct vip key = {.protocol = 6}; + struct vip_meta { + __u32 flags; + __u32 vip_num; + } value = {.vip_num = VIP_NUM}; + __u32 stats_key = VIP_NUM; + struct vip_stats { + __u64 bytes; + __u64 pkts; + } stats[nr_cpus]; + struct real_definition { + union { + __be32 dst; + __be32 dstv6[4]; + }; + __u8 flags; + } real_def = {.dst = MAGIC_VAL}; + __u32 ch_key = 11, real_num = 3; + __u32 duration, retval, size; + int err, i, prog_fd, map_fd; + __u64 bytes = 0, pkts = 0; + struct bpf_object *obj; + char buf[128]; + u32 *magic = (u32 *)buf; + + err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd); + if (err) { + error_cnt++; + return; + } + + map_fd = bpf_find_map(__func__, obj, "vip_map"); + if (map_fd < 0) + goto out; + bpf_map_update_elem(map_fd, &key, &value, 0); + + map_fd = bpf_find_map(__func__, obj, "ch_rings"); + if (map_fd < 0) + goto out; + bpf_map_update_elem(map_fd, &ch_key, &real_num, 0); + + map_fd = bpf_find_map(__func__, obj, "reals"); + if (map_fd < 0) + goto out; + bpf_map_update_elem(map_fd, &real_num, &real_def, 0); + + err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v4, sizeof(pkt_v4), + buf, &size, &retval, &duration); + CHECK(err || errno || retval != 1 || size != 54 || + *magic != MAGIC_VAL, "ipv4", + "err %d errno %d retval %d size %d magic %x\n", + err, errno, retval, size, *magic); + + err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v6, sizeof(pkt_v6), + buf, &size, &retval, &duration); + CHECK(err || errno || retval != 1 || size != 74 || + *magic != MAGIC_VAL, "ipv6", + "err %d errno %d retval %d size %d magic %x\n", + err, errno, retval, size, *magic); + + map_fd = bpf_find_map(__func__, obj, "stats"); + if (map_fd < 0) + goto 
out; + bpf_map_lookup_elem(map_fd, &stats_key, stats); + for (i = 0; i < nr_cpus; i++) { + bytes += stats[i].bytes; + pkts += stats[i].pkts; + } + if (bytes != MAGIC_BYTES * NUM_ITER * 2 || pkts != NUM_ITER * 2) { + error_cnt++; + printf("test_xdp_noinline:FAIL:stats %lld %lld\n", bytes, pkts); + } + out: + bpf_object__close(obj); + } + static void test_tcp_estats(void) { const char *file = "./test_tcp_estats.o"; @@@ -351,7 -441,7 +441,7 @@@ static void test_bpf_obj_id(void info_len != sizeof(struct bpf_map_info) || strcmp((char *)map_infos[i].name, expected_map_name), "get-map-info(fd)", - "err %d errno %d type %d(%d) info_len %u(%lu) key_size %u value_size %u max_entries %u map_flags %X name %s(%s)\n", + "err %d errno %d type %d(%d) info_len %u(%Zu) key_size %u value_size %u max_entries %u map_flags %X name %s(%s)\n", err, errno, map_infos[i].type, BPF_MAP_TYPE_ARRAY, info_len, sizeof(struct bpf_map_info), @@@ -395,7 -485,7 +485,7 @@@ *(int *)prog_infos[i].map_ids != map_infos[i].id || strcmp((char *)prog_infos[i].name, expected_prog_name), "get-prog-info(fd)", - "err %d errno %d i %d type %d(%d) info_len %u(%lu) jit_enabled %d jited_prog_len %u xlated_prog_len %u jited_prog %d xlated_prog %d load_time %lu(%lu) uid %u(%u) nr_map_ids %u(%u) map_id %u(%u) name %s(%s)\n", + "err %d errno %d i %d type %d(%d) info_len %u(%Zu) jit_enabled %d jited_prog_len %u xlated_prog_len %u jited_prog %d xlated_prog %d load_time %lu(%lu) uid %u(%u) nr_map_ids %u(%u) map_id %u(%u) name %s(%s)\n", err, errno, i, prog_infos[i].type, BPF_PROG_TYPE_SOCKET_FILTER, info_len, sizeof(struct bpf_prog_info), @@@ -463,7 -553,7 +553,7 @@@ memcmp(&prog_info, &prog_infos[i], info_len) || *(int *)prog_info.map_ids != saved_map_id, "get-prog-info(next_id->fd)", - "err %d errno %d info_len %u(%lu) memcmp %d map_id %u(%u)\n", + "err %d errno %d info_len %u(%Zu) memcmp %d map_id %u(%u)\n", err, errno, info_len, sizeof(struct bpf_prog_info), memcmp(&prog_info, &prog_infos[i], info_len), *(int 
*)prog_info.map_ids, saved_map_id); @@@ -509,7 -599,7 +599,7 @@@ memcmp(&map_info, &map_infos[i], info_len) || array_value != array_magic_value, "check get-map-info(next_id->fd)", - "err %d errno %d info_len %u(%lu) memcmp %d array_value %llu(%llu)\n", + "err %d errno %d info_len %u(%Zu) memcmp %d array_value %llu(%llu)\n", err, errno, info_len, sizeof(struct bpf_map_info), memcmp(&map_info, &map_infos[i], info_len), array_value, array_magic_value); @@@ -617,6 -707,136 +707,136 @@@ static void test_obj_name(void } }
+ static void test_tp_attach_query(void) + { + const int num_progs = 3; + int i, j, bytes, efd, err, prog_fd[num_progs], pmu_fd[num_progs]; + __u32 duration = 0, info_len, saved_prog_ids[num_progs]; + const char *file = "./test_tracepoint.o"; + struct perf_event_query_bpf *query; + struct perf_event_attr attr = {}; + struct bpf_object *obj[num_progs]; + struct bpf_prog_info prog_info; + char buf[256]; + + snprintf(buf, sizeof(buf), + "/sys/kernel/debug/tracing/events/sched/sched_switch/id"); + efd = open(buf, O_RDONLY, 0); + if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno)) + return; + bytes = read(efd, buf, sizeof(buf)); + close(efd); + if (CHECK(bytes <= 0 || bytes >= sizeof(buf), + "read", "bytes %d errno %d\n", bytes, errno)) + return; + + attr.config = strtol(buf, NULL, 0); + attr.type = PERF_TYPE_TRACEPOINT; + attr.sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_CALLCHAIN; + attr.sample_period = 1; + attr.wakeup_events = 1; + + query = malloc(sizeof(*query) + sizeof(__u32) * num_progs); + for (i = 0; i < num_progs; i++) { + err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj[i], + &prog_fd[i]); + if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno)) + goto cleanup1; + + bzero(&prog_info, sizeof(prog_info)); + prog_info.jited_prog_len = 0; + prog_info.xlated_prog_len = 0; + prog_info.nr_map_ids = 0; + info_len = sizeof(prog_info); + err = bpf_obj_get_info_by_fd(prog_fd[i], &prog_info, &info_len); + if (CHECK(err, "bpf_obj_get_info_by_fd", "err %d errno %d\n", + err, errno)) + goto cleanup1; + saved_prog_ids[i] = prog_info.id; + + pmu_fd[i] = syscall(__NR_perf_event_open, &attr, -1 /* pid */, + 0 /* cpu 0 */, -1 /* group id */, + 0 /* flags */); + if (CHECK(pmu_fd[i] < 0, "perf_event_open", "err %d errno %d\n", + pmu_fd[i], errno)) + goto cleanup2; + err = ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0); + if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n", + err, errno)) + goto cleanup3; + + if (i == 0) { + /* check NULL prog array 
query */ + query->ids_len = num_progs; + err = ioctl(pmu_fd[i], PERF_EVENT_IOC_QUERY_BPF, query); + if (CHECK(err || query->prog_cnt != 0, + "perf_event_ioc_query_bpf", + "err %d errno %d query->prog_cnt %u\n", + err, errno, query->prog_cnt)) + goto cleanup3; + } + + err = ioctl(pmu_fd[i], PERF_EVENT_IOC_SET_BPF, prog_fd[i]); + if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n", + err, errno)) + goto cleanup3; + + if (i == 1) { + /* try to get # of programs only */ + query->ids_len = 0; + err = ioctl(pmu_fd[i], PERF_EVENT_IOC_QUERY_BPF, query); + if (CHECK(err || query->prog_cnt != 2, + "perf_event_ioc_query_bpf", + "err %d errno %d query->prog_cnt %u\n", + err, errno, query->prog_cnt)) + goto cleanup3; + + /* try a few negative tests */ + /* invalid query pointer */ + err = ioctl(pmu_fd[i], PERF_EVENT_IOC_QUERY_BPF, + (struct perf_event_query_bpf *)0x1); + if (CHECK(!err || errno != EFAULT, + "perf_event_ioc_query_bpf", + "err %d errno %d\n", err, errno)) + goto cleanup3; + + /* no enough space */ + query->ids_len = 1; + err = ioctl(pmu_fd[i], PERF_EVENT_IOC_QUERY_BPF, query); + if (CHECK(!err || errno != ENOSPC || query->prog_cnt != 2, + "perf_event_ioc_query_bpf", + "err %d errno %d query->prog_cnt %u\n", + err, errno, query->prog_cnt)) + goto cleanup3; + } + + query->ids_len = num_progs; + err = ioctl(pmu_fd[i], PERF_EVENT_IOC_QUERY_BPF, query); + if (CHECK(err || query->prog_cnt != (i + 1), + "perf_event_ioc_query_bpf", + "err %d errno %d query->prog_cnt %u\n", + err, errno, query->prog_cnt)) + goto cleanup3; + for (j = 0; j < i + 1; j++) + if (CHECK(saved_prog_ids[j] != query->ids[j], + "perf_event_ioc_query_bpf", + "#%d saved_prog_id %x query prog_id %x\n", + j, saved_prog_ids[j], query->ids[j])) + goto cleanup3; + } + + i = num_progs - 1; + for (; i >= 0; i--) { + cleanup3: + ioctl(pmu_fd[i], PERF_EVENT_IOC_DISABLE); + cleanup2: + close(pmu_fd[i]); + cleanup1: + bpf_object__close(obj[i]); + } + free(query); + } + int main(void) { struct rlimit 
rinf = { RLIM_INFINITY, RLIM_INFINITY }; @@@ -625,11 -845,13 +845,13 @@@
test_pkt_access(); test_xdp(); - test_l4lb(); + test_l4lb_all(); + test_xdp_noinline(); test_tcp_estats(); test_bpf_obj_id(); test_pkt_md_access(); test_obj_name(); + test_tp_attach_query();
printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt); return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS; diff --combined tools/testing/selftests/bpf/test_verifier.c index b03ecfd7185b,3bacff0d6f91..ae2e215247f0 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@@ -2,6 -2,7 +2,7 @@@ * Testsuite for eBPF verifier * * Copyright (c) 2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2017 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@@ -277,7 -278,7 +278,7 @@@ static struct bpf_test tests[] = .insns = { BPF_ALU64_REG(BPF_MOV, BPF_REG_0, BPF_REG_2), }, - .errstr = "jump out of range", + .errstr = "not an exit", .result = REJECT, }, { @@@ -5648,7 -5649,7 +5649,7 @@@ "helper access to variable memory: size > 0 not allowed on NULL (ARG_PTR_TO_MEM_OR_NULL)", .insns = { BPF_MOV64_IMM(BPF_REG_1, 0), - BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_2, 1), BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128), BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128), BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 64), @@@ -5883,7 -5884,7 +5884,7 @@@ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -24), BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), - BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_2, 1), BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128), BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128), BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 63), @@@ -6117,30 -6118,6 +6118,30 @@@ .result = ACCEPT, }, { + "ld_abs: tests on r6 and skb data reload helper", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_LD_ABS(BPF_B, 0), + BPF_LD_ABS(BPF_H, 0), + BPF_LD_ABS(BPF_W, 0), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_6, 0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), + BPF_MOV64_IMM(BPF_REG_2, 1), + BPF_MOV64_IMM(BPF_REG_3, 2), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + 
BPF_FUNC_skb_vlan_push), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_7), + BPF_LD_ABS(BPF_B, 0), + BPF_LD_ABS(BPF_H, 0), + BPF_LD_ABS(BPF_W, 0), + BPF_MOV64_IMM(BPF_REG_0, 42), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + }, + { "ld_ind: check calling conv, r1", .insns = { BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), @@@ -8121,6 -8098,1623 +8122,1623 @@@ .result = REJECT, .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, }, + { + "calls: basic sanity", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .result = ACCEPT, + }, + { + "calls: not on unpriviledged", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .result_unpriv = REJECT, + .result = ACCEPT, + }, + { + "calls: overlapping caller/callee", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .errstr = "last insn is not an exit or jmp", + .result = REJECT, + }, + { + "calls: wrong recursive calls", + .insns = { + BPF_JMP_IMM(BPF_JA, 0, 0, 4), + BPF_JMP_IMM(BPF_JA, 0, 0, 4), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -2), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -2), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .errstr = "jump out of range", + .result = REJECT, + }, + { + "calls: wrong src reg", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 2, 0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .errstr = "BPF_CALL uses reserved fields", + .result = REJECT, + }, + { + 
"calls: wrong off value", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, -1, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .errstr = "BPF_CALL uses reserved fields", + .result = REJECT, + }, + { + "calls: jump back loop", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -1), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .errstr = "back-edge from insn 0 to 0", + .result = REJECT, + }, + { + "calls: conditional call", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, mark)), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .errstr = "jump out of range", + .result = REJECT, + }, + { + "calls: conditional call 2", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, mark)), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 3), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .result = ACCEPT, + }, + { + "calls: conditional call 3", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, mark)), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), + BPF_JMP_IMM(BPF_JA, 0, 0, 4), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_JMP_IMM(BPF_JA, 0, 0, -6), + BPF_MOV64_IMM(BPF_REG_0, 3), + BPF_JMP_IMM(BPF_JA, 0, 0, -6), + }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .errstr = "back-edge from insn", + .result = REJECT, + }, + { + "calls: conditional call 4", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + 
offsetof(struct __sk_buff, mark)), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_JMP_IMM(BPF_JA, 0, 0, -5), + BPF_MOV64_IMM(BPF_REG_0, 3), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .result = ACCEPT, + }, + { + "calls: conditional call 5", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, mark)), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_JMP_IMM(BPF_JA, 0, 0, -6), + BPF_MOV64_IMM(BPF_REG_0, 3), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .errstr = "back-edge from insn", + .result = REJECT, + }, + { + "calls: conditional call 6", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -2), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, mark)), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .errstr = "back-edge from insn", + .result = REJECT, + }, + { + "calls: using r0 returned by callee", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .result = ACCEPT, + }, + { + "calls: using uninit r0 from callee", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .errstr = "!read_ok", + .result = REJECT, + }, + { + "calls: callee is using r1", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, len)), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_ACT, + .result = ACCEPT, + }, + { + "calls: callee 
using args1", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .errstr_unpriv = "allowed for root only", + .result_unpriv = REJECT, + .result = ACCEPT, + }, + { + "calls: callee using wrong args2", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .errstr = "R2 !read_ok", + .result = REJECT, + }, + { + "calls: callee using two args", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, + offsetof(struct __sk_buff, len)), + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, + offsetof(struct __sk_buff, len)), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2), + BPF_EXIT_INSN(), + }, + .errstr_unpriv = "allowed for root only", + .result_unpriv = REJECT, + .result = ACCEPT, + }, + { + "calls: callee changing pkt pointers", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1, + offsetof(struct xdp_md, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1, + offsetof(struct xdp_md, data_end)), + BPF_MOV64_REG(BPF_REG_8, BPF_REG_6), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_8, 8), + BPF_JMP_REG(BPF_JGT, BPF_REG_8, BPF_REG_7, 2), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3), + /* clear_all_pkt_pointers() has to walk all frames + * to make sure that pkt pointers in the caller + * are cleared when callee is calling a helper that + * adjusts packet size + */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0), + BPF_MOV32_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_xdp_adjust_head), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "R6 invalid mem access 'inv'", + .prog_type = BPF_PROG_TYPE_XDP, + }, + { + "calls: two calls with args", + 
.insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 6), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3), + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_7), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, len)), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + }, + { + "calls: calls with stack arith", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -64), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -64), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -64), + BPF_MOV64_IMM(BPF_REG_0, 42), + BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + }, + { + "calls: calls with misaligned stack access", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -63), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -61), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -63), + BPF_MOV64_IMM(BPF_REG_0, 42), + BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .flags = F_LOAD_WITH_STRICT_ALIGNMENT, + .errstr = "misaligned stack access", + .result = REJECT, + }, + { + "calls: calls control flow, jump test", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 42), + BPF_JMP_IMM(BPF_JA, 0, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 43), + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + BPF_JMP_IMM(BPF_JA, 0, 0, -3), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = 
ACCEPT, + }, + { + "calls: calls control flow, jump test 2", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 42), + BPF_JMP_IMM(BPF_JA, 0, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 43), + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -3), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .errstr = "jump out of range from insn 1 to 4", + .result = REJECT, + }, + { + "calls: two calls with bad jump", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 6), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3), + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_7), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, len)), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -3), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .errstr = "jump out of range from insn 11 to 9", + .result = REJECT, + }, + { + "calls: recursive call. test1", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .errstr = "back-edge", + .result = REJECT, + }, + { + "calls: recursive call. 
test2", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -3), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .errstr = "back-edge", + .result = REJECT, + }, + { + "calls: unreachable code", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .errstr = "unreachable insn 6", + .result = REJECT, + }, + { + "calls: invalid call", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -4), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .errstr = "invalid destination", + .result = REJECT, + }, + { + "calls: invalid call 2", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 0x7fffffff), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .errstr = "invalid destination", + .result = REJECT, + }, + { + "calls: jumping across function bodies. test1", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, -3), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .errstr = "jump out of range", + .result = REJECT, + }, + { + "calls: jumping across function bodies. 
test2", + .insns = { + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 3), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .errstr = "jump out of range", + .result = REJECT, + }, + { + "calls: call without exit", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, -2), + }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .errstr = "not an exit", + .result = REJECT, + }, + { + "calls: call into middle of ld_imm64", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_LD_IMM64(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .errstr = "last insn", + .result = REJECT, + }, + { + "calls: call into middle of other call", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .errstr = "last insn", + .result = REJECT, + }, + { + "calls: ld_abs with changing ctx data in callee", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_LD_ABS(BPF_B, 0), + BPF_LD_ABS(BPF_H, 0), + BPF_LD_ABS(BPF_W, 0), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_6), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 5), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_7), + BPF_LD_ABS(BPF_B, 0), + BPF_LD_ABS(BPF_H, 0), + BPF_LD_ABS(BPF_W, 0), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_2, 1), + BPF_MOV64_IMM(BPF_REG_3, 2), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_skb_vlan_push), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .errstr = "BPF_LD_[ABS|IND] 
instructions cannot be mixed", + .result = REJECT, + }, + { + "calls: two calls with bad fallthrough", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 6), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3), + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_7), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, len)), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .errstr = "not an exit", + .result = REJECT, + }, + { + "calls: two calls with stack read", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 6), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3), + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_7), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_XDP, + .result = ACCEPT, + }, + { + "calls: two calls with stack write", + .insns = { + /* main prog */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -16), + BPF_EXIT_INSN(), + + /* subprog 1 */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_2), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 7), + BPF_MOV64_REG(BPF_REG_8, BPF_REG_0), + 
BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4), + BPF_ALU64_REG(BPF_ADD, BPF_REG_8, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_8), + /* write into stack frame of main prog */ + BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0), + BPF_EXIT_INSN(), + + /* subprog 2 */ + /* read from stack frame of main prog */ + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_XDP, + .result = ACCEPT, + }, + { + "calls: spill into caller stack frame", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_XDP, + .errstr = "cannot spill", + .result = REJECT, + }, + { + "calls: write into caller stack frame", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0), + BPF_EXIT_INSN(), + BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 42), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_XDP, + .result = ACCEPT, + }, + { + "calls: write into callee stack frame", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 42), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, -8), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_XDP, + .errstr = "cannot return stack pointer", + .result = REJECT, + }, + { + "calls: two calls with stack write and void return", + .insns = { + /* main prog */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + 
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -16), + BPF_EXIT_INSN(), + + /* subprog 1 */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_2), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + + /* subprog 2 */ + /* write into stack frame of main prog */ + BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 0), + BPF_EXIT_INSN(), /* void return */ + }, + .prog_type = BPF_PROG_TYPE_XDP, + .result = ACCEPT, + }, + { + "calls: ambiguous return value", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 5), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_EXIT_INSN(), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr_unpriv = "allowed for root only", + .result_unpriv = REJECT, + .errstr = "R0 !read_ok", + .result = REJECT, + }, + { + "calls: two calls that return map_value", + .insns = { + /* main prog */ + /* pass fp-16, fp-8 into a function */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 8), + + /* fetch map_value_ptr from the stack of this function */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + /* write into map value */ + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + /* fetch second map_value_ptr from the stack */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -16), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + /* write into map value */ + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + + /* subprog 1 */ + /* call
3rd function twice */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_2), + /* first time with fp-8 */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), + /* second time with fp-16 */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + + /* subprog 2 */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + /* lookup from map */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + /* write map_value_ptr into stack frame of main prog */ + BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), /* return 0 */ + }, + .prog_type = BPF_PROG_TYPE_XDP, + .fixup_map1 = { 23 }, + .result = ACCEPT, + }, + { + "calls: two calls that return map_value with bool condition", + .insns = { + /* main prog */ + /* pass fp-16, fp-8 into a function */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + + /* subprog 1 */ + /* call 3rd function twice */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_2), + /* first time with fp-8 */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 9), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 2), + /* fetch map_value_ptr from the stack of this function */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0), + /* write into map value */ + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), + /* second time with fp-16 */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 2), + /* fetch second map_value_ptr from the stack */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), + /* write into map value */ +
BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + BPF_EXIT_INSN(), + + /* subprog 2 */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + /* lookup from map */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), /* return 0 */ + /* write map_value_ptr into stack frame of main prog */ + BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), /* return 1 */ + }, + .prog_type = BPF_PROG_TYPE_XDP, + .fixup_map1 = { 23 }, + .result = ACCEPT, + }, + { + "calls: two calls that return map_value with incorrect bool check", + .insns = { + /* main prog */ + /* pass fp-16, fp-8 into a function */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + + /* subprog 1 */ + /* call 3rd function twice */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_2), + /* first time with fp-8 */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 9), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 2), + /* fetch map_value_ptr from the stack of this function */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0), + /* write into map value */ + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), + /* second time with fp-16 */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + /* fetch second map_value_ptr from the stack */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), + /* write into map value */ + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + BPF_EXIT_INSN(), + + /* subprog 2 */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + /* lookup from map */ +
BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), /* return 0 */ + /* write map_value_ptr into stack frame of main prog */ + BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), /* return 1 */ + }, + .prog_type = BPF_PROG_TYPE_XDP, + .fixup_map1 = { 23 }, + .result = REJECT, + .errstr = "invalid read from stack off -16+0 size 8", + }, + { + "calls: two calls that receive map_value via arg=ptr_stack_of_caller. test1", + .insns = { + /* main prog */ + /* pass fp-16, fp-8 into a function */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + + /* subprog 1 */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_2), + /* 1st lookup from map */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_8, 0), + BPF_JMP_IMM(BPF_JA, 0, 0, 2), + /* write map_value_ptr into stack frame of main prog at fp-8 */ + BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_8, 1), + + /* 2nd lookup from map */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* 20 */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, /* 24 */ + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_9, 0), + BPF_JMP_IMM(BPF_JA, 0, 0, 2), 
+ /* write map_value_ptr into stack frame of main prog at fp-16 */ + BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_9, 1), + + /* call 3rd func with fp-8, 0|1, fp-16, 0|1 */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), /* 30 */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_8), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_7), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_9), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), /* 34 */ + BPF_EXIT_INSN(), + + /* subprog 2 */ + /* if arg2 == 1 do *arg1 = 0 */ + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 1, 2), + /* fetch map_value_ptr from the stack of this function */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 0), + /* write into map value */ + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + + /* if arg4 == 1 do *arg3 = 0 */ + BPF_JMP_IMM(BPF_JNE, BPF_REG_4, 1, 2), + /* fetch map_value_ptr from the stack of this function */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0), + /* write into map value */ + BPF_ST_MEM(BPF_DW, BPF_REG_0, 2, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .fixup_map1 = { 12, 22 }, + .result = REJECT, + .errstr = "invalid access to map value, value_size=8 off=2 size=8", + }, + { + "calls: two calls that receive map_value via arg=ptr_stack_of_caller. 
test2", + .insns = { + /* main prog */ + /* pass fp-16, fp-8 into a function */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + + /* subprog 1 */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_2), + /* 1st lookup from map */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_8, 0), + BPF_JMP_IMM(BPF_JA, 0, 0, 2), + /* write map_value_ptr into stack frame of main prog at fp-8 */ + BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_8, 1), + + /* 2nd lookup from map */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* 20 */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, /* 24 */ + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_9, 0), + BPF_JMP_IMM(BPF_JA, 0, 0, 2), + /* write map_value_ptr into stack frame of main prog at fp-16 */ + BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_9, 1), + + /* call 3rd func with fp-8, 0|1, fp-16, 0|1 */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), /* 30 */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_8), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_7), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_9), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), /* 34 */ + BPF_EXIT_INSN(), + + /* subprog 2 */ + /* if arg2 == 1 do *arg1 = 0 */ + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 1, 2), + /* fetch map_value_ptr from the stack of this function */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 0), + /* write into map value */ + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + + /* if 
arg4 == 1 do *arg3 = 0 */ + BPF_JMP_IMM(BPF_JNE, BPF_REG_4, 1, 2), + /* fetch map_value_ptr from the stack of this function */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0), + /* write into map value */ + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .fixup_map1 = { 12, 22 }, + .result = ACCEPT, + }, + { + "calls: two jumps that receive map_value via arg=ptr_stack_of_jumper. test3", + .insns = { + /* main prog */ + /* pass fp-16, fp-8 into a function */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16), + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + + /* subprog 1 */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_2), + /* 1st lookup from map */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -24, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -24), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_8, 0), + BPF_JMP_IMM(BPF_JA, 0, 0, 2), + /* write map_value_ptr into stack frame of main prog at fp-8 */ + BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_8, 1), + + /* 2nd lookup from map */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -24), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_9, 0), // 26 + BPF_JMP_IMM(BPF_JA, 0, 0, 2), + /* write map_value_ptr into stack frame of main prog at fp-16 */ + BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_9, 1), + + /* call 3rd func with fp-8, 0|1, fp-16, 0|1 */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), // 30 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_8), + 
BPF_MOV64_REG(BPF_REG_3, BPF_REG_7), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_9), + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 1), // 34 + BPF_JMP_IMM(BPF_JA, 0, 0, -30), + + /* subprog 2 */ + /* if arg2 == 1 do *arg1 = 0 */ + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 1, 2), + /* fetch map_value_ptr from the stack of this function */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 0), + /* write into map value */ + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + + /* if arg4 == 1 do *arg3 = 0 */ + BPF_JMP_IMM(BPF_JNE, BPF_REG_4, 1, 2), + /* fetch map_value_ptr from the stack of this function */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0), + /* write into map value */ + BPF_ST_MEM(BPF_DW, BPF_REG_0, 2, 0), + BPF_JMP_IMM(BPF_JA, 0, 0, -8), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .fixup_map1 = { 12, 22 }, + .result = REJECT, + .errstr = "invalid access to map value, value_size=8 off=2 size=8", + }, + { + "calls: two calls that receive map_value_ptr_or_null via arg. test1", + .insns = { + /* main prog */ + /* pass fp-16, fp-8 into a function */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + + /* subprog 1 */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_2), + /* 1st lookup from map */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + /* write map_value_ptr_or_null into stack frame of main prog at fp-8 */ + BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_8, 0), + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + BPF_MOV64_IMM(BPF_REG_8, 1), + + /* 2nd lookup from map */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, 
BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + /* write map_value_ptr_or_null into stack frame of main prog at fp-16 */ + BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_9, 0), + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + BPF_MOV64_IMM(BPF_REG_9, 1), + + /* call 3rd func with fp-8, 0|1, fp-16, 0|1 */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_8), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_7), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_9), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + + /* subprog 2 */ + /* if arg2 == 1 do *arg1 = 0 */ + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 1, 2), + /* fetch map_value_ptr from the stack of this function */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 0), + /* write into map value */ + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + + /* if arg4 == 1 do *arg3 = 0 */ + BPF_JMP_IMM(BPF_JNE, BPF_REG_4, 1, 2), + /* fetch map_value_ptr from the stack of this function */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0), + /* write into map value */ + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .fixup_map1 = { 12, 22 }, + .result = ACCEPT, + }, + { + "calls: two calls that receive map_value_ptr_or_null via arg. 
test2", + .insns = { + /* main prog */ + /* pass fp-16, fp-8 into a function */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + + /* subprog 1 */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_2), + /* 1st lookup from map */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + /* write map_value_ptr_or_null into stack frame of main prog at fp-8 */ + BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_8, 0), + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + BPF_MOV64_IMM(BPF_REG_8, 1), + + /* 2nd lookup from map */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + /* write map_value_ptr_or_null into stack frame of main prog at fp-16 */ + BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_9, 0), + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + BPF_MOV64_IMM(BPF_REG_9, 1), + + /* call 3rd func with fp-8, 0|1, fp-16, 0|1 */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_8), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_7), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_9), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + + /* subprog 2 */ + /* if arg2 == 1 do *arg1 = 0 */ + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 1, 2), + /* fetch map_value_ptr from the stack of this function */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 0), + /* write into map value */ + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + + /* if arg4 == 0 do *arg3 = 
0 */ + BPF_JMP_IMM(BPF_JNE, BPF_REG_4, 0, 2), + /* fetch map_value_ptr from the stack of this function */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0), + /* write into map value */ + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .fixup_map1 = { 12, 22 }, + .result = REJECT, + .errstr = "R0 invalid mem access 'inv'", + }, + { + "calls: pkt_ptr spill into caller stack", + .insns = { + BPF_MOV64_REG(BPF_REG_4, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), + BPF_EXIT_INSN(), + + /* subprog 1 */ + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), + /* spill unchecked pkt_ptr into stack of caller */ + BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 2), + /* now the pkt range is verified, read pkt_ptr from stack */ + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_4, 0), + /* write 4 bytes into packet */ + BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "calls: pkt_ptr spill into caller stack 2", + .insns = { + BPF_MOV64_REG(BPF_REG_4, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3), + /* Marking is still kept, but not in all cases safe. 
*/ + BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8), + BPF_ST_MEM(BPF_W, BPF_REG_4, 0, 0), + BPF_EXIT_INSN(), + + /* subprog 1 */ + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), + /* spill unchecked pkt_ptr into stack of caller */ + BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 2), + /* now the pkt range is verified, read pkt_ptr from stack */ + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_4, 0), + /* write 4 bytes into packet */ + BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .errstr = "invalid access to packet", + .result = REJECT, + }, + { + "calls: pkt_ptr spill into caller stack 3", + .insns = { + BPF_MOV64_REG(BPF_REG_4, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + /* Marking is still kept and safe here. 
*/ + BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8), + BPF_ST_MEM(BPF_W, BPF_REG_4, 0, 0), + BPF_EXIT_INSN(), + + /* subprog 1 */ + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), + /* spill unchecked pkt_ptr into stack of caller */ + BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_5, 0), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 3), + BPF_MOV64_IMM(BPF_REG_5, 1), + /* now the pkt range is verified, read pkt_ptr from stack */ + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_4, 0), + /* write 4 bytes into packet */ + BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_5), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + }, + { + "calls: pkt_ptr spill into caller stack 4", + .insns = { + BPF_MOV64_REG(BPF_REG_4, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + /* Check marking propagated. 
*/ + BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8), + BPF_ST_MEM(BPF_W, BPF_REG_4, 0, 0), + BPF_EXIT_INSN(), + + /* subprog 1 */ + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), + /* spill unchecked pkt_ptr into stack of caller */ + BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_5, 0), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 2), + BPF_MOV64_IMM(BPF_REG_5, 1), + /* don't read back pkt_ptr from stack here */ + /* write 4 bytes into packet */ + BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_5), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + }, + { + "calls: pkt_ptr spill into caller stack 5", + .insns = { + BPF_MOV64_REG(BPF_REG_4, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), + BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3), + BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_4, 0), + BPF_EXIT_INSN(), + + /* subprog 1 */ + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), + BPF_MOV64_IMM(BPF_REG_5, 0), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 3), + /* spill checked pkt_ptr into stack of caller */ + BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_5, 1), + /* don't read back pkt_ptr from stack here */ + /* write 4 bytes into packet */ + BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_5), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .errstr = "same insn cannot be used with different", + .result = REJECT, + }, + { + "calls: pkt_ptr spill into 
caller stack 6", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), + BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3), + BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_4, 0), + BPF_EXIT_INSN(), + + /* subprog 1 */ + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), + BPF_MOV64_IMM(BPF_REG_5, 0), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 3), + /* spill checked pkt_ptr into stack of caller */ + BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_5, 1), + /* don't read back pkt_ptr from stack here */ + /* write 4 bytes into packet */ + BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_5), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .errstr = "R4 invalid mem access", + .result = REJECT, + }, + { + "calls: pkt_ptr spill into caller stack 7", + .insns = { + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), + BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3), + BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_4, 0), + BPF_EXIT_INSN(), + + /* subprog 1 */ + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), + BPF_MOV64_IMM(BPF_REG_5, 0), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 3), + /* spill checked pkt_ptr into stack of caller */ + BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), + 
BPF_MOV64_IMM(BPF_REG_5, 1), + /* don't read back pkt_ptr from stack here */ + /* write 4 bytes into packet */ + BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_5), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .errstr = "R4 invalid mem access", + .result = REJECT, + }, + { + "calls: pkt_ptr spill into caller stack 8", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), + BPF_JMP_REG(BPF_JLE, BPF_REG_0, BPF_REG_3, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), + BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3), + BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_4, 0), + BPF_EXIT_INSN(), + + /* subprog 1 */ + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), + BPF_MOV64_IMM(BPF_REG_5, 0), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 3), + /* spill checked pkt_ptr into stack of caller */ + BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_5, 1), + /* don't read back pkt_ptr from stack here */ + /* write 4 bytes into packet */ + BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_5), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + }, + { + "calls: pkt_ptr spill into caller stack 9", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, 
BPF_REG_0, 8), + BPF_JMP_REG(BPF_JLE, BPF_REG_0, BPF_REG_3, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), + BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3), + BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_4, 0), + BPF_EXIT_INSN(), + + /* subprog 1 */ + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), + BPF_MOV64_IMM(BPF_REG_5, 0), + /* spill unchecked pkt_ptr into stack of caller */ + BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 2), + BPF_MOV64_IMM(BPF_REG_5, 1), + /* don't read back pkt_ptr from stack here */ + /* write 4 bytes into packet */ + BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_5), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .errstr = "invalid access to packet", + .result = REJECT, + }, + { + "calls: caller stack init to zero or map_value_or_null", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4), + /* fetch map_value_or_null or const_zero from stack */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + /* store into map_value */ + BPF_ST_MEM(BPF_W, BPF_REG_0, 0, 0), + BPF_EXIT_INSN(), + + /* subprog 1 */ + /* if (ctx == 0) return; */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 8), + /* else bpf_map_lookup() and *(fp - 8) = r0 */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + 
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + /* write map_value_ptr_or_null into stack frame of main prog at fp-8 */ + BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 13 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_XDP, + }, + { + "calls: stack init to zero and pruning", + .insns = { + /* first make allocated_stack 16 byte */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, 0), + /* now fork the execution such that the false branch + * of JGT insn will be verified second and it skips zero + * init of fp-8 stack slot. If stack liveness marking + * is missing live_read marks from call map_lookup + * processing then pruning will incorrectly assume + * that fp-8 stack slot was unused in the fall-through + * branch and will accept the program incorrectly + */ + BPF_JMP_IMM(BPF_JGT, BPF_REG_1, 2, 2), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_JMP_IMM(BPF_JA, 0, 0, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 6 }, + .errstr = "invalid indirect read from stack off -8+0 size 8", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_XDP, + }, };
static int probe_filter_length(const struct bpf_insn *fp)
linux-merge@lists.open-mesh.org