Whenever a new VLAN is created on top of batman virtual interfaces the batman-adv kernel module creates internal structures to track the status of said VLAN. Amongst other things, the MAC address of the VLAN interface itself has to be stored.
Without this change, a VLAN and its infrastructure could be created even though the interface MAC address had not been stored, and no error would be triggered, thus creating issues in other parts of the code.
Prevent the VLAN from being created if the MAC address cannot be stored.
Fixes: 952cebb57518 ("batman-adv: add per VLAN interface attribute framework")
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
---
 net/batman-adv/hard-interface.c |   2 +-
 net/batman-adv/soft-interface.c | 105 ++++++++++++++++++++++++--------
 net/batman-adv/soft-interface.h |   3 +-
 3 files changed, 83 insertions(+), 27 deletions(-)
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index c405d15b..0b22cc4d 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -1000,7 +1000,7 @@ static int batadv_hard_if_event(struct notifier_block *this, if (batadv_softif_is_valid(net_dev) && event == NETDEV_REGISTER) { batadv_sysfs_add_meshif(net_dev); bat_priv = netdev_priv(net_dev); - batadv_softif_create_vlan(bat_priv, BATADV_NO_FLAGS); + batadv_softif_create_vlan_late(bat_priv, BATADV_NO_FLAGS); return NOTIFY_DONE; }
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index edeffcb9..728d9d40 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -563,16 +563,36 @@ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, }
/** - * batadv_softif_create_vlan() - allocate the needed resources for a new vlan + * batadv_softif_destroy_vlan() - remove and destroy a softif_vlan object + * @bat_priv: the bat priv with all the soft interface information + * @vlan: the object to remove + */ +static void batadv_softif_destroy_vlan(struct batadv_priv *bat_priv, + struct batadv_softif_vlan *vlan) +{ + /* explicitly remove the associated TT local entry because it is marked + * with the NOPURGE flag + */ + batadv_tt_local_remove(bat_priv, bat_priv->soft_iface->dev_addr, + vlan->vid, "vlan interface destroyed", false); + + batadv_sysfs_del_vlan(bat_priv, vlan); + batadv_softif_vlan_put(vlan); +} + +/** + * batadv_softif_create_vlan_early() - allocate the needed resources for a new + * vlan, defer sysfs creation till later * @bat_priv: the bat priv with all the soft interface information * @vid: the VLAN identifier * * Return: 0 on success, a negative error otherwise. */ -int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) +static int batadv_softif_create_vlan_early(struct batadv_priv *bat_priv, + unsigned short vid) { struct batadv_softif_vlan *vlan; - int err; + bool client_added;
vlan = batadv_softif_vlan_get(bat_priv, vid); if (vlan) { @@ -590,12 +610,6 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid)
atomic_set(&vlan->ap_isolation, 0);
- err = batadv_sysfs_add_vlan(bat_priv->soft_iface, vlan); - if (err) { - kfree(vlan); - return err; - } - spin_lock_bh(&bat_priv->softif_vlan_list_lock); kref_get(&vlan->refcount); hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list); @@ -604,32 +618,63 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) /* add a new TT local entry. This one will be marked with the NOPURGE * flag */ - batadv_tt_local_add(bat_priv->soft_iface, - bat_priv->soft_iface->dev_addr, vid, - BATADV_NULL_IFINDEX, BATADV_NO_MARK); + client_added = batadv_tt_local_add(bat_priv->soft_iface, + bat_priv->soft_iface->dev_addr, vid, + BATADV_NULL_IFINDEX, BATADV_NO_MARK);
/* don't return reference to new softif_vlan */ batadv_softif_vlan_put(vlan);
+ if (!client_added) { + batadv_softif_destroy_vlan(bat_priv, vlan); + return -ENOENT; + } + return 0; }
/** - * batadv_softif_destroy_vlan() - remove and destroy a softif_vlan object + * batadv_softif_create_vlan_late() - complete softif vlan creation with the + * sysfs entries * @bat_priv: the bat priv with all the soft interface information - * @vlan: the object to remove + * @vid: the VLAN identifier + * + * Return: 0 on success, a negative error otherwise. */ -static void batadv_softif_destroy_vlan(struct batadv_priv *bat_priv, - struct batadv_softif_vlan *vlan) +int batadv_softif_create_vlan_late(struct batadv_priv *bat_priv, + unsigned short vid) { - /* explicitly remove the associated TT local entry because it is marked - * with the NOPURGE flag - */ - batadv_tt_local_remove(bat_priv, bat_priv->soft_iface->dev_addr, - vlan->vid, "vlan interface destroyed", false); + struct batadv_softif_vlan *vlan; + int ret; + + vlan = batadv_softif_vlan_get(bat_priv, vid); + if (!vlan) + return -ENOENT; + + ret = batadv_sysfs_add_vlan(bat_priv->soft_iface, vlan);
- batadv_sysfs_del_vlan(bat_priv, vlan); batadv_softif_vlan_put(vlan); + return ret; +} + +/** + * batadv_softif_create_vlan() - allocate the needed resources for a new vlan + * @bat_priv: the bat priv with all the soft interface information + * @vid: the VLAN identifier + * + * Return: 0 on success, a negative error otherwise. + */ +static int batadv_softif_create_vlan(struct batadv_priv *bat_priv, + unsigned short vid) +{ + int err; + + err = batadv_softif_create_vlan_early(bat_priv, vid); + if (err) + return err; + + err = batadv_softif_create_vlan_late(bat_priv, vid); + return err; }
/** @@ -648,6 +693,7 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto, { struct batadv_priv *bat_priv = netdev_priv(dev); struct batadv_softif_vlan *vlan; + bool client_added; int ret;
/* only 802.1Q vlans are supported. @@ -683,9 +729,14 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto, * flag. This must be added again, even if the vlan object already * exists, because the entry was deleted by kill_vid() */ - batadv_tt_local_add(bat_priv->soft_iface, - bat_priv->soft_iface->dev_addr, vid, - BATADV_NULL_IFINDEX, BATADV_NO_MARK); + client_added = batadv_tt_local_add(bat_priv->soft_iface, + bat_priv->soft_iface->dev_addr, vid, + BATADV_NULL_IFINDEX, BATADV_NO_MARK); + + if (!client_added) { + batadv_softif_destroy_vlan(bat_priv, vlan); + return -ENOENT; + }
return 0; } @@ -850,6 +901,10 @@ static int batadv_softif_init_late(struct net_device *dev) if (ret < 0) goto unreg_debugfs;
+ ret = batadv_softif_create_vlan_early(bat_priv, BATADV_NO_FLAGS); + if (ret < 0) + goto unreg_debugfs; + return 0;
unreg_debugfs: diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index daf87f07..b8a9a3c8 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -36,7 +36,8 @@ struct net_device *batadv_softif_create(struct net *net, const char *name); void batadv_softif_destroy_sysfs(struct net_device *soft_iface); bool batadv_softif_is_valid(const struct net_device *net_dev); extern struct rtnl_link_ops batadv_link_ops; -int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid); +int batadv_softif_create_vlan_late(struct batadv_priv *bat_priv, + unsigned short vid); void batadv_softif_vlan_put(struct batadv_softif_vlan *softif_vlan); struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, unsigned short vid);
On Sat, May 12, 2018 at 02:57:23AM +0800, Marek Lindner wrote:
Whenever a new VLAN is created on top of batman virtual interfaces the batman-adv kernel module creates internal structures to track the status of said VLAN. Amongst other things, the MAC address of the VLAN interface itself has to be stored.
Without this change a VLAN and its infrastructure could be created while the interface MAC address is not stored without triggering any error, thus creating issues in other parts of the code.
Prevent the VLAN from being created if the MAC address can not be stored.
Fixes: 952cebb57518 ("batman-adv: add per VLAN interface attribute framework")
Signed-off-by: Marek Lindner mareklindner@neomailbox.ch
I tested this patch but so far could not spot any issues either in dmesg or logread.
I've added these patches to a branch for Gluon here:
https://github.com/T-X/gluon/tree/tt-vlan-patched
And used these images (warning, they have my SSH public key added):
https://metameute.de/~tux/Freifunk/firmware/ffh-tt-patched/
I've tested with an isolated two nodes setup for now.
I started playing with restarting the network multiple times:
~~~~~ root@freifunk-b0487ae7f31e:~# rm /tmp/vlan-test.log; trap '' SIGPIPE; for i in `seq 1 30`; do echo "Starting network restart $i" >> /tmp/vlan-test.log; /etc/init.d/network restart; sleep 5; if batctl tl | grep " 0 \["; then echo "BROKEN - aborting" >> /tmp/vlan-test.log; batctl tl >> /tmp/vlan-test.log; sleep 3; echo "waiting..." >> /tmp/vlan-test.log; batctl tl >> /tmp/vlan-test.log; break; fi; done; echo "finished" >> /tmp/vlan-test.log ~~~~~
And the result is the following - which looks odd?
~~~~~ root@freifunk-b0487ae7f31e:~# cat /tmp/vlan-test.log Starting network restart 1 Starting network restart 2 Starting network restart 3 BROKEN - aborting [B.A.T.M.A.N. adv 2018.1, MainIF/MAC: primary0/66:c6:34:9d:58:43 (bat0/b0:48:7a:e7:f3:1e BATMAN_IV), TTVN: 1] Client VID Flags Last seen (CRC ) 9a:86:17:9c:5f:4f -1 [.P.X..] 0.000 (0x0ce60e81) b0:48:7a:e7:f3:1e 0 [.PN...] 0.000 (0x00000000) b0:48:7a:e7:f3:1e -1 [.PN...] 0.000 (0x0ce60e81) waiting... [B.A.T.M.A.N. adv 2018.1, MainIF/MAC: primary0/66:c6:34:9d:58:43 (bat0/b0:48:7a:e7:f3:1e BATMAN_IV), TTVN: 2] Client VID Flags Last seen (CRC ) b0:48:7a:e7:f3:1e 0 [.P....] 0.000 (0xc4c7d9cf) b0:48:7a:e7:f3:1e -1 [.P....] 0.000 (0x62afdc24) finished ~~~~~
However, this oddity seems to be temporary, now the local TT looks just fine, without having rebooted the node:
~~~~~ root@freifunk-b0487ae7f31e:~# batctl tl [B.A.T.M.A.N. adv 2018.1, MainIF/MAC: primary0/66:c6:34:9d:58:43 (bat0/b0:48:7a:e7:f3:1e BATMAN_IV), TTVN: 4] Client VID Flags Last seen (CRC ) 33:33:ff:40:f8:dc -1 [.P....] 0.000 (0xd118c666) b0:48:7a:e7:f3:1e 0 [.P....] 0.000 (0xc4c7d9cf) 33:33:00:00:00:02 -1 [.P....] 0.000 (0xd118c666) 33:33:ff:00:00:01 -1 [.P....] 0.000 (0xd118c666) 33:33:00:02:10:01 -1 [.P....] 0.000 (0xd118c666) 01:00:5e:00:00:01 -1 [.P....] 0.000 (0xd118c666) b0:48:7a:e7:f3:1e -1 [.P....] 0.000 (0xd118c666) 33:33:ff:e7:f3:1e -1 [.P....] 0.000 (0xd118c666) 33:33:00:00:00:01 -1 [.P....] 0.000 (0xd118c666) ~~~~~
Or is it expected that a TT VLAN entry with an "N" flag will have the CRC set to 0x00000000?
I also noticed that the VLAN 0 is added to bat0 by 8021q right after bat0 gets created and activated:
~~~~~ Sun Feb 25 14:20:28 2018 kern.info kernel: [ 7852.985327] batman_adv: bat0: Adding interface: primary0 Sun Feb 25 14:20:28 2018 kern.info kernel: [ 7852.990712] batman_adv: bat0: Interface activated: primary0 Sun Feb 25 14:20:28 2018 kern.info kernel: [ 7853.025080] 8021q: adding VLAN 0 to HW filter on device bat0 Sun Feb 25 14:20:28 2018 daemon.notice netifd: Interface 'bat0' is enabled Sun Feb 25 14:20:28 2018 kern.info kernel: [ 7853.038815] device bat0 entered promiscuous mode Sun Feb 25 14:20:28 2018 kern.info kernel: [ 7853.043649] br-client: port 3(bat0) entered forwarding state Sun Feb 25 14:20:28 2018 kern.info kernel: [ 7853.049388] br-client: port 3(bat0) entered forwarding state Sun Feb 25 14:20:28 2018 daemon.notice netifd: Network device 'bat0' link is up Sun Feb 25 14:20:28 2018 daemon.notice netifd: Interface 'bat0' has link connectivity Sun Feb 25 14:20:28 2018 daemon.notice netifd: Interface 'bat0' is setting up now Sun Feb 25 14:20:28 2018 daemon.notice netifd: Interface 'bat0' is now up ~~~~~
Which looks like it might have the potential for a race condition? Also the "HW filter" remark by 8021q seems a bit odd as this is a virtual interface, doesn't it?
Regards, Linus
Hi,
see comments below
On 23/05/18 05:12, Linus Lüssing wrote:
On Sat, May 12, 2018 at 02:57:23AM +0800, Marek Lindner wrote:
Whenever a new VLAN is created on top of batman virtual interfaces the batman-adv kernel module creates internal structures to track the status of said VLAN. Amongst other things, the MAC address of the VLAN interface itself has to be stored.
Without this change a VLAN and its infrastructure could be created while the interface MAC address is not stored without triggering any error, thus creating issues in other parts of the code.
Prevent the VLAN from being created if the MAC address can not be stored.
Fixes: 952cebb57518 ("batman-adv: add per VLAN interface attribute framework")
Signed-off-by: Marek Lindner mareklindner@neomailbox.ch
I tested this patch but so far could not spot any issues either in dmesg or logread.
I've added these patches to a branch for Gluon here:
https://github.com/T-X/gluon/tree/tt-vlan-patched
And used these images (warning, they have my SSH public added):
https://metameute.de/~tux/Freifunk/firmware/ffh-tt-patched/
I've tested with an isolated two nodes setup for now.
I started playing with restarting the network multiple times:
root@freifunk-b0487ae7f31e:~# rm /tmp/vlan-test.log; trap '' SIGPIPE; for i in `seq 1 30`; do echo "Starting network restart $i" >> /tmp/vlan-test.log; /etc/init.d/network restart; sleep 5; if batctl tl | grep " 0 \["; then echo "BROKEN - aborting" >> /tmp/vlan-test.log; batctl tl >> /tmp/vlan-test.log; sleep 3; echo "waiting..." >> /tmp/vlan-test.log; batctl tl >> /tmp/vlan-test.log; break; fi; done; echo "finished" >> /tmp/vlan-test.log
And the result is the following - which looks odd?
I don't fully understand the script... You check for $(grep " 0 [") returning success and then print BROKEN? In any case, please continue reading below.
root@freifunk-b0487ae7f31e:~# cat /tmp/vlan-test.log Starting network restart 1 Starting network restart 2 Starting network restart 3 BROKEN - aborting [B.A.T.M.A.N. adv 2018.1, MainIF/MAC: primary0/66:c6:34:9d:58:43 (bat0/b0:48:7a:e7:f3:1e BATMAN_IV), TTVN: 1] Client VID Flags Last seen (CRC ) 9a:86:17:9c:5f:4f -1 [.P.X..] 0.000 (0x0ce60e81) b0:48:7a:e7:f3:1e 0 [.PN...] 0.000 (0x00000000) b0:48:7a:e7:f3:1e -1 [.PN...] 0.000 (0x0ce60e81) waiting... [B.A.T.M.A.N. adv 2018.1, MainIF/MAC: primary0/66:c6:34:9d:58:43 (bat0/b0:48:7a:e7:f3:1e BATMAN_IV), TTVN: 2] Client VID Flags Last seen (CRC ) b0:48:7a:e7:f3:1e 0 [.P....] 0.000 (0xc4c7d9cf) b0:48:7a:e7:f3:1e -1 [.P....] 0.000 (0x62afdc24) finished
However, this oddity seems to be temporary, now the local TT looks just fine, without having rebooted the node:
root@freifunk-b0487ae7f31e:~# batctl tl [B.A.T.M.A.N. adv 2018.1, MainIF/MAC: primary0/66:c6:34:9d:58:43 (bat0/b0:48:7a:e7:f3:1e BATMAN_IV), TTVN: 4] Client VID Flags Last seen (CRC ) 33:33:ff:40:f8:dc -1 [.P....] 0.000 (0xd118c666) b0:48:7a:e7:f3:1e 0 [.P....] 0.000 (0xc4c7d9cf) 33:33:00:00:00:02 -1 [.P....] 0.000 (0xd118c666) 33:33:ff:00:00:01 -1 [.P....] 0.000 (0xd118c666) 33:33:00:02:10:01 -1 [.P....] 0.000 (0xd118c666) 01:00:5e:00:00:01 -1 [.P....] 0.000 (0xd118c666) b0:48:7a:e7:f3:1e -1 [.P....] 0.000 (0xd118c666) 33:33:ff:e7:f3:1e -1 [.P....] 0.000 (0xd118c666) 33:33:00:00:00:01 -1 [.P....] 0.000 (0xd118c666)
Or is it expected that a TT VLAN entry with an "N" flag will have the CRC set to 0x00000000?
Yes. TT entries marked with "N" are "New", which means they are part of the table but have not been "committed" yet and thus not included in the CRC computation. They will be included (and lose the "N" flag) at the next commit upon OGM generation.
I also noticed that the VLAN 0 is added to bat0 by 8021q right after bat0 gets created and activated:
Sun Feb 25 14:20:28 2018 kern.info kernel: [ 7852.985327] batman_adv: bat0: Adding interface: primary0 Sun Feb 25 14:20:28 2018 kern.info kernel: [ 7852.990712] batman_adv: bat0: Interface activated: primary0 Sun Feb 25 14:20:28 2018 kern.info kernel: [ 7853.025080] 8021q: adding VLAN 0 to HW filter on device bat0 Sun Feb 25 14:20:28 2018 daemon.notice netifd: Interface 'bat0' is enabled Sun Feb 25 14:20:28 2018 kern.info kernel: [ 7853.038815] device bat0 entered promiscuous mode Sun Feb 25 14:20:28 2018 kern.info kernel: [ 7853.043649] br-client: port 3(bat0) entered forwarding state Sun Feb 25 14:20:28 2018 kern.info kernel: [ 7853.049388] br-client: port 3(bat0) entered forwarding state Sun Feb 25 14:20:28 2018 daemon.notice netifd: Network device 'bat0' link is up Sun Feb 25 14:20:28 2018 daemon.notice netifd: Interface 'bat0' has link connectivity Sun Feb 25 14:20:28 2018 daemon.notice netifd: Interface 'bat0' is setting up now Sun Feb 25 14:20:28 2018 daemon.notice netifd: Interface 'bat0' is now up
Which looks like it might have the potential for a race condition? Also the "HW filter" remark by 8021q seems a bit odd as this is a virtual interface, doesn't it?
This is not related to batman-adv; it's just an internal VLAN, and I never fully understood why it is created.
What race condition are you talking about?
Cheers,
On Wednesday, 23 May 2018 05:12:14 HKT Linus Lüssing wrote:
I tested this patch but so far could not spot any issues either in dmesg or logread.
I've added these patches to a branch for Gluon here:
https://github.com/T-X/gluon/tree/tt-vlan-patched
And used these images (warning, they have my SSH public added):
Any update on whether this patch has caused any grief? If not, I'd submit it as a real patch.
Cheers, Marek
On Friday, 7 September 2018 12:25:33 CEST Marek Lindner wrote:
On Wednesday, 23 May 2018 05:12:14 HKT Linus Lüssing wrote:
I tested this patch but so far could not spot any issues either in dmesg or logread.
I've added these patches to a branch for Gluon here:
https://github.com/T-X/gluon/tree/tt-vlan-patched
And used these images (warning, they have my SSH public added):
Any update on whether this patch has caused any grief ? If not I'd submit as real patch.
Any updates?
Kind regards, Sven
b.a.t.m.a.n@lists.open-mesh.org