Hi,
Here are two fixes which address potential problems with parallel OGM processing. For the multicast subsystem the issues can be quite severe as found and pointed out by Sven here [0].
Cheers, Linus
[0]: https://lists.open-mesh.org/pipermail/b.a.t.m.a.n/2015-June/013193.html
Bitwise OR/AND assignments in C aren't guaranteed to be atomic. One OGM handler might undo the set/clear of a specific bit from another handler run in between. This can lead to various issues in other code paths, including kernel panics.
Fix this by using the atomic set_bit()/clear_bit() functions.
Fixes: 2b1c07b918d2 ("batman-adv: tvlv - add distributed arp table container") Signed-off-by: Linus Lüssing linus.luessing@c0d3.blue --- distributed-arp-table.c | 4 ++-- multicast.c | 6 +++--- network-coding.c | 4 ++-- translation-table.c | 4 ++-- types.h | 4 ++-- 5 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/distributed-arp-table.c b/distributed-arp-table.c index 0d791dc..b2cc19b 100644 --- a/distributed-arp-table.c +++ b/distributed-arp-table.c @@ -682,9 +682,9 @@ static void batadv_dat_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, uint16_t tvlv_value_len) { if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) - orig->capabilities &= ~BATADV_ORIG_CAPA_HAS_DAT; + clear_bit(BATADV_ORIG_CAPA_HAS_DAT, &orig->capabilities); else - orig->capabilities |= BATADV_ORIG_CAPA_HAS_DAT; + set_bit(BATADV_ORIG_CAPA_HAS_DAT, &orig->capabilities); }
/** diff --git a/multicast.c b/multicast.c index 09f2838..00612bf 100644 --- a/multicast.c +++ b/multicast.c @@ -684,7 +684,7 @@ static void batadv_mcast_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, !(orig->capabilities & BATADV_ORIG_CAPA_HAS_MCAST)) { if (orig_initialized) atomic_dec(&bat_priv->mcast.num_disabled); - orig->capabilities |= BATADV_ORIG_CAPA_HAS_MCAST; + set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities); /* If mcast support is being switched off or if this is an initial * OGM without mcast support then increase the disabled mcast * node counter. @@ -693,10 +693,10 @@ static void batadv_mcast_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, (orig->capabilities & BATADV_ORIG_CAPA_HAS_MCAST || !orig_initialized)) { atomic_inc(&bat_priv->mcast.num_disabled); - orig->capabilities &= ~BATADV_ORIG_CAPA_HAS_MCAST; + clear_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities); }
- orig->capa_initialized |= BATADV_ORIG_CAPA_HAS_MCAST; + set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capa_initialized);
if (orig_mcast_enabled && tvlv_value && (tvlv_value_len >= sizeof(mcast_flags))) diff --git a/network-coding.c b/network-coding.c index 89e1d47..3ce493e 100644 --- a/network-coding.c +++ b/network-coding.c @@ -105,9 +105,9 @@ static void batadv_nc_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, uint16_t tvlv_value_len) { if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) - orig->capabilities &= ~BATADV_ORIG_CAPA_HAS_NC; + clear_bit(BATADV_ORIG_CAPA_HAS_NC, &orig->capabilities); else - orig->capabilities |= BATADV_ORIG_CAPA_HAS_NC; + set_bit(BATADV_ORIG_CAPA_HAS_NC, &orig->capabilities); }
/** diff --git a/translation-table.c b/translation-table.c index b098e53..e95a424 100644 --- a/translation-table.c +++ b/translation-table.c @@ -1843,7 +1843,7 @@ void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, } spin_unlock_bh(list_lock); } - orig_node->capa_initialized &= ~BATADV_ORIG_CAPA_HAS_TT; + clear_bit(BATADV_ORIG_CAPA_HAS_TT, &orig_node->capa_initialized); }
static bool batadv_tt_global_to_purge(struct batadv_tt_global_entry *tt_global, @@ -2802,7 +2802,7 @@ static void _batadv_tt_update_changes(struct batadv_priv *bat_priv, return; } } - orig_node->capa_initialized |= BATADV_ORIG_CAPA_HAS_TT; + set_bit(BATADV_ORIG_CAPA_HAS_TT, &orig_node->capa_initialized); }
static void batadv_tt_fill_gtable(struct batadv_priv *bat_priv, diff --git a/types.h b/types.h index 28f2461..c6ec558 100644 --- a/types.h +++ b/types.h @@ -256,8 +256,8 @@ struct batadv_orig_node { struct hlist_node mcast_want_all_ipv4_node; struct hlist_node mcast_want_all_ipv6_node; #endif - uint8_t capabilities; - uint8_t capa_initialized; + unsigned long capabilities; + unsigned long capa_initialized; atomic_t last_ttvn; unsigned char *tt_buff; int16_t tt_buff_len;
On Monday, June 15, 2015 08:22:24 Linus Lüssing wrote:
Fix this by using the atomic set_bit()/clear_bit() functions.
Fixes: 2b1c07b918d2 ("batman-adv: tvlv - add distributed arp table container") Signed-off-by: Linus Lüssing linus.luessing@c0d3.blue
If I am not mistaken the referenced patch only touches the DAT capabilities. You should send separate patches for each regression.
Cheers, Marek
So far the mcast tvlv handler did not anticipate the processing of multiple incoming OGMs from the same originator at the same time. This can lead to various issues:
* Broken refcounting: For instance two mcast handlers might both assume that an originator just got multicast capabilities and will together wrongly decrease mcast.num_disabled by two, potentially leading to an integer underflow.
* Potential kernel panic on hlist_del_rcu(): Two mcast handlers might one after another try to do an hlist_del_rcu(&orig->mcast_want_all_*_node). The second one will cause memory corruption / crashes. (Reported by: Sven Eckelmann sven@narfation.org)
Right in the beginning the code path makes assumptions about the current multicast related state of an originator and bases all updates on that. The easiest and least error prune way to fix the issues in this case is to serialize multiple mcast handler invocations with a spinlock.
Fixes: 77ec494490d6 ("batman-adv: Announce new capability via multicast TVLV") Signed-off-by: Linus Lüssing linus.luessing@c0d3.blue --- multicast.c | 2 ++ originator.c | 1 + types.h | 2 ++ 3 files changed, 5 insertions(+)
diff --git a/multicast.c b/multicast.c index 00612bf..e6db978 100644 --- a/multicast.c +++ b/multicast.c @@ -674,6 +674,7 @@ static void batadv_mcast_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, uint8_t mcast_flags = BATADV_NO_FLAGS; bool orig_initialized;
+ spin_lock_bh(&orig->mcast_handler_lock); orig_initialized = orig->capa_initialized & BATADV_ORIG_CAPA_HAS_MCAST;
/* If mcast support is turned on decrease the disabled mcast node @@ -707,6 +708,7 @@ static void batadv_mcast_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, batadv_mcast_want_ipv6_update(bat_priv, orig, mcast_flags);
orig->mcast_flags = mcast_flags; + spin_unlock_bh(&orig->mcast_handler_lock); }
/** diff --git a/originator.c b/originator.c index e3900e4..78c027d 100644 --- a/originator.c +++ b/originator.c @@ -663,6 +663,7 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, spin_lock_init(&orig_node->tt_buff_lock); spin_lock_init(&orig_node->tt_lock); spin_lock_init(&orig_node->vlan_list_lock); + spin_lock_init(&orig_node->mcast_handler_lock);
batadv_nc_init_orig(orig_node);
diff --git a/types.h b/types.h index c6ec558..1d800c5 100644 --- a/types.h +++ b/types.h @@ -251,6 +251,8 @@ struct batadv_orig_node { unsigned long last_seen; unsigned long bcast_seqno_reset; #ifdef CONFIG_BATMAN_ADV_MCAST + /* synchronizes mcast tvlv specific orig changes */ + spinlock_t mcast_handler_lock; uint8_t mcast_flags; struct hlist_node mcast_want_all_unsnoopables_node; struct hlist_node mcast_want_all_ipv4_node;
On Monday, June 15, 2015 08:22:25 Linus Lüssing wrote:
So far the mcast tvlv handler did not anticipate the processing of multiple incoming OGMs from the same originator at the same time. This can lead to various issues:
Broken refcounting: For instance two mcast handlers might both assume that an originator just got multicast capabilities and will together wrongly decrease mcast.num_disabled by two, potentially leading to an integer underflow.
Potential kernel panic on hlist_del_rcu(): Two mcast handlers might one after another try to do an hlist_del_rcu(&orig->mcast_want_all_*_node). The second one will cause memory corruption / crashes. (Reported by: Sven Eckelmann sven@narfation.org)
As far as I can tell from looking at the code your patch does not address the issue raised by Sven.
The first problem is that the mcast code calls hlist_del_rcu() without verifying whether or not the element is still in the list. Adding a spinlock is not going to change that. You can still have a purge event going on while we just receive a new OGM because the purge caller does not need to hold the newly added lock.
Cheers, Marek
b.a.t.m.a.n@lists.open-mesh.org