Reference counting is used to ensure that batadv_hardif_neigh_node and batadv_hard_iface are not freed before/during batadv_v_elp_throughput_metric_update work is finished.
But there isn't a guarantee that the hard if will remain associated with a soft interface up until the work is finished.
This fixes a crash triggered by reboot that looks like this:
Call trace: batadv_v_mesh_free+0xd0/0x4dc [batman_adv] batadv_v_elp_throughput_metric_update+0x1c/0xa4 process_one_work+0x178/0x398 worker_thread+0x2e8/0x4d0 kthread+0xd8/0xdc ret_from_fork+0x10/0x20
(the batadv_v_mesh_free call is misleading, and does not actually happen)
I was able to make the issue happen more reliably by changing hardif_neigh->bat_v.metric_work work to be delayed work. This allowed me to track down and confirm the fix.
Signed-off-by: Andy Strohman andrew@andrewstrohman.com --- net/batman-adv/bat_v_elp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 1d704574..7daaad9c 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -140,7 +140,7 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) }
default_throughput: - if (!(hard_iface->bat_v.flags & BATADV_WARNING_DEFAULT)) { + if (!(hard_iface->bat_v.flags & BATADV_WARNING_DEFAULT) && hard_iface->soft_iface) { batadv_info(hard_iface->soft_iface, "WiFi driver or ethtool info does not provide information about link speeds on interface %s, therefore defaulting to hardcoded throughput values of %u.%1u Mbps. Consider overriding the throughput manually or checking your driver.\n", hard_iface->net_dev->name,
On Thursday, 9 January 2025 03:27:56 CET Andy Strohman wrote:
Reference counting is used to ensure that batadv_hardif_neigh_node and batadv_hard_iface are not freed before/during batadv_v_elp_throughput_metric_update work is finished.
But there isn't a guarantee that the hard if will remain associated with a soft interface up until the work is finished.
This fixes a crash triggered by reboot that looks like this:
Call trace: batadv_v_mesh_free+0xd0/0x4dc [batman_adv] batadv_v_elp_throughput_metric_update+0x1c/0xa4 process_one_work+0x178/0x398 worker_thread+0x2e8/0x4d0 kthread+0xd8/0xdc ret_from_fork+0x10/0x20
(the batadv_v_mesh_free call is misleading, and does not actually happen)
I am not 100% sure how you build batman-adv but when you've used the external kernel module then you can use [1,2]:
make EXTRA_CFLAGS="-fno-inline -Og -fno-optimize-sibling-calls -fno-reorder-blocks -fno-ipa-cp-clone -fno-partial-inlining" KERNELPATH=...
to get actually useful backtraces. Unfortunately, compile time checks sometimes need inlining and compilations fails or some kernel configurations with '-fno-inline'. If this happens to you then you can at least try to use the rest of the extra flags.
[1] https://www.open-mesh.org/projects/devtools/wiki/Kernel_hacking_Debian_image... [2] https://www.open-mesh.org/projects/devtools/wiki/Kernel_debugging_with_kgdb#...
I was able to make the issue happen more reliably by changing hardif_neigh->bat_v.metric_work work to be delayed work. This allowed me to track down and confirm the fix.
Signed-off-by: Andy Strohman andrew@andrewstrohman.com
Please add before your Signed-off-by line following extra line:
Fixes: 5c3245172c01 ("batman-adv: ELP - compute the metric based on the estimated throughput")
net/batman-adv/bat_v_elp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 1d704574..7daaad9c 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -140,7 +140,7 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) }
default_throughput:
- if (!(hard_iface->bat_v.flags & BATADV_WARNING_DEFAULT)) {
- if (!(hard_iface->bat_v.flags & BATADV_WARNING_DEFAULT) && hard_iface->soft_iface) { batadv_info(hard_iface->soft_iface, "WiFi driver or ethtool info does not provide information about link speeds on interface %s, therefore defaulting to hardcoded throughput values of %u.%1u Mbps. Consider overriding the throughput manually or checking your driver.\n", hard_iface->net_dev->name,
I would prefer something more explanatory instead of adding more conditions at the end of actually interesting checks. Something more like:
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 1d704574..185b063f 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -66,12 +66,19 @@ static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface) static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) { struct batadv_hard_iface *hard_iface = neigh->if_incoming; + struct net_device *soft_iface = hard_iface->soft_iface; struct ethtool_link_ksettings link_settings; struct net_device *real_netdev; struct station_info sinfo; u32 throughput; int ret;
+ /* don't query throughput when no longer associated with any + * batman-adv interface + */ + if (!soft_iface) + return BATADV_THROUGHPUT_DEFAULT_VALUE; + /* if the user specified a customised value for this interface, then * return it directly */ @@ -141,7 +148,7 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh)
default_throughput: if (!(hard_iface->bat_v.flags & BATADV_WARNING_DEFAULT)) { - batadv_info(hard_iface->soft_iface, + batadv_info(soft_iface, "WiFi driver or ethtool info does not provide information about link speeds on interface %s, therefore defaulting to hardcoded throughput values of %u.%1u Mbps. Consider overriding the throughput manually or checking your driver.\n", hard_iface->net_dev->name, BATADV_THROUGHPUT_DEFAULT_VALUE / 10,
Hi Sven,
Thanks for the review.
I am not 100% sure how you build batman-adv but when you've used the external kernel module then you can use [1,2]:
make EXTRA_CFLAGS="-fno-inline -Og -fno-optimize-sibling-calls -fno-reorder-blocks -fno-ipa-cp-clone -fno-partial-inlining" KERNELPATH=...
to get actually useful backtraces. Unfortunately, compile time checks sometimes need inlining and compilations fails or some kernel configurations with '-fno-inline'. If this happens to you then you can at least try to use the rest of the extra flags.
I'm using openwrt. I'll give this a shot.
Thanks for those links. I completely overlooked the devtools section of the wiki. There is a lot of good information there. Thanks for writing it all up.
Please add before your Signed-off-by line following extra line:
Fixes: 5c3245172c01 ("batman-adv: ELP - compute the metric based on the estimated throughput")
Will do in v2.
I would prefer something more explanatory instead of adding more conditions at the end of actually interesting checks. Something more like:
I agree that this is better than my original idea.
But this got me thinking about the unlikely scenario where the soft if netdevice is destroyed before this work is run.
Do you think that's possible? I can try to add delay in the work to get this contrived sequence to happen.
If this is a problem, I think grabbing RTNL before the check of the soft inerface and releasing RTNL after calling batadv_info() would fix it.
What do you think?
On Thursday, 9 January 2025 11:10:40 CET Andrew Strohman wrote:
But this got me thinking about the unlikely scenario where the soft if netdevice is destroyed before this work is run.
I would prefer when you would call cancel_work_sync when metric stuff should be stopped. I was expecting to see this somewhere around batadv_v_elp_iface_disable after the cancel_work_sync but it seems like it is missing there (or in a similar place)
void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface) { cancel_delayed_work_sync(&hard_iface->bat_v.elp_wq); + cancel_work_sync(&hard_iface->bat_v.metric_work);
dev_kfree_skb(hard_iface->bat_v.elp_skb); hard_iface->bat_v.elp_skb = NULL; }
The Fixes line for this patch would be:
Fixes: 5c3245172c01 ("batman-adv: ELP - compute the metric based on the estimated throughput")
Kind regards, Sven
I would prefer when you would call cancel_work_sync when metric stuff should be stopped. I was expecting to see this somewhere around batadv_v_elp_iface_disable after the cancel_work_sync but it seems like it is missing there (or in a similar place)
I tried this:
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 1d704574..b35ded79 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -387,8 +387,20 @@ int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface) */ void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface) { + struct batadv_hardif_neigh_node *hardif_neigh; + cancel_delayed_work_sync(&hard_iface->bat_v.elp_wq);
+ rcu_read_lock(); + hlist_for_each_entry_rcu(hardif_neigh, + &hard_iface->neigh_list, list) { + if (!kref_get_unless_zero(&hardif_neigh->refcount)) + continue; + cancel_work_sync(&hardif_neigh->bat_v.metric_work); + batadv_hardif_neigh_put(hardif_neigh); + } + rcu_read_unlock(); + dev_kfree_skb(hard_iface->bat_v.elp_skb); hard_iface->bat_v.elp_skb = NULL; }
But it seems to cause a hang on reboot every once in a while. When the hang happens, I'm not able to trigger sysrq over serial.
I can try kgdb, but that requires sysrq to work, so I'm not sure how I can gain control after the machine becomes unresponsive.
I'm not sure why there isn't a watchdog bite when this happens.
Andy
Hi Andrew,
On Fri, Jan 10, 2025 at 01:02:19AM -0800, Andrew Strohman wrote:
I would prefer when you would call cancel_work_sync when metric stuff should be stopped. I was expecting to see this somewhere around batadv_v_elp_iface_disable after the cancel_work_sync but it seems like it is missing there (or in a similar place)
I tried this:
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 1d704574..b35ded79 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -387,8 +387,20 @@ int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface) */ void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface) {
struct batadv_hardif_neigh_node *hardif_neigh;
cancel_delayed_work_sync(&hard_iface->bat_v.elp_wq);
rcu_read_lock();
hlist_for_each_entry_rcu(hardif_neigh,
&hard_iface->neigh_list, list) {
if (!kref_get_unless_zero(&hardif_neigh->refcount))
continue;
cancel_work_sync(&hardif_neigh->bat_v.metric_work);
batadv_hardif_neigh_put(hardif_neigh);
}
rcu_read_unlock();
dev_kfree_skb(hard_iface->bat_v.elp_skb); hard_iface->bat_v.elp_skb = NULL;
}
But it seems to cause a hang on reboot every once in a while. When the hang happens, I'm not able to trigger sysrq over serial.
Quickly looking at that I think that metric_work may need to sleep so calling cancel_work_sync() on this work does not seem safe while in rcu protected context.
I would try to frame the cancel_work_sync() call with rcu_read_unlock()/rcu_read_lock() as below:
rcu_read_unlock(); cancel_work_sync(...); rcu_read_lock(); batadv_hardif_neigh_put(...);
But be careful as batadv_hardif_neigh_put() could modify the list you are currently traversing. At first glance it seems safe to realease rcu read constraint to call cancel_work_sync() as long as you take it back before calling batadv_hardif_neigh_put(), but this could need more though on that.
Hi Remi,
Thanks for your thoughts.
Quickly looking at that I think that metric_work may need to sleep so calling cancel_work_sync() on this work does not seem safe while in rcu protected context.
Thanks, you're right. I'm not supposed to call cancel_work_sync() unless it's OK to sleep.
I would try to frame the cancel_work_sync() call with rcu_read_unlock()/rcu_read_lock() as below:
rcu_read_unlock(); cancel_work_sync(...); rcu_read_lock(); batadv_hardif_neigh_put(...);
Despite the necessity to not sleep while in the RCU read side critical section, this was not the cause of the hang.
I enabled hung task detection and was able see the call stacks of all the tasks during the hang, which made the issue clear. batadv_v_elp_iface_disable() is called with the RTNL lock held (via setlink). Calling cancel_work_sync() will wait on batadv_v_elp_throughput_metric_update() to complete, but batadv_v_elp_throughput_metric_update() calls batadv_v_elp_get_throughput(), which acquires the RTNL lock. So this is why I have deadlock: the work cannot move forward until the RTNL is released, but the caller of cancel_work_sync() holds the lock and won't release it until the work finishes.
Sven, does this change your opinion about how this issue should be handled? I'm not sure yet if it's safe to temporarily release RTNL lock during the duration of the cancel_work_sync() call.
But be careful as batadv_hardif_neigh_put() could modify the list you are currently traversing.
Yes, batadv_hardif_neigh_put() can cause the reference count to go to zero. If so, batadv_hardif_neigh_release() will be called, but it frees the memory allocated for the neighbor with kfree_rcu(). Since the suggestion was to acquire the RCU read lock before calling batadv_hardif_neigh_put, I don't anticipate a use-after-free outcome.
At first glance it seems safe to realease rcu read constraint to call cancel_work_sync() as long as you take it back before calling batadv_hardif_neigh_put(), but this could need more though on that.
Do you happen to know where I can find an example of this usage pattern?
I wonder if unlocking and then re-locking will invalidate this [1] guarantee:
"The reader is guaranteed to see all of the elements which were added to the list before they acquired the rcu_read_lock() and are still on the list when they drop the rcu_read_unlock()."
From testing, I don't see any issue when trying your suggestion, but it would be reassuring to see an example of this elsewhere in the code.
[1] https://docs.kernel.org/RCU/listRCU.html
Thanks,
Andy
I just received a notification from patchwork that this patch was accepted.
This version of the patch does not address Sven's comments.
This first version also does not address the concern that I found after submitting the patch, which is that the soft interface netdev can be destroyed before this work finishes. This less likely race can lead to a similar problem, so I don't feel like this is the right solution.
Please withdraw this patch.
I've decided to move on to other things, so if someone else wants to pick this up, please feel free.
Thanks,
Andy
On Monday, 13 January 2025 08:35:05 GMT+1 Andrew Strohman wrote:
Sven, does this change your opinion about how this issue should be handled? I'm not sure yet if it's safe to temporarily release RTNL lock during the duration of the cancel_work_sync() call.
No, it isn't
I understand my fault in thinking that it is safe to call cancel_work_sync in exactly this place. But you must still call this function for every bat_v.metric_work to ensure that this work is not running when the interface gets stopped.
Directly in batadv_v_elp_iface_disable was wrong because of the rtnl locking problem (by batadv_v_elp_get_throughput which requires it before calling __ethtool_get_link_ksettings). Which is in conflict with the backtrace:
#0 batadv_v_elp_iface_disable (hard_iface=hard_iface@entry=0xffff888009515800) at ../batman-adv/net/batman-adv/bat_v_elp.c:393 #1 0xffffffffa000a42e in batadv_v_iface_disable (hard_iface=0xffff888009515800) at ../batman-adv/net/batman-adv/bat_v.c:82 #2 0xffffffffa0028b7e in batadv_hardif_disable_interface (hard_iface=hard_iface@entry=0xffff888009515800) at ../batman-adv/net/batman-adv/hard-interface.c:848 #3 0xffffffffa0046dc0 in batadv_softif_destroy_netlink (soft_iface=0xffff88800c500000, head=0xffff88800b34fc90) at ../batman-adv/net/batman-adv/soft-interface.c:1125 #4 0xffffffff8216deaf in __rtnl_kill_links (net=0xffffffff84ff2c00 <init_net>, ops=<optimized out>) at net/core/rtnetlink.c:486 #5 __rtnl_link_unregister (ops=ops@entry=0xffffffffa006b4e0 <batadv_link_ops>) at net/core/rtnetlink.c:504 #6 0xffffffff82190442 in rtnl_link_unregister (ops=<optimized out>) at net/core/rtnetlink.c:541
batadv_softif_destroy_netlink is called with rtnl_lock held as you correctly pointed out. Causing the conflict.
In this case, we need to delay the call to cancel_work_sync in another context. For example, batadv_hardif_neigh_release is responsible to clean up resources and at the end is just calling kfree_rcu because it assumes that there is no pending work anywhere. If we want to change this, we unfortunately don't have access to the bat_algo anymore. And of course, it is also wrong because you are not allowed to sleep inside inside a call_rcu callback. So, something like this will also not work:
diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c index 4eee53d19eb047bea986184498a3ef98ed6c84df..84b20640861b3e2df5dbf2906e2374b49670201b 100644 --- a/net/batman-adv/bat_algo.c +++ b/net/batman-adv/bat_algo.c @@ -77,6 +77,7 @@ int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops) !bat_algo_ops->iface.update_mac || !bat_algo_ops->iface.primary_set || !bat_algo_ops->neigh.cmp || + !bat_algo_ops->neigh.hardif_init || !bat_algo_ops->neigh.is_similar_or_better) { pr_info("Routing algo '%s' does not implement required ops\n", bat_algo_ops->name); diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 07ae5dd1f150b062e97897b269d45b041e8b4dfe..6356ac0ac761b8a2e62fd4d5e1ae7319ab24db85 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -2497,6 +2497,18 @@ static void batadv_iv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb, cb->args[0] = idx_skip; }
+static void +batadv_iv_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh) +{ +#ifdef CONFIG_BATMAN_ADV_BATMAN_V + /* initialize the metric work to ensure that batadv_hardif_neigh_release + * can cancel the pending work independent of the used routing + * algorithm implementation + */ + INIT_WORK(&hardif_neigh->bat_v.metric_work, NULL); +#endif +} + static struct batadv_algo_ops batadv_batman_iv __read_mostly = { .name = "BATMAN_IV", .iface = { @@ -2507,6 +2519,7 @@ static struct batadv_algo_ops batadv_batman_iv __read_mostly = { .primary_set = batadv_iv_ogm_primary_iface_set, }, .neigh = { + .hardif_init = batadv_iv_hardif_neigh_init, .cmp = batadv_iv_ogm_neigh_cmp, .is_similar_or_better = batadv_iv_ogm_neigh_is_sob, .dump = batadv_iv_ogm_neigh_dump, diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index bcc2e20e0cd6c7077f3d148f8c77b0947561ed3d..51b19c2782fb780aeb160d9054f5b370fd1f3fe0 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -258,6 +258,23 @@ void batadv_neigh_ifinfo_release(struct kref *ref) kfree_rcu(neigh_ifinfo, rcu); }
+/** + * batadv_hardif_neigh_free_rcu() - free the hardif_neigh + * @rcu: rcu pointer of the hardif_neigh + */ +static void batadv_hardif_neigh_free_rcu(struct rcu_head *rcu) +{ + struct batadv_hardif_neigh_node *hardif_neigh; + + hardif_neigh = container_of(rcu, struct batadv_hardif_neigh_node, rcu); + +#ifdef CONFIG_BATMAN_ADV_BATMAN_V + cancel_work_sync(&hardif_neigh->bat_v.metric_work); +#endif + + kfree(hardif_neigh); +} + /** * batadv_hardif_neigh_release() - release hardif neigh node from lists and * queue for free after rcu grace period @@ -275,7 +292,7 @@ void batadv_hardif_neigh_release(struct kref *ref) spin_unlock_bh(&hardif_neigh->if_incoming->neigh_list_lock);
batadv_hardif_put(hardif_neigh->if_incoming); - kfree_rcu(hardif_neigh, rcu); + call_rcu(&hardif_neigh->rcu, batadv_hardif_neigh_free_rcu); }
/** @@ -590,8 +607,7 @@ batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface,
kref_init(&hardif_neigh->refcount);
- if (bat_priv->algo_ops->neigh.hardif_init) - bat_priv->algo_ops->neigh.hardif_init(hardif_neigh); + bat_priv->algo_ops->neigh.hardif_init(hardif_neigh);
hlist_add_head_rcu(&hardif_neigh->list, &hard_iface->neigh_list);
At this point, it starts to become complicated. For example by using a two phase deletion. The first part cleans the main resources and adds it to a free list. This free list is then cleaned up in a better context. But doing something like requires a major rework of the codebase
Attempting to work around the rtnl_lock by trylocking, sounded "good" when thinking about it:
diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c index 4eee53d19eb047bea986184498a3ef98ed6c84df..84b20640861b3e2df5dbf2906e2374b49670201b 100644 --- a/net/batman-adv/bat_algo.c +++ b/net/batman-adv/bat_algo.c @@ -77,6 +77,7 @@ int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops) !bat_algo_ops->iface.update_mac || !bat_algo_ops->iface.primary_set || !bat_algo_ops->neigh.cmp || + !bat_algo_ops->neigh.hardif_init || !bat_algo_ops->neigh.is_similar_or_better) { pr_info("Routing algo '%s' does not implement required ops\n", bat_algo_ops->name); diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 07ae5dd1f150b062e97897b269d45b041e8b4dfe..6356ac0ac761b8a2e62fd4d5e1ae7319ab24db85 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -2497,6 +2497,18 @@ static void batadv_iv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb, cb->args[0] = idx_skip; }
+static void +batadv_iv_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh) +{ +#ifdef CONFIG_BATMAN_ADV_BATMAN_V + /* initialize the metric work to ensure that batadv_hardif_neigh_release + * can cancel the pending work independent of the used routing + * algorithm implementation + */ + INIT_WORK(&hardif_neigh->bat_v.metric_work, NULL); +#endif +} + static struct batadv_algo_ops batadv_batman_iv __read_mostly = { .name = "BATMAN_IV", .iface = { @@ -2507,6 +2519,7 @@ static struct batadv_algo_ops batadv_batman_iv __read_mostly = { .primary_set = batadv_iv_ogm_primary_iface_set, }, .neigh = { + .hardif_init = batadv_iv_hardif_neigh_init, .cmp = batadv_iv_ogm_neigh_cmp, .is_similar_or_better = batadv_iv_ogm_neigh_is_sob, .dump = batadv_iv_ogm_neigh_dump, diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 03d5d19306b5b1399523397e7b443ec563766fa0..61b3edd4bb1631e3be1102863c47eab1a69a5400 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -131,9 +131,7 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) /* if not a wifi interface, check if this device provides data via * ethtool (e.g. an Ethernet adapter) */ - rtnl_lock(); ret = __ethtool_get_link_ksettings(hard_iface->net_dev, &link_settings); - rtnl_unlock(); if (ret == 0) { /* link characteristics might change over time */ if (link_settings.base.duplex == DUPLEX_FULL) @@ -175,8 +173,14 @@ void batadv_v_elp_throughput_metric_update(struct work_struct *work) neigh = container_of(neigh_bat_v, struct batadv_hardif_neigh_node, bat_v);
- ewma_throughput_add(&neigh->bat_v.throughput, - batadv_v_elp_get_throughput(neigh)); + if (!rtnl_trylock()) { + queue_work(batadv_event_workqueue, work); + return; + } + + ewma_throughput_add(&neigh->bat_v.throughput, + batadv_v_elp_get_throughput(neigh)); + rtnl_unlock();
/* decrement refcounter to balance increment performed before scheduling * this task diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index bcc2e20e0cd6c7077f3d148f8c77b0947561ed3d..eb0ddaade08f876ea079dd91489e15145e00f246 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -270,6 +270,10 @@ void batadv_hardif_neigh_release(struct kref *ref) hardif_neigh = container_of(ref, struct batadv_hardif_neigh_node, refcount);
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V + cancel_work_sync(&hardif_neigh->bat_v.metric_work); +#endif + spin_lock_bh(&hardif_neigh->if_incoming->neigh_list_lock); hlist_del_init_rcu(&hardif_neigh->list); spin_unlock_bh(&hardif_neigh->if_incoming->neigh_list_lock); @@ -590,8 +594,7 @@ batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface,
kref_init(&hardif_neigh->refcount);
- if (bat_priv->algo_ops->neigh.hardif_init) - bat_priv->algo_ops->neigh.hardif_init(hardif_neigh); + bat_priv->algo_ops->neigh.hardif_init(hardif_neigh);
hlist_add_head_rcu(&hardif_neigh->list, &hard_iface->neigh_list);
But I directly get a "possible recursive locking detected":
[ 12.495682][ T299] ============================================ [ 12.496199][ T299] WARNING: possible recursive locking detected [ 12.496610][ T299] 6.12.0 #1 Tainted: G O [ 12.496932][ T299] -------------------------------------------- [ 12.497419][ T299] kworker/u8:3/299 is trying to acquire lock: [ 12.497810][ T299] ffff88800a50ac68 ((work_completion)(&(&hardif_neigh->bat_v.metric_work)->work)){+.+.}-{0:0}, at: start_flush_work+0x3b7/0x9b0 [ 12.498541][ T299] [ 12.498541][ T299] but task is already holding lock: [ 12.498961][ T299] ffff88800d417d80 ((work_completion)(&(&hardif_neigh->bat_v.metric_work)->work)){+.+.}-{0:0}, at: process_one_work+0x7fc/0x15a0 [ 12.499698][ T299] [ 12.499698][ T299] other info that might help us debug this: [ 12.500123][ T299] Possible unsafe locking scenario: [ 12.500123][ T299] [ 12.500539][ T299] CPU0 [ 12.500760][ T299] ---- [ 12.500976][ T299] lock((work_completion)(&(&hardif_neigh->bat_v.metric_work)->work)); [ 12.501391][ T299] lock((work_completion)(&(&hardif_neigh->bat_v.metric_work)->work)); [ 12.501843][ T299] [ 12.501843][ T299] *** DEADLOCK *** [ 12.501843][ T299] [ 12.502256][ T299] May be due to missing lock nesting notation [ 12.502256][ T299] [ 12.502679][ T299] 3 locks held by kworker/u8:3/299: [ 12.502967][ T299] #0: ffff88800a6b8958 ((wq_completion)bat_events){+.+.}-{0:0}, at: process_one_work+0xd04/0x15a0 [ 12.503535][ T299] #1: ffff88800d417d80 ((work_completion)(&(&hardif_neigh->bat_v.metric_work)->work)){+.+.}-{0:0}, at: process_one_work+0x7fc/0x15a0 [ 12.504341][ T299] #2: ffffffff83608000 (rcu_read_lock){....}-{1:2}, at: start_flush_work+0x2d/0x9b0
And it is getting too late to look at this in more detail. But it seems like the relevant batadv_hardif_neigh_put happened inside batadv_v_elp_throughput_metric_update. And of course then we would try to cancel the work when still in the work. And this too obvious in hindsight.
I would therefore prefer to have the worker inside the hard_iface and cancel it there. The worker must then iterate over the batadv_hardif_neigh_node. Otherwise, iterating over the rcu list in batadv_v_elp_throughput_metric_update would potentially miss entries. But at the same time, you would then potentially sleep in rcu_read_lock() - which is not allowed. See (batadv_v_elp_periodic_work for details).
To work around this problem, we could do something like this in batadv_v_elp_periodic_work:
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index ac11f1f0..d35479c4 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -113,8 +113,6 @@ static void batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh) { ewma_throughput_init(&hardif_neigh->bat_v.throughput); - INIT_WORK(&hardif_neigh->bat_v.metric_work, - batadv_v_elp_throughput_metric_update); }
/** diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index fbf499bc..a752f564 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -131,9 +131,13 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) /* if not a wifi interface, check if this device provides data via * ethtool (e.g. an Ethernet adapter) */ - rtnl_lock(); - ret = __ethtool_get_link_ksettings(hard_iface->net_dev, &link_settings); - rtnl_unlock(); + if (!rtnl_trylock()) { + // TODO would be better not to safe the default value in this case + ret = -EBUSY; + } else { + ret = __ethtool_get_link_ksettings(hard_iface->net_dev, &link_settings); + rtnl_unlock(); + } if (ret == 0) { /* link characteristics might change over time */ if (link_settings.base.duplex == DUPLEX_FULL) @@ -165,23 +169,10 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) * metric of a single hop neighbour * @work: the work queue item */ -void batadv_v_elp_throughput_metric_update(struct work_struct *work) +static void batadv_v_elp_throughput_metric_update(struct batadv_hardif_neigh_node *neigh) { - struct batadv_hardif_neigh_node_bat_v *neigh_bat_v; - struct batadv_hardif_neigh_node *neigh; - - neigh_bat_v = container_of(work, struct batadv_hardif_neigh_node_bat_v, - metric_work); - neigh = container_of(neigh_bat_v, struct batadv_hardif_neigh_node, - bat_v); - ewma_throughput_add(&neigh->bat_v.throughput, batadv_v_elp_get_throughput(neigh)); - - /* decrement refcounter to balance increment performed before scheduling - * this task - */ - batadv_hardif_neigh_put(neigh); }
/** @@ -247,6 +238,11 @@ batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh) return true; }
+struct batadv_v_metric_queue_entry { + struct batadv_hardif_neigh_node *hardif_neigh; + struct list_head list; +}; + /** * batadv_v_elp_periodic_work() - ELP periodic task per interface * @work: work queue item @@ -255,14 +251,16 @@ batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh) */ static void batadv_v_elp_periodic_work(struct work_struct *work) { + struct batadv_v_metric_queue_entry *metric_entry; + struct batadv_v_metric_queue_entry *metric_safe; struct batadv_hardif_neigh_node *hardif_neigh; struct batadv_hard_iface *hard_iface; struct batadv_hard_iface_bat_v *bat_v; struct batadv_elp_packet *elp_packet; + struct list_head metric_queue; struct batadv_priv *bat_priv; struct sk_buff *skb; u32 elp_interval; - bool ret;
bat_v = container_of(work, struct batadv_hard_iface_bat_v, elp_wq.work); hard_iface = container_of(bat_v, struct batadv_hard_iface, bat_v); @@ -298,6 +296,8 @@ static void batadv_v_elp_periodic_work(struct work_struct *work)
atomic_inc(&hard_iface->bat_v.elp_seqno);
+ INIT_LIST_HEAD(&metric_queue); + /* The throughput metric is updated on each sent packet. This way, if a * node is dead and no longer sends packets, batman-adv is still able to * react timely to its death. @@ -320,18 +320,25 @@ static void batadv_v_elp_periodic_work(struct work_struct *work) if (!kref_get_unless_zero(&hardif_neigh->refcount)) continue;
- /* Reading the estimated throughput from cfg80211 is a task that - * may sleep and that is not allowed in an rcu protected - * context. Therefore schedule a task for that. - */ - ret = queue_work(batadv_event_workqueue, - &hardif_neigh->bat_v.metric_work); - - if (!ret) + metric_entry = kzalloc(sizeof(*metric_entry), GFP_ATOMIC); + if (!metric_entry) { batadv_hardif_neigh_put(hardif_neigh); + continue; + } + + metric_entry->hardif_neigh = hardif_neigh; + list_add(&metric_entry->list, &metric_queue); } rcu_read_unlock();
+ list_for_each_entry_safe(metric_entry, metric_safe, &metric_queue, list) { + batadv_v_elp_throughput_metric_update(metric_entry->hardif_neigh); + + batadv_hardif_neigh_put(metric_entry->hardif_neigh); + list_del(&metric_entry->list); + kfree(metric_entry); + } + restart_timer: batadv_v_elp_start_timer(hard_iface); out: diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h index 9e274019..29e4177b 100644 --- a/net/batman-adv/bat_v_elp.h +++ b/net/batman-adv/bat_v_elp.h @@ -19,6 +19,5 @@ void batadv_v_elp_iface_activate(struct batadv_hard_iface *primary_iface, void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface); int batadv_v_elp_packet_recv(struct sk_buff *skb, struct batadv_hard_iface *if_incoming); -void batadv_v_elp_throughput_metric_update(struct work_struct *work);
#endif /* _NET_BATMAN_ADV_BAT_V_ELP_H_ */ diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index f491bff8..fe89f085 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -596,9 +596,6 @@ struct batadv_hardif_neigh_node_bat_v { * neighbor */ unsigned long last_unicast_tx; - - /** @metric_work: work queue callback item for metric update */ - struct work_struct metric_work; };
/**
Kind regards, Sven
b.a.t.m.a.n@lists.open-mesh.org