WARNING: The following patches were only compile-time tested - you've been warned ;).
The last round of multicast patches to add support for the multicast optimizations in bridged scenarios, too, unfortunately had one major conceptual flaw: It could lead to packet loss. It's not sufficient to have the unicasting of reports implemented on bridge-nodes only. Nodes without bridges need to treat reports the same way.
The issue is described in detail here:
https://www.open-mesh.org/projects/batman-adv/wiki/Multicast-optimizations-l...
These patches are one suggestion to tackle this issue (alternatives can be found on the wikipage, too). Let me know what you think about this approach.
Cheers, Linus
Let's use these new, neat helpers.
Signed-off-by: Linus Lüssing linus.luessing@c0d3.blue --- net/bridge/br_multicast.c | 28 ++++------------------------ 1 file changed, 4 insertions(+), 24 deletions(-)
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index c465876..3c630c7 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1609,16 +1609,8 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, if (!pskb_may_pull(skb2, sizeof(*ih))) goto out;
- switch (skb2->ip_summed) { - case CHECKSUM_COMPLETE: - if (!csum_fold(skb2->csum)) - break; - /* fall through */ - case CHECKSUM_NONE: - skb2->csum = 0; - if (skb_checksum_complete(skb2)) - goto out; - } + if (skb_checksum_simple_validate(skb2)) + goto out;
err = 0;
@@ -1736,20 +1728,8 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
ip6h = ipv6_hdr(skb2);
- switch (skb2->ip_summed) { - case CHECKSUM_COMPLETE: - if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, skb2->len, - IPPROTO_ICMPV6, skb2->csum)) - break; - /*FALLTHROUGH*/ - case CHECKSUM_NONE: - skb2->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr, - &ip6h->daddr, - skb2->len, - IPPROTO_ICMPV6, 0)); - if (__skb_checksum_complete(skb2)) - goto out; - } + if (skb_checksum_validate(skb2, IPPROTO_ICMPV6, ip6_compute_pseudo)) + goto out;
err = 0;
With this patch, the IGMP and MLD message validation functions are moved from the bridge code to the IPv4/IPv6 multicast files. Some small refactoring was done to enhance readibility and to iron out some differences in behaviour between the IGMP and MLD parsing code (e.g. the skb-cloning of MLD messages is now only done if necessary, just like the IGMP part always did).
Finally, these IGMP and MLD message validation functions are exported so that not only the bridge can use it but batman-adv later, too. --- include/linux/igmp.h | 1 + include/linux/skbuff.h | 3 + include/net/addrconf.h | 1 + net/bridge/br_multicast.c | 218 ++++++--------------------------------------- net/core/skbuff.c | 38 ++++++++ net/ipv4/igmp.c | 152 +++++++++++++++++++++++++++++++ net/ipv6/mcast.c | 178 ++++++++++++++++++++++++++++++++++++ 7 files changed, 401 insertions(+), 190 deletions(-)
diff --git a/include/linux/igmp.h b/include/linux/igmp.h index 2c677af..8ac3d24 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -130,5 +130,6 @@ extern void ip_mc_unmap(struct in_device *); extern void ip_mc_remap(struct in_device *); extern void ip_mc_dec_group(struct in_device *in_dev, __be32 addr); extern void ip_mc_inc_group(struct in_device *in_dev, __be32 addr); +extern int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed);
#endif diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 30007af..9c0410c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3394,6 +3394,9 @@ static inline void skb_checksum_none_assert(const struct sk_buff *skb) bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);
int skb_checksum_setup(struct sk_buff *skb, bool recalculate); +int skb_checksum_trimmed(struct sk_buff *skb, unsigned int transport_len, + __sum16 (*skb_check_func)(struct sk_buff *skb), + struct sk_buff **skb_trimmed);
u32 skb_get_poff(const struct sk_buff *skb); u32 __skb_get_poff(const struct sk_buff *skb, void *data, diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 80456f7..def59d3 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -142,6 +142,7 @@ void ipv6_mc_unmap(struct inet6_dev *idev); void ipv6_mc_remap(struct inet6_dev *idev); void ipv6_mc_init_dev(struct inet6_dev *idev); void ipv6_mc_destroy_dev(struct inet6_dev *idev); +int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed); void addrconf_dad_failure(struct inet6_ifaddr *ifp);
bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 3c630c7..cd5586a 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -974,9 +974,6 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, int err = 0; __be32 group;
- if (!pskb_may_pull(skb, sizeof(*ih))) - return -EINVAL; - ih = igmpv3_report_hdr(skb); num = ntohs(ih->ngrec); len = sizeof(*ih); @@ -1247,25 +1244,14 @@ static int br_ip4_multicast_query(struct net_bridge *br, max_delay = 10 * HZ; group = 0; } - } else { - if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) { - err = -EINVAL; - goto out; - } - + } else if (skb->len >= sizeof(*ih3)) { ih3 = igmpv3_query_hdr(skb); if (ih3->nsrcs) goto out;
max_delay = ih3->code ? IGMPV3_MRC(ih3->code) * (HZ / IGMP_TIMER_SCALE) : 1; - } - - /* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer - * all-systems destination addresses (224.0.0.1) for general queries - */ - if (!group && iph->daddr != htonl(INADDR_ALLHOSTS_GROUP)) { - err = -EINVAL; + } else { goto out; }
@@ -1328,12 +1314,6 @@ static int br_ip6_multicast_query(struct net_bridge *br, (port && port->state == BR_STATE_DISABLED)) goto out;
- /* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */ - if (!(ipv6_addr_type(&ip6h->saddr) & IPV6_ADDR_LINKLOCAL)) { - err = -EINVAL; - goto out; - } - if (skb->len == sizeof(*mld)) { if (!pskb_may_pull(skb, sizeof(*mld))) { err = -EINVAL; @@ -1357,14 +1337,6 @@ static int br_ip6_multicast_query(struct net_bridge *br,
is_general_query = group && ipv6_addr_any(group);
- /* RFC2710+RFC3810 (MLDv1+MLDv2) require the multicast link layer - * all-nodes destination address (ff02::1) for general queries - */ - if (is_general_query && !ipv6_addr_is_ll_all_nodes(&ip6h->daddr)) { - err = -EINVAL; - goto out; - } - if (is_general_query) { saddr.proto = htons(ETH_P_IPV6); saddr.u.ip6 = ip6h->saddr; @@ -1556,66 +1528,20 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, struct sk_buff *skb, u16 vid) { - struct sk_buff *skb2 = skb; - const struct iphdr *iph; + struct sk_buff *skb_trimmed = NULL; struct igmphdr *ih; - unsigned int len; - unsigned int offset; int err;
- /* We treat OOM as packet loss for now. */ - if (!pskb_may_pull(skb, sizeof(*iph))) - return -EINVAL; - - iph = ip_hdr(skb); - - if (iph->ihl < 5 || iph->version != 4) - return -EINVAL; - - if (!pskb_may_pull(skb, ip_hdrlen(skb))) - return -EINVAL; - - iph = ip_hdr(skb); + err = ip_mc_check_igmp(skb, &skb_trimmed);
- if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) - return -EINVAL; - - if (iph->protocol != IPPROTO_IGMP) { - if (!ipv4_is_local_multicast(iph->daddr)) - BR_INPUT_SKB_CB(skb)->mrouters_only = 1; - return 0; - } - - len = ntohs(iph->tot_len); - if (skb->len < len || len < ip_hdrlen(skb)) - return -EINVAL; - - if (skb->len > len) { - skb2 = skb_clone(skb, GFP_ATOMIC); - if (!skb2) - return -ENOMEM; - - err = pskb_trim_rcsum(skb2, len); - if (err) - goto err_out; - } - - len -= ip_hdrlen(skb2); - offset = skb_network_offset(skb2) + ip_hdrlen(skb2); - __skb_pull(skb2, offset); - skb_reset_transport_header(skb2); - - err = -EINVAL; - if (!pskb_may_pull(skb2, sizeof(*ih))) - goto out; - - if (skb_checksum_simple_validate(skb2)) - goto out; + if (err == -ENOMSG && !ipv4_is_local_multicast(ip_hdr(skb)->daddr)) + BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
- err = 0; + if (err < 0) + return err;
BR_INPUT_SKB_CB(skb)->igmp = 1; - ih = igmp_hdr(skb2); + ih = igmp_hdr(skb);
switch (ih->type) { case IGMP_HOST_MEMBERSHIP_REPORT: @@ -1624,21 +1550,19 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, err = br_ip4_multicast_add_group(br, port, ih->group, vid); break; case IGMPV3_HOST_MEMBERSHIP_REPORT: - err = br_ip4_multicast_igmp3_report(br, port, skb2, vid); + err = br_ip4_multicast_igmp3_report(br, port, skb_trimmed, vid); break; case IGMP_HOST_MEMBERSHIP_QUERY: - err = br_ip4_multicast_query(br, port, skb2, vid); + err = br_ip4_multicast_query(br, port, skb_trimmed, vid); break; case IGMP_HOST_LEAVE_MESSAGE: br_ip4_multicast_leave_group(br, port, ih->group, vid); break; }
-out: - __skb_push(skb2, offset); -err_out: - if (skb2 != skb) - kfree_skb(skb2); + if (skb_trimmed && skb_trimmed != skb) + kfree_skb(skb_trimmed); + return err; }
@@ -1648,126 +1572,40 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, struct sk_buff *skb, u16 vid) { - struct sk_buff *skb2; - const struct ipv6hdr *ip6h; - u8 icmp6_type; - u8 nexthdr; - __be16 frag_off; - unsigned int len; - int offset; + struct sk_buff *skb_trimmed = NULL; + struct mld_msg *mld; int err;
- if (!pskb_may_pull(skb, sizeof(*ip6h))) - return -EINVAL; - - ip6h = ipv6_hdr(skb); + err = ipv6_mc_check_mld(skb, &skb_trimmed);
- /* - * We're interested in MLD messages only. - * - Version is 6 - * - MLD has always Router Alert hop-by-hop option - * - But we do not support jumbrograms. - */ - if (ip6h->version != 6) - return 0; - - /* Prevent flooding this packet if there is no listener present */ - if (!ipv6_addr_is_ll_all_nodes(&ip6h->daddr)) + if (err == -ENOMSG && !ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr)) BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
- if (ip6h->nexthdr != IPPROTO_HOPOPTS || - ip6h->payload_len == 0) - return 0; - - len = ntohs(ip6h->payload_len) + sizeof(*ip6h); - if (skb->len < len) - return -EINVAL; - - nexthdr = ip6h->nexthdr; - offset = ipv6_skip_exthdr(skb, sizeof(*ip6h), &nexthdr, &frag_off); - - if (offset < 0 || nexthdr != IPPROTO_ICMPV6) - return 0; - - /* Okay, we found ICMPv6 header */ - skb2 = skb_clone(skb, GFP_ATOMIC); - if (!skb2) - return -ENOMEM; - - err = -EINVAL; - if (!pskb_may_pull(skb2, offset + sizeof(struct icmp6hdr))) - goto out; - - len -= offset - skb_network_offset(skb2); - - __skb_pull(skb2, offset); - skb_reset_transport_header(skb2); - skb_postpull_rcsum(skb2, skb_network_header(skb2), - skb_network_header_len(skb2)); - - icmp6_type = icmp6_hdr(skb2)->icmp6_type; - - switch (icmp6_type) { - case ICMPV6_MGM_QUERY: - case ICMPV6_MGM_REPORT: - case ICMPV6_MGM_REDUCTION: - case ICMPV6_MLD2_REPORT: - break; - default: - err = 0; - goto out; - } - - /* Okay, we found MLD message. Check further. */ - if (skb2->len > len) { - err = pskb_trim_rcsum(skb2, len); - if (err) - goto out; - err = -EINVAL; - } - - ip6h = ipv6_hdr(skb2); - - if (skb_checksum_validate(skb2, IPPROTO_ICMPV6, ip6_compute_pseudo)) - goto out; - - err = 0; + if (err < 0) + return err;
BR_INPUT_SKB_CB(skb)->igmp = 1; + mld = (struct mld_msg *)skb_transport_header(skb);
- switch (icmp6_type) { + switch (mld->mld_type) { case ICMPV6_MGM_REPORT: - { - struct mld_msg *mld; - if (!pskb_may_pull(skb2, sizeof(*mld))) { - err = -EINVAL; - goto out; - } - mld = (struct mld_msg *)skb_transport_header(skb2); BR_INPUT_SKB_CB(skb)->mrouters_only = 1; err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid); break; - } case ICMPV6_MLD2_REPORT: - err = br_ip6_multicast_mld2_report(br, port, skb2, vid); + err = br_ip6_multicast_mld2_report(br, port, skb_trimmed, vid); break; case ICMPV6_MGM_QUERY: - err = br_ip6_multicast_query(br, port, skb2, vid); + err = br_ip6_multicast_query(br, port, skb_trimmed, vid); break; case ICMPV6_MGM_REDUCTION: - { - struct mld_msg *mld; - if (!pskb_may_pull(skb2, sizeof(*mld))) { - err = -EINVAL; - goto out; - } - mld = (struct mld_msg *)skb_transport_header(skb2); br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid); - } + break; }
-out: - kfree_skb(skb2); + if (skb_trimmed && skb_trimmed != skb) + kfree_skb(skb_trimmed); + return err; } #endif diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f805078..ba1d021 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4053,6 +4053,44 @@ int skb_checksum_setup(struct sk_buff *skb, bool recalculate) } EXPORT_SYMBOL(skb_checksum_setup);
+int skb_checksum_trimmed(struct sk_buff *skb, unsigned int transport_len, + __sum16 (*skb_check_func)(struct sk_buff *skb), + struct sk_buff **skb_trimmed) +{ + struct sk_buff *skb_chk = skb; + unsigned int offset = skb_transport_offset(skb); + int ret = 0; + + __skb_pull(skb, offset); + + /* if there's data beyond the IP packet: + * avoid it in checksum validation by using a trimmed clone + */ + if (skb->len > transport_len) { + skb_chk = skb_clone(skb, GFP_ATOMIC); + if (!skb_chk) { + ret = -ENOMEM; + goto out; + } + + ret = pskb_trim_rcsum(skb_chk, transport_len); + if (ret) + goto out; + } + + ret = skb_check_func(skb_chk); + if (ret) + goto out; + + if (skb_trimmed) + *skb_trimmed = skb_chk; + +out: + __skb_push(skb, offset); + return ret; +} +EXPORT_SYMBOL(skb_checksum_trimmed); + void __skb_warn_lro_forwarding(const struct sk_buff *skb) { net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 666cf36..9beee32 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1338,6 +1338,158 @@ out: } EXPORT_SYMBOL(ip_mc_inc_group);
+static int ip_mc_check_iphdr(struct sk_buff *skb) +{ + const struct iphdr *iph; + unsigned int len; + unsigned int offset = skb_network_offset(skb) + sizeof(*iph); + + if (!pskb_may_pull(skb, offset)) + return -EINVAL; + + iph = ip_hdr(skb); + + if (iph->version != 4 || ip_hdrlen(skb) < sizeof(*iph)) + return -EINVAL; + + offset += ip_hdrlen(skb) - sizeof(*iph); + + if (!pskb_may_pull(skb, offset)) + return -EINVAL; + + iph = ip_hdr(skb); + + if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) + return -EINVAL; + + len = skb_network_offset(skb) + ntohs(iph->tot_len); + if (skb->len < len || len < offset) + return -EINVAL; + + skb_set_transport_header(skb, offset); + + return 0; +} + +static int ip_mc_check_igmp_reportv3(struct sk_buff *skb) +{ + unsigned int len = skb_transport_offset(skb); + + len += sizeof(struct igmpv3_report); + + return pskb_may_pull(skb, len) ? 0 : -EINVAL; +} + +static int ip_mc_check_igmp_query(struct sk_buff *skb) +{ + unsigned int len = skb_transport_offset(skb); + + len+= sizeof(struct igmphdr); + if (skb->len < len) + return -EINVAL; + + /* IGMPv{1,2}? */ + if (skb->len != len) { + /* or IGMPv3? */ + len += sizeof(struct igmpv3_query) - sizeof(struct igmphdr); + if (skb->len < len || !pskb_may_pull(skb, len)) + return -EINVAL; + } + + /* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer + * all-systems destination addresses (224.0.0.1) for general queries + */ + if (!igmp_hdr(skb)->group && + ip_hdr(skb)->daddr != htonl(INADDR_ALLHOSTS_GROUP)) + return -EINVAL; + + return 0; +} + +static int ip_mc_check_igmp_msg(struct sk_buff *skb) +{ + switch (igmp_hdr(skb)->type) { + case IGMP_HOST_LEAVE_MESSAGE: + case IGMP_HOST_MEMBERSHIP_REPORT: + case IGMPV2_HOST_MEMBERSHIP_REPORT: + /* fall through */ + return 0; + case IGMPV3_HOST_MEMBERSHIP_REPORT: + return ip_mc_check_igmp_reportv3(skb); + case IGMP_HOST_MEMBERSHIP_QUERY: + return ip_mc_check_igmp_query(skb); + default: + return -EINVAL; + } +} + +static inline __sum16 ip_mc_validate_checksum(struct sk_buff *skb) +{ + return skb_checksum_simple_validate(skb); +} + +static int __ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed) + +{ + struct sk_buff *skb_chk = NULL; + unsigned int transport_len; + unsigned int len = skb_transport_offset(skb) + sizeof(struct igmphdr); + int ret; + + if (!pskb_may_pull(skb, len)) + return -EINVAL; + + transport_len = ntohs(ip_hdr(skb)->tot_len) - ip_hdrlen(skb); + + if (skb_checksum_trimmed(skb, transport_len, + ip_mc_validate_checksum, &skb_chk)) + return -EINVAL; + + ret = ip_mc_check_igmp_msg(skb_chk); + + if (!ret && skb_trimmed) + *skb_trimmed = skb_chk; + else if (skb_chk && skb_chk != skb) + kfree_skb(skb_chk); + + return ret; +} + +/** + * ip_mc_check_igmp - checks whether this is a sane IGMP packet + * @skb: the skb to validate + * @skb_trimmed: to store an skb pointer trimmed to IPv4 packet tail (optional) + * + * Checks whether an IPv4 packet is a valid IGMP packet. If so sets + * skb network and transport headers accordingly and returns zero. + * + * -EINVAL: A broken packet was detected, i.e. it violates some internet + * standard + * -ENOMSG: IP header validation succeeded but it is not an IGMP packet. + * -ENOMEM: A memory allocation failure happened. + * + * Optionally, an skb pointer might be provided via skb_trimmed (or set it + * to NULL): After parsing an IGMP packet successfully it will point to + * an skb which has its tail aligned to the IP packet end. This might + * either be the originally provided skb or a trimmed, cloned version if + * the skb frame was padded beyond the IP header. A cloned skb allows us + * to leave the original skb and its full frame unchanged (which might be + * desirable for layer 2 frame jugglers). + */ +int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed) +{ + int ret = ip_mc_check_iphdr(skb); + + if (ret < 0) + return ret; + + if (ip_hdr(skb)->protocol != IPPROTO_IGMP) + return -ENOMSG; + + return __ip_mc_check_igmp(skb, skb_trimmed); +} +EXPORT_SYMBOL(ip_mc_check_igmp); + /* * Resend IGMP JOIN report; used by netdev notifier. */ diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 5ce107c..96ea442 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -2575,6 +2575,184 @@ void ipv6_mc_destroy_dev(struct inet6_dev *idev) write_unlock_bh(&idev->lock); }
+static int ipv6_mc_check_ip6hdr(struct sk_buff *skb, + unsigned int *transport_len) +{ + const struct ipv6hdr *ip6h; + unsigned int len; + unsigned int offset = skb_network_offset(skb) + sizeof(*ip6h); + + if (!pskb_may_pull(skb, offset)) + return -EINVAL; + + ip6h = ipv6_hdr(skb); + + if (ip6h->version != 6) + return -EINVAL; + + len = offset + ntohs(ip6h->payload_len); + if (skb->len < len || len <= offset) + return -EINVAL; + + return 0; + +} + +static int ipv6_mc_check_exthdrs(struct sk_buff *skb, + unsigned int *transport_len) +{ + const struct ipv6hdr *ip6h; + u8 nexthdr; + __be16 frag_off; + + ip6h = ipv6_hdr(skb); + + if (ip6h->nexthdr != IPPROTO_HOPOPTS) + return -ENOMSG; + + nexthdr = ip6h->nexthdr; + offset = ipv6_skip_exthdr(skb, sizeof(*ip6h), &nexthdr, &frag_off); + + if (offset < 0) + return -EINVAL; + + if (nexthdr != IPPROTO_ICMPV6) + return -ENOMSG; + + skb_set_transport_header(skb, offset); +} + +static int ipv6_mc_check_mld_reportv2(struct sk_buff *skb) +{ + unsigned int len = skb_transport_offset(skb); + + len += sizeof(struct mld2_report); + + return pskb_may_pull(skb, len) ? 0 : -EINVAL; +} + +static int ipv6_mc_check_mld_query(struct sk_buff *skb) +{ + struct mld_msg *mld; + unsigned int len = skb_transport_offset(skb); + + /* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */ + if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) + return -EINVAL; + + len+= sizeof(struct mld_msg); + if (skb->len < len) + return -EINVAL; + + /* MLDv1? */ + if (skb->len != len) { + /* or MLDv2? */ + len += sizeof(struct mld2_query) - sizeof(struct mld_msg); + if (skb->len < len || !pskb_may_pull(skb, len)) + return -EINVAL; + } + + mld = (struct mld_msg *)skb_transport_header(skb); + + /* RFC2710+RFC3810 (MLDv1+MLDv2) require the multicast link layer + * all-nodes destination address (ff02::1) for general queries + */ + if (ipv6_addr_any(mld->mld_mca) && + !ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr) + return -EINVAL; + + return 0; +} + +static int ipv6_mc_check_mld_msg(struct sk_buff *skb) +{ + struct mld_msg *mld = (struct mld_msg *)skb_transport_header(skb); + + switch (mld->mld_type) { + case ICMPV6_MGM_REDUCTION: + case ICMPV6_MGM_REPORT: + /* fall through */ + return 0; + case ICMPV6_MLD2_REPORT: + return ipv6_mc_check_mld_reportv2(skb); + case ICMPV6_MGM_QUERY: + return ipv6_mc_check_mld_query(skb); + default: + return -EINVAL; + } +} + +static inline __sum16 ipv6_mc_validate_checksum(struct sk_buff *skb) +{ + return skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo); +} + +static int __ipv6_mc_check_mld(struct sk_buff *skb, + struct sk_buff **skb_trimmed) + +{ + struct sk_buff *skb_chk = NULL; + unsigned int transport_len; + unsigned int len = skb_transport_offset(skb) + sizeof(struct mld_msg); + int ret; + + if (!pskb_may_pull(skb, len)) + return -EINVAL; + + transport_len = ipv6_hdr(skb)->payload_len; + + if (skb_checksum_trimmed(skb, transport_len, + ip_mc_validate_checksum, &skb_chk)) + return -EINVAL; + + ret = ip6_mc_check_mld_msg(skb_chk); + + if (!ret && skb_trimmed) + *skb_trimmed = skb_chk; + else if (skb_chk && skb_chk != skb) + kfree_skb(skb_chk); + + return ret; +} + + +/** + * ipv6_mc_check_mld - checks whether this is a sane MLD packet + * @skb: the skb to validate + * @skb_trimmed: to store an skb pointer trimmed to IPv6 packet tail (optional) + * + * Checks whether an IPv6 packet is a valid MLD packet. If so sets + * skb network and transport headers accordingly and returns zero. + * + * -EINVAL: A broken packet was detected, i.e. it violates some internet + * standard + * -ENOMSG: IP header validation succeeded but it is not an IGMP packet. + * -ENOMEM: A memory allocation failure happened. + * + * Optionally, an skb pointer might be provided via skb_trimmed (or set it + * to NULL): After parsing an IGMP packet successfully it will point to + * an skb which has its tail aligned to the IP packet end. This might + * either be the originally provided skb or a trimmed, cloned version if + * the skb frame was padded beyond the IP header. A cloned skb allows us + * to leave the original skb and its full frame unchanged (which might be + * desirable for layer 2 frame jugglers). + */ +int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed) +{ + int ret; + + ret = ipv6_mc_check_ip6hdr(skb); + if (ret < 0) + return ret; + + ret = ipv6_mc_check_exthdrs(skb); + if (ret < 0) + return ret; + + return __ipv6_mc_check_mld(skb, skb_trimmed); +} +EXPORT_SYMBOL(ipv6_mc_check_mld); + #ifdef CONFIG_PROC_FS struct igmp6_mc_iter_state { struct seq_net_private p;
With this patch IGMP or MLD reports are only forwarded to the selected IGMP/MLD querier as RFC4541 suggests. This is necessary to avoid multicast packet loss in bridged scenarios later:
An IGMPv2/MLDv1 querier does not actively join the multicast group the reports are sent to. Because of this, this leads to snooping bridges/switches not being able to learn of multicast listeners in the mesh and wrongly shutting down ports for multicast traffic to the mesh. --- net/batman-adv/multicast.c | 240 ++++++++++++++++++++++++++++++++++++++- net/batman-adv/multicast.h | 8 ++ net/batman-adv/soft-interface.c | 8 ++ net/batman-adv/types.h | 14 +++ 4 files changed, 267 insertions(+), 3 deletions(-)
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index b24e4bb..bafc323 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -247,10 +247,76 @@ out: }
/** + * batadv_mcast_is_report_ipv4 - check for IGMP reports (and queries) + * @bat_priv: the bat priv with all the soft interface information + * @skb: the ethernet frame destined for the mesh + * @orig: if IGMP report: to be set to the querier to forward the skb to + * + * Checks whether the given frame is an IGMP report and if so sets the + * orig pointer to the originator of the selected IGMP querier if one exists + * and returns true. Otherwise returns false. + * + * If the packet is a general IGMP query instead then we delete the memorized, + * foreign IGMP querier (if one exists): We became the selected querier and + * therefore do not need to forward reports into the mesh. + * + * This call might reallocate skb data. + */ +static bool batadv_mcast_is_report_ipv4(struct batadv_priv *bat_priv, + struct sk_buff *skb, + struct batadv_orig_node **orig) +{ + struct batadv_mcast_querier_state *querier; + struct batadv_orig_node *orig_node; + + if (!ip_mc_check_igmp(skb, NULL)) + return false; + + querier = &bat_priv->mcast.querier_ipv4; + + switch (igmp_hdr(skb)->type) { + case IGMP_HOST_MEMBERSHIP_REPORT: + case IGMPV2_HOST_MEMBERSHIP_REPORT: + case IGMPV3_HOST_MEMBERSHIP_REPORT: + rcu_read_lock(); + orig_node = rcu_dereference(querier->orig); + if (orig_node && atomic_inc_not_zero(&orig_node->refcount)) { + /* TODO: include multicast routers via MRD (RFC4286) */ + batadv_dbg(BATADV_DBG_MCAST, bat_priv, + "Redirecting IGMP Report to %pM\n", + orig_node->orig); + *orig = orig_node; + } + rcu_read_unlock(); + + return true; + case IGMP_HOST_MEMBERSHIP_QUERY: + /* RFC4541, section 2.1.1.1.b) says: + * ignore general queries from 0.0.0.0 + */ + if (!ip_hdr(skb)->saddr || igmp_hdr(skb)->group) + break; + + spin_lock_bh(&querier->orig_lock); + orig_node = rcu_dereference(querier->orig); + if (orig_node) + rcu_assign_pointer(querier->orig, NULL); + spin_unlock_bh(&querier->orig_lock); + + batadv_dbg(BATADV_DBG_MCAST, bat_priv, + "Snooped own IGMP Query\n"); + break; + } + + return false; +} + +/** * batadv_mcast_forw_mode_check_ipv4 - check for optimized forwarding potential * @bat_priv: the bat priv with all the soft interface information * @skb: the IPv4 packet to check * @is_unsnoopable: stores whether the destination is snoopable + * @orig: for IGMP reports: to be set to the querier to forward the skb to * * Checks whether the given IPv4 packet has the potential to be forwarded with a * mode more optimal than classic flooding. @@ -260,7 +326,8 @@ out: */ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv, struct sk_buff *skb, - bool *is_unsnoopable) + bool *is_unsnoopable, + struct batadv_orig_node **orig) { struct iphdr *iphdr;
@@ -268,6 +335,9 @@ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv, if (!pskb_may_pull(skb, sizeof(struct ethhdr) + sizeof(*iphdr))) return -ENOMEM;
+ if (batadv_mcast_is_report_ipv4(bat_priv, skb, orig)) + return 0; + iphdr = ip_hdr(skb);
/* TODO: Implement Multicast Router Discovery (RFC4286), @@ -285,10 +355,71 @@ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv, }
/** + * batadv_mcast_is_report_ipv6 - check for MLD reports (and queries) + * @bat_priv: the bat priv with all the soft interface information + * @skb: the ethernet frame destined for the mesh + * @orig: if MLD report: to be set to the querier to forward the skb to + * + * Checks whether the given frame is an MLD report and if so sets the + * orig pointer to the originator of the selected MLD querier if one exists + * and returns true. Otherwise returns false. + * + * If the packet is a general MLD query instead then we delete the memorized, + * foreign MLD querier (if one exists): We became the selected querier and + * therefore do not need to forward reports into the mesh. + * + * This call might reallocate skb data. + */ +static bool batadv_mcast_is_report_ipv6(struct batadv_priv *bat_priv, + struct sk_buff *skb, + struct batadv_orig_node **orig) +{ + struct mld_msg *mld; + struct batadv_mcast_querier_state *querier; + struct batadv_orig_node *orig_node; + + if (!ipv6_mc_check_mld(skb, NULL)) + return false; + + querier = &bat_priv->mcast.querier_ipv6; + mld = (struct mld_msg *)icmp6_hdr(skb); + + switch (mld->mld_type) { + case ICMPV6_MGM_REPORT: + case ICMPV6_MLD2_REPORT: + rcu_read_lock(); + orig_node = rcu_dereference(querier->orig); + if (orig_node && atomic_inc_not_zero(&orig_node->refcount)) { + /* TODO: include multicast routers via MRD (RFC4286) */ + batadv_dbg(BATADV_DBG_MCAST, bat_priv, + "Redirecting MLD Report to %pM\n", + orig_node->orig); + *orig = orig_node; + } + rcu_read_unlock(); + + return true; + case ICMPV6_MGM_QUERY: + spin_lock_bh(&querier->orig_lock); + orig_node = rcu_dereference(querier->orig); + if (orig_node) + rcu_assign_pointer(querier->orig, NULL); + spin_unlock_bh(&querier->orig_lock); + + batadv_dbg(BATADV_DBG_MCAST, bat_priv, + "Snooped own MLD Query\n"); + break; + } + + return false; +} + +/** * batadv_mcast_forw_mode_check_ipv6 - check for optimized forwarding potential * @bat_priv: the bat priv with all the soft interface information * @skb: the IPv6 packet to check * @is_unsnoopable: stores whether the destination is snoopable + * @orig: for MLD reports: to be set to the querier to forward the skb to * * Checks whether the given IPv6 packet has the potential to be forwarded with a * mode more optimal than classic flooding. @@ -298,7 +429,8 @@ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv, */ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, struct sk_buff *skb, - bool *is_unsnoopable) + bool *is_unsnoopable, + struct batadv_orig_node **orig) { struct ipv6hdr *ip6hdr;
@@ -306,6 +438,9 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, if (!pskb_may_pull(skb, sizeof(struct ethhdr) + sizeof(*ip6hdr))) return -ENOMEM;
+ if (batadv_mcast_is_report_ipv6(bat_priv, skb, orig)) + return 0; + ip6hdr = ipv6_hdr(skb);
/* TODO: Implement Multicast Router Discovery (RFC4286), @@ -521,12 +656,16 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, bool is_unsnoopable = false; struct ethhdr *ethhdr;
- ret = batadv_mcast_forw_mode_check(bat_priv, skb, &is_unsnoopable); + ret = batadv_mcast_forw_mode_check(bat_priv, skb, &is_unsnoopable, + orig); if (ret == -ENOMEM) return BATADV_FORW_NONE; else if (ret < 0) return BATADV_FORW_ALL;
+ if (*orig) + return BATADV_FORW_SINGLE; + ethhdr = eth_hdr(skb);
tt_count = batadv_tt_global_hash_count(bat_priv, ethhdr->h_dest, @@ -558,6 +697,101 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, }
/** + * batadv_mcast_snoop_query_ipv4 - snoop for the selected MLD querier + * @skb: the unencapsulated ethernet frame coming from the mesh + * @orig_node: the originator this frame came from + * + * Checks whether the given frame is a general IGMP query from the selected + * querier and if so memorizes the originator this frame came from. + */ +static void batadv_mcast_snoop_query_ipv4(struct sk_buff *skb, + struct batadv_orig_node *orig_node) +{ + struct batadv_mcast_querier_state *querier; + + if (!ip_mc_check_igmp(skb, NULL); + return; + + /* we are only interested in general queries (group == 0.0.0.0) */ + if (igmp_hdr(skb)->type != IGMP_HOST_MEMBERSHIP_QUERY || + igmp_hdr(skb)->group) + return; + + /* RFC4541, section 2.1.1.1.b) says: ignore queries from 0.0.0.0 */ + if (!ip_hdr(skb)->saddr) + return; + + querier = &orig_node->bat_priv->mcast.querier_ipv4; + + spin_lock_bh(&querier->orig_lock); + rcu_assign_pointer(querier->orig, orig_node); + spin_unlock_bh(&querier->orig_lock); + + batadv_dbg(BATADV_DBG_MCAST, orig_node->bat_priv, + "Snooped IGMP Query from originator %pM\n", orig_node->orig); +} + +/** + * batadv_mcast_snoop_query_ipv6 - snoop for the selected MLD querier + * @skb: the unencapsulated ethernet frame coming from the mesh + * @orig_node: the originator this frame came from + * + * Checks whether the given frame is a general MLD query from the selected + * querier and if so memorizes the originator this frame came from. + */ +static void batadv_mcast_snoop_query_ipv6(struct sk_buff *skb, + struct batadv_orig_node *orig_node) +{ + struct mld_msg *mld; + struct batadv_mcast_querier_state *querier; + + if (!ipv6_mc_check_mld(skb, NULL)) + return; + + mld = (struct mld_msg *)icmp6_hdr(skb); + + /* we are only interested in general queries (mca == ::) */ + if (mld->mld_type != ICMPV6_MGM_QUERY || + !ipv6_addr_any(&mld->mld_mca)) + return; + + querier = &orig_node->bat_priv->mcast.querier_ipv6; + + spin_lock_bh(&querier->orig_lock); + rcu_assign_pointer(querier->orig, orig_node); + spin_unlock_bh(&querier->orig_lock); + + batadv_dbg(BATADV_DBG_MCAST, orig_node->bat_priv, + "Snooped MLD Query from originator %pM\n", orig_node->orig); +} + +/** + * batadv_mcast_snoop_query - snoop the selected IGMP/MLD querier + * @skb: the unencapsulated ethernet frame coming from the mesh + * @orig_node: the originator this frame came from + * + * Checks whether the given frame is a general IGMP or MLD query + * from the selected querier and if so memorizes the originator + * this frame came from. + * + * This call might reallocate skb data. + */ +void batadv_mcast_snoop_query(struct sk_buff *skb, + struct batadv_orig_node *orig_node) +{ + struct ethhdr *ethhdr = eth_hdr(skb); + + switch (ntohs(ethhdr->h_proto)) { + case ETH_P_IP: + batadv_mcast_snoop_query_ipv4(skb, orig_node); + break; + case ETH_P_IPV6: + batadv_mcast_snoop_query_ipv6(skb, orig_node); + break; + } +} + +/** * batadv_mcast_want_unsnoop_update - update unsnoop counter and list * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node which multicast state might have changed of diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 3a44ebd..3727beb 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -40,6 +40,9 @@ enum batadv_forw_mode batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, struct batadv_orig_node **mcast_single_orig);
+void batadv_mcast_snoop_query(struct sk_buff *skb, + struct batadv_orig_node *orig_node); + void batadv_mcast_init(struct batadv_priv *bat_priv);
void batadv_mcast_free(struct batadv_priv *bat_priv); @@ -59,6 +62,11 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, return BATADV_FORW_ALL; }
+static inline void batadv_mcast_snoop_query(struct sk_buff *skb, + struct batadv_orig_node *orig_node) +{ +} + static inline int batadv_mcast_init(struct batadv_priv *bat_priv) { return 0; diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 5ec31d7..52b5737 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -428,6 +428,9 @@ void batadv_interface_rx(struct net_device *soft_iface, skb->mark &= ~bat_priv->isolation_mark_mask; skb->mark |= bat_priv->isolation_mark; } + + if (orig_node) + batadv_mcast_snoop_query(skb, orig_node); } else if (batadv_is_ap_isolated(bat_priv, ethhdr->h_source, ethhdr->h_dest, vid)) { goto dropped; @@ -738,6 +741,11 @@ static int batadv_softif_init_late(struct net_device *dev) atomic_set(&bat_priv->distributed_arp_table, 1); #endif #ifdef CONFIG_BATMAN_ADV_MCAST + rcu_assign_pointer(bat_priv->mcast.querier_ipv4.orig, NULL); + spin_lock_init(&bat_priv->mcast.querier_ipv4.orig_lock); + rcu_assign_pointer(bat_priv->mcast.querier_ipv6.orig, NULL); + spin_lock_init(&bat_priv->mcast.querier_ipv6.orig_lock); + bat_priv->mcast.flags = BATADV_NO_FLAGS; atomic_set(&bat_priv->multicast_mode, 1); atomic_set(&bat_priv->mcast.num_disabled, 0); diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 9398c3f..a0f5bf8 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -623,12 +623,24 @@ struct batadv_priv_dat {
#ifdef CONFIG_BATMAN_ADV_MCAST /** + * struct batadv_mcast_querier_state - IGMP/MLD querier state when bridged + * @orig: node on which the selected querier resides + * @orig_lock: protects updates of the selected querier in 'orig' + */ +struct batadv_mcast_querier_state { + struct batadv_orig_node __rcu *orig; /* rcu protected pointer */ + spinlock_t orig_lock; /* protects updates of orig */ +}; + +/** * struct batadv_priv_mcast - per mesh interface mcast data * @mla_list: list of multicast addresses we are currently announcing via TT * @want_all_unsnoopables_list: a list of orig_nodes wanting all unsnoopable * multicast traffic * @want_all_ipv4_list: a list of orig_nodes wanting all IPv4 multicast traffic * @want_all_ipv6_list: a list of orig_nodes wanting all IPv6 multicast traffic + * @querier_ipv4: the current state of an IGMP querier in the mesh + * @querier_ipv6: the current state of an MLD querier in the mesh * @flags: the flags we have last sent in our mcast tvlv * @enabled: whether the multicast tvlv is currently enabled * @num_disabled: number of nodes that have no mcast tvlv @@ -643,6 +655,8 @@ struct batadv_priv_mcast { struct hlist_head want_all_unsnoopables_list; struct hlist_head want_all_ipv4_list; struct hlist_head want_all_ipv6_list; + struct batadv_mcast_querier_state querier_ipv4; + struct batadv_mcast_querier_state querier_ipv6; uint8_t flags; bool enabled; atomic_t num_disabled;
The multicast optimizations bridge integration will require the just implemented IGMP/MLD report handling later. Therefore bumping the version number. --- net/batman-adv/multicast.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index bafc323..a46d8e1 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -200,7 +200,7 @@ static bool batadv_mcast_mla_tvlv_update(struct batadv_priv *bat_priv) if (batadv_mcast_has_bridge(bat_priv)) { if (bat_priv->mcast.enabled) { batadv_tvlv_container_unregister(bat_priv, - BATADV_TVLV_MCAST, 1); + BATADV_TVLV_MCAST, 2); bat_priv->mcast.enabled = false; }
@@ -209,7 +209,7 @@ static bool batadv_mcast_mla_tvlv_update(struct batadv_priv *bat_priv)
if (!bat_priv->mcast.enabled || mcast_data.flags != bat_priv->mcast.flags) { - batadv_tvlv_container_register(bat_priv, BATADV_TVLV_MCAST, 1, + batadv_tvlv_container_register(bat_priv, BATADV_TVLV_MCAST, 2, &mcast_data, sizeof(mcast_data)); bat_priv->mcast.flags = mcast_data.flags; bat_priv->mcast.enabled = true; @@ -950,7 +950,7 @@ static void batadv_mcast_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, void batadv_mcast_init(struct batadv_priv *bat_priv) { batadv_tvlv_handler_register(bat_priv, batadv_mcast_tvlv_ogm_handler_v1, - NULL, BATADV_TVLV_MCAST, 1, + NULL, BATADV_TVLV_MCAST, 2, BATADV_TVLV_HANDLER_OGM_CIFNOTFND); }
@@ -960,8 +960,8 @@ void batadv_mcast_init(struct batadv_priv *bat_priv) */ void batadv_mcast_free(struct batadv_priv *bat_priv) { - batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_MCAST, 1); - batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST, 1); + batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_MCAST, 2); + batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST, 2);
batadv_mcast_mla_tt_retract(bat_priv, NULL); }
On Sun, Mar 15, 2015 at 02:14:05AM +0100, Linus Lüssing wrote:
These patches are one suggestion to tackle this issue (alternatives can be found on the wikipage, too). Let me know what you think about this approach.
PS: I had included the bridge patches too so that we can not only decide on the algorithm but also whether the API is fine for us from the batman-adv point of view before I submit that to netdev.
b.a.t.m.a.n@lists.open-mesh.org