Hi,
a device running alfred maybe has to remove the underlying device (e.g. wlan device has to be reconfigured). This currently makes it necessary to restart alfred to get the device sending data again. An unwanted side effect of this procedure is the removal of all data currently stored in alfred.
Instead alfred should detect such problems and try to recover by reopening the device after sleeping a while. The sleeping is automatically done by select.
I've tested the changes only using macvtap:
Terminal 1:
$ ip link del dev macvlan0 $ ip link add link eth0 name macvlan0 type macvlan $ ip link set macvlan0 address 1a:46:0b:ca:bc:7b up $ ./alfred -b none -i macvlan0
Terminal 2 (after letting alfred run for a while)
$ ip link del dev macvlan0 $ ip link add link eth0 name macvlan0 type macvlan $ ip link set macvlan0 address 1a:46:0b:ca:bc:7b up
Kind regards, Sven
alfred.h | 4 ++-- netsock.c | 16 ++++++++-------- recv.c | 3 +++ send.c | 27 +++++++++++++++++++++------ server.c | 36 +++++++++++++++++++++++++++--------- 5 files changed, 61 insertions(+), 25 deletions(-)
A failed initialization of the netsock is currently not cleaning up the socket. This is not problematic at the moment because a failed initialization also closes the server.
This can be a problem later when the netsock is reopened with a modified configuration and a failed re-initialization is still accepted.
Signed-off-by: Sven Eckelmann sven@open-mesh.com --- netsock.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/netsock.c b/netsock.c index 39281af..bed62e9 100644 --- a/netsock.c +++ b/netsock.c @@ -49,6 +49,8 @@ int netsock_open(struct globals *globals) struct sockaddr_in6 sin6; struct ifreq ifr;
+ globals->netsock = -1; + sock = socket(PF_INET6, SOCK_DGRAM, 0); if (sock < 0) { fprintf(stderr, "can't open socket: %s\n", strerror(errno)); @@ -59,7 +61,7 @@ int netsock_open(struct globals *globals) strncpy(ifr.ifr_name, globals->interface, IFNAMSIZ); if (ioctl(sock, SIOCGIFINDEX, &ifr) == -1) { fprintf(stderr, "can't get interface: %s\n", strerror(errno)); - return -1; + goto err; }
globals->scope_id = ifr.ifr_ifindex; @@ -72,7 +74,7 @@ int netsock_open(struct globals *globals)
if (ioctl(sock, SIOCGIFHWADDR, &ifr) == -1) { fprintf(stderr, "can't get MAC address: %s\n", strerror(errno)); - return -1; + goto err; }
memcpy(&globals->hwaddr, &ifr.ifr_hwaddr.sa_data, 6); @@ -80,16 +82,19 @@ int netsock_open(struct globals *globals)
if (ioctl(sock, SIOCGIFMTU, &ifr) == -1) { fprintf(stderr, "can't get MTU: %s\n", strerror(errno)); - return -1; + goto err; }
if (bind(sock, (struct sockaddr *)&sin6, sizeof(sin6)) < 0) { fprintf(stderr, "can't bind\n"); - return -1; + goto err; }
fcntl(sock, F_SETFL, fcntl(sock, F_GETFL, 0) | O_NONBLOCK); globals->netsock = sock;
return 0; +err: + close(sock); + return -1; }
Alfred currently doesn't invalidate the netsock but it could happen in future changes. Accessing it in an invalid state would otherwise cause the program to crash or behave unexpected.
Signed-off-by: Sven Eckelmann sven@open-mesh.com --- recv.c | 3 +++ send.c | 3 +++ server.c | 19 ++++++++++++------- 3 files changed, 18 insertions(+), 7 deletions(-)
diff --git a/recv.c b/recv.c index 5ed4a62..b755681 100644 --- a/recv.c +++ b/recv.c @@ -378,6 +378,9 @@ int recv_alfred_packet(struct globals *globals) struct sockaddr_in6 source; socklen_t sourcelen;
+ if (globals->netsock < 0) + return -1; + sourcelen = sizeof(source); length = recvfrom(globals->netsock, buf, sizeof(buf), 0, (struct sockaddr *)&source, &sourcelen); diff --git a/send.c b/send.c index 607afa1..603e84a 100644 --- a/send.c +++ b/send.c @@ -158,6 +158,9 @@ int send_alfred_packet(struct globals *globals, const struct in6_addr *dest, dest_addr.sin6_scope_id = globals->scope_id; memcpy(&dest_addr.sin6_addr, dest, sizeof(*dest));
+ if (globals->netsock < 0) + return 0; + ret = sendto(globals->netsock, buf, length, 0, (struct sockaddr *)&dest_addr, sizeof(struct sockaddr_in6)); diff --git a/server.c b/server.c index 0d8e6c8..6864c1d 100644 --- a/server.c +++ b/server.c @@ -236,10 +236,6 @@ int alfred_server(struct globals *globals) if (netsock_open(globals)) return -1;
- maxsock = globals->netsock; - if (globals->unix_sock > maxsock) - maxsock = globals->unix_sock; - clock_gettime(CLOCK_MONOTONIC, &last_check);
while (1) { @@ -250,9 +246,16 @@ int alfred_server(struct globals *globals) tv.tv_nsec = 0; }
+ maxsock = -1; + if (globals->netsock > maxsock) + maxsock = globals->netsock; + if (globals->unix_sock > maxsock) + maxsock = globals->unix_sock; + FD_ZERO(&fds); FD_SET(globals->unix_sock, &fds); - FD_SET(globals->netsock, &fds); + if (globals->netsock >= 0) + FD_SET(globals->netsock, &fds); ret = pselect(maxsock + 1, &fds, NULL, NULL, &tv, NULL);
if (ret == -1) { @@ -263,7 +266,8 @@ int alfred_server(struct globals *globals) printf("read unix socket\n"); unix_sock_read(globals); continue; - } else if (FD_ISSET(globals->netsock, &fds)) { + } else if (globals->netsock >= 0 && + FD_ISSET(globals->netsock, &fds)) { recv_alfred_packet(globals); continue; } @@ -282,7 +286,8 @@ int alfred_server(struct globals *globals) purge_data(globals); }
- netsock_close(globals->netsock); + if (globals->netsock >= 0) + netsock_close(globals->netsock); unix_sock_close(globals); return 0; }
The netdev socket can have an error which needs a reinitialization. Alfred should mark the socket as invalid instead of trying to send over the broken socket.
Signed-off-by: Sven Eckelmann sven@open-mesh.com --- server.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-)
diff --git a/server.c b/server.c index 6864c1d..bc3b551 100644 --- a/server.c +++ b/server.c @@ -216,7 +216,7 @@ int alfred_server(struct globals *globals) { int maxsock, ret; struct timespec last_check, now, tv; - fd_set fds; + fd_set fds, errfds;
if (create_hashes(globals)) return -1; @@ -253,15 +253,25 @@ int alfred_server(struct globals *globals) maxsock = globals->unix_sock;
FD_ZERO(&fds); + FD_ZERO(&errfds); FD_SET(globals->unix_sock, &fds); - if (globals->netsock >= 0) + if (globals->netsock >= 0) { FD_SET(globals->netsock, &fds); - ret = pselect(maxsock + 1, &fds, NULL, NULL, &tv, NULL); + FD_SET(globals->netsock, &errfds); + } + ret = pselect(maxsock + 1, &fds, NULL, &errfds, &tv, NULL);
if (ret == -1) { fprintf(stderr, "main loop select failed ...: %s\n", strerror(errno)); } else if (ret) { + if (globals->netsock >= 0 && + FD_ISSET(globals->netsock, &errfds)) { + fprintf(stderr, "Error on netsock detected\n"); + netsock_close(globals->netsock); + globals->netsock = -1; + } + if (FD_ISSET(globals->unix_sock, &fds)) { printf("read unix socket\n"); unix_sock_read(globals);
An error detected by select doesn't include a missing/down device or other things which prevent sending. Detecting these can be done by checking -EPERM of a send like the announcment.
Signed-off-by: Sven Eckelmann sven@open-mesh.com --- alfred.h | 4 ++-- send.c | 24 ++++++++++++++++++------ 2 files changed, 20 insertions(+), 8 deletions(-)
diff --git a/alfred.h b/alfred.h index e931b1b..0b7a5ab 100644 --- a/alfred.h +++ b/alfred.h @@ -137,8 +137,8 @@ int push_data(struct globals *globals, struct in6_addr *destination, int announce_master(struct globals *globals); int push_local_data(struct globals *globals); int sync_data(struct globals *globals); -int send_alfred_packet(struct globals *globals, const struct in6_addr *dest, - void *buf, int length); +ssize_t send_alfred_packet(struct globals *globals, const struct in6_addr *dest, + void *buf, int length); /* unix_sock.c */ int unix_sock_read(struct globals *globals); int unix_sock_open_daemon(struct globals *globals, const char *path); diff --git a/send.c b/send.c index 603e84a..85d1234 100644 --- a/send.c +++ b/send.c @@ -24,20 +24,32 @@ #include <stdint.h> #include <string.h> #include <sys/socket.h> +#include <errno.h> +#include <stdio.h> #include "alfred.h" #include "hash.h" #include "packet.h"
int announce_master(struct globals *globals) { + ssize_t ret; struct alfred_announce_master_v0 announcement;
+ if (globals->netsock < 0) + return -1; + announcement.header.type = ALFRED_ANNOUNCE_MASTER; announcement.header.version = ALFRED_VERSION; announcement.header.length = htons(0);
- send_alfred_packet(globals, &in6addr_localmcast, &announcement, - sizeof(announcement)); + ret = send_alfred_packet(globals, &in6addr_localmcast, &announcement, + sizeof(announcement)); + if (ret == -EPERM) { + fprintf(stderr, "Error during announcement\n"); + netsock_close(globals->netsock); + globals->netsock = -1; + } +
return 0; } @@ -146,10 +158,10 @@ int push_local_data(struct globals *globals) return 0; }
-int send_alfred_packet(struct globals *globals, const struct in6_addr *dest, - void *buf, int length) +ssize_t send_alfred_packet(struct globals *globals, const struct in6_addr *dest, + void *buf, int length) { - int ret; + ssize_t ret; struct sockaddr_in6 dest_addr;
memset(&dest_addr, 0, sizeof(dest_addr)); @@ -165,5 +177,5 @@ int send_alfred_packet(struct globals *globals, const struct in6_addr *dest, (struct sockaddr *)&dest_addr, sizeof(struct sockaddr_in6));
- return (ret == length); + return ret; }
Alfred needs to recover when the netsock became invalid. Otherwise it is incapable of communicating with other alfred instances. This recovery can fail until the used interface reappeared.
Signed-off-by: Sven Eckelmann sven@open-mesh.com --- server.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/server.c b/server.c index bc3b551..ed04f70 100644 --- a/server.c +++ b/server.c @@ -246,6 +246,9 @@ int alfred_server(struct globals *globals) tv.tv_nsec = 0; }
+ if (globals->netsock < 0) + netsock_open(globals); + maxsock = -1; if (globals->netsock > maxsock) maxsock = globals->netsock;
Signed-off-by: Sven Eckelmann sven@open-mesh.com --- netsock.c | 5 ----- 1 file changed, 5 deletions(-)
diff --git a/netsock.c b/netsock.c index bed62e9..dc57b3c 100644 --- a/netsock.c +++ b/netsock.c @@ -80,11 +80,6 @@ int netsock_open(struct globals *globals) memcpy(&globals->hwaddr, &ifr.ifr_hwaddr.sa_data, 6); mac_to_ipv6(&globals->hwaddr, &globals->address);
- if (ioctl(sock, SIOCGIFMTU, &ifr) == -1) { - fprintf(stderr, "can't get MTU: %s\n", strerror(errno)); - goto err; - } - if (bind(sock, (struct sockaddr *)&sin6, sizeof(sin6)) < 0) { fprintf(stderr, "can't bind\n"); goto err;
On Tue, Oct 01, 2013 at 02:02:08PM +0200, Sven Eckelmann wrote:
Hi,
a device running alfred maybe has to remove the underlying device (e.g. wlan device has to be reconfigured). This currently makes it necessary to restart alfred to get the device sending data again. An unwanted side effect of this procedure is the removal of all data currently stored in alfred.
Instead alfred should detect such problems and try to recover by reopening the device after sleeping a while. The sleeping is automatically done by select.
I've tested the changes only using macvtap:
Terminal 1:
$ ip link del dev macvlan0 $ ip link add link eth0 name macvlan0 type macvlan $ ip link set macvlan0 address 1a:46:0b:ca:bc:7b up $ ./alfred -b none -i macvlan0
Terminal 2 (after letting alfred run for a while)
$ ip link del dev macvlan0 $ ip link add link eth0 name macvlan0 type macvlan $ ip link set macvlan0 address 1a:46:0b:ca:bc:7b up
Kind regards, Sven
Thanks, applied the whole series (785bc8b, 87c1916, 02997b2, 1b79312, 65d26ca, d6de864).
Your testsetup did not work for me (no IPv6 addresses where assigned to these macvlan0 interfaces), but I've tried on my WiFi network in alfred master mode and could confirm that alfred continues to send announcements after tearing down the WiFi and setting it up again.
Thanks, Simon
b.a.t.m.a.n@lists.open-mesh.org