diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 9a20821b111c..c5fb9e6bc3a5 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -92,4 +92,7 @@ #define SO_ATTACH_BPF 50 #define SO_DETACH_BPF SO_DETACH_FILTER +#define SO_ATTACH_REUSEPORT_CBPF 51 +#define SO_ATTACH_REUSEPORT_EBPF 52 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h index 2b65ed6b277c..9de0796240a0 100644 --- a/arch/avr32/include/uapi/asm/socket.h +++ b/arch/avr32/include/uapi/asm/socket.h @@ -85,4 +85,7 @@ #define SO_ATTACH_BPF 50 #define SO_DETACH_BPF SO_DETACH_FILTER +#define SO_ATTACH_REUSEPORT_CBPF 51 +#define SO_ATTACH_REUSEPORT_EBPF 52 + #endif /* _UAPI__ASM_AVR32_SOCKET_H */ diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h index 4823ad125578..f02e4849ae83 100644 --- a/arch/frv/include/uapi/asm/socket.h +++ b/arch/frv/include/uapi/asm/socket.h @@ -85,5 +85,8 @@ #define SO_ATTACH_BPF 50 #define SO_DETACH_BPF SO_DETACH_FILTER +#define SO_ATTACH_REUSEPORT_CBPF 51 +#define SO_ATTACH_REUSEPORT_EBPF 52 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h index 59be3d87f86d..bce29166de1b 100644 --- a/arch/ia64/include/uapi/asm/socket.h +++ b/arch/ia64/include/uapi/asm/socket.h @@ -94,4 +94,7 @@ #define SO_ATTACH_BPF 50 #define SO_DETACH_BPF SO_DETACH_FILTER +#define SO_ATTACH_REUSEPORT_CBPF 51 +#define SO_ATTACH_REUSEPORT_EBPF 52 + #endif /* _ASM_IA64_SOCKET_H */ diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h index 7bc4cb273856..14aa4a6bccf1 100644 --- a/arch/m32r/include/uapi/asm/socket.h +++ b/arch/m32r/include/uapi/asm/socket.h @@ -85,4 +85,7 @@ #define SO_ATTACH_BPF 50 #define SO_DETACH_BPF SO_DETACH_FILTER +#define SO_ATTACH_REUSEPORT_CBPF 51 +#define SO_ATTACH_REUSEPORT_EBPF 52 + #endif /* _ASM_M32R_SOCKET_H */ diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index dec3c850f36b..5910fe294e93 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -103,4 +103,7 @@ #define SO_ATTACH_BPF 50 #define SO_DETACH_BPF SO_DETACH_FILTER +#define SO_ATTACH_REUSEPORT_CBPF 51 +#define SO_ATTACH_REUSEPORT_EBPF 52 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h index cab7d6d50051..58b1aa01ab9f 100644 --- a/arch/mn10300/include/uapi/asm/socket.h +++ b/arch/mn10300/include/uapi/asm/socket.h @@ -85,4 +85,7 @@ #define SO_ATTACH_BPF 50 #define SO_DETACH_BPF SO_DETACH_FILTER +#define SO_ATTACH_REUSEPORT_CBPF 51 +#define SO_ATTACH_REUSEPORT_EBPF 52 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index a5cd40cd8ee1..f9cf1223422c 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -84,4 +84,7 @@ #define SO_ATTACH_BPF 0x402B #define SO_DETACH_BPF SO_DETACH_FILTER +#define SO_ATTACH_REUSEPORT_CBPF 0x402C +#define SO_ATTACH_REUSEPORT_EBPF 0x402D + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h index c046666038f8..dd54f28ecdec 100644 --- a/arch/powerpc/include/uapi/asm/socket.h +++ b/arch/powerpc/include/uapi/asm/socket.h @@ -92,4 +92,7 @@ #define SO_ATTACH_BPF 50 #define SO_DETACH_BPF SO_DETACH_FILTER +#define SO_ATTACH_REUSEPORT_CBPF 51 +#define SO_ATTACH_REUSEPORT_EBPF 52 + #endif /* _ASM_POWERPC_SOCKET_H */ diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h index 296942d56e6a..d02e89d14fef 100644 --- a/arch/s390/include/uapi/asm/socket.h +++ b/arch/s390/include/uapi/asm/socket.h @@ -91,4 +91,7 @@ #define SO_ATTACH_BPF 50 #define SO_DETACH_BPF SO_DETACH_FILTER +#define SO_ATTACH_REUSEPORT_CBPF 51 +#define SO_ATTACH_REUSEPORT_EBPF 52 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index e6a16c40be5f..d270ee91968e 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -81,6 +81,9 @@ #define SO_ATTACH_BPF 0x0034 #define SO_DETACH_BPF SO_DETACH_FILTER +#define SO_ATTACH_REUSEPORT_CBPF 0x0035 +#define SO_ATTACH_REUSEPORT_EBPF 0x0036 + /* Security levels - as per NRL IPv6 - don't actually do anything */ #define SO_SECURITY_AUTHENTICATION 0x5001 #define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002 diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h index 4120af086160..fd3b96d1153f 100644 --- a/arch/xtensa/include/uapi/asm/socket.h +++ b/arch/xtensa/include/uapi/asm/socket.h @@ -96,4 +96,7 @@ #define SO_ATTACH_BPF 50 #define SO_DETACH_BPF SO_DETACH_FILTER +#define SO_ATTACH_REUSEPORT_CBPF 51 +#define SO_ATTACH_REUSEPORT_EBPF 52 + #endif /* _XTENSA_SOCKET_H */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 4165e9ac9e36..294c3cdf07b3 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -447,6 +447,8 @@ void bpf_prog_destroy(struct bpf_prog *fp); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); +int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk); +int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk); int sk_detach_filter(struct sock *sk); int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, unsigned int len); diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 78003dfb8539..47f52d3cd8df 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -87,7 +87,8 @@ int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, u32 banned_flags); int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); -int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2); +int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, + bool match_wildcard); void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr); diff --git a/include/net/sock.h b/include/net/sock.h index 3794cdde837a..e830c1006935 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -318,6 +318,7 @@ struct cg_proto; * @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE) * @sk_backlog_rcv: callback to process the backlog * @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0 + * @sk_reuseport_cb: reuseport group container */ struct sock { /* @@ -453,6 +454,7 @@ struct sock { int (*sk_backlog_rcv)(struct sock *sk, struct sk_buff *skb); void (*sk_destruct)(struct sock *sk); + struct sock_reuseport __rcu *sk_reuseport_cb; }; #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data))) diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h new file mode 100644 index 000000000000..7dda3d7adba8 --- /dev/null +++ b/include/net/sock_reuseport.h @@ -0,0 +1,28 @@ +#ifndef _SOCK_REUSEPORT_H +#define _SOCK_REUSEPORT_H + +#include +#include +#include +#include + +struct sock_reuseport { + struct rcu_head rcu; + + u16 max_socks; /* length of socks */ + u16 num_socks; /* elements in socks */ + struct bpf_prog __rcu *prog; /* optional BPF sock selector */ + struct sock *socks[0]; /* array of sock pointers */ +}; + +extern int reuseport_alloc(struct sock *sk); +extern int reuseport_add_sock(struct sock *sk, const struct sock *sk2); +extern void reuseport_detach_sock(struct sock *sk); +extern struct sock *reuseport_select_sock(struct sock *sk, + u32 hash, + struct sk_buff *skb, + int hdr_len); +extern struct bpf_prog *reuseport_attach_prog(struct sock *sk, + struct bpf_prog *prog); + +#endif /* _SOCK_REUSEPORT_H */ diff --git a/include/net/udp.h b/include/net/udp.h index 6d4ed18e1427..2842541e28e7 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -191,7 +191,7 @@ static inline void udp_lib_close(struct sock *sk, long timeout) } int udp_lib_get_port(struct sock *sk, unsigned short snum, - int (*)(const struct sock *, const struct sock *), + int (*)(const struct sock *, const struct sock *, bool), unsigned int hash2_nulladdr); u32 udp_flow_hashrnd(void); @@ -258,7 +258,7 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif); struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif, - struct udp_table *tbl); + struct udp_table *tbl, struct sk_buff *skb); struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, @@ -266,7 +266,8 @@ struct sock *udp6_lib_lookup(struct net *net, struct sock *__udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, - int dif, struct udp_table *tbl); + int dif, struct udp_table *tbl, + struct sk_buff *skb); /* * SNMP statistics for UDP and UDP-Lite diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index 5c15c2a5c123..fb8a41668382 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -87,4 +87,7 @@ #define SO_ATTACH_BPF 50 #define SO_DETACH_BPF SO_DETACH_FILTER +#define SO_ATTACH_REUSEPORT_CBPF 51 +#define SO_ATTACH_REUSEPORT_EBPF 52 + #endif /* __ASM_GENERIC_SOCKET_H */ diff --git a/net/core/Makefile b/net/core/Makefile index 086b01fbe1bd..0b835de04de3 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -9,7 +9,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ - sock_diag.o dev_ioctl.o tso.o + sock_diag.o dev_ioctl.o tso.o sock_reuseport.o obj-$(CONFIG_XFRM) += flow.o obj-y += net-sysfs.o diff --git a/net/core/filter.c b/net/core/filter.c index c770196ae8d5..35e6fed28709 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -50,6 +50,7 @@ #include #include #include +#include /** * sk_filter - run a packet through a socket filter @@ -1167,6 +1168,68 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) return 0; } +static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk) +{ + struct bpf_prog *old_prog; + int err; + + if (bpf_prog_size(prog->len) > sysctl_optmem_max) + return -ENOMEM; + + if (sk_unhashed(sk)) { + err = reuseport_alloc(sk); + if (err) + return err; + } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { + /* The socket wasn't bound with SO_REUSEPORT */ + return -EINVAL; + } + + old_prog = reuseport_attach_prog(sk, prog); + if (old_prog) + bpf_prog_destroy(old_prog); + + return 0; +} + +static +struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) +{ + unsigned int fsize = bpf_classic_proglen(fprog); + unsigned int bpf_fsize = bpf_prog_size(fprog->len); + struct bpf_prog *prog; + int err; + + if (sock_flag(sk, SOCK_FILTER_LOCKED)) + return ERR_PTR(-EPERM); + + /* Make sure new filter is there and in the right amounts. */ + if (fprog->filter == NULL) + return ERR_PTR(-EINVAL); + + prog = bpf_prog_alloc(bpf_fsize, 0); + if (!prog) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(prog->insns, fprog->filter, fsize)) { + __bpf_prog_free(prog); + return ERR_PTR(-EFAULT); + } + + prog->len = fprog->len; + + err = bpf_prog_store_orig_filter(prog, fprog); + if (err) { + __bpf_prog_free(prog); + return ERR_PTR(-ENOMEM); + } + + /* bpf_prepare_filter() already takes care of freeing + * memory in case something goes wrong. + */ + return bpf_prepare_filter(prog, NULL); +} + /** * sk_attach_filter - attach a socket filter * @fprog: the filter program @@ -1179,39 +1242,9 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) */ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) { - unsigned int fsize = bpf_classic_proglen(fprog); - unsigned int bpf_fsize = bpf_prog_size(fprog->len); - struct bpf_prog *prog; + struct bpf_prog *prog = __get_filter(fprog, sk); int err; - if (sock_flag(sk, SOCK_FILTER_LOCKED)) - return -EPERM; - - /* Make sure new filter is there and in the right amounts. */ - if (fprog->filter == NULL) - return -EINVAL; - - prog = bpf_prog_alloc(bpf_fsize, 0); - if (!prog) - return -ENOMEM; - - if (copy_from_user(prog->insns, fprog->filter, fsize)) { - __bpf_prog_free(prog); - return -EFAULT; - } - - prog->len = fprog->len; - - err = bpf_prog_store_orig_filter(prog, fprog); - if (err) { - __bpf_prog_free(prog); - return -ENOMEM; - } - - /* bpf_prepare_filter() already takes care of freeing - * memory in case something goes wrong. - */ - prog = bpf_prepare_filter(prog, NULL); if (IS_ERR(prog)) return PTR_ERR(prog); @@ -1225,23 +1258,50 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) } EXPORT_SYMBOL_GPL(sk_attach_filter); -int sk_attach_bpf(u32 ufd, struct sock *sk) +int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) { - struct bpf_prog *prog; + struct bpf_prog *prog = __get_filter(fprog, sk); int err; - if (sock_flag(sk, SOCK_FILTER_LOCKED)) - return -EPERM; - - prog = bpf_prog_get(ufd); if (IS_ERR(prog)) return PTR_ERR(prog); - if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) { - bpf_prog_put(prog); - return -EINVAL; + err = __reuseport_attach_prog(prog, sk); + if (err < 0) { + __bpf_prog_release(prog); + return err; } + return 0; +} + +static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) +{ + struct bpf_prog *prog; + + if (sock_flag(sk, SOCK_FILTER_LOCKED)) + return ERR_PTR(-EPERM); + + prog = bpf_prog_get(ufd); + if (IS_ERR(prog)) + return prog; + + if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) { + bpf_prog_put(prog); + return ERR_PTR(-EINVAL); + } + + return prog; +} + +int sk_attach_bpf(u32 ufd, struct sock *sk) +{ + struct bpf_prog *prog = __get_bpf(ufd, sk); + int err; + + if (IS_ERR(prog)) + return PTR_ERR(prog); + err = __sk_attach_prog(prog, sk); if (err < 0) { bpf_prog_put(prog); @@ -1251,6 +1311,23 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) return 0; } +int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) +{ + struct bpf_prog *prog = __get_bpf(ufd, sk); + int err; + + if (IS_ERR(prog)) + return PTR_ERR(prog); + + err = __reuseport_attach_prog(prog, sk); + if (err < 0) { + bpf_prog_put(prog); + return err; + } + + return 0; +} + #define BPF_RECOMPUTE_CSUM(flags) ((flags) & 1) #define BPF_LDST_LEN 16U diff --git a/net/core/sock.c b/net/core/sock.c index 565bab7baca9..51270238e269 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -134,6 +134,7 @@ #include #include +#include #include @@ -932,6 +933,32 @@ int sock_setsockopt(struct socket *sock, int level, int optname, } break; + case SO_ATTACH_REUSEPORT_CBPF: + ret = -EINVAL; + if (optlen == sizeof(struct sock_fprog)) { + struct sock_fprog fprog; + + ret = -EFAULT; + if (copy_from_user(&fprog, optval, sizeof(fprog))) + break; + + ret = sk_reuseport_attach_filter(&fprog, sk); + } + break; + + case SO_ATTACH_REUSEPORT_EBPF: + ret = -EINVAL; + if (optlen == sizeof(u32)) { + u32 ufd; + + ret = -EFAULT; + if (copy_from_user(&ufd, optval, sizeof(ufd))) + break; + + ret = sk_reuseport_attach_bpf(ufd, sk); + } + break; + case SO_DETACH_FILTER: ret = sk_detach_filter(sk); break; @@ -1443,6 +1470,8 @@ void sk_destruct(struct sock *sk) sk_filter_uncharge(sk, filter); RCU_INIT_POINTER(sk->sk_filter, NULL); } + if (rcu_access_pointer(sk->sk_reuseport_cb)) + reuseport_detach_sock(sk); sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c new file mode 100644 index 000000000000..ae0969c0fc2e --- /dev/null +++ b/net/core/sock_reuseport.c @@ -0,0 +1,251 @@ +/* + * To speed up listener socket lookup, create an array to store all sockets + * listening on the same port. This allows a decision to be made after finding + * the first socket. An optional BPF program can also be configured for + * selecting the socket index from the array of available sockets. + */ + +#include +#include +#include + +#define INIT_SOCKS 128 + +static DEFINE_SPINLOCK(reuseport_lock); + +static struct sock_reuseport *__reuseport_alloc(u16 max_socks) +{ + size_t size = sizeof(struct sock_reuseport) + + sizeof(struct sock *) * max_socks; + struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC); + + if (!reuse) + return NULL; + + reuse->max_socks = max_socks; + + RCU_INIT_POINTER(reuse->prog, NULL); + return reuse; +} + +int reuseport_alloc(struct sock *sk) +{ + struct sock_reuseport *reuse; + + /* bh lock used since this function call may precede hlist lock in + * soft irq of receive path or setsockopt from process context + */ + spin_lock_bh(&reuseport_lock); + WARN_ONCE(rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)), + "multiple allocations for the same socket"); + reuse = __reuseport_alloc(INIT_SOCKS); + if (!reuse) { + spin_unlock_bh(&reuseport_lock); + return -ENOMEM; + } + + reuse->socks[0] = sk; + reuse->num_socks = 1; + rcu_assign_pointer(sk->sk_reuseport_cb, reuse); + + spin_unlock_bh(&reuseport_lock); + + return 0; +} +EXPORT_SYMBOL(reuseport_alloc); + +static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) +{ + struct sock_reuseport *more_reuse; + u32 more_socks_size, i; + + more_socks_size = reuse->max_socks * 2U; + if (more_socks_size > U16_MAX) + return NULL; + + more_reuse = __reuseport_alloc(more_socks_size); + if (!more_reuse) + return NULL; + + more_reuse->max_socks = more_socks_size; + more_reuse->num_socks = reuse->num_socks; + more_reuse->prog = reuse->prog; + + memcpy(more_reuse->socks, reuse->socks, + reuse->num_socks * sizeof(struct sock *)); + + for (i = 0; i < reuse->num_socks; ++i) + rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb, + more_reuse); + + /* Note: we use kfree_rcu here instead of reuseport_free_rcu so + * that reuse and more_reuse can temporarily share a reference + * to prog. + */ + kfree_rcu(reuse, rcu); + return more_reuse; +} + +/** + * reuseport_add_sock - Add a socket to the reuseport group of another. + * @sk: New socket to add to the group. + * @sk2: Socket belonging to the existing reuseport group. + * May return ENOMEM and not add socket to group under memory pressure. + */ +int reuseport_add_sock(struct sock *sk, const struct sock *sk2) +{ + struct sock_reuseport *reuse; + + spin_lock_bh(&reuseport_lock); + reuse = rcu_dereference_protected(sk2->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)), + WARN_ONCE(rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)), + "socket already in reuseport group"); + + if (reuse->num_socks == reuse->max_socks) { + reuse = reuseport_grow(reuse); + if (!reuse) { + spin_unlock_bh(&reuseport_lock); + return -ENOMEM; + } + } + + reuse->socks[reuse->num_socks] = sk; + /* paired with smp_rmb() in reuseport_select_sock() */ + smp_wmb(); + reuse->num_socks++; + rcu_assign_pointer(sk->sk_reuseport_cb, reuse); + + spin_unlock_bh(&reuseport_lock); + + return 0; +} +EXPORT_SYMBOL(reuseport_add_sock); + +static void reuseport_free_rcu(struct rcu_head *head) +{ + struct sock_reuseport *reuse; + + reuse = container_of(head, struct sock_reuseport, rcu); + if (reuse->prog) + bpf_prog_destroy(reuse->prog); + kfree(reuse); +} + +void reuseport_detach_sock(struct sock *sk) +{ + struct sock_reuseport *reuse; + int i; + + spin_lock_bh(&reuseport_lock); + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + rcu_assign_pointer(sk->sk_reuseport_cb, NULL); + + for (i = 0; i < reuse->num_socks; i++) { + if (reuse->socks[i] == sk) { + reuse->socks[i] = reuse->socks[reuse->num_socks - 1]; + reuse->num_socks--; + if (reuse->num_socks == 0) + call_rcu(&reuse->rcu, reuseport_free_rcu); + break; + } + } + spin_unlock_bh(&reuseport_lock); +} +EXPORT_SYMBOL(reuseport_detach_sock); + +static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks, + struct bpf_prog *prog, struct sk_buff *skb, + int hdr_len) +{ + struct sk_buff *nskb = NULL; + u32 index; + + if (skb_shared(skb)) { + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return NULL; + skb = nskb; + } + + /* temporarily advance data past protocol header */ + if (!pskb_pull(skb, hdr_len)) { + consume_skb(nskb); + return NULL; + } + index = bpf_prog_run_save_cb(prog, skb); + __skb_push(skb, hdr_len); + + consume_skb(nskb); + + if (index >= socks) + return NULL; + + return reuse->socks[index]; +} + +/** + * reuseport_select_sock - Select a socket from an SO_REUSEPORT group. + * @sk: First socket in the group. + * @hash: When no BPF filter is available, use this hash to select. + * @skb: skb to run through BPF filter. + * @hdr_len: BPF filter expects skb data pointer at payload data. If + * the skb does not yet point at the payload, this parameter represents + * how far the pointer needs to advance to reach the payload. + * Returns a socket that should receive the packet (or NULL on error). + */ +struct sock *reuseport_select_sock(struct sock *sk, + u32 hash, + struct sk_buff *skb, + int hdr_len) +{ + struct sock_reuseport *reuse; + struct bpf_prog *prog; + struct sock *sk2 = NULL; + u16 socks; + + rcu_read_lock(); + reuse = rcu_dereference(sk->sk_reuseport_cb); + + /* if memory allocation failed or add call is not yet complete */ + if (!reuse) + goto out; + + prog = rcu_dereference(reuse->prog); + socks = READ_ONCE(reuse->num_socks); + if (likely(socks)) { + /* paired with smp_wmb() in reuseport_add_sock() */ + smp_rmb(); + + if (prog && skb) + sk2 = run_bpf(reuse, socks, prog, skb, hdr_len); + else + sk2 = reuse->socks[reciprocal_scale(hash, socks)]; + } + +out: + rcu_read_unlock(); + return sk2; +} +EXPORT_SYMBOL(reuseport_select_sock); + +struct bpf_prog * +reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) +{ + struct sock_reuseport *reuse; + struct bpf_prog *old_prog; + + spin_lock_bh(&reuseport_lock); + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + old_prog = rcu_dereference_protected(reuse->prog, + lockdep_is_held(&reuseport_lock)); + rcu_assign_pointer(reuse->prog, prog); + spin_unlock_bh(&reuseport_lock); + + return old_prog; +} +EXPORT_SYMBOL(reuseport_attach_prog); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ac14ae44390d..835378365f25 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -113,6 +113,7 @@ #include #include #include "udp_impl.h" +#include struct udp_table udp_table __read_mostly; EXPORT_SYMBOL(udp_table); @@ -137,7 +138,8 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, unsigned long *bitmap, struct sock *sk, int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2), + const struct sock *sk2, + bool match_wildcard), unsigned int log) { struct sock *sk2; @@ -152,8 +154,9 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && (!sk2->sk_reuseport || !sk->sk_reuseport || + rcu_access_pointer(sk->sk_reuseport_cb) || !uid_eq(uid, sock_i_uid(sk2))) && - saddr_comp(sk, sk2)) { + saddr_comp(sk, sk2, true)) { if (!bitmap) return 1; __set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap); @@ -170,7 +173,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, struct udp_hslot *hslot2, struct sock *sk, int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2)) + const struct sock *sk2, + bool match_wildcard)) { struct sock *sk2; struct hlist_nulls_node *node; @@ -186,8 +190,9 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && (!sk2->sk_reuseport || !sk->sk_reuseport || + rcu_access_pointer(sk->sk_reuseport_cb) || !uid_eq(uid, sock_i_uid(sk2))) && - saddr_comp(sk, sk2)) { + saddr_comp(sk, sk2, true)) { res = 1; break; } @@ -196,6 +201,35 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, return res; } +static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot, + int (*saddr_same)(const struct sock *sk1, + const struct sock *sk2, + bool match_wildcard)) +{ + struct net *net = sock_net(sk); + struct hlist_nulls_node *node; + kuid_t uid = sock_i_uid(sk); + struct sock *sk2; + + sk_nulls_for_each(sk2, node, &hslot->head) { + if (net_eq(sock_net(sk2), net) && + sk2 != sk && + sk2->sk_family == sk->sk_family && + ipv6_only_sock(sk2) == ipv6_only_sock(sk) && + (udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) && + (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && + sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && + (*saddr_same)(sk, sk2, false)) { + return reuseport_add_sock(sk, sk2); + } + } + + /* Initial allocation may have already happened via setsockopt */ + if (!rcu_access_pointer(sk->sk_reuseport_cb)) + return reuseport_alloc(sk); + return 0; +} + /** * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6 * @@ -207,7 +241,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, */ int udp_lib_get_port(struct sock *sk, unsigned short snum, int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2), + const struct sock *sk2, + bool match_wildcard), unsigned int hash2_nulladdr) { struct udp_hslot *hslot, *hslot2; @@ -290,6 +325,14 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, udp_sk(sk)->udp_port_hash = snum; udp_sk(sk)->udp_portaddr_hash ^= snum; if (sk_unhashed(sk)) { + if (sk->sk_reuseport && + udp_reuseport_add_sock(sk, hslot, saddr_comp)) { + inet_sk(sk)->inet_num = 0; + udp_sk(sk)->udp_port_hash = 0; + udp_sk(sk)->udp_portaddr_hash ^= snum; + goto fail_unlock; + } + sk_nulls_add_node_rcu(sk, &hslot->head); hslot->count++; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); @@ -309,13 +352,22 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, } EXPORT_SYMBOL(udp_lib_get_port); -static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) +/* match_wildcard == true: 0.0.0.0 equals to any IPv4 addresses + * match_wildcard == false: addresses must be exactly the same, i.e. + * 0.0.0.0 only equals to 0.0.0.0 + */ +static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2, + bool match_wildcard) { struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); - return (!ipv6_only_sock(sk2) && - (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr || - inet1->inet_rcv_saddr == inet2->inet_rcv_saddr)); + if (!ipv6_only_sock(sk2)) { + if (inet1->inet_rcv_saddr == inet2->inet_rcv_saddr) + return 1; + if (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr) + return match_wildcard; + } + return 0; } static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr, @@ -459,8 +511,14 @@ static struct sock *udp4_lib_lookup2(struct net *net, badness = score; reuseport = sk->sk_reuseport; if (reuseport) { + struct sock *sk2; hash = udp_ehashfn(net, daddr, hnum, saddr, sport); + sk2 = reuseport_select_sock(sk, hash, NULL, 0); + if (sk2) { + result = sk2; + goto found; + } matches = 1; } } else if (score == badness && reuseport) { @@ -478,6 +536,7 @@ static struct sock *udp4_lib_lookup2(struct net *net, if (get_nulls_value(node) != slot2) goto begin; if (result) { +found: if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) result = NULL; else if (unlikely(compute_score2(result, net, saddr, sport, @@ -494,7 +553,7 @@ static struct sock *udp4_lib_lookup2(struct net *net, */ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, - int dif, struct udp_table *udptable) + int dif, struct udp_table *udptable, struct sk_buff *skb) { struct sock *sk, *result; struct hlist_nulls_node *node; @@ -540,8 +599,15 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, badness = score; reuseport = sk->sk_reuseport; if (reuseport) { + struct sock *sk2; hash = udp_ehashfn(net, daddr, hnum, saddr, sport); + sk2 = reuseport_select_sock(sk, hash, skb, + sizeof(struct udphdr)); + if (sk2) { + result = sk2; + goto found; + } matches = 1; } } else if (score == badness && reuseport) { @@ -560,6 +626,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, goto begin; if (result) { +found: if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) result = NULL; else if (unlikely(compute_score(result, net, saddr, hnum, sport, @@ -581,13 +648,14 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport, iph->daddr, dport, inet_iif(skb), - udptable); + udptable, skb); } struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif) { - return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table); + return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, + &udp_table, NULL); } EXPORT_SYMBOL_GPL(udp4_lib_lookup); @@ -635,7 +703,8 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) struct net *net = dev_net(skb->dev); sk = __udp4_lib_lookup(net, iph->daddr, uh->dest, - iph->saddr, uh->source, skb->dev->ifindex, udptable); + iph->saddr, uh->source, skb->dev->ifindex, udptable, + NULL); if (!sk) { ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); return; /* No socket for error */ @@ -1398,6 +1467,8 @@ void udp_lib_unhash(struct sock *sk) hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); spin_lock_bh(&hslot->lock); + if (rcu_access_pointer(sk->sk_reuseport_cb)) + reuseport_detach_sock(sk); if (sk_nulls_del_node_init_rcu(sk)) { hslot->count--; inet_sk(sk)->inet_num = 0; @@ -1425,22 +1496,28 @@ void udp_lib_rehash(struct sock *sk, u16 newhash) hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); nhslot2 = udp_hashslot2(udptable, newhash); udp_sk(sk)->udp_portaddr_hash = newhash; - if (hslot2 != nhslot2) { + + if (hslot2 != nhslot2 || + rcu_access_pointer(sk->sk_reuseport_cb)) { hslot = udp_hashslot(udptable, sock_net(sk), udp_sk(sk)->udp_port_hash); /* we must lock primary chain too */ spin_lock_bh(&hslot->lock); + if (rcu_access_pointer(sk->sk_reuseport_cb)) + reuseport_detach_sock(sk); - spin_lock(&hslot2->lock); - hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); - hslot2->count--; - spin_unlock(&hslot2->lock); + if (hslot2 != nhslot2) { + spin_lock(&hslot2->lock); + hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); + hslot2->count--; + spin_unlock(&hslot2->lock); - spin_lock(&nhslot2->lock); - hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, - &nhslot2->head); - nhslot2->count++; - spin_unlock(&nhslot2->lock); + spin_lock(&nhslot2->lock); + hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, + &nhslot2->head); + nhslot2->count++; + spin_unlock(&nhslot2->lock); + } spin_unlock_bh(&hslot->lock); } diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 6116604bf6e8..df1966f3b6ec 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -44,7 +44,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, sk = __udp4_lib_lookup(net, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_dst[0], req->id.idiag_dport, - req->id.idiag_if, tbl); + req->id.idiag_if, tbl, NULL); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) sk = __udp6_lib_lookup(net, @@ -52,7 +52,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, req->id.idiag_sport, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, - req->id.idiag_if, tbl); + req->id.idiag_if, tbl, NULL); #endif else goto out_nosk; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index a7ca2cde2ecb..36c3f0155010 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -51,12 +51,12 @@ int inet6_csk_bind_conflict(const struct sock *sk, (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(uid, sock_i_uid((struct sock *)sk2))))) { - if (ipv6_rcv_saddr_equal(sk, sk2)) + if (ipv6_rcv_saddr_equal(sk, sk2, true)) break; } if (!relax && reuse && sk2->sk_reuse && sk2->sk_state != TCP_LISTEN && - ipv6_rcv_saddr_equal(sk, sk2)) + ipv6_rcv_saddr_equal(sk, sk2, true)) break; } } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 00775ee27d86..56fcb55fda31 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -76,7 +77,14 @@ static u32 udp6_ehashfn(const struct net *net, udp_ipv6_hash_secret + net_hash_mix(net)); } -int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) +/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6 + * only, and any IPv4 addresses if not IPv6 only + * match_wildcard == false: addresses must be exactly the same, i.e. + * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, + * and 0.0.0.0 equals to 0.0.0.0 only + */ +int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, + bool match_wildcard) { const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); int sk2_ipv6only = inet_v6_ipv6only(sk2); @@ -84,16 +92,24 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; /* if both are mapped, treat as IPv4 */ - if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) - return (!sk2_ipv6only && - (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr || - sk->sk_rcv_saddr == sk2->sk_rcv_saddr)); + if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { + if (!sk2_ipv6only) { + if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr) + return 1; + if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr) + return match_wildcard; + } + return 0; + } - if (addr_type2 == IPV6_ADDR_ANY && + if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY) + return 1; + + if (addr_type2 == IPV6_ADDR_ANY && match_wildcard && !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) return 1; - if (addr_type == IPV6_ADDR_ANY && + if (addr_type == IPV6_ADDR_ANY && match_wildcard && !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED)) return 1; @@ -253,8 +269,14 @@ static struct sock *udp6_lib_lookup2(struct net *net, badness = score; reuseport = sk->sk_reuseport; if (reuseport) { + struct sock *sk2; hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); + sk2 = reuseport_select_sock(sk, hash, NULL, 0); + if (sk2) { + result = sk2; + goto found; + } matches = 1; } } else if (score == badness && reuseport) { @@ -273,6 +295,7 @@ static struct sock *udp6_lib_lookup2(struct net *net, goto begin; if (result) { +found: if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) result = NULL; else if (unlikely(compute_score2(result, net, saddr, sport, @@ -287,7 +310,8 @@ static struct sock *udp6_lib_lookup2(struct net *net, struct sock *__udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, - int dif, struct udp_table *udptable) + int dif, struct udp_table *udptable, + struct sk_buff *skb) { struct sock *sk, *result; struct hlist_nulls_node *node; @@ -332,8 +356,15 @@ struct sock *__udp6_lib_lookup(struct net *net, badness = score; reuseport = sk->sk_reuseport; if (reuseport) { + struct sock *sk2; hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); + sk2 = reuseport_select_sock(sk, hash, skb, + sizeof(struct udphdr)); + if (sk2) { + result = sk2; + goto found; + } matches = 1; } } else if (score == badness && reuseport) { @@ -352,6 +383,7 @@ struct sock *__udp6_lib_lookup(struct net *net, goto begin; if (result) { +found: if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) result = NULL; else if (unlikely(compute_score(result, net, hnum, saddr, sport, @@ -377,13 +409,13 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, return sk; return __udp6_lib_lookup(dev_net(skb_dst(skb)->dev), &iph->saddr, sport, &iph->daddr, dport, inet6_iif(skb), - udptable); + udptable, skb); } struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif) { - return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table); + return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table, NULL); } EXPORT_SYMBOL_GPL(udp6_lib_lookup); @@ -549,8 +581,8 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, int err; struct net *net = dev_net(skb->dev); - sk = __udp6_lib_lookup(net, daddr, uh->dest, - saddr, uh->source, inet6_iif(skb), udptable); + sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source, + inet6_iif(skb), udptable, skb); if (!sk) { ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 00326629d4af..6fb23366b258 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -1,3 +1,4 @@ socket psock_fanout psock_tpacket +reuseport_bpf diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index fac4782c51d8..41449b5ad0a9 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -4,7 +4,7 @@ CFLAGS = -Wall -O2 -g CFLAGS += -I../../../../usr/include/ -NET_PROGS = socket psock_fanout psock_tpacket +NET_PROGS = socket psock_fanout psock_tpacket reuseport_bpf all: $(NET_PROGS) %: %.c diff --git a/tools/testing/selftests/net/reuseport_bpf.c b/tools/testing/selftests/net/reuseport_bpf.c new file mode 100644 index 000000000000..74ff09988958 --- /dev/null +++ b/tools/testing/selftests/net/reuseport_bpf.c @@ -0,0 +1,467 @@ +/* + * Test functionality of BPF filters for SO_REUSEPORT. The tests below will use + * a BPF program (both classic and extended) to read the first word from an + * incoming packet (expected to be in network byte-order), calculate a modulus + * of that number, and then dispatch the packet to the Nth socket using the + * result. These tests are run for each supported address family and protocol. + * Additionally, a few edge cases in the implementation are tested. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) +#endif + +struct test_params { + int recv_family; + int send_family; + int protocol; + size_t recv_socks; + uint16_t recv_port; + uint16_t send_port_min; +}; + +static size_t sockaddr_size(void) +{ + return sizeof(struct sockaddr_storage); +} + +static struct sockaddr *new_any_sockaddr(int family, uint16_t port) +{ + struct sockaddr_storage *addr; + struct sockaddr_in *addr4; + struct sockaddr_in6 *addr6; + + addr = malloc(sizeof(struct sockaddr_storage)); + memset(addr, 0, sizeof(struct sockaddr_storage)); + + switch (family) { + case AF_INET: + addr4 = (struct sockaddr_in *)addr; + addr4->sin_family = AF_INET; + addr4->sin_addr.s_addr = htonl(INADDR_ANY); + addr4->sin_port = htons(port); + break; + case AF_INET6: + addr6 = (struct sockaddr_in6 *)addr; + addr6->sin6_family = AF_INET6; + addr6->sin6_addr = in6addr_any; + addr6->sin6_port = htons(port); + break; + default: + error(1, 0, "Unsupported family %d", family); + } + return (struct sockaddr *)addr; +} + +static struct sockaddr *new_loopback_sockaddr(int family, uint16_t port) +{ + struct sockaddr *addr = new_any_sockaddr(family, port); + struct sockaddr_in *addr4; + struct sockaddr_in6 *addr6; + + switch (family) { + case AF_INET: + addr4 = (struct sockaddr_in *)addr; + addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + break; + case AF_INET6: + addr6 = (struct sockaddr_in6 *)addr; + addr6->sin6_addr = in6addr_loopback; + break; + default: + error(1, 0, "Unsupported family %d", family); + } + return addr; +} + +static void attach_ebpf(int fd, uint16_t mod) +{ + static char bpf_log_buf[65536]; + static const char bpf_license[] = "GPL"; + + int bpf_fd; + const struct bpf_insn prog[] = { + /* BPF_MOV64_REG(BPF_REG_6, BPF_REG_1) */ + { BPF_ALU64 | BPF_MOV | BPF_X, BPF_REG_6, BPF_REG_1, 0, 0 }, + /* BPF_LD_ABS(BPF_W, 0) R0 = (uint32_t)skb[0] */ + { BPF_LD | BPF_ABS | BPF_W, 0, 0, 0, 0 }, + /* BPF_ALU64_IMM(BPF_MOD, BPF_REG_0, mod) */ + { BPF_ALU64 | BPF_MOD | BPF_K, BPF_REG_0, 0, 0, mod }, + /* BPF_EXIT_INSN() */ + { BPF_JMP | BPF_EXIT, 0, 0, 0, 0 } + }; + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER; + attr.insn_cnt = ARRAY_SIZE(prog); + attr.insns = (uint64_t)prog; + attr.license = (uint64_t)bpf_license; + attr.log_buf = (uint64_t)bpf_log_buf; + attr.log_size = sizeof(bpf_log_buf); + attr.log_level = 1; + attr.kern_version = 0; + + bpf_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); + if (bpf_fd < 0) + error(1, errno, "ebpf error. log:\n%s\n", bpf_log_buf); + + if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF, &bpf_fd, + sizeof(bpf_fd))) + error(1, errno, "failed to set SO_ATTACH_REUSEPORT_EBPF"); +} + +static void attach_cbpf(int fd, uint16_t mod) +{ + struct sock_filter code[] = { + /* A = (uint32_t)skb[0] */ + { BPF_LD | BPF_W | BPF_ABS, 0, 0, 0 }, + /* A = A % mod */ + { BPF_ALU | BPF_MOD, 0, 0, mod }, + /* return A */ + { BPF_RET | BPF_A, 0, 0, 0 }, + }; + struct sock_fprog p = { + .len = ARRAY_SIZE(code), + .filter = code, + }; + + if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_CBPF, &p, sizeof(p))) + error(1, errno, "failed to set SO_ATTACH_REUSEPORT_CBPF"); +} + +static void build_recv_group(const struct test_params p, int fd[], uint16_t mod, + void (*attach_bpf)(int, uint16_t)) +{ + struct sockaddr * const addr = + new_any_sockaddr(p.recv_family, p.recv_port); + int i, opt; + + for (i = 0; i < p.recv_socks; ++i) { + fd[i] = socket(p.recv_family, p.protocol, 0); + if (fd[i] < 0) + error(1, errno, "failed to create recv %d", i); + + opt = 1; + if (setsockopt(fd[i], SOL_SOCKET, SO_REUSEPORT, &opt, + sizeof(opt))) + error(1, errno, "failed to set SO_REUSEPORT on %d", i); + + if (i == 0) + attach_bpf(fd[i], mod); + + if (bind(fd[i], addr, sockaddr_size())) + error(1, errno, "failed to bind recv socket %d", i); + + if (p.protocol == SOCK_STREAM) + if (listen(fd[i], p.recv_socks * 10)) + error(1, errno, "failed to listen on socket"); + } + free(addr); +} + +static void send_from(struct test_params p, uint16_t sport, char *buf, + size_t len) +{ + struct sockaddr * const saddr = new_any_sockaddr(p.send_family, sport); + struct sockaddr * const daddr = + new_loopback_sockaddr(p.send_family, p.recv_port); + const int fd = socket(p.send_family, p.protocol, 0); + + if (fd < 0) + error(1, errno, "failed to create send socket"); + + if (bind(fd, saddr, sockaddr_size())) + error(1, errno, "failed to bind send socket"); + if (connect(fd, daddr, sockaddr_size())) + error(1, errno, "failed to connect"); + + if (send(fd, buf, len, 0) < 0) + error(1, errno, "failed to send message"); + + close(fd); + free(saddr); + free(daddr); +} + +static void test_recv_order(const struct test_params p, int fd[], int mod) +{ + char recv_buf[8], send_buf[8]; + struct msghdr msg; + struct iovec recv_io = { recv_buf, 8 }; + struct epoll_event ev; + int epfd, conn, i, sport, expected; + uint32_t data, ndata; + + epfd = epoll_create(1); + if (epfd < 0) + error(1, errno, "failed to create epoll"); + for (i = 0; i < p.recv_socks; ++i) { + ev.events = EPOLLIN; + ev.data.fd = fd[i]; + if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd[i], &ev)) + error(1, errno, "failed to register sock %d epoll", i); + } + + memset(&msg, 0, sizeof(msg)); + msg.msg_iov = &recv_io; + msg.msg_iovlen = 1; + + for (data = 0; data < p.recv_socks * 2; ++data) { + sport = p.send_port_min + data; + ndata = htonl(data); + memcpy(send_buf, &ndata, sizeof(ndata)); + send_from(p, sport, send_buf, sizeof(ndata)); + + i = epoll_wait(epfd, &ev, 1, -1); + if (i < 0) + error(1, errno, "epoll wait failed"); + + if (p.protocol == SOCK_STREAM) { + conn = accept(ev.data.fd, NULL, NULL); + if (conn < 0) + error(1, errno, "error accepting"); + i = recvmsg(conn, &msg, 0); + close(conn); + } else { + i = recvmsg(ev.data.fd, &msg, 0); + } + if (i < 0) + error(1, errno, "recvmsg error"); + if (i != sizeof(ndata)) + error(1, 0, "expected size %zd got %d", + sizeof(ndata), i); + + for (i = 0; i < p.recv_socks; ++i) + if (ev.data.fd == fd[i]) + break; + memcpy(&ndata, recv_buf, sizeof(ndata)); + fprintf(stderr, "Socket %d: %d\n", i, ntohl(ndata)); + + expected = (sport % mod); + if (i != expected) + error(1, 0, "expected socket %d", expected); + } +} + +static void test_reuseport_ebpf(const struct test_params p) +{ + int i, fd[p.recv_socks]; + + fprintf(stderr, "Testing EBPF mod %zd...\n", p.recv_socks); + build_recv_group(p, fd, p.recv_socks, attach_ebpf); + test_recv_order(p, fd, p.recv_socks); + + fprintf(stderr, "Reprograming, testing mod %zd...\n", p.recv_socks / 2); + attach_ebpf(fd[0], p.recv_socks / 2); + test_recv_order(p, fd, p.recv_socks / 2); + + for (i = 0; i < p.recv_socks; ++i) + close(fd[i]); +} + +static void test_reuseport_cbpf(const struct test_params p) +{ + int i, fd[p.recv_socks]; + + fprintf(stderr, "Testing CBPF mod %zd...\n", p.recv_socks); + build_recv_group(p, fd, p.recv_socks, attach_cbpf); + test_recv_order(p, fd, p.recv_socks); + + fprintf(stderr, "Reprograming, testing mod %zd...\n", p.recv_socks / 2); + attach_cbpf(fd[0], p.recv_socks / 2); + test_recv_order(p, fd, p.recv_socks / 2); + + for (i = 0; i < p.recv_socks; ++i) + close(fd[i]); +} + +static void test_extra_filter(const struct test_params p) +{ + struct sockaddr * const addr = + new_any_sockaddr(p.recv_family, p.recv_port); + int fd1, fd2, opt; + + fprintf(stderr, "Testing too many filters...\n"); + fd1 = socket(p.recv_family, p.protocol, 0); + if (fd1 < 0) + error(1, errno, "failed to create socket 1"); + fd2 = socket(p.recv_family, p.protocol, 0); + if (fd2 < 0) + error(1, errno, "failed to create socket 2"); + + opt = 1; + if (setsockopt(fd1, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt))) + error(1, errno, "failed to set SO_REUSEPORT on socket 1"); + if (setsockopt(fd2, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt))) + error(1, errno, "failed to set SO_REUSEPORT on socket 2"); + + attach_ebpf(fd1, 10); + attach_ebpf(fd2, 10); + + if (bind(fd1, addr, sockaddr_size())) + error(1, errno, "failed to bind recv socket 1"); + + if (!bind(fd2, addr, sockaddr_size()) && errno != EADDRINUSE) + error(1, errno, "bind socket 2 should fail with EADDRINUSE"); + + free(addr); +} + +static void test_filter_no_reuseport(const struct test_params p) +{ + struct sockaddr * const addr = + new_any_sockaddr(p.recv_family, p.recv_port); + const char bpf_license[] = "GPL"; + struct bpf_insn ecode[] = { + { BPF_ALU64 | BPF_MOV | BPF_K, BPF_REG_0, 0, 0, 10 }, + { BPF_JMP | BPF_EXIT, 0, 0, 0, 0 } + }; + struct sock_filter ccode[] = {{ BPF_RET | BPF_A, 0, 0, 0 }}; + union bpf_attr eprog; + struct sock_fprog cprog; + int fd, bpf_fd; + + fprintf(stderr, "Testing filters on non-SO_REUSEPORT socket...\n"); + + memset(&eprog, 0, sizeof(eprog)); + eprog.prog_type = BPF_PROG_TYPE_SOCKET_FILTER; + eprog.insn_cnt = ARRAY_SIZE(ecode); + eprog.insns = (uint64_t)ecode; + eprog.license = (uint64_t)bpf_license; + eprog.kern_version = 0; + + memset(&cprog, 0, sizeof(cprog)); + cprog.len = ARRAY_SIZE(ccode); + cprog.filter = ccode; + + + bpf_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &eprog, sizeof(eprog)); + if (bpf_fd < 0) + error(1, errno, "ebpf error"); + fd = socket(p.recv_family, p.protocol, 0); + if (fd < 0) + error(1, errno, "failed to create socket 1"); + + if (bind(fd, addr, sockaddr_size())) + error(1, errno, "failed to bind recv socket 1"); + + errno = 0; + if (!setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF, &bpf_fd, + sizeof(bpf_fd)) || errno != EINVAL) + error(1, errno, "setsockopt should have returned EINVAL"); + + errno = 0; + if (!setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_CBPF, &cprog, + sizeof(cprog)) || errno != EINVAL) + error(1, errno, "setsockopt should have returned EINVAL"); + + free(addr); +} + +static void test_filter_without_bind(void) +{ + int fd1, fd2; + + fprintf(stderr, "Testing filter add without bind...\n"); + fd1 = socket(AF_INET, SOCK_DGRAM, 0); + if (fd1 < 0) + error(1, errno, "failed to create socket 1"); + fd2 = socket(AF_INET, SOCK_DGRAM, 0); + if (fd2 < 0) + error(1, errno, "failed to create socket 2"); + + attach_ebpf(fd1, 10); + attach_cbpf(fd2, 10); + + close(fd1); + close(fd2); +} + + +int main(void) +{ + fprintf(stderr, "---- IPv4 UDP ----\n"); + test_reuseport_ebpf((struct test_params) { + .recv_family = AF_INET, + .send_family = AF_INET, + .protocol = SOCK_DGRAM, + .recv_socks = 10, + .recv_port = 8000, + .send_port_min = 9000}); + test_reuseport_cbpf((struct test_params) { + .recv_family = AF_INET, + .send_family = AF_INET, + .protocol = SOCK_DGRAM, + .recv_socks = 10, + .recv_port = 8001, + .send_port_min = 9020}); + test_extra_filter((struct test_params) { + .recv_family = AF_INET, + .protocol = SOCK_DGRAM, + .recv_port = 8002}); + test_filter_no_reuseport((struct test_params) { + .recv_family = AF_INET, + .protocol = SOCK_DGRAM, + .recv_port = 8008}); + + fprintf(stderr, "---- IPv6 UDP ----\n"); + test_reuseport_ebpf((struct test_params) { + .recv_family = AF_INET6, + .send_family = AF_INET6, + .protocol = SOCK_DGRAM, + .recv_socks = 10, + .recv_port = 8003, + .send_port_min = 9040}); + test_reuseport_cbpf((struct test_params) { + .recv_family = AF_INET6, + .send_family = AF_INET6, + .protocol = SOCK_DGRAM, + .recv_socks = 10, + .recv_port = 8004, + .send_port_min = 9060}); + test_extra_filter((struct test_params) { + .recv_family = AF_INET6, + .protocol = SOCK_DGRAM, + .recv_port = 8005}); + test_filter_no_reuseport((struct test_params) { + .recv_family = AF_INET6, + .protocol = SOCK_DGRAM, + .recv_port = 8009}); + + fprintf(stderr, "---- IPv6 UDP w/ mapped IPv4 ----\n"); + test_reuseport_ebpf((struct test_params) { + .recv_family = AF_INET6, + .send_family = AF_INET, + .protocol = SOCK_DGRAM, + .recv_socks = 10, + .recv_port = 8006, + .send_port_min = 9080}); + test_reuseport_cbpf((struct test_params) { + .recv_family = AF_INET6, + .send_family = AF_INET, + .protocol = SOCK_DGRAM, + .recv_socks = 10, + .recv_port = 8007, + .send_port_min = 9100}); + + + test_filter_without_bind(); + + fprintf(stderr, "SUCCESS\n"); + return 0; +}