From d1361840f8c519eaee9a78ffe09e4f0a1b586846 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Apr 2018 10:33:35 -0700 Subject: [PATCH 1/5] tcp: fix SO_RCVLOWAT and RCVBUF autotuning Applications might use SO_RCVLOWAT on TCP socket hoping to receive one [E]POLLIN event only when a given amount of bytes are ready in socket receive queue. Problem is that receive autotuning is not aware of this constraint, meaning sk_rcvbuf might be too small to allow all bytes to be stored. Add a new (struct proto_ops)->set_rcvlowat method so that a protocol can override the default setsockopt(SO_RCVLOWAT) behavior. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/net.h | 1 + include/net/tcp.h | 1 + net/core/sock.c | 5 ++++- net/ipv4/af_inet.c | 1 + net/ipv4/tcp.c | 21 +++++++++++++++++++++ net/ipv6/af_inet6.c | 1 + 6 files changed, 29 insertions(+), 1 deletion(-) diff --git a/include/linux/net.h b/include/linux/net.h index 2248a052061d..6554d3ba4396 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -197,6 +197,7 @@ struct proto_ops { int offset, size_t size, int flags); int (*sendmsg_locked)(struct sock *sk, struct msghdr *msg, size_t size); + int (*set_rcvlowat)(struct sock *sk, int val); }; #define DECLARE_SOCKADDR(type, dst, src) \ diff --git a/include/net/tcp.h b/include/net/tcp.h index 9c9b3768b350..b2318242cad8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -402,6 +402,7 @@ void tcp_set_keepalive(struct sock *sk, int val); void tcp_syn_ack_timeout(const struct request_sock *req); int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); +int tcp_set_rcvlowat(struct sock *sk, int val); void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); diff --git a/net/core/sock.c b/net/core/sock.c index 6444525f610c..b2c3db169ca1 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -905,7 +905,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname, case SO_RCVLOWAT: if (val < 0) val = INT_MAX; - sk->sk_rcvlowat = val ? : 1; + if (sock->ops->set_rcvlowat) + ret = sock->ops->set_rcvlowat(sk, val); + else + sk->sk_rcvlowat = val ? : 1; break; case SO_RCVTIMEO: diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index eaed0367e669..f5c562aaef35 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1006,6 +1006,7 @@ const struct proto_ops inet_stream_ops = { .compat_getsockopt = compat_sock_common_getsockopt, .compat_ioctl = inet_compat_ioctl, #endif + .set_rcvlowat = tcp_set_rcvlowat, }; EXPORT_SYMBOL(inet_stream_ops); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bccc4c270087..0abd8d1d3d1d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1701,6 +1701,27 @@ int tcp_peek_len(struct socket *sock) } EXPORT_SYMBOL(tcp_peek_len); +/* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */ +int tcp_set_rcvlowat(struct sock *sk, int val) +{ + sk->sk_rcvlowat = val ? : 1; + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) + return 0; + + /* val comes from user space and might be close to INT_MAX */ + val <<= 1; + if (val < 0) + val = INT_MAX; + + val = min(val, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); + if (val > sk->sk_rcvbuf) { + sk->sk_rcvbuf = val; + tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val); + } + return 0; +} +EXPORT_SYMBOL(tcp_set_rcvlowat); + static void tcp_update_recv_tstamps(struct sk_buff *skb, struct scm_timestamping *tss) { diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 8da0b513f188..e70d59fb26e1 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -590,6 +590,7 @@ const struct proto_ops inet6_stream_ops = { .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif + .set_rcvlowat = tcp_set_rcvlowat, }; const struct proto_ops inet6_dgram_ops = { From 796f82eafcd96629c2f9a0332dbb4f474854aaf8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Apr 2018 10:33:36 -0700 Subject: [PATCH 2/5] tcp: fix delayed acks behavior for SO_RCVLOWAT We should not delay acks if there are not enough bytes in receive queue to satisfy SO_RCVLOWAT. Since [E]POLLIN event is not going to be generated, there is little hope for a delayed ack to be useful. In fact, delaying ACK prevents sender from completing the transfer. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 367def6ddeda..d854363a4387 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5026,9 +5026,12 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) /* More than one full frame received... */ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && /* ... and right edge of window advances far enough. - * (tcp_recvmsg() will send ACK otherwise). Or... + * (tcp_recvmsg() will send ACK otherwise). + * If application uses SO_RCVLOWAT, we want send ack now if + * we have not received enough bytes to satisfy the condition. */ - __tcp_select_window(sk) >= tp->rcv_wnd) || + (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || + __tcp_select_window(sk) >= tp->rcv_wnd)) || /* We ACK each frame or... */ tcp_in_quickack_mode(sk) || /* We have out of order data. */ From 03f45c883c6f391ed4fff8292415b35bd1107519 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Apr 2018 10:33:37 -0700 Subject: [PATCH 3/5] tcp: avoid extra wakeups for SO_RCVLOWAT users SO_RCVLOWAT is properly handled in tcp_poll(), so that POLLIN is only generated when enough bytes are available in receive queue, after David change (commit c7004482e8dc "tcp: Respect SO_RCVLOWAT in tcp_poll().") But TCP still calls sk->sk_data_ready() for each chunk added in receive queue, meaning thread is awaken, and goes back to sleep shortly after. Tested: tcp_mmap test program, receiving 32768 MB of data with SO_RCVLOWAT set to 512KB -> Should get ~2 wakeups (c-switches) per MB, regardless of how many (tiny or big) packets were received. High speed (mostly full size GRO packets) received 32768 MB (100 % mmap'ed) in 8.03112 s, 34.2266 Gbit, cpu usage user:0.037 sys:1.404, 43.9758 usec per MB, 65497 c-switches received 32768 MB (99.9954 % mmap'ed) in 7.98453 s, 34.4263 Gbit, cpu usage user:0.03 sys:1.422, 44.3115 usec per MB, 65485 c-switches Low speed (sender is ratelimited and sends 1-MSS at a time, so GRO is not helping) received 22474.5 MB (100 % mmap'ed) in 6015.35 s, 0.0313414 Gbit, cpu usage user:0.05 sys:1.586, 72.7952 usec per MB, 44950 c-switches Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + net/ipv4/tcp.c | 4 ++++ net/ipv4/tcp_input.c | 15 +++++++++++++-- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index b2318242cad8..0ee85c47c185 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -403,6 +403,7 @@ void tcp_syn_ack_timeout(const struct request_sock *req); int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); int tcp_set_rcvlowat(struct sock *sk, int val); +void tcp_data_ready(struct sock *sk); void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0abd8d1d3d1d..c768d306b657 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1705,6 +1705,10 @@ EXPORT_SYMBOL(tcp_peek_len); int tcp_set_rcvlowat(struct sock *sk, int val) { sk->sk_rcvlowat = val ? : 1; + + /* Check if we need to signal EPOLLIN right now */ + tcp_data_ready(sk); + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) return 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d854363a4387..f93687f97d80 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4576,6 +4576,17 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) } +void tcp_data_ready(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + int avail = tp->rcv_nxt - tp->copied_seq; + + if (avail < sk->sk_rcvlowat && !sock_flag(sk, SOCK_DONE)) + return; + + sk->sk_data_ready(sk); +} + static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -4633,7 +4644,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) if (eaten > 0) kfree_skb_partial(skb, fragstolen); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk); + tcp_data_ready(sk); return; } @@ -5434,7 +5445,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, no_ack: if (eaten) kfree_skb_partial(skb, fragstolen); - sk->sk_data_ready(sk); + tcp_data_ready(sk); return; } } From 93ab6cc69162775201587cc9da00d5016dc890e2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Apr 2018 10:33:38 -0700 Subject: [PATCH 4/5] tcp: implement mmap() for zero copy receive Some networks can make sure TCP payload can exactly fit 4KB pages, with well chosen MSS/MTU and architectures. Implement mmap() system call so that applications can avoid copying data without complex splice() games. Note that a successful mmap( X bytes) on TCP socket is consuming bytes, as if recvmsg() has been done. (tp->copied += X) Only PROT_READ mappings are accepted, as skb page frags are fundamentally shared and read only. If tcp_mmap() finds data that is not a full page, or a patch of urgent data, -EINVAL is returned, no bytes are consumed. Application must fallback to recvmsg() to read the problematic sequence. mmap() wont block, regardless of socket being in blocking or non-blocking mode. If not enough bytes are in receive queue, mmap() would return -EAGAIN, or -EIO if socket is in a state where no other bytes can be added into receive queue. An application might use SO_RCVLOWAT, poll() and/or ioctl( FIONREAD) to efficiently use mmap() On the sender side, MSG_EOR might help to clearly separate unaligned headers and 4K-aligned chunks if necessary. Tested: mlx4 (cx-3) 40Gbit NIC, with tcp_mmap program provided in following patch. MTU set to 4168 (4096 TCP payload, 40 bytes IPv6 header, 32 bytes TCP header) Without mmap() (tcp_mmap -s) received 32768 MB (0 % mmap'ed) in 8.13342 s, 33.7961 Gbit, cpu usage user:0.034 sys:3.778, 116.333 usec per MB, 63062 c-switches received 32768 MB (0 % mmap'ed) in 8.14501 s, 33.748 Gbit, cpu usage user:0.029 sys:3.997, 122.864 usec per MB, 61903 c-switches received 32768 MB (0 % mmap'ed) in 8.11723 s, 33.8635 Gbit, cpu usage user:0.048 sys:3.964, 122.437 usec per MB, 62983 c-switches received 32768 MB (0 % mmap'ed) in 8.39189 s, 32.7552 Gbit, cpu usage user:0.038 sys:4.181, 128.754 usec per MB, 55834 c-switches With mmap() on receiver (tcp_mmap -s -z) received 32768 MB (100 % mmap'ed) in 8.03083 s, 34.2278 Gbit, cpu usage user:0.024 sys:1.466, 45.4712 usec per MB, 65479 c-switches received 32768 MB (100 % mmap'ed) in 7.98805 s, 34.4111 Gbit, cpu usage user:0.026 sys:1.401, 43.5486 usec per MB, 65447 c-switches received 32768 MB (100 % mmap'ed) in 7.98377 s, 34.4296 Gbit, cpu usage user:0.028 sys:1.452, 45.166 usec per MB, 65496 c-switches received 32768 MB (99.9969 % mmap'ed) in 8.01838 s, 34.281 Gbit, cpu usage user:0.02 sys:1.446, 44.7388 usec per MB, 65505 c-switches Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 2 + net/ipv4/af_inet.c | 2 +- net/ipv4/tcp.c | 113 ++++++++++++++++++++++++++++++++++++++++++++ net/ipv6/af_inet6.c | 2 +- 4 files changed, 117 insertions(+), 2 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 0ee85c47c185..833154e3df17 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -404,6 +404,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); int tcp_set_rcvlowat(struct sock *sk, int val); void tcp_data_ready(struct sock *sk); +int tcp_mmap(struct file *file, struct socket *sock, + struct vm_area_struct *vma); void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f5c562aaef35..3ebf599cebae 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -994,7 +994,7 @@ const struct proto_ops inet_stream_ops = { .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, - .mmap = sock_no_mmap, + .mmap = tcp_mmap, .sendpage = inet_sendpage, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c768d306b657..438fbca96cd3 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1726,6 +1726,119 @@ int tcp_set_rcvlowat(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_set_rcvlowat); +/* When user wants to mmap X pages, we first need to perform the mapping + * before freeing any skbs in receive queue, otherwise user would be unable + * to fallback to standard recvmsg(). This happens if some data in the + * requested block is not exactly fitting in a page. + * + * We only support order-0 pages for the moment. + * mmap() on TCP is very strict, there is no point + * trying to accommodate with pathological layouts. + */ +int tcp_mmap(struct file *file, struct socket *sock, + struct vm_area_struct *vma) +{ + unsigned long size = vma->vm_end - vma->vm_start; + unsigned int nr_pages = size >> PAGE_SHIFT; + struct page **pages_array = NULL; + u32 seq, len, offset, nr = 0; + struct sock *sk = sock->sk; + const skb_frag_t *frags; + struct tcp_sock *tp; + struct sk_buff *skb; + int ret; + + if (vma->vm_pgoff || !nr_pages) + return -EINVAL; + + if (vma->vm_flags & VM_WRITE) + return -EPERM; + /* TODO: Maybe the following is not needed if pages are COW */ + vma->vm_flags &= ~VM_MAYWRITE; + + lock_sock(sk); + + ret = -ENOTCONN; + if (sk->sk_state == TCP_LISTEN) + goto out; + + sock_rps_record_flow(sk); + + if (tcp_inq(sk) < size) { + ret = sock_flag(sk, SOCK_DONE) ? -EIO : -EAGAIN; + goto out; + } + tp = tcp_sk(sk); + seq = tp->copied_seq; + /* Abort if urgent data is in the area */ + if (unlikely(tp->urg_data)) { + u32 urg_offset = tp->urg_seq - seq; + + ret = -EINVAL; + if (urg_offset < size) + goto out; + } + ret = -ENOMEM; + pages_array = kvmalloc_array(nr_pages, sizeof(struct page *), + GFP_KERNEL); + if (!pages_array) + goto out; + skb = tcp_recv_skb(sk, seq, &offset); + ret = -EINVAL; +skb_start: + /* We do not support anything not in page frags */ + offset -= skb_headlen(skb); + if ((int)offset < 0) + goto out; + if (skb_has_frag_list(skb)) + goto out; + len = skb->data_len - offset; + frags = skb_shinfo(skb)->frags; + while (offset) { + if (frags->size > offset) + goto out; + offset -= frags->size; + frags++; + } + while (nr < nr_pages) { + if (len) { + if (len < PAGE_SIZE) + goto out; + if (frags->size != PAGE_SIZE || frags->page_offset) + goto out; + pages_array[nr++] = skb_frag_page(frags); + frags++; + len -= PAGE_SIZE; + seq += PAGE_SIZE; + continue; + } + skb = skb->next; + offset = seq - TCP_SKB_CB(skb)->seq; + goto skb_start; + } + /* OK, we have a full set of pages ready to be inserted into vma */ + for (nr = 0; nr < nr_pages; nr++) { + ret = vm_insert_page(vma, vma->vm_start + (nr << PAGE_SHIFT), + pages_array[nr]); + if (ret) + goto out; + } + /* operation is complete, we can 'consume' all skbs */ + tp->copied_seq = seq; + tcp_rcv_space_adjust(sk); + + /* Clean up data we have read: This will do ACK frames. */ + tcp_recv_skb(sk, seq, &offset); + tcp_cleanup_rbuf(sk, size); + + ret = 0; +out: + release_sock(sk); + kvfree(pages_array); + return ret; +} +EXPORT_SYMBOL(tcp_mmap); + static void tcp_update_recv_tstamps(struct sk_buff *skb, struct scm_timestamping *tss) { diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index e70d59fb26e1..2c694912df2e 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -579,7 +579,7 @@ const struct proto_ops inet6_stream_ops = { .getsockopt = sock_common_getsockopt, /* ok */ .sendmsg = inet_sendmsg, /* ok */ .recvmsg = inet_recvmsg, /* ok */ - .mmap = sock_no_mmap, + .mmap = tcp_mmap, .sendpage = inet_sendpage, .sendmsg_locked = tcp_sendmsg_locked, .sendpage_locked = tcp_sendpage_locked, From 192dc405f3080f1f7d8e90f638dc08c0e5a803a2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Apr 2018 10:33:39 -0700 Subject: [PATCH 5/5] selftests: net: add tcp_mmap program This is a reference program showing how mmap() can be used on TCP flows to implement receive zero copy. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- tools/testing/selftests/net/Makefile | 2 + tools/testing/selftests/net/tcp_mmap.c | 437 +++++++++++++++++++++++++ 2 files changed, 439 insertions(+) create mode 100644 tools/testing/selftests/net/tcp_mmap.c diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 785fc18a16b4..23e725f4305e 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -8,9 +8,11 @@ TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetl TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh TEST_GEN_FILES = socket TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy +TEST_GEN_FILES += tcp_mmap TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict include ../lib.mk $(OUTPUT)/reuseport_bpf_numa: LDFLAGS += -lnuma +$(OUTPUT)/tcp_mmap: LDFLAGS += -lpthread diff --git a/tools/testing/selftests/net/tcp_mmap.c b/tools/testing/selftests/net/tcp_mmap.c new file mode 100644 index 000000000000..dea342fe6f4e --- /dev/null +++ b/tools/testing/selftests/net/tcp_mmap.c @@ -0,0 +1,437 @@ +/* + * Copyright 2018 Google Inc. + * Author: Eric Dumazet (edumazet@google.com) + * + * Reference program demonstrating tcp mmap() usage, + * and SO_RCVLOWAT hints for receiver. + * + * Note : NIC with header split is needed to use mmap() on TCP : + * Each incoming frame must be a multiple of PAGE_SIZE bytes of TCP payload. + * + * How to use on loopback interface : + * + * ifconfig lo mtu 61512 # 15*4096 + 40 (ipv6 header) + 32 (TCP with TS option header) + * tcp_mmap -s -z & + * tcp_mmap -H ::1 -z + * + * Or leave default lo mtu, but use -M option to set TCP_MAXSEG option to (4096 + 12) + * (4096 : page size on x86, 12: TCP TS option length) + * tcp_mmap -s -z -M $((4096+12)) & + * tcp_mmap -H ::1 -z -M $((4096+12)) + * + * Note: -z option on sender uses MSG_ZEROCOPY, which forces a copy when packets go through loopback interface. + * We might use sendfile() instead, but really this test program is about mmap(), for receivers ;) + * + * $ ./tcp_mmap -s & # Without mmap() + * $ for i in {1..4}; do ./tcp_mmap -H ::1 -z ; done + * received 32768 MB (0 % mmap'ed) in 14.1157 s, 19.4732 Gbit + * cpu usage user:0.057 sys:7.815, 240.234 usec per MB, 65531 c-switches + * received 32768 MB (0 % mmap'ed) in 14.6833 s, 18.7204 Gbit + * cpu usage user:0.043 sys:8.103, 248.596 usec per MB, 65524 c-switches + * received 32768 MB (0 % mmap'ed) in 11.143 s, 24.6682 Gbit + * cpu usage user:0.044 sys:6.576, 202.026 usec per MB, 65519 c-switches + * received 32768 MB (0 % mmap'ed) in 14.9056 s, 18.4413 Gbit + * cpu usage user:0.036 sys:8.193, 251.129 usec per MB, 65530 c-switches + * $ kill %1 # kill tcp_mmap server + * + * $ ./tcp_mmap -s -z & # With mmap() + * $ for i in {1..4}; do ./tcp_mmap -H ::1 -z ; done + * received 32768 MB (99.9939 % mmap'ed) in 6.73792 s, 40.7956 Gbit + * cpu usage user:0.045 sys:2.827, 87.6465 usec per MB, 65532 c-switches + * received 32768 MB (99.9939 % mmap'ed) in 7.26732 s, 37.8238 Gbit + * cpu usage user:0.037 sys:3.087, 95.3369 usec per MB, 65532 c-switches + * received 32768 MB (99.9939 % mmap'ed) in 7.61661 s, 36.0893 Gbit + * cpu usage user:0.046 sys:3.559, 110.016 usec per MB, 65529 c-switches + * received 32768 MB (99.9939 % mmap'ed) in 7.43764 s, 36.9577 Gbit + * cpu usage user:0.035 sys:3.467, 106.873 usec per MB, 65530 c-switches + * + * License (GPLv2): + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef MSG_ZEROCOPY +#define MSG_ZEROCOPY 0x4000000 +#endif + +#define FILE_SZ (1UL << 35) +static int cfg_family = AF_INET6; +static socklen_t cfg_alen = sizeof(struct sockaddr_in6); +static int cfg_port = 8787; + +static int rcvbuf; /* Default: autotuning. Can be set with -r option */ +static int sndbuf; /* Default: autotuning. Can be set with -w option */ +static int zflg; /* zero copy option. (MSG_ZEROCOPY for sender, mmap() for receiver */ +static int xflg; /* hash received data (simple xor) (-h option) */ +static int keepflag; /* -k option: receiver shall keep all received file in memory (no munmap() calls) */ + +static int chunk_size = 512*1024; + +unsigned long htotal; + +static inline void prefetch(const void *x) +{ +#if defined(__x86_64__) + asm volatile("prefetcht0 %P0" : : "m" (*(const char *)x)); +#endif +} + +void hash_zone(void *zone, unsigned int length) +{ + unsigned long temp = htotal; + + while (length >= 8*sizeof(long)) { + prefetch(zone + 384); + temp ^= *(unsigned long *)zone; + temp ^= *(unsigned long *)(zone + sizeof(long)); + temp ^= *(unsigned long *)(zone + 2*sizeof(long)); + temp ^= *(unsigned long *)(zone + 3*sizeof(long)); + temp ^= *(unsigned long *)(zone + 4*sizeof(long)); + temp ^= *(unsigned long *)(zone + 5*sizeof(long)); + temp ^= *(unsigned long *)(zone + 6*sizeof(long)); + temp ^= *(unsigned long *)(zone + 7*sizeof(long)); + zone += 8*sizeof(long); + length -= 8*sizeof(long); + } + while (length >= 1) { + temp ^= *(unsigned char *)zone; + zone += 1; + length--; + } + htotal = temp; +} + +void *child_thread(void *arg) +{ + unsigned long total_mmap = 0, total = 0; + unsigned long delta_usec; + int flags = MAP_SHARED; + struct timeval t0, t1; + char *buffer = NULL; + void *oaddr = NULL; + double throughput; + struct rusage ru; + int lu, fd; + + fd = (int)(unsigned long)arg; + + gettimeofday(&t0, NULL); + + fcntl(fd, F_SETFL, O_NDELAY); + buffer = malloc(chunk_size); + if (!buffer) { + perror("malloc"); + goto error; + } + while (1) { + struct pollfd pfd = { .fd = fd, .events = POLLIN, }; + int sub; + + poll(&pfd, 1, 10000); + if (zflg) { + void *naddr; + + naddr = mmap(oaddr, chunk_size, PROT_READ, flags, fd, 0); + if (naddr == (void *)-1) { + if (errno == EAGAIN) { + /* That is if SO_RCVLOWAT is buggy */ + usleep(1000); + continue; + } + if (errno == EINVAL) { + flags = MAP_SHARED; + oaddr = NULL; + goto fallback; + } + if (errno != EIO) + perror("mmap()"); + break; + } + total_mmap += chunk_size; + if (xflg) + hash_zone(naddr, chunk_size); + total += chunk_size; + if (!keepflag) { + flags |= MAP_FIXED; + oaddr = naddr; + } + continue; + } +fallback: + sub = 0; + while (sub < chunk_size) { + lu = read(fd, buffer + sub, chunk_size - sub); + if (lu == 0) + goto end; + if (lu < 0) + break; + if (xflg) + hash_zone(buffer + sub, lu); + total += lu; + sub += lu; + } + } +end: + gettimeofday(&t1, NULL); + delta_usec = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; + + throughput = 0; + if (delta_usec) + throughput = total * 8.0 / (double)delta_usec / 1000.0; + getrusage(RUSAGE_THREAD, &ru); + if (total > 1024*1024) { + unsigned long total_usec; + unsigned long mb = total >> 20; + total_usec = 1000000*ru.ru_utime.tv_sec + ru.ru_utime.tv_usec + + 1000000*ru.ru_stime.tv_sec + ru.ru_stime.tv_usec; + printf("received %lg MB (%lg %% mmap'ed) in %lg s, %lg Gbit\n" + " cpu usage user:%lg sys:%lg, %lg usec per MB, %lu c-switches\n", + total / (1024.0 * 1024.0), + 100.0*total_mmap/total, + (double)delta_usec / 1000000.0, + throughput, + (double)ru.ru_utime.tv_sec + (double)ru.ru_utime.tv_usec / 1000000.0, + (double)ru.ru_stime.tv_sec + (double)ru.ru_stime.tv_usec / 1000000.0, + (double)total_usec/mb, + ru.ru_nvcsw); + } +error: + free(buffer); + close(fd); + pthread_exit(0); +} + +static void apply_rcvsnd_buf(int fd) +{ + if (rcvbuf && setsockopt(fd, SOL_SOCKET, + SO_RCVBUF, &rcvbuf, sizeof(rcvbuf)) == -1) { + perror("setsockopt SO_RCVBUF"); + } + + if (sndbuf && setsockopt(fd, SOL_SOCKET, + SO_SNDBUF, &sndbuf, sizeof(sndbuf)) == -1) { + perror("setsockopt SO_SNDBUF"); + } +} + + +static void setup_sockaddr(int domain, const char *str_addr, + struct sockaddr_storage *sockaddr) +{ + struct sockaddr_in6 *addr6 = (void *) sockaddr; + struct sockaddr_in *addr4 = (void *) sockaddr; + + switch (domain) { + case PF_INET: + memset(addr4, 0, sizeof(*addr4)); + addr4->sin_family = AF_INET; + addr4->sin_port = htons(cfg_port); + if (str_addr && + inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1) + error(1, 0, "ipv4 parse error: %s", str_addr); + break; + case PF_INET6: + memset(addr6, 0, sizeof(*addr6)); + addr6->sin6_family = AF_INET6; + addr6->sin6_port = htons(cfg_port); + if (str_addr && + inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1) + error(1, 0, "ipv6 parse error: %s", str_addr); + break; + default: + error(1, 0, "illegal domain"); + } +} + +static void do_accept(int fdlisten) +{ + if (setsockopt(fdlisten, SOL_SOCKET, SO_RCVLOWAT, + &chunk_size, sizeof(chunk_size)) == -1) { + perror("setsockopt SO_RCVLOWAT"); + } + + apply_rcvsnd_buf(fdlisten); + + while (1) { + struct sockaddr_in addr; + socklen_t addrlen = sizeof(addr); + pthread_t th; + int fd, res; + + fd = accept(fdlisten, (struct sockaddr *)&addr, &addrlen); + if (fd == -1) { + perror("accept"); + continue; + } + res = pthread_create(&th, NULL, child_thread, + (void *)(unsigned long)fd); + if (res) { + errno = res; + perror("pthread_create"); + close(fd); + } + } +} + +int main(int argc, char *argv[]) +{ + struct sockaddr_storage listenaddr, addr; + unsigned int max_pacing_rate = 0; + unsigned long total = 0; + char *host = NULL; + int fd, c, on = 1; + char *buffer; + int sflg = 0; + int mss = 0; + + while ((c = getopt(argc, argv, "46p:svr:w:H:zxkP:M:")) != -1) { + switch (c) { + case '4': + cfg_family = PF_INET; + cfg_alen = sizeof(struct sockaddr_in); + break; + case '6': + cfg_family = PF_INET6; + cfg_alen = sizeof(struct sockaddr_in6); + break; + case 'p': + cfg_port = atoi(optarg); + break; + case 'H': + host = optarg; + break; + case 's': /* server : listen for incoming connections */ + sflg++; + break; + case 'r': + rcvbuf = atoi(optarg); + break; + case 'w': + sndbuf = atoi(optarg); + break; + case 'z': + zflg = 1; + break; + case 'M': + mss = atoi(optarg); + break; + case 'x': + xflg = 1; + break; + case 'k': + keepflag = 1; + break; + case 'P': + max_pacing_rate = atoi(optarg) ; + break; + default: + exit(1); + } + } + if (sflg) { + int fdlisten = socket(cfg_family, SOCK_STREAM, 0); + + if (fdlisten == -1) { + perror("socket"); + exit(1); + } + apply_rcvsnd_buf(fdlisten); + setsockopt(fdlisten, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)); + + setup_sockaddr(cfg_family, host, &listenaddr); + + if (mss && + setsockopt(fdlisten, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) { + perror("setsockopt TCP_MAXSEG"); + exit(1); + } + if (bind(fdlisten, (const struct sockaddr *)&listenaddr, cfg_alen) == -1) { + perror("bind"); + exit(1); + } + if (listen(fdlisten, 128) == -1) { + perror("listen"); + exit(1); + } + do_accept(fdlisten); + } + buffer = mmap(NULL, chunk_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (buffer == (char *)-1) { + perror("mmap"); + exit(1); + } + + fd = socket(AF_INET6, SOCK_STREAM, 0); + if (fd == -1) { + perror("socket"); + exit(1); + } + apply_rcvsnd_buf(fd); + + setup_sockaddr(cfg_family, host, &addr); + + if (mss && + setsockopt(fd, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) { + perror("setsockopt TCP_MAXSEG"); + exit(1); + } + if (connect(fd, (const struct sockaddr *)&addr, cfg_alen) == -1) { + perror("connect"); + exit(1); + } + if (max_pacing_rate && + setsockopt(fd, SOL_SOCKET, SO_MAX_PACING_RATE, + &max_pacing_rate, sizeof(max_pacing_rate)) == -1) + perror("setsockopt SO_MAX_PACING_RATE"); + + if (zflg && setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, + &on, sizeof(on)) == -1) { + perror("setsockopt SO_ZEROCOPY, (-z option disabled)"); + zflg = 0; + } + while (total < FILE_SZ) { + long wr = FILE_SZ - total; + + if (wr > chunk_size) + wr = chunk_size; + /* Note : we just want to fill the pipe with 0 bytes */ + wr = send(fd, buffer, wr, zflg ? MSG_ZEROCOPY : 0); + if (wr <= 0) + break; + total += wr; + } + close(fd); + munmap(buffer, chunk_size); + return 0; +}