diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a63e13082397..686a666d073d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -454,9 +454,15 @@ enum { * all frags to avoid possible bad checksum */ SKBFL_SHARED_FRAG = BIT(1), + + /* segment contains only zerocopy data and should not be + * charged to the kernel memory. + */ + SKBFL_PURE_ZEROCOPY = BIT(2), }; #define SKBFL_ZEROCOPY_FRAG (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG) +#define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY) /* * The callback notifies userspace to release buffers when skb DMA is done in @@ -1464,6 +1470,17 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb) return is_zcopy ? skb_uarg(skb) : NULL; } +static inline bool skb_zcopy_pure(const struct sk_buff *skb) +{ + return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY; +} + +static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1, + const struct sk_buff *skb2) +{ + return skb_zcopy_pure(skb1) == skb_zcopy_pure(skb2); +} + static inline void net_zcopy_get(struct ubuf_info *uarg) { refcount_inc(&uarg->refcnt); @@ -1528,7 +1545,7 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success) if (!skb_zcopy_is_nouarg(skb)) uarg->callback(skb, uarg, zerocopy_success); - skb_shinfo(skb)->flags &= ~SKBFL_ZEROCOPY_FRAG; + skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY; } } diff --git a/include/net/tcp.h b/include/net/tcp.h index 70972f3ac8fa..4da22b41bde6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -293,7 +293,10 @@ static inline bool tcp_out_of_memory(struct sock *sk) static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb) { sk_wmem_queued_add(sk, -skb->truesize); - sk_mem_uncharge(sk, skb->truesize); + if (!skb_zcopy_pure(skb)) + sk_mem_uncharge(sk, skb->truesize); + else + sk_mem_uncharge(sk, SKB_TRUESIZE(skb_end_offset(skb))); __kfree_skb(skb); } @@ -974,7 +977,8 @@ static inline bool tcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { return likely(tcp_skb_can_collapse_to(to) && - mptcp_skb_can_collapse(to, from)); + mptcp_skb_can_collapse(to, from) && + skb_pure_zcopy_same(to, from)); } /* Events passed to congestion control interface */ diff --git a/net/core/datagram.c b/net/core/datagram.c index 15ab9ffb27fe..ee290776c661 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -646,7 +646,8 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, skb->truesize += truesize; if (sk && sk->sk_type == SOCK_STREAM) { sk_wmem_queued_add(sk, truesize); - sk_mem_charge(sk, truesize); + if (!skb_zcopy_pure(skb)) + sk_mem_charge(sk, truesize); } else { refcount_add(truesize, &skb->sk->sk_wmem_alloc); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3ec42cdee16a..ba2f38246f07 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3433,8 +3433,9 @@ static inline void skb_split_no_header(struct sk_buff *skb, void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) { int pos = skb_headlen(skb); + const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY; - skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG; + skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags; skb_zerocopy_clone(skb1, skb, 0); if (len < pos) /* Split line is inside header. */ skb_split_inside_header(skb, skb1, len, pos); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bc7f419184aa..b461ae573afc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -863,6 +863,7 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, if (likely(skb)) { bool mem_scheduled; + skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); if (force_schedule) { mem_scheduled = true; sk_forced_mem_schedule(sk, skb->truesize); @@ -1319,6 +1320,15 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) copy = min_t(int, copy, pfrag->size - pfrag->offset); + /* skb changing from pure zc to mixed, must charge zc */ + if (unlikely(skb_zcopy_pure(skb))) { + if (!sk_wmem_schedule(sk, skb->data_len)) + goto wait_for_space; + + sk_mem_charge(sk, skb->data_len); + skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY; + } + if (!sk_wmem_schedule(sk, copy)) goto wait_for_space; @@ -1339,8 +1349,16 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) } pfrag->offset += copy; } else { - if (!sk_wmem_schedule(sk, copy)) - goto wait_for_space; + /* First append to a fragless skb builds initial + * pure zerocopy skb + */ + if (!skb->len) + skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY; + + if (!skb_zcopy_pure(skb)) { + if (!sk_wmem_schedule(sk, copy)) + goto wait_for_space; + } err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg); if (err == -EMSGSIZE || err == -EEXIST) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 76cc1641beb4..6f7860e283c6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1677,7 +1677,8 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) if (delta_truesize) { skb->truesize -= delta_truesize; sk_wmem_queued_add(sk, -delta_truesize); - sk_mem_uncharge(sk, delta_truesize); + if (!skb_zcopy_pure(skb)) + sk_mem_uncharge(sk, delta_truesize); } /* Any change of skb->len requires recalculation of tso factor. */ @@ -2295,7 +2296,9 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len) if (len <= skb->len) break; - if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb)) + if (unlikely(TCP_SKB_CB(skb)->eor) || + tcp_has_tx_tstamp(skb) || + !skb_pure_zcopy_same(skb, next)) return false; len -= skb->len;