ip: use rb trees for IP frag queue.

Similar to the TCP out-of-order (OOO) RX queue, it makes sense to use
rb trees to store IP fragments, so that OOO fragments are inserted faster.

Tested:

- a follow-up patch contains a rather comprehensive ip defrag
  self-test (functional)
- ran neper `udp_stream -c -H <host> -F 100 -l 300 -T 20`:
    netstat --statistics
    Ip:
        282078937 total packets received
        0 forwarded
        0 incoming packets discarded
        946760 incoming packets delivered
        18743456 requests sent out
        101 fragments dropped after timeout
        282077129 reassemblies required
        944952 packets reassembled ok
        262734239 packet reassembles failed
   (The reassembly numbers above are somewhat better than those of
    a kernel without this patchset. More comprehensive performance
    testing is TBD.)

Reported-by: Jann Horn <jannh@google.com>
Reported-by: Juha-Matti Tilli <juha-matti.tilli@iki.fi>
Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Peter Oskolkov <posk@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Peter Oskolkov 2018-08-02 23:34:39 +00:00 committed by David S. Miller
parent 385114dec8
commit fa0f527358
6 changed files with 120 additions and 90 deletions

View File

@ -676,13 +676,16 @@ struct sk_buff {
* UDP receive path is one user. * UDP receive path is one user.
*/ */
unsigned long dev_scratch; unsigned long dev_scratch;
int ip_defrag_offset;
}; };
}; };
struct rb_node rbnode; /* used in netem & tcp stack */ struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */
struct list_head list; struct list_head list;
}; };
union {
struct sock *sk; struct sock *sk;
int ip_defrag_offset;
};
union { union {
ktime_t tstamp; ktime_t tstamp;

View File

@ -75,7 +75,8 @@ struct inet_frag_queue {
struct timer_list timer; struct timer_list timer;
spinlock_t lock; spinlock_t lock;
refcount_t refcnt; refcount_t refcnt;
struct sk_buff *fragments; struct sk_buff *fragments; /* Used in IPv6. */
struct rb_root rb_fragments; /* Used in IPv4. */
struct sk_buff *fragments_tail; struct sk_buff *fragments_tail;
ktime_t stamp; ktime_t stamp;
int len; int len;

View File

@ -137,12 +137,16 @@ void inet_frag_destroy(struct inet_frag_queue *q)
fp = q->fragments; fp = q->fragments;
nf = q->net; nf = q->net;
f = nf->f; f = nf->f;
while (fp) { if (fp) {
do {
struct sk_buff *xp = fp->next; struct sk_buff *xp = fp->next;
sum_truesize += fp->truesize; sum_truesize += fp->truesize;
kfree_skb(fp); kfree_skb(fp);
fp = xp; fp = xp;
} while (fp);
} else {
sum_truesize = skb_rbtree_purge(&q->rb_fragments);
} }
sum = sum_truesize + f->qsize; sum = sum_truesize + f->qsize;

View File

@ -136,7 +136,7 @@ static void ip_expire(struct timer_list *t)
{ {
struct inet_frag_queue *frag = from_timer(frag, t, timer); struct inet_frag_queue *frag = from_timer(frag, t, timer);
const struct iphdr *iph; const struct iphdr *iph;
struct sk_buff *head; struct sk_buff *head = NULL;
struct net *net; struct net *net;
struct ipq *qp; struct ipq *qp;
int err; int err;
@ -152,14 +152,31 @@ static void ip_expire(struct timer_list *t)
ipq_kill(qp); ipq_kill(qp);
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
head = qp->q.fragments;
__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head) if (!(qp->q.flags & INET_FRAG_FIRST_IN))
goto out; goto out;
/* sk_buff::dev and sk_buff::rbnode are unionized. So we
* pull the head out of the tree in order to be able to
* deal with head->dev.
*/
if (qp->q.fragments) {
head = qp->q.fragments;
qp->q.fragments = head->next;
} else {
head = skb_rb_first(&qp->q.rb_fragments);
if (!head)
goto out;
rb_erase(&head->rbnode, &qp->q.rb_fragments);
memset(&head->rbnode, 0, sizeof(head->rbnode));
barrier();
}
if (head == qp->q.fragments_tail)
qp->q.fragments_tail = NULL;
sub_frag_mem_limit(qp->q.net, head->truesize);
head->dev = dev_get_by_index_rcu(net, qp->iif); head->dev = dev_get_by_index_rcu(net, qp->iif);
if (!head->dev) if (!head->dev)
goto out; goto out;
@ -179,16 +196,16 @@ static void ip_expire(struct timer_list *t)
(skb_rtable(head)->rt_type != RTN_LOCAL)) (skb_rtable(head)->rt_type != RTN_LOCAL))
goto out; goto out;
skb_get(head);
spin_unlock(&qp->q.lock); spin_unlock(&qp->q.lock);
icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
kfree_skb(head);
goto out_rcu_unlock; goto out_rcu_unlock;
out: out:
spin_unlock(&qp->q.lock); spin_unlock(&qp->q.lock);
out_rcu_unlock: out_rcu_unlock:
rcu_read_unlock(); rcu_read_unlock();
if (head)
kfree_skb(head);
ipq_put(qp); ipq_put(qp);
} }
@ -231,7 +248,7 @@ static int ip_frag_too_far(struct ipq *qp)
end = atomic_inc_return(&peer->rid); end = atomic_inc_return(&peer->rid);
qp->rid = end; qp->rid = end;
rc = qp->q.fragments && (end - start) > max; rc = qp->q.fragments_tail && (end - start) > max;
if (rc) { if (rc) {
struct net *net; struct net *net;
@ -245,7 +262,6 @@ static int ip_frag_too_far(struct ipq *qp)
static int ip_frag_reinit(struct ipq *qp) static int ip_frag_reinit(struct ipq *qp)
{ {
struct sk_buff *fp;
unsigned int sum_truesize = 0; unsigned int sum_truesize = 0;
if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) { if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
@ -253,20 +269,14 @@ static int ip_frag_reinit(struct ipq *qp)
return -ETIMEDOUT; return -ETIMEDOUT;
} }
fp = qp->q.fragments; sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments);
do {
struct sk_buff *xp = fp->next;
sum_truesize += fp->truesize;
kfree_skb(fp);
fp = xp;
} while (fp);
sub_frag_mem_limit(qp->q.net, sum_truesize); sub_frag_mem_limit(qp->q.net, sum_truesize);
qp->q.flags = 0; qp->q.flags = 0;
qp->q.len = 0; qp->q.len = 0;
qp->q.meat = 0; qp->q.meat = 0;
qp->q.fragments = NULL; qp->q.fragments = NULL;
qp->q.rb_fragments = RB_ROOT;
qp->q.fragments_tail = NULL; qp->q.fragments_tail = NULL;
qp->iif = 0; qp->iif = 0;
qp->ecn = 0; qp->ecn = 0;
@ -278,7 +288,8 @@ static int ip_frag_reinit(struct ipq *qp)
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{ {
struct net *net = container_of(qp->q.net, struct net, ipv4.frags); struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
struct sk_buff *prev, *next; struct rb_node **rbn, *parent;
struct sk_buff *skb1;
struct net_device *dev; struct net_device *dev;
unsigned int fragsize; unsigned int fragsize;
int flags, offset; int flags, offset;
@ -341,58 +352,58 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
if (err) if (err)
goto err; goto err;
/* Find out which fragments are in front and at the back of us /* Note : skb->rbnode and skb->dev share the same location. */
* in the chain of fragments so far. We must know where to put dev = skb->dev;
* this fragment, right? /* Makes sure compiler wont do silly aliasing games */
*/ barrier();
prev = qp->q.fragments_tail;
if (!prev || prev->ip_defrag_offset < offset) {
next = NULL;
goto found;
}
prev = NULL;
for (next = qp->q.fragments; next != NULL; next = next->next) {
if (next->ip_defrag_offset >= offset)
break; /* bingo! */
prev = next;
}
found:
/* RFC5722, Section 4, amended by Errata ID : 3089 /* RFC5722, Section 4, amended by Errata ID : 3089
* When reassembling an IPv6 datagram, if * When reassembling an IPv6 datagram, if
* one or more its constituent fragments is determined to be an * one or more its constituent fragments is determined to be an
* overlapping fragment, the entire datagram (and any constituent * overlapping fragment, the entire datagram (and any constituent
* fragments) MUST be silently discarded. * fragments) MUST be silently discarded.
* *
* We do the same here for IPv4. * We do the same here for IPv4 (and increment an snmp counter).
*/ */
/* Is there an overlap with the previous fragment? */ /* Find out where to put this fragment. */
if (prev && skb1 = qp->q.fragments_tail;
(prev->ip_defrag_offset + prev->len) > offset) if (!skb1) {
/* This is the first fragment we've received. */
rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node);
qp->q.fragments_tail = skb;
} else if ((skb1->ip_defrag_offset + skb1->len) < end) {
/* This is the common/special case: skb goes to the end. */
/* Detect and discard overlaps. */
if (offset < (skb1->ip_defrag_offset + skb1->len))
goto discard_qp; goto discard_qp;
/* Insert after skb1. */
/* Is there an overlap with the next fragment? */ rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right);
if (next && next->ip_defrag_offset < end) qp->q.fragments_tail = skb;
} else {
/* Binary search. Note that skb can become the first fragment, but
* not the last (covered above). */
rbn = &qp->q.rb_fragments.rb_node;
do {
parent = *rbn;
skb1 = rb_to_skb(parent);
if (end <= skb1->ip_defrag_offset)
rbn = &parent->rb_left;
else if (offset >= skb1->ip_defrag_offset + skb1->len)
rbn = &parent->rb_right;
else /* Found an overlap with skb1. */
goto discard_qp; goto discard_qp;
} while (*rbn);
/* Here we have parent properly set, and rbn pointing to
* one of its NULL left/right children. Insert skb. */
rb_link_node(&skb->rbnode, parent, rbn);
}
rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
/* Note : skb->ip_defrag_offset and skb->dev share the same location */
dev = skb->dev;
if (dev) if (dev)
qp->iif = dev->ifindex; qp->iif = dev->ifindex;
/* Makes sure compiler wont do silly aliasing games */
barrier();
skb->ip_defrag_offset = offset; skb->ip_defrag_offset = offset;
/* Insert this fragment in the chain of fragments. */
skb->next = next;
if (!next)
qp->q.fragments_tail = skb;
if (prev)
prev->next = skb;
else
qp->q.fragments = skb;
qp->q.stamp = skb->tstamp; qp->q.stamp = skb->tstamp;
qp->q.meat += skb->len; qp->q.meat += skb->len;
qp->ecn |= ecn; qp->ecn |= ecn;
@ -414,7 +425,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
unsigned long orefdst = skb->_skb_refdst; unsigned long orefdst = skb->_skb_refdst;
skb->_skb_refdst = 0UL; skb->_skb_refdst = 0UL;
err = ip_frag_reasm(qp, prev, dev); err = ip_frag_reasm(qp, skb, dev);
skb->_skb_refdst = orefdst; skb->_skb_refdst = orefdst;
return err; return err;
} }
@ -431,15 +442,15 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
return err; return err;
} }
/* Build a new IP datagram from all its fragments. */ /* Build a new IP datagram from all its fragments. */
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
struct net_device *dev) struct net_device *dev)
{ {
struct net *net = container_of(qp->q.net, struct net, ipv4.frags); struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
struct iphdr *iph; struct iphdr *iph;
struct sk_buff *fp, *head = qp->q.fragments; struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments);
struct sk_buff **nextp; /* To build frag_list. */
struct rb_node *rbn;
int len; int len;
int ihlen; int ihlen;
int err; int err;
@ -453,25 +464,20 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
goto out_fail; goto out_fail;
} }
/* Make the one we just received the head. */ /* Make the one we just received the head. */
if (prev) { if (head != skb) {
head = prev->next; fp = skb_clone(skb, GFP_ATOMIC);
fp = skb_clone(head, GFP_ATOMIC);
if (!fp) if (!fp)
goto out_nomem; goto out_nomem;
rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments);
fp->next = head->next; if (qp->q.fragments_tail == skb)
if (!fp->next)
qp->q.fragments_tail = fp; qp->q.fragments_tail = fp;
prev->next = fp; skb_morph(skb, head);
rb_replace_node(&head->rbnode, &skb->rbnode,
skb_morph(head, qp->q.fragments); &qp->q.rb_fragments);
head->next = qp->q.fragments->next; consume_skb(head);
head = skb;
consume_skb(qp->q.fragments);
qp->q.fragments = head;
} }
WARN_ON(!head);
WARN_ON(head->ip_defrag_offset != 0); WARN_ON(head->ip_defrag_offset != 0);
/* Allocate a new buffer for the datagram. */ /* Allocate a new buffer for the datagram. */
@ -496,24 +502,35 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
clone = alloc_skb(0, GFP_ATOMIC); clone = alloc_skb(0, GFP_ATOMIC);
if (!clone) if (!clone)
goto out_nomem; goto out_nomem;
clone->next = head->next;
head->next = clone;
skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
skb_frag_list_init(head); skb_frag_list_init(head);
for (i = 0; i < skb_shinfo(head)->nr_frags; i++) for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
plen += skb_frag_size(&skb_shinfo(head)->frags[i]); plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
clone->len = clone->data_len = head->data_len - plen; clone->len = clone->data_len = head->data_len - plen;
head->data_len -= clone->len; skb->truesize += clone->truesize;
head->len -= clone->len;
clone->csum = 0; clone->csum = 0;
clone->ip_summed = head->ip_summed; clone->ip_summed = head->ip_summed;
add_frag_mem_limit(qp->q.net, clone->truesize); add_frag_mem_limit(qp->q.net, clone->truesize);
skb_shinfo(head)->frag_list = clone;
nextp = &clone->next;
} else {
nextp = &skb_shinfo(head)->frag_list;
} }
skb_shinfo(head)->frag_list = head->next;
skb_push(head, head->data - skb_network_header(head)); skb_push(head, head->data - skb_network_header(head));
for (fp=head->next; fp; fp = fp->next) { /* Traverse the tree in order, to build frag_list. */
rbn = rb_next(&head->rbnode);
rb_erase(&head->rbnode, &qp->q.rb_fragments);
while (rbn) {
struct rb_node *rbnext = rb_next(rbn);
fp = rb_to_skb(rbn);
rb_erase(rbn, &qp->q.rb_fragments);
rbn = rbnext;
*nextp = fp;
nextp = &fp->next;
fp->prev = NULL;
memset(&fp->rbnode, 0, sizeof(fp->rbnode));
head->data_len += fp->len; head->data_len += fp->len;
head->len += fp->len; head->len += fp->len;
if (head->ip_summed != fp->ip_summed) if (head->ip_summed != fp->ip_summed)
@ -524,7 +541,9 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
} }
sub_frag_mem_limit(qp->q.net, head->truesize); sub_frag_mem_limit(qp->q.net, head->truesize);
*nextp = NULL;
head->next = NULL; head->next = NULL;
head->prev = NULL;
head->dev = dev; head->dev = dev;
head->tstamp = qp->q.stamp; head->tstamp = qp->q.stamp;
IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size); IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
@ -552,6 +571,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
__IP_INC_STATS(net, IPSTATS_MIB_REASMOKS); __IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
qp->q.fragments = NULL; qp->q.fragments = NULL;
qp->q.rb_fragments = RB_ROOT;
qp->q.fragments_tail = NULL; qp->q.fragments_tail = NULL;
return 0; return 0;

View File

@ -463,6 +463,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic
head->csum); head->csum);
fq->q.fragments = NULL; fq->q.fragments = NULL;
fq->q.rb_fragments = RB_ROOT;
fq->q.fragments_tail = NULL; fq->q.fragments_tail = NULL;
return true; return true;

View File

@ -405,6 +405,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS); __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
rcu_read_unlock(); rcu_read_unlock();
fq->q.fragments = NULL; fq->q.fragments = NULL;
fq->q.rb_fragments = RB_ROOT;
fq->q.fragments_tail = NULL; fq->q.fragments_tail = NULL;
return 1; return 1;