Merge branch 'ip_frag_next'
Florian Westphal says:

====================
net: force refragmentation for DF reassembled skbs

The output path tests:

	if (skb->len > mtu)
		ip_fragment()

This breaks connectivity in one corner case: if the skb was reassembled,
but has the DF bit set and ..
.. its reassembled size is <= outdev mtu ..
.. we will forward a DF packet larger than what the sender transmitted
on wire.

If a router later in the path can't forward this packet, it will send an
icmp error in response to an mtu that the original sender never exceeded.

This changes the ipv4 defrag/output path to
a) force refragmentation for DF reassembled skbs and
b) set the DF bit on all fragments when refragmenting if it was set on
   the original frags.

tested via:

from scapy.all import *
dip="10.23.42.2"
payload="A"*1400
packet=IP(dst=dip,id=12345,flags='DF')/UDP(sport=42,dport=42)/payload
frags=fragment(packet,fragsize=1200)
for fragment in frags:
    send(fragment)

Without this patch, we generate fragments without the DF bit set, sized to
the outgoing device mtu, when fragmenting after forwarding. That is, the
fragments arriving on ingress,

IP (ttl 64, id 12345, offset 0, flags [+, DF], proto UDP (17), length 1204)
    192.168.7.1.42 > 10.23.42.2.42: UDP, length 1400
IP (ttl 64, id 12345, offset 1184, flags [DF], proto UDP (17), length 244)
    192.168.7.1 > 10.23.42.2: ip-proto-17

will either turn into

IP (ttl 63, id 12345, offset 0, flags [+], proto UDP (17), length 1396)
    192.168.7.1.42 > 10.23.42.2.42: UDP, length 1400
IP (ttl 63, id 12345, offset 1376, flags [none], proto UDP (17), length 52)

(mtu 1400: we strip DF and send a larger fragment), or

IP (ttl 63, id 12345, offset 0, flags [DF], proto UDP (17), length 1428)
    192.168.7.1.42 > 10.23.42.2.42: [udp sum ok] UDP, length 1400

if the mtu is 1500. And in this case things break; a router with a smaller
mtu will send an icmp error, but the original sender only sent packets
<= 1204 bytes.

With the patch, we keep the intent of such fragments and will emit
DF fragments that won't exceed 1204 bytes in size.

Joint work with Hannes Frederic Sowa.

Changes since v2:
- split unrelated patches from series
- rework changelog of patch #2 to better illustrate breakage
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
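For quick orientation before reading the diff: the series boils down to two checks on the reassembly state, a size cap and a DF-propagation rule. The following is a minimal, compilable userspace sketch of just that policy; it is not the kernel code, and the names reasm_state, refrag_mtu and propagate_df are illustrative only (in the tree the state lives in struct ipq, IPCB(skb)->frag_max_size and the new IPSKB_FRAG_PMTU flag).

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for the per-queue reassembly bookkeeping. */
struct reasm_state {
	unsigned int max_size;	  /* largest fragment seen, incl. IP header */
	unsigned int max_df_size; /* largest fragment seen with DF set */
};

/* Size cap used when the forwarded packet is refragmented: never emit a
 * fragment bigger than the largest one the original sender transmitted. */
static unsigned int refrag_mtu(const struct reasm_state *q, unsigned int dst_mtu)
{
	return (q->max_size && q->max_size < dst_mtu) ? q->max_size : dst_mtu;
}

/* DF is propagated to the new fragments only if the largest received
 * fragment carried DF, so one tiny DF fragment cannot shrink the rest. */
static bool propagate_df(const struct reasm_state *q)
{
	return q->max_df_size == q->max_size;
}

int main(void)
{
	/* Values from the changelog example: sender fragmented at 1204 bytes
	 * with DF set, outgoing device mtu is 1500. */
	struct reasm_state q = { .max_size = 1204, .max_df_size = 1204 };

	printf("refragment at %u bytes, DF=%d\n",
	       refrag_mtu(&q, 1500), propagate_df(&q));
	return 0;
}

With the changelog's numbers this prints "refragment at 1204 bytes, DF=1": the forwarded fragments keep DF and never exceed what the sender put on the wire, which is what the tcpdump traces above illustrate.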
commit 837b9955b1
@@ -43,7 +43,7 @@ enum {
  * @len: total length of the original datagram
  * @meat: length of received fragments so far
  * @flags: fragment queue flags
- * @max_size: (ipv4 only) maximum received fragment size with IP_DF set
+ * @max_size: maximum received fragment size
  * @net: namespace that this frag belongs to
  */
 struct inet_frag_queue {
@@ -45,6 +45,7 @@ struct inet_skb_parm {
 #define IPSKB_FRAG_COMPLETE	BIT(3)
 #define IPSKB_REROUTED		BIT(4)
 #define IPSKB_DOREDIRECT	BIT(5)
+#define IPSKB_FRAG_PMTU		BIT(6)
 
 	u16			frag_max_size;
 };
@@ -75,6 +75,7 @@ struct ipq {
 	__be16		id;
 	u8		protocol;
 	u8		ecn; /* RFC3168 support */
+	u16		max_df_size; /* largest frag with DF set seen */
 	int		iif;
 	unsigned int    rid;
 	struct inet_peer *peer;
@@ -326,6 +327,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 {
 	struct sk_buff *prev, *next;
 	struct net_device *dev;
+	unsigned int fragsize;
 	int flags, offset;
 	int ihl, end;
 	int err = -ENOENT;
@@ -481,9 +483,14 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 	if (offset == 0)
 		qp->q.flags |= INET_FRAG_FIRST_IN;
 
+	fragsize = skb->len + ihl;
+
+	if (fragsize > qp->q.max_size)
+		qp->q.max_size = fragsize;
+
 	if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
-	    skb->len + ihl > qp->q.max_size)
-		qp->q.max_size = skb->len + ihl;
+	    fragsize > qp->max_df_size)
+		qp->max_df_size = fragsize;
 
 	if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
 	    qp->q.meat == qp->q.len) {
@@ -613,13 +620,27 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 	head->next = NULL;
 	head->dev = dev;
 	head->tstamp = qp->q.stamp;
-	IPCB(head)->frag_max_size = qp->q.max_size;
+	IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
 
 	iph = ip_hdr(head);
-	/* max_size != 0 implies at least one fragment had IP_DF set */
-	iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0;
 	iph->tot_len = htons(len);
 	iph->tos |= ecn;
+
+	/* When we set IP_DF on a refragmented skb we must also force a
+	 * call to ip_fragment to avoid forwarding a DF-skb of size s while
+	 * original sender only sent fragments of size f (where f < s).
+	 *
+	 * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest
+	 * frag seen to avoid sending tiny DF-fragments in case skb was built
+	 * from one very small df-fragment and one large non-df frag.
+	 */
+	if (qp->max_df_size == qp->q.max_size) {
+		IPCB(head)->flags |= IPSKB_FRAG_PMTU;
+		iph->frag_off = htons(IP_DF);
+	} else {
+		iph->frag_off = 0;
+	}
+
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
 	qp->q.fragments = NULL;
 	qp->q.fragments_tail = NULL;
@@ -84,6 +84,7 @@ int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
 EXPORT_SYMBOL(sysctl_ip_default_ttl);
 
 static int ip_fragment(struct sock *sk, struct sk_buff *skb,
+		       unsigned int mtu,
 		       int (*output)(struct sock *, struct sk_buff *));
 
 /* Generate a checksum for an outgoing IP datagram. */
@@ -219,7 +220,8 @@ static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb)
 	return -EINVAL;
 }
 
-static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
+static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb,
+				unsigned int mtu)
 {
 	netdev_features_t features;
 	struct sk_buff *segs;
@@ -227,7 +229,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
 
 	/* common case: locally created skb or seglen is <= mtu */
 	if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
-	    skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb))
+	    skb_gso_network_seglen(skb) <= mtu)
 		return ip_finish_output2(sk, skb);
 
 	/* Slowpath - GSO segment length is exceeding the dst MTU.
@@ -251,7 +253,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
 		int err;
 
 		segs->next = NULL;
-		err = ip_fragment(sk, segs, ip_finish_output2);
+		err = ip_fragment(sk, segs, mtu, ip_finish_output2);
 
 		if (err && ret == 0)
 			ret = err;
@@ -263,6 +265,8 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
 
 static int ip_finish_output(struct sock *sk, struct sk_buff *skb)
 {
+	unsigned int mtu;
+
 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 	/* Policy lookup after SNAT yielded a new policy */
 	if (skb_dst(skb)->xfrm) {
@@ -270,11 +274,12 @@ static int ip_finish_output(struct sock *sk, struct sk_buff *skb)
 		return dst_output_sk(sk, skb);
 	}
 #endif
+	mtu = ip_skb_dst_mtu(skb);
 	if (skb_is_gso(skb))
-		return ip_finish_output_gso(sk, skb);
+		return ip_finish_output_gso(sk, skb, mtu);
 
-	if (skb->len > ip_skb_dst_mtu(skb))
-		return ip_fragment(sk, skb, ip_finish_output2);
+	if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU))
+		return ip_fragment(sk, skb, mtu, ip_finish_output2);
 
 	return ip_finish_output2(sk, skb);
 }
@@ -482,12 +487,15 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 }
 
 static int ip_fragment(struct sock *sk, struct sk_buff *skb,
+		       unsigned int mtu,
 		       int (*output)(struct sock *, struct sk_buff *))
 {
 	struct iphdr *iph = ip_hdr(skb);
-	unsigned int mtu = ip_skb_dst_mtu(skb);
 
-	if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
+	if ((iph->frag_off & htons(IP_DF)) == 0)
+		return ip_do_fragment(sk, skb, output);
+
+	if (unlikely(!skb->ignore_df ||
 		     (IPCB(skb)->frag_max_size &&
 		      IPCB(skb)->frag_max_size > mtu))) {
 		struct rtable *rt = skb_rtable(skb);
@@ -532,6 +540,8 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
 	iph = ip_hdr(skb);
 
 	mtu = ip_skb_dst_mtu(skb);
+	if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
+		mtu = IPCB(skb)->frag_max_size;
 
 	/*
 	 *	Setup starting values.
@@ -727,6 +737,9 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
 		iph = ip_hdr(skb2);
 		iph->frag_off = htons((offset >> 3));
 
+		if (IPCB(skb)->flags & IPSKB_FRAG_PMTU)
+			iph->frag_off |= htons(IP_DF);
+
 		/* ANK: dirty, but effective trick. Upgrade options only if
 		 * the segment to be fragmented was THE FIRST (otherwise,
 		 * options are already fixed) and make it ONCE