linux/net/ipv6/datagram.c

1047 lines
25 KiB
C
Raw Normal View History

/*
* common UDP/RAW code
* Linux INET6 implementation
*
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in6.h>
#include <linux/ipv6.h>
#include <linux/route.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
#include <linux/slab.h>
#include <linux/export.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/transp_v6.h>
#include <net/ip6_route.h>
#include <net/tcp_states.h>
#include <net/dsfield.h>
#include <linux/errqueue.h>
#include <linux/uaccess.h>
static bool ipv6_mapped_addr_any(const struct in6_addr *a)
{
return ipv6_addr_v4mapped(a) && (a->s6_addr32[3] == 0);
}
static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk)
{
struct inet_sock *inet = inet_sk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
memset(fl6, 0, sizeof(*fl6));
fl6->flowi6_proto = sk->sk_protocol;
fl6->daddr = sk->sk_v6_daddr;
fl6->saddr = np->saddr;
fl6->flowi6_oif = sk->sk_bound_dev_if;
fl6->flowi6_mark = sk->sk_mark;
fl6->fl6_dport = inet->inet_dport;
fl6->fl6_sport = inet->inet_sport;
fl6->flowlabel = np->flow_label;
fl6->flowi6_uid = sk->sk_uid;
if (!fl6->flowi6_oif)
fl6->flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
if (!fl6->flowi6_oif && ipv6_addr_is_multicast(&fl6->daddr))
fl6->flowi6_oif = np->mcast_oif;
security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
}
ipv6: datagram: Update dst cache of a connected datagram sk during pmtu update There is a case in connected UDP socket such that getsockopt(IPV6_MTU) will return a stale MTU value. The reproducible sequence could be the following: 1. Create a connected UDP socket 2. Send some datagrams out 3. Receive a ICMPV6_PKT_TOOBIG 4. No new outgoing datagrams to trigger the sk_dst_check() logic to update the sk->sk_dst_cache. 5. getsockopt(IPV6_MTU) returns the mtu from the invalid sk->sk_dst_cache instead of the newly created RTF_CACHE clone. This patch updates the sk->sk_dst_cache for a connected datagram sk during pmtu-update code path. Note that the sk->sk_v6_daddr is used to do the route lookup instead of skb->data (i.e. iph). It is because a UDP socket can become connected after sending out some datagrams in un-connected state. or It can be connected multiple times to different destinations. Hence, iph may not be related to where sk is currently connected to. It is done under '!sock_owned_by_user(sk)' condition because the user may make another ip6_datagram_connect() (i.e changing the sk->sk_v6_daddr) while dst lookup is happening in the pmtu-update code path. For the sock_owned_by_user(sk) == true case, the next patch will introduce a release_cb() which will update the sk->sk_dst_cache. Test: Server (Connected UDP Socket): ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Route Details: [root@arch-fb-vm1 ~]# ip -6 r show | egrep '2fac' 2fac::/64 dev eth0 proto kernel metric 256 pref medium 2fac:face::/64 via 2fac::face dev eth0 metric 1024 pref medium A simple python code to create a connected UDP socket: import socket import errno HOST = '2fac::1' PORT = 8080 s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM) s.bind((HOST, PORT)) s.connect(('2fac:face::face', 53)) print("connected") while True: try: data = s.recv(1024) except socket.error as se: if se.errno == errno.EMSGSIZE: pmtu = s.getsockopt(41, 24) print("PMTU:%d" % pmtu) break s.close() Python program output after getting a ICMPV6_PKT_TOOBIG: [root@arch-fb-vm1 ~]# python2 ~/devshare/kernel/tasks/fib6/udp-connect-53-8080.py connected PMTU:1300 Cache routes after recieving TOOBIG: [root@arch-fb-vm1 ~]# ip -6 r show table cache 2fac:face::face via 2fac::face dev eth0 metric 0 cache expires 463sec mtu 1300 pref medium Client (Send the ICMPV6_PKT_TOOBIG): ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ scapy is used to generate the TOOBIG message. Here is the scapy script I have used: >>> p=Ether(src='da:75:4d:36:ac:32', dst='52:54:00:12:34:66', type=0x86dd)/IPv6(src='2fac::face', dst='2fac::1')/ICMPv6PacketTooBig(mtu=1300)/IPv6(src='2fac:: 1',dst='2fac:face::face', nh='UDP')/UDP(sport=8080,dport=53) >>> sendp(p, iface='qemubr0') Fixes: 45e4fd26683c ("ipv6: Only create RTF_CACHE routes after encountering pmtu exception") Signed-off-by: Martin KaFai Lau <kafai@fb.com> Reported-by: Wei Wang <weiwan@google.com> Cc: Cong Wang <xiyou.wangcong@gmail.com> Cc: Eric Dumazet <edumazet@google.com> Cc: Wei Wang <weiwan@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-12 06:29:36 +08:00
int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr)
{
struct ip6_flowlabel *flowlabel = NULL;
struct in6_addr *final_p, final;
struct ipv6_txoptions *opt;
struct dst_entry *dst;
struct inet_sock *inet = inet_sk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
struct flowi6 fl6;
int err = 0;
if (np->sndflow && (np->flow_label & IPV6_FLOWLABEL_MASK)) {
flowlabel = fl6_sock_lookup(sk, np->flow_label);
if (!flowlabel)
return -EINVAL;
}
ip6_datagram_flow_key_init(&fl6, sk);
rcu_read_lock();
opt = flowlabel ? flowlabel->opt : rcu_dereference(np->opt);
final_p = fl6_update_dst(&fl6, opt, &final);
rcu_read_unlock();
dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
goto out;
}
ipv6: datagram: Update dst cache of a connected datagram sk during pmtu update There is a case in connected UDP socket such that getsockopt(IPV6_MTU) will return a stale MTU value. The reproducible sequence could be the following: 1. Create a connected UDP socket 2. Send some datagrams out 3. Receive a ICMPV6_PKT_TOOBIG 4. No new outgoing datagrams to trigger the sk_dst_check() logic to update the sk->sk_dst_cache. 5. getsockopt(IPV6_MTU) returns the mtu from the invalid sk->sk_dst_cache instead of the newly created RTF_CACHE clone. This patch updates the sk->sk_dst_cache for a connected datagram sk during pmtu-update code path. Note that the sk->sk_v6_daddr is used to do the route lookup instead of skb->data (i.e. iph). It is because a UDP socket can become connected after sending out some datagrams in un-connected state. or It can be connected multiple times to different destinations. Hence, iph may not be related to where sk is currently connected to. It is done under '!sock_owned_by_user(sk)' condition because the user may make another ip6_datagram_connect() (i.e changing the sk->sk_v6_daddr) while dst lookup is happening in the pmtu-update code path. For the sock_owned_by_user(sk) == true case, the next patch will introduce a release_cb() which will update the sk->sk_dst_cache. Test: Server (Connected UDP Socket): ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Route Details: [root@arch-fb-vm1 ~]# ip -6 r show | egrep '2fac' 2fac::/64 dev eth0 proto kernel metric 256 pref medium 2fac:face::/64 via 2fac::face dev eth0 metric 1024 pref medium A simple python code to create a connected UDP socket: import socket import errno HOST = '2fac::1' PORT = 8080 s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM) s.bind((HOST, PORT)) s.connect(('2fac:face::face', 53)) print("connected") while True: try: data = s.recv(1024) except socket.error as se: if se.errno == errno.EMSGSIZE: pmtu = s.getsockopt(41, 24) print("PMTU:%d" % pmtu) break s.close() Python program output after getting a ICMPV6_PKT_TOOBIG: [root@arch-fb-vm1 ~]# python2 ~/devshare/kernel/tasks/fib6/udp-connect-53-8080.py connected PMTU:1300 Cache routes after recieving TOOBIG: [root@arch-fb-vm1 ~]# ip -6 r show table cache 2fac:face::face via 2fac::face dev eth0 metric 0 cache expires 463sec mtu 1300 pref medium Client (Send the ICMPV6_PKT_TOOBIG): ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ scapy is used to generate the TOOBIG message. Here is the scapy script I have used: >>> p=Ether(src='da:75:4d:36:ac:32', dst='52:54:00:12:34:66', type=0x86dd)/IPv6(src='2fac::face', dst='2fac::1')/ICMPv6PacketTooBig(mtu=1300)/IPv6(src='2fac:: 1',dst='2fac:face::face', nh='UDP')/UDP(sport=8080,dport=53) >>> sendp(p, iface='qemubr0') Fixes: 45e4fd26683c ("ipv6: Only create RTF_CACHE routes after encountering pmtu exception") Signed-off-by: Martin KaFai Lau <kafai@fb.com> Reported-by: Wei Wang <weiwan@google.com> Cc: Cong Wang <xiyou.wangcong@gmail.com> Cc: Eric Dumazet <edumazet@google.com> Cc: Wei Wang <weiwan@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-12 06:29:36 +08:00
if (fix_sk_saddr) {
if (ipv6_addr_any(&np->saddr))
np->saddr = fl6.saddr;
ipv6: datagram: Update dst cache of a connected datagram sk during pmtu update There is a case in connected UDP socket such that getsockopt(IPV6_MTU) will return a stale MTU value. The reproducible sequence could be the following: 1. Create a connected UDP socket 2. Send some datagrams out 3. Receive a ICMPV6_PKT_TOOBIG 4. No new outgoing datagrams to trigger the sk_dst_check() logic to update the sk->sk_dst_cache. 5. getsockopt(IPV6_MTU) returns the mtu from the invalid sk->sk_dst_cache instead of the newly created RTF_CACHE clone. This patch updates the sk->sk_dst_cache for a connected datagram sk during pmtu-update code path. Note that the sk->sk_v6_daddr is used to do the route lookup instead of skb->data (i.e. iph). It is because a UDP socket can become connected after sending out some datagrams in un-connected state. or It can be connected multiple times to different destinations. Hence, iph may not be related to where sk is currently connected to. It is done under '!sock_owned_by_user(sk)' condition because the user may make another ip6_datagram_connect() (i.e changing the sk->sk_v6_daddr) while dst lookup is happening in the pmtu-update code path. For the sock_owned_by_user(sk) == true case, the next patch will introduce a release_cb() which will update the sk->sk_dst_cache. Test: Server (Connected UDP Socket): ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Route Details: [root@arch-fb-vm1 ~]# ip -6 r show | egrep '2fac' 2fac::/64 dev eth0 proto kernel metric 256 pref medium 2fac:face::/64 via 2fac::face dev eth0 metric 1024 pref medium A simple python code to create a connected UDP socket: import socket import errno HOST = '2fac::1' PORT = 8080 s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM) s.bind((HOST, PORT)) s.connect(('2fac:face::face', 53)) print("connected") while True: try: data = s.recv(1024) except socket.error as se: if se.errno == errno.EMSGSIZE: pmtu = s.getsockopt(41, 24) print("PMTU:%d" % pmtu) break s.close() Python program output after getting a ICMPV6_PKT_TOOBIG: [root@arch-fb-vm1 ~]# python2 ~/devshare/kernel/tasks/fib6/udp-connect-53-8080.py connected PMTU:1300 Cache routes after recieving TOOBIG: [root@arch-fb-vm1 ~]# ip -6 r show table cache 2fac:face::face via 2fac::face dev eth0 metric 0 cache expires 463sec mtu 1300 pref medium Client (Send the ICMPV6_PKT_TOOBIG): ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ scapy is used to generate the TOOBIG message. Here is the scapy script I have used: >>> p=Ether(src='da:75:4d:36:ac:32', dst='52:54:00:12:34:66', type=0x86dd)/IPv6(src='2fac::face', dst='2fac::1')/ICMPv6PacketTooBig(mtu=1300)/IPv6(src='2fac:: 1',dst='2fac:face::face', nh='UDP')/UDP(sport=8080,dport=53) >>> sendp(p, iface='qemubr0') Fixes: 45e4fd26683c ("ipv6: Only create RTF_CACHE routes after encountering pmtu exception") Signed-off-by: Martin KaFai Lau <kafai@fb.com> Reported-by: Wei Wang <weiwan@google.com> Cc: Cong Wang <xiyou.wangcong@gmail.com> Cc: Eric Dumazet <edumazet@google.com> Cc: Wei Wang <weiwan@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-12 06:29:36 +08:00
if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
sk->sk_v6_rcv_saddr = fl6.saddr;
inet->inet_rcv_saddr = LOOPBACK4_IPV6;
if (sk->sk_prot->rehash)
sk->sk_prot->rehash(sk);
}
}
ip6_sk_dst_store_flow(sk, dst, &fl6);
out:
fl6_sock_release(flowlabel);
return err;
}
void ip6_datagram_release_cb(struct sock *sk)
{
struct dst_entry *dst;
if (ipv6_addr_v4mapped(&sk->sk_v6_daddr))
return;
rcu_read_lock();
dst = __sk_dst_get(sk);
if (!dst || !dst->obsolete ||
dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) {
rcu_read_unlock();
return;
}
rcu_read_unlock();
ip6_datagram_dst_update(sk, false);
}
EXPORT_SYMBOL_GPL(ip6_datagram_release_cb);
int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr,
int addr_len)
{
struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
struct inet_sock *inet = inet_sk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
struct in6_addr *daddr, old_daddr;
__be32 fl6_flowlabel = 0;
__be32 old_fl6_flowlabel;
__be16 old_dport;
int addr_type;
int err;
if (usin->sin6_family == AF_INET) {
if (__ipv6_only_sock(sk))
return -EAFNOSUPPORT;
err = __ip4_datagram_connect(sk, uaddr, addr_len);
goto ipv4_connected;
}
if (addr_len < SIN6_LEN_RFC2133)
return -EINVAL;
if (usin->sin6_family != AF_INET6)
return -EAFNOSUPPORT;
if (np->sndflow)
fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
if (ipv6_addr_any(&usin->sin6_addr)) {
/*
* connect to self
*/
if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK),
&usin->sin6_addr);
else
usin->sin6_addr = in6addr_loopback;
}
addr_type = ipv6_addr_type(&usin->sin6_addr);
daddr = &usin->sin6_addr;
if (addr_type & IPV6_ADDR_MAPPED) {
struct sockaddr_in sin;
if (__ipv6_only_sock(sk)) {
err = -ENETUNREACH;
goto out;
}
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = daddr->s6_addr32[3];
sin.sin_port = usin->sin6_port;
err = __ip4_datagram_connect(sk,
(struct sockaddr *) &sin,
sizeof(sin));
ipv4_connected:
if (err)
goto out;
ipv6_addr_set_v4mapped(inet->inet_daddr, &sk->sk_v6_daddr);
if (ipv6_addr_any(&np->saddr) ||
ipv6_mapped_addr_any(&np->saddr))
ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr);
if (ipv6_addr_any(&sk->sk_v6_rcv_saddr) ||
ipv6_mapped_addr_any(&sk->sk_v6_rcv_saddr)) {
ipv6_addr_set_v4mapped(inet->inet_rcv_saddr,
&sk->sk_v6_rcv_saddr);
if (sk->sk_prot->rehash)
sk->sk_prot->rehash(sk);
}
goto out;
}
if (__ipv6_addr_needs_scope_id(addr_type)) {
if (addr_len >= sizeof(struct sockaddr_in6) &&
usin->sin6_scope_id) {
if (!sk_dev_equal_l3scope(sk, usin->sin6_scope_id)) {
err = -EINVAL;
goto out;
}
sk->sk_bound_dev_if = usin->sin6_scope_id;
}
if (!sk->sk_bound_dev_if && (addr_type & IPV6_ADDR_MULTICAST))
sk->sk_bound_dev_if = np->mcast_oif;
/* Connect to link-local address requires an interface */
if (!sk->sk_bound_dev_if) {
err = -EINVAL;
goto out;
}
}
/* save the current peer information before updating it */
old_daddr = sk->sk_v6_daddr;
old_fl6_flowlabel = np->flow_label;
old_dport = inet->inet_dport;
sk->sk_v6_daddr = *daddr;
np->flow_label = fl6_flowlabel;
inet->inet_dport = usin->sin6_port;
/*
* Check for a route to destination an obtain the
* destination cache for it.
*/
ipv6: datagram: Update dst cache of a connected datagram sk during pmtu update There is a case in connected UDP socket such that getsockopt(IPV6_MTU) will return a stale MTU value. The reproducible sequence could be the following: 1. Create a connected UDP socket 2. Send some datagrams out 3. Receive a ICMPV6_PKT_TOOBIG 4. No new outgoing datagrams to trigger the sk_dst_check() logic to update the sk->sk_dst_cache. 5. getsockopt(IPV6_MTU) returns the mtu from the invalid sk->sk_dst_cache instead of the newly created RTF_CACHE clone. This patch updates the sk->sk_dst_cache for a connected datagram sk during pmtu-update code path. Note that the sk->sk_v6_daddr is used to do the route lookup instead of skb->data (i.e. iph). It is because a UDP socket can become connected after sending out some datagrams in un-connected state. or It can be connected multiple times to different destinations. Hence, iph may not be related to where sk is currently connected to. It is done under '!sock_owned_by_user(sk)' condition because the user may make another ip6_datagram_connect() (i.e changing the sk->sk_v6_daddr) while dst lookup is happening in the pmtu-update code path. For the sock_owned_by_user(sk) == true case, the next patch will introduce a release_cb() which will update the sk->sk_dst_cache. Test: Server (Connected UDP Socket): ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Route Details: [root@arch-fb-vm1 ~]# ip -6 r show | egrep '2fac' 2fac::/64 dev eth0 proto kernel metric 256 pref medium 2fac:face::/64 via 2fac::face dev eth0 metric 1024 pref medium A simple python code to create a connected UDP socket: import socket import errno HOST = '2fac::1' PORT = 8080 s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM) s.bind((HOST, PORT)) s.connect(('2fac:face::face', 53)) print("connected") while True: try: data = s.recv(1024) except socket.error as se: if se.errno == errno.EMSGSIZE: pmtu = s.getsockopt(41, 24) print("PMTU:%d" % pmtu) break s.close() Python program output after getting a ICMPV6_PKT_TOOBIG: [root@arch-fb-vm1 ~]# python2 ~/devshare/kernel/tasks/fib6/udp-connect-53-8080.py connected PMTU:1300 Cache routes after recieving TOOBIG: [root@arch-fb-vm1 ~]# ip -6 r show table cache 2fac:face::face via 2fac::face dev eth0 metric 0 cache expires 463sec mtu 1300 pref medium Client (Send the ICMPV6_PKT_TOOBIG): ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ scapy is used to generate the TOOBIG message. Here is the scapy script I have used: >>> p=Ether(src='da:75:4d:36:ac:32', dst='52:54:00:12:34:66', type=0x86dd)/IPv6(src='2fac::face', dst='2fac::1')/ICMPv6PacketTooBig(mtu=1300)/IPv6(src='2fac:: 1',dst='2fac:face::face', nh='UDP')/UDP(sport=8080,dport=53) >>> sendp(p, iface='qemubr0') Fixes: 45e4fd26683c ("ipv6: Only create RTF_CACHE routes after encountering pmtu exception") Signed-off-by: Martin KaFai Lau <kafai@fb.com> Reported-by: Wei Wang <weiwan@google.com> Cc: Cong Wang <xiyou.wangcong@gmail.com> Cc: Eric Dumazet <edumazet@google.com> Cc: Wei Wang <weiwan@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-12 06:29:36 +08:00
err = ip6_datagram_dst_update(sk, true);
if (err) {
/* Restore the socket peer info, to keep it consistent with
* the old socket state
*/
sk->sk_v6_daddr = old_daddr;
np->flow_label = old_fl6_flowlabel;
inet->inet_dport = old_dport;
goto out;
}
sk->sk_state = TCP_ESTABLISHED;
sk_set_txhash(sk);
out:
return err;
}
EXPORT_SYMBOL_GPL(__ip6_datagram_connect);
int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
int res;
lock_sock(sk);
res = __ip6_datagram_connect(sk, uaddr, addr_len);
release_sock(sk);
return res;
}
EXPORT_SYMBOL_GPL(ip6_datagram_connect);
int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *uaddr,
int addr_len)
{
DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, uaddr);
if (sin6->sin6_family != AF_INET6)
return -EAFNOSUPPORT;
return ip6_datagram_connect(sk, uaddr, addr_len);
}
EXPORT_SYMBOL_GPL(ip6_datagram_connect_v6_only);
void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
__be16 port, u32 info, u8 *payload)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct icmp6hdr *icmph = icmp6_hdr(skb);
struct sock_exterr_skb *serr;
if (!np->recverr)
return;
skb = skb_clone(skb, GFP_ATOMIC);
if (!skb)
return;
IPv6: fix IPV6_RECVERR handling of locally-generated errors I noticed when I added support for IPV6_DONTFRAG that if you set IPV6_RECVERR and tried to send a UDP packet larger than 64K to an IPv6 destination, you'd correctly get an EMSGSIZE, but reading from MSG_ERRQUEUE returned the incorrect address in the cmsg: struct msghdr: msg_name 0x7fff8f3c96d0 msg_namelen 28 struct sockaddr_in6: sin6_family 10 sin6_port 7639 sin6_flowinfo 0 sin6_addr ::ffff:38.32.0.0 sin6_scope_id 0 ((null)) It should have returned this in my case: struct msghdr: msg_name 0x7fffd866b510 msg_namelen 28 struct sockaddr_in6: sin6_family 10 sin6_port 7639 sin6_flowinfo 0 sin6_addr 2620:0:a09:e000:21f:29ff:fe57:f88b sin6_scope_id 0 ((null)) The problem is that ipv6_recv_error() assumes that if the error wasn't generated by ICMPv6, it's an IPv4 address sitting there, and proceeds to create a v4-mapped address from it. Change ipv6_icmp_error() and ipv6_local_error() to set skb->protocol to htons(ETH_P_IPV6) so that ipv6_recv_error() knows the address sitting right after the extended error is IPv6, else it will incorrectly map the first octet into an IPv4-mapped IPv6 address in the cmsg structure returned in a recvmsg() call to obtain the error. Signed-off-by: Brian Haley <brian.haley@hp.com> -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Signed-off-by: David S. Miller <davem@davemloft.net>
2010-05-03 23:44:27 +08:00
skb->protocol = htons(ETH_P_IPV6);
serr = SKB_EXT_ERR(skb);
serr->ee.ee_errno = err;
serr->ee.ee_origin = SO_EE_ORIGIN_ICMP6;
serr->ee.ee_type = icmph->icmp6_type;
serr->ee.ee_code = icmph->icmp6_code;
serr->ee.ee_pad = 0;
serr->ee.ee_info = info;
serr->ee.ee_data = 0;
serr->addr_offset = (u8 *)&(((struct ipv6hdr *)(icmph + 1))->daddr) -
skb_network_header(skb);
serr->port = port;
__skb_pull(skb, payload - skb->data);
skb_reset_transport_header(skb);
if (sock_queue_err_skb(sk, skb))
kfree_skb(skb);
}
void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info)
{
const struct ipv6_pinfo *np = inet6_sk(sk);
struct sock_exterr_skb *serr;
struct ipv6hdr *iph;
struct sk_buff *skb;
if (!np->recverr)
return;
skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC);
if (!skb)
return;
IPv6: fix IPV6_RECVERR handling of locally-generated errors I noticed when I added support for IPV6_DONTFRAG that if you set IPV6_RECVERR and tried to send a UDP packet larger than 64K to an IPv6 destination, you'd correctly get an EMSGSIZE, but reading from MSG_ERRQUEUE returned the incorrect address in the cmsg: struct msghdr: msg_name 0x7fff8f3c96d0 msg_namelen 28 struct sockaddr_in6: sin6_family 10 sin6_port 7639 sin6_flowinfo 0 sin6_addr ::ffff:38.32.0.0 sin6_scope_id 0 ((null)) It should have returned this in my case: struct msghdr: msg_name 0x7fffd866b510 msg_namelen 28 struct sockaddr_in6: sin6_family 10 sin6_port 7639 sin6_flowinfo 0 sin6_addr 2620:0:a09:e000:21f:29ff:fe57:f88b sin6_scope_id 0 ((null)) The problem is that ipv6_recv_error() assumes that if the error wasn't generated by ICMPv6, it's an IPv4 address sitting there, and proceeds to create a v4-mapped address from it. Change ipv6_icmp_error() and ipv6_local_error() to set skb->protocol to htons(ETH_P_IPV6) so that ipv6_recv_error() knows the address sitting right after the extended error is IPv6, else it will incorrectly map the first octet into an IPv4-mapped IPv6 address in the cmsg structure returned in a recvmsg() call to obtain the error. Signed-off-by: Brian Haley <brian.haley@hp.com> -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Signed-off-by: David S. Miller <davem@davemloft.net>
2010-05-03 23:44:27 +08:00
skb->protocol = htons(ETH_P_IPV6);
skb_put(skb, sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
iph = ipv6_hdr(skb);
iph->daddr = fl6->daddr;
serr = SKB_EXT_ERR(skb);
serr->ee.ee_errno = err;
serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
serr->ee.ee_type = 0;
serr->ee.ee_code = 0;
serr->ee.ee_pad = 0;
serr->ee.ee_info = info;
serr->ee.ee_data = 0;
serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb);
serr->port = fl6->fl6_dport;
__skb_pull(skb, skb_tail_pointer(skb) - skb->data);
skb_reset_transport_header(skb);
if (sock_queue_err_skb(sk, skb))
kfree_skb(skb);
}
void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct ipv6hdr *iph;
struct sk_buff *skb;
struct ip6_mtuinfo *mtu_info;
if (!np->rxopt.bits.rxpmtu)
return;
skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC);
if (!skb)
return;
skb_put(skb, sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
iph = ipv6_hdr(skb);
iph->daddr = fl6->daddr;
mtu_info = IP6CBMTU(skb);
mtu_info->ip6m_mtu = mtu;
mtu_info->ip6m_addr.sin6_family = AF_INET6;
mtu_info->ip6m_addr.sin6_port = 0;
mtu_info->ip6m_addr.sin6_flowinfo = 0;
mtu_info->ip6m_addr.sin6_scope_id = fl6->flowi6_oif;
mtu_info->ip6m_addr.sin6_addr = ipv6_hdr(skb)->daddr;
__skb_pull(skb, skb_tail_pointer(skb) - skb->data);
skb_reset_transport_header(skb);
skb = xchg(&np->rxpmtu, skb);
kfree_skb(skb);
}
/* For some errors we have valid addr_offset even with zero payload and
* zero port. Also, addr_offset should be supported if port is set.
*/
static inline bool ipv6_datagram_support_addr(struct sock_exterr_skb *serr)
{
return serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6 ||
serr->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL || serr->port;
}
ip: fix error queue empty skb handling When reading from the error queue, msg_name and msg_control are only populated for some errors. A new exception for empty timestamp skbs added a false positive on icmp errors without payload. `traceroute -M udpconn` only displayed gateways that return payload with the icmp error: the embedded network headers are pulled before sock_queue_err_skb, leaving an skb with skb->len == 0 otherwise. Fix this regression by refining when msg_name and msg_control branches are taken. The solutions for the two fields are independent. msg_name only makes sense for errors that configure serr->port and serr->addr_offset. Test the first instead of skb->len. This also fixes another issue. saddr could hold the wrong data, as serr->addr_offset is not initialized in some code paths, pointing to the start of the network header. It is only valid when serr->port is set (non-zero). msg_control support differs between IPv4 and IPv6. IPv4 only honors requests for ICMP and timestamps with SOF_TIMESTAMPING_OPT_CMSG. The skb->len test can simply be removed, because skb->dev is also tested and never true for empty skbs. IPv6 honors requests for all errors aside from local errors and timestamps on empty skbs. In both cases, make the policy more explicit by moving this logic to a new function that decides whether to process msg_control and that optionally prepares the necessary fields in skb->cb[]. After this change, the IPv4 and IPv6 paths are more similar. The last case is rxrpc. Here, simply refine to only match timestamps. Fixes: 49ca0d8bfaf3 ("net-timestamp: no-payload option") Reported-by: Jan Niehusmann <jan@gondor.com> Signed-off-by: Willem de Bruijn <willemb@google.com> ---- Changes v1->v2 - fix local origin test inversion in ip6_datagram_support_cmsg - make v4 and v6 code paths more similar by introducing analogous ipv4_datagram_support_cmsg - fix compile bug in rxrpc Signed-off-by: David S. Miller <davem@davemloft.net>
2015-03-08 09:33:22 +08:00
/* IPv6 supports cmsg on all origins aside from SO_EE_ORIGIN_LOCAL.
*
* At one point, excluding local errors was a quick test to identify icmp/icmp6
* errors. This is no longer true, but the test remained, so the v6 stack,
* unlike v4, also honors cmsg requests on all wifi and timestamp errors.
*/
static bool ip6_datagram_support_cmsg(struct sk_buff *skb,
struct sock_exterr_skb *serr)
net-timestamp: allow reading recv cmsg on errqueue with origin tstamp Allow reading of timestamps and cmsg at the same time on all relevant socket families. One use is to correlate timestamps with egress device, by asking for cmsg IP_PKTINFO. on AF_INET sockets, call the relevant function (ip_cmsg_recv). To avoid changing legacy expectations, only do so if the caller sets a new timestamping flag SOF_TIMESTAMPING_OPT_CMSG. on AF_INET6 sockets, IPV6_PKTINFO and all other recv cmsg are already returned for all origins. only change is to set ifindex, which is not initialized for all error origins. In both cases, only generate the pktinfo message if an ifindex is known. This is not the case for ACK timestamps. The difference between the protocol families is probably a historical accident as a result of the different conditions for generating cmsg in the relevant ip(v6)_recv_error function: ipv4: if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) { ipv6: if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL) { At one time, this was the same test bar for the ICMP/ICMP6 distinction. This is no longer true. Signed-off-by: Willem de Bruijn <willemb@google.com> ---- Changes v1 -> v2 large rewrite - integrate with existing pktinfo cmsg generation code - on ipv4: only send with new flag, to maintain legacy behavior - on ipv6: send at most a single pktinfo cmsg - on ipv6: initialize fields if not yet initialized The recv cmsg interfaces are also relevant to the discussion of whether looping packet headers is problematic. For v6, cmsgs that identify many headers are already returned. This patch expands that to v4. If it sounds reasonable, I will follow with patches 1. request timestamps without payload with SOF_TIMESTAMPING_OPT_TSONLY (http://patchwork.ozlabs.org/patch/366967/) 2. sysctl to conditionally drop all timestamps that have payload or cmsg from users without CAP_NET_RAW. Signed-off-by: David S. Miller <davem@davemloft.net>
2014-12-01 11:22:34 +08:00
{
ip: fix error queue empty skb handling When reading from the error queue, msg_name and msg_control are only populated for some errors. A new exception for empty timestamp skbs added a false positive on icmp errors without payload. `traceroute -M udpconn` only displayed gateways that return payload with the icmp error: the embedded network headers are pulled before sock_queue_err_skb, leaving an skb with skb->len == 0 otherwise. Fix this regression by refining when msg_name and msg_control branches are taken. The solutions for the two fields are independent. msg_name only makes sense for errors that configure serr->port and serr->addr_offset. Test the first instead of skb->len. This also fixes another issue. saddr could hold the wrong data, as serr->addr_offset is not initialized in some code paths, pointing to the start of the network header. It is only valid when serr->port is set (non-zero). msg_control support differs between IPv4 and IPv6. IPv4 only honors requests for ICMP and timestamps with SOF_TIMESTAMPING_OPT_CMSG. The skb->len test can simply be removed, because skb->dev is also tested and never true for empty skbs. IPv6 honors requests for all errors aside from local errors and timestamps on empty skbs. In both cases, make the policy more explicit by moving this logic to a new function that decides whether to process msg_control and that optionally prepares the necessary fields in skb->cb[]. After this change, the IPv4 and IPv6 paths are more similar. The last case is rxrpc. Here, simply refine to only match timestamps. Fixes: 49ca0d8bfaf3 ("net-timestamp: no-payload option") Reported-by: Jan Niehusmann <jan@gondor.com> Signed-off-by: Willem de Bruijn <willemb@google.com> ---- Changes v1->v2 - fix local origin test inversion in ip6_datagram_support_cmsg - make v4 and v6 code paths more similar by introducing analogous ipv4_datagram_support_cmsg - fix compile bug in rxrpc Signed-off-by: David S. Miller <davem@davemloft.net>
2015-03-08 09:33:22 +08:00
if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6)
return true;
if (serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL)
return false;
if (!IP6CB(skb)->iif)
ip: fix error queue empty skb handling When reading from the error queue, msg_name and msg_control are only populated for some errors. A new exception for empty timestamp skbs added a false positive on icmp errors without payload. `traceroute -M udpconn` only displayed gateways that return payload with the icmp error: the embedded network headers are pulled before sock_queue_err_skb, leaving an skb with skb->len == 0 otherwise. Fix this regression by refining when msg_name and msg_control branches are taken. The solutions for the two fields are independent. msg_name only makes sense for errors that configure serr->port and serr->addr_offset. Test the first instead of skb->len. This also fixes another issue. saddr could hold the wrong data, as serr->addr_offset is not initialized in some code paths, pointing to the start of the network header. It is only valid when serr->port is set (non-zero). msg_control support differs between IPv4 and IPv6. IPv4 only honors requests for ICMP and timestamps with SOF_TIMESTAMPING_OPT_CMSG. The skb->len test can simply be removed, because skb->dev is also tested and never true for empty skbs. IPv6 honors requests for all errors aside from local errors and timestamps on empty skbs. In both cases, make the policy more explicit by moving this logic to a new function that decides whether to process msg_control and that optionally prepares the necessary fields in skb->cb[]. After this change, the IPv4 and IPv6 paths are more similar. The last case is rxrpc. Here, simply refine to only match timestamps. Fixes: 49ca0d8bfaf3 ("net-timestamp: no-payload option") Reported-by: Jan Niehusmann <jan@gondor.com> Signed-off-by: Willem de Bruijn <willemb@google.com> ---- Changes v1->v2 - fix local origin test inversion in ip6_datagram_support_cmsg - make v4 and v6 code paths more similar by introducing analogous ipv4_datagram_support_cmsg - fix compile bug in rxrpc Signed-off-by: David S. Miller <davem@davemloft.net>
2015-03-08 09:33:22 +08:00
return false;
net-timestamp: allow reading recv cmsg on errqueue with origin tstamp Allow reading of timestamps and cmsg at the same time on all relevant socket families. One use is to correlate timestamps with egress device, by asking for cmsg IP_PKTINFO. on AF_INET sockets, call the relevant function (ip_cmsg_recv). To avoid changing legacy expectations, only do so if the caller sets a new timestamping flag SOF_TIMESTAMPING_OPT_CMSG. on AF_INET6 sockets, IPV6_PKTINFO and all other recv cmsg are already returned for all origins. only change is to set ifindex, which is not initialized for all error origins. In both cases, only generate the pktinfo message if an ifindex is known. This is not the case for ACK timestamps. The difference between the protocol families is probably a historical accident as a result of the different conditions for generating cmsg in the relevant ip(v6)_recv_error function: ipv4: if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) { ipv6: if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL) { At one time, this was the same test bar for the ICMP/ICMP6 distinction. This is no longer true. Signed-off-by: Willem de Bruijn <willemb@google.com> ---- Changes v1 -> v2 large rewrite - integrate with existing pktinfo cmsg generation code - on ipv4: only send with new flag, to maintain legacy behavior - on ipv6: send at most a single pktinfo cmsg - on ipv6: initialize fields if not yet initialized The recv cmsg interfaces are also relevant to the discussion of whether looping packet headers is problematic. For v6, cmsgs that identify many headers are already returned. This patch expands that to v4. If it sounds reasonable, I will follow with patches 1. request timestamps without payload with SOF_TIMESTAMPING_OPT_TSONLY (http://patchwork.ozlabs.org/patch/366967/) 2. sysctl to conditionally drop all timestamps that have payload or cmsg from users without CAP_NET_RAW. Signed-off-by: David S. Miller <davem@davemloft.net>
2014-12-01 11:22:34 +08:00
ip: fix error queue empty skb handling When reading from the error queue, msg_name and msg_control are only populated for some errors. A new exception for empty timestamp skbs added a false positive on icmp errors without payload. `traceroute -M udpconn` only displayed gateways that return payload with the icmp error: the embedded network headers are pulled before sock_queue_err_skb, leaving an skb with skb->len == 0 otherwise. Fix this regression by refining when msg_name and msg_control branches are taken. The solutions for the two fields are independent. msg_name only makes sense for errors that configure serr->port and serr->addr_offset. Test the first instead of skb->len. This also fixes another issue. saddr could hold the wrong data, as serr->addr_offset is not initialized in some code paths, pointing to the start of the network header. It is only valid when serr->port is set (non-zero). msg_control support differs between IPv4 and IPv6. IPv4 only honors requests for ICMP and timestamps with SOF_TIMESTAMPING_OPT_CMSG. The skb->len test can simply be removed, because skb->dev is also tested and never true for empty skbs. IPv6 honors requests for all errors aside from local errors and timestamps on empty skbs. In both cases, make the policy more explicit by moving this logic to a new function that decides whether to process msg_control and that optionally prepares the necessary fields in skb->cb[]. After this change, the IPv4 and IPv6 paths are more similar. The last case is rxrpc. Here, simply refine to only match timestamps. Fixes: 49ca0d8bfaf3 ("net-timestamp: no-payload option") Reported-by: Jan Niehusmann <jan@gondor.com> Signed-off-by: Willem de Bruijn <willemb@google.com> ---- Changes v1->v2 - fix local origin test inversion in ip6_datagram_support_cmsg - make v4 and v6 code paths more similar by introducing analogous ipv4_datagram_support_cmsg - fix compile bug in rxrpc Signed-off-by: David S. Miller <davem@davemloft.net>
2015-03-08 09:33:22 +08:00
return true;
net-timestamp: allow reading recv cmsg on errqueue with origin tstamp Allow reading of timestamps and cmsg at the same time on all relevant socket families. One use is to correlate timestamps with egress device, by asking for cmsg IP_PKTINFO. on AF_INET sockets, call the relevant function (ip_cmsg_recv). To avoid changing legacy expectations, only do so if the caller sets a new timestamping flag SOF_TIMESTAMPING_OPT_CMSG. on AF_INET6 sockets, IPV6_PKTINFO and all other recv cmsg are already returned for all origins. only change is to set ifindex, which is not initialized for all error origins. In both cases, only generate the pktinfo message if an ifindex is known. This is not the case for ACK timestamps. The difference between the protocol families is probably a historical accident as a result of the different conditions for generating cmsg in the relevant ip(v6)_recv_error function: ipv4: if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) { ipv6: if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL) { At one time, this was the same test bar for the ICMP/ICMP6 distinction. This is no longer true. Signed-off-by: Willem de Bruijn <willemb@google.com> ---- Changes v1 -> v2 large rewrite - integrate with existing pktinfo cmsg generation code - on ipv4: only send with new flag, to maintain legacy behavior - on ipv6: send at most a single pktinfo cmsg - on ipv6: initialize fields if not yet initialized The recv cmsg interfaces are also relevant to the discussion of whether looping packet headers is problematic. For v6, cmsgs that identify many headers are already returned. This patch expands that to v4. If it sounds reasonable, I will follow with patches 1. request timestamps without payload with SOF_TIMESTAMPING_OPT_TSONLY (http://patchwork.ozlabs.org/patch/366967/) 2. sysctl to conditionally drop all timestamps that have payload or cmsg from users without CAP_NET_RAW. Signed-off-by: David S. Miller <davem@davemloft.net>
2014-12-01 11:22:34 +08:00
}
/*
* Handle MSG_ERRQUEUE
*/
int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct sock_exterr_skb *serr;
struct sk_buff *skb;
DECLARE_SOCKADDR(struct sockaddr_in6 *, sin, msg->msg_name);
struct {
struct sock_extended_err ee;
struct sockaddr_in6 offender;
} errhdr;
int err;
int copied;
err = -EAGAIN;
skb = sock_dequeue_err_skb(sk);
if (!skb)
goto out;
copied = skb->len;
if (copied > len) {
msg->msg_flags |= MSG_TRUNC;
copied = len;
}
err = skb_copy_datagram_msg(skb, 0, msg, copied);
if (unlikely(err)) {
kfree_skb(skb);
return err;
}
sock_recv_timestamp(msg, sk, skb);
serr = SKB_EXT_ERR(skb);
if (sin && ipv6_datagram_support_addr(serr)) {
const unsigned char *nh = skb_network_header(skb);
sin->sin6_family = AF_INET6;
sin->sin6_flowinfo = 0;
sin->sin6_port = serr->port;
IPv6: fix IPV6_RECVERR handling of locally-generated errors I noticed when I added support for IPV6_DONTFRAG that if you set IPV6_RECVERR and tried to send a UDP packet larger than 64K to an IPv6 destination, you'd correctly get an EMSGSIZE, but reading from MSG_ERRQUEUE returned the incorrect address in the cmsg: struct msghdr: msg_name 0x7fff8f3c96d0 msg_namelen 28 struct sockaddr_in6: sin6_family 10 sin6_port 7639 sin6_flowinfo 0 sin6_addr ::ffff:38.32.0.0 sin6_scope_id 0 ((null)) It should have returned this in my case: struct msghdr: msg_name 0x7fffd866b510 msg_namelen 28 struct sockaddr_in6: sin6_family 10 sin6_port 7639 sin6_flowinfo 0 sin6_addr 2620:0:a09:e000:21f:29ff:fe57:f88b sin6_scope_id 0 ((null)) The problem is that ipv6_recv_error() assumes that if the error wasn't generated by ICMPv6, it's an IPv4 address sitting there, and proceeds to create a v4-mapped address from it. Change ipv6_icmp_error() and ipv6_local_error() to set skb->protocol to htons(ETH_P_IPV6) so that ipv6_recv_error() knows the address sitting right after the extended error is IPv6, else it will incorrectly map the first octet into an IPv4-mapped IPv6 address in the cmsg structure returned in a recvmsg() call to obtain the error. Signed-off-by: Brian Haley <brian.haley@hp.com> -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Signed-off-by: David S. Miller <davem@davemloft.net>
2010-05-03 23:44:27 +08:00
if (skb->protocol == htons(ETH_P_IPV6)) {
const struct ipv6hdr *ip6h = container_of((struct in6_addr *)(nh + serr->addr_offset),
struct ipv6hdr, daddr);
sin->sin6_addr = ip6h->daddr;
if (np->sndflow)
sin->sin6_flowinfo = ip6_flowinfo(ip6h);
sin->sin6_scope_id =
ipv6_iface_scope_id(&sin->sin6_addr,
IP6CB(skb)->iif);
} else {
ipv6_addr_set_v4mapped(*(__be32 *)(nh + serr->addr_offset),
&sin->sin6_addr);
sin->sin6_scope_id = 0;
}
*addr_len = sizeof(*sin);
}
memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err));
sin = &errhdr.offender;
memset(sin, 0, sizeof(*sin));
ip: fix error queue empty skb handling When reading from the error queue, msg_name and msg_control are only populated for some errors. A new exception for empty timestamp skbs added a false positive on icmp errors without payload. `traceroute -M udpconn` only displayed gateways that return payload with the icmp error: the embedded network headers are pulled before sock_queue_err_skb, leaving an skb with skb->len == 0 otherwise. Fix this regression by refining when msg_name and msg_control branches are taken. The solutions for the two fields are independent. msg_name only makes sense for errors that configure serr->port and serr->addr_offset. Test the first instead of skb->len. This also fixes another issue. saddr could hold the wrong data, as serr->addr_offset is not initialized in some code paths, pointing to the start of the network header. It is only valid when serr->port is set (non-zero). msg_control support differs between IPv4 and IPv6. IPv4 only honors requests for ICMP and timestamps with SOF_TIMESTAMPING_OPT_CMSG. The skb->len test can simply be removed, because skb->dev is also tested and never true for empty skbs. IPv6 honors requests for all errors aside from local errors and timestamps on empty skbs. In both cases, make the policy more explicit by moving this logic to a new function that decides whether to process msg_control and that optionally prepares the necessary fields in skb->cb[]. After this change, the IPv4 and IPv6 paths are more similar. The last case is rxrpc. Here, simply refine to only match timestamps. Fixes: 49ca0d8bfaf3 ("net-timestamp: no-payload option") Reported-by: Jan Niehusmann <jan@gondor.com> Signed-off-by: Willem de Bruijn <willemb@google.com> ---- Changes v1->v2 - fix local origin test inversion in ip6_datagram_support_cmsg - make v4 and v6 code paths more similar by introducing analogous ipv4_datagram_support_cmsg - fix compile bug in rxrpc Signed-off-by: David S. Miller <davem@davemloft.net>
2015-03-08 09:33:22 +08:00
if (ip6_datagram_support_cmsg(skb, serr)) {
sin->sin6_family = AF_INET6;
ip: fix error queue empty skb handling When reading from the error queue, msg_name and msg_control are only populated for some errors. A new exception for empty timestamp skbs added a false positive on icmp errors without payload. `traceroute -M udpconn` only displayed gateways that return payload with the icmp error: the embedded network headers are pulled before sock_queue_err_skb, leaving an skb with skb->len == 0 otherwise. Fix this regression by refining when msg_name and msg_control branches are taken. The solutions for the two fields are independent. msg_name only makes sense for errors that configure serr->port and serr->addr_offset. Test the first instead of skb->len. This also fixes another issue. saddr could hold the wrong data, as serr->addr_offset is not initialized in some code paths, pointing to the start of the network header. It is only valid when serr->port is set (non-zero). msg_control support differs between IPv4 and IPv6. IPv4 only honors requests for ICMP and timestamps with SOF_TIMESTAMPING_OPT_CMSG. The skb->len test can simply be removed, because skb->dev is also tested and never true for empty skbs. IPv6 honors requests for all errors aside from local errors and timestamps on empty skbs. In both cases, make the policy more explicit by moving this logic to a new function that decides whether to process msg_control and that optionally prepares the necessary fields in skb->cb[]. After this change, the IPv4 and IPv6 paths are more similar. The last case is rxrpc. Here, simply refine to only match timestamps. Fixes: 49ca0d8bfaf3 ("net-timestamp: no-payload option") Reported-by: Jan Niehusmann <jan@gondor.com> Signed-off-by: Willem de Bruijn <willemb@google.com> ---- Changes v1->v2 - fix local origin test inversion in ip6_datagram_support_cmsg - make v4 and v6 code paths more similar by introducing analogous ipv4_datagram_support_cmsg - fix compile bug in rxrpc Signed-off-by: David S. Miller <davem@davemloft.net>
2015-03-08 09:33:22 +08:00
if (np->rxopt.all)
ip6_datagram_recv_common_ctl(sk, msg, skb);
IPv6: fix IPV6_RECVERR handling of locally-generated errors I noticed when I added support for IPV6_DONTFRAG that if you set IPV6_RECVERR and tried to send a UDP packet larger than 64K to an IPv6 destination, you'd correctly get an EMSGSIZE, but reading from MSG_ERRQUEUE returned the incorrect address in the cmsg: struct msghdr: msg_name 0x7fff8f3c96d0 msg_namelen 28 struct sockaddr_in6: sin6_family 10 sin6_port 7639 sin6_flowinfo 0 sin6_addr ::ffff:38.32.0.0 sin6_scope_id 0 ((null)) It should have returned this in my case: struct msghdr: msg_name 0x7fffd866b510 msg_namelen 28 struct sockaddr_in6: sin6_family 10 sin6_port 7639 sin6_flowinfo 0 sin6_addr 2620:0:a09:e000:21f:29ff:fe57:f88b sin6_scope_id 0 ((null)) The problem is that ipv6_recv_error() assumes that if the error wasn't generated by ICMPv6, it's an IPv4 address sitting there, and proceeds to create a v4-mapped address from it. Change ipv6_icmp_error() and ipv6_local_error() to set skb->protocol to htons(ETH_P_IPV6) so that ipv6_recv_error() knows the address sitting right after the extended error is IPv6, else it will incorrectly map the first octet into an IPv4-mapped IPv6 address in the cmsg structure returned in a recvmsg() call to obtain the error. Signed-off-by: Brian Haley <brian.haley@hp.com> -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Signed-off-by: David S. Miller <davem@davemloft.net>
2010-05-03 23:44:27 +08:00
if (skb->protocol == htons(ETH_P_IPV6)) {
sin->sin6_addr = ipv6_hdr(skb)->saddr;
if (np->rxopt.all)
ip6_datagram_recv_specific_ctl(sk, msg, skb);
sin->sin6_scope_id =
ipv6_iface_scope_id(&sin->sin6_addr,
IP6CB(skb)->iif);
} else {
ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr,
&sin->sin6_addr);
if (inet_sk(sk)->cmsg_flags)
ip_cmsg_recv(msg, skb);
}
}
put_cmsg(msg, SOL_IPV6, IPV6_RECVERR, sizeof(errhdr), &errhdr);
/* Now we could try to dump offended packet options */
msg->msg_flags |= MSG_ERRQUEUE;
err = copied;
consume_skb(skb);
out:
return err;
}
EXPORT_SYMBOL_GPL(ipv6_recv_error);
/*
* Handle IPV6_RECVPATHMTU
*/
int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len,
int *addr_len)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct sk_buff *skb;
struct ip6_mtuinfo mtu_info;
DECLARE_SOCKADDR(struct sockaddr_in6 *, sin, msg->msg_name);
int err;
int copied;
err = -EAGAIN;
skb = xchg(&np->rxpmtu, NULL);
if (!skb)
goto out;
copied = skb->len;
if (copied > len) {
msg->msg_flags |= MSG_TRUNC;
copied = len;
}
err = skb_copy_datagram_msg(skb, 0, msg, copied);
if (err)
goto out_free_skb;
sock_recv_timestamp(msg, sk, skb);
memcpy(&mtu_info, IP6CBMTU(skb), sizeof(mtu_info));
if (sin) {
sin->sin6_family = AF_INET6;
sin->sin6_flowinfo = 0;
sin->sin6_port = 0;
sin->sin6_scope_id = mtu_info.ip6m_addr.sin6_scope_id;
sin->sin6_addr = mtu_info.ip6m_addr.sin6_addr;
*addr_len = sizeof(*sin);
}
put_cmsg(msg, SOL_IPV6, IPV6_PATHMTU, sizeof(mtu_info), &mtu_info);
err = copied;
out_free_skb:
kfree_skb(skb);
out:
return err;
}
void ip6_datagram_recv_common_ctl(struct sock *sk, struct msghdr *msg,
struct sk_buff *skb)
{
struct ipv6_pinfo *np = inet6_sk(sk);
bool is_ipv6 = skb->protocol == htons(ETH_P_IPV6);
if (np->rxopt.bits.rxinfo) {
struct in6_pktinfo src_info;
if (is_ipv6) {
src_info.ipi6_ifindex = IP6CB(skb)->iif;
src_info.ipi6_addr = ipv6_hdr(skb)->daddr;
} else {
src_info.ipi6_ifindex =
PKTINFO_SKB_CB(skb)->ipi_ifindex;
ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr,
&src_info.ipi6_addr);
}
net-timestamp: allow reading recv cmsg on errqueue with origin tstamp Allow reading of timestamps and cmsg at the same time on all relevant socket families. One use is to correlate timestamps with egress device, by asking for cmsg IP_PKTINFO. on AF_INET sockets, call the relevant function (ip_cmsg_recv). To avoid changing legacy expectations, only do so if the caller sets a new timestamping flag SOF_TIMESTAMPING_OPT_CMSG. on AF_INET6 sockets, IPV6_PKTINFO and all other recv cmsg are already returned for all origins. only change is to set ifindex, which is not initialized for all error origins. In both cases, only generate the pktinfo message if an ifindex is known. This is not the case for ACK timestamps. The difference between the protocol families is probably a historical accident as a result of the different conditions for generating cmsg in the relevant ip(v6)_recv_error function: ipv4: if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) { ipv6: if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL) { At one time, this was the same test bar for the ICMP/ICMP6 distinction. This is no longer true. Signed-off-by: Willem de Bruijn <willemb@google.com> ---- Changes v1 -> v2 large rewrite - integrate with existing pktinfo cmsg generation code - on ipv4: only send with new flag, to maintain legacy behavior - on ipv6: send at most a single pktinfo cmsg - on ipv6: initialize fields if not yet initialized The recv cmsg interfaces are also relevant to the discussion of whether looping packet headers is problematic. For v6, cmsgs that identify many headers are already returned. This patch expands that to v4. If it sounds reasonable, I will follow with patches 1. request timestamps without payload with SOF_TIMESTAMPING_OPT_TSONLY (http://patchwork.ozlabs.org/patch/366967/) 2. sysctl to conditionally drop all timestamps that have payload or cmsg from users without CAP_NET_RAW. Signed-off-by: David S. Miller <davem@davemloft.net>
2014-12-01 11:22:34 +08:00
if (src_info.ipi6_ifindex >= 0)
put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO,
sizeof(src_info), &src_info);
}
}
void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
struct sk_buff *skb)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct inet6_skb_parm *opt = IP6CB(skb);
unsigned char *nh = skb_network_header(skb);
if (np->rxopt.bits.rxhlim) {
int hlim = ipv6_hdr(skb)->hop_limit;
put_cmsg(msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim);
}
if (np->rxopt.bits.rxtclass) {
int tclass = ipv6_get_dsfield(ipv6_hdr(skb));
put_cmsg(msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass);
}
if (np->rxopt.bits.rxflow) {
__be32 flowinfo = ip6_flowinfo((struct ipv6hdr *)nh);
if (flowinfo)
put_cmsg(msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo);
}
/* HbH is allowed only once */
if (np->rxopt.bits.hopopts && (opt->flags & IP6SKB_HOPBYHOP)) {
u8 *ptr = nh + sizeof(struct ipv6hdr);
put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr);
}
if (opt->lastopt &&
(np->rxopt.bits.dstopts || np->rxopt.bits.srcrt)) {
/*
* Silly enough, but we need to reparse in order to
* report extension headers (except for HbH)
* in order.
*
* Also note that IPV6_RECVRTHDRDSTOPTS is NOT
* (and WILL NOT be) defined because
* IPV6_RECVDSTOPTS is more generic. --yoshfuji
*/
unsigned int off = sizeof(struct ipv6hdr);
u8 nexthdr = ipv6_hdr(skb)->nexthdr;
while (off <= opt->lastopt) {
unsigned int len;
u8 *ptr = nh + off;
switch (nexthdr) {
case IPPROTO_DSTOPTS:
nexthdr = ptr[0];
len = (ptr[1] + 1) << 3;
if (np->rxopt.bits.dstopts)
put_cmsg(msg, SOL_IPV6, IPV6_DSTOPTS, len, ptr);
break;
case IPPROTO_ROUTING:
nexthdr = ptr[0];
len = (ptr[1] + 1) << 3;
if (np->rxopt.bits.srcrt)
put_cmsg(msg, SOL_IPV6, IPV6_RTHDR, len, ptr);
break;
case IPPROTO_AH:
nexthdr = ptr[0];
len = (ptr[1] + 2) << 2;
break;
default:
nexthdr = ptr[0];
len = (ptr[1] + 1) << 3;
break;
}
off += len;
}
}
/* socket options in old style */
if (np->rxopt.bits.rxoinfo) {
struct in6_pktinfo src_info;
src_info.ipi6_ifindex = opt->iif;
src_info.ipi6_addr = ipv6_hdr(skb)->daddr;
put_cmsg(msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info);
}
if (np->rxopt.bits.rxohlim) {
int hlim = ipv6_hdr(skb)->hop_limit;
put_cmsg(msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim);
}
if (np->rxopt.bits.ohopopts && (opt->flags & IP6SKB_HOPBYHOP)) {
u8 *ptr = nh + sizeof(struct ipv6hdr);
put_cmsg(msg, SOL_IPV6, IPV6_2292HOPOPTS, (ptr[1]+1)<<3, ptr);
}
if (np->rxopt.bits.odstopts && opt->dst0) {
u8 *ptr = nh + opt->dst0;
put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr);
}
if (np->rxopt.bits.osrcrt && opt->srcrt) {
struct ipv6_rt_hdr *rthdr = (struct ipv6_rt_hdr *)(nh + opt->srcrt);
put_cmsg(msg, SOL_IPV6, IPV6_2292RTHDR, (rthdr->hdrlen+1) << 3, rthdr);
}
if (np->rxopt.bits.odstopts && opt->dst1) {
u8 *ptr = nh + opt->dst1;
put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr);
}
if (np->rxopt.bits.rxorigdstaddr) {
struct sockaddr_in6 sin6;
__be16 *ports = (__be16 *) skb_transport_header(skb);
if (skb_transport_offset(skb) + 4 <= (int)skb->len) {
/* All current transport protocols have the port numbers in the
* first four bytes of the transport header and this function is
* written with this assumption in mind.
*/
sin6.sin6_family = AF_INET6;
sin6.sin6_addr = ipv6_hdr(skb)->daddr;
sin6.sin6_port = ports[1];
sin6.sin6_flowinfo = 0;
sin6.sin6_scope_id =
ipv6_iface_scope_id(&ipv6_hdr(skb)->daddr,
opt->iif);
put_cmsg(msg, SOL_IPV6, IPV6_ORIGDSTADDR, sizeof(sin6), &sin6);
}
}
if (np->rxopt.bits.recvfragsize && opt->frag_max_size) {
int val = opt->frag_max_size;
put_cmsg(msg, SOL_IPV6, IPV6_RECVFRAGSIZE, sizeof(val), &val);
}
}
void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg,
struct sk_buff *skb)
{
ip6_datagram_recv_common_ctl(sk, msg, skb);
ip6_datagram_recv_specific_ctl(sk, msg, skb);
}
EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl);
int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
struct msghdr *msg, struct flowi6 *fl6,
struct ipcm6_cookie *ipc6, struct sockcm_cookie *sockc)
{
struct in6_pktinfo *src_info;
struct cmsghdr *cmsg;
struct ipv6_rt_hdr *rthdr;
struct ipv6_opt_hdr *hdr;
struct ipv6_txoptions *opt = ipc6->opt;
int len;
int err = 0;
for_each_cmsghdr(cmsg, msg) {
int addr_type;
if (!CMSG_OK(msg, cmsg)) {
err = -EINVAL;
goto exit_f;
}
if (cmsg->cmsg_level == SOL_SOCKET) {
err = __sock_cmsg_send(sk, msg, cmsg, sockc);
if (err)
return err;
continue;
}
if (cmsg->cmsg_level != SOL_IPV6)
continue;
switch (cmsg->cmsg_type) {
case IPV6_PKTINFO:
case IPV6_2292PKTINFO:
{
struct net_device *dev = NULL;
if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) {
err = -EINVAL;
goto exit_f;
}
src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg);
if (src_info->ipi6_ifindex) {
if (fl6->flowi6_oif &&
src_info->ipi6_ifindex != fl6->flowi6_oif)
return -EINVAL;
fl6->flowi6_oif = src_info->ipi6_ifindex;
}
addr_type = __ipv6_addr_type(&src_info->ipi6_addr);
rcu_read_lock();
if (fl6->flowi6_oif) {
dev = dev_get_by_index_rcu(net, fl6->flowi6_oif);
if (!dev) {
rcu_read_unlock();
return -ENODEV;
}
} else if (addr_type & IPV6_ADDR_LINKLOCAL) {
rcu_read_unlock();
return -EINVAL;
}
if (addr_type != IPV6_ADDR_ANY) {
int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL;
if (!(inet_sk(sk)->freebind || inet_sk(sk)->transparent) &&
net/ipv6: Change address check to always take a device argument ipv6_chk_addr_and_flags determines if an address is a local address and optionally if it is an address on a specific device. For example, it is called by ip6_route_info_create to determine if a given gateway address is a local address. The address check currently does not consider L3 domains and as a result does not allow a route to be added in one VRF if the nexthop points to an address in a second VRF. e.g., $ ip route add 2001:db8:1::/64 vrf r2 via 2001:db8:102::23 Error: Invalid gateway address. where 2001:db8:102::23 is an address on an interface in vrf r1. ipv6_chk_addr_and_flags needs to allow callers to always pass in a device with a separate argument to not limit the address to the specific device. The device is used used to determine the L3 domain of interest. To that end add an argument to skip the device check and update callers to always pass a device where possible and use the new argument to mean any address in the domain. Update a handful of users of ipv6_chk_addr with a NULL dev argument. This patch handles the change to these callers without adding the domain check. ip6_validate_gw needs to handle 2 cases - one where the device is given as part of the nexthop spec and the other where the device is resolved. There is at least 1 VRF case where deferring the check to only after the route lookup has resolved the device fails with an unintuitive error "RTNETLINK answers: No route to host" as opposed to the preferred "Error: Gateway can not be a local address." The 'no route to host' error is because of the fallback to a full lookup. The check is done twice to avoid this error. Signed-off-by: David Ahern <dsahern@gmail.com> Reviewed-by: Ido Schimmel <idosch@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-03-13 23:29:37 +08:00
!ipv6_chk_addr_and_flags(net, &src_info->ipi6_addr,
dev, !strict, 0,
IFA_F_TENTATIVE) &&
!ipv6_chk_acast_addr_src(net, dev,
&src_info->ipi6_addr))
err = -EINVAL;
else
fl6->saddr = src_info->ipi6_addr;
}
rcu_read_unlock();
if (err)
goto exit_f;
break;
}
case IPV6_FLOWINFO:
if (cmsg->cmsg_len < CMSG_LEN(4)) {
err = -EINVAL;
goto exit_f;
}
if (fl6->flowlabel&IPV6_FLOWINFO_MASK) {
if ((fl6->flowlabel^*(__be32 *)CMSG_DATA(cmsg))&~IPV6_FLOWINFO_MASK) {
err = -EINVAL;
goto exit_f;
}
}
fl6->flowlabel = IPV6_FLOWINFO_MASK & *(__be32 *)CMSG_DATA(cmsg);
break;
case IPV6_2292HOPOPTS:
case IPV6_HOPOPTS:
if (opt->hopopt || cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) {
err = -EINVAL;
goto exit_f;
}
hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg);
len = ((hdr->hdrlen + 1) << 3);
if (cmsg->cmsg_len < CMSG_LEN(len)) {
err = -EINVAL;
goto exit_f;
}
net: Allow userns root to control ipv6 Allow an unpriviled user who has created a user namespace, and then created a network namespace to effectively use the new network namespace, by reducing capable(CAP_NET_ADMIN) and capable(CAP_NET_RAW) calls to be ns_capable(net->user_ns, CAP_NET_ADMIN), or capable(net->user_ns, CAP_NET_RAW) calls. Settings that merely control a single network device are allowed. Either the network device is a logical network device where restrictions make no difference or the network device is hardware NIC that has been explicity moved from the initial network namespace. In general policy and network stack state changes are allowed while resource control is left unchanged. Allow the SIOCSIFADDR ioctl to add ipv6 addresses. Allow the SIOCDIFADDR ioctl to delete ipv6 addresses. Allow the SIOCADDRT ioctl to add ipv6 routes. Allow the SIOCDELRT ioctl to delete ipv6 routes. Allow creation of ipv6 raw sockets. Allow setting the IPV6_JOIN_ANYCAST socket option. Allow setting the IPV6_FL_A_RENEW parameter of the IPV6_FLOWLABEL_MGR socket option. Allow setting the IPV6_TRANSPARENT socket option. Allow setting the IPV6_HOPOPTS socket option. Allow setting the IPV6_RTHDRDSTOPTS socket option. Allow setting the IPV6_DSTOPTS socket option. Allow setting the IPV6_IPSEC_POLICY socket option. Allow setting the IPV6_XFRM_POLICY socket option. Allow sending packets with the IPV6_2292HOPOPTS control message. Allow sending packets with the IPV6_2292DSTOPTS control message. Allow sending packets with the IPV6_RTHDRDSTOPTS control message. Allow setting the multicast routing socket options on non multicast routing sockets. Allow the SIOCADDTUNNEL, SIOCCHGTUNNEL, and SIOCDELTUNNEL ioctls for setting up, changing and deleting tunnels over ipv6. Allow the SIOCADDTUNNEL, SIOCCHGTUNNEL, SIOCDELTUNNEL ioctls for setting up, changing and deleting ipv6 over ipv4 tunnels. Allow the SIOCADDPRL, SIOCDELPRL, SIOCCHGPRL ioctls for adding, deleting, and changing the potential router list for ISATAP tunnels. Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-16 11:03:06 +08:00
if (!ns_capable(net->user_ns, CAP_NET_RAW)) {
err = -EPERM;
goto exit_f;
}
opt->opt_nflen += len;
opt->hopopt = hdr;
break;
case IPV6_2292DSTOPTS:
if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) {
err = -EINVAL;
goto exit_f;
}
hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg);
len = ((hdr->hdrlen + 1) << 3);
if (cmsg->cmsg_len < CMSG_LEN(len)) {
err = -EINVAL;
goto exit_f;
}
net: Allow userns root to control ipv6 Allow an unpriviled user who has created a user namespace, and then created a network namespace to effectively use the new network namespace, by reducing capable(CAP_NET_ADMIN) and capable(CAP_NET_RAW) calls to be ns_capable(net->user_ns, CAP_NET_ADMIN), or capable(net->user_ns, CAP_NET_RAW) calls. Settings that merely control a single network device are allowed. Either the network device is a logical network device where restrictions make no difference or the network device is hardware NIC that has been explicity moved from the initial network namespace. In general policy and network stack state changes are allowed while resource control is left unchanged. Allow the SIOCSIFADDR ioctl to add ipv6 addresses. Allow the SIOCDIFADDR ioctl to delete ipv6 addresses. Allow the SIOCADDRT ioctl to add ipv6 routes. Allow the SIOCDELRT ioctl to delete ipv6 routes. Allow creation of ipv6 raw sockets. Allow setting the IPV6_JOIN_ANYCAST socket option. Allow setting the IPV6_FL_A_RENEW parameter of the IPV6_FLOWLABEL_MGR socket option. Allow setting the IPV6_TRANSPARENT socket option. Allow setting the IPV6_HOPOPTS socket option. Allow setting the IPV6_RTHDRDSTOPTS socket option. Allow setting the IPV6_DSTOPTS socket option. Allow setting the IPV6_IPSEC_POLICY socket option. Allow setting the IPV6_XFRM_POLICY socket option. Allow sending packets with the IPV6_2292HOPOPTS control message. Allow sending packets with the IPV6_2292DSTOPTS control message. Allow sending packets with the IPV6_RTHDRDSTOPTS control message. Allow setting the multicast routing socket options on non multicast routing sockets. Allow the SIOCADDTUNNEL, SIOCCHGTUNNEL, and SIOCDELTUNNEL ioctls for setting up, changing and deleting tunnels over ipv6. Allow the SIOCADDTUNNEL, SIOCCHGTUNNEL, SIOCDELTUNNEL ioctls for setting up, changing and deleting ipv6 over ipv4 tunnels. Allow the SIOCADDPRL, SIOCDELPRL, SIOCCHGPRL ioctls for adding, deleting, and changing the potential router list for ISATAP tunnels. Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-16 11:03:06 +08:00
if (!ns_capable(net->user_ns, CAP_NET_RAW)) {
err = -EPERM;
goto exit_f;
}
if (opt->dst1opt) {
err = -EINVAL;
goto exit_f;
}
opt->opt_flen += len;
opt->dst1opt = hdr;
break;
case IPV6_DSTOPTS:
case IPV6_RTHDRDSTOPTS:
if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) {
err = -EINVAL;
goto exit_f;
}
hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg);
len = ((hdr->hdrlen + 1) << 3);
if (cmsg->cmsg_len < CMSG_LEN(len)) {
err = -EINVAL;
goto exit_f;
}
net: Allow userns root to control ipv6 Allow an unpriviled user who has created a user namespace, and then created a network namespace to effectively use the new network namespace, by reducing capable(CAP_NET_ADMIN) and capable(CAP_NET_RAW) calls to be ns_capable(net->user_ns, CAP_NET_ADMIN), or capable(net->user_ns, CAP_NET_RAW) calls. Settings that merely control a single network device are allowed. Either the network device is a logical network device where restrictions make no difference or the network device is hardware NIC that has been explicity moved from the initial network namespace. In general policy and network stack state changes are allowed while resource control is left unchanged. Allow the SIOCSIFADDR ioctl to add ipv6 addresses. Allow the SIOCDIFADDR ioctl to delete ipv6 addresses. Allow the SIOCADDRT ioctl to add ipv6 routes. Allow the SIOCDELRT ioctl to delete ipv6 routes. Allow creation of ipv6 raw sockets. Allow setting the IPV6_JOIN_ANYCAST socket option. Allow setting the IPV6_FL_A_RENEW parameter of the IPV6_FLOWLABEL_MGR socket option. Allow setting the IPV6_TRANSPARENT socket option. Allow setting the IPV6_HOPOPTS socket option. Allow setting the IPV6_RTHDRDSTOPTS socket option. Allow setting the IPV6_DSTOPTS socket option. Allow setting the IPV6_IPSEC_POLICY socket option. Allow setting the IPV6_XFRM_POLICY socket option. Allow sending packets with the IPV6_2292HOPOPTS control message. Allow sending packets with the IPV6_2292DSTOPTS control message. Allow sending packets with the IPV6_RTHDRDSTOPTS control message. Allow setting the multicast routing socket options on non multicast routing sockets. Allow the SIOCADDTUNNEL, SIOCCHGTUNNEL, and SIOCDELTUNNEL ioctls for setting up, changing and deleting tunnels over ipv6. Allow the SIOCADDTUNNEL, SIOCCHGTUNNEL, SIOCDELTUNNEL ioctls for setting up, changing and deleting ipv6 over ipv4 tunnels. Allow the SIOCADDPRL, SIOCDELPRL, SIOCCHGPRL ioctls for adding, deleting, and changing the potential router list for ISATAP tunnels. Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-16 11:03:06 +08:00
if (!ns_capable(net->user_ns, CAP_NET_RAW)) {
err = -EPERM;
goto exit_f;
}
if (cmsg->cmsg_type == IPV6_DSTOPTS) {
opt->opt_flen += len;
opt->dst1opt = hdr;
} else {
opt->opt_nflen += len;
opt->dst0opt = hdr;
}
break;
case IPV6_2292RTHDR:
case IPV6_RTHDR:
if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_rt_hdr))) {
err = -EINVAL;
goto exit_f;
}
rthdr = (struct ipv6_rt_hdr *)CMSG_DATA(cmsg);
switch (rthdr->type) {
#if IS_ENABLED(CONFIG_IPV6_MIP6)
case IPV6_SRCRT_TYPE_2:
if (rthdr->hdrlen != 2 ||
rthdr->segments_left != 1) {
err = -EINVAL;
goto exit_f;
}
break;
#endif
default:
err = -EINVAL;
goto exit_f;
}
len = ((rthdr->hdrlen + 1) << 3);
if (cmsg->cmsg_len < CMSG_LEN(len)) {
err = -EINVAL;
goto exit_f;
}
/* segments left must also match */
if ((rthdr->hdrlen >> 1) != rthdr->segments_left) {
err = -EINVAL;
goto exit_f;
}
opt->opt_nflen += len;
opt->srcrt = rthdr;
if (cmsg->cmsg_type == IPV6_2292RTHDR && opt->dst1opt) {
int dsthdrlen = ((opt->dst1opt->hdrlen+1)<<3);
opt->opt_nflen += dsthdrlen;
opt->dst0opt = opt->dst1opt;
opt->dst1opt = NULL;
opt->opt_flen -= dsthdrlen;
}
break;
case IPV6_2292HOPLIMIT:
case IPV6_HOPLIMIT:
if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) {
err = -EINVAL;
goto exit_f;
}
ipc6->hlimit = *(int *)CMSG_DATA(cmsg);
if (ipc6->hlimit < -1 || ipc6->hlimit > 0xff) {
err = -EINVAL;
goto exit_f;
}
break;
case IPV6_TCLASS:
{
int tc;
err = -EINVAL;
if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
goto exit_f;
tc = *(int *)CMSG_DATA(cmsg);
if (tc < -1 || tc > 0xff)
goto exit_f;
err = 0;
ipc6->tclass = tc;
break;
}
case IPV6_DONTFRAG:
{
int df;
err = -EINVAL;
if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
goto exit_f;
df = *(int *)CMSG_DATA(cmsg);
if (df < 0 || df > 1)
goto exit_f;
err = 0;
ipc6->dontfrag = df;
break;
}
default:
net_dbg_ratelimited("invalid cmsg type: %d\n",
cmsg->cmsg_type);
err = -EINVAL;
goto exit_f;
}
}
exit_f:
return err;
}
EXPORT_SYMBOL_GPL(ip6_datagram_send_ctl);
void ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp,
__u16 srcp, __u16 destp, int bucket)
{
const struct in6_addr *dest, *src;
dest = &sp->sk_v6_daddr;
src = &sp->sk_v6_rcv_saddr;
seq_printf(seq,
"%5d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
"%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d\n",
bucket,
src->s6_addr32[0], src->s6_addr32[1],
src->s6_addr32[2], src->s6_addr32[3], srcp,
dest->s6_addr32[0], dest->s6_addr32[1],
dest->s6_addr32[2], dest->s6_addr32[3], destp,
sp->sk_state,
sk_wmem_alloc_get(sp),
sk_rmem_alloc_get(sp),
0, 0L, 0,
from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)),
0,
sock_i_ino(sp),
refcount_read(&sp->sk_refcnt), sp,
atomic_read(&sp->sk_drops));
}