Merge branch 'tcp_dccp_ports'
Eric Dumazet says:
====================
tcp/dccp: better use of ephemeral ports
Big servers have a bloated bind table, making it very hard for ephemeral
port allocations to succeed without special container/namespace tricks.
This patch series extends the strategy added in commit 07f4c90062
("tcp/dccp: try to not exhaust ip_local_port_range in connect()").
Since ports used by connect() are very likely to be shared among them,
we give a hint to both bind() and connect() to keep the crowds separated
if possible.
Of course, if on a specific host an application needs to allocate ~30000
ports using bind(), it will still be able to do so. The same holds for ~30000
connect() calls to a unique 2-tuple (dst addr, dst port).
The new implementation is also friendlier to softirqs and reschedules.
v2: rebase after TCP SO_REUSEPORT changes
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
commit
e51271d4ce
|
@ -91,165 +91,153 @@ EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
|
|||
|
||||
/* Obtain a reference to a local port for the given sock,
|
||||
* if snum is zero it means select any available local port.
|
||||
* We try to allocate an odd port (and leave even ports for connect())
|
||||
*/
|
||||
int inet_csk_get_port(struct sock *sk, unsigned short snum)
|
||||
{
|
||||
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
|
||||
bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
|
||||
struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
|
||||
int ret = 1, attempts = 5, port = snum;
|
||||
int smallest_size = -1, smallest_port;
|
||||
struct inet_bind_hashbucket *head;
|
||||
struct inet_bind_bucket *tb;
|
||||
int ret, attempts = 5;
|
||||
struct net *net = sock_net(sk);
|
||||
int smallest_size = -1, smallest_rover;
|
||||
int i, low, high, attempt_half;
|
||||
struct inet_bind_bucket *tb;
|
||||
kuid_t uid = sock_i_uid(sk);
|
||||
int attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
|
||||
u32 remaining, offset;
|
||||
|
||||
local_bh_disable();
|
||||
if (!snum) {
|
||||
int remaining, rover, low, high;
|
||||
|
||||
again:
|
||||
inet_get_local_port_range(net, &low, &high);
|
||||
if (attempt_half) {
|
||||
int half = low + ((high - low) >> 1);
|
||||
|
||||
if (attempt_half == 1)
|
||||
high = half;
|
||||
else
|
||||
low = half;
|
||||
}
|
||||
remaining = (high - low) + 1;
|
||||
smallest_rover = rover = prandom_u32() % remaining + low;
|
||||
|
||||
smallest_size = -1;
|
||||
do {
|
||||
if (inet_is_local_reserved_port(net, rover))
|
||||
goto next_nolock;
|
||||
head = &hashinfo->bhash[inet_bhashfn(net, rover,
|
||||
hashinfo->bhash_size)];
|
||||
spin_lock(&head->lock);
|
||||
inet_bind_bucket_for_each(tb, &head->chain)
|
||||
if (net_eq(ib_net(tb), net) && tb->port == rover) {
|
||||
if (((tb->fastreuse > 0 &&
|
||||
sk->sk_reuse &&
|
||||
sk->sk_state != TCP_LISTEN) ||
|
||||
(tb->fastreuseport > 0 &&
|
||||
sk->sk_reuseport &&
|
||||
!rcu_access_pointer(sk->sk_reuseport_cb) &&
|
||||
uid_eq(tb->fastuid, uid))) &&
|
||||
(tb->num_owners < smallest_size || smallest_size == -1)) {
|
||||
smallest_size = tb->num_owners;
|
||||
smallest_rover = rover;
|
||||
}
|
||||
if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
|
||||
snum = rover;
|
||||
goto tb_found;
|
||||
}
|
||||
goto next;
|
||||
}
|
||||
break;
|
||||
next:
|
||||
spin_unlock(&head->lock);
|
||||
next_nolock:
|
||||
if (++rover > high)
|
||||
rover = low;
|
||||
} while (--remaining > 0);
|
||||
|
||||
/* Exhausted local port range during search? It is not
|
||||
* possible for us to be holding one of the bind hash
|
||||
* locks if this test triggers, because if 'remaining'
|
||||
* drops to zero, we broke out of the do/while loop at
|
||||
* the top level, not from the 'break;' statement.
|
||||
*/
|
||||
ret = 1;
|
||||
if (remaining <= 0) {
|
||||
if (smallest_size != -1) {
|
||||
snum = smallest_rover;
|
||||
goto have_snum;
|
||||
}
|
||||
if (attempt_half == 1) {
|
||||
/* OK we now try the upper half of the range */
|
||||
attempt_half = 2;
|
||||
goto again;
|
||||
}
|
||||
goto fail;
|
||||
}
|
||||
/* OK, here is the one we will use. HEAD is
|
||||
* non-NULL and we hold it's mutex.
|
||||
*/
|
||||
snum = rover;
|
||||
} else {
|
||||
have_snum:
|
||||
head = &hashinfo->bhash[inet_bhashfn(net, snum,
|
||||
hashinfo->bhash_size)];
|
||||
spin_lock(&head->lock);
|
||||
if (port) {
|
||||
have_port:
|
||||
head = &hinfo->bhash[inet_bhashfn(net, port,
|
||||
hinfo->bhash_size)];
|
||||
spin_lock_bh(&head->lock);
|
||||
inet_bind_bucket_for_each(tb, &head->chain)
|
||||
if (net_eq(ib_net(tb), net) && tb->port == snum)
|
||||
if (net_eq(ib_net(tb), net) && tb->port == port)
|
||||
goto tb_found;
|
||||
|
||||
goto tb_not_found;
|
||||
}
|
||||
tb = NULL;
|
||||
goto tb_not_found;
|
||||
again:
|
||||
attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
|
||||
other_half_scan:
|
||||
inet_get_local_port_range(net, &low, &high);
|
||||
high++; /* [32768, 60999] -> [32768, 61000[ */
|
||||
if (high - low < 4)
|
||||
attempt_half = 0;
|
||||
if (attempt_half) {
|
||||
int half = low + (((high - low) >> 2) << 1);
|
||||
|
||||
if (attempt_half == 1)
|
||||
high = half;
|
||||
else
|
||||
low = half;
|
||||
}
|
||||
remaining = high - low;
|
||||
if (likely(remaining > 1))
|
||||
remaining &= ~1U;
|
||||
|
||||
offset = prandom_u32() % remaining;
|
||||
/* __inet_hash_connect() favors ports having @low parity
|
||||
* We do the opposite to not pollute connect() users.
|
||||
*/
|
||||
offset |= 1U;
|
||||
smallest_size = -1;
|
||||
smallest_port = low; /* avoid compiler warning */
|
||||
|
||||
other_parity_scan:
|
||||
port = low + offset;
|
||||
for (i = 0; i < remaining; i += 2, port += 2) {
|
||||
if (unlikely(port >= high))
|
||||
port -= remaining;
|
||||
if (inet_is_local_reserved_port(net, port))
|
||||
continue;
|
||||
head = &hinfo->bhash[inet_bhashfn(net, port,
|
||||
hinfo->bhash_size)];
|
||||
spin_lock_bh(&head->lock);
|
||||
inet_bind_bucket_for_each(tb, &head->chain)
|
||||
if (net_eq(ib_net(tb), net) && tb->port == port) {
|
||||
if (((tb->fastreuse > 0 && reuse) ||
|
||||
(tb->fastreuseport > 0 &&
|
||||
sk->sk_reuseport &&
|
||||
!rcu_access_pointer(sk->sk_reuseport_cb) &&
|
||||
uid_eq(tb->fastuid, uid))) &&
|
||||
(tb->num_owners < smallest_size || smallest_size == -1)) {
|
||||
smallest_size = tb->num_owners;
|
||||
smallest_port = port;
|
||||
}
|
||||
if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false))
|
||||
goto tb_found;
|
||||
goto next_port;
|
||||
}
|
||||
goto tb_not_found;
|
||||
next_port:
|
||||
spin_unlock_bh(&head->lock);
|
||||
cond_resched();
|
||||
}
|
||||
|
||||
if (smallest_size != -1) {
|
||||
port = smallest_port;
|
||||
goto have_port;
|
||||
}
|
||||
offset--;
|
||||
if (!(offset & 1))
|
||||
goto other_parity_scan;
|
||||
|
||||
if (attempt_half == 1) {
|
||||
/* OK we now try the upper half of the range */
|
||||
attempt_half = 2;
|
||||
goto other_half_scan;
|
||||
}
|
||||
return ret;
|
||||
|
||||
tb_not_found:
|
||||
tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
|
||||
net, head, port);
|
||||
if (!tb)
|
||||
goto fail_unlock;
|
||||
tb_found:
|
||||
if (!hlist_empty(&tb->owners)) {
|
||||
if (sk->sk_reuse == SK_FORCE_REUSE)
|
||||
goto success;
|
||||
|
||||
if (((tb->fastreuse > 0 &&
|
||||
sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
|
||||
if (((tb->fastreuse > 0 && reuse) ||
|
||||
(tb->fastreuseport > 0 &&
|
||||
sk->sk_reuseport &&
|
||||
!rcu_access_pointer(sk->sk_reuseport_cb) &&
|
||||
uid_eq(tb->fastuid, uid))) && smallest_size == -1) {
|
||||
sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
|
||||
smallest_size == -1)
|
||||
goto success;
|
||||
} else {
|
||||
ret = 1;
|
||||
if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
|
||||
if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
|
||||
(tb->fastreuseport > 0 &&
|
||||
sk->sk_reuseport &&
|
||||
!rcu_access_pointer(sk->sk_reuseport_cb) &&
|
||||
uid_eq(tb->fastuid, uid))) &&
|
||||
smallest_size != -1 && --attempts >= 0) {
|
||||
spin_unlock(&head->lock);
|
||||
goto again;
|
||||
}
|
||||
|
||||
goto fail_unlock;
|
||||
if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
|
||||
if ((reuse ||
|
||||
(tb->fastreuseport > 0 &&
|
||||
sk->sk_reuseport &&
|
||||
!rcu_access_pointer(sk->sk_reuseport_cb) &&
|
||||
uid_eq(tb->fastuid, uid))) &&
|
||||
smallest_size != -1 && --attempts >= 0) {
|
||||
spin_unlock_bh(&head->lock);
|
||||
goto again;
|
||||
}
|
||||
goto fail_unlock;
|
||||
}
|
||||
}
|
||||
tb_not_found:
|
||||
ret = 1;
|
||||
if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
|
||||
net, head, snum)) == NULL)
|
||||
goto fail_unlock;
|
||||
if (hlist_empty(&tb->owners)) {
|
||||
if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
|
||||
tb->fastreuse = 1;
|
||||
else
|
||||
if (!reuse)
|
||||
tb->fastreuse = 0;
|
||||
if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
|
||||
tb->fastreuseport = 0;
|
||||
} else {
|
||||
tb->fastreuse = reuse;
|
||||
if (sk->sk_reuseport) {
|
||||
tb->fastreuseport = 1;
|
||||
tb->fastuid = uid;
|
||||
} else
|
||||
tb->fastreuseport = 0;
|
||||
} else {
|
||||
if (tb->fastreuse &&
|
||||
(!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
|
||||
tb->fastreuse = 0;
|
||||
if (tb->fastreuseport &&
|
||||
(!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)))
|
||||
} else {
|
||||
tb->fastreuseport = 0;
|
||||
}
|
||||
}
|
||||
success:
|
||||
if (!inet_csk(sk)->icsk_bind_hash)
|
||||
inet_bind_hash(sk, tb, snum);
|
||||
inet_bind_hash(sk, tb, port);
|
||||
WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
|
||||
ret = 0;
|
||||
|
||||
fail_unlock:
|
||||
spin_unlock(&head->lock);
|
||||
fail:
|
||||
local_bh_enable();
|
||||
spin_unlock_bh(&head->lock);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(inet_csk_get_port);
|
||||
|
|
|
@ -565,106 +565,106 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
|
|||
struct sock *, __u16, struct inet_timewait_sock **))
|
||||
{
|
||||
struct inet_hashinfo *hinfo = death_row->hashinfo;
|
||||
const unsigned short snum = inet_sk(sk)->inet_num;
|
||||
struct inet_timewait_sock *tw = NULL;
|
||||
struct inet_bind_hashbucket *head;
|
||||
struct inet_bind_bucket *tb;
|
||||
int ret;
|
||||
int port = inet_sk(sk)->inet_num;
|
||||
struct net *net = sock_net(sk);
|
||||
struct inet_bind_bucket *tb;
|
||||
u32 remaining, offset;
|
||||
int ret, i, low, high;
|
||||
static u32 hint;
|
||||
|
||||
if (!snum) {
|
||||
int i, remaining, low, high, port;
|
||||
static u32 hint;
|
||||
u32 offset = hint + port_offset;
|
||||
struct inet_timewait_sock *tw = NULL;
|
||||
|
||||
inet_get_local_port_range(net, &low, &high);
|
||||
remaining = (high - low) + 1;
|
||||
|
||||
/* By starting with offset being an even number,
|
||||
* we tend to leave about 50% of ports for other uses,
|
||||
* like bind(0).
|
||||
*/
|
||||
offset &= ~1;
|
||||
|
||||
local_bh_disable();
|
||||
for (i = 0; i < remaining; i++) {
|
||||
port = low + (i + offset) % remaining;
|
||||
if (inet_is_local_reserved_port(net, port))
|
||||
continue;
|
||||
head = &hinfo->bhash[inet_bhashfn(net, port,
|
||||
hinfo->bhash_size)];
|
||||
spin_lock(&head->lock);
|
||||
|
||||
/* Does not bother with rcv_saddr checks,
|
||||
* because the established check is already
|
||||
* unique enough.
|
||||
*/
|
||||
inet_bind_bucket_for_each(tb, &head->chain) {
|
||||
if (net_eq(ib_net(tb), net) &&
|
||||
tb->port == port) {
|
||||
if (tb->fastreuse >= 0 ||
|
||||
tb->fastreuseport >= 0)
|
||||
goto next_port;
|
||||
WARN_ON(hlist_empty(&tb->owners));
|
||||
if (!check_established(death_row, sk,
|
||||
port, &tw))
|
||||
goto ok;
|
||||
goto next_port;
|
||||
}
|
||||
}
|
||||
|
||||
tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
|
||||
net, head, port);
|
||||
if (!tb) {
|
||||
spin_unlock(&head->lock);
|
||||
break;
|
||||
}
|
||||
tb->fastreuse = -1;
|
||||
tb->fastreuseport = -1;
|
||||
goto ok;
|
||||
|
||||
next_port:
|
||||
spin_unlock(&head->lock);
|
||||
if (port) {
|
||||
head = &hinfo->bhash[inet_bhashfn(net, port,
|
||||
hinfo->bhash_size)];
|
||||
tb = inet_csk(sk)->icsk_bind_hash;
|
||||
spin_lock_bh(&head->lock);
|
||||
if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
|
||||
inet_ehash_nolisten(sk, NULL);
|
||||
spin_unlock_bh(&head->lock);
|
||||
return 0;
|
||||
}
|
||||
local_bh_enable();
|
||||
|
||||
return -EADDRNOTAVAIL;
|
||||
|
||||
ok:
|
||||
hint += (i + 2) & ~1;
|
||||
|
||||
/* Head lock still held and bh's disabled */
|
||||
inet_bind_hash(sk, tb, port);
|
||||
if (sk_unhashed(sk)) {
|
||||
inet_sk(sk)->inet_sport = htons(port);
|
||||
inet_ehash_nolisten(sk, (struct sock *)tw);
|
||||
}
|
||||
if (tw)
|
||||
inet_twsk_bind_unhash(tw, hinfo);
|
||||
spin_unlock(&head->lock);
|
||||
|
||||
if (tw)
|
||||
inet_twsk_deschedule_put(tw);
|
||||
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
|
||||
tb = inet_csk(sk)->icsk_bind_hash;
|
||||
spin_lock_bh(&head->lock);
|
||||
if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
|
||||
inet_ehash_nolisten(sk, NULL);
|
||||
spin_unlock_bh(&head->lock);
|
||||
return 0;
|
||||
} else {
|
||||
spin_unlock(&head->lock);
|
||||
/* No definite answer... Walk to established hash table */
|
||||
ret = check_established(death_row, sk, snum, NULL);
|
||||
out:
|
||||
ret = check_established(death_row, sk, port, NULL);
|
||||
local_bh_enable();
|
||||
return ret;
|
||||
}
|
||||
|
||||
inet_get_local_port_range(net, &low, &high);
|
||||
high++; /* [32768, 60999] -> [32768, 61000[ */
|
||||
remaining = high - low;
|
||||
if (likely(remaining > 1))
|
||||
remaining &= ~1U;
|
||||
|
||||
offset = (hint + port_offset) % remaining;
|
||||
/* In first pass we try ports of @low parity.
|
||||
* inet_csk_get_port() does the opposite choice.
|
||||
*/
|
||||
offset &= ~1U;
|
||||
other_parity_scan:
|
||||
port = low + offset;
|
||||
for (i = 0; i < remaining; i += 2, port += 2) {
|
||||
if (unlikely(port >= high))
|
||||
port -= remaining;
|
||||
if (inet_is_local_reserved_port(net, port))
|
||||
continue;
|
||||
head = &hinfo->bhash[inet_bhashfn(net, port,
|
||||
hinfo->bhash_size)];
|
||||
spin_lock_bh(&head->lock);
|
||||
|
||||
/* Does not bother with rcv_saddr checks, because
|
||||
* the established check is already unique enough.
|
||||
*/
|
||||
inet_bind_bucket_for_each(tb, &head->chain) {
|
||||
if (net_eq(ib_net(tb), net) && tb->port == port) {
|
||||
if (tb->fastreuse >= 0 ||
|
||||
tb->fastreuseport >= 0)
|
||||
goto next_port;
|
||||
WARN_ON(hlist_empty(&tb->owners));
|
||||
if (!check_established(death_row, sk,
|
||||
port, &tw))
|
||||
goto ok;
|
||||
goto next_port;
|
||||
}
|
||||
}
|
||||
|
||||
tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
|
||||
net, head, port);
|
||||
if (!tb) {
|
||||
spin_unlock_bh(&head->lock);
|
||||
return -ENOMEM;
|
||||
}
|
||||
tb->fastreuse = -1;
|
||||
tb->fastreuseport = -1;
|
||||
goto ok;
|
||||
next_port:
|
||||
spin_unlock_bh(&head->lock);
|
||||
cond_resched();
|
||||
}
|
||||
|
||||
offset++;
|
||||
if ((offset & 1) && remaining > 1)
|
||||
goto other_parity_scan;
|
||||
|
||||
return -EADDRNOTAVAIL;
|
||||
|
||||
ok:
|
||||
hint += i + 2;
|
||||
|
||||
/* Head lock still held and bh's disabled */
|
||||
inet_bind_hash(sk, tb, port);
|
||||
if (sk_unhashed(sk)) {
|
||||
inet_sk(sk)->inet_sport = htons(port);
|
||||
inet_ehash_nolisten(sk, (struct sock *)tw);
|
||||
}
|
||||
if (tw)
|
||||
inet_twsk_bind_unhash(tw, hinfo);
|
||||
spin_unlock(&head->lock);
|
||||
if (tw)
|
||||
inet_twsk_deschedule_put(tw);
|
||||
local_bh_enable();
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
Loading…
Reference in New Issue